OSSP: CVS Repository: ossp-pkg/str/str

ossp-pkg/str/str_token.c
/*
**  OSSP str - String Handling
**  Copyright (c) 1999-2005 Ralf S. Engelschall <rse@engelschall.com>
**  Copyright (c) 1999-2005 The OSSP Project <http://www.ossp.org/>
**
**  This file is part of OSSP str, a string handling and manipulation
**  library which can be found at http://www.ossp.org/pkg/lib/str/.
**
**  Permission to use, copy, modify, and distribute this software for
**  any purpose with or without fee is hereby granted, provided that
**  the above copyright notice and this permission notice appear in all
**  copies.
**
**  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
**  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
**  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
**  IN NO EVENT SHALL THE AUTHORS AND COPYRIGHT HOLDERS AND THEIR
**  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
**  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
**  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
**  USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
**  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
**  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
**  OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
**  SUCH DAMAGE.
**
**  str_token.c: tokenizing functions
*/

#include "str_p.h"

/*
 * str_token -- tokenize a string.
 * This is the implementation of our tokenization function str_token(3).
 * It is partly derived from an ancient strqtok(3) function, written in
 * 1991 by William Deich <will@surya.caltech.edu>, which itself was
 * already a superset of POSIX strtok(3). The main differences between
 * our str_token(3) and a plain POSIX strtok(3) are that our str_token(3)
 * is reentrant to support multithreading environments, supports
 * quoted tokens, can ignore trailing comments and is aware of ANSI-C
 * backslashed escape sequences and trigraphs. This way it is a lot
 * more powerful and useful in practice than a stock POSIX strtok(3) or
 * similar functions.
 */

/*
 * isoneof -- check whether character c occurs in the character set s
 *
 * s is a NUL-terminated list of candidate characters. The result is
 * TRUE if any character before the terminating NUL equals c and FALSE
 * otherwise. The terminator itself is never compared, so a NUL passed
 * as c never matches and an empty set never matches anything.
 */
static int
isoneof(
    register char c,
    register const char *s)
{
    while (*s != NUL) {
        if (*s++ == c)
            return TRUE;
    }
    return FALSE;
}

/*
 * nextchar -- get next character from a string
 *
 * Decodes exactly one logical character starting at s and stores it
 * in *c. Three input forms are recognized:
 *   - a backslash escape sequence ("\n", "\033", "\x1b", ...), which
 *     is always decoded, independent of bTrigraphs;
 *   - an ANSI C trigraph ("??=" etc.), decoded only if bTrigraphs is
 *     non-zero;
 *   - any other character, which is passed through unchanged.
 *
 * The return value points to the first input character after the
 * consumed sequence. NULL is returned if the last inspected character
 * was the string terminator (the string is exhausted, or a backslash
 * is immediately followed by the terminator); *c is NUL in this case.
 */
static char *
nextchar(
    register char *s,  /* string from which to collect character(s) */
    register char *c,  /* return character here */
    int bTrigraphs,    /* whether to interpret trigraphs according to ANSI rules */
    int *bBackslashed) /* return TRUE if character was started with a backslash */
{
    unsigned int val;  /* accumulator for numeric escapes; unsigned, so
                          that shifting is always well defined */
    int n;

    if ((*bBackslashed = (*s == '\\'))) {
        /*
         * ANSI C backslashed escape sequence ("\x")
         */
        switch (*++s) {
            case 'a':  *c = '\a'; break;
            case 'b':  *c = '\b'; break;
            case 'f':  *c = '\f'; break;
            case 'n':  *c = '\n'; break;
            case 'r':  *c = '\r'; break;
            case 't':  *c = '\t'; break;
            case 'v':  *c = '\v'; break;
            case '\\': *c = '\\'; break;
            case '^':  *c = '^';  break;
            case '\'': *c = '\''; break;
            case '"':  *c = '"';  break;
            case '?':  *c = '?';  break;
            case '0': case '1': case '2':
            case '3': case '4': case '5':
            case '6': case '7':
                /* convert up to three octal digits into number */
                val = 0;
                for (n = 0; n < 3 && *s >= '0' && *s <= '7'; n++)
                    val = (val << 3) | (unsigned int)(*s++ - '0');
                /* step back onto the last digit consumed, because the
                   common return statement below advances by one */
                s--;
                *c = (char)val;
                break;
            case 'x':
                /* convert hexadecimal digits into number
                   (all digits are consumed, the low-order bits win) */
                s++;
                for (val = 0; str_isxdigit(*s); s++) {
                    val <<= 4;
                    val |= (unsigned int)(
                           str_isdigit(*s) ? *s - '0' :
                           str_islower(*s) ? *s + 10 - 'a' : *s + 10 - 'A');
                }
                /* step back onto the last character consumed; for a
                   "\x" without any digits this is the 'x' itself and
                   the result is a NUL character */
                s--;
                *c = (char)val;
                break;
            default:
                /* unknown escape (or end of string): take it literally */
                *c = *s;
                break;
        }
    }
    else if (bTrigraphs && (*s == '?') && (*(s + 1) == '?')) {
        /*
         * ANSI C trigraph ("??x")
         * On a match s is moved onto the third character of the
         * sequence, so the common return statement skips all of it.
         * The translated character in *c must not be touched afterwards.
         */
        switch (*(s + 2)) {
            case '=':  *c = '#';  s += 2; break;
            case '(':  *c = '[';  s += 2; break;
            case '/':  *c = '\\'; s += 2; break;
            case ')':  *c = ']';  s += 2; break;
            case '\'': *c = '^';  s += 2; break;
            case '<':  *c = '{';  s += 2; break;
            case '!':  *c = '|';  s += 2; break;
            case '>':  *c = '}';  s += 2; break;
            case '-':  *c = '~';  s += 2; break;
            default:
                /* not a trigraph sequence: deliver the first '?' only */
                *c = *s;
                break;
        }
    }
    else {
        /*
         * Ordinary Character
         */
        *c = *s;
    }
    return (*s != NUL) ? s + 1 : NULL;
}

/*
 * str_token -- the API tokenization function
 *
 * Extracts the next token from the string *s, rewriting the string
 * in place, and advances the parsing cursor *s for the next call.
 *
 *   s           address of the parsing cursor
 *   cs_delim    set of token delimiter characters (mandatory)
 *   cs_quote    set of quoting characters (NULL means none)
 *   cs_comment  set of comment start characters (NULL means none)
 *   mode        bit mask of STR_xxx flags:
 *               STR_BACKSLASHESC, STR_TRIGRAPHS: either one selects
 *                   the parsing loop based on nextchar(); only
 *                   STR_TRIGRAPHS additionally enables trigraphs
 *               STR_STRIPQUOTES: drop the quoting characters
 *               STR_SKIPDELIMS: skip delimiters after the token
 *
 * The result is a pointer to the NUL-terminated token inside the
 * original string. NULL is returned if s, *s or cs_delim is NULL, if
 * only delimiters are left (cursor not changed), or if a comment
 * starts at the token position (cursor moved just behind the comment
 * character). In the nextchar() based mode the cursor becomes NULL as
 * soon as the end of the string was reached.
 */
char *
str_token(
    char **s,
    const char *cs_delim,
    const char *cs_quote,
    const char *cs_comment,
    int mode)
{
    register char *in, *out;
    char *token;
    char ch, quote;
    int escaped, quoted, trigraphs, keepquotes;

    /* sanity checks and defaults for the optional character sets */
    if (s == NULL || *s == NULL || cs_delim == NULL)
        return NULL;
    if (cs_quote == NULL)
        cs_quote = "";
    if (cs_comment == NULL)
        cs_comment = "";

    /* step over delimiters in front of the token */
    for (in = *s; *in != NUL && isoneof(*in, cs_delim); in++)
        ;

    /* nothing left to parse */
    if (*in == NUL)
        return NULL;

    /* a comment starts here: no token, but leave the cursor behind the
       comment character in case the caller wants to fetch the comment */
    if (isoneof(*in, cs_comment)) {
        *s = in + 1;
        return NULL;
    }

    /*
     * The token is assembled in place: `in' reads the input while
     * `out' writes the resulting characters. `out' never gets ahead
     * of `in', because every output character consumes at least one
     * input character.
     */
    token      = out = in;
    quoted     = FALSE;
    quote      = NUL;
    trigraphs  = (mode & STR_TRIGRAPHS);
    keepquotes = !(mode & STR_STRIPQUOTES);

    if (mode & (STR_BACKSLASHESC | STR_TRIGRAPHS)) {
        /*
         * decoding mode: characters are fetched through nextchar(),
         * which has already moved `in' behind the character
         */
        while ((in = nextchar(in, &ch, trigraphs, &escaped)) != NULL) {
            if (escaped) {
                /* escaped characters are never special */
                *out++ = ch;
            }
            else if (quoted) {
                if (ch == quote) {
                    /* closing quote */
                    quoted = FALSE;
                    if (keepquotes)
                        *out++ = quote;
                }
                else
                    *out++ = ch;
            }
            else if (isoneof(ch, cs_delim)) {
                /* delimiter ends the token */
                break;
            }
            else if (isoneof(ch, cs_comment)) {
                /* comment ends the token and all further parsing */
                *in = NUL;
                break;
            }
            else if (isoneof(ch, cs_quote)) {
                /* opening quote */
                quoted = TRUE;
                quote  = ch;
                if (keepquotes)
                    *out++ = ch;
            }
            else {
                *out++ = ch;
            }
        }
    }
    else {
        /*
         * plain mode: characters are taken as they are
         */
        while (*in != NUL) {
            ch = *in;
            if (quoted) {
                in++;
                if (ch == quote) {
                    /* closing quote */
                    quoted = FALSE;
                    if (keepquotes)
                        *out++ = quote;
                }
                else
                    *out++ = ch;
            }
            else if (isoneof(ch, cs_delim)) {
                /* delimiter ends the token and is consumed */
                in++;
                break;
            }
            else if (isoneof(ch, cs_comment)) {
                /* comment ends the token and all further parsing;
                   the cursor stays on the (now erased) comment character */
                *in = NUL;
                break;
            }
            else if (isoneof(ch, cs_quote)) {
                /* opening quote */
                in++;
                quoted = TRUE;
                quote  = ch;
                if (keepquotes)
                    *out++ = ch;
            }
            else {
                in++;
                *out++ = ch;
            }
        }
    }

    /* terminate the token */
    *out = NUL;

    /* optionally step over the delimiters following the token
       (otherwise this is done at the beginning of the next call);
       `in' is NULL if the decoding loop ran into the end of string */
    if ((mode & STR_SKIPDELIMS) && in != NULL) {
        while (*in != NUL && isoneof(*in, cs_delim))
            in++;
    }

    /* hand the cursor and the token back to the caller */
    *s = in;
    return token;
}
OSSP CVS Repository