/*
**  OSSP str - String Handling
**  Copyright (c) 1999-2005 Ralf S. Engelschall
**  Copyright (c) 1999-2005 The OSSP Project
**
**  This file is part of OSSP str, a string handling and manipulation
**  library which can be found at http://www.ossp.org/pkg/lib/str/.
**
**  Permission to use, copy, modify, and distribute this software for
**  any purpose with or without fee is hereby granted, provided that
**  the above copyright notice and this permission notice appear in all
**  copies.
**
**  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
**  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
**  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
**  IN NO EVENT SHALL THE AUTHORS AND COPYRIGHT HOLDERS AND THEIR
**  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
**  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
**  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
**  USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
**  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
**  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
**  OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
**  SUCH DAMAGE.
**
**  str_token.c: tokenizing functions
*/

#include "str_p.h"

/*
 * str_token -- tokenize a string.
 * This is the implementation of our tokenization function str_token(3).
 * It is partly derived from an ancient strqtok(3) function, written
 * 1991 by William Deich, which itself was
 * already a superset of POSIX strtok(3). The main differences between
 * our str_token(3) and a plain POSIX strtok(3) is that our str_token(3)
 * is reentrant to support multithreading environments, supports
 * quoted tokens, can ignore trailing comments and is aware of ANSI-C
 * backslashed escape sequences and trigraphs. This way it is a lot
 * more powerful and useful in practice than a stock POSIX strtok(3) or
 * similar functions.
 */

/*
 * isoneof -- check whether c is one of the chars in s
 *
 * Returns TRUE if the character c occurs in the NUL-terminated
 * string s and FALSE otherwise. The terminating NUL of s is never
 * compared, so c == NUL always yields FALSE.
 */
static int
isoneof(
    char c,
    const char *s)
{
    for (; *s != NUL; s++)
        if (*s == c)
            return TRUE;
    return FALSE;
}

/*
 * nextchar -- get next character from a string
 *
 * Decodes one logical character starting at s and stores it in *c.
 * A logical character is either an ANSI C backslash escape sequence,
 * an ANSI C trigraph (only if bTrigraphs is set) or a plain character.
 *
 * Returns a pointer to the first input character following the
 * decoded one, or NULL if the last character looked at was the
 * terminating NUL (i.e. the end of the string was reached).
 */
static char *
nextchar(
    char *s,           /* string from which to collect character(s) */
    char *c,           /* return character here */
    int bTrigraphs,    /* whether to interpret trigraphs according to ANSI rules */
    int *bBackslashed) /* return TRUE if character was started with a backslash */
{
    /*
     * Numeric escapes are accumulated in an unsigned variable, because
     * left-shifting a negative (plain) char value is undefined behaviour.
     */
    unsigned int ch;
    int i;

    if ((*bBackslashed = (*s == '\\'))) {
        /*
         * ANSI C backslashed escape sequence ("\x")
         */
        switch (*++s) {
            case 'a':   *c = '\a';  break;
            case 'b':   *c = '\b';  break;
            case 'f':   *c = '\f';  break;
            case 'n':   *c = '\n';  break;
            case 'r':   *c = '\r';  break;
            case 't':   *c = '\t';  break;
            case 'v':   *c = '\v';  break;
            case '\\':  *c = '\\';  break;
            case '^':   *c = '^';   break;
            case '\'':  *c = '\'';  break;
            case '"':   *c = '"';   break;
            case '?':   *c = '?';   break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
                /* convert (up to three) octal digits into number */
                ch = 0;
                for (i = 0; i < 3 && *s >= '0' && *s <= '7'; i++) {
                    ch <<= 3;
                    ch |= (unsigned int)(*s++ - '0');
                }
                /* step back onto the last digit consumed */
                s--;
                *c = (char)(unsigned char)ch;
                break;
            case 'x':
                /* convert hexadecimal digits into number */
                s++;
                for (ch = 0; str_isxdigit(*s); s++) {
                    ch <<= 4;
                    ch |= (unsigned int)(
                          str_isdigit(*s) ? *s - '0' :
                          str_islower(*s) ? *s + 10 - 'a' :
                                            *s + 10 - 'A');
                }
                /* step back onto the last character consumed */
                s--;
                *c = (char)(unsigned char)ch;
                break;
            default:
                /* unknown escape: the escaped character stands for itself */
                *c = *s;
                break;
        }
    }
    else if (bTrigraphs && (*s == '?') && (*(s + 1) == '?')) {
        /*
         * ANSI C trigraph ("??x")
         *
         * On a match, s is advanced onto the third character of the
         * trigraph so that the common "s + 1" below skips all of it.
         * The translated character must not be overwritten afterwards.
         */
        switch (*(s + 2)) {
            case '=':   *c = '#';   s += 2; break;
            case '(':   *c = '[';   s += 2; break;
            case '/':   *c = '\\';  s += 2; break;
            case ')':   *c = ']';   s += 2; break;
            case '\'':  *c = '^';   s += 2; break;
            case '<':   *c = '{';   s += 2; break;
            case '!':   *c = '|';   s += 2; break;
            case '>':   *c = '}';   s += 2; break;
            case '-':   *c = '~';   s += 2; break;
            default:
                /* not a trigraph sequence: deliver the first '?' only */
                *c = *s;
                break;
        }
    }
    else {
        /*
         * Ordinary Character
         */
        *c = *s;
    }
    return (*s != NUL) ? s + 1 : NULL;
}

/*
 * str_token -- the API tokenization function
 *
 * Extracts the next token from the string *s. The string is modified
 * in place: the token is NUL-terminated inside the caller's buffer and
 * the returned pointer points into that buffer.
 *
 *   s           in/out parsing cursor; on return it points to where
 *               the next call has to continue. In the escape-aware
 *               parser the cursor is set to NULL once the end of the
 *               string was reached.
 *   cs_delim    set of token delimiter characters (required)
 *   cs_quote    set of quoting characters (NULL means none)
 *   cs_comment  set of comment introducing characters (NULL means none)
 *   mode        bit mask of STR_xxx flags:
 *               STR_STRIPQUOTES   do not copy the quote characters
 *                                 into the token
 *               STR_BACKSLASHESC,
 *               STR_TRIGRAPHS     either flag selects the escape-aware
 *                                 parser (based on nextchar), which
 *                                 always decodes backslash sequences
 *                                 and decodes trigraphs only if
 *                                 STR_TRIGRAPHS is set
 *               STR_SKIPDELIMS    skip the delimiters following the
 *                                 token before returning
 *
 * Returns the token, or NULL if an argument is invalid, if only
 * delimiters are left, or if the next non-delimiter character starts
 * a comment.
 */
char *
str_token(
    char **s,
    const char *cs_delim,
    const char *cs_quote,
    const char *cs_comment,
    int mode)
{
    char *p, *q;
    int bBackslashed, bInQuote, bInToken, bWithTrigraphs;
    char c, cLeftQuote;
    char *cpToken;

    /* argument checking */
    if (s == NULL || *s == NULL)
        return NULL;
    if (cs_delim == NULL)
        return NULL;
    if (cs_quote == NULL)
        cs_quote = "";
    if (cs_comment == NULL)
        cs_comment = "";

    /* skip leading delimiters */
    p = *s;
    while (*p != NUL && isoneof(*p, cs_delim))
        p++;

    /* end of string, so stop parsing */
    if (*p == NUL)
        return NULL;

    /*
     * start of comment reached, so stop parsing but update the parsing
     * cursor just in case the user wants to recover the comment
     */
    if (isoneof(*p, cs_comment)) {
        *s = p;
        (*s)++;
        return NULL;
    }

    /*
     * Set `cpToken' to point to returned string.
     * Then use p and q to walk through the string:
     *  - p will follow the input characters;
     *  - q will overwrite string with output characters,
     *    (minus possibly-stripped quotes and including NULs after tokens)
     */
    cpToken = q = p;
    bInQuote = FALSE;
    bInToken = TRUE;
    bWithTrigraphs = (mode & STR_TRIGRAPHS);
    cLeftQuote = NUL;

    if ((mode & STR_BACKSLASHESC) || (mode & STR_TRIGRAPHS)) {
        /*
         * parse while recognizing backslash escapes
         * (nextchar has already advanced p past the character in c)
         */
        while (bInToken &&
               (p = nextchar(p, &c, bWithTrigraphs, &bBackslashed)) != NULL) {
            if (bBackslashed) {
                /* treat as plain character */
                *q++ = c;
            }
            else if (!bInQuote && *cs_delim != NUL && isoneof(c, cs_delim)) {
                /* reached end of token */
                *q = NUL;
                bInToken = FALSE;
            }
            else if (!bInQuote && *cs_comment != NUL && isoneof(c, cs_comment)) {
                /*
                 * reached end of token; additionally cut the input off
                 * at the cursor so that the next call finds nothing more
                 */
                *q = NUL;
                *p = NUL;
                bInToken = FALSE;
            }
            else if (!bInQuote && *cs_quote != NUL && isoneof(c, cs_quote)) {
                /* beginning a quoted segment */
                bInQuote = TRUE;
                cLeftQuote = c;
                if (!(mode & STR_STRIPQUOTES))
                    *q++ = c;
            }
            else if (bInQuote && cLeftQuote == c) {
                /* ending a quoted segment */
                bInQuote = FALSE;
                if (!(mode & STR_STRIPQUOTES))
                    *q++ = cLeftQuote;
            }
            else {
                /* ordinary character */
                *q++ = c;
            }
        }
    }
    else {
        /*
         * parse while ignoring backslash escapes
         */
        while (bInToken && *p != NUL) {
            if (!bInQuote && *cs_delim != NUL && isoneof(*p, cs_delim)) {
                /* reached end of token */
                *q = NUL;
                p++;
                bInToken = FALSE;
            }
            else if (!bInQuote && *cs_comment != NUL && isoneof(*p, cs_comment)) {
                /*
                 * reached end of token; additionally cut the input off
                 * at the comment character so that the next call finds
                 * nothing more
                 */
                *q = NUL;
                *p = NUL;
                bInToken = FALSE;
            }
            else if (!bInQuote && *cs_quote != NUL && isoneof(*p, cs_quote)) {
                /* beginning a quoted segment */
                bInQuote = TRUE;
                cLeftQuote = *p++;
                if (!(mode & STR_STRIPQUOTES))
                    *q++ = cLeftQuote;
            }
            else if (bInQuote && cLeftQuote == *p) {
                /* ending a quoted segment */
                bInQuote = FALSE;
                p++;
                if (!(mode & STR_STRIPQUOTES))
                    *q++ = cLeftQuote;
            }
            else {
                /* ordinary character */
                *q++ = *p++;
            }
        }
    }

    /* terminate token and update parsing cursor */
    *q = NUL;
    *s = p;

    /* skip trailing delimiters (if requested only, else we do it on next round) */
    if ((mode & STR_SKIPDELIMS) && *s != NULL) {
        while (*(*s) != NUL && isoneof(*(*s), cs_delim))
            (*s)++;
    }

    /* return the resulting token */
    return cpToken;
}