ossp-pkg/str/str_token.c
/*
** OSSP str - String Handling
** Copyright (c) 1999-2005 Ralf S. Engelschall <rse@engelschall.com>
** Copyright (c) 1999-2005 The OSSP Project <http://www.ossp.org/>
**
** This file is part of OSSP str, a string handling and manipulation
** library which can be found at http://www.ossp.org/pkg/lib/str/.
**
** Permission to use, copy, modify, and distribute this software for
** any purpose with or without fee is hereby granted, provided that
** the above copyright notice and this permission notice appear in all
** copies.
**
** THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
** WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
** MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
** IN NO EVENT SHALL THE AUTHORS AND COPYRIGHT HOLDERS AND THEIR
** CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
** SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
** LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
** USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
** ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
** OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
** OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
** SUCH DAMAGE.
**
** str_token.c: tokenizing functions
*/
#include "str_p.h"
/*
* str_token -- tokenize a string.
* This is the implementation of our tokenization function str_token(3).
* It is partly derived from an ancient strqtok(3) function, written in
* 1991 by William Deich <will@surya.caltech.edu>, which itself was
* already a superset of POSIX strtok(3). The main differences between
* our str_token(3) and a plain POSIX strtok(3) are that our str_token(3)
* is reentrant to support multithreading environments, supports
* quoted tokens, can ignore trailing comments and is aware of ANSI-C
* backslashed escape sequences and trigraphs. This way it is a lot
* more powerful and useful in practice than a stock POSIX strtok(3) or
* similar functions.
*/
/*
 * isoneof -- check whether c is one of the chars in s
 *
 * Scans the NUL-terminated set s and returns TRUE on the first member
 * equal to c, FALSE if the terminator is reached first. The terminator
 * itself is never compared, so c == NUL always yields FALSE.
 */
static int
isoneof(
    register char c,
    register const char *s)
{
    while (*s != NUL) {
        if (*s++ == c)
            return TRUE;
    }
    return FALSE;
}
/*
 * nextchar_hexval -- numeric value of a hexadecimal digit
 *
 * Returns 0..15 for [0-9a-fA-F] and -1 for any other character.
 */
static int
nextchar_hexval(char ch)
{
    if (ch >= '0' && ch <= '9')
        return ch - '0';
    if (ch >= 'a' && ch <= 'f')
        return ch - 'a' + 10;
    if (ch >= 'A' && ch <= 'F')
        return ch - 'A' + 10;
    return -1;
}

/*
 * nextchar -- get next character from a string
 *
 * Decodes one logical character starting at s and stores it in *c:
 *   - a backslash escape: the single-letter escapes handled below,
 *     up to three octal digits, or "\x" followed by any number of
 *     hexadecimal digits (only the low 8 bits are kept); any other
 *     character after the backslash is returned as is;
 *   - if bTrigraphs is non-zero, a trigraph, i.e. two question marks
 *     followed by one of  = ( / ) ' < ! > -  ;
 *   - otherwise the plain character at *s.
 *
 * *bBackslashed is set to non-zero exactly when *s was a backslash.
 *
 * Returns a pointer just behind the consumed input, or NULL if the
 * last character examined was the terminating NUL (then *c is NUL).
 */
static char *
nextchar(
    register char *s,   /* string from which to collect character(s) */
    register char *c,   /* return character here */
    int bTrigraphs,     /* whether to interpret trigraphs according to ANSI rules */
    int *bBackslashed)  /* return TRUE if character was started with a backslash */
{
    unsigned int uValue;    /* unsigned, so shifting never touches a negative value */
    int nDigits;
    int nHex;

    if ((*bBackslashed = (*s == '\\'))) {
        /*
         * ANSI C backslashed escape sequence ("\x")
         */
        switch (*++s) {
            case 'a':  *c = '\a'; break;
            case 'b':  *c = '\b'; break;
            case 'f':  *c = '\f'; break;
            case 'n':  *c = '\n'; break;
            case 'r':  *c = '\r'; break;
            case 't':  *c = '\t'; break;
            case 'v':  *c = '\v'; break;
            case '\\': *c = '\\'; break;
            case '^':  *c = '^';  break;
            case '\'': *c = '\''; break;
            case '"':  *c = '"';  break;
            case '?':  *c = '?';  break;
            case '0': case '1': case '2':
            case '3': case '4': case '5':
            case '6': case '7':
                /* convert up to three octal digits into number */
                uValue = 0;
                for (nDigits = 0; nDigits < 3 && *s >= '0' && *s <= '7'; nDigits++, s++)
                    uValue = (uValue << 3) | (unsigned int)(*s - '0');
                s--;    /* step back onto the last digit consumed */
                *c = (char)(uValue & 0xFFu);
                break;
            case 'x':
                /* convert hexadecimal digits into number */
                uValue = 0;
                for (s++; (nHex = nextchar_hexval(*s)) >= 0; s++)
                    uValue = (uValue << 4) | (unsigned int)nHex;
                s--;    /* last digit consumed, or the 'x' if there were none */
                *c = (char)(uValue & 0xFFu);
                break;
            default:
                /* unknown escape (or NUL right after the backslash) */
                *c = *s;
                break;
        }
    }
    else if (bTrigraphs && s[0] == '?' && s[1] == '?') {
        /*
         * ANSI C trigraph (two question marks plus a selector character).
         * *c must not be assigned again after this switch, otherwise the
         * translated character is replaced by the selector character.
         */
        switch (s[2]) {
            case '=':  *c = '#';  s += 2; break;
            case '(':  *c = '[';  s += 2; break;
            case '/':  *c = '\\'; s += 2; break;
            case ')':  *c = ']';  s += 2; break;
            case '\'': *c = '^';  s += 2; break;
            case '<':  *c = '{';  s += 2; break;
            case '!':  *c = '|';  s += 2; break;
            case '>':  *c = '}';  s += 2; break;
            case '-':  *c = '~';  s += 2; break;
            default:
                /* not a trigraph sequence: deliver the first '?' only */
                *c = *s;
                break;
        }
    }
    else {
        /*
         * Ordinary Character
         */
        *c = *s;
    }

    /* s rests on the last character consumed; NUL there means end of input */
    return (*s != '\0') ? s + 1 : NULL;
}
/*
 * str_token -- the API tokenization function
 *
 * Extracts the next token from the string *s. The string is rewritten
 * in place: the token is compacted towards its own start and
 * NUL-terminated, and the returned pointer refers into that buffer.
 *
 *   s           in/out parsing cursor; updated to where the next call
 *               has to continue (it becomes NULL when nextchar() ran
 *               into the end of the string)
 *   cs_delim    set of delimiter characters (mandatory)
 *   cs_quote    set of quote characters (NULL means none)
 *   cs_comment  set of comment start characters (NULL means none)
 *   mode        bit mask of STR_STRIPQUOTES (do not copy the quote
 *               characters into the token), STR_BACKSLASHESC and
 *               STR_TRIGRAPHS (either one selects the nextchar() based
 *               scanner; the trigraph bit is handed on to nextchar())
 *               and STR_SKIPDELIMS (skip delimiters behind the token)
 *
 * Returns NULL if s, *s or cs_delim is NULL, if only delimiters are
 * left, or if the first non-delimiter is a comment character (in that
 * case *s is set just behind the comment character).
 */
char *
str_token(
    char **s,
    const char *cs_delim,
    const char *cs_quote,
    const char *cs_comment,
    int mode)
{
    char *cpRead;           /* input cursor */
    char *cpWrite;          /* output cursor, never ahead of cpRead */
    char *cpResult;         /* start of the token handed back */
    char cCur;              /* character currently examined */
    char cOpen = NUL;       /* quote character which opened the current segment */
    int bEscaped;           /* set by nextchar(): character came from a backslash */
    int bQuoted = FALSE;    /* currently inside a quoted segment */
    int bKeepQuotes;        /* copy quote characters into the token */

    /* argument checking and defaults for the optional sets */
    if (s == NULL || *s == NULL || cs_delim == NULL)
        return NULL;
    if (cs_quote == NULL)
        cs_quote = "";
    if (cs_comment == NULL)
        cs_comment = "";

    /* step over the delimiters in front of the token */
    for (cpRead = *s; *cpRead != NUL && isoneof(*cpRead, cs_delim); cpRead++)
        ;

    /* nothing but delimiters left */
    if (*cpRead == NUL)
        return NULL;

    /*
     * token would start with a comment character: give up, but move the
     * cursor behind it so the caller can still get at the comment text
     */
    if (isoneof(*cpRead, cs_comment)) {
        *s = cpRead + 1;
        return NULL;
    }

    /* token starts here; read and write cursors begin at the same spot */
    cpResult = cpWrite = cpRead;
    bKeepQuotes = !(mode & STR_STRIPQUOTES);

    if ((mode & STR_BACKSLASHESC) || (mode & STR_TRIGRAPHS)) {
        /*
         * scan via nextchar(), which decodes escapes/trigraphs and has
         * already moved cpRead behind the character it delivers
         */
        for (;;) {
            cpRead = nextchar(cpRead, &cCur, (mode & STR_TRIGRAPHS), &bEscaped);
            if (cpRead == NULL)
                break;                  /* end of string */
            if (bEscaped) {
                /* backslashed characters are always taken literally */
                *cpWrite++ = cCur;
                continue;
            }
            if (bQuoted) {
                if (cCur == cOpen) {
                    /* matching quote closes the segment */
                    bQuoted = FALSE;
                    if (bKeepQuotes)
                        *cpWrite++ = cOpen;
                }
                else
                    *cpWrite++ = cCur;
                continue;
            }
            if (isoneof(cCur, cs_delim))
                break;                  /* delimiter ends the token */
            if (isoneof(cCur, cs_comment)) {
                /*
                 * comment ends the token; cpRead is already behind the
                 * comment character, so the NUL cuts the string there
                 */
                *cpRead = NUL;
                break;
            }
            if (isoneof(cCur, cs_quote)) {
                /* quote opens a segment */
                bQuoted = TRUE;
                cOpen = cCur;
                if (bKeepQuotes)
                    *cpWrite++ = cCur;
                continue;
            }
            *cpWrite++ = cCur;          /* ordinary character */
        }
    }
    else {
        /*
         * scan the raw characters, backslashes have no special meaning
         */
        while (*cpRead != NUL) {
            cCur = *cpRead;
            if (bQuoted) {
                cpRead++;
                if (cCur == cOpen) {
                    /* matching quote closes the segment */
                    bQuoted = FALSE;
                    if (bKeepQuotes)
                        *cpWrite++ = cOpen;
                }
                else
                    *cpWrite++ = cCur;
                continue;
            }
            if (isoneof(cCur, cs_delim)) {
                cpRead++;               /* consume the delimiter */
                break;
            }
            if (isoneof(cCur, cs_comment)) {
                *cpRead = NUL;          /* cut the string at the comment character */
                break;
            }
            cpRead++;
            if (isoneof(cCur, cs_quote)) {
                /* quote opens a segment */
                bQuoted = TRUE;
                cOpen = cCur;
                if (bKeepQuotes)
                    *cpWrite++ = cCur;
            }
            else
                *cpWrite++ = cCur;      /* ordinary character */
        }
    }

    /* terminate the token */
    *cpWrite = NUL;

    /* optionally swallow the delimiters behind the token right away */
    if ((mode & STR_SKIPDELIMS) && cpRead != NULL) {
        while (*cpRead != NUL && isoneof(*cpRead, cs_delim))
            cpRead++;
    }

    /* publish the new parsing cursor and hand back the token */
    *s = cpRead;
    return cpResult;
}