ossp-pkg/str/str_parse.c
/*
** OSSP str - String Handling
** Copyright (c) 1999-2005 Ralf S. Engelschall <rse@engelschall.com>
** Copyright (c) 1999-2005 The OSSP Project <http://www.ossp.org/>
**
** This file is part of OSSP str, a string handling and manipulation
** library which can be found at http://www.ossp.org/pkg/lib/str/.
**
** Permission to use, copy, modify, and distribute this software for
** any purpose with or without fee is hereby granted, provided that
** the above copyright notice and this permission notice appear in all
** copies.
**
** THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
** WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
** MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
** IN NO EVENT SHALL THE AUTHORS AND COPYRIGHT HOLDERS AND THEIR
** CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
** SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
** LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
** USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
** ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
** OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
** OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
** SUCH DAMAGE.
**
** str_parse.c: parsing functions
*/
#include "str_p.h"
/*
 * Compile a regular expression pattern from string into internal format.
 *
 * ptr/len       the pattern text; it need not be NUL-terminated at ptr[len],
 *               but ptr[len] itself must be readable
 * opt           option bits passed to pcre_compile()
 * p_pcre        receives the compiled pattern (NULL on failure)
 * p_pcre_extra  if not NULL, the pattern is additionally studied and the
 *               result of pcre_study() is stored here
 *
 * Returns TRUE on success and FALSE on failure. On failure no compiled
 * pattern is left behind for the caller to release.
 */
static int
pattern_compile(
    const char *ptr,
    int len,
    int opt,
    pcre **p_pcre,
    pcre_extra **p_pcre_extra)
{
    const char *err_str;
    char buf[128];
    int err_pos;
    char *cp;

    /* a negative length would turn into a huge size below */
    if (len < 0)
        return FALSE;

    if (ptr[len] == NUL) {
        /* plain string, so we can speed up processing... */
        *p_pcre = pcre_compile(ptr, opt, &err_str, &err_pos, NULL);
    }
    else if ((size_t)len < sizeof(buf)) {
        /* ...else we need a temporary NUL-terminated copy: either in a
           local buffer to avoid malloc/free ping-pong... */
        memcpy(buf, ptr, len);
        buf[len] = NUL;
        *p_pcre = pcre_compile(buf, opt, &err_str, &err_pos, NULL);
    }
    else {
        /* ...or in an actually allocated memory chunk */
        if ((cp = malloc((size_t)len + 1)) == NULL)
            return FALSE;
        memcpy(cp, ptr, len);
        cp[len] = NUL;
        *p_pcre = pcre_compile(cp, opt, &err_str, &err_pos, NULL);
        free(cp);
    }
    if (*p_pcre == NULL)
        return FALSE;

    /* optionally study pattern */
    if (p_pcre_extra != NULL) {
        err_str = NULL;
        *p_pcre_extra = pcre_study(*p_pcre, 0, &err_str);
        if (err_str != NULL) {
            /* release the compiled pattern itself, not the caller's
               variable which merely holds the pointer to it */
            free(*p_pcre);
            *p_pcre = NULL;
            return FALSE;
        }
    }
    return TRUE;
}
/* the hash table entry in the pattern cache; entries which fall into
   the same bucket are chained through `next' */
struct hash_entry {
/* next entry in the same bucket, or NULL at the end of the chain */
struct hash_entry *next;
/* private copy of the key bytes as made by pattern_cache();
   exactly keylen bytes, no terminating NUL is stored */
char *key;
/* number of bytes in key */
int keylen;
/* the compiled pattern handed to pattern_cache() */
pcre *p_pcre;
/* the study data handed to pattern_cache() */
pcre_extra *p_pcre_extra;
};
/* size of the cache hash table (number of buckets); is prime */
#define HASH_SIZE 101
/* the pattern cache hash table: one chain of entries per bucket,
   indexed by the value of hash_func() */
static struct hash_entry *pattern_hash[HASH_SIZE];
/* initialization flag for hash table; switched to TRUE by str_parse_va()
   after hash_init() was called and the atexit() handler was registered */
static int hash_initialized = FALSE;
/*
 * Initialize the cache hash table: every bucket is set to the
 * empty chain. Existing entries are not released by this function.
 */
static void
hash_init(void)
{
    struct hash_entry **slot = pattern_hash;
    struct hash_entry **stop = pattern_hash + HASH_SIZE;

    while (slot < stop)
        *slot++ = NULL;
}
/*
 * Destroy the cache hash table: every bucket is emptied and each
 * entry is released together with its private key copy. The compiled
 * pattern and study data an entry points to are not touched here.
 */
static void
hash_destroy(void)
{
    struct hash_entry *cur;
    struct hash_entry *nxt;
    int slot;

    for (slot = 0; slot < HASH_SIZE; slot++) {
        /* detach the whole chain first, then walk and release it */
        cur = pattern_hash[slot];
        pattern_hash[slot] = NULL;
        for (; cur != NULL; cur = nxt) {
            nxt = cur->next;
            if (cur->key != NULL)
                free(cur->key);
            free(cur);
        }
    }
}
/*
 * The hashing function: a popular `times 33' hash.
 *
 * Hashes exactly the keylen bytes starting at key; the key need not be
 * NUL-terminated and may contain NUL bytes. The result is always a valid
 * index into pattern_hash[], i.e. in the range 0..HASH_SIZE-1, also for
 * an empty key.
 */
static unsigned int
hash_func(
    const char *key,
    int keylen)
{
    unsigned int h;
    int i;

    h = 0xDEAD;
    for (i = 0; i < keylen; i++) {
        /* go through unsigned char so the result does not depend on
           whether plain char is signed on this platform */
        h = ((((h<<5)+h)+(unsigned char)key[i]) % HASH_SIZE);
    }
    /* reduce once more: with no bytes hashed the seed is still unreduced */
    return (h % HASH_SIZE);
}
/*
 * Cache a pattern: store the compiled pattern and its study data
 * under a private copy of the keylen bytes at key. The new entry is
 * appended to the end of its bucket chain. If memory for the entry
 * or the key copy cannot be allocated, nothing is stored and the
 * function returns silently.
 */
static void
pattern_cache(
    const char *key,
    int keylen,
    pcre *p_pcre,
    pcre_extra *p_pcre_extra)
{
    struct hash_entry *entry;
    struct hash_entry **link;
    int slot;

    entry = (struct hash_entry *)malloc(sizeof(struct hash_entry));
    if (entry == NULL)
        return;
    entry->key = malloc(keylen);
    if (entry->key == NULL) {
        free(entry);
        return;
    }
    memcpy(entry->key, key, keylen);
    entry->keylen       = keylen;
    entry->p_pcre       = p_pcre;
    entry->p_pcre_extra = p_pcre_extra;
    entry->next         = NULL;

    /* walk to the terminating NULL link of the bucket and hook in there */
    slot = hash_func(key, keylen);
    link = &pattern_hash[slot];
    while (*link != NULL)
        link = &(*link)->next;
    *link = entry;
}
/*
 * Lookup a pattern in the cache.
 *
 * On a hit, *p_pcre and *p_pcre_extra receive the pointers stored in the
 * entry whose key has the same length and the same bytes as key/keylen.
 * On a miss both are set to NULL.
 */
static void
pattern_lookup(
    const char *key,
    int keylen,
    pcre **p_pcre,
    pcre_extra **p_pcre_extra)
{
    unsigned int h;
    struct hash_entry *he;

    *p_pcre = NULL;
    *p_pcre_extra = NULL;
    h = hash_func(key, keylen);
    for (he = pattern_hash[h]; he != NULL; he = he->next) {
        /* memcmp() yields 0 for identical keys, so a hit is `== 0' */
        if (he->keylen == keylen && memcmp(he->key, key, keylen) == 0) {
            *p_pcre = he->p_pcre;
            *p_pcre_extra = he->p_pcre_extra;
            return;
        }
    }
    return;
}
/*
 * Flush callback for the sizing pass of the substitution expansion:
 * nothing is kept. The counter in data[2].i is advanced by data[1].i
 * and the write position is rewound to the buffer start in data[0].p,
 * so the formatter keeps overwriting the same scratch buffer.
 * Always returns 0.
 */
static int
str_parse_flush_nop(
    str_vformat_t *sf)
{
    sf->data[2].i += sf->data[1].i;
    sf->curpos = (char *)sf->data[0].p;
    return 0;
}
/*
 * Flush callback for the final expansion pass into the output buffer:
 * unconditionally reports failure by returning -1; the formatter
 * state is neither read nor modified.
 */
static int
str_parse_flush_str(
    str_vformat_t *sf)
{
    (void)sf;
    return -1;
}
/*
 * Format callback which expands `%{N}R' (N being a single digit) into
 * the text of captured substring N.
 *
 * The callback reads three values from the formatter state:
 *   data[3].p  the subject string
 *   data[4].p  the capture offset vector (pairs of start/end offsets)
 *   data[5].i  the number of capture groups
 *
 * Returns a pointer into the subject string and stores the length of the
 * substring in *ipStrLen. Returns NULL and leaves *ipStrLen untouched if
 * the format is not `R', the extension info is not exactly one digit,
 * there is no capture vector, N exceeds the number of groups, or group N
 * is unset.
 */
static char *
str_parse_format(
    str_vformat_t *sf,
    char *cpPrefix,
    char *cpPad,
    int *ipStrLen,
    char *cpBuf,
    int nBufLen,
    char *cpExtinfo,
    int cFmt,
    va_list ap)
{
    char *string;
    int *cap_vec;
    int cap_num;
    int n;
    int start;
    int end;

    if (cFmt != 'R')
        return NULL;
    if (!(cpExtinfo != NULL && str_isdigit(cpExtinfo[0]) && cpExtinfo[1] == NUL))
        return NULL;

    n       = cpExtinfo[0] - '0';
    string  = (char *)sf->data[3].p;
    cap_vec = (int *)sf->data[4].p;
    cap_num = sf->data[5].i;

    /* without an offset vector there is nothing to look up, not even
       for group 0 of a pattern without capture groups */
    if (cap_vec == NULL || n > cap_num)
        return NULL;

    start = cap_vec[(n*2)];
    end   = cap_vec[(n*2)+1];
    if (start == -1 || end == -1)
        return NULL;

    *ipStrLen = (end - start);
    return (string + start);
}
/*
 * The API parsing function: variable argument list front-end which
 * collects its trailing arguments into a va_list and passes them,
 * together with string and pattern, to str_parse_va(). The result of
 * str_parse_va() is returned unchanged.
 */
int str_parse(const char *string, const char *pattern, ...)
{
    va_list args;
    int result;

    va_start(args, pattern);
    result = str_parse_va(string, pattern, args);
    va_end(args);
    return result;
}
int str_parse_va(const char *string, const char *pattern, va_list ap)
{
pcre *p_pcre = NULL;
pcre_extra *p_pcre_extra = NULL;
const char *match_ptr;
int match_len;
int match_opt;
int match_once;
int match_1resbuf;
const char *subst_ptr;
int subst_len;
int *cap_vec;
int cap_num;
int cap_len;
char *cp;
char *cp2;
char **cpp;
char cb[2];
int n;
int i;
int k;
int l;
int ismop;
int issop;
char buf[128];
char buf2[128];
char *buf_ptr;
str_vformat_t sf;
va_list ap_temp;
/*
* Caching support
*/
/* hash table initialization */
if (!hash_initialized) {
hash_init();
atexit(hash_destroy);
hash_initialized = TRUE;
}
/* hash table destruction */
if (string == NULL && pattern == NULL) {
hash_destroy();
return 0;
}
/*
* Check input parameters
*/
if (string == NULL || pattern == NULL)
return -1;
/*
* Parse pattern
*/
match_ptr = NULL;
match_len = 0;
match_opt = 0;
match_once = FALSE;
match_1resbuf = FALSE;
subst_ptr = NULL;
subst_len = 0;
ismop = FALSE;
issop = FALSE;
cp = NULL; /* compiler happyness only */
cp2 = NULL; /* compiler happyness only */
/* determine type of pattern and remember important positions */
if (*pattern == 'm' && str_len(pattern) >= 3)
if ((cp = str_span(pattern, 0, "imsxob", STR_RIGHT)) > pattern+1)
if (*(pattern+1) == *cp)
ismop = TRUE;
if (!ismop)
if (*pattern == 's' && str_len(pattern) >= 4)
if ((cp = str_span(pattern, 0, "imsxo", STR_RIGHT)) > pattern+1)
if ((cb[0] = *cp, cb[1] = NUL,
cp2 = str_span(pattern, cp-pattern, cb, STR_RIGHT|STR_COMPLEMENT)) > pattern+1)
if (*(pattern+1) == *cp && *(pattern+1) == *cp2)
issop = TRUE;
/* finish parsing */
if (ismop) {
/* pattern is a match operation */
match_ptr = pattern + 2;
match_len = cp - match_ptr;
cp++;
for (i = 0; cp[i] != NUL; i++) {
switch (cp[i]) {
case 'i': match_opt |= PCRE_CASELESS; break;
case 'm': match_opt |= PCRE_MULTILINE; break;
case 's': match_opt |= PCRE_DOTALL; break;
case 'x': match_opt |= PCRE_EXTENDED; break;
case 'o': match_once = TRUE; break;
case 'b': match_1resbuf = TRUE; break;
default:
return -1;
}
}
}
else if (issop) {
/* pattern is a substitute operation */
match_ptr = pattern + 2;
match_len = cp2 - match_ptr;
subst_ptr = cp2 + 1;
subst_len = cp - subst_ptr;
cp++;
for (i = 0; cp[i] != NUL; i++) {
switch (cp[i]) {
case 'i': match_opt |= PCRE_CASELESS; break;
case 'm': match_opt |= PCRE_MULTILINE; break;
case 's': match_opt |= PCRE_DOTALL; break;
case 'x': match_opt |= PCRE_EXTENDED; break;
case 'o': match_once = TRUE; break;
default:
return -1;
}
}
}
else {
/* fallback: treat pattern as a match operation */
match_ptr = pattern;
match_len = str_len(pattern);
ismop = TRUE;
}
/*
* Compile pattern into internal PCRE structure
*/
if (match_once) {
/* optimized processing: up to factor 15(!) for complex regular expressions */
pattern_lookup(match_ptr, match_len, &p_pcre, &p_pcre_extra);
if (p_pcre == NULL) {
if (!pattern_compile(match_ptr, match_len, match_opt, &p_pcre, &p_pcre_extra))
return -1;
pattern_cache(match_ptr, match_len, p_pcre, p_pcre_extra);
}
}
else {
/* unoptimized processing */
p_pcre_extra = NULL;
if (!pattern_compile(match_ptr, match_len, match_opt, &p_pcre, NULL))
return -1;
}
/*
* Allocate storage for offset table of captured substrings
*/
cap_vec = NULL;
cap_len = 0;
cap_num = pcre_info(p_pcre, NULL, NULL);
if (cap_num > 0) {
cap_len = (cap_num+1)*3;
if ((cap_vec = (int *)malloc(cap_len*sizeof(int))) == NULL) {
if (p_pcre != NULL)
free(p_pcre);
if (p_pcre_extra != NULL)
free(p_pcre_extra);
return -1;
}
}
/*
* Perform the matching operation
*/
n = pcre_exec(p_pcre, p_pcre_extra, string, str_len(string), 0, 0, cap_vec, cap_len);
if (n < 0) {
if (cap_vec != NULL)
free(cap_vec);
if (p_pcre != NULL)
free(p_pcre);
if (p_pcre_extra != NULL)
free(p_pcre_extra);
if (n == PCRE_ERROR_NOMATCH)
return 0;
return -1;
}
/*
* Create either matching or substitution result
*/
if (ismop && cap_num > 0) {
/*
* extract captured substrings into caller provided pointer variables
*/
if (match_1resbuf) {
/* use a single result buffer */
l = 0;
for (i = 1; i <= cap_num && i <= (n-1); i++) {
if (cap_vec[(i*2)] != -1 && cap_vec[(i*2)+1] != -1) {
k = (cap_vec[(i*2)+1] - cap_vec[(i*2)]);
if (k > 0)
l += k+1;
}
}
cpp = va_arg(ap, char **);
if (cpp == NULL)
cpp = &cp;
if ((*cpp = malloc(l)) != NULL) {
cp = *cpp;
for (i = 1; i <= cap_num; i++) {
cpp = va_arg(ap, char **);
if (cpp != NULL) {
if (i <= (n-1)) {
if (cap_vec[(i*2)] != -1 && cap_vec[(i*2)+1] != -1) {
k = (cap_vec[(i*2)+1] - cap_vec[(i*2)]);
if (k > 0) {
memcpy(cp, (char *)(string+cap_vec[(i*2)]), k);
cp += k;
*cp++ = NUL;
continue;
}
}
}
*cpp = cp;
*cp++ = NUL;
}
}
}
}
else {
/* use multiple result buffers */
for (i = 1; i <= cap_num; i++) {
cpp = va_arg(ap, char **);
if (cpp != NULL) {
if (i <= (n-1)) {
if (cap_vec[(i*2)] != -1 && cap_vec[(i*2)+1] != -1) {
k = (cap_vec[(i*2)+1] - cap_vec[(i*2)]);
if (k > 0) {
if ((*cpp = malloc(k+1)) != NULL) {
memcpy(*cpp, (char *)(string+cap_vec[(i*2)]), k);
(*cpp)[k] = NUL;
continue;
}
}
}
}
*cpp = strdup("");
}
}
}
}
else if (issop) {
/*
* create a substitutional string with optional expansions
*/
/* determine required buffer len */
l = 0;
for (cp = (char *)subst_ptr; cp < (subst_ptr+subst_len); cp++, l++) {
if (*cp == '$') {
if (!(cp > subst_ptr && *(cp-1) == '\\')) {
if (cp < (subst_ptr+subst_len-1) && str_isdigit(*(cp+1))) {
cp += 1;
l += 4;
}
}
}
}
l++; /* NUL char */
/* allocate temp buffer */
if (l <= sizeof(buf))
buf_ptr = buf;
else
buf_ptr = (char *)malloc(l);
/* copy subst string into temp buffer and replace $N with %{N}R */
for (cp = (char *)subst_ptr, cp2 = buf_ptr; cp < (subst_ptr+subst_len); ) {
if (*cp == '$') {
if (!(cp > subst_ptr && *(cp-1) == '\\')) {
if (cp < (subst_ptr+subst_len-1) && str_isdigit(*(cp+1))) {
*cp2++ = '%';
*cp2++ = '{';
*cp2++ = *(cp+1);
*cp2++ = '}';
*cp2++ = 'R';
cp += 2;
continue;
}
}
}
*cp2++ = *cp++;
}
*cp2 = NUL;
/* remove output argument from varargs */
cpp = va_arg(ap, char **);
/* calculate output buffer requirement */
sf.curpos = buf2;
sf.endpos = buf2 + sizeof(buf2) - 1;
sf.flush = str_parse_flush_nop;
sf.format = str_parse_format;
sf.data[0].p = buf2;
sf.data[1].i = sizeof(buf2);
sf.data[2].i = 0;
sf.data[3].p = (char *)string;
sf.data[4].p = cap_vec;
sf.data[5].i = cap_num;
va_copy(ap_temp, ap);
l = str_vformat(&sf, buf_ptr, ap_temp);
/* allocate output buffer */
if ((*cpp = (char *)malloc(l+1)) == NULL) {
if (cap_vec != NULL)
free(cap_vec);
if (p_pcre != NULL)
free(p_pcre);
if (p_pcre_extra != NULL)
free(p_pcre_extra);
return -1; /* XXX */
}
/* finally expand the substitutions string into output buffer */
sf.curpos = *cpp;
sf.endpos = *cpp + l;
sf.flush = str_parse_flush_str;
sf.format = str_parse_format;
sf.data[3].p = (char *)string;
sf.data[4].p = cap_vec;
sf.data[5].i = cap_num;
str_vformat(&sf, buf_ptr, ap);
*((*cpp)+l) = NUL;
/* free temp buffer */
if (buf_ptr != buf)
free(buf_ptr);
}
/* cleanup */
if (cap_vec != NULL)
free(cap_vec);
if (p_pcre != NULL)
free(p_pcre);
if (p_pcre_extra != NULL)
free(p_pcre_extra);
/* return success */
return 1;
}