OSSP CVS Repository

ossp - Difference in ossp-pkg/pcre/pcre.c versions 1.7 and 1.8
Not logged in
[Honeypot]  [Browse]  [Home]  [Login]  [Reports]
[Search]  [Ticket]  [Timeline]
  [History]

ossp-pkg/pcre/pcre.c 1.7 -> 1.8

--- pcre.c       2000/08/29 19:24:17     1.7
+++ pcre.c       2002/01/07 15:21:06     1.8
@@ -9,7 +9,7 @@
 
 Written by: Philip Hazel <ph10@cam.ac.uk>
 
-           Copyright (c) 1997-2000 University of Cambridge
+           Copyright (c) 1997-2001 University of Cambridge
 
 -----------------------------------------------------------------------------
 Permission is granted to anyone to use this software for any purpose on any
@@ -60,8 +60,11 @@
 #endif
 
 
-/* Number of items on the nested bracket stacks at compile time. This should
-not be set greater than 200. */
+/* Maximum number of items on the nested bracket stacks at compile time. This
+applies to the nesting of all kinds of parentheses. It does not limit
+un-nested, non-capturing parentheses. This number can be made bigger if
+necessary - it is used to dimension one int and one unsigned char vector at
+compile time. */
 
 #define BRASTACK_SIZE 200
 
@@ -95,7 +98,7 @@
   "class", "Ref", "Recurse",
   "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
   "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
-  "Brazero", "Braminzero", "Bra"
+  "Brazero", "Braminzero", "Branumber", "Bra"
 };
 #endif
 
@@ -111,9 +114,9 @@
     0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
     0,      0,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
     0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
-  '`',      7, -ESC_b,      0, -ESC_d,     27,   '\f',      0,   /* ` - g */
-    0,      0,      0,      0,      0,      0,   '\n',      0,   /* h - o */
-    0,      0,   '\r', -ESC_s,   '\t',      0,      0, -ESC_w,   /* p - w */
+  '`',      7, -ESC_b,      0, -ESC_d,  ESC_E,  ESC_F,      0,   /* ` - g */
+    0,      0,      0,      0,      0,      0,  ESC_N,      0,   /* h - o */
+    0,      0,  ESC_R, -ESC_s,  ESC_T,      0,      0, -ESC_w,   /* p - w */
     0,      0, -ESC_z                                            /* x - z */
 };
 
@@ -208,12 +211,12 @@
   if (md->utf8 && (c & 0xc0) == 0xc0) \
     { \
     int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
-    int s = 6 - a;                  /* Amount to shift next byte */  \
-    c &= utf8_table3[a];            /* Low order bits from first byte */ \
+    int s = 6*a; \
+    c = (c & utf8_table3[a]) << s; \
     while (a-- > 0) \
       { \
+      s -= 6; \
       c |= (*eptr++ & 0x3f) << s; \
-      s += 6; \
       } \
     }
 
@@ -224,14 +227,14 @@
   len = 1; \
   if (md->utf8 && (c & 0xc0) == 0xc0) \
     { \
-    int _i; \
+    int i; \
     int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
-    int s = 6 - a;                  /* Amount to shift next byte */  \
-    c &= utf8_table3[a];            /* Low order bits from first byte */ \
-    for (_i = 1; _i <= a; _i++) \
+    int s = 6*a; \
+    c = (c & utf8_table3[a]) << s; \
+    for (i = 1; i <= a; i++) \
       { \
+      s -= 6; \
       c |= (eptr[i] & 0x3f) << s; \
-      s += 6; \
       } \
     len += a; \
     }
@@ -258,6 +261,7 @@
 #include "pcre_chartables.c"
 
 
+
 #ifdef SUPPORT_UTF8
 /*************************************************
 *           Tables for UTF-8 support             *
@@ -305,13 +309,13 @@
 register int i, j;
 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
   if (cvalue <= utf8_table1[i]) break;
-*buffer++ = utf8_table2[i] | (cvalue & utf8_table3[i]);
-cvalue >>= 6 - i;
-for (j = 0; j < i; j++)
-  {
-  *buffer++ = 0x80 | (cvalue & 0x3f);
-  cvalue >>= 6;
-  }
+buffer += i;
+for (j = i; j > 0; j--)
+ {
+ *buffer-- = 0x80 | (cvalue & 0x3f);
+ cvalue >>= 6;
+ }
+*buffer = utf8_table2[i] | cvalue;
 return i + 1;
 }
 #endif
@@ -813,10 +817,11 @@
     /* Skip over things that don't match chars */
 
     case OP_REVERSE:
+    case OP_BRANUMBER:
+    case OP_CREF:
     cc++;
     /* Fall through */
 
-    case OP_CREF:
     case OP_OPT:
     cc++;
     /* Fall through */
@@ -870,7 +875,7 @@
     /* Check a class for variable quantification */
 
     case OP_CLASS:
-    cc += (*cc == OP_REF)? 2 : 33;
+    cc += 33;
 
     switch (*cc)
       {
@@ -977,7 +982,7 @@
 
 Arguments:
   options      the option bits
-  brackets     points to number of brackets used
+  brackets     points to number of extracting brackets used
   code         points to the pointer to the current code point
   ptrptr       points to the current pattern pointer
   errorptr     points to pointer to error message
@@ -1028,7 +1033,7 @@
   int class_charcount;
   int class_lastchar;
   int newoptions;
-  int condref;
+  int skipbytes;
   int subreqchar;
 
   c = *ptr;
@@ -1577,7 +1582,7 @@
       OP_BRAZERO in front of it, and because the group appears once in the
       data, whereas in other cases it appears the minimum number of times. For
       this reason, it is simplest to treat this case separately, as otherwise
-      the code gets far too mess. There are several special subcases when the
+      the code gets far too messy. There are several special subcases when the
       minimum is zero. */
 
       if (repeat_min == 0)
@@ -1728,7 +1733,7 @@
 
     case '(':
     newoptions = options;
-    condref = -1;
+    skipbytes = 0;
 
     if (*(++ptr) == '?')
       {
@@ -1751,7 +1756,7 @@
         bravalue = OP_COND;       /* Conditional group */
         if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
           {
-          condref = *ptr - '0';
+          int condref = *ptr - '0';
           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
           if (condref == 0)
             {
@@ -1759,6 +1764,10 @@
             goto FAILED;
             }
           ptr++;
+          code[3] = OP_CREF;
+          code[4] = condref >> 8;
+          code[5] = condref & 255;
+          skipbytes = 3;
           }
         else ptr--;
         break;
@@ -1861,16 +1870,21 @@
         }
       }
 
-    /* Else we have a referencing group; adjust the opcode. */
+    /* Else we have a referencing group; adjust the opcode. If the bracket
+    number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
+    arrange for the true number to follow later, in an OP_BRANUMBER item. */
 
     else
       {
-      if (++(*brackets) > EXTRACT_MAX)
+      if (++(*brackets) > EXTRACT_BASIC_MAX)
         {
-        *errorptr = ERR13;
-        goto FAILED;
+        bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
+        code[3] = OP_BRANUMBER;
+        code[4] = *brackets >> 8;
+        code[5] = *brackets & 255;
+        skipbytes = 3;
         }
-      bravalue = OP_BRA + *brackets;
+      else bravalue = OP_BRA + *brackets;
       }
 
     /* Process nested bracketed re. Assertions may not be repeated, but other
@@ -1886,13 +1900,13 @@
          options | PCRE_INGROUP,       /* Set for all nested groups */
          ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
            newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
-         brackets,                     /* Bracket level */
+         brackets,                     /* Extracting bracket count */
          &tempcode,                    /* Where to put code (updated) */
          &ptr,                         /* Input pointer (updated) */
          errorptr,                     /* Where to put an error message */
          (bravalue == OP_ASSERTBACK ||
           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
-         condref,                      /* Condition reference number */
+         skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */
          &subreqchar,                  /* For possible last char */
          &subcountlits,                /* For literal count */
          cd))                          /* Tables block */
@@ -1906,7 +1920,7 @@
     /* If this is a conditional bracket, check that there are no more than
     two branches in the group. */
 
-    if (bravalue == OP_COND)
+    else if (bravalue == OP_COND)
       {
       uschar *tc = code;
       condcount = 0;
@@ -1973,9 +1987,11 @@
       {
       if (-c >= ESC_REF)
         {
+        int number = -c - ESC_REF;
         previous = code;
         *code++ = OP_REF;
-        *code++ = -c - ESC_REF;
+        *code++ = number >> 8;
+        *code++ = number & 255;
         }
       else
         {
@@ -2099,7 +2115,7 @@
   ptrptr      -> the address of the current pattern pointer
   errorptr    -> pointer to error message
   lookbehind  TRUE if this is a lookbehind assertion
-  condref     >= 0 for OPT_CREF setting at start of conditional group
+  skipbytes   skip this many bytes at start (for OP_COND, OP_BRANUMBER)
   reqchar     -> place to put the last required character, or a negative number
   countlits   -> place to put the shortest literal count of any branch
   cd          points to the data block with tables pointers
@@ -2109,7 +2125,7 @@
 
 static BOOL
 compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
-  const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
+  const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
   int *reqchar, int *countlits, compile_data *cd)
 {
 const uschar *ptr = *ptrptr;
@@ -2122,16 +2138,7 @@
 
 *reqchar = -1;
 *countlits = INT_MAX;
-code += 3;
-
-/* At the start of a reference-based conditional group, insert the reference
-number as an OP_CREF item. */
-
-if (condref >= 0)
-  {
-  *code++ = OP_CREF;
-  *code++ = condref;
-  }
+code += 3 + skipbytes;
 
 /* Loop for each alternative branch */
 
@@ -2283,7 +2290,8 @@
     break;
 
     case OP_CREF:
-    code += 2;
+    case OP_BRANUMBER:
+    code += 3;
     break;
 
     case OP_WORD_BOUNDARY:
@@ -2546,6 +2554,7 @@
   {
   int min, max;
   int class_charcount;
+  int bracket_length;
 
   if ((options & PCRE_EXTENDED) != 0)
     {
@@ -2580,7 +2589,7 @@
       }
     length++;
 
-    /* A back reference needs an additional char, plus either one or 5
+    /* A back reference needs an additional 2 bytes, plus either one or 5
     bytes for a repeat. We also need to keep the value of the highest
     back reference. */
 
@@ -2588,7 +2597,7 @@
       {
       int refnum = -c - ESC_REF;
       if (refnum > top_backref) top_backref = refnum;
-      length++;   /* For single back reference */
+      length += 2;   /* For single back reference */
       if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
         {
         ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
@@ -2686,6 +2695,7 @@
 
     case '(':
     branch_newextra = 0;
+    bracket_length = 3;
 
     /* Handle special forms of bracket, which all start (? */
 
@@ -2753,7 +2763,7 @@
         if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
           {
           ptr += 4;
-          length += 2;
+          length += 3;
           while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
           if (*ptr != ')')
             {
@@ -2880,15 +2890,19 @@
       }
 
     /* Extracting brackets must be counted so we can process escapes in a
-    Perlish way. */
+    Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
+    need an additional 3 bytes of store per extracting bracket. */
 
-    else bracount++;
+    else
+      {
+      bracount++;
+      if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
+      }
 
-    /* Non-special forms of bracket. Save length for computing whole length
-    at end if there's a repeat that requires duplication of the group. Also
-    save the current value of branch_extra, and start the new group with
-    the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3
-    for a lookbehind assertion. */
+    /* Save length for computing whole length at end if there's a repeat that
+    requires duplication of the group. Also save the current value of
+    branch_extra, and start the new group with the new value. If non-zero, this
+    will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
 
     if (brastackptr >= sizeof(brastack)/sizeof(int))
       {
@@ -2900,7 +2914,7 @@
     branch_extra = branch_newextra;
 
     brastack[brastackptr++] = length;
-    length += 3;
+    length += bracket_length;
     continue;
 
     /* Handle ket. Look for subsequent max/min; for certain sets of values we
@@ -3061,7 +3075,7 @@
 code = re->code;
 *code = OP_BRA;
 bracount = 0;
-(void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
+(void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, 0,
   &reqchar, &countlits, &compile_block);
 re->top_bracket = bracount;
 re->top_backref = top_backref;
@@ -3175,7 +3189,10 @@
 
   if (*code >= OP_BRA)
     {
-    printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
+    if (*code - OP_BRA > EXTRACT_BASIC_MAX)
+      printf("%3d Bra extra", (code[1] << 8) + code[2]);
+    else
+      printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
     code += 2;
     }
 
@@ -3186,16 +3203,6 @@
     code++;
     break;
 
-    case OP_COND:
-    printf("%3d Cond", (code[1] << 8) + code[2]);
-    code += 2;
-    break;
-
-    case OP_CREF:
-    printf(" %.2d %s", code[1], OP_names[*code]);
-    code++;
-    break;
-
     case OP_CHARS:
     charlength = *(++code);
     printf("%3d ", charlength);
@@ -3212,11 +3219,10 @@
     case OP_ASSERTBACK:
     case OP_ASSERTBACK_NOT:
     case OP_ONCE:
-    printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
-    code += 2;
-    break;
-
     case OP_REVERSE:
+    case OP_BRANUMBER:
+    case OP_COND:
+    case OP_CREF:
     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
     code += 2;
     break;
@@ -3289,8 +3295,8 @@
     break;
 
     case OP_REF:
-    printf("    \\%d", *(++code));
-    code ++;
+    printf("    \\%d", (code[1] << 8) | code[2]);
+    code += 3;
     goto CLASS_REF_REPEAT;
 
     case OP_CLASS:
@@ -3503,8 +3509,14 @@
 
   if (op > OP_BRA)
     {
+    int offset;
     int number = op - OP_BRA;
-    int offset = number << 1;
+
+    /* For extended extraction brackets (large number), we have to fish out the
+    number from a dummy opcode at the start. */
+
+    if (number > EXTRACT_BASIC_MAX) number = (ecode[4] << 8) | ecode[5];
+    offset = number << 1;
 
 #ifdef DEBUG
     printf("start bracket %d subject=", number);
@@ -3534,6 +3546,7 @@
       md->offset_vector[offset] = save_offset1;
       md->offset_vector[offset+1] = save_offset2;
       md->offset_vector[md->offset_end - number] = save_offset3;
+
       return FALSE;
       }
 
@@ -3566,10 +3579,10 @@
     case OP_COND:
     if (ecode[3] == OP_CREF)         /* Condition is extraction test */
       {
-      int offset = ecode[4] << 1;    /* Doubled reference number */
+      int offset = (ecode[4] << 9) | (ecode[5] << 1); /* Doubled ref number */
       return match(eptr,
         ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
-          5 : 3 + (ecode[1] << 8) + ecode[2]),
+          6 : 3 + (ecode[1] << 8) + ecode[2]),
         offset_top, md, ims, eptrb, match_isgroup);
       }
 
@@ -3589,10 +3602,12 @@
       }
     /* Control never reaches here */
 
-    /* Skip over conditional reference data if encountered (should not be) */
+    /* Skip over conditional reference or large extraction number data if
+    encountered. */
 
     case OP_CREF:
-    ecode += 2;
+    case OP_BRANUMBER:
+    ecode += 3;
     break;
 
     /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
@@ -3858,8 +3873,14 @@
 
       if (*prev != OP_COND)
         {
+        int offset;
         int number = *prev - OP_BRA;
-        int offset = number << 1;
+
+        /* For extended extraction brackets (large number), we have to fish out
+        the number from a dummy opcode at the start. */
+
+        if (number > EXTRACT_BASIC_MAX) number = (prev[4] << 8) | prev[5];
+        offset = number << 1;
 
 #ifdef DEBUG
         printf("end bracket %d", number);
@@ -4053,8 +4074,8 @@
     case OP_REF:
       {
       int length;
-      int offset = ecode[1] << 1;                /* Doubled reference number */
-      ecode += 2;                                /* Advance past the item */
+      int offset = (ecode[1] << 9) | (ecode[2] << 1); /* Doubled ref number */
+      ecode += 3;                                     /* Advance past item */
 
       /* If the reference is unset, set the length to be longer than the amount
       of subject left; this ensures that every attempt at a match fails. We
@@ -4878,8 +4899,8 @@
 const real_pcre *re = (const real_pcre *)external_re;
 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
 BOOL using_temporary_offsets = FALSE;
-BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
-BOOL startline = (re->options & PCRE_STARTLINE) != 0;
+BOOL anchored;
+BOOL startline;
 
 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
 
@@ -4887,6 +4908,9 @@
    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
 
+anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
+startline = (re->options & PCRE_STARTLINE) != 0;
+
 match_block.start_pattern = re->code;
 match_block.start_subject = (const uschar *)subject;
 match_block.end_subject = match_block.start_subject + length;
@@ -5120,7 +5144,7 @@
 
   rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
 
-  if (match_block.offset_end < 2) rc = 0; else
+  if (offsetcount < 2) rc = 0; else
     {
     offsets[0] = start_match - match_block.start_subject;
     offsets[1] = match_block.end_match_ptr - match_block.start_subject;

CVSTrac 2.0.1