1 files changed, 119 insertions, 72 deletions
diff --git a/usr.sbin/nginx/src/pcre/pcre_compile.c b/usr.sbin/nginx/src/pcre/pcre_compile.c
index 8070f510266..1ffa2c5fb43 100644
--- a/usr.sbin/nginx/src/pcre/pcre_compile.c
+++ b/usr.sbin/nginx/src/pcre/pcre_compile.c
@@ -489,6 +489,9 @@ static const char error_texts[] =
   "too many forward references\0"
   "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
   "invalid UTF-16 string\0"
+  /* 75 */
+  "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
+  "character value in \\u.... sequence is too large\0"
   ;
 
 /* Table to identify digits and hex digits. This is used when compiling
@@ -829,6 +832,18 @@ else
           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
 #endif
           }
+
+#ifdef COMPILE_PCRE8
+        if (c > (utf ? 0x10ffff : 0xff))
+#else
+#ifdef COMPILE_PCRE16
+        if (c > (utf ? 0x10ffff : 0xffff))
+#endif
+#endif
+          {
+          *errorcodeptr = ERR76;
+          }
+        else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
         }
       }
     else
@@ -2225,32 +2240,60 @@ for (;;)
       {
       case OP_CHAR:
       case OP_CHARI:
+      case OP_NOT:
+      case OP_NOTI:
       case OP_EXACT:
       case OP_EXACTI:
+      case OP_NOTEXACT:
+      case OP_NOTEXACTI:
       case OP_UPTO:
       case OP_UPTOI:
+      case OP_NOTUPTO:
+      case OP_NOTUPTOI:
       case OP_MINUPTO:
       case OP_MINUPTOI:
+      case OP_NOTMINUPTO:
+      case OP_NOTMINUPTOI:
       case OP_POSUPTO:
       case OP_POSUPTOI:
+      case OP_NOTPOSUPTO:
+      case OP_NOTPOSUPTOI:
       case OP_STAR:
       case OP_STARI:
+      case OP_NOTSTAR:
+      case OP_NOTSTARI:
       case OP_MINSTAR:
       case OP_MINSTARI:
+      case OP_NOTMINSTAR:
+      case OP_NOTMINSTARI:
       case OP_POSSTAR:
       case OP_POSSTARI:
+      case OP_NOTPOSSTAR:
+      case OP_NOTPOSSTARI:
       case OP_PLUS:
       case OP_PLUSI:
+      case OP_NOTPLUS:
+      case OP_NOTPLUSI:
       case OP_MINPLUS:
       case OP_MINPLUSI:
+      case OP_NOTMINPLUS:
+      case OP_NOTMINPLUSI:
       case OP_POSPLUS:
       case OP_POSPLUSI:
+      case OP_NOTPOSPLUS:
+      case OP_NOTPOSPLUSI:
       case OP_QUERY:
       case OP_QUERYI:
+      case OP_NOTQUERY:
+      case OP_NOTQUERYI:
       case OP_MINQUERY:
       case OP_MINQUERYI:
+      case OP_NOTMINQUERY:
+      case OP_NOTMINQUERYI:
       case OP_POSQUERY:
       case OP_POSQUERYI:
+      case OP_NOTPOSQUERY:
+      case OP_NOTPOSQUERYI:
       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
       break;
       }
@@ -3069,22 +3112,28 @@ if (next >= 0) switch(op_code)
 #endif  /* SUPPORT_UTF */
   return (c != TABLE_GET((unsigned int)next, cd->fcc, next));  /* Non-UTF-8 mode */
 
-  /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
-  opcodes are not used for multi-byte characters, because they are coded using
-  an XCLASS instead. */
-
   case OP_NOT:
-  return (c = *previous) == next;
+#ifdef SUPPORT_UTF
+  GETCHARTEST(c, previous);
+#else
+  c = *previous;
+#endif
+  return c == next;
 
   case OP_NOTI:
-  if ((c = *previous) == next) return TRUE;
+#ifdef SUPPORT_UTF
+  GETCHARTEST(c, previous);
+#else
+  c = *previous;
+#endif
+  if (c == next) return TRUE;
 #ifdef SUPPORT_UTF
   if (utf)
     {
     unsigned int othercase;
     if (next < 128) othercase = cd->fcc[next]; else
 #ifdef SUPPORT_UCP
-    othercase = UCD_OTHERCASE(next);
+    othercase = UCD_OTHERCASE((unsigned int)next);
 #else
     othercase = NOTACHAR;
 #endif
@@ -3092,28 +3141,28 @@ if (next >= 0) switch(op_code)
     }
   else
 #endif  /* SUPPORT_UTF */
-  return (c == (int)(TABLE_GET((unsigned int)next, cd->fcc, next)));  /* Non-UTF-8 mode */
+  return (c == TABLE_GET((unsigned int)next, cd->fcc, next));  /* Non-UTF-8 mode */
 
   /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
   When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
 
   case OP_DIGIT:
-  return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
+  return next > 255 || (cd->ctypes[next] & ctype_digit) == 0;
 
   case OP_NOT_DIGIT:
-  return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
+  return next <= 255 && (cd->ctypes[next] & ctype_digit) != 0;
 
   case OP_WHITESPACE:
-  return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
+  return next > 255 || (cd->ctypes[next] & ctype_space) == 0;
 
   case OP_NOT_WHITESPACE:
-  return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
+  return next <= 255 && (cd->ctypes[next] & ctype_space) != 0;
 
   case OP_WORDCHAR:
-  return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
+  return next > 255 || (cd->ctypes[next] & ctype_word) == 0;
 
   case OP_NOT_WORDCHAR:
-  return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
+  return next <= 255 && (cd->ctypes[next] & ctype_word) != 0;
 
   case OP_HSPACE:
   case OP_NOT_HSPACE:
@@ -3191,22 +3240,22 @@ switch(op_code)
   switch(-next)
     {
     case ESC_d:
-    return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
+    return c > 255 || (cd->ctypes[c] & ctype_digit) == 0;
 
     case ESC_D:
-    return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
+    return c <= 255 && (cd->ctypes[c] & ctype_digit) != 0;
 
     case ESC_s:
-    return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
+    return c > 255 || (cd->ctypes[c] & ctype_space) == 0;
 
     case ESC_S:
-    return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
+    return c <= 255 && (cd->ctypes[c] & ctype_space) != 0;
 
     case ESC_w:
-    return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
+    return c > 255 || (cd->ctypes[c] & ctype_word) == 0;
 
     case ESC_W:
-    return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
+    return c <= 255 && (cd->ctypes[c] & ctype_word) != 0;
 
     case ESC_h:
     case ESC_H:
@@ -3315,10 +3364,10 @@ switch(op_code)
   return next == -ESC_d;
 
   case OP_WHITESPACE:
-  return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;
+  return next == -ESC_S || next == -ESC_d || next == -ESC_w;
 
   case OP_NOT_WHITESPACE:
-  return next == -ESC_s || next == -ESC_h || next == -ESC_v;
+  return next == -ESC_s || next == -ESC_h || next == -ESC_v || next == -ESC_R;
 
   case OP_HSPACE:
   return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
@@ -4482,42 +4531,35 @@ for (;; ptr++)
       LONE_SINGLE_CHARACTER:
 
       /* Only the value of 1 matters for class_single_char. */
+
       if (class_single_char < 2) class_single_char++;
 
       /* If class_charcount is 1, we saw precisely one character. As long as
-      there were no negated characters >= 128 and there was no use of \p or \P,
-      in other words, no use of any XCLASS features, we can optimize.
-
-      In UTF-8 mode, we can optimize the negative case only if there were no
-      characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
-      operate on single-bytes characters only. This is an historical hangover.
-      Maybe one day we can tidy these opcodes to handle multi-byte characters.
+      there was no use of \p or \P, in other words, no use of any XCLASS
+      features, we can optimize.
 
       The optimization throws away the bit map. We turn the item into a
       1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
-      Note that OP_NOT[I] does not support multibyte characters. In the positive
-      case, it can cause firstchar to be set. Otherwise, there can be no first
-      char if this item is first, whatever repeat count may follow. In the case
-      of reqchar, save the previous value for reinstating. */
+      In the positive case, it can cause firstchar to be set. Otherwise, there
+      can be no first char if this item is first, whatever repeat count may
+      follow. In the case of reqchar, save the previous value for reinstating. */
 
-#ifdef SUPPORT_UTF
-      if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET
-        && (!utf || !negate_class || c < (MAX_VALUE_FOR_SINGLE_CHAR + 1)))
-#else
       if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
-#endif
         {
         ptr++;
         zeroreqchar = reqchar;
 
-        /* The OP_NOT[I] opcodes work on single characters only. */
-
         if (negate_class)
           {
           if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
           zerofirstchar = firstchar;
           *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
-          *code++ = c;
+#ifdef SUPPORT_UTF
+          if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
+            code += PRIV(ord2utf)(c, code);
+          else
+#endif
+            *code++ = c;
           goto NOT_CHAR;
           }
 
@@ -4775,15 +4817,23 @@ for (;; ptr++)
 
     /* Now handle repetition for the different types of item. */
 
-    /* If previous was a character match, abolish the item and generate a
-    repeat item instead. If a char item has a minumum of more than one, ensure
-    that it is set in reqchar - it might not be if a sequence such as x{3} is
-    the first thing in a branch because the x will have gone into firstchar
-    instead.  */
+    /* If previous was a character or negated character match, abolish the item
+    and generate a repeat item instead. If a non-negated char item has a
+    minimum of more than one, ensure that it is set in reqchar - it might not
+    be if a sequence such as x{3} is the first thing in a branch because the x
+    will have gone into firstchar instead. */
 
-    if (*previous == OP_CHAR || *previous == OP_CHARI)
+    if (*previous == OP_CHAR || *previous == OP_CHARI
+        || *previous == OP_NOT || *previous == OP_NOTI)
       {
-      op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR;
+      switch (*previous)
+        {
+        default: /* Make compiler happy. */
+        case OP_CHAR:  op_type = OP_STAR - OP_STAR; break;
+        case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
+        case OP_NOT:   op_type = OP_NOTSTAR - OP_STAR; break;
+        case OP_NOTI:  op_type = OP_NOTSTARI - OP_STAR; break;
+        }
 
       /* Deal with UTF characters that take up more than one character. It's
       easier to write this out separately than try to macrify it. Use c to
@@ -4806,7 +4856,8 @@ for (;; ptr++)
       with UTF disabled, or for a single character UTF character. */
         {
         c = code[-1];
-        if (repeat_min > 1) reqchar = c | req_caseopt | cd->req_varyopt;
+        if (*previous <= OP_CHARI && repeat_min > 1)
+          reqchar = c | req_caseopt | cd->req_varyopt;
         }
 
       /* If the repetition is unlimited, it pays to see if the next thing on
@@ -4825,26 +4876,6 @@ for (;; ptr++)
       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
       }
 
-    /* If previous was a single negated character ([^a] or similar), we use
-    one of the special opcodes, replacing it. The code is shared with single-
-    character repeats by setting opt_type to add a suitable offset into
-    repeat_type. We can also test for auto-possessification. OP_NOT and OP_NOTI
-    are currently used only for single-byte chars. */
-
-    else if (*previous == OP_NOT || *previous == OP_NOTI)
-      {
-      op_type = ((*previous == OP_NOT)? OP_NOTSTAR : OP_NOTSTARI) - OP_STAR;
-      c = previous[1];
-      if (!possessive_quantifier &&
-          repeat_max < 0 &&
-          check_auto_possessive(previous, utf, ptr + 1, options, cd))
-        {
-        repeat_type = 0;    /* Force greedy */
-        possessive_quantifier = TRUE;
-        }
-      goto OUTPUT_SINGLE_REPEAT;
-      }
-
     /* If previous was a character type match (\d or similar), abolish it and
     create a suitable repeat item. The code is shared with single-character
     repeats by setting op_type to add a suitable offset into repeat_type. Note
@@ -5585,6 +5616,11 @@ for (;; ptr++)
         arg = ++ptr;
         while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
         arglen = (int)(ptr - arg);
+        if (arglen > (int)MAX_MARK)
+          {
+          *errorcodeptr = ERR75;
+          goto FAILED;
+          }
         }
 
       if (*ptr != CHAR_RIGHT_PARENTHESIS)
@@ -6836,10 +6872,13 @@ for (;; ptr++)
       /* For the rest (including \X when Unicode properties are supported), we
       can obtain the OP value by negating the escape value in the default
       situation when PCRE_UCP is not set. When it *is* set, we substitute
-      Unicode property tests. */
+      Unicode property tests. Note that \b and \B do a one-character
+      lookbehind, so max_lookbehind is raised to 1 if it is still zero. */
 
       else
         {
+        if ((-c == ESC_b || -c == ESC_B) && cd->max_lookbehind == 0)
+          cd->max_lookbehind = 1;
 #ifdef SUPPORT_UCP
         if (-c >= ESC_DU && -c <= ESC_wu)
           {
@@ -7147,7 +7186,12 @@ for (;;)
         *ptrptr = ptr;
         return FALSE;
         }
-      else { PUT(reverse_count, 0, fixed_length); }
+      else
+        {
+        if (fixed_length > cd->max_lookbehind)
+          cd->max_lookbehind = fixed_length;
+        PUT(reverse_count, 0, fixed_length);
+        }
       }
     }
 
@@ -7817,6 +7861,7 @@ cd->start_pattern = (const pcre_uchar *)pattern;
 cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
 cd->req_varyopt = 0;
 cd->assert_depth = 0;
+cd->max_lookbehind = 0;
 cd->external_options = options;
 cd->external_flags = 0;
 cd->open_caps = NULL;
@@ -7867,7 +7912,6 @@ re->magic_number = MAGIC_NUMBER;
 re->size = (int)size;
 re->options = cd->external_options;
 re->flags = cd->external_flags;
-re->dummy1 = 0;
 re->first_char = 0;
 re->req_char = 0;
 re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar);
@@ -7887,6 +7931,7 @@ field; this time it's used for remembering forward references to subpatterns.
 cd->final_bracount = cd->bracount;  /* Save for checking forward references */
 cd->assert_depth = 0;
 cd->bracount = 0;
+cd->max_lookbehind = 0;
 cd->names_found = 0;
 cd->name_table = (pcre_uchar *)re + re->name_table_offset;
 codestart = cd->name_table + re->name_entry_size * re->name_count;
@@ -7908,6 +7953,7 @@ code = (pcre_uchar *)codestart;
   &firstchar, &reqchar, NULL, cd, NULL);
 re->top_bracket = cd->bracount;
 re->top_backref = cd->top_backref;
+re->max_lookbehind = cd->max_lookbehind;
 re->flags = cd->external_flags | PCRE_MODE;
 
 if (cd->had_accept) reqchar = REQ_NONE;   /* Must disable after (*ACCEPT) */
@@ -7995,6 +8041,7 @@ if (cd->check_lookbehind)
                     (fixed_length == -4)? ERR70 : ERR25;
         break;
         }
+      if (fixed_length > cd->max_lookbehind) cd->max_lookbehind = fixed_length;
       PUT(cc, 1, fixed_length);
       }
     cc += 1 + LINK_SIZE;