diff options
| author | 2012-02-18 11:28:28 +0000 | |
|---|---|---|
| committer | 2012-02-18 11:28:28 +0000 | |
| commit | 4c64784e1754a12bcee2849f2480afe311febcce (patch) | |
| tree | 2ea4aa1a762ff8e0460b68efef87581dc37a6aee /usr.sbin/nginx/src | |
| parent | update to 1.0.12 (diff) | |
| download | wireguard-openbsd-4c64784e1754a12bcee2849f2480afe311febcce.tar.xz wireguard-openbsd-4c64784e1754a12bcee2849f2480afe311febcce.zip | |
update pcre to 8.30 to be in sync with ports
Diffstat (limited to 'usr.sbin/nginx/src')
| -rw-r--r-- | usr.sbin/nginx/src/pcre/config.h | 24 | ||||
| -rw-r--r-- | usr.sbin/nginx/src/pcre/pcre.h | 203 | ||||
| -rw-r--r-- | usr.sbin/nginx/src/pcre/pcre_chartables.c | 2 | ||||
| -rw-r--r-- | usr.sbin/nginx/src/pcre/pcre_compile.c | 1980 | ||||
| -rw-r--r-- | usr.sbin/nginx/src/pcre/pcre_exec.c | 1058 | ||||
| -rw-r--r-- | usr.sbin/nginx/src/pcre/pcre_fullinfo.c | 47 | ||||
| -rw-r--r-- | usr.sbin/nginx/src/pcre/pcre_globals.c | 22 | ||||
| -rw-r--r-- | usr.sbin/nginx/src/pcre/pcre_internal.h | 775 | ||||
| -rw-r--r-- | usr.sbin/nginx/src/pcre/pcre_newline.c | 50 | ||||
| -rw-r--r-- | usr.sbin/nginx/src/pcre/pcre_ord2utf8.c | 30 | ||||
| -rw-r--r-- | usr.sbin/nginx/src/pcre/pcre_tables.c | 50 | ||||
| -rw-r--r-- | usr.sbin/nginx/src/pcre/pcre_try_flipped.c | 139 | ||||
| -rw-r--r-- | usr.sbin/nginx/src/pcre/pcre_ucd.c | 16 | ||||
| -rw-r--r-- | usr.sbin/nginx/src/pcre/pcre_valid_utf8.c | 12 | ||||
| -rw-r--r-- | usr.sbin/nginx/src/pcre/pcre_xclass.c | 54 |
15 files changed, 2756 insertions, 1706 deletions
diff --git a/usr.sbin/nginx/src/pcre/config.h b/usr.sbin/nginx/src/pcre/config.h index b55d68b23b5..0bb73f32e01 100644 --- a/usr.sbin/nginx/src/pcre/config.h +++ b/usr.sbin/nginx/src/pcre/config.h @@ -31,8 +31,8 @@ them both to 0; an emulation function will be used. */ character codes, define this macro as 1. On systems that can use "configure", this can be done via --enable-ebcdic. PCRE will then assume that all input strings are in EBCDIC. If you do not define this macro, PCRE - will assume input strings are ASCII or UTF-8 Unicode. It is not possible to - build a version of PCRE that supports both EBCDIC and UTF-8. */ + will assume input strings are ASCII or UTF-8/16 Unicode. It is not possible + to build a version of PCRE that supports both EBCDIC and UTF-8/16. */ /* #undef EBCDIC */ /* Define to 1 if you have the `bcopy' function. */ @@ -190,7 +190,7 @@ them both to 0; an emulation function will be used. */ #define PACKAGE_NAME "PCRE" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "PCRE 8.20" +#define PACKAGE_STRING "PCRE 8.30" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "pcre" @@ -199,7 +199,7 @@ them both to 0; an emulation function will be used. */ #define PACKAGE_URL "" /* Define to the version of this package. */ -#define PACKAGE_VERSION "8.20" +#define PACKAGE_VERSION "8.30" /* The value of PCREGREP_BUFSIZE determines the size of buffer used by pcregrep to hold parts of the file it is searching. On systems that support @@ -251,20 +251,26 @@ them both to 0; an emulation function will be used. */ handle .gz files. */ /* #undef SUPPORT_LIBZ */ +/* Define to enable the 16 bit PCRE library. */ +#define SUPPORT_PCRE16 /**/ + +/* Define to enable the 8 bit PCRE library. */ +#define SUPPORT_PCRE8 /**/ + /* Define to enable JIT support in pcregrep. */ /* #undef SUPPORT_PCREGREP_JIT */ /* Define to enable support for Unicode properties. 
*/ #define SUPPORT_UCP /**/ -/* Define to enable support for the UTF-8 Unicode encoding. This will work +/* Define to enable support for the UTF-8/16 Unicode encoding. This will work even in an EBCDIC environment, but it is incompatible with the EBCDIC - macro. That is, PCRE can support *either* EBCDIC code *or* ASCII/UTF-8, but - not both at once. */ -#define SUPPORT_UTF8 /**/ + macro. That is, PCRE can support *either* EBCDIC code *or* ASCII/UTF-8/16, + but not both at once. */ +#define SUPPORT_UTF /**/ /* Version number of package */ -#define VERSION "8.20" +#define VERSION "8.30" /* Define to empty if `const' does not conform to ANSI C. */ /* #undef const */ diff --git a/usr.sbin/nginx/src/pcre/pcre.h b/usr.sbin/nginx/src/pcre/pcre.h index 58ea327e9b2..712bd3d714b 100644 --- a/usr.sbin/nginx/src/pcre/pcre.h +++ b/usr.sbin/nginx/src/pcre/pcre.h @@ -5,7 +5,7 @@ /* This is the public header file for the PCRE library, to be #included by applications that call the PCRE functions. - Copyright (c) 1997-2011 University of Cambridge + Copyright (c) 1997-2012 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -42,9 +42,9 @@ POSSIBILITY OF SUCH DAMAGE. /* The current PCRE version information. */ #define PCRE_MAJOR 8 -#define PCRE_MINOR 21 +#define PCRE_MINOR 30 #define PCRE_PRERELEASE -#define PCRE_DATE 2011-12-12 +#define PCRE_DATE 2012-02-04 /* When an application links to a PCRE DLL in Windows, the symbols that are imported have to be identified as such. When building PCRE, the appropriate @@ -116,9 +116,13 @@ compiling). 
*/ #define PCRE_NOTEOL 0x00000100 /* Exec, DFA exec */ #define PCRE_UNGREEDY 0x00000200 /* Compile */ #define PCRE_NOTEMPTY 0x00000400 /* Exec, DFA exec */ -#define PCRE_UTF8 0x00000800 /* Compile, used in exec, DFA exec */ +/* The next two are also used in exec and DFA exec */ +#define PCRE_UTF8 0x00000800 /* Compile (same as PCRE_UTF16) */ +#define PCRE_UTF16 0x00000800 /* Compile (same as PCRE_UTF8) */ #define PCRE_NO_AUTO_CAPTURE 0x00001000 /* Compile */ -#define PCRE_NO_UTF8_CHECK 0x00002000 /* Compile, exec, DFA exec */ +/* The next two are also used in exec and DFA exec */ +#define PCRE_NO_UTF8_CHECK 0x00002000 /* Compile (same as PCRE_NO_UTF16_CHECK) */ +#define PCRE_NO_UTF16_CHECK 0x00002000 /* Compile (same as PCRE_NO_UTF8_CHECK) */ #define PCRE_AUTO_CALLOUT 0x00004000 /* Compile */ #define PCRE_PARTIAL_SOFT 0x00008000 /* Exec, DFA exec */ #define PCRE_PARTIAL 0x00008000 /* Backwards compatible synonym */ @@ -142,34 +146,39 @@ compiling). */ /* Exec-time and get/set-time error codes */ -#define PCRE_ERROR_NOMATCH (-1) -#define PCRE_ERROR_NULL (-2) -#define PCRE_ERROR_BADOPTION (-3) -#define PCRE_ERROR_BADMAGIC (-4) -#define PCRE_ERROR_UNKNOWN_OPCODE (-5) -#define PCRE_ERROR_UNKNOWN_NODE (-5) /* For backward compatibility */ -#define PCRE_ERROR_NOMEMORY (-6) -#define PCRE_ERROR_NOSUBSTRING (-7) -#define PCRE_ERROR_MATCHLIMIT (-8) -#define PCRE_ERROR_CALLOUT (-9) /* Never used by PCRE itself */ -#define PCRE_ERROR_BADUTF8 (-10) -#define PCRE_ERROR_BADUTF8_OFFSET (-11) -#define PCRE_ERROR_PARTIAL (-12) -#define PCRE_ERROR_BADPARTIAL (-13) -#define PCRE_ERROR_INTERNAL (-14) -#define PCRE_ERROR_BADCOUNT (-15) -#define PCRE_ERROR_DFA_UITEM (-16) -#define PCRE_ERROR_DFA_UCOND (-17) -#define PCRE_ERROR_DFA_UMLIMIT (-18) -#define PCRE_ERROR_DFA_WSSIZE (-19) -#define PCRE_ERROR_DFA_RECURSE (-20) -#define PCRE_ERROR_RECURSIONLIMIT (-21) -#define PCRE_ERROR_NULLWSLIMIT (-22) /* No longer actually used */ -#define PCRE_ERROR_BADNEWLINE (-23) -#define 
PCRE_ERROR_BADOFFSET (-24) -#define PCRE_ERROR_SHORTUTF8 (-25) -#define PCRE_ERROR_RECURSELOOP (-26) -#define PCRE_ERROR_JIT_STACKLIMIT (-27) +#define PCRE_ERROR_NOMATCH (-1) +#define PCRE_ERROR_NULL (-2) +#define PCRE_ERROR_BADOPTION (-3) +#define PCRE_ERROR_BADMAGIC (-4) +#define PCRE_ERROR_UNKNOWN_OPCODE (-5) +#define PCRE_ERROR_UNKNOWN_NODE (-5) /* For backward compatibility */ +#define PCRE_ERROR_NOMEMORY (-6) +#define PCRE_ERROR_NOSUBSTRING (-7) +#define PCRE_ERROR_MATCHLIMIT (-8) +#define PCRE_ERROR_CALLOUT (-9) /* Never used by PCRE itself */ +#define PCRE_ERROR_BADUTF8 (-10) /* Same for 8/16 */ +#define PCRE_ERROR_BADUTF16 (-10) /* Same for 8/16 */ +#define PCRE_ERROR_BADUTF8_OFFSET (-11) /* Same for 8/16 */ +#define PCRE_ERROR_BADUTF16_OFFSET (-11) /* Same for 8/16 */ +#define PCRE_ERROR_PARTIAL (-12) +#define PCRE_ERROR_BADPARTIAL (-13) +#define PCRE_ERROR_INTERNAL (-14) +#define PCRE_ERROR_BADCOUNT (-15) +#define PCRE_ERROR_DFA_UITEM (-16) +#define PCRE_ERROR_DFA_UCOND (-17) +#define PCRE_ERROR_DFA_UMLIMIT (-18) +#define PCRE_ERROR_DFA_WSSIZE (-19) +#define PCRE_ERROR_DFA_RECURSE (-20) +#define PCRE_ERROR_RECURSIONLIMIT (-21) +#define PCRE_ERROR_NULLWSLIMIT (-22) /* No longer actually used */ +#define PCRE_ERROR_BADNEWLINE (-23) +#define PCRE_ERROR_BADOFFSET (-24) +#define PCRE_ERROR_SHORTUTF8 (-25) +#define PCRE_ERROR_SHORTUTF16 (-25) /* Same for 8/16 */ +#define PCRE_ERROR_RECURSELOOP (-26) +#define PCRE_ERROR_JIT_STACKLIMIT (-27) +#define PCRE_ERROR_BADMODE (-28) +#define PCRE_ERROR_BADENDIANNESS (-29) /* Specific error codes for UTF-8 validity checks */ @@ -196,6 +205,14 @@ compiling). */ #define PCRE_UTF8_ERR20 20 #define PCRE_UTF8_ERR21 21 +/* Specific error codes for UTF-16 validity checks */ + +#define PCRE_UTF16_ERR0 0 +#define PCRE_UTF16_ERR1 1 +#define PCRE_UTF16_ERR2 2 +#define PCRE_UTF16_ERR3 3 +#define PCRE_UTF16_ERR4 4 + /* Request types for pcre_fullinfo() */ #define PCRE_INFO_OPTIONS 0 @@ -231,13 +248,15 @@ compatible. 
*/ #define PCRE_CONFIG_MATCH_LIMIT_RECURSION 7 #define PCRE_CONFIG_BSR 8 #define PCRE_CONFIG_JIT 9 +#define PCRE_CONFIG_UTF16 10 +#define PCRE_CONFIG_JITTARGET 11 /* Request types for pcre_study(). Do not re-arrange, in order to remain compatible. */ #define PCRE_STUDY_JIT_COMPILE 0x0001 -/* Bit flags for the pcre_extra structure. Do not re-arrange or redefine +/* Bit flags for the pcre[16]_extra structure. Do not re-arrange or redefine these bits, just add new ones on the end, in order to remain compatible. */ #define PCRE_EXTRA_STUDY_DATA 0x0001 @@ -253,9 +272,26 @@ these bits, just add new ones on the end, in order to remain compatible. */ struct real_pcre; /* declaration; the definition is private */ typedef struct real_pcre pcre; +struct real_pcre16; /* declaration; the definition is private */ +typedef struct real_pcre16 pcre16; + struct real_pcre_jit_stack; /* declaration; the definition is private */ typedef struct real_pcre_jit_stack pcre_jit_stack; +struct real_pcre16_jit_stack; /* declaration; the definition is private */ +typedef struct real_pcre16_jit_stack pcre16_jit_stack; + +/* If PCRE is compiled with 16 bit character support, PCRE_UCHAR16 must contain +a 16 bit wide signed data type. Otherwise it can be a dummy data type since +pcre16 functions are not implemented. There is a check for this in pcre_internal.h. */ +#ifndef PCRE_UCHAR16 +#define PCRE_UCHAR16 unsigned short +#endif + +#ifndef PCRE_SPTR16 +#define PCRE_SPTR16 const PCRE_UCHAR16 * +#endif + /* When PCRE is compiled as a C++ library, the subject pointer type can be replaced with a custom type. For conventional use, the public interface is a const char *. */ @@ -279,6 +315,19 @@ typedef struct pcre_extra { void *executable_jit; /* Contains a pointer to a compiled jit code */ } pcre_extra; +/* Same structure as above, but with 16 bit char pointers. 
*/ + +typedef struct pcre16_extra { + unsigned long int flags; /* Bits for which fields are set */ + void *study_data; /* Opaque data from pcre_study() */ + unsigned long int match_limit; /* Maximum number of calls to match() */ + void *callout_data; /* Data passed back in callouts */ + const unsigned char *tables; /* Pointer to character tables */ + unsigned long int match_limit_recursion; /* Max recursive calls to match() */ + PCRE_UCHAR16 **mark; /* For passing back a mark pointer */ + void *executable_jit; /* Contains a pointer to a compiled jit code */ +} pcre16_extra; + /* The structure for passing out data via the pcre_callout_function. We use a structure so that new fields can be added on the end in future versions, without changing the API of the function, thereby allowing old clients to work @@ -304,6 +353,28 @@ typedef struct pcre_callout_block { /* ------------------------------------------------------------------ */ } pcre_callout_block; +/* Same structure as above, but with 16 bit char pointers. 
*/ + +typedef struct pcre16_callout_block { + int version; /* Identifies version of block */ + /* ------------------------ Version 0 ------------------------------- */ + int callout_number; /* Number compiled into pattern */ + int *offset_vector; /* The offset vector */ + PCRE_SPTR16 subject; /* The subject being matched */ + int subject_length; /* The length of the subject */ + int start_match; /* Offset to start of this match attempt */ + int current_position; /* Where we currently are in the subject */ + int capture_top; /* Max current capture */ + int capture_last; /* Most recently closed capture */ + void *callout_data; /* Data passed in with the call */ + /* ------------------- Added for Version 1 -------------------------- */ + int pattern_position; /* Offset to next item in the pattern */ + int next_item_length; /* Length of next item in the pattern */ + /* ------------------- Added for Version 2 -------------------------- */ + const PCRE_UCHAR16 *mark; /* Pointer to current mark or NULL */ + /* ------------------------------------------------------------------ */ +} pcre16_callout_block; + /* Indirection for store get and free functions. These can be set to alternative malloc/free functions if required. Special ones are used in the non-recursive case for "frames". 
There is also an optional callout function @@ -316,58 +387,114 @@ PCRE_EXP_DECL void (*pcre_free)(void *); PCRE_EXP_DECL void *(*pcre_stack_malloc)(size_t); PCRE_EXP_DECL void (*pcre_stack_free)(void *); PCRE_EXP_DECL int (*pcre_callout)(pcre_callout_block *); + +PCRE_EXP_DECL void *(*pcre16_malloc)(size_t); +PCRE_EXP_DECL void (*pcre16_free)(void *); +PCRE_EXP_DECL void *(*pcre16_stack_malloc)(size_t); +PCRE_EXP_DECL void (*pcre16_stack_free)(void *); +PCRE_EXP_DECL int (*pcre16_callout)(pcre16_callout_block *); #else /* VPCOMPAT */ PCRE_EXP_DECL void *pcre_malloc(size_t); PCRE_EXP_DECL void pcre_free(void *); PCRE_EXP_DECL void *pcre_stack_malloc(size_t); PCRE_EXP_DECL void pcre_stack_free(void *); PCRE_EXP_DECL int pcre_callout(pcre_callout_block *); + +PCRE_EXP_DECL void *pcre16_malloc(size_t); +PCRE_EXP_DECL void pcre16_free(void *); +PCRE_EXP_DECL void *pcre16_stack_malloc(size_t); +PCRE_EXP_DECL void pcre16_stack_free(void *); +PCRE_EXP_DECL int pcre16_callout(pcre16_callout_block *); #endif /* VPCOMPAT */ /* User defined callback which provides a stack just before the match starts. 
*/ typedef pcre_jit_stack *(*pcre_jit_callback)(void *); +typedef pcre16_jit_stack *(*pcre16_jit_callback)(void *); /* Exported PCRE functions */ PCRE_EXP_DECL pcre *pcre_compile(const char *, int, const char **, int *, const unsigned char *); +PCRE_EXP_DECL pcre16 *pcre16_compile(PCRE_SPTR16, int, const char **, int *, + const unsigned char *); PCRE_EXP_DECL pcre *pcre_compile2(const char *, int, int *, const char **, int *, const unsigned char *); +PCRE_EXP_DECL pcre16 *pcre16_compile2(PCRE_SPTR16, int, int *, const char **, + int *, const unsigned char *); PCRE_EXP_DECL int pcre_config(int, void *); +PCRE_EXP_DECL int pcre16_config(int, void *); PCRE_EXP_DECL int pcre_copy_named_substring(const pcre *, const char *, int *, int, const char *, char *, int); -PCRE_EXP_DECL int pcre_copy_substring(const char *, int *, int, int, char *, - int); +PCRE_EXP_DECL int pcre16_copy_named_substring(const pcre16 *, PCRE_SPTR16, + int *, int, PCRE_SPTR16, PCRE_UCHAR16 *, int); +PCRE_EXP_DECL int pcre_copy_substring(const char *, int *, int, int, + char *, int); +PCRE_EXP_DECL int pcre16_copy_substring(PCRE_SPTR16, int *, int, int, + PCRE_UCHAR16 *, int); PCRE_EXP_DECL int pcre_dfa_exec(const pcre *, const pcre_extra *, const char *, int, int, int, int *, int , int *, int); +PCRE_EXP_DECL int pcre16_dfa_exec(const pcre16 *, const pcre16_extra *, + PCRE_SPTR16, int, int, int, int *, int , int *, int); PCRE_EXP_DECL int pcre_exec(const pcre *, const pcre_extra *, PCRE_SPTR, int, int, int, int *, int); +PCRE_EXP_DECL int pcre16_exec(const pcre16 *, const pcre16_extra *, + PCRE_SPTR16, int, int, int, int *, int); PCRE_EXP_DECL void pcre_free_substring(const char *); +PCRE_EXP_DECL void pcre16_free_substring(PCRE_SPTR16); PCRE_EXP_DECL void pcre_free_substring_list(const char **); +PCRE_EXP_DECL void pcre16_free_substring_list(PCRE_SPTR16 *); PCRE_EXP_DECL int pcre_fullinfo(const pcre *, const pcre_extra *, int, void *); +PCRE_EXP_DECL int pcre16_fullinfo(const pcre16 *, const 
pcre16_extra *, int, + void *); PCRE_EXP_DECL int pcre_get_named_substring(const pcre *, const char *, int *, int, const char *, const char **); +PCRE_EXP_DECL int pcre16_get_named_substring(const pcre16 *, PCRE_SPTR16, + int *, int, PCRE_SPTR16, PCRE_SPTR16 *); PCRE_EXP_DECL int pcre_get_stringnumber(const pcre *, const char *); +PCRE_EXP_DECL int pcre16_get_stringnumber(const pcre16 *, PCRE_SPTR16); PCRE_EXP_DECL int pcre_get_stringtable_entries(const pcre *, const char *, char **, char **); +PCRE_EXP_DECL int pcre16_get_stringtable_entries(const pcre16 *, PCRE_SPTR16, + PCRE_UCHAR16 **, PCRE_UCHAR16 **); PCRE_EXP_DECL int pcre_get_substring(const char *, int *, int, int, const char **); +PCRE_EXP_DECL int pcre16_get_substring(PCRE_SPTR16, int *, int, int, + PCRE_SPTR16 *); PCRE_EXP_DECL int pcre_get_substring_list(const char *, int *, int, const char ***); -PCRE_EXP_DECL int pcre_info(const pcre *, int *, int *); +PCRE_EXP_DECL int pcre16_get_substring_list(PCRE_SPTR16, int *, int, + PCRE_SPTR16 **); PCRE_EXP_DECL const unsigned char *pcre_maketables(void); +PCRE_EXP_DECL const unsigned char *pcre16_maketables(void); PCRE_EXP_DECL int pcre_refcount(pcre *, int); +PCRE_EXP_DECL int pcre16_refcount(pcre16 *, int); PCRE_EXP_DECL pcre_extra *pcre_study(const pcre *, int, const char **); +PCRE_EXP_DECL pcre16_extra *pcre16_study(const pcre16 *, int, const char **); PCRE_EXP_DECL void pcre_free_study(pcre_extra *); +PCRE_EXP_DECL void pcre16_free_study(pcre16_extra *); PCRE_EXP_DECL const char *pcre_version(void); +PCRE_EXP_DECL const char *pcre16_version(void); + +/* Utility functions for byte order swaps. */ +PCRE_EXP_DECL int pcre_pattern_to_host_byte_order(pcre *, pcre_extra *, + const unsigned char *); +PCRE_EXP_DECL int pcre16_pattern_to_host_byte_order(pcre16 *, pcre16_extra *, + const unsigned char *); +PCRE_EXP_DECL int pcre16_utf16_to_host_byte_order(PCRE_UCHAR16 *, + PCRE_SPTR16, int, int *, int); /* JIT compiler related functions. 
*/ PCRE_EXP_DECL pcre_jit_stack *pcre_jit_stack_alloc(int, int); +PCRE_EXP_DECL pcre16_jit_stack *pcre16_jit_stack_alloc(int, int); PCRE_EXP_DECL void pcre_jit_stack_free(pcre_jit_stack *); -PCRE_EXP_DECL void pcre_assign_jit_stack(pcre_extra *, pcre_jit_callback, void *); +PCRE_EXP_DECL void pcre16_jit_stack_free(pcre16_jit_stack *); +PCRE_EXP_DECL void pcre_assign_jit_stack(pcre_extra *, + pcre_jit_callback, void *); +PCRE_EXP_DECL void pcre16_assign_jit_stack(pcre16_extra *, + pcre16_jit_callback, void *); #ifdef __cplusplus } /* extern "C" */ diff --git a/usr.sbin/nginx/src/pcre/pcre_chartables.c b/usr.sbin/nginx/src/pcre/pcre_chartables.c index 9117ae3c7fa..2a39e9ff33a 100644 --- a/usr.sbin/nginx/src/pcre/pcre_chartables.c +++ b/usr.sbin/nginx/src/pcre/pcre_chartables.c @@ -26,7 +26,7 @@ unit might reference this" and so it will always be supplied to the linker. */ #include "pcre_internal.h" -const unsigned char _pcre_default_tables[] = { +const pcre_uint8 PRIV(default_tables)[] = { /* This table is a lower casing table. */ diff --git a/usr.sbin/nginx/src/pcre/pcre_compile.c b/usr.sbin/nginx/src/pcre/pcre_compile.c index d3da5f62109..8070f510266 100644 --- a/usr.sbin/nginx/src/pcre/pcre_compile.c +++ b/usr.sbin/nginx/src/pcre/pcre_compile.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2011 University of Cambridge + Copyright (c) 1997-2012 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -53,12 +53,16 @@ supporting internal functions that are not used by other modules. */ #include "pcre_internal.h" -/* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is -also used by pcretest. PCRE_DEBUG is not defined when building a production -library. 
*/ +/* When PCRE_DEBUG is defined, we need the pcre(16)_printint() function, which +is also used by pcretest. PCRE_DEBUG is not defined when building a production +library. We do not need to select pcre16_printint.c specially, because the +COMPILE_PCREx macro will already be appropriately set. */ #ifdef PCRE_DEBUG -#include "pcre_printint.src" +/* pcre_printint.c should not include any headers */ +#define PCRE_INCLUDED +#include "pcre_printint.c" +#undef PCRE_INCLUDED #endif @@ -104,6 +108,14 @@ overrun before it actually does run off the end of the data block. */ #define WORK_SIZE_SAFETY_MARGIN (100) +/* Private flags added to firstchar and reqchar. */ + +#define REQ_CASELESS 0x10000000l /* Indicates caselessness */ +#define REQ_VARY 0x20000000l /* Reqchar followed non-literal item */ + +/* Repeated character flags. */ + +#define UTF_LENGTH 0x10000000l /* The char contains its length. */ /* Table for handling escaped characters in the range '0'-'z'. Positive returns are simple data values; negative values are for special things like \d and so @@ -238,7 +250,7 @@ static const char posix_names[] = STRING_graph0 STRING_print0 STRING_punct0 STRING_space0 STRING_word0 STRING_xdigit; -static const uschar posix_name_lengths[] = { +static const pcre_uint8 posix_name_lengths[] = { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }; /* Table of class bit maps for each POSIX class. Each class is formed from a @@ -273,47 +285,101 @@ substitutes must be in the order of the names, defined above, and there are both positive and negative cases. NULL means no substitute. 
*/ #ifdef SUPPORT_UCP -static const uschar *substitutes[] = { - (uschar *)"\\P{Nd}", /* \D */ - (uschar *)"\\p{Nd}", /* \d */ - (uschar *)"\\P{Xsp}", /* \S */ /* NOTE: Xsp is Perl space */ - (uschar *)"\\p{Xsp}", /* \s */ - (uschar *)"\\P{Xwd}", /* \W */ - (uschar *)"\\p{Xwd}" /* \w */ +static const pcre_uchar string_PNd[] = { + CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, + CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' }; +static const pcre_uchar string_pNd[] = { + CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, + CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' }; +static const pcre_uchar string_PXsp[] = { + CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, + CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' }; +static const pcre_uchar string_pXsp[] = { + CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, + CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' }; +static const pcre_uchar string_PXwd[] = { + CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, + CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' }; +static const pcre_uchar string_pXwd[] = { + CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, + CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' }; + +static const pcre_uchar *substitutes[] = { + string_PNd, /* \D */ + string_pNd, /* \d */ + string_PXsp, /* \S */ /* NOTE: Xsp is Perl space */ + string_pXsp, /* \s */ + string_PXwd, /* \W */ + string_pXwd /* \w */ }; -static const uschar *posix_substitutes[] = { - (uschar *)"\\p{L}", /* alpha */ - (uschar *)"\\p{Ll}", /* lower */ - (uschar *)"\\p{Lu}", /* upper */ - (uschar *)"\\p{Xan}", /* alnum */ - NULL, /* ascii */ - (uschar *)"\\h", /* blank */ - NULL, /* cntrl */ - (uschar *)"\\p{Nd}", /* digit */ - NULL, /* graph */ - NULL, /* print */ - NULL, /* punct */ - (uschar *)"\\p{Xps}", /* space */ /* NOTE: Xps is POSIX space */ - (uschar *)"\\p{Xwd}", /* word */ - NULL, /* xdigit */ +static const pcre_uchar string_pL[] = { + CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, + CHAR_L, 
CHAR_RIGHT_CURLY_BRACKET, '\0' }; +static const pcre_uchar string_pLl[] = { + CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, + CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' }; +static const pcre_uchar string_pLu[] = { + CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, + CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' }; +static const pcre_uchar string_pXan[] = { + CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, + CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' }; +static const pcre_uchar string_h[] = { + CHAR_BACKSLASH, CHAR_h, '\0' }; +static const pcre_uchar string_pXps[] = { + CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, + CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' }; +static const pcre_uchar string_PL[] = { + CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, + CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' }; +static const pcre_uchar string_PLl[] = { + CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, + CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' }; +static const pcre_uchar string_PLu[] = { + CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, + CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' }; +static const pcre_uchar string_PXan[] = { + CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, + CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' }; +static const pcre_uchar string_H[] = { + CHAR_BACKSLASH, CHAR_H, '\0' }; +static const pcre_uchar string_PXps[] = { + CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, + CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' }; + +static const pcre_uchar *posix_substitutes[] = { + string_pL, /* alpha */ + string_pLl, /* lower */ + string_pLu, /* upper */ + string_pXan, /* alnum */ + NULL, /* ascii */ + string_h, /* blank */ + NULL, /* cntrl */ + string_pNd, /* digit */ + NULL, /* graph */ + NULL, /* print */ + NULL, /* punct */ + string_pXps, /* space */ /* NOTE: Xps is POSIX space */ + string_pXwd, /* word */ + NULL, /* xdigit */ /* Negated cases */ - (uschar *)"\\P{L}", /* ^alpha */ - (uschar 
*)"\\P{Ll}", /* ^lower */ - (uschar *)"\\P{Lu}", /* ^upper */ - (uschar *)"\\P{Xan}", /* ^alnum */ - NULL, /* ^ascii */ - (uschar *)"\\H", /* ^blank */ - NULL, /* ^cntrl */ - (uschar *)"\\P{Nd}", /* ^digit */ - NULL, /* ^graph */ - NULL, /* ^print */ - NULL, /* ^punct */ - (uschar *)"\\P{Xps}", /* ^space */ /* NOTE: Xps is POSIX space */ - (uschar *)"\\P{Xwd}", /* ^word */ - NULL /* ^xdigit */ + string_PL, /* ^alpha */ + string_PLl, /* ^lower */ + string_PLu, /* ^upper */ + string_PXan, /* ^alnum */ + NULL, /* ^ascii */ + string_H, /* ^blank */ + NULL, /* ^cntrl */ + string_PNd, /* ^digit */ + NULL, /* ^graph */ + NULL, /* ^print */ + NULL, /* ^punct */ + string_PXps, /* ^space */ /* NOTE: Xps is POSIX space */ + string_PXwd, /* ^word */ + NULL /* ^xdigit */ }; -#define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *)) +#define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *)) #endif #define STRING(a) # a @@ -372,7 +438,7 @@ static const char error_texts[] = /* 30 */ "unknown POSIX class name\0" "POSIX collating elements are not supported\0" - "this version of PCRE is not compiled with PCRE_UTF8 support\0" + "this version of PCRE is compiled without UTF support\0" "spare error\0" /** DEAD **/ "character value in \\x{...} sequence is too large\0" /* 35 */ @@ -395,7 +461,7 @@ static const char error_texts[] = "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0" /* 50 */ "repeated subpattern is too long\0" /** DEAD **/ - "octal value is greater than \\377 (not in UTF-8 mode)\0" + "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0" "internal error: overran compiling workspace\0" "internal error: previously-checked referenced subpattern not found\0" "DEFINE group contains more than one branch\0" @@ -414,13 +480,15 @@ static const char error_texts[] = /* 65 */ "different names for subpatterns of the same number are not allowed\0" "(*MARK) must have an argument\0" - "this version of PCRE is not compiled with PCRE_UCP 
support\0" + "this version of PCRE is not compiled with Unicode property support\0" "\\c must be followed by an ASCII character\0" "\\k is not followed by a braced, angle-bracketed, or quoted name\0" /* 70 */ "internal error: unknown opcode in find_fixedlength()\0" "\\N is not supported in a class\0" "too many forward references\0" + "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0" + "invalid UTF-16 string\0" ; /* Table to identify digits and hex digits. This is used when compiling @@ -439,12 +507,18 @@ For convenience, we use the same bit definitions as in chartables: Then we can use ctype_digit and ctype_xdigit in the code. */ +/* Using a simple comparison for decimal numbers rather than a memory read +is much faster, and the resulting code is simpler (the compiler turns it +into a subtraction and unsigned comparison). */ + +#define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9) + #ifndef EBCDIC /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in UTF-8 mode. */ -static const unsigned char digitab[] = +static const pcre_uint8 digitab[] = { 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */ @@ -483,7 +557,7 @@ static const unsigned char digitab[] = /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. 
*/ -static const unsigned char digitab[] = +static const pcre_uint8 digitab[] = { 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */ @@ -518,7 +592,7 @@ static const unsigned char digitab[] = 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */ 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */ -static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */ +static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */ 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */ 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */ @@ -557,7 +631,7 @@ static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */ /* Definition to allow mutual recursion */ static BOOL - compile_regex(int, uschar **, const uschar **, int *, BOOL, BOOL, int, int, + compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int, int *, int *, branch_chain *, compile_data *, int *); @@ -604,7 +678,7 @@ Returns: 0 if all went well, else an error number static int expand_workspace(compile_data *cd) { -uschar *newspace; +pcre_uchar *newspace; int newsize = cd->workspace_size * 2; if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX; @@ -612,13 +686,12 @@ if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX || newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN) return ERR72; -newspace = (pcre_malloc)(newsize); +newspace = (PUBL(malloc))(IN_UCHARS(newsize)); if (newspace == NULL) return ERR21; - -memcpy(newspace, cd->start_workspace, cd->workspace_size); -cd->hwm = (uschar *)newspace + (cd->hwm - cd->start_workspace); +memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar)); +cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace); if (cd->workspace_size > COMPILE_WORK_SIZE) - (pcre_free)((void *)cd->start_workspace); + (PUBL(free))((void *)cd->start_workspace); cd->start_workspace = 
newspace; cd->workspace_size = newsize; return 0; @@ -642,17 +715,19 @@ Returns: TRUE or FALSE */ static BOOL -is_counted_repeat(const uschar *p) +is_counted_repeat(const pcre_uchar *p) { -if ((digitab[*p++] & ctype_digit) == 0) return FALSE; -while ((digitab[*p] & ctype_digit) != 0) p++; +if (!IS_DIGIT(*p)) return FALSE; +p++; +while (IS_DIGIT(*p)) p++; if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE; if (*p++ != CHAR_COMMA) return FALSE; if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE; -if ((digitab[*p++] & ctype_digit) == 0) return FALSE; -while ((digitab[*p] & ctype_digit) != 0) p++; +if (!IS_DIGIT(*p)) return FALSE; +p++; +while (IS_DIGIT(*p)) p++; return (*p == CHAR_RIGHT_CURLY_BRACKET); } @@ -684,12 +759,14 @@ Returns: zero or positive => a data character */ static int -check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount, +check_escape(const pcre_uchar **ptrptr, int *errorcodeptr, int bracount, int options, BOOL isclass) { -BOOL utf8 = (options & PCRE_UTF8) != 0; -const uschar *ptr = *ptrptr + 1; -int c, i; +/* PCRE_UTF16 has the same value as PCRE_UTF8. */ +BOOL utf = (options & PCRE_UTF8) != 0; +const pcre_uchar *ptr = *ptrptr + 1; +pcre_int32 c; +int i; GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ ptr--; /* Set pointer back to the last byte */ @@ -703,11 +780,13 @@ in a table. A non-zero result is something that can be returned immediately. Otherwise further processing may be required. 
*/ #ifndef EBCDIC /* ASCII/UTF-8 coding */ -else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */ +/* Not alphanumeric */ +else if (c < CHAR_0 || c > CHAR_z) {} else if ((i = escapes[c - CHAR_0]) != 0) c = i; #else /* EBCDIC coding */ -else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */ +/* Not alphanumeric */ +else if (c < 'a' || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {} else if ((i = escapes[c - 0x48]) != 0) c = i; #endif @@ -715,7 +794,7 @@ else if ((i = escapes[c - 0x48]) != 0) c = i; else { - const uschar *oldptr; + const pcre_uchar *oldptr; BOOL braced, negated; switch (c) @@ -733,8 +812,10 @@ else { /* In JavaScript, \u must be followed by four hexadecimal numbers. Otherwise it is a lowercase u letter. */ - if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0 - && (digitab[ptr[3]] & ctype_xdigit) != 0 && (digitab[ptr[4]] & ctype_xdigit) != 0) + if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0 + && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0 + && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0 + && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0) { c = 0; for (i = 0; i < 4; ++i) @@ -788,9 +869,9 @@ else if (ptr[1] == CHAR_LEFT_CURLY_BRACKET) { - const uschar *p; + const pcre_uchar *p; for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++) - if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break; + if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break; if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET) { c = -ESC_k; @@ -808,12 +889,21 @@ else } else negated = FALSE; + /* The integer range is limited by the machine's int representation. 
*/ c = 0; - while ((digitab[ptr[1]] & ctype_digit) != 0) + while (IS_DIGIT(ptr[1])) + { + if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */ + { + c = -1; + break; + } c = c * 10 + *(++ptr) - CHAR_0; - - if (c < 0) /* Integer overflow */ + } + if (((unsigned int)c) > INT_MAX) /* Integer overflow */ { + while (IS_DIGIT(ptr[1])) + ptr++; *errorcodeptr = ERR61; break; } @@ -861,11 +951,21 @@ else if (!isclass) { oldptr = ptr; + /* The integer range is limited by the machine's int representation. */ c -= CHAR_0; - while ((digitab[ptr[1]] & ctype_digit) != 0) + while (IS_DIGIT(ptr[1])) + { + if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */ + { + c = -1; + break; + } c = c * 10 + *(++ptr) - CHAR_0; - if (c < 0) /* Integer overflow */ + } + if (((unsigned int)c) > INT_MAX) /* Integer overflow */ { + while (IS_DIGIT(ptr[1])) + ptr++; *errorcodeptr = ERR61; break; } @@ -891,26 +991,29 @@ else /* \0 always starts an octal number, but we may drop through to here with a larger first octal digit. The original code used just to take the least significant 8 bits of octal numbers (I think this is what early Perls used - to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more - than 3 octal digits. */ + to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode, + but no more than 3 octal digits. */ case CHAR_0: c -= CHAR_0; while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7) c = c * 8 + *(++ptr) - CHAR_0; - if (!utf8 && c > 255) *errorcodeptr = ERR51; +#ifdef COMPILE_PCRE8 + if (!utf && c > 0xff) *errorcodeptr = ERR51; +#endif break; /* \x is complicated. \x{ddd} is a character number which can be greater - than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is - treated as a data character. */ + than 0xff in utf or non-8bit mode, but only if the ddd are hex digits. + If not, { is treated as a data character. 
*/ case CHAR_x: if ((options & PCRE_JAVASCRIPT_COMPAT) != 0) { /* In JavaScript, \x must be followed by two hexadecimal numbers. Otherwise it is a lowercase x letter. */ - if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0) + if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0 + && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0) { c = 0; for (i = 0; i < 2; ++i) @@ -930,15 +1033,13 @@ else if (ptr[1] == CHAR_LEFT_CURLY_BRACKET) { - const uschar *pt = ptr + 2; - int count = 0; + const pcre_uchar *pt = ptr + 2; c = 0; - while ((digitab[*pt] & ctype_xdigit) != 0) + while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) { register int cc = *pt++; if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */ - count++; #ifndef EBCDIC /* ASCII/UTF-8 coding */ if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */ @@ -947,11 +1048,25 @@ else if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */ c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10)); #endif + +#ifdef COMPILE_PCRE8 + if (c > (utf ? 0x10ffff : 0xff)) { c = -1; break; } +#else +#ifdef COMPILE_PCRE16 + if (c > (utf ? 0x10ffff : 0xffff)) { c = -1; break; } +#endif +#endif + } + + if (c < 0) + { + while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++; + *errorcodeptr = ERR34; } if (*pt == CHAR_RIGHT_CURLY_BRACKET) { - if (c < 0 || count > (utf8? 
8 : 2)) *errorcodeptr = ERR34; + if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73; ptr = pt; break; } @@ -963,7 +1078,7 @@ else /* Read just a single-byte hex-defined char */ c = 0; - while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0) + while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0) { int cc; /* Some compilers don't like */ cc = *(++ptr); /* ++ in initializers */ @@ -1061,11 +1176,11 @@ Returns: type value from ucp_type_table, or -1 for an invalid type */ static int -get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr) +get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr) { int c, i, bot, top; -const uschar *ptr = *ptrptr; -char name[32]; +const pcre_uchar *ptr = *ptrptr; +pcre_uchar name[32]; c = *(++ptr); if (c == 0) goto ERROR_RETURN; @@ -1082,7 +1197,7 @@ if (c == CHAR_LEFT_CURLY_BRACKET) *negptr = TRUE; ptr++; } - for (i = 0; i < (int)sizeof(name) - 1; i++) + for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++) { c = *(++ptr); if (c == 0) goto ERROR_RETURN; @@ -1106,16 +1221,16 @@ else /* Search for a recognized property name using binary chop */ bot = 0; -top = _pcre_utt_size; +top = PRIV(utt_size); while (bot < top) { i = (bot + top) >> 1; - c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset); + c = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset); if (c == 0) { - *dptr = _pcre_utt[i].value; - return _pcre_utt[i].type; + *dptr = PRIV(utt)[i].value; + return PRIV(utt)[i].type; } if (c > 0) bot = i + 1; else top = i; } @@ -1153,8 +1268,8 @@ Returns: pointer to '}' on success; current ptr on error, with errorcodeptr set non-zero */ -static const uschar * -read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr) +static const pcre_uchar * +read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr) { int min = 0; int max = -1; @@ -1162,7 +1277,7 @@ int max = -1; /* Read the minimum value and 
do a paranoid check: a negative value indicates an integer overflow. */ -while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0; +while (IS_DIGIT(*p)) min = min * 10 + *p++ - CHAR_0; if (min < 0 || min > 65535) { *errorcodeptr = ERR5; @@ -1177,7 +1292,7 @@ if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else if (*(++p) != CHAR_RIGHT_CURLY_BRACKET) { max = 0; - while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0; + while(IS_DIGIT(*p)) max = max * 10 + *p++ - CHAR_0; if (max < 0 || max > 65535) { *errorcodeptr = ERR5; @@ -1232,17 +1347,17 @@ Arguments: name name to seek, or NULL if seeking a numbered subpattern lorn name length, or subpattern number if name is NULL xmode TRUE if we are in /x mode - utf8 TRUE if we are in UTF-8 mode + utf TRUE if we are in UTF-8 / UTF-16 mode count pointer to the current capturing subpattern number (updated) Returns: the number of the named subpattern, or -1 if not found */ static int -find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn, - BOOL xmode, BOOL utf8, int *count) +find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn, + BOOL xmode, BOOL utf, int *count) { -uschar *ptr = *ptrptr; +pcre_uchar *ptr = *ptrptr; int start_count = *count; int hwm_count = start_count; BOOL dup_parens = FALSE; @@ -1309,7 +1424,7 @@ if (ptr[0] == CHAR_LEFT_PARENTHESIS) ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE) { int term; - const uschar *thisname; + const pcre_uchar *thisname; *count += 1; if (name == NULL && *count == lorn) return *count; term = *ptr++; @@ -1317,7 +1432,7 @@ if (ptr[0] == CHAR_LEFT_PARENTHESIS) thisname = ptr; while (*ptr != term) ptr++; if (name != NULL && lorn == ptr - thisname && - strncmp((const char *)name, (const char *)thisname, lorn) == 0) + STRNCMP_UC_UC(name, thisname, lorn) == 0) return *count; term++; } @@ -1360,7 +1475,7 @@ for (; ptr < cd->end_pattern; ptr++) { if (ptr[2] == CHAR_E) ptr+= 2; - else if 
(strncmp((const char *)ptr+2, + else if (STRNCMP_UC_C8(ptr + 2, STR_Q STR_BACKSLASH STR_E, 3) == 0) ptr += 4; else @@ -1408,8 +1523,8 @@ for (; ptr < cd->end_pattern; ptr++) { if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; } ptr++; -#ifdef SUPPORT_UTF8 - if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++; +#ifdef SUPPORT_UTF + if (utf) FORWARDCHAR(ptr); #endif } if (*ptr == 0) goto FAIL_EXIT; @@ -1420,7 +1535,7 @@ for (; ptr < cd->end_pattern; ptr++) if (*ptr == CHAR_LEFT_PARENTHESIS) { - int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count); + int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, count); if (rc > 0) return rc; if (*ptr == 0) goto FAIL_EXIT; } @@ -1466,16 +1581,16 @@ Arguments: name name to seek, or NULL if seeking a numbered subpattern lorn name length, or subpattern number if name is NULL xmode TRUE if we are in /x mode - utf8 TRUE if we are in UTF-8 mode + utf TRUE if we are in UTF-8 / UTF-16 mode Returns: the number of the found subpattern, or -1 if not found */ static int -find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode, - BOOL utf8) +find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode, + BOOL utf) { -uschar *ptr = (uschar *)cd->start_pattern; +pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern; int count = 0; int rc; @@ -1486,7 +1601,7 @@ matching closing parens. That is why we have to have a loop. 
*/ for (;;) { - rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count); + rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, &count); if (rc > 0 || *ptr++ == 0) break; } @@ -1513,8 +1628,8 @@ Arguments: Returns: pointer to the first significant opcode */ -static const uschar* -first_significant_code(const uschar *code, BOOL skipassert) +static const pcre_uchar* +first_significant_code(const pcre_uchar *code, BOOL skipassert) { for (;;) { @@ -1525,7 +1640,7 @@ for (;;) case OP_ASSERTBACK_NOT: if (!skipassert) return code; do code += GET(code, 1); while (*code == OP_ALT); - code += _pcre_OP_lengths[*code]; + code += PRIV(OP_lengths)[*code]; break; case OP_WORD_BOUNDARY: @@ -1539,7 +1654,7 @@ for (;;) case OP_RREF: case OP_NRREF: case OP_DEF: - code += _pcre_OP_lengths[*code]; + code += PRIV(OP_lengths)[*code]; break; default: @@ -1569,7 +1684,7 @@ and doing the check at the end; a flag specifies which mode we are running in. Arguments: code points to the start of the pattern (the bracket) - utf8 TRUE in UTF-8 mode + utf TRUE in UTF-8 / UTF-16 mode atend TRUE if called when the pattern is complete cd the "compile data" structure @@ -1581,12 +1696,12 @@ Returns: the fixed length, */ static int -find_fixedlength(uschar *code, BOOL utf8, BOOL atend, compile_data *cd) +find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd) { int length = -1; register int branchlength = 0; -register uschar *cc = code + 1 + LINK_SIZE; +register pcre_uchar *cc = code + 1 + LINK_SIZE; /* Scan along the opcodes for this branch. If we get to the end of the branch, check the length against that of the other branches. */ @@ -1594,8 +1709,9 @@ branch, check the length against that of the other branches. 
*/ for (;;) { int d; - uschar *ce, *cs; + pcre_uchar *ce, *cs; register int op = *cc; + switch (op) { /* We only need to continue for OP_CBRA (normal capturing bracket) and @@ -1608,7 +1724,7 @@ for (;;) case OP_ONCE: case OP_ONCE_NC: case OP_COND: - d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), utf8, atend, cd); + d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd); if (d < 0) return d; branchlength += d; do cc += GET(cc, 1); while (*cc == OP_ALT); @@ -1639,10 +1755,10 @@ for (;;) case OP_RECURSE: if (!atend) return -3; - cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */ - do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */ - if (cc > cs && cc < ce) return -1; /* Recursion */ - d = find_fixedlength(cs + 2, utf8, atend, cd); + cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1); /* Start subpattern */ + do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */ + if (cc > cs && cc < ce) return -1; /* Recursion */ + d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd); if (d < 0) return d; branchlength += d; cc += 1 + LINK_SIZE; @@ -1655,7 +1771,8 @@ for (;;) case OP_ASSERTBACK: case OP_ASSERTBACK_NOT: do cc += GET(cc, 1); while (*cc == OP_ALT); - /* Fall through */ + cc += PRIV(OP_lengths)[*cc]; + break; /* Skip over things that don't match chars */ @@ -1663,7 +1780,7 @@ for (;;) case OP_PRUNE_ARG: case OP_SKIP_ARG: case OP_THEN_ARG: - cc += cc[1] + _pcre_OP_lengths[*cc]; + cc += cc[1] + PRIV(OP_lengths)[*cc]; break; case OP_CALLOUT: @@ -1690,7 +1807,7 @@ for (;;) case OP_SOM: case OP_THEN: case OP_WORD_BOUNDARY: - cc += _pcre_OP_lengths[*cc]; + cc += PRIV(OP_lengths)[*cc]; break; /* Handle literal characters */ @@ -1701,8 +1818,8 @@ for (;;) case OP_NOTI: branchlength++; cc += 2; -#ifdef SUPPORT_UTF8 - if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f]; +#ifdef SUPPORT_UTF + if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); #endif break; @@ -1714,16 +1831,16 @@ for (;;) 
case OP_NOTEXACT: case OP_NOTEXACTI: branchlength += GET2(cc,1); - cc += 4; -#ifdef SUPPORT_UTF8 - if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f]; + cc += 2 + IMM2_SIZE; +#ifdef SUPPORT_UTF + if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); #endif break; case OP_TYPEEXACT: branchlength += GET2(cc,1); - if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2; - cc += 4; + if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2; + cc += 1 + IMM2_SIZE + 1; break; /* Handle single-char matchers */ @@ -1757,15 +1874,15 @@ for (;;) /* Check a class for variable quantification */ -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF || defined COMPILE_PCRE16 case OP_XCLASS: - cc += GET(cc, 1) - 33; + cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS]; /* Fall through */ #endif case OP_CLASS: case OP_NCLASS: - cc += 33; + cc += PRIV(OP_lengths)[OP_CLASS]; switch (*cc) { @@ -1779,9 +1896,9 @@ for (;;) case OP_CRRANGE: case OP_CRMINRANGE: - if (GET2(cc,1) != GET2(cc,3)) return -1; + if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1; branchlength += GET2(cc,1); - cc += 5; + cc += 1 + 2 * IMM2_SIZE; break; default: @@ -1896,14 +2013,14 @@ length. 
Arguments: code points to start of expression - utf8 TRUE in UTF-8 mode + utf TRUE in UTF-8 / UTF-16 mode number the required bracket number or negative to find a lookbehind Returns: pointer to the opcode for the bracket, or NULL if not found */ -const uschar * -_pcre_find_bracket(const uschar *code, BOOL utf8, int number) +const pcre_uchar * +PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number) { for (;;) { @@ -1921,8 +2038,8 @@ for (;;) else if (c == OP_REVERSE) { - if (number < 0) return (uschar *)code; - code += _pcre_OP_lengths[c]; + if (number < 0) return (pcre_uchar *)code; + code += PRIV(OP_lengths)[c]; } /* Handle capturing bracket */ @@ -1931,8 +2048,8 @@ for (;;) c == OP_CBRAPOS || c == OP_SCBRAPOS) { int n = GET2(code, 1+LINK_SIZE); - if (n == number) return (uschar *)code; - code += _pcre_OP_lengths[c]; + if (n == number) return (pcre_uchar *)code; + code += PRIV(OP_lengths)[c]; } /* Otherwise, we can get the item's length from the table, except that for @@ -1960,7 +2077,8 @@ for (;;) case OP_TYPEMINUPTO: case OP_TYPEEXACT: case OP_TYPEPOSUPTO: - if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2; + if (code[1 + IMM2_SIZE] == OP_PROP + || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2; break; case OP_MARK: @@ -1976,14 +2094,14 @@ for (;;) /* Add in the fixed length from the table */ - code += _pcre_OP_lengths[c]; + code += PRIV(OP_lengths)[c]; /* In UTF-8 mode, opcodes that are followed by a character may be followed by a multi-byte character. The length in the table is a minimum, so we have to arrange to skip the extra bytes. 
*/ -#ifdef SUPPORT_UTF8 - if (utf8) switch(c) +#ifdef SUPPORT_UTF + if (utf) switch(c) { case OP_CHAR: case OP_CHARI: @@ -2013,11 +2131,11 @@ for (;;) case OP_MINQUERYI: case OP_POSQUERY: case OP_POSQUERYI: - if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f]; + if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); break; } #else - (void)(utf8); /* Keep compiler happy by referencing function argument */ + (void)(utf); /* Keep compiler happy by referencing function argument */ #endif } } @@ -2034,13 +2152,13 @@ instance of OP_RECURSE. Arguments: code points to start of expression - utf8 TRUE in UTF-8 mode + utf TRUE in UTF-8 / UTF-16 mode Returns: pointer to the opcode for OP_RECURSE, or NULL if not found */ -static const uschar * -find_recurse(const uschar *code, BOOL utf8) +static const pcre_uchar * +find_recurse(const pcre_uchar *code, BOOL utf) { for (;;) { @@ -2079,7 +2197,8 @@ for (;;) case OP_TYPEUPTO: case OP_TYPEMINUPTO: case OP_TYPEEXACT: - if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2; + if (code[1 + IMM2_SIZE] == OP_PROP + || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2; break; case OP_MARK: @@ -2095,14 +2214,14 @@ for (;;) /* Add in the fixed length from the table */ - code += _pcre_OP_lengths[c]; + code += PRIV(OP_lengths)[c]; /* In UTF-8 mode, opcodes that are followed by a character may be followed by a multi-byte character. The length in the table is a minimum, so we have to arrange to skip the extra bytes. 
*/ -#ifdef SUPPORT_UTF8 - if (utf8) switch(c) +#ifdef SUPPORT_UTF + if (utf) switch(c) { case OP_CHAR: case OP_CHARI: @@ -2132,11 +2251,11 @@ for (;;) case OP_MINQUERYI: case OP_POSQUERY: case OP_POSQUERYI: - if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f]; + if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); break; } #else - (void)(utf8); /* Keep compiler happy by referencing function argument */ + (void)(utf); /* Keep compiler happy by referencing function argument */ #endif } } @@ -2159,22 +2278,22 @@ bracket whose current branch will already have been scanned. Arguments: code points to start of search endcode points to where to stop - utf8 TRUE if in UTF8 mode + utf TRUE if in UTF-8 / UTF-16 mode cd contains pointers to tables etc. Returns: TRUE if what is matched could be empty */ static BOOL -could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8, - compile_data *cd) +could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode, + BOOL utf, compile_data *cd) { register int c; -for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE); +for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); code < endcode; - code = first_significant_code(code + _pcre_OP_lengths[c], TRUE)) + code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE)) { - const uschar *ccode; + const pcre_uchar *ccode; c = *code; @@ -2197,7 +2316,7 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE); if (c == OP_RECURSE) { - const uschar *scode; + const pcre_uchar *scode; BOOL empty_branch; /* Test for forward reference */ @@ -2215,7 +2334,7 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE); do { - if (could_be_empty_branch(scode, endcode, utf8, cd)) + if (could_be_empty_branch(scode, endcode, utf, cd)) { empty_branch = TRUE; break; @@ -2233,7 +2352,7 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE); if (c == OP_BRAZERO || c 
== OP_BRAMINZERO || c == OP_SKIPZERO || c == OP_BRAPOSZERO) { - code += _pcre_OP_lengths[c]; + code += PRIV(OP_lengths)[c]; do code += GET(code, 1); while (*code == OP_ALT); c = *code; continue; @@ -2271,7 +2390,7 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE); empty_branch = FALSE; do { - if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd)) + if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd)) empty_branch = TRUE; code += GET(code, 1); } @@ -2289,11 +2408,11 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE); { /* Check for quantifiers after a class. XCLASS is used for classes that cannot be represented just by a bit map. This includes negated single - high-valued characters. The length in _pcre_OP_lengths[] is zero; the + high-valued characters. The length in PRIV(OP_lengths)[] is zero; the actual length is stored in the compiled code, so we must update "code" here. */ -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 case OP_XCLASS: ccode = code += GET(code, 1); goto CHECK_CLASS_REPEAT; @@ -2301,9 +2420,9 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE); case OP_CLASS: case OP_NCLASS: - ccode = code + 33; + ccode = code + PRIV(OP_lengths)[OP_CLASS]; -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 CHECK_CLASS_REPEAT: #endif @@ -2376,7 +2495,8 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE); case OP_TYPEUPTO: case OP_TYPEMINUPTO: case OP_TYPEPOSUPTO: - if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2; + if (code[1 + IMM2_SIZE] == OP_PROP + || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2; break; /* End of branch */ @@ -2391,7 +2511,7 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE); /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO, MINUPTO, and POSUPTO may be followed by a multibyte character */ -#ifdef SUPPORT_UTF8 
+#ifdef SUPPORT_UTF case OP_STAR: case OP_STARI: case OP_MINSTAR: @@ -2404,7 +2524,7 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE); case OP_MINQUERYI: case OP_POSQUERY: case OP_POSQUERYI: - if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f]; + if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]); break; case OP_UPTO: @@ -2413,7 +2533,7 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE); case OP_MINUPTOI: case OP_POSUPTO: case OP_POSUPTOI: - if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f]; + if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]); break; #endif @@ -2457,19 +2577,19 @@ Arguments: code points to start of the recursion endcode points to where to stop (current RECURSE item) bcptr points to the chain of current (unclosed) branch starts - utf8 TRUE if in UTF-8 mode + utf TRUE if in UTF-8 / UTF-16 mode cd pointers to tables etc Returns: TRUE if what is matched could be empty */ static BOOL -could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr, - BOOL utf8, compile_data *cd) +could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode, + branch_chain *bcptr, BOOL utf, compile_data *cd) { while (bcptr != NULL && bcptr->current_branch >= code) { - if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd)) + if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd)) return FALSE; bcptr = bcptr->outer; } @@ -2521,7 +2641,7 @@ Returns: TRUE or FALSE */ static BOOL -check_posix_syntax(const uschar *ptr, const uschar **endptr) +check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr) { int terminator; /* Don't combine these lines; the Solaris cc */ terminator = *(++ptr); /* compiler warns about "non-constant" initializer. 
*/ @@ -2565,14 +2685,14 @@ Returns: a value representing the name, or -1 if unknown */ static int -check_posix_name(const uschar *ptr, int len) +check_posix_name(const pcre_uchar *ptr, int len) { const char *pn = posix_names; register int yield = 0; while (posix_name_lengths[yield] != 0) { if (len == posix_name_lengths[yield] && - strncmp((const char *)ptr, pn, len) == 0) return yield; + STRNCMP_UC_C8(ptr, pn, len) == 0) return yield; pn += posix_name_lengths[yield] + 1; yield++; } @@ -2604,7 +2724,7 @@ value in the reference (which is a group number). Arguments: group points to the start of the group adjust the amount by which the group is to be moved - utf8 TRUE in UTF-8 mode + utf TRUE in UTF-8 / UTF-16 mode cd contains pointers to tables etc. save_hwm the hwm forward reference pointer at the start of the group @@ -2612,15 +2732,15 @@ Returns: nothing */ static void -adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd, - uschar *save_hwm) +adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd, + pcre_uchar *save_hwm) { -uschar *ptr = group; +pcre_uchar *ptr = group; -while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL) +while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL) { int offset; - uschar *hc; + pcre_uchar *hc; /* See if this recursion is on the forward reference list. If so, adjust the reference. 
*/ @@ -2665,14 +2785,14 @@ Arguments: Returns: new code pointer */ -static uschar * -auto_callout(uschar *code, const uschar *ptr, compile_data *cd) +static pcre_uchar * +auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd) { *code++ = OP_CALLOUT; *code++ = 255; PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */ PUT(code, LINK_SIZE, 0); /* Default length */ -return code + 2*LINK_SIZE; +return code + 2 * LINK_SIZE; } @@ -2694,7 +2814,7 @@ Returns: nothing */ static void -complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd) +complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd) { int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2)); PUT(previous_callout, 2 + LINK_SIZE, length); @@ -2777,7 +2897,7 @@ switch(ptype) prop->chartype == ucp_Lt) == negated; case PT_GC: - return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated; + return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated; case PT_PC: return (pdata == prop->chartype) == negated; @@ -2788,23 +2908,23 @@ switch(ptype) /* These are specials */ case PT_ALNUM: - return (_pcre_ucp_gentype[prop->chartype] == ucp_L || - _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated; + return (PRIV(ucp_gentype)[prop->chartype] == ucp_L || + PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated; case PT_SPACE: /* Perl space */ - return (_pcre_ucp_gentype[prop->chartype] == ucp_Z || + return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z || c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR) == negated; case PT_PXSPACE: /* POSIX space */ - return (_pcre_ucp_gentype[prop->chartype] == ucp_Z || + return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z || c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || c == CHAR_FF || c == CHAR_CR) == negated; case PT_WORD: - return (_pcre_ucp_gentype[prop->chartype] == ucp_L || - _pcre_ucp_gentype[prop->chartype] == ucp_N || + return (PRIV(ucp_gentype)[prop->chartype] == 
ucp_L || + PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE) == negated; } return FALSE; @@ -2823,7 +2943,7 @@ sense to automatically possessify the repeated item. Arguments: previous pointer to the repeated opcode - utf8 TRUE in UTF-8 mode + utf TRUE in UTF-8 / UTF-16 mode ptr next character in pattern options options bits cd contains pointers to tables etc. @@ -2832,10 +2952,10 @@ Returns: TRUE if possessifying is wanted */ static BOOL -check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr, - int options, compile_data *cd) +check_auto_possessive(const pcre_uchar *previous, BOOL utf, + const pcre_uchar *ptr, int options, compile_data *cd) { -int c, next; +pcre_int32 c, next; int op_code = *previous++; /* Skip whitespace and comments in extended mode */ @@ -2844,7 +2964,7 @@ if ((options & PCRE_EXTENDED) != 0) { for (;;) { - while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++; + while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++; if (*ptr == CHAR_NUMBER_SIGN) { ptr++; @@ -2852,8 +2972,8 @@ if ((options & PCRE_EXTENDED) != 0) { if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } ptr++; -#ifdef SUPPORT_UTF8 - if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++; +#ifdef SUPPORT_UTF + if (utf) FORWARDCHAR(ptr); #endif } } @@ -2871,15 +2991,13 @@ if (*ptr == CHAR_BACKSLASH) if (temperrorcode != 0) return FALSE; ptr++; /* Point after the escape sequence */ } - -else if ((cd->ctypes[*ptr] & ctype_meta) == 0) +else if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_meta) == 0) { -#ifdef SUPPORT_UTF8 - if (utf8) { GETCHARINC(next, ptr); } else +#ifdef SUPPORT_UTF + if (utf) { GETCHARINC(next, ptr); } else #endif next = *ptr++; } - else return FALSE; /* Skip whitespace and comments in extended mode */ @@ -2888,7 +3006,7 @@ if ((options & PCRE_EXTENDED) != 0) { for (;;) { - while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++; + while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++; if (*ptr == CHAR_NUMBER_SIGN) { 
ptr++; @@ -2896,8 +3014,8 @@ if ((options & PCRE_EXTENDED) != 0) { if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } ptr++; -#ifdef SUPPORT_UTF8 - if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++; +#ifdef SUPPORT_UTF + if (utf) FORWARDCHAR(ptr); #endif } } @@ -2908,7 +3026,7 @@ if ((options & PCRE_EXTENDED) != 0) /* If the next thing is itself optional, we have to give up. */ if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK || - strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0) + STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0) return FALSE; /* Now compare the next item with the previous opcode. First, handle cases when @@ -2917,7 +3035,7 @@ the next item is a character. */ if (next >= 0) switch(op_code) { case OP_CHAR: -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF GETCHARTEST(c, previous); #else c = *previous; @@ -2929,14 +3047,14 @@ if (next >= 0) switch(op_code) high-valued characters. */ case OP_CHARI: -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF GETCHARTEST(c, previous); #else c = *previous; #endif if (c == next) return FALSE; -#ifdef SUPPORT_UTF8 - if (utf8) +#ifdef SUPPORT_UTF + if (utf) { unsigned int othercase; if (next < 128) othercase = cd->fcc[next]; else @@ -2948,8 +3066,8 @@ if (next >= 0) switch(op_code) return (unsigned int)c != othercase; } else -#endif /* SUPPORT_UTF8 */ - return (c != cd->fcc[next]); /* Non-UTF-8 mode */ +#endif /* SUPPORT_UTF */ + return (c != TABLE_GET((unsigned int)next, cd->fcc, next)); /* Non-UTF-8 mode */ /* For OP_NOT and OP_NOTI, the data is always a single-byte character. 
These opcodes are not used for multi-byte characters, because they are coded using @@ -2960,8 +3078,8 @@ if (next >= 0) switch(op_code) case OP_NOTI: if ((c = *previous) == next) return TRUE; -#ifdef SUPPORT_UTF8 - if (utf8) +#ifdef SUPPORT_UTF + if (utf) { unsigned int othercase; if (next < 128) othercase = cd->fcc[next]; else @@ -2973,8 +3091,8 @@ if (next >= 0) switch(op_code) return (unsigned int)c == othercase; } else -#endif /* SUPPORT_UTF8 */ - return (c == cd->fcc[next]); /* Non-UTF-8 mode */ +#endif /* SUPPORT_UTF */ + return (c == (int)(TABLE_GET((unsigned int)next, cd->fcc, next))); /* Non-UTF-8 mode */ /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */ @@ -3065,7 +3183,7 @@ switch(op_code) { case OP_CHAR: case OP_CHARI: -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF GETCHARTEST(c, previous); #else c = *previous; @@ -3170,7 +3288,7 @@ switch(op_code) to the original \d etc. At this point, ptr will point to a zero byte. */ if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK || - strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0) + STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0) return FALSE; /* Do the property check. */ @@ -3248,8 +3366,8 @@ Arguments: codeptr points to the pointer to the current code point ptrptr points to the current pattern pointer errorcodeptr points to error code variable - firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE) - reqbyteptr set to the last literal character required, else < 0 + firstcharptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE) + reqcharptr set to the last literal character required, else < 0 bcptr points to current branch chain cond_depth conditional nesting depth cd contains pointers to tables etc. 
@@ -3261,47 +3379,54 @@ Returns: TRUE on success */ static BOOL -compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr, - int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, - int cond_depth, compile_data *cd, int *lengthptr) +compile_branch(int *optionsptr, pcre_uchar **codeptr, + const pcre_uchar **ptrptr, int *errorcodeptr, pcre_int32 *firstcharptr, + pcre_int32 *reqcharptr, branch_chain *bcptr, int cond_depth, + compile_data *cd, int *lengthptr) { int repeat_type, op_type; int repeat_min = 0, repeat_max = 0; /* To please picky compilers */ int bravalue = 0; int greedy_default, greedy_non_default; -int firstbyte, reqbyte; -int zeroreqbyte, zerofirstbyte; -int req_caseopt, reqvary, tempreqvary; +pcre_int32 firstchar, reqchar; +pcre_int32 zeroreqchar, zerofirstchar; +pcre_int32 req_caseopt, reqvary, tempreqvary; int options = *optionsptr; /* May change dynamically */ int after_manual_callout = 0; int length_prevgroup = 0; register int c; -register uschar *code = *codeptr; -uschar *last_code = code; -uschar *orig_code = code; -uschar *tempcode; +register pcre_uchar *code = *codeptr; +pcre_uchar *last_code = code; +pcre_uchar *orig_code = code; +pcre_uchar *tempcode; BOOL inescq = FALSE; -BOOL groupsetfirstbyte = FALSE; -const uschar *ptr = *ptrptr; -const uschar *tempptr; -const uschar *nestptr = NULL; -uschar *previous = NULL; -uschar *previous_callout = NULL; -uschar *save_hwm = NULL; -uschar classbits[32]; +BOOL groupsetfirstchar = FALSE; +const pcre_uchar *ptr = *ptrptr; +const pcre_uchar *tempptr; +const pcre_uchar *nestptr = NULL; +pcre_uchar *previous = NULL; +pcre_uchar *previous_callout = NULL; +pcre_uchar *save_hwm = NULL; +pcre_uint8 classbits[32]; /* We can fish out the UTF-8 setting once and for all into a BOOL, but we must not do this for other options (e.g. PCRE_EXTENDED) because they may change dynamically as we process the pattern. 
*/ -#ifdef SUPPORT_UTF8 -BOOL class_utf8; -BOOL utf8 = (options & PCRE_UTF8) != 0; -uschar *class_utf8data; -uschar *class_utf8data_base; -uschar utf8_char[6]; +#ifdef SUPPORT_UTF +/* PCRE_UTF16 has the same value as PCRE_UTF8. */ +BOOL utf = (options & PCRE_UTF8) != 0; +pcre_uchar utf_chars[6]; #else -BOOL utf8 = FALSE; +BOOL utf = FALSE; +#endif + +/* Helper variables for OP_XCLASS opcode (for characters > 255). */ + +#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 +BOOL xclass; +pcre_uchar *class_uchardata; +pcre_uchar *class_uchardata_base; #endif #ifdef PCRE_DEBUG @@ -3315,22 +3440,23 @@ greedy_non_default = greedy_default ^ 1; /* Initialize no first byte, no required byte. REQ_UNSET means "no char matching encountered yet". It gets changed to REQ_NONE if we hit something that -matches a non-fixed char first char; reqbyte just remains unset if we never +matches a non-fixed char first char; reqchar just remains unset if we never find one. When we hit a repeat whose minimum is zero, we may have to adjust these values to take the zero repeat into account. This is implemented by setting them to -zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual +zerofirstbyte and zeroreqchar when such a repeat is encountered. The individual item types that can be repeated set these backoff variables appropriately. */ -firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET; +firstchar = reqchar = zerofirstchar = zeroreqchar = REQ_UNSET; -/* The variable req_caseopt contains either the REQ_CASELESS value or zero, -according to the current setting of the caseless flag. REQ_CASELESS is a bit -value > 255. It is added into the firstbyte or reqbyte variables to record the -case status of the value. This is used only for ASCII characters. */ +/* The variable req_caseopt contains either the REQ_CASELESS value +or zero, according to the current setting of the caseless flag. The +REQ_CASELESS leaves the lower 28 bit empty. 
It is added into the +firstchar or reqchar variables to record the case status of the +value. This is used only for ASCII characters. */ -req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0; +req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0; /* Switch on next character until the end of the branch */ @@ -3342,20 +3468,20 @@ for (;; ptr++) BOOL is_quantifier; BOOL is_recurse; BOOL reset_bracount; - int class_charcount; - int class_lastchar; + int class_has_8bitchar; + int class_single_char; int newoptions; int recno; int refsign; int skipbytes; - int subreqbyte; - int subfirstbyte; + int subreqchar; + int subfirstchar; int terminator; int mclength; int tempbracount; - uschar mcbuffer[8]; + pcre_uchar mcbuffer[8]; - /* Get next byte in the pattern */ + /* Get next character in the pattern */ c = *ptr; @@ -3401,8 +3527,8 @@ for (;; ptr++) } *lengthptr += (int)(code - last_code); - DPRINTF(("length=%d added %d c=%c\n", *lengthptr, (int)(code - last_code), - c)); + DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr, + (int)(code - last_code), c, c)); /* If "previous" is set and it is not at the start of the work space, move it back to there, in order to avoid filling up the work space. 
Otherwise, @@ -3412,7 +3538,7 @@ for (;; ptr++) { if (previous > orig_code) { - memmove(orig_code, previous, code - previous); + memmove(orig_code, previous, IN_UCHARS(code - previous)); code -= previous - orig_code; previous = orig_code; } @@ -3481,7 +3607,7 @@ for (;; ptr++) if ((options & PCRE_EXTENDED) != 0) { - if ((cd->ctypes[c] & ctype_space) != 0) continue; + if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue; if (c == CHAR_NUMBER_SIGN) { ptr++; @@ -3489,8 +3615,8 @@ for (;; ptr++) { if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; } ptr++; -#ifdef SUPPORT_UTF8 - if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++; +#ifdef SUPPORT_UTF + if (utf) FORWARDCHAR(ptr); #endif } if (*ptr != 0) continue; @@ -3514,8 +3640,8 @@ for (;; ptr++) case 0: /* The branch terminates at string end */ case CHAR_VERTICAL_LINE: /* or | or ) */ case CHAR_RIGHT_PARENTHESIS: - *firstbyteptr = firstbyte; - *reqbyteptr = reqbyte; + *firstcharptr = firstchar; + *reqcharptr = reqchar; *codeptr = code; *ptrptr = ptr; if (lengthptr != NULL) @@ -3539,7 +3665,7 @@ for (;; ptr++) previous = NULL; if ((options & PCRE_MULTILINE) != 0) { - if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; + if (firstchar == REQ_UNSET) firstchar = REQ_NONE; *code++ = OP_CIRCM; } else *code++ = OP_CIRC; @@ -3551,12 +3677,12 @@ for (;; ptr++) break; /* There can never be a first char if '.' is first, whatever happens about - repeats. The value of reqbyte doesn't change either. */ + repeats. The value of reqchar doesn't change either. */ case CHAR_DOT: - if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; - zerofirstbyte = firstbyte; - zeroreqbyte = reqbyte; + if (firstchar == REQ_UNSET) firstchar = REQ_NONE; + zerofirstchar = firstchar; + zeroreqchar = reqchar; previous = code; *code++ = ((options & PCRE_DOTALL) != 0)? 
OP_ALLANY: OP_ANY; break; @@ -3611,8 +3737,7 @@ for (;; ptr++) { if (ptr[1] == CHAR_E) ptr++; - else if (strncmp((const char *)ptr+1, - STR_Q STR_BACKSLASH STR_E, 3) == 0) + else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0) ptr += 3; else break; @@ -3631,8 +3756,8 @@ for (;; ptr++) (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0) { *code++ = negate_class? OP_ALLANY : OP_FAIL; - if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; - zerofirstbyte = firstbyte; + if (firstchar == REQ_UNSET) firstchar = REQ_NONE; + zerofirstchar = firstchar; break; } @@ -3642,24 +3767,25 @@ for (;; ptr++) should_flip_negation = FALSE; - /* Keep a count of chars with values < 256 so that we can optimize the case - of just a single character (as long as it's < 256). However, For higher - valued UTF-8 characters, we don't yet do any optimization. */ + /* For optimization purposes, we track some properties of the class. + class_has_8bitchar will be non-zero, if the class contains at least one + < 256 character. class_single_char will be 1 if the class contains only + a single character. */ - class_charcount = 0; - class_lastchar = -1; + class_has_8bitchar = 0; + class_single_char = 0; /* Initialize the 32-char bit map to all zeros. We build the map in a temporary bit of memory, in case the class contains only 1 character (less than 256), because in that case the compiled code doesn't use the bit map. */ - memset(classbits, 0, 32 * sizeof(uschar)); + memset(classbits, 0, 32 * sizeof(pcre_uint8)); -#ifdef SUPPORT_UTF8 - class_utf8 = FALSE; /* No chars >= 256 */ - class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */ - class_utf8data_base = class_utf8data; /* For resetting in pass 1 */ +#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 + xclass = FALSE; /* No chars >= 256 */ + class_uchardata = code + LINK_SIZE + 2; /* For UTF-8 items */ + class_uchardata_base = class_uchardata; /* For resetting in pass 1 */ #endif /* Process characters until ] is reached. 
By writing this as a "do" it @@ -3668,25 +3794,26 @@ for (;; ptr++) if (c != 0) do { - const uschar *oldptr; + const pcre_uchar *oldptr; -#ifdef SUPPORT_UTF8 - if (utf8 && c > 127) +#ifdef SUPPORT_UTF + if (utf && HAS_EXTRALEN(c)) { /* Braces are required because the */ GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */ } +#endif - /* In the pre-compile phase, accumulate the length of any UTF-8 extra +#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 + /* In the pre-compile phase, accumulate the length of any extra data and reset the pointer. This is so that very large classes that - contain a zillion UTF-8 characters no longer overwrite the work space + contain a zillion > 255 characters no longer overwrite the work space (which is on the stack). */ if (lengthptr != NULL) { - *lengthptr += (int)(class_utf8data - class_utf8data_base); - class_utf8data = class_utf8data_base; + *lengthptr += class_uchardata - class_uchardata_base; + class_uchardata = class_uchardata_base; } - #endif /* Inside \Q...\E everything is literal except \E */ @@ -3714,8 +3841,8 @@ for (;; ptr++) { BOOL local_negate = FALSE; int posix_class, taboffset, tabopt; - register const uschar *cbits = cd->cbits; - uschar pbits[32]; + register const pcre_uint8 *cbits = cd->cbits; + pcre_uint8 pbits[32]; if (ptr[1] != CHAR_COLON) { @@ -3770,7 +3897,7 @@ for (;; ptr++) /* Copy in the first table (always present) */ memcpy(pbits, cbits + posix_class_maps[posix_class], - 32 * sizeof(uschar)); + 32 * sizeof(pcre_uint8)); /* If there is a second table, add or remove it as required. */ @@ -3801,16 +3928,20 @@ for (;; ptr++) for (c = 0; c < 32; c++) classbits[c] |= pbits[c]; ptr = tempptr + 1; - class_charcount = 10; /* Set > 1; assumes more than 1 per class */ + /* Every class contains at least one < 256 characters. */ + class_has_8bitchar = 1; + /* Every class contains at least two characters. 
*/ + class_single_char = 2; continue; /* End of POSIX syntax handling */ } /* Backslash may introduce a single character, or it may introduce one of the specials, which just set a flag. The sequence \b is a special case. Inside a class (and only there) it is treated as backspace. We - assume that other escapes have more than one character in them, so set - class_charcount bigger than one. Unrecognized escapes fall through and - are either treated as literal characters (by default), or are faulted if + assume that other escapes have more than one character in them, so + speculatively set both class_has_8bitchar and class_single_char bigger + than one. Unrecognized escapes fall through and are either treated + as literal characters (by default), or are faulted if PCRE_EXTRA is set. */ if (c == CHAR_BACKSLASH) @@ -3837,8 +3968,11 @@ for (;; ptr++) if (c < 0) { - register const uschar *cbits = cd->cbits; - class_charcount += 2; /* Greater than 1 is what matters */ + register const pcre_uint8 *cbits = cd->cbits; + /* Every class contains at least two < 256 characters. */ + class_has_8bitchar++; + /* Every class contains at least two characters. */ + class_single_char += 2; switch (-c) { @@ -3851,7 +3985,7 @@ for (;; ptr++) case ESC_SU: nestptr = ptr; ptr = substitutes[-c - ESC_DU] - 1; /* Just before substitute */ - class_charcount -= 2; /* Undo! */ + class_has_8bitchar--; /* Undo! 
*/ continue; #endif case ESC_d: @@ -3892,23 +4026,38 @@ for (;; ptr++) SETBIT(classbits, 0x09); /* VT */ SETBIT(classbits, 0x20); /* SPACE */ SETBIT(classbits, 0xa0); /* NSBP */ -#ifdef SUPPORT_UTF8 - if (utf8) +#ifndef COMPILE_PCRE8 + xclass = TRUE; + *class_uchardata++ = XCL_SINGLE; + *class_uchardata++ = 0x1680; + *class_uchardata++ = XCL_SINGLE; + *class_uchardata++ = 0x180e; + *class_uchardata++ = XCL_RANGE; + *class_uchardata++ = 0x2000; + *class_uchardata++ = 0x200a; + *class_uchardata++ = XCL_SINGLE; + *class_uchardata++ = 0x202f; + *class_uchardata++ = XCL_SINGLE; + *class_uchardata++ = 0x205f; + *class_uchardata++ = XCL_SINGLE; + *class_uchardata++ = 0x3000; +#elif defined SUPPORT_UTF + if (utf) { - class_utf8 = TRUE; - *class_utf8data++ = XCL_SINGLE; - class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data); - *class_utf8data++ = XCL_SINGLE; - class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data); - *class_utf8data++ = XCL_RANGE; - class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data); - class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data); - *class_utf8data++ = XCL_SINGLE; - class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data); - *class_utf8data++ = XCL_SINGLE; - class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data); - *class_utf8data++ = XCL_SINGLE; - class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data); + xclass = TRUE; + *class_uchardata++ = XCL_SINGLE; + class_uchardata += PRIV(ord2utf)(0x1680, class_uchardata); + *class_uchardata++ = XCL_SINGLE; + class_uchardata += PRIV(ord2utf)(0x180e, class_uchardata); + *class_uchardata++ = XCL_RANGE; + class_uchardata += PRIV(ord2utf)(0x2000, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x200a, class_uchardata); + *class_uchardata++ = XCL_SINGLE; + class_uchardata += PRIV(ord2utf)(0x202f, class_uchardata); + *class_uchardata++ = XCL_SINGLE; + class_uchardata += PRIV(ord2utf)(0x205f, class_uchardata); + *class_uchardata++ = XCL_SINGLE; + class_uchardata += PRIV(ord2utf)(0x3000, 
class_uchardata); } #endif continue; @@ -3926,32 +4075,59 @@ for (;; ptr++) } classbits[c] |= x; } - -#ifdef SUPPORT_UTF8 - if (utf8) +#ifndef COMPILE_PCRE8 + xclass = TRUE; + *class_uchardata++ = XCL_RANGE; + *class_uchardata++ = 0x0100; + *class_uchardata++ = 0x167f; + *class_uchardata++ = XCL_RANGE; + *class_uchardata++ = 0x1681; + *class_uchardata++ = 0x180d; + *class_uchardata++ = XCL_RANGE; + *class_uchardata++ = 0x180f; + *class_uchardata++ = 0x1fff; + *class_uchardata++ = XCL_RANGE; + *class_uchardata++ = 0x200b; + *class_uchardata++ = 0x202e; + *class_uchardata++ = XCL_RANGE; + *class_uchardata++ = 0x2030; + *class_uchardata++ = 0x205e; + *class_uchardata++ = XCL_RANGE; + *class_uchardata++ = 0x2060; + *class_uchardata++ = 0x2fff; + *class_uchardata++ = XCL_RANGE; + *class_uchardata++ = 0x3001; +#ifdef SUPPORT_UTF + if (utf) + class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata); + else +#endif + *class_uchardata++ = 0xffff; +#elif defined SUPPORT_UTF + if (utf) { - class_utf8 = TRUE; - *class_utf8data++ = XCL_RANGE; - class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data); - class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data); - *class_utf8data++ = XCL_RANGE; - class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data); - class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data); - *class_utf8data++ = XCL_RANGE; - class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data); - class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data); - *class_utf8data++ = XCL_RANGE; - class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data); - class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data); - *class_utf8data++ = XCL_RANGE; - class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data); - class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data); - *class_utf8data++ = XCL_RANGE; - class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data); - class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data); - *class_utf8data++ = XCL_RANGE; - class_utf8data += 
_pcre_ord2utf8(0x3001, class_utf8data); - class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data); + xclass = TRUE; + *class_uchardata++ = XCL_RANGE; + class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x167f, class_uchardata); + *class_uchardata++ = XCL_RANGE; + class_uchardata += PRIV(ord2utf)(0x1681, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x180d, class_uchardata); + *class_uchardata++ = XCL_RANGE; + class_uchardata += PRIV(ord2utf)(0x180f, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x1fff, class_uchardata); + *class_uchardata++ = XCL_RANGE; + class_uchardata += PRIV(ord2utf)(0x200b, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x202e, class_uchardata); + *class_uchardata++ = XCL_RANGE; + class_uchardata += PRIV(ord2utf)(0x2030, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x205e, class_uchardata); + *class_uchardata++ = XCL_RANGE; + class_uchardata += PRIV(ord2utf)(0x2060, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x2fff, class_uchardata); + *class_uchardata++ = XCL_RANGE; + class_uchardata += PRIV(ord2utf)(0x3001, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata); } #endif continue; @@ -3962,13 +4138,18 @@ for (;; ptr++) SETBIT(classbits, 0x0c); /* FF */ SETBIT(classbits, 0x0d); /* CR */ SETBIT(classbits, 0x85); /* NEL */ -#ifdef SUPPORT_UTF8 - if (utf8) +#ifndef COMPILE_PCRE8 + xclass = TRUE; + *class_uchardata++ = XCL_RANGE; + *class_uchardata++ = 0x2028; + *class_uchardata++ = 0x2029; +#elif defined SUPPORT_UTF + if (utf) { - class_utf8 = TRUE; - *class_utf8data++ = XCL_RANGE; - class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data); - class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data); + xclass = TRUE; + *class_uchardata++ = XCL_RANGE; + class_uchardata += PRIV(ord2utf)(0x2028, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata); } #endif continue; @@ -3990,16 +4171,29 @@ for (;; ptr++) 
classbits[c] |= x; } -#ifdef SUPPORT_UTF8 - if (utf8) +#ifndef COMPILE_PCRE8 + xclass = TRUE; + *class_uchardata++ = XCL_RANGE; + *class_uchardata++ = 0x0100; + *class_uchardata++ = 0x2027; + *class_uchardata++ = XCL_RANGE; + *class_uchardata++ = 0x202a; +#ifdef SUPPORT_UTF + if (utf) + class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata); + else +#endif + *class_uchardata++ = 0xffff; +#elif defined SUPPORT_UTF + if (utf) { - class_utf8 = TRUE; - *class_utf8data++ = XCL_RANGE; - class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data); - class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data); - *class_utf8data++ = XCL_RANGE; - class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data); - class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data); + xclass = TRUE; + *class_uchardata++ = XCL_RANGE; + class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x2027, class_uchardata); + *class_uchardata++ = XCL_RANGE; + class_uchardata += PRIV(ord2utf)(0x202a, class_uchardata); + class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata); } #endif continue; @@ -4012,12 +4206,12 @@ for (;; ptr++) int pdata; int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); if (ptype < 0) goto FAILED; - class_utf8 = TRUE; - *class_utf8data++ = ((-c == ESC_p) != negated)? + xclass = TRUE; + *class_uchardata++ = ((-c == ESC_p) != negated)? XCL_PROP : XCL_NOTPROP; - *class_utf8data++ = ptype; - *class_utf8data++ = pdata; - class_charcount -= 2; /* Not a < 256 character */ + *class_uchardata++ = ptype; + *class_uchardata++ = pdata; + class_has_8bitchar--; /* Undo! */ continue; } #endif @@ -4031,14 +4225,15 @@ for (;; ptr++) *errorcodeptr = ERR7; goto FAILED; } - class_charcount -= 2; /* Undo the default count from above */ - c = *ptr; /* Get the final character and fall through */ + class_has_8bitchar--; /* Undo the speculative increase. */ + class_single_char -= 2; /* Undo the speculative increase. 
*/ + c = *ptr; /* Get the final character and fall through */ break; } } /* Fall through if we have a single character (c >= 0). This may be - greater than 256 in UTF-8 mode. */ + greater than 256. */ } /* End of backslash handling */ @@ -4086,8 +4281,8 @@ for (;; ptr++) goto LONE_SINGLE_CHARACTER; } -#ifdef SUPPORT_UTF8 - if (utf8) +#ifdef SUPPORT_UTF + if (utf) { /* Braces are required because the */ GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */ } @@ -4131,22 +4326,36 @@ for (;; ptr++) if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF; + /* Since we found a character range, single character optimizations + cannot be done anymore. */ + class_single_char = 2; + /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless matching, we have to use an XCLASS with extra data items. Caseless matching for characters > 127 is available only if UCP support is available. */ -#ifdef SUPPORT_UTF8 - if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127))) +#if defined SUPPORT_UTF && !(defined COMPILE_PCRE8) + if ((d > 255) || (utf && ((options & PCRE_CASELESS) != 0 && d > 127))) +#elif defined SUPPORT_UTF + if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127))) +#elif !(defined COMPILE_PCRE8) + if (d > 255) +#endif +#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) { - class_utf8 = TRUE; + xclass = TRUE; /* With UCP support, we can find the other case equivalents of the relevant characters. There may be several ranges. Optimize how they fit with the basic range. 
*/ #ifdef SUPPORT_UCP +#ifndef COMPILE_PCRE8 + if (utf && (options & PCRE_CASELESS) != 0) +#else if ((options & PCRE_CASELESS) != 0) +#endif { unsigned int occ, ocd; unsigned int cc = c; @@ -4172,14 +4381,14 @@ for (;; ptr++) if (occ == ocd) { - *class_utf8data++ = XCL_SINGLE; + *class_uchardata++ = XCL_SINGLE; } else { - *class_utf8data++ = XCL_RANGE; - class_utf8data += _pcre_ord2utf8(occ, class_utf8data); + *class_uchardata++ = XCL_RANGE; + class_uchardata += PRIV(ord2utf)(occ, class_uchardata); } - class_utf8data += _pcre_ord2utf8(ocd, class_utf8data); + class_uchardata += PRIV(ord2utf)(ocd, class_uchardata); } } #endif /* SUPPORT_UCP */ @@ -4187,33 +4396,69 @@ for (;; ptr++) /* Now record the original range, possibly modified for UCP caseless overlapping ranges. */ - *class_utf8data++ = XCL_RANGE; - class_utf8data += _pcre_ord2utf8(c, class_utf8data); - class_utf8data += _pcre_ord2utf8(d, class_utf8data); + *class_uchardata++ = XCL_RANGE; +#ifdef SUPPORT_UTF +#ifndef COMPILE_PCRE8 + if (utf) + { + class_uchardata += PRIV(ord2utf)(c, class_uchardata); + class_uchardata += PRIV(ord2utf)(d, class_uchardata); + } + else + { + *class_uchardata++ = c; + *class_uchardata++ = d; + } +#else + class_uchardata += PRIV(ord2utf)(c, class_uchardata); + class_uchardata += PRIV(ord2utf)(d, class_uchardata); +#endif +#else /* SUPPORT_UTF */ + *class_uchardata++ = c; + *class_uchardata++ = d; +#endif /* SUPPORT_UTF */ /* With UCP support, we are done. Without UCP support, there is no - caseless matching for UTF-8 characters > 127; we can use the bit map - for the smaller ones. */ + caseless matching for UTF characters > 127; we can use the bit map + for the smaller ones. 
As for 16 bit characters without UTF, we + can still use */ #ifdef SUPPORT_UCP - continue; /* With next character in the class */ -#else - if ((options & PCRE_CASELESS) == 0 || c > 127) continue; +#ifndef COMPILE_PCRE8 + if (utf) +#endif + continue; /* With next character in the class */ +#endif /* SUPPORT_UCP */ +#if defined SUPPORT_UTF && !defined(SUPPORT_UCP) && !(defined COMPILE_PCRE8) + if (utf) + { + if ((options & PCRE_CASELESS) == 0 || c > 127) continue; + /* Adjust upper limit and fall through to set up the map */ + d = 127; + } + else + { + if (c > 255) continue; + /* Adjust upper limit and fall through to set up the map */ + d = 255; + } +#elif defined SUPPORT_UTF && !defined(SUPPORT_UCP) + if ((options & PCRE_CASELESS) == 0 || c > 127) continue; /* Adjust upper limit and fall through to set up the map */ - d = 127; - -#endif /* SUPPORT_UCP */ +#else + if (c > 255) continue; + /* Adjust upper limit and fall through to set up the map */ + d = 255; +#endif /* SUPPORT_UTF && !SUPPORT_UCP && !COMPILE_PCRE8 */ } -#endif /* SUPPORT_UTF8 */ +#endif /* SUPPORT_UTF || !COMPILE_PCRE8 */ - /* We use the bit map for all cases when not in UTF-8 mode; else - ranges that lie entirely within 0-127 when there is UCP support; else - for partial ranges without UCP support. */ + /* We use the bit map for 8 bit mode, or when the characters fall + partially or entirely to [0-255] ([0-127] for UCP) ranges. */ - class_charcount += d - c + 1; - class_lastchar = d; + class_has_8bitchar = 1; /* We can save a bit of time by skipping this in the pre-compile. */ @@ -4222,7 +4467,7 @@ for (;; ptr++) classbits[c/8] |= (1 << (c&7)); if ((options & PCRE_CASELESS) != 0) { - int uc = cd->fcc[c]; /* flip case */ + int uc = cd->fcc[c]; /* flip case */ classbits[uc/8] |= (1 << (uc&7)); } } @@ -4236,41 +4481,117 @@ for (;; ptr++) LONE_SINGLE_CHARACTER: - /* Handle a character that cannot go in the bit map */ + /* Only the value of 1 matters for class_single_char. 
*/ + if (class_single_char < 2) class_single_char++; + + /* If class_charcount is 1, we saw precisely one character. As long as + there were no negated characters >= 128 and there was no use of \p or \P, + in other words, no use of any XCLASS features, we can optimize. -#ifdef SUPPORT_UTF8 - if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127))) + In UTF-8 mode, we can optimize the negative case only if there were no + characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR + operate on single-bytes characters only. This is an historical hangover. + Maybe one day we can tidy these opcodes to handle multi-byte characters. + + The optimization throws away the bit map. We turn the item into a + 1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative. + Note that OP_NOT[I] does not support multibyte characters. In the positive + case, it can cause firstchar to be set. Otherwise, there can be no first + char if this item is first, whatever repeat count may follow. In the case + of reqchar, save the previous value for reinstating. */ + +#ifdef SUPPORT_UTF + if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET + && (!utf || !negate_class || c < (MAX_VALUE_FOR_SINGLE_CHAR + 1))) +#else + if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) +#endif { - class_utf8 = TRUE; - *class_utf8data++ = XCL_SINGLE; - class_utf8data += _pcre_ord2utf8(c, class_utf8data); + ptr++; + zeroreqchar = reqchar; + + /* The OP_NOT[I] opcodes work on single characters only. */ + + if (negate_class) + { + if (firstchar == REQ_UNSET) firstchar = REQ_NONE; + zerofirstchar = firstchar; + *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT; + *code++ = c; + goto NOT_CHAR; + } + + /* For a single, positive character, get the value into mcbuffer, and + then we can handle this with the normal one-character code. 
*/ + +#ifdef SUPPORT_UTF + if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR) + mclength = PRIV(ord2utf)(c, mcbuffer); + else +#endif + { + mcbuffer[0] = c; + mclength = 1; + } + goto ONE_CHAR; + } /* End of 1-char optimization */ + + /* Handle a character that cannot go in the bit map. */ + +#if defined SUPPORT_UTF && !(defined COMPILE_PCRE8) + if ((c > 255) || (utf && ((options & PCRE_CASELESS) != 0 && c > 127))) +#elif defined SUPPORT_UTF + if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127))) +#elif !(defined COMPILE_PCRE8) + if (c > 255) +#endif + +#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) + { + xclass = TRUE; + *class_uchardata++ = XCL_SINGLE; +#ifdef SUPPORT_UTF +#ifndef COMPILE_PCRE8 + /* In non 8 bit mode, we can get here even if we are not in UTF mode. */ + if (!utf) + *class_uchardata++ = c; + else +#endif + class_uchardata += PRIV(ord2utf)(c, class_uchardata); +#else /* SUPPORT_UTF */ + *class_uchardata++ = c; +#endif /* SUPPORT_UTF */ #ifdef SUPPORT_UCP +#ifdef COMPILE_PCRE8 if ((options & PCRE_CASELESS) != 0) +#else + /* In non 8 bit mode, we can get here even if we are not in UTF mode. 
*/ + if (utf && (options & PCRE_CASELESS) != 0) +#endif { unsigned int othercase; - if ((othercase = UCD_OTHERCASE(c)) != (unsigned int)c) + if ((int)(othercase = UCD_OTHERCASE(c)) != c) { - *class_utf8data++ = XCL_SINGLE; - class_utf8data += _pcre_ord2utf8(othercase, class_utf8data); + *class_uchardata++ = XCL_SINGLE; + class_uchardata += PRIV(ord2utf)(othercase, class_uchardata); } } #endif /* SUPPORT_UCP */ } else -#endif /* SUPPORT_UTF8 */ +#endif /* SUPPORT_UTF || COMPILE_PCRE16 */ /* Handle a single-byte character */ { + class_has_8bitchar = 1; classbits[c/8] |= (1 << (c&7)); if ((options & PCRE_CASELESS) != 0) { - c = cd->fcc[c]; /* flip case */ + c = cd->fcc[c]; /* flip case */ classbits[c/8] |= (1 << (c&7)); } - class_charcount++; - class_lastchar = c; } } @@ -4291,66 +4612,13 @@ for (;; ptr++) goto FAILED; } - /* If class_charcount is 1, we saw precisely one character whose value is - less than 256. As long as there were no characters >= 128 and there was no - use of \p or \P, in other words, no use of any XCLASS features, we can - optimize. - - In UTF-8 mode, we can optimize the negative case only if there were no - characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR - operate on single-bytes characters only. This is an historical hangover. - Maybe one day we can tidy these opcodes to handle multi-byte characters. - - The optimization throws away the bit map. We turn the item into a - 1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative. - Note that OP_NOT[I] does not support multibyte characters. In the positive - case, it can cause firstbyte to be set. Otherwise, there can be no first - char if this item is first, whatever repeat count may follow. In the case - of reqbyte, save the previous value for reinstating. 
*/ - -#ifdef SUPPORT_UTF8 - if (class_charcount == 1 && !class_utf8 && - (!utf8 || !negate_class || class_lastchar < 128)) -#else - if (class_charcount == 1) -#endif - { - zeroreqbyte = reqbyte; - - /* The OP_NOT[I] opcodes work on one-byte characters only. */ - - if (negate_class) - { - if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; - zerofirstbyte = firstbyte; - *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT; - *code++ = class_lastchar; - break; - } - - /* For a single, positive character, get the value into mcbuffer, and - then we can handle this with the normal one-character code. */ - -#ifdef SUPPORT_UTF8 - if (utf8 && class_lastchar > 127) - mclength = _pcre_ord2utf8(class_lastchar, mcbuffer); - else -#endif - { - mcbuffer[0] = class_lastchar; - mclength = 1; - } - goto ONE_CHAR; - } /* End of 1-char optimization */ - - /* The general case - not the one-char optimization. If this is the first - thing in the branch, there can be no first char setting, whatever the - repeat count. Any reqbyte setting must remain unchanged after any kind of - repeat. */ + /* If this is the first thing in the branch, there can be no first char + setting, whatever the repeat count. Any reqchar setting must remain + unchanged after any kind of repeat. */ - if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; - zerofirstbyte = firstbyte; - zeroreqbyte = reqbyte; + if (firstchar == REQ_UNSET) firstchar = REQ_NONE; + zerofirstchar = firstchar; + zeroreqchar = reqchar; /* If there are characters with values > 255, we have to compile an extended class, with its own opcode, unless there was a negated special @@ -4360,25 +4628,30 @@ for (;; ptr++) be listed) there are no characters < 256, we can omit the bitmap in the actual compiled code. 
*/ -#ifdef SUPPORT_UTF8 - if (class_utf8 && (!should_flip_negation || (options & PCRE_UCP) != 0)) +#ifdef SUPPORT_UTF + if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0)) +#elif !defined COMPILE_PCRE8 + if (xclass && !should_flip_negation) +#endif +#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 { - *class_utf8data++ = XCL_END; /* Marks the end of extra data */ + *class_uchardata++ = XCL_END; /* Marks the end of extra data */ *code++ = OP_XCLASS; code += LINK_SIZE; - *code = negate_class? XCL_NOT : 0; + *code = negate_class? XCL_NOT:0; /* If the map is required, move up the extra data to make room for it; otherwise just move the code pointer to the end of the extra data. */ - if (class_charcount > 0) + if (class_has_8bitchar > 0) { *code++ |= XCL_MAP; - memmove(code + 32, code, class_utf8data - code); + memmove(code + (32 / sizeof(pcre_uchar)), code, + IN_UCHARS(class_uchardata - code)); memcpy(code, classbits, 32); - code = class_utf8data + 32; + code = class_uchardata + (32 / sizeof(pcre_uchar)); } - else code = class_utf8data; + else code = class_uchardata; /* Now fill in the complete length of the item */ @@ -4394,16 +4667,14 @@ for (;; ptr++) negating it if necessary. */ *code++ = (negate_class == should_flip_negation) ? 
OP_CLASS : OP_NCLASS; - if (negate_class) - { - if (lengthptr == NULL) /* Save time in the pre-compile phase */ - for (c = 0; c < 32; c++) code[c] = ~classbits[c]; - } - else + if (lengthptr == NULL) /* Save time in the pre-compile phase */ { + if (negate_class) + for (c = 0; c < 32; c++) classbits[c] = ~classbits[c]; memcpy(code, classbits, 32); } - code += 32; + code += 32 / sizeof(pcre_uchar); + NOT_CHAR: break; @@ -4440,8 +4711,8 @@ for (;; ptr++) if (repeat_min == 0) { - firstbyte = zerofirstbyte; /* Adjust for zero repeat */ - reqbyte = zeroreqbyte; /* Ditto */ + firstchar = zerofirstchar; /* Adjust for zero repeat */ + reqchar = zeroreqchar; /* Ditto */ } /* Remember whether this is a variable length repeat */ @@ -4483,7 +4754,7 @@ for (;; ptr++) if (*previous == OP_RECURSE) { - memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE); + memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE)); *previous = OP_ONCE; PUT(previous, 1, 2 + 2*LINK_SIZE); previous[2 + 2*LINK_SIZE] = OP_KET; @@ -4506,37 +4777,36 @@ for (;; ptr++) /* If previous was a character match, abolish the item and generate a repeat item instead. If a char item has a minumum of more than one, ensure - that it is set in reqbyte - it might not be if a sequence such as x{3} is - the first thing in a branch because the x will have gone into firstbyte + that it is set in reqchar - it might not be if a sequence such as x{3} is + the first thing in a branch because the x will have gone into firstchar instead. */ if (*previous == OP_CHAR || *previous == OP_CHARI) { op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR; - /* Deal with UTF-8 characters that take up more than one byte. It's + /* Deal with UTF characters that take up more than one character. It's easier to write this out separately than try to macrify it. Use c to - hold the length of the character in bytes, plus 0x80 to flag that it's a - length rather than a small character. 
*/ + hold the length of the character in bytes, plus UTF_LENGTH to flag that + it's a length rather than a small character. */ -#ifdef SUPPORT_UTF8 - if (utf8 && (code[-1] & 0x80) != 0) +#ifdef SUPPORT_UTF + if (utf && NOT_FIRSTCHAR(code[-1])) { - uschar *lastchar = code - 1; - while((*lastchar & 0xc0) == 0x80) lastchar--; + pcre_uchar *lastchar = code - 1; + BACKCHAR(lastchar); c = (int)(code - lastchar); /* Length of UTF-8 character */ - memcpy(utf8_char, lastchar, c); /* Save the char */ - c |= 0x80; /* Flag c as a length */ + memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */ + c |= UTF_LENGTH; /* Flag c as a length */ } else -#endif - - /* Handle the case of a single byte - either with no UTF8 support, or - with UTF-8 disabled, or for a UTF-8 character < 128. */ +#endif /* SUPPORT_UTF */ + /* Handle the case of a single charater - either with no UTF support, or + with UTF disabled, or for a single character UTF character. */ { c = code[-1]; - if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt; + if (repeat_min > 1) reqchar = c | req_caseopt | cd->req_varyopt; } /* If the repetition is unlimited, it pays to see if the next thing on @@ -4546,7 +4816,7 @@ for (;; ptr++) if (!possessive_quantifier && repeat_max < 0 && - check_auto_possessive(previous, utf8, ptr + 1, options, cd)) + check_auto_possessive(previous, utf, ptr + 1, options, cd)) { repeat_type = 0; /* Force greedy */ possessive_quantifier = TRUE; @@ -4567,7 +4837,7 @@ for (;; ptr++) c = previous[1]; if (!possessive_quantifier && repeat_max < 0 && - check_auto_possessive(previous, utf8, ptr + 1, options, cd)) + check_auto_possessive(previous, utf, ptr + 1, options, cd)) { repeat_type = 0; /* Force greedy */ possessive_quantifier = TRUE; @@ -4584,14 +4854,14 @@ for (;; ptr++) else if (*previous < OP_EODN) { - uschar *oldcode; + pcre_uchar *oldcode; int prop_type, prop_value; op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */ c = *previous; if (!possessive_quantifier && 
repeat_max < 0 && - check_auto_possessive(previous, utf8, ptr + 1, options, cd)) + check_auto_possessive(previous, utf, ptr + 1, options, cd)) { repeat_type = 0; /* Force greedy */ possessive_quantifier = TRUE; @@ -4671,14 +4941,14 @@ for (;; ptr++) we have to insert the character for the previous code. For a repeated Unicode property match, there are two extra bytes that define the required property. In UTF-8 mode, long characters have their length in - c, with the 0x80 bit as a flag. */ + c, with the UTF_LENGTH bit as a flag. */ if (repeat_max < 0) { -#ifdef SUPPORT_UTF8 - if (utf8 && c >= 128) +#ifdef SUPPORT_UTF + if (utf && (c & UTF_LENGTH) != 0) { - memcpy(code, utf8_char, c & 7); + memcpy(code, utf_chars, IN_UCHARS(c & 7)); code += c & 7; } else @@ -4700,10 +4970,10 @@ for (;; ptr++) else if (repeat_max != repeat_min) { -#ifdef SUPPORT_UTF8 - if (utf8 && c >= 128) +#ifdef SUPPORT_UTF + if (utf && (c & UTF_LENGTH) != 0) { - memcpy(code, utf8_char, c & 7); + memcpy(code, utf_chars, IN_UCHARS(c & 7)); code += c & 7; } else @@ -4730,10 +5000,10 @@ for (;; ptr++) /* The character or character type itself comes last in all cases. */ -#ifdef SUPPORT_UTF8 - if (utf8 && c >= 128) +#ifdef SUPPORT_UTF + if (utf && (c & UTF_LENGTH) != 0) { - memcpy(code, utf8_char, c & 7); + memcpy(code, utf_chars, IN_UCHARS(c & 7)); code += c & 7; } else @@ -4757,7 +5027,7 @@ for (;; ptr++) else if (*previous == OP_CLASS || *previous == OP_NCLASS || -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 *previous == OP_XCLASS || #endif *previous == OP_REF || @@ -4806,8 +5076,8 @@ for (;; ptr++) { register int i; int len = (int)(code - previous); - uschar *bralink = NULL; - uschar *brazeroptr = NULL; + pcre_uchar *bralink = NULL; + pcre_uchar *brazeroptr = NULL; /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so we just ignore the repeat. 
*/ @@ -4860,8 +5130,8 @@ for (;; ptr++) if (repeat_max <= 1) /* Covers 0, 1, and unlimited */ { *code = OP_END; - adjust_recurse(previous, 1, utf8, cd, save_hwm); - memmove(previous+1, previous, len); + adjust_recurse(previous, 1, utf, cd, save_hwm); + memmove(previous + 1, previous, IN_UCHARS(len)); code++; if (repeat_max == 0) { @@ -4884,8 +5154,8 @@ for (;; ptr++) { int offset; *code = OP_END; - adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm); - memmove(previous + 2 + LINK_SIZE, previous, len); + adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm); + memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len)); code += 2 + LINK_SIZE; *previous++ = OP_BRAZERO + repeat_type; *previous++ = OP_BRA; @@ -4938,13 +5208,13 @@ for (;; ptr++) else { - if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte; + if (groupsetfirstchar && reqchar < 0) reqchar = firstchar; for (i = 1; i < repeat_min; i++) { - uschar *hc; - uschar *this_hwm = cd->hwm; - memcpy(code, previous, len); + pcre_uchar *hc; + pcre_uchar *this_hwm = cd->hwm; + memcpy(code, previous, IN_UCHARS(len)); while (cd->hwm > cd->start_workspace + cd->workspace_size - WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm)) @@ -4953,8 +5223,8 @@ for (;; ptr++) int this_offset = this_hwm - cd->start_workspace; *errorcodeptr = expand_workspace(cd); if (*errorcodeptr != 0) goto FAILED; - save_hwm = (uschar *)cd->start_workspace + save_offset; - this_hwm = (uschar *)cd->start_workspace + this_offset; + save_hwm = (pcre_uchar *)cd->start_workspace + save_offset; + this_hwm = (pcre_uchar *)cd->start_workspace + this_offset; } for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE) @@ -5006,8 +5276,8 @@ for (;; ptr++) else for (i = repeat_max - 1; i >= 0; i--) { - uschar *hc; - uschar *this_hwm = cd->hwm; + pcre_uchar *hc; + pcre_uchar *this_hwm = cd->hwm; *code++ = OP_BRAZERO + repeat_type; @@ -5023,7 +5293,7 @@ for (;; ptr++) PUTINC(code, 0, offset); } - memcpy(code, previous, len); + memcpy(code, previous, 
IN_UCHARS(len)); /* Ensure there is enough workspace for forward references before copying them. */ @@ -5035,8 +5305,8 @@ for (;; ptr++) int this_offset = this_hwm - cd->start_workspace; *errorcodeptr = expand_workspace(cd); if (*errorcodeptr != 0) goto FAILED; - save_hwm = (uschar *)cd->start_workspace + save_offset; - this_hwm = (uschar *)cd->start_workspace + this_offset; + save_hwm = (pcre_uchar *)cd->start_workspace + save_offset; + this_hwm = (pcre_uchar *)cd->start_workspace + this_offset; } for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE) @@ -5055,7 +5325,7 @@ for (;; ptr++) { int oldlinkoffset; int offset = (int)(code - bralink + 1); - uschar *bra = code - offset; + pcre_uchar *bra = code - offset; oldlinkoffset = GET(bra, 1); bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset; *code++ = OP_KET; @@ -5091,8 +5361,8 @@ for (;; ptr++) else { - uschar *ketcode = code - 1 - LINK_SIZE; - uschar *bracode = ketcode - GET(ketcode, 1); + pcre_uchar *ketcode = code - 1 - LINK_SIZE; + pcre_uchar *bracode = ketcode - GET(ketcode, 1); /* Convert possessive ONCE brackets to non-capturing */ @@ -5114,10 +5384,10 @@ for (;; ptr++) if (lengthptr == NULL) { - uschar *scode = bracode; + pcre_uchar *scode = bracode; do { - if (could_be_empty_branch(scode, ketcode, utf8, cd)) + if (could_be_empty_branch(scode, ketcode, utf, cd)) { *bracode += OP_SBRA - OP_BRA; break; @@ -5140,8 +5410,8 @@ for (;; ptr++) { int nlen = (int)(code - bracode); *code = OP_END; - adjust_recurse(bracode, 1 + LINK_SIZE, utf8, cd, save_hwm); - memmove(bracode + 1+LINK_SIZE, bracode, nlen); + adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm); + memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen)); code += 1 + LINK_SIZE; nlen += 1 + LINK_SIZE; *bracode = OP_BRAPOS; @@ -5210,15 +5480,16 @@ for (;; ptr++) int len; if (*tempcode == OP_TYPEEXACT) - tempcode += _pcre_OP_lengths[*tempcode] + - ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 
2 : 0); + tempcode += PRIV(OP_lengths)[*tempcode] + + ((tempcode[1 + IMM2_SIZE] == OP_PROP + || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0); else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT) { - tempcode += _pcre_OP_lengths[*tempcode]; -#ifdef SUPPORT_UTF8 - if (utf8 && tempcode[-1] >= 0xc0) - tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f]; + tempcode += PRIV(OP_lengths)[*tempcode]; +#ifdef SUPPORT_UTF + if (utf && HAS_EXTRALEN(tempcode[-1])) + tempcode += GET_EXTRALEN(tempcode[-1]); #endif } @@ -5255,8 +5526,8 @@ for (;; ptr++) default: *code = OP_END; - adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm); - memmove(tempcode + 1+LINK_SIZE, tempcode, len); + adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm); + memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len)); code += 1 + LINK_SIZE; len += 1 + LINK_SIZE; tempcode[0] = OP_ONCE; @@ -5268,7 +5539,7 @@ for (;; ptr++) } /* In all case we no longer have a previous item. We also set the - "follows varying string" flag for subsequently encountered reqbytes if + "follows varying string" flag for subsequently encountered reqchars if it isn't already set and we have just passed a varying length item. */ END_REPEAT: @@ -5291,16 +5562,18 @@ for (;; ptr++) /* First deal with various "verbs" that can be introduced by '*'. 
*/ - if (*(++ptr) == CHAR_ASTERISK && - ((cd->ctypes[ptr[1]] & ctype_letter) != 0 || ptr[1] == ':')) + ptr++; + if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':' + || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0)))) { int i, namelen; int arglen = 0; const char *vn = verbnames; - const uschar *name = ptr + 1; - const uschar *arg = NULL; + const pcre_uchar *name = ptr + 1; + const pcre_uchar *arg = NULL; previous = NULL; - while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {}; + ptr++; + while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++; namelen = (int)(ptr - name); /* It appears that Perl allows any characters whatsoever, other than @@ -5325,7 +5598,7 @@ for (;; ptr++) for (i = 0; i < verbcount; i++) { if (namelen == verbs[i].len && - strncmp((char *)name, vn, namelen) == 0) + STRNCMP_UC_C8(name, vn, namelen) == 0) { /* Check for open captures before ACCEPT and convert it to ASSERT_ACCEPT if in an assertion. */ @@ -5346,8 +5619,8 @@ for (;; ptr++) } *code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT; - /* Do not set firstbyte after *ACCEPT */ - if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; + /* Do not set firstchar after *ACCEPT */ + if (firstchar == REQ_UNSET) firstchar = REQ_NONE; } /* Handle other cases with/without an argument */ @@ -5373,7 +5646,7 @@ for (;; ptr++) *code = verbs[i].op_arg; if (*code++ == OP_THEN_ARG) cd->external_flags |= PCRE_HASTHEN; *code++ = arglen; - memcpy(code, arg, arglen); + memcpy(code, arg, IN_UCHARS(arglen)); code += arglen; *code++ = 0; } @@ -5396,8 +5669,8 @@ for (;; ptr++) { int i, set, unset, namelen; int *optset; - const uschar *name; - uschar *slot; + const pcre_uchar *name; + pcre_uchar *slot; switch (*(++ptr)) { @@ -5450,10 +5723,10 @@ for (;; ptr++) break; /* Most other conditions use OP_CREF (a couple change to OP_RREF - below), and all need to skip 3 bytes at the start of the group. */ + below), and all need to skip 1+IMM2_SIZE bytes at the start of the group. 
*/ code[1+LINK_SIZE] = OP_CREF; - skipbytes = 3; + skipbytes = 1+IMM2_SIZE; refsign = -1; /* Check for a test for recursion in a named group. */ @@ -5486,7 +5759,7 @@ for (;; ptr++) /* We now expect to read a name; any thing else is an error */ - if ((cd->ctypes[ptr[1]] & ctype_word) == 0) + if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0) { ptr += 1; /* To get the right offset */ *errorcodeptr = ERR28; @@ -5497,11 +5770,10 @@ for (;; ptr++) recno = 0; name = ++ptr; - while ((cd->ctypes[*ptr] & ctype_word) != 0) + while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) { if (recno >= 0) - recno = ((digitab[*ptr] & ctype_digit) != 0)? - recno * 10 + *ptr - CHAR_0 : -1; + recno = (IS_DIGIT(*ptr))? recno * 10 + *ptr - CHAR_0 : -1; ptr++; } namelen = (int)(ptr - name); @@ -5549,7 +5821,7 @@ for (;; ptr++) slot = cd->name_table; for (i = 0; i < cd->names_found; i++) { - if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break; + if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break; slot += cd->name_entry_size; } @@ -5565,7 +5837,7 @@ for (;; ptr++) /* Search the pattern for a forward reference */ else if ((i = find_parens(cd, name, namelen, - (options & PCRE_EXTENDED) != 0, utf8)) > 0) + (options & PCRE_EXTENDED) != 0, utf)) > 0) { PUT2(code, 2+LINK_SIZE, i); code[1+LINK_SIZE]++; @@ -5591,7 +5863,7 @@ for (;; ptr++) recno = 0; for (i = 1; i < namelen; i++) { - if ((digitab[name[i]] & ctype_digit) == 0) + if (!IS_DIGIT(name[i])) { *errorcodeptr = ERR15; goto FAILED; @@ -5606,7 +5878,7 @@ for (;; ptr++) /* Similarly, check for the (?(DEFINE) "condition", which is always false. 
*/ - else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0) + else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0) { code[1+LINK_SIZE] = OP_DEF; skipbytes = 1; @@ -5669,7 +5941,8 @@ for (;; ptr++) break; default: /* Could be name define, else bad */ - if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME; + if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0) + goto DEFINE_NAME; ptr++; /* Correct offset for error */ *errorcodeptr = ERR24; goto FAILED; @@ -5691,8 +5964,9 @@ for (;; ptr++) *code++ = OP_CALLOUT; { int n = 0; - while ((digitab[*(++ptr)] & ctype_digit) != 0) - n = n * 10 + *ptr - CHAR_0; + ptr++; + while(IS_DIGIT(*ptr)) + n = n * 10 + *ptr++ - CHAR_0; if (*ptr != CHAR_RIGHT_PARENTHESIS) { *errorcodeptr = ERR39; @@ -5737,7 +6011,7 @@ for (;; ptr++) CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE; name = ++ptr; - while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++; + while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++; namelen = (int)(ptr - name); /* In the pre-compile phase, just do a syntax check. 
*/ @@ -5754,9 +6028,9 @@ for (;; ptr++) *errorcodeptr = ERR49; goto FAILED; } - if (namelen + 3 > cd->name_entry_size) + if (namelen + IMM2_SIZE + 1 > cd->name_entry_size) { - cd->name_entry_size = namelen + 3; + cd->name_entry_size = namelen + IMM2_SIZE + 1; if (namelen > MAX_NAME_SIZE) { *errorcodeptr = ERR48; @@ -5785,10 +6059,10 @@ for (;; ptr++) for (i = 0; i < cd->names_found; i++) { - int crc = memcmp(name, slot+2, namelen); + int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(namelen)); if (crc == 0) { - if (slot[2+namelen] == 0) + if (slot[IMM2_SIZE+namelen] == 0) { if (GET2(slot, 0) != cd->bracount + 1 && (options & PCRE_DUPNAMES) == 0) @@ -5809,7 +6083,7 @@ for (;; ptr++) if (crc < 0) { memmove(slot + cd->name_entry_size, slot, - (cd->names_found - i) * cd->name_entry_size); + IN_UCHARS((cd->names_found - i) * cd->name_entry_size)); break; } @@ -5823,7 +6097,7 @@ for (;; ptr++) if (!dupname) { - uschar *cslot = cd->name_table; + pcre_uchar *cslot = cd->name_table; for (i = 0; i < cd->names_found; i++) { if (cslot != slot) @@ -5840,8 +6114,8 @@ for (;; ptr++) } PUT2(slot, 0, cd->bracount + 1); - memcpy(slot + 2, name, namelen); - slot[2+namelen] = 0; + memcpy(slot + IMM2_SIZE, name, IN_UCHARS(namelen)); + slot[IMM2_SIZE + namelen] = 0; } } @@ -5867,7 +6141,7 @@ for (;; ptr++) NAMED_REF_OR_RECURSE: name = ++ptr; - while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++; + while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++; namelen = (int)(ptr - name); /* In the pre-compile phase, do a syntax check. 
We used to just set @@ -5879,7 +6153,7 @@ for (;; ptr++) if (lengthptr != NULL) { - const uschar *temp; + const pcre_uchar *temp; if (namelen == 0) { @@ -5909,7 +6183,7 @@ for (;; ptr++) temp = cd->end_pattern; cd->end_pattern = ptr; recno = find_parens(cd, name, namelen, - (options & PCRE_EXTENDED) != 0, utf8); + (options & PCRE_EXTENDED) != 0, utf); cd->end_pattern = temp; if (recno < 0) recno = 0; /* Forward ref; set dummy number */ } @@ -5924,8 +6198,8 @@ for (;; ptr++) slot = cd->name_table; for (i = 0; i < cd->names_found; i++) { - if (strncmp((char *)name, (char *)slot+2, namelen) == 0 && - slot[2+namelen] == 0) + if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 && + slot[IMM2_SIZE+namelen] == 0) break; slot += cd->name_entry_size; } @@ -5936,7 +6210,7 @@ for (;; ptr++) } else if ((recno = /* Forward back reference */ find_parens(cd, name, namelen, - (options & PCRE_EXTENDED) != 0, utf8)) <= 0) + (options & PCRE_EXTENDED) != 0, utf)) <= 0) { *errorcodeptr = ERR15; goto FAILED; @@ -5961,7 +6235,7 @@ for (;; ptr++) case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9: { - const uschar *called; + const pcre_uchar *called; terminator = CHAR_RIGHT_PARENTHESIS; /* Come here from the \g<...> and \g'...' 
code (Oniguruma @@ -5975,7 +6249,7 @@ for (;; ptr++) if ((refsign = *ptr) == CHAR_PLUS) { ptr++; - if ((digitab[*ptr] & ctype_digit) == 0) + if (!IS_DIGIT(*ptr)) { *errorcodeptr = ERR63; goto FAILED; @@ -5983,13 +6257,13 @@ for (;; ptr++) } else if (refsign == CHAR_MINUS) { - if ((digitab[ptr[1]] & ctype_digit) == 0) + if (!IS_DIGIT(ptr[1])) goto OTHER_CHAR_AFTER_QUERY; ptr++; } recno = 0; - while((digitab[*ptr] & ctype_digit) != 0) + while(IS_DIGIT(*ptr)) recno = recno * 10 + *ptr++ - CHAR_0; if (*ptr != terminator) @@ -6040,14 +6314,14 @@ for (;; ptr++) { *code = OP_END; if (recno != 0) - called = _pcre_find_bracket(cd->start_code, utf8, recno); + called = PRIV(find_bracket)(cd->start_code, utf, recno); /* Forward reference */ if (called == NULL) { if (find_parens(cd, NULL, recno, - (options & PCRE_EXTENDED) != 0, utf8) < 0) + (options & PCRE_EXTENDED) != 0, utf) < 0) { *errorcodeptr = ERR15; goto FAILED; @@ -6077,7 +6351,7 @@ for (;; ptr++) conditional subpatterns will be picked up then. */ else if (GET(called, 1) == 0 && cond_depth <= 0 && - could_be_empty(called, code, bcptr, utf8, cd)) + could_be_empty(called, code, bcptr, utf, cd)) { *errorcodeptr = ERR40; goto FAILED; @@ -6085,18 +6359,18 @@ for (;; ptr++) } /* Insert the recursion/subroutine item. It does not have a set first - byte (relevant if it is repeated, because it will then be wrapped - with ONCE brackets). */ + character (relevant if it is repeated, because it will then be + wrapped with ONCE brackets). */ *code = OP_RECURSE; PUT(code, 1, (int)(called - cd->start_code)); code += 1 + LINK_SIZE; - groupsetfirstbyte = FALSE; + groupsetfirstchar = FALSE; } /* Can't determine a first byte now */ - if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; + if (firstchar == REQ_UNSET) firstchar = REQ_NONE; continue; @@ -6153,7 +6427,7 @@ for (;; ptr++) both phases. If we are not at the pattern start, reset the greedy defaults and the - case value for firstbyte and reqbyte. 
*/ + case value for firstchar and reqchar. */ if (*ptr == CHAR_RIGHT_PARENTHESIS) { @@ -6166,7 +6440,7 @@ for (;; ptr++) { greedy_default = ((newoptions & PCRE_UNGREEDY) != 0); greedy_non_default = greedy_default ^ 1; - req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0; + req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS:0; } /* Change options at this level, and pass them back for use @@ -6203,7 +6477,7 @@ for (;; ptr++) NUMBERED_GROUP: cd->bracount += 1; PUT2(code, 1+LINK_SIZE, cd->bracount); - skipbytes = 2; + skipbytes = IMM2_SIZE; } /* Process nested bracketed regex. Assertions used not to be repeatable, @@ -6229,8 +6503,8 @@ for (;; ptr++) skipbytes, /* Skip over bracket number */ cond_depth + ((bravalue == OP_COND)?1:0), /* Depth of condition subpatterns */ - &subfirstbyte, /* For possible first char */ - &subreqbyte, /* For possible last char */ + &subfirstchar, /* For possible first char */ + &subreqchar, /* For possible last char */ bcptr, /* Current branch chain */ cd, /* Tables block */ (lengthptr == NULL)? NULL : /* Actual compile phase */ @@ -6258,7 +6532,7 @@ for (;; ptr++) if (bravalue == OP_COND && lengthptr == NULL) { - uschar *tc = code; + pcre_uchar *tc = code; int condcount = 0; do { @@ -6281,7 +6555,7 @@ for (;; ptr++) } /* A "normal" conditional group. If there is just one branch, we must not - make use of its firstbyte or reqbyte, because this is equivalent to an + make use of its firstchar or reqchar, because this is equivalent to an empty second branch. */ else @@ -6291,7 +6565,7 @@ for (;; ptr++) *errorcodeptr = ERR27; goto FAILED; } - if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE; + if (condcount == 1) subfirstchar = subreqchar = REQ_NONE; } } @@ -6335,55 +6609,55 @@ for (;; ptr++) /* Handle updating of the required and first characters for other types of group. Update for normal brackets of all kinds, and conditions with two branches (see code above). 
If the bracket is followed by a quantifier with - zero repeat, we have to back off. Hence the definition of zeroreqbyte and - zerofirstbyte outside the main loop so that they can be accessed for the + zero repeat, we have to back off. Hence the definition of zeroreqchar and + zerofirstchar outside the main loop so that they can be accessed for the back off. */ - zeroreqbyte = reqbyte; - zerofirstbyte = firstbyte; - groupsetfirstbyte = FALSE; + zeroreqchar = reqchar; + zerofirstchar = firstchar; + groupsetfirstchar = FALSE; if (bravalue >= OP_ONCE) { - /* If we have not yet set a firstbyte in this branch, take it from the + /* If we have not yet set a firstchar in this branch, take it from the subpattern, remembering that it was set here so that a repeat of more - than one can replicate it as reqbyte if necessary. If the subpattern has - no firstbyte, set "none" for the whole branch. In both cases, a zero - repeat forces firstbyte to "none". */ + than one can replicate it as reqchar if necessary. If the subpattern has + no firstchar, set "none" for the whole branch. In both cases, a zero + repeat forces firstchar to "none". */ - if (firstbyte == REQ_UNSET) + if (firstchar == REQ_UNSET) { - if (subfirstbyte >= 0) + if (subfirstchar >= 0) { - firstbyte = subfirstbyte; - groupsetfirstbyte = TRUE; + firstchar = subfirstchar; + groupsetfirstchar = TRUE; } - else firstbyte = REQ_NONE; - zerofirstbyte = REQ_NONE; + else firstchar = REQ_NONE; + zerofirstchar = REQ_NONE; } - /* If firstbyte was previously set, convert the subpattern's firstbyte - into reqbyte if there wasn't one, using the vary flag that was in + /* If firstchar was previously set, convert the subpattern's firstchar + into reqchar if there wasn't one, using the vary flag that was in existence beforehand. 
*/ - else if (subfirstbyte >= 0 && subreqbyte < 0) - subreqbyte = subfirstbyte | tempreqvary; + else if (subfirstchar >= 0 && subreqchar < 0) + subreqchar = subfirstchar | tempreqvary; /* If the subpattern set a required byte (or set a first byte that isn't really the first byte - see above), set it. */ - if (subreqbyte >= 0) reqbyte = subreqbyte; + if (subreqchar >= 0) reqchar = subreqchar; } - /* For a forward assertion, we take the reqbyte, if set. This can be + /* For a forward assertion, we take the reqchar, if set. This can be helpful if the pattern that follows the assertion doesn't set a different - char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte + char. For example, it's useful for /(?=abcde).+/. We can't set firstchar for an assertion, however because it leads to incorrect effect for patterns - such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead - of a firstbyte. This is overcome by a scan at the end if there's no - firstbyte, looking for an asserted first char. */ + such as /(?=a)a.+/ when the "real" "a" would then become a reqchar instead + of a firstchar. This is overcome by a scan at the end if there's no + firstchar, looking for an asserted first char. */ - else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte; + else if (bravalue == OP_ASSERT && subreqchar >= 0) reqchar = subreqchar; break; /* End of processing '(' */ @@ -6416,13 +6690,13 @@ for (;; ptr++) /* For metasequences that actually match a character, we disable the setting of a first character if it hasn't already been set. */ - if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z) - firstbyte = REQ_NONE; + if (firstchar == REQ_UNSET && -c > ESC_b && -c < ESC_Z) + firstchar = REQ_NONE; /* Set values to reset to if this is followed by a zero repeat. 
*/ - zerofirstbyte = firstbyte; - zeroreqbyte = reqbyte; + zerofirstchar = firstchar; + zeroreqchar = reqchar; /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n' is a subroutine call by number (Oniguruma syntax). In fact, the value @@ -6433,7 +6707,7 @@ for (;; ptr++) if (-c == ESC_g) { - const uschar *p; + const pcre_uchar *p; save_hwm = cd->hwm; /* Normally this is set when '(' is read */ terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)? CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE; @@ -6450,10 +6724,11 @@ for (;; ptr++) if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS) { - BOOL isnumber = TRUE; + BOOL is_a_number = TRUE; for (p = ptr + 1; *p != 0 && *p != terminator; p++) { - if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE; + if (!MAX_255(*p)) { is_a_number = FALSE; break; } + if ((cd->ctypes[*p] & ctype_digit) == 0) is_a_number = FALSE; if ((cd->ctypes[*p] & ctype_word) == 0) break; } if (*p != terminator) @@ -6461,7 +6736,7 @@ for (;; ptr++) *errorcodeptr = ERR57; break; } - if (isnumber) + if (is_a_number) { ptr++; goto HANDLE_NUMERICAL_RECURSION; @@ -6473,7 +6748,7 @@ for (;; ptr++) /* Test a signed number in angle brackets or quotes. */ p = ptr + 2; - while ((digitab[*p] & ctype_digit) != 0) p++; + while (IS_DIGIT(*p)) p++; if (*p != terminator) { *errorcodeptr = ERR57; @@ -6501,7 +6776,7 @@ for (;; ptr++) goto NAMED_REF_OR_RECURSE; } - /* Back references are handled specially; must disable firstbyte if + /* Back references are handled specially; must disable firstchar if not set to cope with cases like (?=(\w+))\1: which would otherwise set ':' later. */ @@ -6511,7 +6786,7 @@ for (;; ptr++) recno = -c - ESC_REF; HANDLE_REFERENCE: /* Come here from named backref handling */ - if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; + if (firstchar == REQ_UNSET) firstchar = REQ_NONE; previous = code; *code++ = ((options & PCRE_CASELESS) != 0)? 
OP_REFI : OP_REF; PUT2INC(code, 0, recno); @@ -6578,7 +6853,7 @@ for (;; ptr++) { previous = (-c > ESC_b && -c < ESC_Z)? code : NULL; - *code++ = (!utf8 && c == -ESC_C)? OP_ALLANY : -c; + *code++ = (!utf && c == -ESC_C)? OP_ALLANY : -c; } } continue; @@ -6588,9 +6863,9 @@ for (;; ptr++) a value > 127. We set its representation in the length/buffer, and then handle it as a data character. */ -#ifdef SUPPORT_UTF8 - if (utf8 && c > 127) - mclength = _pcre_ord2utf8(c, mcbuffer); +#ifdef SUPPORT_UTF + if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR) + mclength = PRIV(ord2utf)(c, mcbuffer); else #endif @@ -6611,12 +6886,9 @@ for (;; ptr++) mclength = 1; mcbuffer[0] = c; -#ifdef SUPPORT_UTF8 - if (utf8 && c >= 0xc0) - { - while ((ptr[1] & 0xc0) == 0x80) - mcbuffer[mclength++] = *(++ptr); - } +#ifdef SUPPORT_UTF + if (utf && HAS_EXTRALEN(c)) + ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr)); #endif /* At this point we have the character's bytes in mcbuffer, and the length @@ -6634,34 +6906,34 @@ for (;; ptr++) /* Set the first and required bytes appropriately. If no previous first byte, set it from this character, but revert to none on a zero repeat. - Otherwise, leave the firstbyte value alone, and don't change it on a zero + Otherwise, leave the firstchar value alone, and don't change it on a zero repeat. */ - if (firstbyte == REQ_UNSET) + if (firstchar == REQ_UNSET) { - zerofirstbyte = REQ_NONE; - zeroreqbyte = reqbyte; + zerofirstchar = REQ_NONE; + zeroreqchar = reqchar; - /* If the character is more than one byte long, we can set firstbyte + /* If the character is more than one byte long, we can set firstchar only if it is not to be matched caselessly. 
*/ if (mclength == 1 || req_caseopt == 0) { - firstbyte = mcbuffer[0] | req_caseopt; - if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt; + firstchar = mcbuffer[0] | req_caseopt; + if (mclength != 1) reqchar = code[-1] | cd->req_varyopt; } - else firstbyte = reqbyte = REQ_NONE; + else firstchar = reqchar = REQ_NONE; } - /* firstbyte was previously set; we can set reqbyte only if the length is + /* firstchar was previously set; we can set reqchar only if the length is 1 or the matching is caseful. */ else { - zerofirstbyte = firstbyte; - zeroreqbyte = reqbyte; + zerofirstchar = firstchar; + zeroreqchar = reqchar; if (mclength == 1 || req_caseopt == 0) - reqbyte = code[-1] | req_caseopt | cd->req_varyopt; + reqchar = code[-1] | req_caseopt | cd->req_varyopt; } break; /* End of literal character handling */ @@ -6701,8 +6973,8 @@ Arguments: reset_bracount TRUE to reset the count for each branch skipbytes skip this many bytes at start (for brackets and OP_COND) cond_depth depth of nesting for conditional subpatterns - firstbyteptr place to put the first required character, or a negative number - reqbyteptr place to put the last required character, or a negative number + firstcharptr place to put the first required character, or a negative number + reqcharptr place to put the last required character, or a negative number bcptr pointer to the chain of currently open branches cd points to the data block with tables pointers etc. 
lengthptr NULL during the real compile phase @@ -6712,20 +6984,20 @@ Returns: TRUE on success */ static BOOL -compile_regex(int options, uschar **codeptr, const uschar **ptrptr, +compile_regex(int options, pcre_uchar **codeptr, const pcre_uchar **ptrptr, int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes, - int cond_depth, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, - compile_data *cd, int *lengthptr) + int cond_depth, pcre_int32 *firstcharptr, pcre_int32 *reqcharptr, + branch_chain *bcptr, compile_data *cd, int *lengthptr) { -const uschar *ptr = *ptrptr; -uschar *code = *codeptr; -uschar *last_branch = code; -uschar *start_bracket = code; -uschar *reverse_count = NULL; +const pcre_uchar *ptr = *ptrptr; +pcre_uchar *code = *codeptr; +pcre_uchar *last_branch = code; +pcre_uchar *start_bracket = code; +pcre_uchar *reverse_count = NULL; open_capitem capitem; int capnumber = 0; -int firstbyte, reqbyte; -int branchfirstbyte, branchreqbyte; +pcre_int32 firstchar, reqchar; +pcre_int32 branchfirstchar, branchreqchar; int length; int orig_bracount; int max_bracount; @@ -6734,7 +7006,7 @@ branch_chain bc; bc.outer = bcptr; bc.current_branch = code; -firstbyte = reqbyte = REQ_UNSET; +firstchar = reqchar = REQ_UNSET; /* Accumulate the length for use in the pre-compile phase. Start with the length of the BRA and KET and any extra bytes that are required at the @@ -6793,8 +7065,8 @@ for (;;) /* Now compile the branch; in the pre-compile phase its length gets added into the length. */ - if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte, - &branchreqbyte, &bc, cond_depth, cd, + if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstchar, + &branchreqchar, &bc, cond_depth, cd, (lengthptr == NULL)? 
NULL : &length)) { *ptrptr = ptr; @@ -6810,43 +7082,43 @@ for (;;) if (lengthptr == NULL) { - /* If this is the first branch, the firstbyte and reqbyte values for the + /* If this is the first branch, the firstchar and reqchar values for the branch become the values for the regex. */ if (*last_branch != OP_ALT) { - firstbyte = branchfirstbyte; - reqbyte = branchreqbyte; + firstchar = branchfirstchar; + reqchar = branchreqchar; } - /* If this is not the first branch, the first char and reqbyte have to + /* If this is not the first branch, the first char and reqchar have to match the values from all the previous branches, except that if the - previous value for reqbyte didn't have REQ_VARY set, it can still match, + previous value for reqchar didn't have REQ_VARY set, it can still match, and we set REQ_VARY for the regex. */ else { - /* If we previously had a firstbyte, but it doesn't match the new branch, - we have to abandon the firstbyte for the regex, but if there was - previously no reqbyte, it takes on the value of the old firstbyte. */ + /* If we previously had a firstchar, but it doesn't match the new branch, + we have to abandon the firstchar for the regex, but if there was + previously no reqchar, it takes on the value of the old firstchar. */ - if (firstbyte >= 0 && firstbyte != branchfirstbyte) + if (firstchar >= 0 && firstchar != branchfirstchar) { - if (reqbyte < 0) reqbyte = firstbyte; - firstbyte = REQ_NONE; + if (reqchar < 0) reqchar = firstchar; + firstchar = REQ_NONE; } - /* If we (now or from before) have no firstbyte, a firstbyte from the - branch becomes a reqbyte if there isn't a branch reqbyte. */ + /* If we (now or from before) have no firstchar, a firstchar from the + branch becomes a reqchar if there isn't a branch reqchar. 
*/ - if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0) - branchreqbyte = branchfirstbyte; + if (firstchar < 0 && branchfirstchar >= 0 && branchreqchar < 0) + branchreqchar = branchfirstchar; - /* Now ensure that the reqbytes match */ + /* Now ensure that the reqchars match */ - if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY)) - reqbyte = REQ_NONE; - else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */ + if ((reqchar & ~REQ_VARY) != (branchreqchar & ~REQ_VARY)) + reqchar = REQ_NONE; + else reqchar |= branchreqchar; /* To "or" REQ_VARY */ } /* If lookbehind, check that this branch matches a fixed-length string, and @@ -6916,7 +7188,7 @@ for (;;) if (cd->open_caps->flag) { memmove(start_bracket + 1 + LINK_SIZE, start_bracket, - code - start_bracket); + IN_UCHARS(code - start_bracket)); *start_bracket = OP_ONCE; code += 1 + LINK_SIZE; PUT(start_bracket, 1, (int)(code - start_bracket)); @@ -6936,8 +7208,8 @@ for (;;) *codeptr = code; *ptrptr = ptr; - *firstbyteptr = firstbyte; - *reqbyteptr = reqbyte; + *firstcharptr = firstchar; + *reqcharptr = reqchar; if (lengthptr != NULL) { if (OFLOW_MAX - *lengthptr < length) @@ -7018,12 +7290,12 @@ Returns: TRUE or FALSE */ static BOOL -is_anchored(register const uschar *code, unsigned int bracket_map, +is_anchored(register const pcre_uchar *code, unsigned int bracket_map, unsigned int backref_map) { do { - const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code], - FALSE); + const pcre_uchar *scode = first_significant_code( + code + PRIV(OP_lengths)[*code], FALSE); register int op = *scode; /* Non-capturing brackets */ @@ -7095,12 +7367,12 @@ Returns: TRUE or FALSE */ static BOOL -is_startline(const uschar *code, unsigned int bracket_map, +is_startline(const pcre_uchar *code, unsigned int bracket_map, unsigned int backref_map) { do { - const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code], - FALSE); + const pcre_uchar *scode = first_significant_code( + code + 
PRIV(OP_lengths)[*code], FALSE); register int op = *scode; /* If we are at the start of a conditional assertion group, *both* the @@ -7111,7 +7383,7 @@ do { if (op == OP_COND) { scode += 1 + LINK_SIZE; - if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT]; + if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT]; switch (*scode) { case OP_CREF: @@ -7198,14 +7470,15 @@ Returns: -1 or the fixed first char */ static int -find_firstassertedchar(const uschar *code, BOOL inassert) +find_firstassertedchar(const pcre_uchar *code, BOOL inassert) { register int c = -1; do { int d; int xl = (*code == OP_CBRA || *code == OP_SCBRA || - *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? 2:0; - const uschar *scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE); + *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0; + const pcre_uchar *scode = first_significant_code(code + 1+LINK_SIZE + xl, + TRUE); register int op = *scode; switch(op) @@ -7229,7 +7502,7 @@ do { break; case OP_EXACT: - scode += 2; + scode += IMM2_SIZE; /* Fall through */ case OP_CHAR: @@ -7242,7 +7515,7 @@ do { break; case OP_EXACTI: - scode += 2; + scode += IMM2_SIZE; /* Fall through */ case OP_CHARI: @@ -7285,28 +7558,45 @@ Returns: pointer to compiled data block, or NULL on error, with errorptr and erroroffset set */ +#ifdef COMPILE_PCRE8 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION pcre_compile(const char *pattern, int options, const char **errorptr, int *erroroffset, const unsigned char *tables) +#else +PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION +pcre16_compile(PCRE_SPTR16 pattern, int options, const char **errorptr, + int *erroroffset, const unsigned char *tables) +#endif { +#ifdef COMPILE_PCRE8 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables); +#else +return pcre16_compile2(pattern, options, NULL, errorptr, erroroffset, tables); +#endif } +#ifdef COMPILE_PCRE8 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION pcre_compile2(const char *pattern, int options, int 
*errorcodeptr, const char **errorptr, int *erroroffset, const unsigned char *tables) +#else +PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION +pcre16_compile2(PCRE_SPTR16 pattern, int options, int *errorcodeptr, + const char **errorptr, int *erroroffset, const unsigned char *tables) +#endif { -real_pcre *re; +REAL_PCRE *re; int length = 1; /* For final END opcode */ -int firstbyte, reqbyte, newline; +pcre_int32 firstchar, reqchar; +int newline; int errorcode = 0; int skipatstart = 0; -BOOL utf8; +BOOL utf; size_t size; -uschar *code; -const uschar *codestart; -const uschar *ptr; +pcre_uchar *code; +const pcre_uchar *codestart; +const pcre_uchar *ptr; compile_data compile_block; compile_data *cd = &compile_block; @@ -7317,11 +7607,11 @@ this purpose. The same space is used in the second phase for remembering where to fill in forward references to subpatterns. That may overflow, in which case new memory is obtained from malloc(). */ -uschar cworkspace[COMPILE_WORK_SIZE]; +pcre_uchar cworkspace[COMPILE_WORK_SIZE]; /* Set this early so that early errors get offset 0. 
*/ -ptr = (const uschar *)pattern; +ptr = (const pcre_uchar *)pattern; /* We can't pass back an error message if errorptr is NULL; I guess the best we can do is just return NULL, but we can set a code value if there is a code @@ -7348,7 +7638,7 @@ if (erroroffset == NULL) /* Set up pointers to the individual character tables */ -if (tables == NULL) tables = _pcre_default_tables; +if (tables == NULL) tables = PRIV(default_tables); cd->lcc = tables + lcc_offset; cd->fcc = tables + fcc_offset; cd->cbits = tables + cbits_offset; @@ -7371,27 +7661,33 @@ while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS && int newnl = 0; int newbsr = 0; - if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0) +#ifdef COMPILE_PCRE8 + if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 5) == 0) { skipatstart += 7; options |= PCRE_UTF8; continue; } - else if (strncmp((char *)(ptr+skipatstart+2), STRING_UCP_RIGHTPAR, 4) == 0) +#endif +#ifdef COMPILE_PCRE16 + if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 6) == 0) + { skipatstart += 8; options |= PCRE_UTF16; continue; } +#endif + else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0) { skipatstart += 6; options |= PCRE_UCP; continue; } - else if (strncmp((char *)(ptr+skipatstart+2), STRING_NO_START_OPT_RIGHTPAR, 13) == 0) + else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0) { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; } - if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0) + if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CR_RIGHTPAR, 3) == 0) { skipatstart += 5; newnl = PCRE_NEWLINE_CR; } - else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3) == 0) + else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LF_RIGHTPAR, 3) == 0) { skipatstart += 5; newnl = PCRE_NEWLINE_LF; } - else if (strncmp((char *)(ptr+skipatstart+2), STRING_CRLF_RIGHTPAR, 5) == 0) + else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CRLF_RIGHTPAR, 5) 
== 0) { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; } - else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANY_RIGHTPAR, 4) == 0) + else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANY_RIGHTPAR, 4) == 0) { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; } - else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANYCRLF_RIGHTPAR, 8) == 0) + else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANYCRLF_RIGHTPAR, 8) == 0) { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; } - else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0) + else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0) { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; } - else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_UNICODE_RIGHTPAR, 12) == 0) + else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_UNICODE_RIGHTPAR, 12) == 0) { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; } if (newnl != 0) @@ -7401,22 +7697,27 @@ while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS && else break; } -utf8 = (options & PCRE_UTF8) != 0; +/* PCRE_UTF16 has the same value as PCRE_UTF8. */ +utf = (options & PCRE_UTF8) != 0; -/* Can't support UTF8 unless PCRE has been compiled to include the code. The -return of an error code from _pcre_valid_utf8() is a new feature, introduced in +/* Can't support UTF unless PCRE has been compiled to include the code. The +return of an error code from PRIV(valid_utf)() is a new feature, introduced in release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is not used here. 
*/ -#ifdef SUPPORT_UTF8 -if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 && - (errorcode = _pcre_valid_utf8((USPTR)pattern, -1, erroroffset)) != 0) +#ifdef SUPPORT_UTF +if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 && + (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0) { +#ifdef COMPILE_PCRE8 errorcode = ERR44; +#else + errorcode = ERR74; +#endif goto PCRE_EARLY_ERROR_RETURN2; } #else -if (utf8) +if (utf) { errorcode = ERR32; goto PCRE_EARLY_ERROR_RETURN; @@ -7492,7 +7793,10 @@ cd->backref_map = 0; /* Reflect pattern for debugging output */ DPRINTF(("------------------------------------------------------------------\n")); -DPRINTF(("%s\n", pattern)); +#ifdef PCRE_DEBUG +print_puchar(stdout, (PCRE_PUCHAR)pattern); +#endif +DPRINTF(("\n")); /* Pretend to compile the pattern while actually just accumulating the length of memory required. This behaviour is triggered by passing a non-NULL final @@ -7509,9 +7813,10 @@ cd->start_code = cworkspace; cd->hwm = cworkspace; cd->start_workspace = cworkspace; cd->workspace_size = COMPILE_WORK_SIZE; -cd->start_pattern = (const uschar *)pattern; -cd->end_pattern = (const uschar *)(pattern + strlen(pattern)); +cd->start_pattern = (const pcre_uchar *)pattern; +cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern)); cd->req_varyopt = 0; +cd->assert_depth = 0; cd->external_options = options; cd->external_flags = 0; cd->open_caps = NULL; @@ -7526,11 +7831,11 @@ ptr += skipatstart; code = cworkspace; *code = OP_BRA; (void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE, - FALSE, 0, 0, &firstbyte, &reqbyte, NULL, cd, &length); + FALSE, 0, 0, &firstchar, &reqchar, NULL, cd, &length); if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN; DPRINTF(("end pre-compile: length=%d workspace=%d\n", length, - cd->hwm - cworkspace)); + (int)(cd->hwm - cworkspace))); if (length > MAX_PATTERN_SIZE) { @@ -7543,8 +7848,8 @@ externally provided function. 
Integer overflow should no longer be possible because nowadays we limit the maximum value of cd->names_found and cd->name_entry_size. */ -size = length + sizeof(real_pcre) + cd->names_found * cd->name_entry_size; -re = (real_pcre *)(pcre_malloc)(size); +size = sizeof(REAL_PCRE) + (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar); +re = (REAL_PCRE *)(PUBL(malloc))(size); if (re == NULL) { @@ -7563,13 +7868,13 @@ re->size = (int)size; re->options = cd->external_options; re->flags = cd->external_flags; re->dummy1 = 0; -re->first_byte = 0; -re->req_byte = 0; -re->name_table_offset = sizeof(real_pcre); +re->first_char = 0; +re->req_char = 0; +re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar); re->name_entry_size = cd->name_entry_size; re->name_count = cd->names_found; re->ref_count = 0; -re->tables = (tables == _pcre_default_tables)? NULL : tables; +re->tables = (tables == PRIV(default_tables))? NULL : tables; re->nullpad = NULL; /* The starting points of the name/number translation table and of the code are @@ -7583,10 +7888,10 @@ cd->final_bracount = cd->bracount; /* Save for checking forward references */ cd->assert_depth = 0; cd->bracount = 0; cd->names_found = 0; -cd->name_table = (uschar *)re + re->name_table_offset; +cd->name_table = (pcre_uchar *)re + re->name_table_offset; codestart = cd->name_table + re->name_entry_size * re->name_count; cd->start_code = codestart; -cd->hwm = (uschar *)(cd->start_workspace); +cd->hwm = (pcre_uchar *)(cd->start_workspace); cd->req_varyopt = 0; cd->had_accept = FALSE; cd->check_lookbehind = FALSE; @@ -7596,16 +7901,16 @@ cd->open_caps = NULL; error, errorcode will be set non-zero, so we don't need to look at the result of the function here. 
*/ -ptr = (const uschar *)pattern + skipatstart; -code = (uschar *)codestart; +ptr = (const pcre_uchar *)pattern + skipatstart; +code = (pcre_uchar *)codestart; *code = OP_BRA; (void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, 0, 0, - &firstbyte, &reqbyte, NULL, cd, NULL); + &firstchar, &reqchar, NULL, cd, NULL); re->top_bracket = cd->bracount; re->top_backref = cd->top_backref; -re->flags = cd->external_flags; +re->flags = cd->external_flags | PCRE_MODE; -if (cd->had_accept) reqbyte = REQ_NONE; /* Must disable after (*ACCEPT) */ +if (cd->had_accept) reqchar = REQ_NONE; /* Must disable after (*ACCEPT) */ /* If not reached end of pattern on success, there's an excess bracket. */ @@ -7626,7 +7931,7 @@ references; optimize for them, as searching a large regex takes time. */ if (cd->hwm > cd->start_workspace) { int prev_recno = -1; - const uschar *groupptr = NULL; + const pcre_uchar *groupptr = NULL; while (errorcode == 0 && cd->hwm > cd->start_workspace) { int offset, recno; @@ -7635,18 +7940,18 @@ if (cd->hwm > cd->start_workspace) recno = GET(codestart, offset); if (recno != prev_recno) { - groupptr = _pcre_find_bracket(codestart, utf8, recno); + groupptr = PRIV(find_bracket)(codestart, utf, recno); prev_recno = recno; } if (groupptr == NULL) errorcode = ERR53; - else PUT(((uschar *)codestart), offset, (int)(groupptr - codestart)); + else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart)); } } /* If the workspace had to be expanded, free the new memory. */ if (cd->workspace_size > COMPILE_WORK_SIZE) - (pcre_free)((void *)cd->start_workspace); + (PUBL(free))((void *)cd->start_workspace); /* Give an error if there's back reference to a non-existent capturing subpattern. */ @@ -7663,21 +7968,21 @@ length, and set their lengths. */ if (cd->check_lookbehind) { - uschar *cc = (uschar *)codestart; + pcre_uchar *cc = (pcre_uchar *)codestart; /* Loop, searching for OP_REVERSE items, and process those that do not have their length set. 
(Actually, it will also re-process any that have a length of zero, but that is a pathological case, and it does no harm.) When we find one, we temporarily terminate the branch it is in while we scan it. */ - for (cc = (uschar *)_pcre_find_bracket(codestart, utf8, -1); + for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf, -1); cc != NULL; - cc = (uschar *)_pcre_find_bracket(cc, utf8, -1)) + cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf, -1)) { if (GET(cc, 1) == 0) { int fixed_length; - uschar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE); + pcre_uchar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE); int end_op = *be; *be = OP_END; fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != 0, TRUE, @@ -7700,9 +8005,9 @@ if (cd->check_lookbehind) if (errorcode != 0) { - (pcre_free)(re); + (PUBL(free))(re); PCRE_EARLY_ERROR_RETURN: - *erroroffset = (int)(ptr - (const uschar *)pattern); + *erroroffset = (int)(ptr - (const pcre_uchar *)pattern); PCRE_EARLY_ERROR_RETURN2: *errorptr = find_error_text(errorcode); if (errorcodeptr != NULL) *errorcodeptr = errorcode; @@ -7725,13 +8030,38 @@ if ((re->options & PCRE_ANCHORED) == 0) re->options |= PCRE_ANCHORED; else { - if (firstbyte < 0) - firstbyte = find_firstassertedchar(codestart, FALSE); - if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */ + if (firstchar < 0) + firstchar = find_firstassertedchar(codestart, FALSE); + if (firstchar >= 0) /* Remove caseless flag for non-caseable chars */ { - int ch = firstbyte & 255; - re->first_byte = ((firstbyte & REQ_CASELESS) != 0 && - cd->fcc[ch] == ch)? ch : firstbyte; +#ifdef COMPILE_PCRE8 + re->first_char = firstchar & 0xff; +#else +#ifdef COMPILE_PCRE16 + re->first_char = firstchar & 0xffff; +#endif +#endif + if ((firstchar & REQ_CASELESS) != 0) + { +#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8) + /* We ignore non-ASCII first chars in 8 bit mode. 
*/ + if (utf) + { + if (re->first_char < 128) + { + if (cd->fcc[re->first_char] != re->first_char) + re->flags |= PCRE_FCH_CASELESS; + } + else if (UCD_OTHERCASE(re->first_char) != re->first_char) + re->flags |= PCRE_FCH_CASELESS; + } + else +#endif + if (MAX_255(re->first_char) + && cd->fcc[re->first_char] != re->first_char) + re->flags |= PCRE_FCH_CASELESS; + } + re->flags |= PCRE_FIRSTSET; } else if (is_startline(codestart, 0, cd->backref_map)) @@ -7743,12 +8073,36 @@ if ((re->options & PCRE_ANCHORED) == 0) variable length item in the regex. Remove the caseless flag for non-caseable bytes. */ -if (reqbyte >= 0 && - ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0)) +if (reqchar >= 0 && + ((re->options & PCRE_ANCHORED) == 0 || (reqchar & REQ_VARY) != 0)) { - int ch = reqbyte & 255; - re->req_byte = ((reqbyte & REQ_CASELESS) != 0 && - cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte; +#ifdef COMPILE_PCRE8 + re->req_char = reqchar & 0xff; +#else +#ifdef COMPILE_PCRE16 + re->req_char = reqchar & 0xffff; +#endif +#endif + if ((reqchar & REQ_CASELESS) != 0) + { +#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8) + /* We ignore non-ASCII first chars in 8 bit mode. */ + if (utf) + { + if (re->req_char < 128) + { + if (cd->fcc[re->req_char] != re->req_char) + re->flags |= PCRE_RCH_CASELESS; + } + else if (UCD_OTHERCASE(re->req_char) != re->req_char) + re->flags |= PCRE_RCH_CASELESS; + } + else +#endif + if (MAX_255(re->req_char) && cd->fcc[re->req_char] != re->req_char) + re->flags |= PCRE_RCH_CASELESS; + } + re->flags |= PCRE_REQCHSET; } @@ -7763,38 +8117,46 @@ printf("Options=%08x\n", re->options); if ((re->flags & PCRE_FIRSTSET) != 0) { - int ch = re->first_byte & 255; - const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? - "" : " (caseless)"; - if (isprint(ch)) printf("First char = %c%s\n", ch, caseless); + pcre_uchar ch = re->first_char; + const char *caseless = + ((re->flags & PCRE_FCH_CASELESS) == 0)? 
"" : " (caseless)"; + if (PRINTABLE(ch)) printf("First char = %c%s\n", ch, caseless); else printf("First char = \\x%02x%s\n", ch, caseless); } if ((re->flags & PCRE_REQCHSET) != 0) { - int ch = re->req_byte & 255; - const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? - "" : " (caseless)"; - if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless); + pcre_uchar ch = re->req_char; + const char *caseless = + ((re->flags & PCRE_RCH_CASELESS) == 0)? "" : " (caseless)"; + if (PRINTABLE(ch)) printf("Req char = %c%s\n", ch, caseless); else printf("Req char = \\x%02x%s\n", ch, caseless); } -pcre_printint(re, stdout, TRUE); +#ifdef COMPILE_PCRE8 +pcre_printint((pcre *)re, stdout, TRUE); +#else +pcre16_printint((pcre *)re, stdout, TRUE); +#endif /* This check is done here in the debugging case so that the code that was compiled can be seen. */ if (code - codestart > length) { - (pcre_free)(re); + (PUBL(free))(re); *errorptr = find_error_text(ERR23); - *erroroffset = ptr - (uschar *)pattern; + *erroroffset = ptr - (pcre_uchar *)pattern; if (errorcodeptr != NULL) *errorcodeptr = ERR23; return NULL; } #endif /* PCRE_DEBUG */ +#ifdef COMPILE_PCRE8 return (pcre *)re; +#else +return (pcre16 *)re; +#endif } /* End of pcre_compile.c */ diff --git a/usr.sbin/nginx/src/pcre/pcre_exec.c b/usr.sbin/nginx/src/pcre/pcre_exec.c index 46498d590cf..b71535355c1 100644 --- a/usr.sbin/nginx/src/pcre/pcre_exec.c +++ b/usr.sbin/nginx/src/pcre/pcre_exec.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. 
Written by Philip Hazel - Copyright (c) 1997-2011 University of Cambridge + Copyright (c) 1997-2012 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -113,7 +113,7 @@ Returns: nothing */ static void -pchars(const uschar *p, int length, BOOL is_subject, match_data *md) +pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md) { unsigned int c; if (is_subject && length > md->end_subject - p) length = md->end_subject - p; @@ -144,11 +144,11 @@ Returns: < 0 if not matched, otherwise the number of subject bytes matched */ static int -match_ref(int offset, register USPTR eptr, int length, match_data *md, +match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md, BOOL caseless) { -USPTR eptr_start = eptr; -register USPTR p = md->start_subject + md->offset_vector[offset]; +PCRE_PUCHAR eptr_start = eptr; +register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset]; #ifdef PCRE_DEBUG if (eptr >= md->end_subject) @@ -173,9 +173,9 @@ ASCII characters. */ if (caseless) { -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF #ifdef SUPPORT_UCP - if (md->utf8) + if (md->utf) { /* Match characters up to the end of the reference. NOTE: the number of bytes matched may differ, because there are some characters whose upper and @@ -185,7 +185,7 @@ if (caseless) the latter. It is important, therefore, to check the length along the reference, not along the subject (earlier code did this wrong). */ - USPTR endptr = p + length; + PCRE_PUCHAR endptr = p + length; while (p < endptr) { int c, d; @@ -204,7 +204,11 @@ if (caseless) { if (eptr + length > md->end_subject) return -1; while (length-- > 0) - { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; } + { + if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1; + p++; + eptr++; + } } } @@ -307,7 +311,7 @@ argument of match(), which never changes. 
*/ #define RMATCH(ra,rb,rc,rd,re,rw)\ {\ - heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\ + heapframe *newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\ if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\ frame->Xwhere = rw; \ newframe->Xeptr = ra;\ @@ -328,7 +332,7 @@ argument of match(), which never changes. */ {\ heapframe *oldframe = frame;\ frame = oldframe->Xprevframe;\ - (pcre_stack_free)(oldframe);\ + if (oldframe != &frame_zero) (PUBL(stack_free))(oldframe);\ if (frame != NULL)\ {\ rrc = ra;\ @@ -345,24 +349,24 @@ typedef struct heapframe { /* Function arguments that may change */ - USPTR Xeptr; - const uschar *Xecode; - USPTR Xmstart; + PCRE_PUCHAR Xeptr; + const pcre_uchar *Xecode; + PCRE_PUCHAR Xmstart; int Xoffset_top; eptrblock *Xeptrb; unsigned int Xrdepth; /* Function local variables */ - USPTR Xcallpat; -#ifdef SUPPORT_UTF8 - USPTR Xcharptr; + PCRE_PUCHAR Xcallpat; +#ifdef SUPPORT_UTF + PCRE_PUCHAR Xcharptr; #endif - USPTR Xdata; - USPTR Xnext; - USPTR Xpp; - USPTR Xprev; - USPTR Xsaved_eptr; + PCRE_PUCHAR Xdata; + PCRE_PUCHAR Xnext; + PCRE_PUCHAR Xpp; + PCRE_PUCHAR Xprev; + PCRE_PUCHAR Xsaved_eptr; recursion_info Xnew_recursive; @@ -375,7 +379,7 @@ typedef struct heapframe { int Xprop_value; int Xprop_fail_result; int Xoclength; - uschar Xocchars[8]; + pcre_uchar Xocchars[6]; #endif int Xcodelink; @@ -440,7 +444,7 @@ the subject. */ /* Performance note: It might be tempting to extract commonly used fields from -the md structure (e.g. utf8, end_subject) into individual variables to improve +the md structure (e.g. utf, end_subject) into individual variables to improve performance. Tests using gcc on a SPARC disproved this; in the first case, it made performance worse. 
@@ -463,8 +467,9 @@ Returns: MATCH_MATCH if matched ) these values are >= 0 */ static int -match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart, - int offset_top, match_data *md, eptrblock *eptrb, unsigned int rdepth) +match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode, + PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb, + unsigned int rdepth) { /* These variables do not need to be preserved over recursion in this function, so they can be ordinary variables in all cases. Mark some of them with @@ -473,20 +478,22 @@ so they can be ordinary variables in all cases. Mark some of them with register int rrc; /* Returns from recursive calls */ register int i; /* Used for loops not involving calls to RMATCH() */ register unsigned int c; /* Character values not kept over RMATCH() calls */ -register BOOL utf8; /* Local copy of UTF-8 flag for speed */ +register BOOL utf; /* Local copy of UTF flag for speed */ BOOL minimize, possessive; /* Quantifier options */ BOOL caseless; int condcode; /* When recursion is not being used, all "local" variables that have to be -preserved over calls to RMATCH() are part of a "frame" which is obtained from -heap storage. Set up the top-level frame here; others are obtained from the -heap whenever RMATCH() does a "recursion". See the macro definitions above. */ +preserved over calls to RMATCH() are part of a "frame". We set up the top-level +frame on the stack here; subsequent instantiations are obtained from the heap +whenever RMATCH() does a "recursion". See the macro definitions above. Putting +the top-level on the stack rather than malloc-ing them all gives a performance +boost in many cases where there is not much "recursion". 
*/ #ifdef NO_RECURSE -heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe)); -if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY); +heapframe frame_zero; +heapframe *frame = &frame_zero; frame->Xprevframe = NULL; /* Marks the top level */ /* Copy in the original argument variables */ @@ -513,7 +520,7 @@ HEAP_RECURSE: /* Ditto for the local variables */ -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF #define charptr frame->Xcharptr #endif #define callpat frame->Xcallpat @@ -571,15 +578,15 @@ declarations can be cut out in a block. The only declarations within blocks below are for variables that do not have to be preserved over a recursive call to RMATCH(). */ -#ifdef SUPPORT_UTF8 -const uschar *charptr; +#ifdef SUPPORT_UTF +const pcre_uchar *charptr; #endif -const uschar *callpat; -const uschar *data; -const uschar *next; -USPTR pp; -const uschar *prev; -USPTR saved_eptr; +const pcre_uchar *callpat; +const pcre_uchar *data; +const pcre_uchar *next; +PCRE_PUCHAR pp; +const pcre_uchar *prev; +PCRE_PUCHAR saved_eptr; recursion_info new_recursive; @@ -592,7 +599,7 @@ int prop_type; int prop_value; int prop_fail_result; int oclength; -uschar occhars[8]; +pcre_uchar occhars[6]; #endif int codelink; @@ -608,6 +615,23 @@ int save_offset1, save_offset2, save_offset3; int stacksave[REC_STACK_SAVE_MAX]; eptrblock newptrb; + +/* There is a special fudge for calling match() in a way that causes it to +measure the size of its basic stack frame when the stack is being used for +recursion. The second argument (ecode) being NULL triggers this behaviour. It +cannot normally ever be NULL. The return is the negated value of the frame +size. */ + +if (ecode == NULL) + { + if (rdepth == 0) + return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1); + else + { + int len = (char *)&rdepth - (char *)eptr; + return (len > 0)? 
-len : len; + } + } #endif /* NO_RECURSE */ /* To save space on the stack and in the heap frame, I have doubled up on some @@ -620,6 +644,8 @@ the alternative names that are used. */ #define code_offset codelink #define condassert condition #define matched_once prev_is_word +#define foc number +#define save_mark data /* These statements are here to stop the compiler complaining about unitialized variables. */ @@ -645,10 +671,10 @@ defined). However, RMATCH isn't like a function call because it's quite a complicated macro. It has to be used in one particular way. This shouldn't, however, impact performance when true recursion is being used. */ -#ifdef SUPPORT_UTF8 -utf8 = md->utf8; /* Local copy of the flag */ +#ifdef SUPPORT_UTF +utf = md->utf; /* Local copy of the flag */ #else -utf8 = FALSE; +utf = FALSE; #endif /* First check that we haven't called match() too many times, or that we @@ -689,7 +715,7 @@ for (;;) case OP_MARK: md->nomatch_mark = ecode + 2; md->mark = NULL; /* In case previously set by assertion */ - RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md, + RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md, eptrb, RM55); if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) && md->mark == NULL) md->mark = ecode + 2; @@ -702,7 +728,7 @@ for (;;) unaltered. 
*/ else if (rrc == MATCH_SKIP_ARG && - strcmp((char *)(ecode + 2), (char *)(md->start_match_ptr)) == 0) + STRCMP_UC_UC(ecode + 2, md->start_match_ptr) == 0) { md->start_match_ptr = eptr; RRETURN(MATCH_SKIP); @@ -715,7 +741,7 @@ for (;;) /* COMMIT overrides PRUNE, SKIP, and THEN */ case OP_COMMIT: - RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, + RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb, RM52); if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG && @@ -726,7 +752,7 @@ for (;;) /* PRUNE overrides THEN */ case OP_PRUNE: - RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, + RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb, RM51); if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); RRETURN(MATCH_PRUNE); @@ -734,7 +760,7 @@ for (;;) case OP_PRUNE_ARG: md->nomatch_mark = ecode + 2; md->mark = NULL; /* In case previously set by assertion */ - RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md, + RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md, eptrb, RM56); if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) && md->mark == NULL) md->mark = ecode + 2; @@ -744,7 +770,7 @@ for (;;) /* SKIP overrides PRUNE and THEN */ case OP_SKIP: - RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, + RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb, RM53); if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN) RRETURN(rrc); @@ -758,10 +784,10 @@ for (;;) case OP_SKIP_ARG: if (md->ignore_skip_arg) { - ecode += _pcre_OP_lengths[*ecode] + ecode[1]; + ecode += PRIV(OP_lengths)[*ecode] + ecode[1]; break; } - RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md, + RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md, eptrb, RM57); if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN) RRETURN(rrc); @@ -779,7 +805,7 @@ for (;;) 
match pointer to do this. */ case OP_THEN: - RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, + RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb, RM54); if (rrc != MATCH_NOMATCH) RRETURN(rrc); md->start_match_ptr = ecode; @@ -788,7 +814,7 @@ for (;;) case OP_THEN_ARG: md->nomatch_mark = ecode + 2; md->mark = NULL; /* In case previously set by assertion */ - RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, + RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md, eptrb, RM58); if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) && md->mark == NULL) md->mark = ecode + 2; @@ -812,6 +838,7 @@ for (;;) case OP_ONCE_NC: prev = ecode; saved_eptr = eptr; + save_mark = md->mark; do { RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64); @@ -830,6 +857,7 @@ for (;;) if (rrc != MATCH_NOMATCH) RRETURN(rrc); ecode += GET(ecode,1); + md->mark = save_mark; } while (*ecode == OP_ALT); @@ -909,6 +937,7 @@ for (;;) save_offset2 = md->offset_vector[offset+1]; save_offset3 = md->offset_vector[md->offset_end - number]; save_capture_last = md->capture_last; + save_mark = md->mark; DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3)); md->offset_vector[md->offset_end - number] = @@ -917,7 +946,7 @@ for (;;) for (;;) { if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP; - RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, + RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb, RM1); if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */ @@ -945,6 +974,7 @@ for (;;) if (rrc != MATCH_NOMATCH) RRETURN(rrc); md->capture_last = save_capture_last; ecode += GET(ecode, 1); + md->mark = save_mark; if (*ecode != OP_ALT) break; } @@ -1004,13 +1034,14 @@ for (;;) else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT) { - ecode += _pcre_OP_lengths[*ecode]; + ecode += PRIV(OP_lengths)[*ecode]; goto TAIL_RECURSE; } /* In all other cases, we have to 
make another call to match(). */ - RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb, + save_mark = md->mark; + RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb, RM2); /* See comment in the code for capturing groups above about handling @@ -1028,7 +1059,7 @@ for (;;) { if (rrc == MATCH_ONCE) { - const uschar *scode = ecode; + const pcre_uchar *scode = ecode; if (*scode != OP_ONCE) /* If not at start, find it */ { while (*scode == OP_ALT) scode += GET(scode, 1); @@ -1039,6 +1070,7 @@ for (;;) RRETURN(rrc); } ecode += GET(ecode, 1); + md->mark = save_mark; if (*ecode != OP_ALT) break; } @@ -1093,7 +1125,7 @@ for (;;) md->offset_vector[md->offset_end - number] = (int)(eptr - md->start_subject); if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP; - RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, + RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb, RM63); if (rrc == MATCH_KETRPOS) { @@ -1165,7 +1197,7 @@ for (;;) for (;;) { if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP; - RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, + RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb, RM48); if (rrc == MATCH_KETRPOS) { @@ -1215,13 +1247,17 @@ for (;;) if (ecode[LINK_SIZE+1] == OP_CALLOUT) { - if (pcre_callout != NULL) + if (PUBL(callout) != NULL) { - pcre_callout_block cb; + PUBL(callout_block) cb; cb.version = 2; /* Version 1 of the callout block */ cb.callout_number = ecode[LINK_SIZE+2]; cb.offset_vector = md->offset_vector; +#ifdef COMPILE_PCRE8 cb.subject = (PCRE_SPTR)md->start_subject; +#else + cb.subject = (PCRE_SPTR16)md->start_subject; +#endif cb.subject_length = (int)(md->end_subject - md->start_subject); cb.start_match = (int)(mstart - md->start_subject); cb.current_position = (int)(eptr - md->start_subject); @@ -1231,10 +1267,10 @@ for (;;) cb.capture_last = md->capture_last; cb.callout_data = md->callout_data; cb.mark = md->nomatch_mark; - if 
((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH); + if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH); if (rrc < 0) RRETURN(rrc); } - ecode += _pcre_OP_lengths[OP_CALLOUT]; + ecode += PRIV(OP_lengths)[OP_CALLOUT]; } condcode = ecode[LINK_SIZE+1]; @@ -1260,7 +1296,7 @@ for (;;) if (!condition && condcode == OP_NRREF) { - uschar *slotA = md->name_table; + pcre_uchar *slotA = md->name_table; for (i = 0; i < md->name_count; i++) { if (GET2(slotA, 0) == recno) break; @@ -1273,11 +1309,11 @@ for (;;) if (i < md->name_count) { - uschar *slotB = slotA; + pcre_uchar *slotB = slotA; while (slotB > md->name_table) { slotB -= md->name_entry_size; - if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0) + if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0) { condition = GET2(slotB, 0) == md->recursive->group_num; if (condition) break; @@ -1293,7 +1329,7 @@ for (;;) for (i++; i < md->name_count; i++) { slotB += md->name_entry_size; - if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0) + if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0) { condition = GET2(slotB, 0) == md->recursive->group_num; if (condition) break; @@ -1306,7 +1342,7 @@ for (;;) /* Chose branch according to the condition */ - ecode += condition? 3 : GET(ecode, 1); + ecode += condition? 
1 + IMM2_SIZE : GET(ecode, 1); } } @@ -1323,7 +1359,7 @@ for (;;) if (!condition && condcode == OP_NCREF) { int refno = offset >> 1; - uschar *slotA = md->name_table; + pcre_uchar *slotA = md->name_table; for (i = 0; i < md->name_count; i++) { @@ -1337,11 +1373,11 @@ for (;;) if (i < md->name_count) { - uschar *slotB = slotA; + pcre_uchar *slotB = slotA; while (slotB > md->name_table) { slotB -= md->name_entry_size; - if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0) + if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0) { offset = GET2(slotB, 0) << 1; condition = offset < offset_top && @@ -1359,7 +1395,7 @@ for (;;) for (i++; i < md->name_count; i++) { slotB += md->name_entry_size; - if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0) + if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0) { offset = GET2(slotB, 0) << 1; condition = offset < offset_top && @@ -1374,7 +1410,7 @@ for (;;) /* Chose branch according to the condition */ - ecode += condition? 3 : GET(ecode, 1); + ecode += condition? 
1 + IMM2_SIZE : GET(ecode, 1); } else if (condcode == OP_DEF) /* DEFINE - always false */ @@ -1466,7 +1502,7 @@ for (;;) md->offset_vector[offset+1] = (int)(eptr - md->start_subject); if (offset_top <= offset) offset_top = offset + 2; } - ecode += 3; + ecode += 1 + IMM2_SIZE; break; @@ -1513,6 +1549,7 @@ for (;;) case OP_ASSERT: case OP_ASSERTBACK: + save_mark = md->mark; if (md->match_function_type == MATCH_CONDASSERT) { condassert = TRUE; @@ -1534,6 +1571,7 @@ for (;;) if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); ecode += GET(ecode, 1); + md->mark = save_mark; } while (*ecode == OP_ALT); @@ -1557,6 +1595,7 @@ for (;;) case OP_ASSERT_NOT: case OP_ASSERTBACK_NOT: + save_mark = md->mark; if (md->match_function_type == MATCH_CONDASSERT) { condassert = TRUE; @@ -1567,6 +1606,7 @@ for (;;) do { RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5); + md->mark = save_mark; if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH); if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT) { @@ -1593,8 +1633,8 @@ for (;;) back a number of characters, not bytes. */ case OP_REVERSE: -#ifdef SUPPORT_UTF8 - if (utf8) +#ifdef SUPPORT_UTF + if (utf) { i = GET(ecode, 1); while (i-- > 0) @@ -1625,13 +1665,17 @@ for (;;) function is able to force a failure. 
*/ case OP_CALLOUT: - if (pcre_callout != NULL) + if (PUBL(callout) != NULL) { - pcre_callout_block cb; + PUBL(callout_block) cb; cb.version = 2; /* Version 1 of the callout block */ cb.callout_number = ecode[1]; cb.offset_vector = md->offset_vector; +#ifdef COMPILE_PCRE8 cb.subject = (PCRE_SPTR)md->start_subject; +#else + cb.subject = (PCRE_SPTR16)md->start_subject; +#endif cb.subject_length = (int)(md->end_subject - md->start_subject); cb.start_match = (int)(mstart - md->start_subject); cb.current_position = (int)(eptr - md->start_subject); @@ -1641,7 +1685,7 @@ for (;;) cb.capture_last = md->capture_last; cb.callout_data = md->callout_data; cb.mark = md->nomatch_mark; - if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH); + if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH); if (rrc < 0) RRETURN(rrc); } ecode += 2 + 2*LINK_SIZE; @@ -1700,7 +1744,7 @@ for (;;) else { new_recursive.offset_save = - (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int)); + (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int)); if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY); } memcpy(new_recursive.offset_save, md->offset_vector, @@ -1715,7 +1759,7 @@ for (;;) do { if (cbegroup) md->match_function_type = MATCH_CBEGROUP; - RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top, + RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top, md, eptrb, RM6); memcpy(md->offset_vector, new_recursive.offset_save, new_recursive.saved_max * sizeof(int)); @@ -1724,7 +1768,7 @@ for (;;) { DPRINTF(("Recursion matched\n")); if (new_recursive.offset_save != stacksave) - (pcre_free)(new_recursive.offset_save); + (PUBL(free))(new_recursive.offset_save); /* Set where we got to in the subject, and reset the start in case it was changed by \K. 
This *is* propagated back out of a recursion, @@ -1742,7 +1786,7 @@ for (;;) { DPRINTF(("Recursion gave error %d\n", rrc)); if (new_recursive.offset_save != stacksave) - (pcre_free)(new_recursive.offset_save); + (PUBL(free))(new_recursive.offset_save); RRETURN(rrc); } @@ -1754,7 +1798,7 @@ for (;;) DPRINTF(("Recursion didn't match\n")); md->recursive = new_recursive.prevrec; if (new_recursive.offset_save != stacksave) - (pcre_free)(new_recursive.offset_save); + (PUBL(free))(new_recursive.offset_save); RRETURN(MATCH_NOMATCH); } @@ -2066,15 +2110,15 @@ for (;;) be "non-word" characters. Remember the earliest consulted character for partial matching. */ -#ifdef SUPPORT_UTF8 - if (utf8) +#ifdef SUPPORT_UTF + if (utf) { /* Get status of previous character */ if (eptr == md->start_subject) prev_is_word = FALSE; else { - USPTR lastptr = eptr - 1; - while((*lastptr & 0xc0) == 0x80) lastptr--; + PCRE_PUCHAR lastptr = eptr - 1; + BACKCHAR(lastptr); if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr; GETCHAR(c, lastptr); #ifdef SUPPORT_UCP @@ -2139,7 +2183,8 @@ for (;;) } else #endif - prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0); + prev_is_word = MAX_255(eptr[-1]) + && ((md->ctypes[eptr[-1]] & ctype_word) != 0); } /* Get status of next character */ @@ -2162,7 +2207,8 @@ for (;;) } else #endif - cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0); + cur_is_word = MAX_255(*eptr) + && ((md->ctypes[*eptr] & ctype_word) != 0); } /* Now see if the situation is what we want */ @@ -2186,7 +2232,9 @@ for (;;) RRETURN(MATCH_NOMATCH); } eptr++; - if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; +#ifdef SUPPORT_UTF + if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); +#endif ecode++; break; @@ -2211,7 +2259,7 @@ for (;;) } GETCHARINCTEST(c, eptr); if ( -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) c < 256 && #endif (md->ctypes[c] & ctype_digit) != 0 @@ -2228,8 +2276,8 @@ for (;;) } 
GETCHARINCTEST(c, eptr); if ( -#ifdef SUPPORT_UTF8 - c >= 256 || +#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) + c > 255 || #endif (md->ctypes[c] & ctype_digit) == 0 ) @@ -2245,7 +2293,7 @@ for (;;) } GETCHARINCTEST(c, eptr); if ( -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) c < 256 && #endif (md->ctypes[c] & ctype_space) != 0 @@ -2262,8 +2310,8 @@ for (;;) } GETCHARINCTEST(c, eptr); if ( -#ifdef SUPPORT_UTF8 - c >= 256 || +#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) + c > 255 || #endif (md->ctypes[c] & ctype_space) == 0 ) @@ -2279,7 +2327,7 @@ for (;;) } GETCHARINCTEST(c, eptr); if ( -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) c < 256 && #endif (md->ctypes[c] & ctype_word) != 0 @@ -2296,8 +2344,8 @@ for (;;) } GETCHARINCTEST(c, eptr); if ( -#ifdef SUPPORT_UTF8 - c >= 256 || +#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) + c > 255 || #endif (md->ctypes[c] & ctype_word) == 0 ) @@ -2475,7 +2523,7 @@ for (;;) break; case PT_GC: - if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP)) + if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP)) RRETURN(MATCH_NOMATCH); break; @@ -2492,20 +2540,20 @@ for (;;) /* These are specials */ case PT_ALNUM: - if ((_pcre_ucp_gentype[prop->chartype] == ucp_L || - _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP)) + if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || + PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH); break; case PT_SPACE: /* Perl space */ - if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z || + if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z || c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR) == (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH); break; case PT_PXSPACE: /* POSIX space */ - if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z || + if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z || c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || c == 
CHAR_FF || c == CHAR_CR) == (op == OP_NOTPROP)) @@ -2513,8 +2561,8 @@ for (;;) break; case PT_WORD: - if ((_pcre_ucp_gentype[prop->chartype] == ucp_L || - _pcre_ucp_gentype[prop->chartype] == ucp_N || + if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || + PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE) == (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH); break; @@ -2543,7 +2591,7 @@ for (;;) while (eptr < md->end_subject) { int len = 1; - if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); } + if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } if (UCD_CATEGORY(c) != ucp_M) break; eptr += len; } @@ -2564,7 +2612,7 @@ for (;;) case OP_REFI: caseless = op == OP_REFI; offset = GET2(ecode, 1) << 1; /* Doubled ref number */ - ecode += 3; + ecode += 1 + IMM2_SIZE; /* If the reference is unset, there are two possibilities: @@ -2604,9 +2652,9 @@ for (;;) case OP_CRMINRANGE: minimize = (*ecode == OP_CRMINRANGE); min = GET2(ecode, 1); - max = GET2(ecode, 3); + max = GET2(ecode, 1 + IMM2_SIZE); if (max == 0) max = INT_MAX; - ecode += 5; + ecode += 1 + 2 * IMM2_SIZE; break; default: /* No repeat follows */ @@ -2620,9 +2668,13 @@ for (;;) } /* Handle repeated back references. If the length of the reference is - zero, just continue with the main loop. */ + zero, just continue with the main loop. If the length is negative, it + means the reference is unset in non-Java-compatible mode. If the minimum is + zero, we can continue at the same level without recursion. For any other + minimum, carrying on will result in NOMATCH. */ if (length == 0) continue; + if (length < 0 && min == 0) continue; /* First, ensure the minimum number of matches are present. We get back the length of the reference string explicitly rather than passing the @@ -2703,8 +2755,11 @@ for (;;) case OP_NCLASS: case OP_CLASS: { + /* The data variable is saved across frames, so the byte map needs to + be stored there. 
*/ +#define BYTE_MAP ((pcre_uint8 *)data) data = ecode + 1; /* Save for matching */ - ecode += 33; /* Advance past the item */ + ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */ switch (*ecode) { @@ -2725,9 +2780,9 @@ for (;;) case OP_CRMINRANGE: minimize = (*ecode == OP_CRMINRANGE); min = GET2(ecode, 1); - max = GET2(ecode, 3); + max = GET2(ecode, 1 + IMM2_SIZE); if (max == 0) max = INT_MAX; - ecode += 5; + ecode += 1 + 2 * IMM2_SIZE; break; default: /* No repeat follows */ @@ -2737,9 +2792,8 @@ for (;;) /* First, ensure the minimum number of matches are present. */ -#ifdef SUPPORT_UTF8 - /* UTF-8 mode */ - if (utf8) +#ifdef SUPPORT_UTF + if (utf) { for (i = 1; i <= min; i++) { @@ -2754,14 +2808,12 @@ for (;;) if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); } else - { - if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); - } + if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); } } else #endif - /* Not UTF-8 mode */ + /* Not UTF mode */ { for (i = 1; i <= min; i++) { @@ -2771,7 +2823,14 @@ for (;;) RRETURN(MATCH_NOMATCH); } c = *eptr++; - if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); +#ifndef COMPILE_PCRE8 + if (c > 255) + { + if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); + } + else +#endif + if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); } } @@ -2785,9 +2844,8 @@ for (;;) if (minimize) { -#ifdef SUPPORT_UTF8 - /* UTF-8 mode */ - if (utf8) +#ifdef SUPPORT_UTF + if (utf) { for (fi = min;; fi++) { @@ -2805,14 +2863,12 @@ for (;;) if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); } else - { - if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); - } + if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); } } else #endif - /* Not UTF-8 mode */ + /* Not UTF mode */ { for (fi = min;; fi++) { @@ -2825,7 +2881,14 @@ for (;;) RRETURN(MATCH_NOMATCH); } c = *eptr++; - if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); +#ifndef COMPILE_PCRE8 + if (c > 255) + { + if (op == OP_CLASS) 
RRETURN(MATCH_NOMATCH); + } + else +#endif + if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); } } /* Control never gets here */ @@ -2837,9 +2900,8 @@ for (;;) { pp = eptr; -#ifdef SUPPORT_UTF8 - /* UTF-8 mode */ - if (utf8) +#ifdef SUPPORT_UTF + if (utf) { for (i = min; i < max; i++) { @@ -2855,9 +2917,7 @@ for (;;) if (op == OP_CLASS) break; } else - { - if ((data[c/8] & (1 << (c&7))) == 0) break; - } + if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break; eptr += len; } for (;;) @@ -2870,7 +2930,7 @@ for (;;) } else #endif - /* Not UTF-8 mode */ + /* Not UTF mode */ { for (i = min; i < max; i++) { @@ -2880,7 +2940,14 @@ for (;;) break; } c = *eptr; - if ((data[c/8] & (1 << (c&7))) == 0) break; +#ifndef COMPILE_PCRE8 + if (c > 255) + { + if (op == OP_CLASS) break; + } + else +#endif + if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break; eptr++; } while (eptr >= pp) @@ -2893,6 +2960,7 @@ for (;;) RRETURN(MATCH_NOMATCH); } +#undef BYTE_MAP } /* Control never gets here */ @@ -2901,7 +2969,7 @@ for (;;) when UTF-8 mode mode is supported. Nevertheless, we may not be in UTF-8 mode, because Unicode properties are supported in non-UTF-8 mode. 
*/ -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 case OP_XCLASS: { data = ecode + 1 + LINK_SIZE; /* Save for matching */ @@ -2926,9 +2994,9 @@ for (;;) case OP_CRMINRANGE: minimize = (*ecode == OP_CRMINRANGE); min = GET2(ecode, 1); - max = GET2(ecode, 3); + max = GET2(ecode, 1 + IMM2_SIZE); if (max == 0) max = INT_MAX; - ecode += 5; + ecode += 1 + 2 * IMM2_SIZE; break; default: /* No repeat follows */ @@ -2946,7 +3014,7 @@ for (;;) RRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); - if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH); + if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH); } /* If max == min we can continue with the main loop without the @@ -2970,7 +3038,7 @@ for (;;) RRETURN(MATCH_NOMATCH); } GETCHARINCTEST(c, eptr); - if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH); + if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH); } /* Control never gets here */ } @@ -2988,8 +3056,12 @@ for (;;) SCHECK_PARTIAL(); break; } +#ifdef SUPPORT_UTF GETCHARLENTEST(c, eptr, len); - if (!_pcre_xclass(c, data)) break; +#else + c = *eptr; +#endif + if (!PRIV(xclass)(c, data, utf)) break; eptr += len; } for(;;) @@ -2997,7 +3069,9 @@ for (;;) RMATCH(eptr, ecode, offset_top, md, eptrb, RM21); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (eptr-- == pp) break; /* Stop if tried at original pos */ - if (utf8) BACKCHAR(eptr); +#ifdef SUPPORT_UTF + if (utf) BACKCHAR(eptr); +#endif } RRETURN(MATCH_NOMATCH); } @@ -3009,8 +3083,8 @@ for (;;) /* Match a single character, casefully */ case OP_CHAR: -#ifdef SUPPORT_UTF8 - if (utf8) +#ifdef SUPPORT_UTF + if (utf) { length = 1; ecode++; @@ -3024,8 +3098,7 @@ for (;;) } else #endif - - /* Non-UTF-8 mode */ + /* Not UTF mode */ { if (md->end_subject - eptr < 1) { @@ -3047,8 +3120,8 @@ for (;;) RRETURN(MATCH_NOMATCH); } -#ifdef SUPPORT_UTF8 - if (utf8) +#ifdef SUPPORT_UTF + if (utf) { length = 1; ecode++; @@ -3061,7 +3134,10 @@ for (;;) if (fc < 128) { - if (md->lcc[*ecode++] != md->lcc[*eptr++]) 
RRETURN(MATCH_NOMATCH); + if (md->lcc[fc] + != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH); + ecode++; + eptr++; } /* Otherwise we must pick up the subject character. Note that we cannot @@ -3087,11 +3163,13 @@ for (;;) } } else -#endif /* SUPPORT_UTF8 */ +#endif /* SUPPORT_UTF */ - /* Non-UTF-8 mode */ + /* Not UTF mode */ { - if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); + if (TABLE_GET(ecode[1], md->lcc, ecode[1]) + != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH); + eptr++; ecode += 2; } break; @@ -3101,7 +3179,7 @@ for (;;) case OP_EXACT: case OP_EXACTI: min = max = GET2(ecode, 1); - ecode += 3; + ecode += 1 + IMM2_SIZE; goto REPEATCHAR; case OP_POSUPTO: @@ -3116,7 +3194,7 @@ for (;;) min = 0; max = GET2(ecode, 1); minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI; - ecode += 3; + ecode += 1 + IMM2_SIZE; goto REPEATCHAR; case OP_POSSTAR: @@ -3164,8 +3242,8 @@ for (;;) /* Common code for all repeated single-character matches. */ REPEATCHAR: -#ifdef SUPPORT_UTF8 - if (utf8) +#ifdef SUPPORT_UTF + if (utf) { length = 1; charptr = ecode; @@ -3181,18 +3259,18 @@ for (;;) unsigned int othercase; if (op >= OP_STARI && /* Caseless */ (othercase = UCD_OTHERCASE(fc)) != fc) - oclength = _pcre_ord2utf8(othercase, occhars); + oclength = PRIV(ord2utf)(othercase, occhars); else oclength = 0; #endif /* SUPPORT_UCP */ for (i = 1; i <= min; i++) { if (eptr <= md->end_subject - length && - memcmp(eptr, charptr, length) == 0) eptr += length; + memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length; #ifdef SUPPORT_UCP else if (oclength > 0 && eptr <= md->end_subject - oclength && - memcmp(eptr, occhars, oclength) == 0) eptr += oclength; + memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength; #endif /* SUPPORT_UCP */ else { @@ -3211,11 +3289,11 @@ for (;;) if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max) RRETURN(MATCH_NOMATCH); if (eptr <= md->end_subject - length && - memcmp(eptr, charptr, length) == 0) 
eptr += length; + memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length; #ifdef SUPPORT_UCP else if (oclength > 0 && eptr <= md->end_subject - oclength && - memcmp(eptr, occhars, oclength) == 0) eptr += oclength; + memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength; #endif /* SUPPORT_UCP */ else { @@ -3232,11 +3310,11 @@ for (;;) for (i = min; i < max; i++) { if (eptr <= md->end_subject - length && - memcmp(eptr, charptr, length) == 0) eptr += length; + memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length; #ifdef SUPPORT_UCP else if (oclength > 0 && eptr <= md->end_subject - oclength && - memcmp(eptr, occhars, oclength) == 0) eptr += oclength; + memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength; #endif /* SUPPORT_UCP */ else { @@ -3268,14 +3346,12 @@ for (;;) value of fc will always be < 128. */ } else -#endif /* SUPPORT_UTF8 */ - - /* When not in UTF-8 mode, load a single-byte character. */ - - fc = *ecode++; +#endif /* SUPPORT_UTF */ + /* When not in UTF-8 mode, load a single-byte character. */ + fc = *ecode++; - /* The value of fc at this point is always less than 256, though we may or - may not be in UTF-8 mode. The code is duplicated for the caseless and + /* The value of fc at this point is always one character, though we may + or may not be in UTF mode. The code is duplicated for the caseless and caseful cases, for speed, since matching characters is likely to be quite common. First, ensure the minimum number of matches are present. If min = max, continue at the same level without recursing. Otherwise, if @@ -3288,7 +3364,23 @@ for (;;) if (op >= OP_STARI) /* Caseless */ { - fc = md->lcc[fc]; +#ifdef COMPILE_PCRE8 + /* fc must be < 128 if UTF is enabled. 
*/ + foc = md->fcc[fc]; +#else +#ifdef SUPPORT_UTF +#ifdef SUPPORT_UCP + if (utf && fc > 127) + foc = UCD_OTHERCASE(fc); +#else + if (utf && fc > 127) + foc = fc; +#endif /* SUPPORT_UCP */ + else +#endif /* SUPPORT_UTF */ + foc = TABLE_GET(fc, md->fcc, fc); +#endif /* COMPILE_PCRE8 */ + for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) @@ -3296,7 +3388,8 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); + if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH); + eptr++; } if (min == max) continue; if (minimize) @@ -3311,7 +3404,8 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); + if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH); + eptr++; } /* Control never gets here */ } @@ -3325,7 +3419,7 @@ for (;;) SCHECK_PARTIAL(); break; } - if (fc != md->lcc[*eptr]) break; + if (fc != *eptr && foc != *eptr) break; eptr++; } @@ -3414,11 +3508,25 @@ for (;;) GETCHARINCTEST(c, eptr); if (op == OP_NOTI) /* The caseless case */ { -#ifdef SUPPORT_UTF8 - if (c < 256) -#endif - c = md->lcc[c]; - if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH); + register unsigned int ch, och; + ch = *ecode++; +#ifdef COMPILE_PCRE8 + /* ch must be < 128 if UTF is enabled. 
*/ + och = md->fcc[ch]; +#else +#ifdef SUPPORT_UTF +#ifdef SUPPORT_UCP + if (utf && ch > 127) + och = UCD_OTHERCASE(ch); +#else + if (utf && ch > 127) + och = ch; +#endif /* SUPPORT_UCP */ + else +#endif /* SUPPORT_UTF */ + och = TABLE_GET(ch, md->fcc, ch); +#endif /* COMPILE_PCRE8 */ + if (ch == c || och == c) RRETURN(MATCH_NOMATCH); } else /* Caseful */ { @@ -3436,7 +3544,7 @@ for (;;) case OP_NOTEXACT: case OP_NOTEXACTI: min = max = GET2(ecode, 1); - ecode += 3; + ecode += 1 + IMM2_SIZE; goto REPEATNOTCHAR; case OP_NOTUPTO: @@ -3446,7 +3554,7 @@ for (;;) min = 0; max = GET2(ecode, 1); minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI; - ecode += 3; + ecode += 1 + IMM2_SIZE; goto REPEATNOTCHAR; case OP_NOTPOSSTAR: @@ -3478,7 +3586,7 @@ for (;;) possessive = TRUE; min = 0; max = GET2(ecode, 1); - ecode += 3; + ecode += 1 + IMM2_SIZE; goto REPEATNOTCHAR; case OP_NOTSTAR: @@ -3517,11 +3625,25 @@ for (;;) if (op >= OP_NOTSTARI) /* Caseless */ { - fc = md->lcc[fc]; +#ifdef COMPILE_PCRE8 + /* fc must be < 128 if UTF is enabled. 
*/ + foc = md->fcc[fc]; +#else +#ifdef SUPPORT_UTF +#ifdef SUPPORT_UCP + if (utf && fc > 127) + foc = UCD_OTHERCASE(fc); +#else + if (utf && fc > 127) + foc = fc; +#endif /* SUPPORT_UCP */ + else +#endif /* SUPPORT_UTF */ + foc = TABLE_GET(fc, md->fcc, fc); +#endif /* COMPILE_PCRE8 */ -#ifdef SUPPORT_UTF8 - /* UTF-8 mode */ - if (utf8) +#ifdef SUPPORT_UTF + if (utf) { register unsigned int d; for (i = 1; i <= min; i++) @@ -3532,14 +3654,12 @@ for (;;) RRETURN(MATCH_NOMATCH); } GETCHARINC(d, eptr); - if (d < 256) d = md->lcc[d]; - if (fc == d) RRETURN(MATCH_NOMATCH); + if (fc == d || (unsigned int) foc == d) RRETURN(MATCH_NOMATCH); } } else #endif - - /* Not UTF-8 mode */ + /* Not UTF mode */ { for (i = 1; i <= min; i++) { @@ -3548,7 +3668,8 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); + if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH); + eptr++; } } @@ -3556,9 +3677,8 @@ for (;;) if (minimize) { -#ifdef SUPPORT_UTF8 - /* UTF-8 mode */ - if (utf8) +#ifdef SUPPORT_UTF + if (utf) { register unsigned int d; for (fi = min;; fi++) @@ -3572,13 +3692,12 @@ for (;;) RRETURN(MATCH_NOMATCH); } GETCHARINC(d, eptr); - if (d < 256) d = md->lcc[d]; - if (fc == d) RRETURN(MATCH_NOMATCH); + if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH); } } else #endif - /* Not UTF-8 mode */ + /* Not UTF mode */ { for (fi = min;; fi++) { @@ -3590,7 +3709,8 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); + if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH); + eptr++; } } /* Control never gets here */ @@ -3602,9 +3722,8 @@ for (;;) { pp = eptr; -#ifdef SUPPORT_UTF8 - /* UTF-8 mode */ - if (utf8) +#ifdef SUPPORT_UTF + if (utf) { register unsigned int d; for (i = min; i < max; i++) @@ -3616,12 +3735,11 @@ for (;;) break; } GETCHARLEN(d, eptr, len); - if (d < 256) d = md->lcc[d]; - if (fc == d) break; + if (fc == d || (unsigned int)foc == d) break; 
eptr += len; } - if (possessive) continue; - for(;;) + if (possessive) continue; + for(;;) { RMATCH(eptr, ecode, offset_top, md, eptrb, RM30); if (rrc != MATCH_NOMATCH) RRETURN(rrc); @@ -3631,7 +3749,7 @@ for (;;) } else #endif - /* Not UTF-8 mode */ + /* Not UTF mode */ { for (i = min; i < max; i++) { @@ -3640,7 +3758,7 @@ for (;;) SCHECK_PARTIAL(); break; } - if (fc == md->lcc[*eptr]) break; + if (fc == *eptr || foc == *eptr) break; eptr++; } if (possessive) continue; @@ -3661,9 +3779,8 @@ for (;;) else { -#ifdef SUPPORT_UTF8 - /* UTF-8 mode */ - if (utf8) +#ifdef SUPPORT_UTF + if (utf) { register unsigned int d; for (i = 1; i <= min; i++) @@ -3679,7 +3796,7 @@ for (;;) } else #endif - /* Not UTF-8 mode */ + /* Not UTF mode */ { for (i = 1; i <= min; i++) { @@ -3696,9 +3813,8 @@ for (;;) if (minimize) { -#ifdef SUPPORT_UTF8 - /* UTF-8 mode */ - if (utf8) +#ifdef SUPPORT_UTF + if (utf) { register unsigned int d; for (fi = min;; fi++) @@ -3717,7 +3833,7 @@ for (;;) } else #endif - /* Not UTF-8 mode */ + /* Not UTF mode */ { for (fi = min;; fi++) { @@ -3741,9 +3857,8 @@ for (;;) { pp = eptr; -#ifdef SUPPORT_UTF8 - /* UTF-8 mode */ - if (utf8) +#ifdef SUPPORT_UTF + if (utf) { register unsigned int d; for (i = min; i < max; i++) @@ -3769,7 +3884,7 @@ for (;;) } else #endif - /* Not UTF-8 mode */ + /* Not UTF mode */ { for (i = min; i < max; i++) { @@ -3802,7 +3917,7 @@ for (;;) case OP_TYPEEXACT: min = max = GET2(ecode, 1); minimize = TRUE; - ecode += 3; + ecode += 1 + IMM2_SIZE; goto REPEATTYPE; case OP_TYPEUPTO: @@ -3810,7 +3925,7 @@ for (;;) min = 0; max = GET2(ecode, 1); minimize = *ecode == OP_TYPEMINUPTO; - ecode += 3; + ecode += 1 + IMM2_SIZE; goto REPEATTYPE; case OP_TYPEPOSSTAR: @@ -3838,7 +3953,7 @@ for (;;) possessive = TRUE; min = 0; max = GET2(ecode, 1); - ecode += 3; + ecode += 1 + IMM2_SIZE; goto REPEATTYPE; case OP_TYPESTAR: @@ -4045,7 +4160,7 @@ for (;;) while (eptr < md->end_subject) { int len = 1; - if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, 
len); } + if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } if (UCD_CATEGORY(c) != ucp_M) break; eptr += len; } @@ -4057,8 +4172,8 @@ for (;;) /* Handle all other cases when the coding is UTF-8 */ -#ifdef SUPPORT_UTF8 - if (utf8) switch(ctype) +#ifdef SUPPORT_UTF + if (utf) switch(ctype) { case OP_ANY: for (i = 1; i <= min; i++) @@ -4070,7 +4185,7 @@ for (;;) } if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); eptr++; - while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; + ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); } break; @@ -4083,7 +4198,7 @@ for (;;) RRETURN(MATCH_NOMATCH); } eptr++; - while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; + ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); } break; @@ -4265,8 +4380,9 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0) + if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); + eptr++; /* No need to skip more bytes - we know it's a 1-byte character */ } break; @@ -4281,7 +4397,8 @@ for (;;) } if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); - while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80); + eptr++; + ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); } break; @@ -4293,8 +4410,9 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0) + if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); + eptr++; /* No need to skip more bytes - we know it's a 1-byte character */ } break; @@ -4309,7 +4427,8 @@ for (;;) } if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0) RRETURN(MATCH_NOMATCH); - while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80); + eptr++; + ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); } break; @@ -4321,8 +4440,9 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) 
== 0) + if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_word) == 0) RRETURN(MATCH_NOMATCH); + eptr++; /* No need to skip more bytes - we know it's a 1-byte character */ } break; @@ -4332,7 +4452,7 @@ for (;;) } /* End switch(ctype) */ else -#endif /* SUPPORT_UTF8 */ +#endif /* SUPPORT_UTF */ /* Code for the non-UTF-8 case for minimum matching of operators other than OP_PROP and OP_NOTPROP. */ @@ -4392,6 +4512,10 @@ for (;;) case 0x000b: case 0x000c: case 0x0085: +#ifdef COMPILE_PCRE16 + case 0x2028: + case 0x2029: +#endif if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); break; } @@ -4412,6 +4536,24 @@ for (;;) case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ +#ifdef COMPILE_PCRE16 + case 0x1680: /* OGHAM SPACE MARK */ + case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ + case 0x2000: /* EN QUAD */ + case 0x2001: /* EM QUAD */ + case 0x2002: /* EN SPACE */ + case 0x2003: /* EM SPACE */ + case 0x2004: /* THREE-PER-EM SPACE */ + case 0x2005: /* FOUR-PER-EM SPACE */ + case 0x2006: /* SIX-PER-EM SPACE */ + case 0x2007: /* FIGURE SPACE */ + case 0x2008: /* PUNCTUATION SPACE */ + case 0x2009: /* THIN SPACE */ + case 0x200A: /* HAIR SPACE */ + case 0x202f: /* NARROW NO-BREAK SPACE */ + case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ + case 0x3000: /* IDEOGRAPHIC SPACE */ +#endif RRETURN(MATCH_NOMATCH); } } @@ -4431,6 +4573,24 @@ for (;;) case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ +#ifdef COMPILE_PCRE16 + case 0x1680: /* OGHAM SPACE MARK */ + case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ + case 0x2000: /* EN QUAD */ + case 0x2001: /* EM QUAD */ + case 0x2002: /* EN SPACE */ + case 0x2003: /* EM SPACE */ + case 0x2004: /* THREE-PER-EM SPACE */ + case 0x2005: /* FOUR-PER-EM SPACE */ + case 0x2006: /* SIX-PER-EM SPACE */ + case 0x2007: /* FIGURE SPACE */ + case 0x2008: /* PUNCTUATION SPACE */ + case 0x2009: /* THIN SPACE */ + case 0x200A: /* HAIR SPACE */ + case 0x202f: /* NARROW NO-BREAK SPACE */ + case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ + case 
0x3000: /* IDEOGRAPHIC SPACE */ +#endif break; } } @@ -4452,6 +4612,10 @@ for (;;) case 0x0c: /* FF */ case 0x0d: /* CR */ case 0x85: /* NEL */ +#ifdef COMPILE_PCRE16 + case 0x2028: /* LINE SEPARATOR */ + case 0x2029: /* PARAGRAPH SEPARATOR */ +#endif RRETURN(MATCH_NOMATCH); } } @@ -4473,6 +4637,10 @@ for (;;) case 0x0c: /* FF */ case 0x0d: /* CR */ case 0x85: /* NEL */ +#ifdef COMPILE_PCRE16 + case 0x2028: /* LINE SEPARATOR */ + case 0x2029: /* PARAGRAPH SEPARATOR */ +#endif break; } } @@ -4486,7 +4654,9 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); + if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) + RRETURN(MATCH_NOMATCH); + eptr++; } break; @@ -4498,7 +4668,9 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); + if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) + RRETURN(MATCH_NOMATCH); + eptr++; } break; @@ -4510,7 +4682,9 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); + if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) + RRETURN(MATCH_NOMATCH); + eptr++; } break; @@ -4522,7 +4696,9 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); + if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) + RRETURN(MATCH_NOMATCH); + eptr++; } break; @@ -4534,8 +4710,9 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - if ((md->ctypes[*eptr++] & ctype_word) != 0) + if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) RRETURN(MATCH_NOMATCH); + eptr++; } break; @@ -4547,8 +4724,9 @@ for (;;) SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } - if ((md->ctypes[*eptr++] & ctype_word) == 0) + if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) RRETURN(MATCH_NOMATCH); + eptr++; } break; @@ -4766,7 +4944,7 @@ 
for (;;) while (eptr < md->end_subject) { int len = 1; - if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); } + if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } if (UCD_CATEGORY(c) != ucp_M) break; eptr += len; } @@ -4775,9 +4953,8 @@ for (;;) else #endif /* SUPPORT_UCP */ -#ifdef SUPPORT_UTF8 - /* UTF-8 mode */ - if (utf8) +#ifdef SUPPORT_UTF + if (utf) { for (fi = min;; fi++) { @@ -4919,7 +5096,7 @@ for (;;) break; case OP_WHITESPACE: - if (c >= 256 || (md->ctypes[c] & ctype_space) == 0) + if (c >= 256 || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); break; @@ -4940,7 +5117,7 @@ for (;;) } else #endif - /* Not UTF-8 mode */ + /* Not UTF mode */ { for (fi = min;; fi++) { @@ -4976,6 +5153,10 @@ for (;;) case 0x000b: case 0x000c: case 0x0085: +#ifdef COMPILE_PCRE16 + case 0x2028: + case 0x2029: +#endif if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); break; } @@ -4988,6 +5169,24 @@ for (;;) case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ +#ifdef COMPILE_PCRE16 + case 0x1680: /* OGHAM SPACE MARK */ + case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ + case 0x2000: /* EN QUAD */ + case 0x2001: /* EM QUAD */ + case 0x2002: /* EN SPACE */ + case 0x2003: /* EM SPACE */ + case 0x2004: /* THREE-PER-EM SPACE */ + case 0x2005: /* FOUR-PER-EM SPACE */ + case 0x2006: /* SIX-PER-EM SPACE */ + case 0x2007: /* FIGURE SPACE */ + case 0x2008: /* PUNCTUATION SPACE */ + case 0x2009: /* THIN SPACE */ + case 0x200A: /* HAIR SPACE */ + case 0x202f: /* NARROW NO-BREAK SPACE */ + case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ + case 0x3000: /* IDEOGRAPHIC SPACE */ +#endif RRETURN(MATCH_NOMATCH); } break; @@ -4999,6 +5198,24 @@ for (;;) case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ +#ifdef COMPILE_PCRE16 + case 0x1680: /* OGHAM SPACE MARK */ + case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ + case 0x2000: /* EN QUAD */ + case 0x2001: /* EM QUAD */ + case 0x2002: /* EN SPACE */ + case 0x2003: /* EM SPACE */ + case 0x2004: /* THREE-PER-EM SPACE 
*/ + case 0x2005: /* FOUR-PER-EM SPACE */ + case 0x2006: /* SIX-PER-EM SPACE */ + case 0x2007: /* FIGURE SPACE */ + case 0x2008: /* PUNCTUATION SPACE */ + case 0x2009: /* THIN SPACE */ + case 0x200A: /* HAIR SPACE */ + case 0x202f: /* NARROW NO-BREAK SPACE */ + case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ + case 0x3000: /* IDEOGRAPHIC SPACE */ +#endif break; } break; @@ -5012,6 +5229,10 @@ for (;;) case 0x0c: /* FF */ case 0x0d: /* CR */ case 0x85: /* NEL */ +#ifdef COMPILE_PCRE16 + case 0x2028: /* LINE SEPARATOR */ + case 0x2029: /* PARAGRAPH SEPARATOR */ +#endif RRETURN(MATCH_NOMATCH); } break; @@ -5025,32 +5246,36 @@ for (;;) case 0x0c: /* FF */ case 0x0d: /* CR */ case 0x85: /* NEL */ +#ifdef COMPILE_PCRE16 + case 0x2028: /* LINE SEPARATOR */ + case 0x2029: /* PARAGRAPH SEPARATOR */ +#endif break; } break; case OP_NOT_DIGIT: - if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); + if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); break; case OP_DIGIT: - if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); + if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); break; case OP_NOT_WHITESPACE: - if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); + if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); break; case OP_WHITESPACE: - if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); + if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); break; case OP_NOT_WORDCHAR: - if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH); + if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH); break; case OP_WORDCHAR: - if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH); + if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH); break; default: @@ -5239,7 +5464,7 @@ for (;;) RMATCH(eptr, ecode, offset_top, md, eptrb, RM44); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (eptr-- 
== pp) break; /* Stop if tried at original pos */ - if (utf8) BACKCHAR(eptr); + if (utf) BACKCHAR(eptr); } } @@ -5256,13 +5481,13 @@ for (;;) SCHECK_PARTIAL(); break; } - if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); } + if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } if (UCD_CATEGORY(c) == ucp_M) break; eptr += len; while (eptr < md->end_subject) { len = 1; - if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); } + if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); } if (UCD_CATEGORY(c) != ucp_M) break; eptr += len; } @@ -5279,7 +5504,7 @@ for (;;) if (eptr-- == pp) break; /* Stop if tried at original pos */ for (;;) /* Move back over one extended */ { - if (!utf8) c = *eptr; else + if (!utf) c = *eptr; else { BACKCHAR(eptr); GETCHAR(c, eptr); @@ -5293,10 +5518,8 @@ for (;;) else #endif /* SUPPORT_UCP */ -#ifdef SUPPORT_UTF8 - /* UTF-8 mode */ - - if (utf8) +#ifdef SUPPORT_UTF + if (utf) { switch(ctype) { @@ -5312,7 +5535,7 @@ for (;;) } if (IS_NEWLINE(eptr)) break; eptr++; - while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; + ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); } } @@ -5329,7 +5552,7 @@ for (;;) } if (IS_NEWLINE(eptr)) break; eptr++; - while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; + ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); } } break; @@ -5345,7 +5568,7 @@ for (;;) break; } eptr++; - while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; + ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++); } } else @@ -5578,9 +5801,8 @@ for (;;) } } else -#endif /* SUPPORT_UTF8 */ - - /* Not UTF-8 mode */ +#endif /* SUPPORT_UTF */ + /* Not UTF mode */ { switch(ctype) { @@ -5624,10 +5846,12 @@ for (;;) } else { - if (c != 0x000a && - (md->bsr_anycrlf || - (c != 0x000b && c != 0x000c && c != 0x0085))) - break; + if (c != 0x000a && (md->bsr_anycrlf || + (c != 0x000b && c != 0x000c && c != 0x0085 +#ifdef COMPILE_PCRE16 + && c != 0x2028 && c != 0x2029 +#endif + ))) break; eptr++; } } @@ -5642,7 
+5866,12 @@ for (;;) break; } c = *eptr; - if (c == 0x09 || c == 0x20 || c == 0xa0) break; + if (c == 0x09 || c == 0x20 || c == 0xa0 +#ifdef COMPILE_PCRE16 + || c == 0x1680 || c == 0x180e || (c >= 0x2000 && c <= 0x200A) + || c == 0x202f || c == 0x205f || c == 0x3000 +#endif + ) break; eptr++; } break; @@ -5656,7 +5885,12 @@ for (;;) break; } c = *eptr; - if (c != 0x09 && c != 0x20 && c != 0xa0) break; + if (c != 0x09 && c != 0x20 && c != 0xa0 +#ifdef COMPILE_PCRE16 + && c != 0x1680 && c != 0x180e && (c < 0x2000 || c > 0x200A) + && c != 0x202f && c != 0x205f && c != 0x3000 +#endif + ) break; eptr++; } break; @@ -5670,8 +5904,11 @@ for (;;) break; } c = *eptr; - if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85) - break; + if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85 +#ifdef COMPILE_PCRE16 + || c == 0x2028 || c == 0x2029 +#endif + ) break; eptr++; } break; @@ -5685,8 +5922,11 @@ for (;;) break; } c = *eptr; - if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85) - break; + if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85 +#ifdef COMPILE_PCRE16 + && c != 0x2028 && c != 0x2029 +#endif + ) break; eptr++; } break; @@ -5699,7 +5939,7 @@ for (;;) SCHECK_PARTIAL(); break; } - if ((md->ctypes[*eptr] & ctype_digit) != 0) break; + if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break; eptr++; } break; @@ -5712,7 +5952,7 @@ for (;;) SCHECK_PARTIAL(); break; } - if ((md->ctypes[*eptr] & ctype_digit) == 0) break; + if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break; eptr++; } break; @@ -5725,7 +5965,7 @@ for (;;) SCHECK_PARTIAL(); break; } - if ((md->ctypes[*eptr] & ctype_space) != 0) break; + if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break; eptr++; } break; @@ -5738,7 +5978,7 @@ for (;;) SCHECK_PARTIAL(); break; } - if ((md->ctypes[*eptr] & ctype_space) == 0) break; + if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break; eptr++; } break; @@ 
-5751,7 +5991,7 @@ for (;;) SCHECK_PARTIAL(); break; } - if ((md->ctypes[*eptr] & ctype_word) != 0) break; + if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break; eptr++; } break; @@ -5764,7 +6004,7 @@ for (;;) SCHECK_PARTIAL(); break; } - if ((md->ctypes[*eptr] & ctype_word) == 0) break; + if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break; eptr++; } break; @@ -5827,16 +6067,23 @@ switch (frame->Xwhere) LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52) LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64) LBL(65) LBL(66) -#ifdef SUPPORT_UTF8 - LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30) +#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 + LBL(21) +#endif +#ifdef SUPPORT_UTF + LBL(16) LBL(18) LBL(20) + LBL(22) LBL(23) LBL(28) LBL(30) LBL(32) LBL(34) LBL(42) LBL(46) #ifdef SUPPORT_UCP LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45) LBL(59) LBL(60) LBL(61) LBL(62) #endif /* SUPPORT_UCP */ -#endif /* SUPPORT_UTF8 */ +#endif /* SUPPORT_UTF */ default: DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere)); + +printf("+++jump error in pcre match: label %d non-existent\n", frame->Xwhere); + return PCRE_ERROR_INTERNAL; } #undef LBL @@ -5923,64 +6170,90 @@ Returns: > 0 => success; value is the number of elements filled in < -1 => some kind of unexpected problem */ +#ifdef COMPILE_PCRE8 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_exec(const pcre *argument_re, const pcre_extra *extra_data, PCRE_SPTR subject, int length, int start_offset, int options, int *offsets, int offsetcount) +#else +PCRE_EXP_DEFN int PCRE_CALL_CONVENTION +pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data, + PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets, + int offsetcount) +#endif { int rc, ocount, arg_offset_max; -int first_byte = -1; -int req_byte = -1; -int req_byte2 = -1; int newline; BOOL using_temporary_offsets = FALSE; BOOL anchored; 
BOOL startline; BOOL firstline; -BOOL first_byte_caseless = FALSE; -BOOL req_byte_caseless = FALSE; -BOOL utf8; +BOOL utf; +BOOL has_first_char = FALSE; +BOOL has_req_char = FALSE; +pcre_uchar first_char = 0; +pcre_uchar first_char2 = 0; +pcre_uchar req_char = 0; +pcre_uchar req_char2 = 0; match_data match_block; match_data *md = &match_block; -const uschar *tables; -const uschar *start_bits = NULL; -USPTR start_match = (USPTR)subject + start_offset; -USPTR end_subject; -USPTR start_partial = NULL; -USPTR req_byte_ptr = start_match - 1; - -pcre_study_data internal_study; +const pcre_uint8 *tables; +const pcre_uint8 *start_bits = NULL; +PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset; +PCRE_PUCHAR end_subject; +PCRE_PUCHAR start_partial = NULL; +PCRE_PUCHAR req_char_ptr = start_match - 1; + const pcre_study_data *study; +const REAL_PCRE *re = (const REAL_PCRE *)argument_re; -real_pcre internal_re; -const real_pcre *external_re = (const real_pcre *)argument_re; -const real_pcre *re = external_re; +/* Check for the special magic call that measures the size of the stack used +per recursive call of match(). */ + +if (re == NULL && extra_data == NULL && subject == NULL && length == -999 && + start_offset == -999) +#ifdef NO_RECURSE + return -sizeof(heapframe); +#else + return match(NULL, NULL, NULL, 0, NULL, NULL, 0); +#endif /* Plausibility checks */ if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION; -if (re == NULL || subject == NULL || - (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; +if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0)) + return PCRE_ERROR_NULL; if (offsetcount < 0) return PCRE_ERROR_BADCOUNT; if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET; +/* Check that the first field in the block is the magic number. If it is not, +return with PCRE_ERROR_BADMAGIC. 
However, if the magic number is equal to +REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which +means that the pattern is likely compiled with different endianness. */ + +if (re->magic_number != MAGIC_NUMBER) + return re->magic_number == REVERSED_MAGIC_NUMBER? + PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC; +if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE; + /* These two settings are used in the code for checking a UTF-8 string that follows immediately afterwards. Other values in the md block are used only during "normal" pcre_exec() processing, not when the JIT support is in use, so they are set up later. */ -utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0; +/* PCRE_UTF16 has the same value as PCRE_UTF8. */ +utf = md->utf = (re->options & PCRE_UTF8) != 0; md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 : ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0; /* Check a UTF-8 string if required. Pass back the character offset and error code for an invalid string if a results vector is available. */ -#ifdef SUPPORT_UTF8 -if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) +#ifdef SUPPORT_UTF +if (utf && (options & PCRE_NO_UTF8_CHECK) == 0) { int erroroffset; - int errorcode = _pcre_valid_utf8((USPTR)subject, length, &erroroffset); + int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset); if (errorcode != 0) { if (offsetcount >= 2) @@ -5988,13 +6261,18 @@ if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) offsets[0] = erroroffset; offsets[1] = errorcode; } +#ifdef COMPILE_PCRE16 + return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)? + PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16; +#else return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)? PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8; +#endif } - /* Check that a start_offset points to the start of a UTF-8 character. */ + /* Check that a start_offset points to the start of a UTF character. 
*/ if (start_offset > 0 && start_offset < length && - (((USPTR)subject)[start_offset] & 0xc0) == 0x80) + NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset])) return PCRE_ERROR_BADUTF8_OFFSET; } #endif @@ -6012,15 +6290,16 @@ if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_TABLES) == 0 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL | PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART)) == 0) - return _pcre_jit_exec(re, extra_data->executable_jit, subject, length, - start_offset, options, ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0) + return PRIV(jit_exec)(re, extra_data->executable_jit, + (const pcre_uchar *)subject, length, start_offset, options, + ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0) ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount); #endif /* Carry on with non-JIT matching. This information is for finding all the numbers associated with a given name, for condition testing. */ -md->name_table = (uschar *)re + re->name_table_offset; +md->name_table = (pcre_uchar *)re + re->name_table_offset; md->name_count = re->name_count; md->name_entry_size = re->name_entry_size; @@ -6034,7 +6313,7 @@ md->callout_data = NULL; /* The table pointer is always in native byte order. */ -tables = external_re->tables; +tables = re->tables; if (extra_data != NULL) { @@ -6054,19 +6333,7 @@ if (extra_data != NULL) is a feature that makes it possible to save compiled regex and re-use them in other programs later. */ -if (tables == NULL) tables = _pcre_default_tables; - -/* Check that the first field in the block is the magic number. If it is not, -test for a regex that was compiled on a host of opposite endianness. If this is -the case, flipped values are put in internal_re and internal_study if there was -study data too. 
*/ - -if (re->magic_number != MAGIC_NUMBER) - { - re = _pcre_try_flipped(re, &internal_re, study, &internal_study); - if (re == NULL) return PCRE_ERROR_BADMAGIC; - if (study != NULL) study = &internal_study; - } +if (tables == NULL) tables = PRIV(default_tables); /* Set up other data */ @@ -6076,10 +6343,10 @@ firstline = (re->options & PCRE_FIRSTLINE) != 0; /* The code starts after the real_pcre block and the capture name table. */ -md->start_code = (const uschar *)external_re + re->name_table_offset + +md->start_code = (const pcre_uchar *)re + re->name_table_offset + re->name_count * re->name_entry_size; -md->start_subject = (USPTR)subject; +md->start_subject = (PCRE_PUCHAR)subject; md->start_offset = start_offset; md->end_subject = md->start_subject + length; end_subject = md->end_subject; @@ -6104,6 +6371,7 @@ md->recursive = NULL; /* No recursion at top level */ md->hasthen = (re->flags & PCRE_HASTHEN) != 0; md->lcc = tables + lcc_offset; +md->fcc = tables + fcc_offset; md->ctypes = tables + ctypes_offset; /* Handle different \R options. */ @@ -6190,7 +6458,7 @@ arg_offset_max = (2*ocount)/3; if (re->top_backref > 0 && re->top_backref >= ocount/3) { ocount = re->top_backref * 3 + 3; - md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int)); + md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int)); if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY; using_temporary_offsets = TRUE; DPRINTF(("Got memory to hold back references\n")); @@ -6217,7 +6485,7 @@ if (md->offset_vector != NULL) md->offset_vector[0] = md->offset_vector[1] = -1; } -/* Set up the first character to match, if available. The first_byte value is +/* Set up the first character to match, if available. The first_char value is never set for an anchored regular expression, but the anchoring may be forced at run time, so we have to test for anchoring. The first char may be unset for an unanchored pattern, of course. 
If there's no first char and the pattern was @@ -6227,9 +6495,16 @@ if (!anchored) { if ((re->flags & PCRE_FIRSTSET) != 0) { - first_byte = re->first_byte & 255; - if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE) - first_byte = md->lcc[first_byte]; + has_first_char = TRUE; + first_char = first_char2 = (pcre_uchar)(re->first_char); + if ((re->flags & PCRE_FCH_CASELESS) != 0) + { + first_char2 = TABLE_GET(first_char, md->fcc, first_char); +#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8) + if (utf && first_char > 127) + first_char2 = UCD_OTHERCASE(first_char); +#endif + } } else if (!startline && study != NULL && @@ -6242,14 +6517,19 @@ character" set. */ if ((re->flags & PCRE_REQCHSET) != 0) { - req_byte = re->req_byte & 255; - req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0; - req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */ + has_req_char = TRUE; + req_char = req_char2 = (pcre_uchar)(re->req_char); + if ((re->flags & PCRE_RCH_CASELESS) != 0) + { + req_char2 = TABLE_GET(req_char, md->fcc, req_char); +#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8) + if (utf && req_char > 127) + req_char2 = UCD_OTHERCASE(req_char); +#endif + } } - - /* ==========================================================================*/ /* Loop for handling unanchored repeated matching attempts; for anchored regexs @@ -6257,8 +6537,8 @@ the loop runs just once. */ for(;;) { - USPTR save_end_subject = end_subject; - USPTR new_start_match; + PCRE_PUCHAR save_end_subject = end_subject; + PCRE_PUCHAR new_start_match; /* If firstline is TRUE, the start of the match is constrained to the first line of a multiline string. 
That is, the match must be before or at the first @@ -6268,14 +6548,14 @@ for(;;) if (firstline) { - USPTR t = start_match; -#ifdef SUPPORT_UTF8 - if (utf8) + PCRE_PUCHAR t = start_match; +#ifdef SUPPORT_UTF + if (utf) { while (t < md->end_subject && !IS_NEWLINE(t)) { t++; - while (t < end_subject && (*t & 0xc0) == 0x80) t++; + ACROSSCHAR(t < end_subject, *t, t++); } } else @@ -6292,15 +6572,16 @@ for(;;) if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0) { - /* Advance to a unique first byte if there is one. */ + /* Advance to a unique first char if there is one. */ - if (first_byte >= 0) + if (has_first_char) { - if (first_byte_caseless) - while (start_match < end_subject && md->lcc[*start_match] != first_byte) + if (first_char != first_char2) + while (start_match < end_subject && + *start_match != first_char && *start_match != first_char2) start_match++; else - while (start_match < end_subject && *start_match != first_byte) + while (start_match < end_subject && *start_match != first_char) start_match++; } @@ -6310,14 +6591,14 @@ for(;;) { if (start_match > md->start_subject + start_offset) { -#ifdef SUPPORT_UTF8 - if (utf8) +#ifdef SUPPORT_UTF + if (utf) { while (start_match < end_subject && !WAS_NEWLINE(start_match)) { start_match++; - while(start_match < end_subject && (*start_match & 0xc0) == 0x80) - start_match++; + ACROSSCHAR(start_match < end_subject, *start_match, + start_match++); } } else @@ -6344,13 +6625,18 @@ for(;;) while (start_match < end_subject) { register unsigned int c = *start_match; +#ifndef COMPILE_PCRE8 + if (c > 255) c = 255; +#endif if ((start_bits[c/8] & (1 << (c&7))) == 0) { start_match++; -#ifdef SUPPORT_UTF8 - if (utf8) - while(start_match < end_subject && (*start_match & 0xc0) == 0x80) - start_match++; +#if defined SUPPORT_UTF && defined COMPILE_PCRE8 + /* In non 8-bit mode, the iteration will stop for + characters > 255 at the beginning or not stop at all. 
*/ + if (utf) + ACROSSCHAR(start_match < end_subject, *start_match, + start_match++); #endif } else break; @@ -6379,8 +6665,8 @@ for(;;) break; } - /* If req_byte is set, we know that that character must appear in the - subject for the match to succeed. If the first character is set, req_byte + /* If req_char is set, we know that that character must appear in the + subject for the match to succeed. If the first character is set, req_char must be later in the subject; otherwise the test starts at the match point. This optimization can save a huge amount of backtracking in patterns with nested unlimited repeats that aren't going to match. Writing separate code @@ -6393,28 +6679,28 @@ for(;;) 32-megabyte string... so we don't do this when the string is sufficiently long. */ - if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX) + if (has_req_char && end_subject - start_match < REQ_BYTE_MAX) { - register USPTR p = start_match + ((first_byte >= 0)? 1 : 0); + register PCRE_PUCHAR p = start_match + (has_first_char? 1:0); /* We don't need to repeat the search if we haven't yet reached the place we found it at last time. */ - if (p > req_byte_ptr) + if (p > req_char_ptr) { - if (req_byte_caseless) + if (req_char != req_char2) { while (p < end_subject) { register int pp = *p++; - if (pp == req_byte || pp == req_byte2) { p--; break; } + if (pp == req_char || pp == req_char2) { p--; break; } } } else { while (p < end_subject) { - if (*p++ == req_byte) { p--; break; } + if (*p++ == req_char) { p--; break; } } } @@ -6431,7 +6717,7 @@ for(;;) found it, so that we don't search again next time round the loop if the start hasn't passed this character yet. 
*/ - req_byte_ptr = p; + req_char_ptr = p; } } } @@ -6486,10 +6772,10 @@ for(;;) case MATCH_THEN: md->ignore_skip_arg = FALSE; new_start_match = start_match + 1; -#ifdef SUPPORT_UTF8 - if (utf8) - while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80) - new_start_match++; +#ifdef SUPPORT_UTF + if (utf) + ACROSSCHAR(new_start_match < end_subject, *new_start_match, + new_start_match++); #endif break; @@ -6527,9 +6813,13 @@ for(;;) /* If we have just passed a CR and we are now at a LF, and the pattern does not contain any explicit matches for \r or \n, and the newline option is CRLF - or ANY or ANYCRLF, advance the match position by one more character. */ + or ANY or ANYCRLF, advance the match position by one more character. In + normal matching start_match will always be greater than the first position at + this stage, but a failed *SKIP can cause a return at the same point, which is + why the first test exists. */ - if (start_match[-1] == CHAR_CR && + if (start_match > (PCRE_PUCHAR)subject + start_offset && + start_match[-1] == CHAR_CR && start_match < end_subject && *start_match == CHAR_NL && (re->flags & PCRE_HASCRORLF) == 0 && @@ -6575,7 +6865,7 @@ if (rc == MATCH_MATCH || rc == MATCH_ACCEPT) } if (md->end_offset_top > arg_offset_max) md->offset_overflow = TRUE; DPRINTF(("Freeing temporary memory\n")); - (pcre_free)(md->offset_vector); + (PUBL(free))(md->offset_vector); } /* Set the return code to the number of captured strings, or 0 if there were @@ -6616,7 +6906,7 @@ if (rc == MATCH_MATCH || rc == MATCH_ACCEPT) /* Return MARK data if requested */ if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0) - *(extra_data->mark) = (unsigned char *)(md->mark); + *(extra_data->mark) = (pcre_uchar *)md->mark; DPRINTF((">>>> returning %d\n", rc)); return rc; } @@ -6627,7 +6917,7 @@ attempt has failed at all permitted starting positions.
*/ if (using_temporary_offsets) { DPRINTF(("Freeing temporary memory\n")); - (pcre_free)(md->offset_vector); + (PUBL(free))(md->offset_vector); } /* For anything other than nomatch or partial match, just return the code. */ @@ -6646,8 +6936,8 @@ if (start_partial != NULL) md->mark = NULL; if (offsetcount > 1) { - offsets[0] = (int)(start_partial - (USPTR)subject); - offsets[1] = (int)(end_subject - (USPTR)subject); + offsets[0] = (int)(start_partial - (PCRE_PUCHAR)subject); + offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject); } rc = PCRE_ERROR_PARTIAL; } @@ -6663,7 +6953,7 @@ else /* Return the MARK data if it has been requested. */ if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0) - *(extra_data->mark) = (unsigned char *)(md->nomatch_mark); + *(extra_data->mark) = (pcre_uchar *)md->nomatch_mark; return rc; } diff --git a/usr.sbin/nginx/src/pcre/pcre_fullinfo.c b/usr.sbin/nginx/src/pcre/pcre_fullinfo.c index 4895b2aab2d..a3d1198b0e8 100644 --- a/usr.sbin/nginx/src/pcre/pcre_fullinfo.c +++ b/usr.sbin/nginx/src/pcre/pcre_fullinfo.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. 
Written by Philip Hazel - Copyright (c) 1997-2011 University of Cambridge + Copyright (c) 1997-2012 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -65,13 +65,17 @@ Arguments: Returns: 0 if data returned, negative on error */ +#ifdef COMPILE_PCRE8 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION -pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what, - void *where) +pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, + int what, void *where) +#else +PCRE_EXP_DEFN int PCRE_CALL_CONVENTION +pcre16_fullinfo(const pcre16 *argument_re, const pcre16_extra *extra_data, + int what, void *where) +#endif { -real_pcre internal_re; -pcre_study_data internal_study; -const real_pcre *re = (const real_pcre *)argument_re; +const REAL_PCRE *re = (const REAL_PCRE *)argument_re; const pcre_study_data *study = NULL; if (re == NULL || where == NULL) return PCRE_ERROR_NULL; @@ -79,12 +83,18 @@ if (re == NULL || where == NULL) return PCRE_ERROR_NULL; if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0) study = (const pcre_study_data *)extra_data->study_data; +/* Check that the first field in the block is the magic number. If it is not, +return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to +REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which +means that the pattern is likely compiled with different endianness. */ + if (re->magic_number != MAGIC_NUMBER) - { - re = _pcre_try_flipped(re, &internal_re, study, &internal_study); - if (re == NULL) return PCRE_ERROR_BADMAGIC; - if (study != NULL) study = &internal_study; - } + return re->magic_number == REVERSED_MAGIC_NUMBER? 
+ PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC; + +/* Check that this pattern was compiled in the correct bit mode */ + +if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE; switch (what) { @@ -106,11 +116,10 @@ switch (what) (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 && extra_data->executable_jit != NULL)? - _pcre_jit_get_size(extra_data->executable_jit) : 0; + PRIV(jit_get_size)(extra_data->executable_jit) : 0; #else *((size_t *)where) = 0; #endif - break; case PCRE_INFO_CAPTURECOUNT: @@ -123,7 +132,7 @@ switch (what) case PCRE_INFO_FIRSTBYTE: *((int *)where) = - ((re->flags & PCRE_FIRSTSET) != 0)? re->first_byte : + ((re->flags & PCRE_FIRSTSET) != 0)? re->first_char : ((re->flags & PCRE_STARTLINE) != 0)? -1 : -2; break; @@ -131,7 +140,7 @@ switch (what) block, not the internal copy (with flipped integer fields). */ case PCRE_INFO_FIRSTTABLE: - *((const uschar **)where) = + *((const pcre_uint8 **)where) = (study != NULL && (study->flags & PCRE_STUDY_MAPPED) != 0)? ((const pcre_study_data *)extra_data->study_data)->start_bits : NULL; break; @@ -139,7 +148,7 @@ switch (what) case PCRE_INFO_MINLENGTH: *((int *)where) = (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0)? - (int)study->minlength : -1; + (int)(study->minlength) : -1; break; case PCRE_INFO_JIT: @@ -150,7 +159,7 @@ switch (what) case PCRE_INFO_LASTLITERAL: *((int *)where) = - ((re->flags & PCRE_REQCHSET) != 0)? re->req_byte : -1; + ((re->flags & PCRE_REQCHSET) != 0)? 
re->req_char : -1; break; case PCRE_INFO_NAMEENTRYSIZE: @@ -162,11 +171,11 @@ switch (what) break; case PCRE_INFO_NAMETABLE: - *((const uschar **)where) = (const uschar *)re + re->name_table_offset; + *((const pcre_uchar **)where) = (const pcre_uchar *)re + re->name_table_offset; break; case PCRE_INFO_DEFAULT_TABLES: - *((const uschar **)where) = (const uschar *)(_pcre_default_tables); + *((const pcre_uint8 **)where) = (const pcre_uint8 *)(PRIV(default_tables)); break; /* From release 8.00 this will always return TRUE because NOPARTIAL is diff --git a/usr.sbin/nginx/src/pcre/pcre_globals.c b/usr.sbin/nginx/src/pcre/pcre_globals.c index 4562e0a069a..36e6ddb3a89 100644 --- a/usr.sbin/nginx/src/pcre/pcre_globals.c +++ b/usr.sbin/nginx/src/pcre/pcre_globals.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2008 University of Cambridge + Copyright (c) 1997-2012 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -67,18 +67,18 @@ static void LocalPcreFree(void* aPtr) { free(aPtr); } -PCRE_EXP_DATA_DEFN void *(*pcre_malloc)(size_t) = LocalPcreMalloc; -PCRE_EXP_DATA_DEFN void (*pcre_free)(void *) = LocalPcreFree; -PCRE_EXP_DATA_DEFN void *(*pcre_stack_malloc)(size_t) = LocalPcreMalloc; -PCRE_EXP_DATA_DEFN void (*pcre_stack_free)(void *) = LocalPcreFree; -PCRE_EXP_DATA_DEFN int (*pcre_callout)(pcre_callout_block *) = NULL; +PCRE_EXP_DATA_DEFN void *(*PUBL(malloc))(size_t) = LocalPcreMalloc; +PCRE_EXP_DATA_DEFN void (*PUBL(free))(void *) = LocalPcreFree; +PCRE_EXP_DATA_DEFN void *(*PUBL(stack_malloc))(size_t) = LocalPcreMalloc; +PCRE_EXP_DATA_DEFN void (*PUBL(stack_free))(void *) = LocalPcreFree; +PCRE_EXP_DATA_DEFN int (*PUBL(callout))(PUBL(callout_block) *) = NULL; #elif !defined VPCOMPAT -PCRE_EXP_DATA_DEFN void *(*pcre_malloc)(size_t) = malloc; -PCRE_EXP_DATA_DEFN void 
(*pcre_free)(void *) = free; -PCRE_EXP_DATA_DEFN void *(*pcre_stack_malloc)(size_t) = malloc; -PCRE_EXP_DATA_DEFN void (*pcre_stack_free)(void *) = free; -PCRE_EXP_DATA_DEFN int (*pcre_callout)(pcre_callout_block *) = NULL; +PCRE_EXP_DATA_DEFN void *(*PUBL(malloc))(size_t) = malloc; +PCRE_EXP_DATA_DEFN void (*PUBL(free))(void *) = free; +PCRE_EXP_DATA_DEFN void *(*PUBL(stack_malloc))(size_t) = malloc; +PCRE_EXP_DATA_DEFN void (*PUBL(stack_free))(void *) = free; +PCRE_EXP_DATA_DEFN int (*PUBL(callout))(PUBL(callout_block) *) = NULL; #endif /* End of pcre_globals.c */ diff --git a/usr.sbin/nginx/src/pcre/pcre_internal.h b/usr.sbin/nginx/src/pcre/pcre_internal.h index 6ea397a39ef..e5a4b6a526d 100644 --- a/usr.sbin/nginx/src/pcre/pcre_internal.h +++ b/usr.sbin/nginx/src/pcre/pcre_internal.h @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2011 University of Cambridge + Copyright (c) 1997-2012 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -40,7 +40,8 @@ POSSIBILITY OF SUCH DAMAGE. /* This header contains definitions that are shared between the different modules, but which are not relevant to the exported API. This includes some -functions whose names all begin with "_pcre_". */ +functions whose names all begin with "_pcre_" or "_pcre16_" depending on +the PRIV macro. */ #ifndef PCRE_INTERNAL_H #define PCRE_INTERNAL_H @@ -51,20 +52,39 @@ functions whose names all begin with "_pcre_". */ #define PCRE_DEBUG #endif -/* We do not support both EBCDIC and UTF-8 at the same time. The "configure" -script prevents both being selected, but not everybody uses "configure". */ - -#if defined EBCDIC && defined SUPPORT_UTF8 -#error The use of both EBCDIC and SUPPORT_UTF8 is not supported. +/* PCRE is compiled as an 8 bit library if it is not requested otherwise. 
*/ +#ifndef COMPILE_PCRE16 +#define COMPILE_PCRE8 #endif -/* If SUPPORT_UCP is defined, SUPPORT_UTF8 must also be defined. The +/* If SUPPORT_UCP is defined, SUPPORT_UTF must also be defined. The "configure" script ensures this, but not everybody uses "configure". */ -#if defined SUPPORT_UCP && !defined SUPPORT_UTF8 +#if defined SUPPORT_UCP && !(defined SUPPORT_UTF) +#define SUPPORT_UTF 1 +#endif + +/* We define SUPPORT_UTF if SUPPORT_UTF8 is enabled for compatibility +reasons with existing code. */ + +#if defined SUPPORT_UTF8 && !(defined SUPPORT_UTF) +#define SUPPORT_UTF 1 +#endif + +/* Fixme: SUPPORT_UTF8 should be eventually disappear from the code. +Until then we define it if SUPPORT_UTF is defined. */ + +#if defined SUPPORT_UTF && !(defined SUPPORT_UTF8) #define SUPPORT_UTF8 1 #endif +/* We do not support both EBCDIC and UTF-8/16 at the same time. The "configure" +script prevents both being selected, but not everybody uses "configure". */ + +#if defined EBCDIC && defined SUPPORT_UTF +#error The use of both EBCDIC and SUPPORT_UTF8/16 is not supported. +#endif + /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef inline, and there are *still* stupid compilers about that don't like indented pre-processor statements, or at least there were when I first wrote this. After @@ -158,12 +178,14 @@ set, we ensure here that it has no effect. */ #define PCRE_CALL_CONVENTION #endif -/* We need to have types that specify unsigned 16-bit and 32-bit integers. We +/* We need to have types that specify unsigned 8, 16 and 32-bit integers. We cannot determine these outside the compilation (e.g. by running a program as part of "configure") because PCRE is often cross-compiled for use on other systems. Instead we make use of the maximum sizes that are available at preprocessor time in standard C environments. 
*/ +typedef unsigned char pcre_uint8; + #if USHRT_MAX == 65535 typedef unsigned short pcre_uint16; typedef short pcre_int16; @@ -206,12 +228,47 @@ by "configure". */ /* All character handling must be done as unsigned characters. Otherwise there are problems with top-bit-set characters and functions such as isspace(). -However, we leave the interface to the outside world as char *, because that -should make things easier for callers. We define a short type for unsigned char -to save lots of typing. I tried "uchar", but it causes problems on Digital -Unix, where it is defined in sys/types, so use "uschar" instead. */ +However, we leave the interface to the outside world as char * or short *, +because that should make things easier for callers. This character type is +called pcre_uchar. + +The IN_UCHARS macro multiplies its argument by the byte size of the current +pcre_uchar type. Useful for memcpy and similar operations, which require the +byte size of their input/output buffers. + +The MAX_255 macro checks whether its pcre_uchar input is less than 256. -typedef unsigned char uschar; +The TABLE_GET macro is designed for accessing elements of tables that contain +exactly 256 items. When a character can take more than 256 different +values, some check is needed before accessing these tables. +*/ + +#ifdef COMPILE_PCRE8 + +typedef unsigned char pcre_uchar; +#define IN_UCHARS(x) (x) +#define MAX_255(c) 1 +#define TABLE_GET(c, table, default) ((table)[c]) + +#else + +#ifdef COMPILE_PCRE16 +#if USHRT_MAX != 65535 +/* This is a warning message. Change PCRE_UCHAR16 to a 16 bit data type in +pcre.h(.in) and disable (comment out) this message. */ +#error Warning: PCRE_UCHAR16 is not a 16 bit data type. +#endif + +typedef pcre_uint16 pcre_uchar; +#define IN_UCHARS(x) ((x) << 1) +#define MAX_255(c) ((c) <= 255u) +#define TABLE_GET(c, table, default) (MAX_255(c)?
((table)[c]):(default)) + +#else +#error Unsupported compiling mode +#endif /* COMPILE_PCRE16 */ + +#endif /* COMPILE_PCRE8 */ /* This is an unsigned int value that no character can ever have. UTF-8 characters only go up to 0x7fffffff (though Unicode doesn't go beyond @@ -234,8 +291,8 @@ start/end of string field names are. */ #define IS_NEWLINE(p) \ ((NLBLOCK->nltype != NLTYPE_FIXED)? \ ((p) < NLBLOCK->PSEND && \ - _pcre_is_newline((p), NLBLOCK->nltype, NLBLOCK->PSEND, &(NLBLOCK->nllen),\ - utf8)) \ + PRIV(is_newline)((p), NLBLOCK->nltype, NLBLOCK->PSEND, \ + &(NLBLOCK->nllen), utf)) \ : \ ((p) <= NLBLOCK->PSEND - NLBLOCK->nllen && \ (p)[0] == NLBLOCK->nl[0] && \ @@ -248,8 +305,8 @@ start/end of string field names are. */ #define WAS_NEWLINE(p) \ ((NLBLOCK->nltype != NLTYPE_FIXED)? \ ((p) > NLBLOCK->PSSTART && \ - _pcre_was_newline((p), NLBLOCK->nltype, NLBLOCK->PSSTART, \ - &(NLBLOCK->nllen), utf8)) \ + PRIV(was_newline)((p), NLBLOCK->nltype, NLBLOCK->PSSTART, \ + &(NLBLOCK->nllen), utf)) \ : \ ((p) >= NLBLOCK->PSSTART + NLBLOCK->nllen && \ (p)[-NLBLOCK->nllen] == NLBLOCK->nl[0] && \ @@ -267,15 +324,11 @@ used for the external interface and appears in pcre.h, which is why its name must begin with PCRE_. */ #ifdef CUSTOM_SUBJECT_PTR -#define PCRE_SPTR CUSTOM_SUBJECT_PTR -#define USPTR CUSTOM_SUBJECT_PTR +#define PCRE_PUCHAR CUSTOM_SUBJECT_PTR #else -#define PCRE_SPTR const char * -#define USPTR const unsigned char * +#define PCRE_PUCHAR const pcre_uchar * #endif - - /* Include the public PCRE header and the definitions of UCP character property values. */ @@ -343,6 +396,8 @@ The macros are controlled by the value of LINK_SIZE. This defaults to 2 in the config.h file, but can be overridden by using -D on the command line. This is automated on Unix systems via the "configure" command. */ +#ifdef COMPILE_PCRE8 + #if LINK_SIZE == 2 #define PUT(a,n,d) \ @@ -379,13 +434,54 @@ is automated on Unix systems via the "configure" command. 
*/ #define GET(a,n) \ (((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3]) -#define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */ +/* Keep it positive */ +#define MAX_PATTERN_SIZE (1 << 30) +#else +#error LINK_SIZE must be either 2, 3, or 4 +#endif + +#else /* COMPILE_PCRE8 */ + +#ifdef COMPILE_PCRE16 + +#if LINK_SIZE == 2 + +#undef LINK_SIZE +#define LINK_SIZE 1 + +#define PUT(a,n,d) \ + (a[n] = (d)) + +#define GET(a,n) \ + (a[n]) + +#define MAX_PATTERN_SIZE (1 << 16) + +#elif LINK_SIZE == 3 || LINK_SIZE == 4 + +#undef LINK_SIZE +#define LINK_SIZE 2 + +#define PUT(a,n,d) \ + (a[n] = (d) >> 16), \ + (a[(n)+1] = (d) & 65535) + +#define GET(a,n) \ + (((a)[n] << 16) | (a)[(n)+1]) + +/* Keep it positive */ +#define MAX_PATTERN_SIZE (1 << 30) #else #error LINK_SIZE must be either 2, 3, or 4 #endif +#else +#error Unsupported compiling mode +#endif /* COMPILE_PCRE16 */ + +#endif /* COMPILE_PCRE8 */ /* Convenience macro defined in terms of the others */ @@ -396,6 +492,10 @@ is automated on Unix systems via the "configure" command. */ offsets changes. There are used for repeat counts and for other things such as capturing parenthesis numbers in back references. */ +#ifdef COMPILE_PCRE8 + +#define IMM2_SIZE 2 + #define PUT2(a,n,d) \ a[n] = (d) >> 8; \ a[(n)+1] = (d) & 255 @@ -403,17 +503,39 @@ capturing parenthesis numbers in back references. */ #define GET2(a,n) \ (((a)[n] << 8) | (a)[(n)+1]) -#define PUT2INC(a,n,d) PUT2(a,n,d), a += 2 +#else /* COMPILE_PCRE8 */ + +#ifdef COMPILE_PCRE16 + +#define IMM2_SIZE 1 + +#define PUT2(a,n,d) \ + a[n] = d + +#define GET2(a,n) \ + a[n] + +#else +#error Unsupported compiling mode +#endif /* COMPILE_PCRE16 */ + +#endif /* COMPILE_PCRE8 */ +#define PUT2INC(a,n,d) PUT2(a,n,d), a += IMM2_SIZE -/* When UTF-8 encoding is being used, a character is no longer just a single -byte. The macros for character handling generate simple sequences when used in -byte-mode, and more complicated ones for UTF-8 characters. 
GETCHARLENTEST is -not used when UTF-8 is not supported, so it is not defined, and BACKCHAR should -never be called in byte mode. To make sure they can never even appear when -UTF-8 support is omitted, we don't even define them. */ +/* When UTF encoding is being used, a character is no longer just a single +character. The macros for character handling generate simple sequences when +used in character-mode, and more complicated ones for UTF characters. +GETCHARLENTEST and other macros are not used when UTF is not supported, +so they are not defined. To make sure they can never even appear when +UTF support is omitted, we don't even define them. */ -#ifndef SUPPORT_UTF8 +#ifndef SUPPORT_UTF + +/* #define MAX_VALUE_FOR_SINGLE_CHAR */ +/* #define HAS_EXTRALEN(c) */ +/* #define GET_EXTRALEN(c) */ +/* #define NOT_FIRSTCHAR(c) */ #define GETCHAR(c, eptr) c = *eptr; #define GETCHARTEST(c, eptr) c = *eptr; #define GETCHARINC(c, eptr) c = *eptr++; @@ -421,14 +543,36 @@ UTF-8 support is omitted, we don't even define them. */ #define GETCHARLEN(c, eptr, len) c = *eptr; /* #define GETCHARLENTEST(c, eptr, len) */ /* #define BACKCHAR(eptr) */ +/* #define FORWARDCHAR(eptr) */ +/* #define ACROSSCHAR(condition, eptr, action) */ + +#else /* SUPPORT_UTF */ -#else /* SUPPORT_UTF8 */ +#ifdef COMPILE_PCRE8 /* These macros were originally written in the form of loops that used data -from the tables whose names start with _pcre_utf8_table. They were rewritten by +from the tables whose names start with PRIV(utf8_table). They were rewritten by a user so as not to use loops, because in some environments this gives a significant performance advantage, and it seems never to do any harm. */ +/* Tells the biggest code point which can be encoded as a single character. */ + +#define MAX_VALUE_FOR_SINGLE_CHAR 127 + +/* Tests whether the code point needs extra characters to decode. 
*/ + +#define HAS_EXTRALEN(c) ((c) >= 0xc0) + +/* Returns with the additional number of characters if HAS_EXTRALEN(c) is TRUE. +Otherwise it has an undefined behaviour. */ + +#define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3f]) + +/* Returns TRUE, if the given character is not the first character +of a UTF sequence. */ + +#define NOT_FIRSTCHAR(c) (((c) & 0xc0) == 0x80) + /* Base macro to pick up the remaining bytes of a UTF-8 character, not advancing the pointer. */ @@ -463,7 +607,7 @@ pointer. */ #define GETCHARTEST(c, eptr) \ c = *eptr; \ - if (utf8 && c >= 0xc0) GETUTF8(c, eptr); + if (utf && c >= 0xc0) GETUTF8(c, eptr); /* Base macro to pick up the remaining bytes of a UTF-8 character, advancing the pointer. */ @@ -511,7 +655,7 @@ This is called when we don't know if we are in UTF-8 mode. */ #define GETCHARINCTEST(c, eptr) \ c = *eptr++; \ - if (utf8 && c >= 0xc0) GETUTF8INC(c, eptr); + if (utf && c >= 0xc0) GETUTF8INC(c, eptr); /* Base macro to pick up the remaining bytes of a UTF-8 character, not advancing the pointer, incrementing the length. */ @@ -563,7 +707,7 @@ do not know if we are in UTF-8 mode. */ #define GETCHARLENTEST(c, eptr, len) \ c = *eptr; \ - if (utf8 && c >= 0xc0) GETUTF8LEN(c, eptr, len); + if (utf && c >= 0xc0) GETUTF8LEN(c, eptr, len); /* If the pointer is not at the start of a character, move it back until it is. This is called only in UTF-8 mode - we don't put a test within the macro @@ -571,7 +715,116 @@ because almost all calls are already within a block of UTF-8 only code. */ #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr-- -#endif /* SUPPORT_UTF8 */ +/* Same as above, just in the other direction. */ +#define FORWARDCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr++ + +/* Same as above, but it allows a fully customizable form.
*/ +#define ACROSSCHAR(condition, eptr, action) \ + while((condition) && ((eptr) & 0xc0) == 0x80) action + +#else /* COMPILE_PCRE8 */ + +#ifdef COMPILE_PCRE16 + +/* Tells the biggest code point which can be encoded as a single character. */ + +#define MAX_VALUE_FOR_SINGLE_CHAR 65535 + +/* Tests whether the code point needs extra characters to decode. */ + +#define HAS_EXTRALEN(c) (((c) & 0xfc00) == 0xd800) + +/* Returns with the additional number of characters if HAS_EXTRALEN(c) is TRUE. +Otherwise it has an undefined behaviour. */ + +#define GET_EXTRALEN(c) 1 + +/* Returns TRUE, if the given character is not the first character +of a UTF sequence. */ + +#define NOT_FIRSTCHAR(c) (((c) & 0xfc00) == 0xdc00) + +/* Base macro to pick up the low surrogate of a UTF-16 character, not +advancing the pointer. */ + +#define GETUTF16(c, eptr) \ + { c = (((c & 0x3ff) << 10) | (eptr[1] & 0x3ff)) + 0x10000; } + +/* Get the next UTF-16 character, not advancing the pointer. This is called when +we know we are in UTF-16 mode. */ + +#define GETCHAR(c, eptr) \ + c = *eptr; \ + if ((c & 0xfc00) == 0xd800) GETUTF16(c, eptr); + +/* Get the next UTF-16 character, testing for UTF-16 mode, and not advancing the +pointer. */ + +#define GETCHARTEST(c, eptr) \ + c = *eptr; \ + if (utf && (c & 0xfc00) == 0xd800) GETUTF16(c, eptr); + +/* Base macro to pick up the low surrogate of a UTF-16 character, advancing +the pointer. */ + +#define GETUTF16INC(c, eptr) \ + { c = (((c & 0x3ff) << 10) | (*eptr++ & 0x3ff)) + 0x10000; } + +/* Get the next UTF-16 character, advancing the pointer. This is called when we +know we are in UTF-16 mode. */ + +#define GETCHARINC(c, eptr) \ + c = *eptr++; \ + if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, eptr); + +/* Get the next character, testing for UTF-16 mode, and advancing the pointer. +This is called when we don't know if we are in UTF-16 mode.
*/ + +#define GETCHARINCTEST(c, eptr) \ + c = *eptr++; \ + if (utf && (c & 0xfc00) == 0xd800) GETUTF16INC(c, eptr); + +/* Base macro to pick up the low surrogate of a UTF-16 character, not +advancing the pointer, incrementing the length. */ + +#define GETUTF16LEN(c, eptr, len) \ + { c = (((c & 0x3ff) << 10) | (eptr[1] & 0x3ff)) + 0x10000; len++; } + +/* Get the next UTF-16 character, not advancing the pointer, incrementing +length if there is a low surrogate. This is called when we know we are in +UTF-16 mode. */ + +#define GETCHARLEN(c, eptr, len) \ + c = *eptr; \ + if ((c & 0xfc00) == 0xd800) GETUTF16LEN(c, eptr, len); + +/* Get the next UTF-16 character, testing for UTF-16 mode, not advancing the +pointer, incrementing length if there is a low surrogate. This is called when +we do not know if we are in UTF-16 mode. */ + +#define GETCHARLENTEST(c, eptr, len) \ + c = *eptr; \ + if (utf && (c & 0xfc00) == 0xd800) GETUTF16LEN(c, eptr, len); + +/* If the pointer is not at the start of a character, move it back until +it is. This is called only in UTF-16 mode - we don't put a test within the +macro because almost all calls are already within a block of UTF-16 only +code. */ + +#define BACKCHAR(eptr) if ((*eptr & 0xfc00) == 0xdc00) eptr-- + +/* Same as above, just in the other direction. */ +#define FORWARDCHAR(eptr) if ((*eptr & 0xfc00) == 0xdc00) eptr++ + +/* Same as above, but it allows a fully customizable form. */ +#define ACROSSCHAR(condition, eptr, action) \ + if ((condition) && ((eptr) & 0xfc00) == 0xdc00) action + +#endif + +#endif /* COMPILE_PCRE8 */ + +#endif /* SUPPORT_UTF */ /* In case there is no definition of offsetof() provided - though any proper @@ -588,13 +841,21 @@ are in a 16-bit flags word. From release 8.00, PCRE_NOPARTIAL is unused, as the restrictions on partial matching have been lifted. It remains for backwards compatibility.
*/ -#define PCRE_NOPARTIAL 0x0001 /* can't use partial with this regex */ -#define PCRE_FIRSTSET 0x0002 /* first_byte is set */ -#define PCRE_REQCHSET 0x0004 /* req_byte is set */ -#define PCRE_STARTLINE 0x0008 /* start after \n for multiline */ -#define PCRE_JCHANGED 0x0010 /* j option used in regex */ -#define PCRE_HASCRORLF 0x0020 /* explicit \r or \n in pattern */ -#define PCRE_HASTHEN 0x0040 /* pattern contains (*THEN) */ +#ifdef COMPILE_PCRE8 +#define PCRE_MODE 0x0001 /* compiled in 8 bit mode */ +#endif +#ifdef COMPILE_PCRE16 +#define PCRE_MODE 0x0002 /* compiled in 16 bit mode */ +#endif +#define PCRE_FIRSTSET 0x0010 /* first_char is set */ +#define PCRE_FCH_CASELESS 0x0020 /* caseless first char */ +#define PCRE_REQCHSET 0x0040 /* req_byte is set */ +#define PCRE_RCH_CASELESS 0x0080 /* caseless requested char */ +#define PCRE_STARTLINE 0x0100 /* start after \n for multiline */ +#define PCRE_NOPARTIAL 0x0200 /* can't use partial with this regex */ +#define PCRE_JCHANGED 0x0400 /* j option used in regex */ +#define PCRE_HASCRORLF 0x0800 /* explicit \r or \n in pattern */ +#define PCRE_HASTHEN 0x1000 /* pattern contains (*THEN) */ /* Flags for the "extra" block produced by pcre_study(). */ @@ -628,11 +889,15 @@ time, run time, or study time, respectively. */ #define PUBLIC_STUDY_OPTIONS \ PCRE_STUDY_JIT_COMPILE -/* Magic number to provide a small check against being handed junk. Also used -to detect whether a pattern was compiled on a host of different endianness. */ +/* Magic number to provide a small check against being handed junk. */ #define MAGIC_NUMBER 0x50435245UL /* 'PCRE' */ +/* This variable is used to detect a loaded regular expression +in different endianness. */ + +#define REVERSED_MAGIC_NUMBER 0x45524350UL /* 'ERCP' */ + /* Negative values for the firstchar and reqchar variables */ #define REQ_UNSET (-2) @@ -643,12 +908,6 @@ req_byte match. 
*/ #define REQ_BYTE_MAX 1000 -/* Flags added to firstbyte or reqbyte; a "non-literal" item is either a -variable-length repeat, or a anything other than literal characters. */ - -#define REQ_CASELESS 0x0100 /* indicates caselessness */ -#define REQ_VARY 0x0200 /* reqbyte followed non-literal item */ - /* Miscellaneous definitions. The #ifndef is to pacify compiler warnings in environments where these macros are defined elsewhere. Unfortunately, there is no way to do the same for the typedef. */ @@ -677,7 +936,7 @@ for) in a minority area (EBCDIC platforms), this is not sensible. Any application that did need both could compile two versions of the library, using macros to give the functions distinct names. */ -#ifndef SUPPORT_UTF8 +#ifndef SUPPORT_UTF /* UTF-8 support is not enabled; use the platform-dependent character literals so that PCRE works on both ASCII and EBCDIC platforms, in non-UTF-mode only. */ @@ -937,11 +1196,16 @@ so that PCRE works on both ASCII and EBCDIC platforms, in non-UTF-mode only. */ #define STRING_ANYCRLF_RIGHTPAR "ANYCRLF)" #define STRING_BSR_ANYCRLF_RIGHTPAR "BSR_ANYCRLF)" #define STRING_BSR_UNICODE_RIGHTPAR "BSR_UNICODE)" -#define STRING_UTF8_RIGHTPAR "UTF8)" +#ifdef COMPILE_PCRE8 +#define STRING_UTF_RIGHTPAR "UTF8)" +#endif +#ifdef COMPILE_PCRE16 +#define STRING_UTF_RIGHTPAR "UTF16)" +#endif #define STRING_UCP_RIGHTPAR "UCP)" #define STRING_NO_START_OPT_RIGHTPAR "NO_START_OPT)" -#else /* SUPPORT_UTF8 */ +#else /* SUPPORT_UTF */ /* UTF-8 support is enabled; always use UTF-8 (=ASCII) character codes. This works in both modes non-EBCDIC platforms, and on EBCDIC platforms in UTF-8 mode @@ -1192,11 +1456,16 @@ only. 
*/ #define STRING_ANYCRLF_RIGHTPAR STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS #define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS #define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS -#define STRING_UTF8_RIGHTPAR STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS +#ifdef COMPILE_PCRE8 +#define STRING_UTF_RIGHTPAR STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS +#endif +#ifdef COMPILE_PCRE16 +#define STRING_UTF_RIGHTPAR STR_U STR_T STR_F STR_1 STR_6 STR_RIGHT_PARENTHESIS +#endif #define STRING_UCP_RIGHTPAR STR_U STR_C STR_P STR_RIGHT_PARENTHESIS #define STRING_NO_START_OPT_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_S STR_T STR_A STR_R STR_T STR_UNDERSCORE STR_O STR_P STR_T STR_RIGHT_PARENTHESIS -#endif /* SUPPORT_UTF8 */ +#endif /* SUPPORT_UTF */ /* Escape items that are just an encoding of a particular data value. */ @@ -1236,7 +1505,7 @@ only. */ #define PT_WORD 8 /* Word - L plus N plus underscore */ /* Flag bits and data types for the extended class (OP_XCLASS) for classes that -contain UTF-8 characters with values greater than 255. */ +contain characters with values greater than 255. */ #define XCL_NOT 0x01 /* Flag: this is a negative class */ #define XCL_MAP 0x02 /* Flag: a 32-byte map is present */ @@ -1252,7 +1521,7 @@ value such as \n. They must have non-zero values, as check_escape() returns their negation. Also, they must appear in the same order as in the opcode definitions below, up to ESC_z. There's a dummy for OP_ALLANY because it corresponds to "." in DOTALL mode rather than an escape sequence. It is also -used for [^] in JavaScript compatibility mode, and for \C in non-utf8 mode. In +used for [^] in JavaScript compatibility mode, and for \C in non-utf mode. In non-DOTALL mode, "." behaves like \N. The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc. 
@@ -1433,8 +1702,8 @@ enum { OP_CLASS, /* 106 Match a character class, chars < 256 only */ OP_NCLASS, /* 107 Same, but the bitmap was created from a negative class - the difference is relevant only when a - UTF-8 character > 255 is encountered. */ - OP_XCLASS, /* 108 Extended class for handling UTF-8 chars within the + character > 255 is encountered. */ + OP_XCLASS, /* 108 Extended class for handling > 255 chars within the class. This does both positive and negative. */ OP_REF, /* 109 Match a back reference, casefully */ OP_REFI, /* 110 Match a back reference, caselessly */ @@ -1591,30 +1860,35 @@ in UTF-8 mode. The code that uses this table must know about such things. */ 2, /* noti */ \ /* Positive single-char repeats ** These are */ \ 2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \ - 4, 4, 4, /* upto, minupto, exact ** mode */ \ - 2, 2, 2, 4, /* *+, ++, ?+, upto+ */ \ + 2+IMM2_SIZE, 2+IMM2_SIZE, /* upto, minupto ** mode */ \ + 2+IMM2_SIZE, /* exact */ \ + 2, 2, 2, 2+IMM2_SIZE, /* *+, ++, ?+, upto+ */ \ 2, 2, 2, 2, 2, 2, /* *I, *?I, +I, +?I, ?I, ??I ** UTF-8 */ \ - 4, 4, 4, /* upto I, minupto I, exact I */ \ - 2, 2, 2, 4, /* *+I, ++I, ?+I, upto+I */ \ + 2+IMM2_SIZE, 2+IMM2_SIZE, /* upto I, minupto I */ \ + 2+IMM2_SIZE, /* exact I */ \ + 2, 2, 2, 2+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */ \ /* Negative single-char repeats - only for chars < 256 */ \ 2, 2, 2, 2, 2, 2, /* NOT *, *?, +, +?, ?, ?? 
*/ \ - 4, 4, 4, /* NOT upto, minupto, exact */ \ - 2, 2, 2, 4, /* Possessive NOT *, +, ?, upto */ \ + 2+IMM2_SIZE, 2+IMM2_SIZE, /* NOT upto, minupto */ \ + 2+IMM2_SIZE, /* NOT exact */ \ + 2, 2, 2, 2+IMM2_SIZE, /* Possessive NOT *, +, ?, upto */ \ 2, 2, 2, 2, 2, 2, /* NOT *I, *?I, +I, +?I, ?I, ??I */ \ - 4, 4, 4, /* NOT upto I, minupto I, exact I */ \ - 2, 2, 2, 4, /* Possessive NOT *I, +I, ?I, upto I */ \ + 2+IMM2_SIZE, 2+IMM2_SIZE, /* NOT upto I, minupto I */ \ + 2+IMM2_SIZE, /* NOT exact I */ \ + 2, 2, 2, 2+IMM2_SIZE, /* Possessive NOT *I, +I, ?I, upto I */ \ /* Positive type repeats */ \ 2, 2, 2, 2, 2, 2, /* Type *, *?, +, +?, ?, ?? */ \ - 4, 4, 4, /* Type upto, minupto, exact */ \ - 2, 2, 2, 4, /* Possessive *+, ++, ?+, upto+ */ \ + 2+IMM2_SIZE, 2+IMM2_SIZE, /* Type upto, minupto */ \ + 2+IMM2_SIZE, /* Type exact */ \ + 2, 2, 2, 2+IMM2_SIZE, /* Possessive *+, ++, ?+, upto+ */ \ /* Character class & ref repeats */ \ 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ \ - 5, 5, /* CRRANGE, CRMINRANGE */ \ - 33, /* CLASS */ \ - 33, /* NCLASS */ \ + 1+2*IMM2_SIZE, 1+2*IMM2_SIZE, /* CRRANGE, CRMINRANGE */ \ + 1+(32/sizeof(pcre_uchar)), /* CLASS */ \ + 1+(32/sizeof(pcre_uchar)), /* NCLASS */ \ 0, /* XCLASS - variable length */ \ - 3, /* REF */ \ - 3, /* REFI */ \ + 1+IMM2_SIZE, /* REF */ \ + 1+IMM2_SIZE, /* REFI */ \ 1+LINK_SIZE, /* RECURSE */ \ 2+2*LINK_SIZE, /* CALLOUT */ \ 1+LINK_SIZE, /* Alt */ \ @@ -1631,23 +1905,23 @@ in UTF-8 mode. The code that uses this table must know about such things. 
*/ 1+LINK_SIZE, /* ONCE_NC */ \ 1+LINK_SIZE, /* BRA */ \ 1+LINK_SIZE, /* BRAPOS */ \ - 3+LINK_SIZE, /* CBRA */ \ - 3+LINK_SIZE, /* CBRAPOS */ \ + 1+LINK_SIZE+IMM2_SIZE, /* CBRA */ \ + 1+LINK_SIZE+IMM2_SIZE, /* CBRAPOS */ \ 1+LINK_SIZE, /* COND */ \ 1+LINK_SIZE, /* SBRA */ \ 1+LINK_SIZE, /* SBRAPOS */ \ - 3+LINK_SIZE, /* SCBRA */ \ - 3+LINK_SIZE, /* SCBRAPOS */ \ + 1+LINK_SIZE+IMM2_SIZE, /* SCBRA */ \ + 1+LINK_SIZE+IMM2_SIZE, /* SCBRAPOS */ \ 1+LINK_SIZE, /* SCOND */ \ - 3, 3, /* CREF, NCREF */ \ - 3, 3, /* RREF, NRREF */ \ + 1+IMM2_SIZE, 1+IMM2_SIZE, /* CREF, NCREF */ \ + 1+IMM2_SIZE, 1+IMM2_SIZE, /* RREF, NRREF */ \ 1, /* DEF */ \ 1, 1, 1, /* BRAZERO, BRAMINZERO, BRAPOSZERO */ \ 3, 1, 3, /* MARK, PRUNE, PRUNE_ARG */ \ 1, 3, /* SKIP, SKIP_ARG */ \ 1, 3, /* THEN, THEN_ARG */ \ 1, 1, 1, 1, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */ \ - 3, 1 /* CLOSE, SKIPZERO */ + 1+IMM2_SIZE, 1 /* CLOSE, SKIPZERO */ /* A magic value for OP_RREF and OP_NRREF to indicate the "any recursion" condition. */ @@ -1665,7 +1939,7 @@ enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, - ERR70, ERR71, ERR72, ERRCOUNT }; + ERR70, ERR71, ERR72, ERR73, ERR74, ERRCOUNT }; /* The real format of the start of the pcre block; the index of names and the code vector run on as long as necessary after the end. We store an explicit @@ -1684,7 +1958,13 @@ fields are present. Currently PCRE always sets the dummy fields to zero. 
NOTE NOTE NOTE */ -typedef struct real_pcre { +#ifdef COMPILE_PCRE8 +#define REAL_PCRE real_pcre +#else +#define REAL_PCRE real_pcre16 +#endif + +typedef struct REAL_PCRE { pcre_uint32 magic_number; pcre_uint32 size; /* Total that was malloced */ pcre_uint32 options; /* Public options */ @@ -1692,16 +1972,16 @@ typedef struct real_pcre { pcre_uint16 dummy1; /* For future use */ pcre_uint16 top_bracket; pcre_uint16 top_backref; - pcre_uint16 first_byte; - pcre_uint16 req_byte; + pcre_uint16 first_char; /* Starting character */ + pcre_uint16 req_char; /* This character must be seen */ pcre_uint16 name_table_offset; /* Offset to name table that follows */ pcre_uint16 name_entry_size; /* Size of any name items */ pcre_uint16 name_count; /* Number of name items */ pcre_uint16 ref_count; /* Reference count */ - const unsigned char *tables; /* Pointer to tables or NULL for std */ - const unsigned char *nullpad; /* NULL padding */ -} real_pcre; + const pcre_uint8 *tables; /* Pointer to tables or NULL for std */ + const pcre_uint8 *nullpad; /* NULL padding */ +} REAL_PCRE; /* The format of the block used to store data from pcre_study(). The same remark (see NOTE above) about extending this structure applies. */ @@ -1709,7 +1989,7 @@ remark (see NOTE above) about extending this structure applies. */ typedef struct pcre_study_data { pcre_uint32 size; /* Total that was malloced */ pcre_uint32 flags; /* Private flags */ - uschar start_bits[32]; /* Starting char bits */ + pcre_uint8 start_bits[32]; /* Starting char bits */ pcre_uint32 minlength; /* Minimum subject length */ } pcre_study_data; @@ -1728,33 +2008,33 @@ typedef struct open_capitem { doing the compiling, so that they are thread-safe. 
*/ typedef struct compile_data { - const uschar *lcc; /* Points to lower casing table */ - const uschar *fcc; /* Points to case-flipping table */ - const uschar *cbits; /* Points to character type table */ - const uschar *ctypes; /* Points to table of type maps */ - const uschar *start_workspace;/* The start of working space */ - const uschar *start_code; /* The start of the compiled code */ - const uschar *start_pattern; /* The start of the pattern */ - const uschar *end_pattern; /* The end of the pattern */ - open_capitem *open_caps; /* Chain of open capture items */ - uschar *hwm; /* High watermark of workspace */ - uschar *name_table; /* The name/number table */ - int names_found; /* Number of entries so far */ - int name_entry_size; /* Size of each entry */ - int workspace_size; /* Size of workspace */ - int bracount; /* Count of capturing parens as we compile */ - int final_bracount; /* Saved value after first pass */ - int top_backref; /* Maximum back reference */ - unsigned int backref_map; /* Bitmap of low back refs */ - int assert_depth; /* Depth of nested assertions */ - int external_options; /* External (initial) options */ - int external_flags; /* External flag bits to be set */ - int req_varyopt; /* "After variable item" flag for reqbyte */ - BOOL had_accept; /* (*ACCEPT) encountered */ - BOOL check_lookbehind; /* Lookbehinds need later checking */ - int nltype; /* Newline type */ - int nllen; /* Newline string length */ - uschar nl[4]; /* Newline string when fixed length */ + const pcre_uint8 *lcc; /* Points to lower casing table */ + const pcre_uint8 *fcc; /* Points to case-flipping table */ + const pcre_uint8 *cbits; /* Points to character type table */ + const pcre_uint8 *ctypes; /* Points to table of type maps */ + const pcre_uchar *start_workspace;/* The start of working space */ + const pcre_uchar *start_code; /* The start of the compiled code */ + const pcre_uchar *start_pattern; /* The start of the pattern */ + const pcre_uchar *end_pattern; 
/* The end of the pattern */ + open_capitem *open_caps; /* Chain of open capture items */ + pcre_uchar *hwm; /* High watermark of workspace */ + pcre_uchar *name_table; /* The name/number table */ + int names_found; /* Number of entries so far */ + int name_entry_size; /* Size of each entry */ + int workspace_size; /* Size of workspace */ + int bracount; /* Count of capturing parens as we compile */ + int final_bracount; /* Saved value after first pass */ + int top_backref; /* Maximum back reference */ + unsigned int backref_map; /* Bitmap of low back refs */ + int assert_depth; /* Depth of nested assertions */ + int external_options; /* External (initial) options */ + int external_flags; /* External flag bits to be set */ + int req_varyopt; /* "After variable item" flag for reqbyte */ + BOOL had_accept; /* (*ACCEPT) encountered */ + BOOL check_lookbehind; /* Lookbehinds need later checking */ + int nltype; /* Newline type */ + int nllen; /* Newline string length */ + pcre_uchar nl[4]; /* Newline string when fixed length */ } compile_data; /* Structure for maintaining a chain of pointers to the currently incomplete @@ -1762,7 +2042,7 @@ branches, for testing for left recursion while compiling. */ typedef struct branch_chain { struct branch_chain *outer; - uschar *current_branch; + pcre_uchar *current_branch; } branch_chain; /* Structure for items in a linked list that represents an explicit recursive @@ -1773,7 +2053,7 @@ typedef struct recursion_info { int group_num; /* Number of group that was called */ int *offset_save; /* Pointer to start of saved offsets */ int saved_max; /* Number of saved offsets */ - USPTR subject_position; /* Position at start of recursion */ + PCRE_PUCHAR subject_position; /* Position at start of recursion */ } recursion_info; /* A similar structure for pcre_dfa_exec(). 
*/ @@ -1781,7 +2061,7 @@ typedef struct recursion_info { typedef struct dfa_recursion_info { struct dfa_recursion_info *prevrec; int group_num; - USPTR subject_position; + PCRE_PUCHAR subject_position; } dfa_recursion_info; /* Structure for building a chain of data for holding the values of the subject @@ -1791,7 +2071,7 @@ pcre_exec(). */ typedef struct eptrblock { struct eptrblock *epb_prev; - USPTR epb_saved_eptr; + PCRE_PUCHAR epb_saved_eptr; } eptrblock; @@ -1802,67 +2082,68 @@ typedef struct match_data { unsigned long int match_call_count; /* As it says */ unsigned long int match_limit; /* As it says */ unsigned long int match_limit_recursion; /* As it says */ - int *offset_vector; /* Offset vector */ - int offset_end; /* One past the end */ - int offset_max; /* The maximum usable for return data */ - int nltype; /* Newline type */ - int nllen; /* Newline string length */ - int name_count; /* Number of names in name table */ - int name_entry_size; /* Size of entry in names table */ - uschar *name_table; /* Table of names */ - uschar nl[4]; /* Newline string when fixed */ - const uschar *lcc; /* Points to lower casing table */ - const uschar *ctypes; /* Points to table of type maps */ - BOOL offset_overflow; /* Set if too many extractions */ - BOOL notbol; /* NOTBOL flag */ - BOOL noteol; /* NOTEOL flag */ - BOOL utf8; /* UTF8 flag */ - BOOL jscript_compat; /* JAVASCRIPT_COMPAT flag */ - BOOL use_ucp; /* PCRE_UCP flag */ - BOOL endonly; /* Dollar not before final \n */ - BOOL notempty; /* Empty string match not wanted */ - BOOL notempty_atstart; /* Empty string match at start not wanted */ - BOOL hitend; /* Hit the end of the subject at some point */ - BOOL bsr_anycrlf; /* \R is just any CRLF, not full Unicode */ - BOOL hasthen; /* Pattern contains (*THEN) */ - BOOL ignore_skip_arg; /* For re-run when SKIP name not found */ - const uschar *start_code; /* For use when recursing */ - USPTR start_subject; /* Start of the subject string */ - USPTR end_subject; /* 
End of the subject string */ - USPTR start_match_ptr; /* Start of matched string */ - USPTR end_match_ptr; /* Subject position at end match */ - USPTR start_used_ptr; /* Earliest consulted character */ - int partial; /* PARTIAL options */ - int end_offset_top; /* Highwater mark at end of match */ - int capture_last; /* Most recent capture number */ - int start_offset; /* The start offset value */ - int match_function_type; /* Set for certain special calls of MATCH() */ - eptrblock *eptrchain; /* Chain of eptrblocks for tail recursions */ - int eptrn; /* Next free eptrblock */ - recursion_info *recursive; /* Linked list of recursion data */ - void *callout_data; /* To pass back to callouts */ - const uschar *mark; /* Mark pointer to pass back on success */ - const uschar *nomatch_mark; /* Mark pointer to pass back on failure */ - const uschar *once_target; /* Where to back up to for atomic groups */ + int *offset_vector; /* Offset vector */ + int offset_end; /* One past the end */ + int offset_max; /* The maximum usable for return data */ + int nltype; /* Newline type */ + int nllen; /* Newline string length */ + int name_count; /* Number of names in name table */ + int name_entry_size; /* Size of entry in names table */ + pcre_uchar *name_table; /* Table of names */ + pcre_uchar nl[4]; /* Newline string when fixed */ + const pcre_uint8 *lcc; /* Points to lower casing table */ + const pcre_uint8 *fcc; /* Points to case-flipping table */ + const pcre_uint8 *ctypes; /* Points to table of type maps */ + BOOL offset_overflow; /* Set if too many extractions */ + BOOL notbol; /* NOTBOL flag */ + BOOL noteol; /* NOTEOL flag */ + BOOL utf; /* UTF-8 / UTF-16 flag */ + BOOL jscript_compat; /* JAVASCRIPT_COMPAT flag */ + BOOL use_ucp; /* PCRE_UCP flag */ + BOOL endonly; /* Dollar not before final \n */ + BOOL notempty; /* Empty string match not wanted */ + BOOL notempty_atstart; /* Empty string match at start not wanted */ + BOOL hitend; /* Hit the end of the subject at some 
point */ + BOOL bsr_anycrlf; /* \R is just any CRLF, not full Unicode */ + BOOL hasthen; /* Pattern contains (*THEN) */ + BOOL ignore_skip_arg; /* For re-run when SKIP name not found */ + const pcre_uchar *start_code; /* For use when recursing */ + PCRE_PUCHAR start_subject; /* Start of the subject string */ + PCRE_PUCHAR end_subject; /* End of the subject string */ + PCRE_PUCHAR start_match_ptr; /* Start of matched string */ + PCRE_PUCHAR end_match_ptr; /* Subject position at end match */ + PCRE_PUCHAR start_used_ptr; /* Earliest consulted character */ + int partial; /* PARTIAL options */ + int end_offset_top; /* Highwater mark at end of match */ + int capture_last; /* Most recent capture number */ + int start_offset; /* The start offset value */ + int match_function_type; /* Set for certain special calls of MATCH() */ + eptrblock *eptrchain; /* Chain of eptrblocks for tail recursions */ + int eptrn; /* Next free eptrblock */ + recursion_info *recursive; /* Linked list of recursion data */ + void *callout_data; /* To pass back to callouts */ + const pcre_uchar *mark; /* Mark pointer to pass back on success */ + const pcre_uchar *nomatch_mark;/* Mark pointer to pass back on failure */ + const pcre_uchar *once_target; /* Where to back up to for atomic groups */ } match_data; /* A similar structure is used for the same purpose by the DFA matching functions. 
*/ typedef struct dfa_match_data { - const uschar *start_code; /* Start of the compiled pattern */ - const uschar *start_subject; /* Start of the subject string */ - const uschar *end_subject; /* End of subject string */ - const uschar *start_used_ptr; /* Earliest consulted character */ - const uschar *tables; /* Character tables */ - int start_offset; /* The start offset value */ - int moptions; /* Match options */ - int poptions; /* Pattern options */ - int nltype; /* Newline type */ - int nllen; /* Newline string length */ - uschar nl[4]; /* Newline string when fixed */ - void *callout_data; /* To pass back to callouts */ - dfa_recursion_info *recursive; /* Linked list of recursion data */ + const pcre_uchar *start_code; /* Start of the compiled pattern */ + const pcre_uchar *start_subject ; /* Start of the subject string */ + const pcre_uchar *end_subject; /* End of subject string */ + const pcre_uchar *start_used_ptr; /* Earliest consulted character */ + const pcre_uint8 *tables; /* Character tables */ + int start_offset; /* The start offset value */ + int moptions; /* Match options */ + int poptions; /* Pattern options */ + int nltype; /* Newline type */ + int nllen; /* Newline string length */ + pcre_uchar nl[4]; /* Newline string when fixed */ + void *callout_data; /* To pass back to callouts */ + dfa_recursion_info *recursive; /* Linked list of recursion data */ } dfa_match_data; /* Bit definitions for entries in the pcre_ctypes table. */ @@ -1898,6 +2179,28 @@ total length. 
*/ #define ctypes_offset (cbits_offset + cbit_length) #define tables_length (ctypes_offset + 256) +/* Internal function prefix */ + +#ifdef COMPILE_PCRE8 +#ifndef PUBL +#define PUBL(name) pcre_##name +#endif +#ifndef PRIV +#define PRIV(name) _pcre_##name +#endif +#else /* COMPILE_PCRE8 */ +#ifdef COMPILE_PCRE16 +#ifndef PUBL +#define PUBL(name) pcre16_##name +#endif +#ifndef PRIV +#define PRIV(name) _pcre16_##name +#endif +#else +#error Unsupported compiling mode +#endif /* COMPILE_PCRE16 */ +#endif /* COMPILE_PCRE8 */ + /* Layout of the UCP type table that translates property names into types and codes. Each entry used to point directly to a name, but to reduce the number of relocations in shared libraries, it now has an offset into a single string @@ -1915,75 +2218,115 @@ of the exported public functions. They have to be "external" in the C sense, but are not part of the PCRE public API. The data for these tables is in the pcre_tables.c module. */ -extern const int _pcre_utf8_table1[]; -extern const int _pcre_utf8_table2[]; -extern const int _pcre_utf8_table3[]; -extern const uschar _pcre_utf8_table4[]; +#ifdef COMPILE_PCRE8 -#ifdef SUPPORT_JIT -extern const uschar _pcre_utf8_char_sizes[]; -#endif +extern const int PRIV(utf8_table1)[]; +extern const int PRIV(utf8_table1_size); +extern const int PRIV(utf8_table2)[]; +extern const int PRIV(utf8_table3)[]; +extern const pcre_uint8 PRIV(utf8_table4)[]; -extern const int _pcre_utf8_table1_size; +#endif /* COMPILE_PCRE8 */ -extern const char _pcre_utt_names[]; -extern const ucp_type_table _pcre_utt[]; -extern const int _pcre_utt_size; +extern const char PRIV(utt_names)[]; +extern const ucp_type_table PRIV(utt)[]; +extern const int PRIV(utt_size); -extern const uschar _pcre_default_tables[]; +extern const pcre_uint8 PRIV(default_tables)[]; -extern const uschar _pcre_OP_lengths[]; +extern const pcre_uint8 PRIV(OP_lengths)[]; /* Internal shared functions. 
These are functions that are used by more than one of the exported public functions. They have to be "external" in the C sense, but are not part of the PCRE public API. */ -extern const uschar *_pcre_find_bracket(const uschar *, BOOL, int); -extern BOOL _pcre_is_newline(USPTR, int, USPTR, int *, BOOL); -extern int _pcre_ord2utf8(int, uschar *); -extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *, - const pcre_study_data *, pcre_study_data *); -extern int _pcre_valid_utf8(USPTR, int, int *); -extern BOOL _pcre_was_newline(USPTR, int, USPTR, int *, BOOL); -extern BOOL _pcre_xclass(int, const uschar *); +/* String comparison functions. */ +#ifdef COMPILE_PCRE8 + +#define STRCMP_UC_UC(str1, str2) \ + strcmp((char *)(str1), (char *)(str2)) +#define STRCMP_UC_C8(str1, str2) \ + strcmp((char *)(str1), (str2)) +#define STRNCMP_UC_UC(str1, str2, num) \ + strncmp((char *)(str1), (char *)(str2), (num)) +#define STRNCMP_UC_C8(str1, str2, num) \ + strncmp((char *)(str1), (str2), (num)) +#define STRLEN_UC(str) strlen((const char *)str) + +#else + +extern int PRIV(strcmp_uc_uc)(const pcre_uchar *, + const pcre_uchar *); +extern int PRIV(strcmp_uc_c8)(const pcre_uchar *, + const char *); +extern int PRIV(strncmp_uc_uc)(const pcre_uchar *, + const pcre_uchar *, unsigned int num); +extern int PRIV(strncmp_uc_c8)(const pcre_uchar *, + const char *, unsigned int num); +extern unsigned int PRIV(strlen_uc)(const pcre_uchar *str); + +#define STRCMP_UC_UC(str1, str2) \ + PRIV(strcmp_uc_uc)((str1), (str2)) +#define STRCMP_UC_C8(str1, str2) \ + PRIV(strcmp_uc_c8)((str1), (str2)) +#define STRNCMP_UC_UC(str1, str2, num) \ + PRIV(strncmp_uc_uc)((str1), (str2), (num)) +#define STRNCMP_UC_C8(str1, str2, num) \ + PRIV(strncmp_uc_c8)((str1), (str2), (num)) +#define STRLEN_UC(str) PRIV(strlen_uc)(str) + +#endif /* COMPILE_PCRE8 */ + +extern const pcre_uchar *PRIV(find_bracket)(const pcre_uchar *, BOOL, int); +extern BOOL PRIV(is_newline)(PCRE_PUCHAR, int, PCRE_PUCHAR, + int *, 
BOOL); +extern int PRIV(ord2utf)(pcre_uint32, pcre_uchar *); +extern int PRIV(valid_utf)(PCRE_PUCHAR, int, int *); +extern BOOL PRIV(was_newline)(PCRE_PUCHAR, int, PCRE_PUCHAR, + int *, BOOL); +extern BOOL PRIV(xclass)(int, const pcre_uchar *, BOOL); #ifdef SUPPORT_JIT -extern void _pcre_jit_compile(const real_pcre *, pcre_extra *); -extern int _pcre_jit_exec(const real_pcre *, void *, PCRE_SPTR, - int, int, int, int, int *, int); -extern void _pcre_jit_free(void *); -extern int _pcre_jit_get_size(void *); +extern void PRIV(jit_compile)(const REAL_PCRE *, PUBL(extra) *); +extern int PRIV(jit_exec)(const REAL_PCRE *, void *, + const pcre_uchar *, int, int, int, int, int *, int); +extern void PRIV(jit_free)(void *); +extern int PRIV(jit_get_size)(void *); +extern const char* PRIV(jit_get_target)(void); #endif /* Unicode character database (UCD) */ typedef struct { - uschar script; - uschar chartype; + pcre_uint8 script; + pcre_uint8 chartype; pcre_int32 other_case; } ucd_record; -extern const ucd_record _pcre_ucd_records[]; -extern const uschar _pcre_ucd_stage1[]; -extern const pcre_uint16 _pcre_ucd_stage2[]; -extern const int _pcre_ucp_gentype[]; +extern const ucd_record PRIV(ucd_records)[]; +extern const pcre_uint8 PRIV(ucd_stage1)[]; +extern const pcre_uint16 PRIV(ucd_stage2)[]; +extern const int PRIV(ucp_gentype)[]; #ifdef SUPPORT_JIT -extern const int _pcre_ucp_typerange[]; +extern const int PRIV(ucp_typerange)[]; #endif +#ifdef SUPPORT_UCP /* UCD access macros */ #define UCD_BLOCK_SIZE 128 -#define GET_UCD(ch) (_pcre_ucd_records + \ - _pcre_ucd_stage2[_pcre_ucd_stage1[(ch) / UCD_BLOCK_SIZE] * \ +#define GET_UCD(ch) (PRIV(ucd_records) + \ + PRIV(ucd_stage2)[PRIV(ucd_stage1)[(ch) / UCD_BLOCK_SIZE] * \ UCD_BLOCK_SIZE + (ch) % UCD_BLOCK_SIZE]) #define UCD_CHARTYPE(ch) GET_UCD(ch)->chartype #define UCD_SCRIPT(ch) GET_UCD(ch)->script -#define UCD_CATEGORY(ch) _pcre_ucp_gentype[UCD_CHARTYPE(ch)] +#define UCD_CATEGORY(ch) PRIV(ucp_gentype)[UCD_CHARTYPE(ch)] #define 
UCD_OTHERCASE(ch) (ch + GET_UCD(ch)->other_case) +#endif /* SUPPORT_UCP */ + #endif /* End of pcre_internal.h */ diff --git a/usr.sbin/nginx/src/pcre/pcre_newline.c b/usr.sbin/nginx/src/pcre/pcre_newline.c index 38cf7f72f8d..a0a13c8ed11 100644 --- a/usr.sbin/nginx/src/pcre/pcre_newline.c +++ b/usr.sbin/nginx/src/pcre/pcre_newline.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2009 University of Cambridge + Copyright (c) 1997-2012 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -67,16 +67,25 @@ Arguments: type the newline type endptr pointer to the end of the string lenptr where to return the length - utf8 TRUE if in utf8 mode + utf TRUE if in utf mode Returns: TRUE or FALSE */ BOOL -_pcre_is_newline(USPTR ptr, int type, USPTR endptr, int *lenptr, BOOL utf8) +PRIV(is_newline)(PCRE_PUCHAR ptr, int type, PCRE_PUCHAR endptr, int *lenptr, + BOOL utf) { int c; -if (utf8) { GETCHAR(c, ptr); } else c = *ptr; +(void)utf; +#ifdef SUPPORT_UTF +if (utf) + { + GETCHAR(c, ptr); + } +else +#endif /* SUPPORT_UTF */ + c = *ptr; if (type == NLTYPE_ANYCRLF) switch(c) { @@ -95,9 +104,15 @@ else switch(c) case 0x000c: *lenptr = 1; return TRUE; /* FF */ case 0x000d: *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a)? 2 : 1; return TRUE; /* CR */ - case 0x0085: *lenptr = utf8? 2 : 1; return TRUE; /* NEL */ +#ifdef COMPILE_PCRE8 + case 0x0085: *lenptr = utf? 
2 : 1; return TRUE; /* NEL */ case 0x2028: /* LS */ case 0x2029: *lenptr = 3; return TRUE; /* PS */ +#else + case 0x0085: /* NEL */ + case 0x2028: /* LS */ + case 0x2029: *lenptr = 1; return TRUE; /* PS */ +#endif /* COMPILE_PCRE8 */ default: return FALSE; } } @@ -116,26 +131,27 @@ Arguments: type the newline type startptr pointer to the start of the string lenptr where to return the length - utf8 TRUE if in utf8 mode + utf TRUE if in utf mode Returns: TRUE or FALSE */ BOOL -_pcre_was_newline(USPTR ptr, int type, USPTR startptr, int *lenptr, BOOL utf8) +PRIV(was_newline)(PCRE_PUCHAR ptr, int type, PCRE_PUCHAR startptr, int *lenptr, + BOOL utf) { int c; +(void)utf; ptr--; -#ifdef SUPPORT_UTF8 -if (utf8) +#ifdef SUPPORT_UTF +if (utf) { BACKCHAR(ptr); GETCHAR(c, ptr); } -else c = *ptr; -#else /* no UTF-8 support */ -c = *ptr; -#endif /* SUPPORT_UTF8 */ +else +#endif /* SUPPORT_UTF */ + c = *ptr; if (type == NLTYPE_ANYCRLF) switch(c) { @@ -152,9 +168,15 @@ else switch(c) case 0x000b: /* VT */ case 0x000c: /* FF */ case 0x000d: *lenptr = 1; return TRUE; /* CR */ - case 0x0085: *lenptr = utf8? 2 : 1; return TRUE; /* NEL */ +#ifdef COMPILE_PCRE8 + case 0x0085: *lenptr = utf? 2 : 1; return TRUE; /* NEL */ case 0x2028: /* LS */ case 0x2029: *lenptr = 3; return TRUE; /* PS */ +#else + case 0x0085: /* NEL */ + case 0x2028: /* LS */ + case 0x2029: *lenptr = 1; return TRUE; /* PS */ +#endif /* COMPILE_PCRE8 */ default: return FALSE; } } diff --git a/usr.sbin/nginx/src/pcre/pcre_ord2utf8.c b/usr.sbin/nginx/src/pcre/pcre_ord2utf8.c index 6f4eb9ebe95..50fca9525ac 100644 --- a/usr.sbin/nginx/src/pcre/pcre_ord2utf8.c +++ b/usr.sbin/nginx/src/pcre/pcre_ord2utf8.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. 
Written by Philip Hazel - Copyright (c) 1997-2008 University of Cambridge + Copyright (c) 1997-2012 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -52,35 +52,45 @@ character value into a UTF8 string. */ * Convert character value to UTF-8 * *************************************************/ -/* This function takes an integer value in the range 0 - 0x7fffffff -and encodes it as a UTF-8 character in 0 to 6 bytes. +/* This function takes an integer value in the range 0 - 0x10ffff +and encodes it as a UTF-8 character in 1 to 6 pcre_uchars. Arguments: cvalue the character value - buffer pointer to buffer for result - at least 6 bytes long + buffer pointer to buffer for result - at least 6 pcre_uchars long Returns: number of characters placed in the buffer */ int -_pcre_ord2utf8(int cvalue, uschar *buffer) +PRIV(ord2utf)(pcre_uint32 cvalue, pcre_uchar *buffer) { -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF + register int i, j; -for (i = 0; i < _pcre_utf8_table1_size; i++) - if (cvalue <= _pcre_utf8_table1[i]) break; + +/* Checking invalid cvalue character, encoded as invalid UTF-16 character. +Should never happen in practice. */ +if ((cvalue & 0xf800) == 0xd800 || cvalue >= 0x110000) + cvalue = 0xfffe; + +for (i = 0; i < PRIV(utf8_table1_size); i++) + if ((int)cvalue <= PRIV(utf8_table1)[i]) break; buffer += i; for (j = i; j > 0; j--) { *buffer-- = 0x80 | (cvalue & 0x3f); cvalue >>= 6; } -*buffer = _pcre_utf8_table2[i] | cvalue; +*buffer = PRIV(utf8_table2)[i] | cvalue; return i + 1; + #else + (void)(cvalue); /* Keep compiler happy; this function won't ever be */ -(void)(buffer); /* called when SUPPORT_UTF8 is not defined. */ +(void)(buffer); /* called when SUPPORT_UTF is not defined. 
*/ return 0; + #endif } diff --git a/usr.sbin/nginx/src/pcre/pcre_tables.c b/usr.sbin/nginx/src/pcre/pcre_tables.c index 45c221181ac..c8134ec318f 100644 --- a/usr.sbin/nginx/src/pcre/pcre_tables.c +++ b/usr.sbin/nginx/src/pcre/pcre_tables.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2009 University of Cambridge + Copyright (c) 1997-2012 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -37,6 +37,7 @@ POSSIBILITY OF SUCH DAMAGE. ----------------------------------------------------------------------------- */ +#ifndef PCRE_INCLUDED /* This module contains some fixed tables that are used by more than one of the PCRE code modules. The tables are also #included by the pcretest program, which @@ -50,11 +51,12 @@ clashes with the library. */ #include "pcre_internal.h" +#endif /* PCRE_INCLUDED */ /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that the definition is next to the definition of the opcodes in pcre_internal.h. */ -const uschar _pcre_OP_lengths[] = { OP_LENGTHS }; +const pcre_uint8 PRIV(OP_lengths)[] = { OP_LENGTHS }; @@ -65,44 +67,38 @@ const uschar _pcre_OP_lengths[] = { OP_LENGTHS }; /* These are the breakpoints for different numbers of bytes in a UTF-8 character. */ -#ifdef SUPPORT_UTF8 +#if (defined SUPPORT_UTF && defined COMPILE_PCRE8) \ + || (defined PCRE_INCLUDED && defined SUPPORT_PCRE16) -const int _pcre_utf8_table1[] = +/* These tables are also required by pcretest in 16 bit mode. 
*/ + +const int PRIV(utf8_table1)[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff}; -const int _pcre_utf8_table1_size = sizeof(_pcre_utf8_table1)/sizeof(int); +const int PRIV(utf8_table1_size) = sizeof(PRIV(utf8_table1)) / sizeof(int); /* These are the indicator bits and the mask for the data bits to set in the first byte of a character, indexed by the number of additional bytes. */ -const int _pcre_utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc}; -const int _pcre_utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01}; +const int PRIV(utf8_table2)[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc}; +const int PRIV(utf8_table3)[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01}; /* Table of the number of extra bytes, indexed by the first byte masked with 0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d. */ -const uschar _pcre_utf8_table4[] = { +const pcre_uint8 PRIV(utf8_table4)[] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 }; -#ifdef SUPPORT_JIT -/* Full table of the number of extra bytes when the -character code is greater or equal than 0xc0. -See _pcre_utf8_table4 above. */ +#endif /* (SUPPORT_UTF && COMPILE_PCRE8) || (PCRE_INCLUDED && SUPPORT_PCRE16)*/ -const uschar _pcre_utf8_char_sizes[] = { - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4, -}; -#endif +#ifdef SUPPORT_UTF /* Table to translate from particular type value to the general value. */ -const int _pcre_ucp_gentype[] = { +const int PRIV(ucp_gentype)[] = { ucp_C, ucp_C, ucp_C, ucp_C, ucp_C, /* Cc, Cf, Cn, Co, Cs */ ucp_L, ucp_L, ucp_L, ucp_L, ucp_L, /* Ll, Lu, Lm, Lo, Lt */ ucp_M, ucp_M, ucp_M, /* Mc, Me, Mn */ @@ -114,10 +110,10 @@ const int _pcre_ucp_gentype[] = { }; #ifdef SUPPORT_JIT -/* This table reverses _pcre_ucp_gentype. We can save the cost +/* This table reverses PRIV(ucp_gentype). 
We can save the cost of a memory load. */ -const int _pcre_ucp_typerange[] = { +const int PRIV(ucp_typerange)[] = { ucp_Cc, ucp_Cs, ucp_Ll, ucp_Lu, ucp_Mc, ucp_Mn, @@ -126,7 +122,7 @@ const int _pcre_ucp_typerange[] = { ucp_Sc, ucp_So, ucp_Zl, ucp_Zs, }; -#endif +#endif /* SUPPORT_JIT */ /* The pcre_utt[] table below translates Unicode property names into type and code values. It is searched by binary chop, so must be in collating sequence of @@ -284,7 +280,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */ #define STRING_Zp0 STR_Z STR_p "\0" #define STRING_Zs0 STR_Z STR_s "\0" -const char _pcre_utt_names[] = +const char PRIV(utt_names)[] = STRING_Any0 STRING_Arabic0 STRING_Armenian0 @@ -424,7 +420,7 @@ const char _pcre_utt_names[] = STRING_Zp0 STRING_Zs0; -const ucp_type_table _pcre_utt[] = { +const ucp_type_table PRIV(utt)[] = { { 0, PT_ANY, 0 }, { 4, PT_SC, ucp_Arabic }, { 11, PT_SC, ucp_Armenian }, @@ -565,8 +561,8 @@ const ucp_type_table _pcre_utt[] = { { 961, PT_PC, ucp_Zs } }; -const int _pcre_utt_size = sizeof(_pcre_utt)/sizeof(ucp_type_table); +const int PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table); -#endif /* SUPPORT_UTF8 */ +#endif /* SUPPORT_UTF */ /* End of pcre_tables.c */ diff --git a/usr.sbin/nginx/src/pcre/pcre_try_flipped.c b/usr.sbin/nginx/src/pcre/pcre_try_flipped.c deleted file mode 100644 index 606504c0b0d..00000000000 --- a/usr.sbin/nginx/src/pcre/pcre_try_flipped.c +++ /dev/null @@ -1,139 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. 
- - Written by Philip Hazel - Copyright (c) 1997-2009 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - - -/* This module contains an internal function that tests a compiled pattern to -see if it was compiled with the opposite endianness. If so, it uses an -auxiliary local function to flip the appropriate bytes. 
*/ - - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -#include "pcre_internal.h" - - -/************************************************* -* Flip bytes in an integer * -*************************************************/ - -/* This function is called when the magic number in a regex doesn't match, in -order to flip its bytes to see if we are dealing with a pattern that was -compiled on a host of different endianness. If so, this function is used to -flip other byte values. - -Arguments: - value the number to flip - n the number of bytes to flip (assumed to be 2 or 4) - -Returns: the flipped value -*/ - -static unsigned long int -byteflip(unsigned long int value, int n) -{ -if (n == 2) return ((value & 0x00ff) << 8) | ((value & 0xff00) >> 8); -return ((value & 0x000000ff) << 24) | - ((value & 0x0000ff00) << 8) | - ((value & 0x00ff0000) >> 8) | - ((value & 0xff000000) >> 24); -} - - - -/************************************************* -* Test for a byte-flipped compiled regex * -*************************************************/ - -/* This function is called from pcre_exec(), pcre_dfa_exec(), and also from -pcre_fullinfo(). Its job is to test whether the regex is byte-flipped - that -is, it was compiled on a system of opposite endianness. The function is called -only when the native MAGIC_NUMBER test fails. If the regex is indeed flipped, -we flip all the relevant values into a different data block, and return it. 
- -Arguments: - re points to the regex - study points to study data, or NULL - internal_re points to a new regex block - internal_study points to a new study block - -Returns: the new block if is is indeed a byte-flipped regex - NULL if it is not -*/ - -real_pcre * -_pcre_try_flipped(const real_pcre *re, real_pcre *internal_re, - const pcre_study_data *study, pcre_study_data *internal_study) -{ -if (byteflip(re->magic_number, sizeof(re->magic_number)) != MAGIC_NUMBER) - return NULL; - -*internal_re = *re; /* To copy other fields */ -internal_re->size = byteflip(re->size, sizeof(re->size)); -internal_re->options = byteflip(re->options, sizeof(re->options)); -internal_re->flags = (pcre_uint16)byteflip(re->flags, sizeof(re->flags)); -internal_re->top_bracket = - (pcre_uint16)byteflip(re->top_bracket, sizeof(re->top_bracket)); -internal_re->top_backref = - (pcre_uint16)byteflip(re->top_backref, sizeof(re->top_backref)); -internal_re->first_byte = - (pcre_uint16)byteflip(re->first_byte, sizeof(re->first_byte)); -internal_re->req_byte = - (pcre_uint16)byteflip(re->req_byte, sizeof(re->req_byte)); -internal_re->name_table_offset = - (pcre_uint16)byteflip(re->name_table_offset, sizeof(re->name_table_offset)); -internal_re->name_entry_size = - (pcre_uint16)byteflip(re->name_entry_size, sizeof(re->name_entry_size)); -internal_re->name_count = - (pcre_uint16)byteflip(re->name_count, sizeof(re->name_count)); - -if (study != NULL) - { - *internal_study = *study; /* To copy other fields */ - internal_study->size = byteflip(study->size, sizeof(study->size)); - internal_study->flags = byteflip(study->flags, sizeof(study->flags)); - internal_study->minlength = byteflip(study->minlength, - sizeof(study->minlength)); - } - -return internal_re; -} - -/* End of pcre_tryflipped.c */ diff --git a/usr.sbin/nginx/src/pcre/pcre_ucd.c b/usr.sbin/nginx/src/pcre/pcre_ucd.c index 112cfb41a62..b25574e267e 100644 --- a/usr.sbin/nginx/src/pcre/pcre_ucd.c +++ b/usr.sbin/nginx/src/pcre/pcre_ucd.c @@ 
-18,21 +18,21 @@ /* Instead, just supply small dummy tables. */ #ifndef SUPPORT_UCP -const ucd_record _pcre_ucd_records[] = {{0,0,0 }}; -const uschar _pcre_ucd_stage1[] = {0}; -const pcre_uint16 _pcre_ucd_stage2[] = {0}; +const ucd_record PRIV(ucd_records)[] = {{0,0,0 }}; +const pcre_uint8 PRIV(ucd_stage1)[] = {0}; +const pcre_uint16 PRIV(ucd_stage2)[] = {0}; #else /* When recompiling tables with a new Unicode version, please check types in the structure definition from pcre_internal.h: typedef struct { -uschar property_0; -uschar property_1; +pcre_uint8 property_0; +pcre_uint8 property_1; pcre_int32 property_2; } ucd_record; */ -const ucd_record _pcre_ucd_records[] = { /* 4320 bytes, record size 8 */ +const ucd_record PRIV(ucd_records)[] = { /* 4320 bytes, record size 8 */ { 9, 0, 0, }, /* 0 */ { 9, 29, 0, }, /* 1 */ { 9, 21, 0, }, /* 2 */ @@ -575,7 +575,7 @@ const ucd_record _pcre_ucd_records[] = { /* 4320 bytes, record size 8 */ { 26, 26, 0, }, /* 539 */ }; -const uschar _pcre_ucd_stage1[] = { /* 8704 bytes */ +const pcre_uint8 PRIV(ucd_stage1)[] = { /* 8704 bytes */ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, /* U+0000 */ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, /* U+0800 */ 32, 33, 34, 34, 35, 36, 37, 38, 39, 40, 40, 40, 41, 42, 43, 44, /* U+1000 */ @@ -1122,7 +1122,7 @@ const uschar _pcre_ucd_stage1[] = { /* 8704 bytes */ 114,114,114,114,114,114,114,114,114,114,114,114,114,114,114,184, /* U+10F800 */ }; -const pcre_uint16 _pcre_ucd_stage2[] = { /* 47360 bytes, block = 128 */ +const pcre_uint16 PRIV(ucd_stage2)[] = { /* 47360 bytes, block = 128 */ /* block 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, diff --git a/usr.sbin/nginx/src/pcre/pcre_valid_utf8.c b/usr.sbin/nginx/src/pcre/pcre_valid_utf8.c index b94bcc98e6b..7b9d3dfa496 100644 --- a/usr.sbin/nginx/src/pcre/pcre_valid_utf8.c +++ b/usr.sbin/nginx/src/pcre/pcre_valid_utf8.c @@ -6,7 +6,7 @@ and semantics are as close 
as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2009 University of Cambridge + Copyright (c) 1997-2012 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -103,10 +103,10 @@ Returns: = 0 if the string is a valid UTF-8 string */ int -_pcre_valid_utf8(USPTR string, int length, int *erroroffset) +PRIV(valid_utf)(PCRE_PUCHAR string, int length, int *erroroffset) { -#ifdef SUPPORT_UTF8 -register USPTR p; +#ifdef SUPPORT_UTF +register PCRE_PUCHAR p; if (length < 0) { @@ -133,7 +133,7 @@ for (p = string; length-- > 0; p++) return PCRE_UTF8_ERR21; } - ab = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ + ab = PRIV(utf8_table4)[c & 0x3f]; /* Number of additional bytes */ if (length < ab) { *erroroffset = (int)(p - string); /* Missing bytes */ @@ -288,7 +288,7 @@ for (p = string; length-- > 0; p++) } } -#else /* SUPPORT_UTF8 */ +#else /* SUPPORT_UTF */ (void)(string); /* Keep picky compilers happy */ (void)(length); #endif diff --git a/usr.sbin/nginx/src/pcre/pcre_xclass.c b/usr.sbin/nginx/src/pcre/pcre_xclass.c index 64c1b0043fa..dca7a399403 100644 --- a/usr.sbin/nginx/src/pcre/pcre_xclass.c +++ b/usr.sbin/nginx/src/pcre/pcre_xclass.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2010 University of Cambridge + Copyright (c) 1997-2012 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -64,39 +64,63 @@ Returns: TRUE if character matches, else FALSE */ BOOL -_pcre_xclass(int c, const uschar *data) +PRIV(xclass)(int c, const pcre_uchar *data, BOOL utf) { int t; BOOL negated = (*data & XCL_NOT) != 0; +(void)utf; +#ifdef COMPILE_PCRE8 +/* In 8 bit mode, this must always be TRUE. 
Help the compiler to know that. */ +utf = TRUE; +#endif + /* Character values < 256 are matched against a bitmap, if one is present. If not, we still carry on, because there may be ranges that start below 256 in the additional data. */ if (c < 256) { - if ((*data & XCL_MAP) != 0 && (data[1 + c/8] & (1 << (c&7))) != 0) - return !negated; /* char found */ + if ((*data & XCL_MAP) != 0 && + (((pcre_uint8 *)(data + 1))[c/8] & (1 << (c&7))) != 0) + return !negated; /* char found */ } /* First skip the bit map if present. Then match against the list of Unicode properties or large chars or ranges that end with a large char. We won't ever encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */ -if ((*data++ & XCL_MAP) != 0) data += 32; +if ((*data++ & XCL_MAP) != 0) data += 32 / sizeof(pcre_uchar); while ((t = *data++) != XCL_END) { int x, y; if (t == XCL_SINGLE) { - GETCHARINC(x, data); +#ifdef SUPPORT_UTF + if (utf) + { + GETCHARINC(x, data); /* macro generates multiple statements */ + } + else +#endif + x = *data++; if (c == x) return !negated; } else if (t == XCL_RANGE) { - GETCHARINC(x, data); - GETCHARINC(y, data); +#ifdef SUPPORT_UTF + if (utf) + { + GETCHARINC(x, data); /* macro generates multiple statements */ + GETCHARINC(y, data); /* macro generates multiple statements */ + } + else +#endif + { + x = *data++; + y = *data++; + } if (c >= x && c <= y) return !negated; } @@ -117,7 +141,7 @@ while ((t = *data++) != XCL_END) break; case PT_GC: - if ((data[1] == _pcre_ucp_gentype[prop->chartype]) == (t == XCL_PROP)) + if ((data[1] == PRIV(ucp_gentype)[prop->chartype]) == (t == XCL_PROP)) return !negated; break; @@ -130,28 +154,28 @@ while ((t = *data++) != XCL_END) break; case PT_ALNUM: - if ((_pcre_ucp_gentype[prop->chartype] == ucp_L || - _pcre_ucp_gentype[prop->chartype] == ucp_N) == (t == XCL_PROP)) + if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || + PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (t == XCL_PROP)) return !negated; break; case 
PT_SPACE: /* Perl space */ - if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z || + if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z || c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR) == (t == XCL_PROP)) return !negated; break; case PT_PXSPACE: /* POSIX space */ - if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z || + if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z || c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || c == CHAR_FF || c == CHAR_CR) == (t == XCL_PROP)) return !negated; break; case PT_WORD: - if ((_pcre_ucp_gentype[prop->chartype] == ucp_L || - _pcre_ucp_gentype[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE) + if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || + PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE) == (t == XCL_PROP)) return !negated; break; |
