xref: /PHP-7.0/ext/pcre/pcrelib/pcre_compile.c (revision d37658be)
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9            Copyright (c) 1997-2014 University of Cambridge
10 
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14 
15     * Redistributions of source code must retain the above copyright notice,
16       this list of conditions and the following disclaimer.
17 
18     * Redistributions in binary form must reproduce the above copyright
19       notice, this list of conditions and the following disclaimer in the
20       documentation and/or other materials provided with the distribution.
21 
22     * Neither the name of the University of Cambridge nor the names of its
23       contributors may be used to endorse or promote products derived from
24       this software without specific prior written permission.
25 
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39 
40 
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43 
44 
45 #include "config.h"
46 
47 #define NLBLOCK cd             /* Block containing newline information */
48 #define PSSTART start_pattern  /* Field containing pattern start */
49 #define PSEND   end_pattern    /* Field containing pattern end */
50 
51 #include "pcre_internal.h"
52 
53 
54 /* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
55 is also used by pcretest. PCRE_DEBUG is not defined when building a production
56 library. We do not need to select pcre16_printint.c specially, because the
57 COMPILE_PCREx macro will already be appropriately set. */
58 
59 #ifdef PCRE_DEBUG
60 /* pcre_printint.c should not include any headers */
61 #define PCRE_INCLUDED
62 #include "pcre_printint.c"
63 #undef PCRE_INCLUDED
64 #endif
65 
66 
/* Macro for setting individual bits in class bitmaps: it sets bit number b in
the byte-array bitmap a (byte b/8, bit b&7 within that byte). Both arguments
and the whole expansion are parenthesized, so the macro expands correctly when
"a" is an expression such as "map + 1" and when the macro is used as part of a
larger expression. Note that b is evaluated twice, so it must not have side
effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)&7)))
70 
/* Upper bound used when checking that the integer holding the compiled
pattern length has not overflowed. It is set a little below INT_MAX so that the
bytes which terminate a group can be added without a separate check each time.
*/

#define OFLOW_MAX (INT_MAX - 20)
77 
78 /* Definitions to allow mutual recursion */
79 
80 static int
81   add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
82     const pcre_uint32 *, unsigned int);
83 
84 static BOOL
85   compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
86     pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
87     compile_data *, int *);
88 
89 
90 
91 /*************************************************
92 *      Code parameters and static tables         *
93 *************************************************/
94 
95 /* This value specifies the size of stack workspace that is used during the
96 first pre-compile phase that determines how much memory is required. The regex
97 is partly compiled into this space, but the compiled parts are discarded as
98 soon as they can be, so that hopefully there will never be an overrun. The code
99 does, however, check for an overrun. The largest amount I've seen used is 218,
100 so this number is very generous.
101 
102 The same workspace is used during the second, actual compile phase for
103 remembering forward references to groups so that they can be filled in at the
104 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
105 is 4 there is plenty of room for most patterns. However, the memory can get
106 filled up by repetitions of forward references, for example patterns like
107 /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
108 that the workspace is expanded using malloc() in this situation. The value
109 below is therefore a minimum, and we put a maximum on it for safety. The
110 minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
111 kicks in at the same number of forward references in all cases. */
112 
#define COMPILE_WORK_SIZE     (2048 * LINK_SIZE)         /* Minimum size */
#define COMPILE_WORK_SIZE_MAX (100 * COMPILE_WORK_SIZE)  /* Safety maximum */
115 
/* Number of slots in the initial list used to remember named groups during
the pre-compile. The list starts out on the stack; should it prove too small,
it is enlarged with malloc(), in a similar way to the workspace. */

#define NAMED_GROUP_LIST_SIZE  20

/* Overrun tests compare against a slightly reduced size, so that an overrun is
detected before anything actually runs off the end of the data block. */

#define WORK_SIZE_SAFETY_MARGIN (100)
127 
/* Private flag bits that are added to firstchar and reqchar. */

#define REQ_CASELESS    0x01            /* Bit 0: indicates caselessness */
#define REQ_VARY        0x02            /* Bit 1: reqchar followed non-literal item */

/* Negative values for the firstchar and reqchar flags. */

#define REQ_UNSET       (-2)
#define REQ_NONE        (-1)
135 
/* Repeated character flags. The constant has type long; the suffix is written
as an upper-case 'L' because a lower-case 'l' is easily misread as the digit
one, which would make the value look like 0x100000001. */

#define UTF_LENGTH     0x10000000L      /* The char contains its length. */
139 
140 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
141 are simple data values; negative values are for special things like \d and so
142 on. Zero means further processing is needed (for things like \x), or the escape
143 is invalid. */
144 
145 #ifndef EBCDIC
146 
147 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
148 in UTF-8 mode. */
149 
150 static const short int escapes[] = {
151      0,                       0,
152      0,                       0,
153      0,                       0,
154      0,                       0,
155      0,                       0,
156      CHAR_COLON,              CHAR_SEMICOLON,
157      CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
158      CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
159      CHAR_COMMERCIAL_AT,      -ESC_A,
160      -ESC_B,                  -ESC_C,
161      -ESC_D,                  -ESC_E,
162      0,                       -ESC_G,
163      -ESC_H,                  0,
164      0,                       -ESC_K,
165      0,                       0,
166      -ESC_N,                  0,
167      -ESC_P,                  -ESC_Q,
168      -ESC_R,                  -ESC_S,
169      0,                       0,
170      -ESC_V,                  -ESC_W,
171      -ESC_X,                  0,
172      -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
173      CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
174      CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
175      CHAR_GRAVE_ACCENT,       ESC_a,
176      -ESC_b,                  0,
177      -ESC_d,                  ESC_e,
178      ESC_f,                   0,
179      -ESC_h,                  0,
180      0,                       -ESC_k,
181      0,                       0,
182      ESC_n,                   0,
183      -ESC_p,                  0,
184      ESC_r,                   -ESC_s,
185      ESC_tee,                 0,
186      -ESC_v,                  -ESC_w,
187      0,                       0,
188      -ESC_z
189 };
190 
191 #else
192 
193 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
194 
195 static const short int escapes[] = {
196 /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
197 /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
198 /*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
199 /*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
200 /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
201 /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
202 /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
203 /*  80 */     0, ESC_a, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
204 /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
205 /*  90 */     0,     0, -ESC_k,       0,      0, ESC_n,      0, -ESC_p,
206 /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
207 /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
208 /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
209 /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
210 /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
211 /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
212 /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
213 /*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
214 /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
215 /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
216 /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
217 /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
218 /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
219 };
220 
/* We also need a table of characters that may follow \c in an EBCDIC
environment for characters 0-31. The table is a fixed lookup list, so it is
const-qualified: the compiler may then keep it in read-only storage, and any
accidental write to it is rejected at compile time. */

static const unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
225 
226 #endif
227 
228 
229 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
230 searched linearly. Put all the names into a single string, in order to reduce
231 the number of relocations when a shared library is dynamically linked. The
232 string is built from string macros so that it works in UTF-8 mode on EBCDIC
233 platforms. */
234 
/* One entry of the verb table: the length of the verb's name, plus the opcode
to use when the verb has no argument and the opcode to use when it has one. */

typedef struct verbitem {
  int len;       /* Length of verb name */
  int op;        /* Op when no arg, or -1 if arg mandatory */
  int op_arg;    /* Op when arg present, or -1 if not allowed */
} verbitem;
240 
241 static const char verbnames[] =
242   "\0"                       /* Empty name is a shorthand for MARK */
243   STRING_MARK0
244   STRING_ACCEPT0
245   STRING_COMMIT0
246   STRING_F0
247   STRING_FAIL0
248   STRING_PRUNE0
249   STRING_SKIP0
250   STRING_THEN;
251 
252 static const verbitem verbs[] = {
253   { 0, -1,        OP_MARK },
254   { 4, -1,        OP_MARK },
255   { 6, OP_ACCEPT, -1 },
256   { 6, OP_COMMIT, -1 },
257   { 1, OP_FAIL,   -1 },
258   { 4, OP_FAIL,   -1 },
259   { 5, OP_PRUNE,  OP_PRUNE_ARG },
260   { 4, OP_SKIP,   OP_SKIP_ARG  },
261   { 4, OP_THEN,   OP_THEN_ARG  }
262 };
263 
264 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
265 
266 
267 /* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in
268 another regex library. */
269 
270 static const pcre_uchar sub_start_of_word[] = {
271   CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
272   CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' };
273 
274 static const pcre_uchar sub_end_of_word[] = {
275   CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
276   CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
277   CHAR_RIGHT_PARENTHESIS, '\0' };
278 
279 
280 /* Tables of names of POSIX character classes and their lengths. The names are
281 now all in a single string, to reduce the number of relocations when a shared
282 library is dynamically loaded. The list of lengths is terminated by a zero
283 length entry. The first three must be alpha, lower, upper, as this is assumed
284 for handling case independence. The indices for graph, print, and punct are
285 needed, so identify them. */
286 
287 static const char posix_names[] =
288   STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
289   STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
290   STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
291   STRING_word0  STRING_xdigit;
292 
293 static const pcre_uint8 posix_name_lengths[] = {
294   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
295 
/* Positions of graph, print and punct in the list of POSIX class names. */
#define PC_GRAPH   8
#define PC_PRINT   9
#define PC_PUNCT  10
299 
300 
301 /* Table of class bit maps for each POSIX class. Each class is formed from a
302 base map, with an optional addition or removal of another map. Then, for some
303 classes, there is some additional tweaking: for [:blank:] the vertical space
304 characters are removed, and for [:alpha:] and [:alnum:] the underscore
305 character is removed. The triples in the table consist of the base map offset,
306 second map offset or -1 if no second map, and a non-negative value for map
307 addition or a negative value for map subtraction (if there are two maps). The
308 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
309 remove vertical space characters, 2 => remove underscore. */
310 
311 static const int posix_class_maps[] = {
312   cbit_word,  cbit_digit, -2,             /* alpha */
313   cbit_lower, -1,          0,             /* lower */
314   cbit_upper, -1,          0,             /* upper */
315   cbit_word,  -1,          2,             /* alnum - word without underscore */
316   cbit_print, cbit_cntrl,  0,             /* ascii */
317   cbit_space, -1,          1,             /* blank - a GNU extension */
318   cbit_cntrl, -1,          0,             /* cntrl */
319   cbit_digit, -1,          0,             /* digit */
320   cbit_graph, -1,          0,             /* graph */
321   cbit_print, -1,          0,             /* print */
322   cbit_punct, -1,          0,             /* punct */
323   cbit_space, -1,          0,             /* space */
324   cbit_word,  -1,          0,             /* word - a Perl extension */
325   cbit_xdigit,-1,          0              /* xdigit */
326 };
327 
328 /* Table of substitutes for \d etc when PCRE_UCP is set. They are replaced by
329 Unicode property escapes. */
330 
331 #ifdef SUPPORT_UCP
332 static const pcre_uchar string_PNd[]  = {
333   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
334   CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
335 static const pcre_uchar string_pNd[]  = {
336   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
337   CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
338 static const pcre_uchar string_PXsp[] = {
339   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
340   CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
341 static const pcre_uchar string_pXsp[] = {
342   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
343   CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
344 static const pcre_uchar string_PXwd[] = {
345   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
346   CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
347 static const pcre_uchar string_pXwd[] = {
348   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
349   CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
350 
351 static const pcre_uchar *substitutes[] = {
352   string_PNd,           /* \D */
353   string_pNd,           /* \d */
354   string_PXsp,          /* \S */   /* Xsp is Perl space, but from 8.34, Perl */
355   string_pXsp,          /* \s */   /* space and POSIX space are the same. */
356   string_PXwd,          /* \W */
357   string_pXwd           /* \w */
358 };
359 
360 /* The POSIX class substitutes must be in the order of the POSIX class names,
361 defined above, and there are both positive and negative cases. NULL means no
362 general substitute of a Unicode property escape (\p or \P). However, for some
363 POSIX classes (e.g. graph, print, punct) a special property code is compiled
364 directly. */
365 
366 static const pcre_uchar string_pL[] =   {
367   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
368   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
369 static const pcre_uchar string_pLl[] =  {
370   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
371   CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
372 static const pcre_uchar string_pLu[] =  {
373   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
374   CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
375 static const pcre_uchar string_pXan[] = {
376   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
377   CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
378 static const pcre_uchar string_h[] =    {
379   CHAR_BACKSLASH, CHAR_h, '\0' };
380 static const pcre_uchar string_pXps[] = {
381   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
382   CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
383 static const pcre_uchar string_PL[] =   {
384   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
385   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
386 static const pcre_uchar string_PLl[] =  {
387   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
388   CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
389 static const pcre_uchar string_PLu[] =  {
390   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
391   CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
392 static const pcre_uchar string_PXan[] = {
393   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
394   CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
395 static const pcre_uchar string_H[] =    {
396   CHAR_BACKSLASH, CHAR_H, '\0' };
397 static const pcre_uchar string_PXps[] = {
398   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
399   CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
400 
401 static const pcre_uchar *posix_substitutes[] = {
402   string_pL,            /* alpha */
403   string_pLl,           /* lower */
404   string_pLu,           /* upper */
405   string_pXan,          /* alnum */
406   NULL,                 /* ascii */
407   string_h,             /* blank */
408   NULL,                 /* cntrl */
409   string_pNd,           /* digit */
410   NULL,                 /* graph */
411   NULL,                 /* print */
412   NULL,                 /* punct */
413   string_pXps,          /* space */   /* Xps is POSIX space, but from 8.34 */
414   string_pXwd,          /* word  */   /* Perl and POSIX space are the same */
415   NULL,                 /* xdigit */
416   /* Negated cases */
417   string_PL,            /* ^alpha */
418   string_PLl,           /* ^lower */
419   string_PLu,           /* ^upper */
420   string_PXan,          /* ^alnum */
421   NULL,                 /* ^ascii */
422   string_H,             /* ^blank */
423   NULL,                 /* ^cntrl */
424   string_PNd,           /* ^digit */
425   NULL,                 /* ^graph */
426   NULL,                 /* ^print */
427   NULL,                 /* ^punct */
428   string_PXps,          /* ^space */  /* Xps is POSIX space, but from 8.34 */
429   string_PXwd,          /* ^word */   /* Perl and POSIX space are the same */
430   NULL                  /* ^xdigit */
431 };
432 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
433 #endif
434 
/* Two-level stringification. STRING() turns its argument into a string literal
exactly as written; XSTRING() lets the argument be macro-expanded first and
then stringifies the result. */

#define STRING(a)  #a
#define XSTRING(s) STRING(s)
437 
438 /* The texts of compile-time error messages. These are "char *" because they
439 are passed to the outside world. Do not ever re-use any error number, because
440 they are documented. Always add a new error instead. Messages marked DEAD below
441 are no longer used. This used to be a table of strings, but in order to reduce
442 the number of relocations needed when a shared library is loaded dynamically,
443 it is now one long string. We cannot use a table of offsets, because the
444 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
445 simply count through to the one we want - this isn't a performance issue
446 because these strings are used only when there is a compilation error.
447 
448 Each substring ends with \0 to insert a null character. This includes the final
449 substring, so that the whole string ends with \0\0, which can be detected when
450 counting through. */
451 
452 static const char error_texts[] =
453   "no error\0"
454   "\\ at end of pattern\0"
455   "\\c at end of pattern\0"
456   "unrecognized character follows \\\0"
457   "numbers out of order in {} quantifier\0"
458   /* 5 */
459   "number too big in {} quantifier\0"
460   "missing terminating ] for character class\0"
461   "invalid escape sequence in character class\0"
462   "range out of order in character class\0"
463   "nothing to repeat\0"
464   /* 10 */
465   "internal error: invalid forward reference offset\0"
466   "internal error: unexpected repeat\0"
467   "unrecognized character after (? or (?-\0"
468   "POSIX named classes are supported only within a class\0"
469   "missing )\0"
470   /* 15 */
471   "reference to non-existent subpattern\0"
472   "erroffset passed as NULL\0"
473   "unknown option bit(s) set\0"
474   "missing ) after comment\0"
475   "parentheses nested too deeply\0"  /** DEAD **/
476   /* 20 */
477   "regular expression is too large\0"
478   "failed to get memory\0"
479   "unmatched parentheses\0"
480   "internal error: code overflow\0"
481   "unrecognized character after (?<\0"
482   /* 25 */
483   "lookbehind assertion is not fixed length\0"
484   "malformed number or name after (?(\0"
485   "conditional group contains more than two branches\0"
486   "assertion expected after (?(\0"
487   "(?R or (?[+-]digits must be followed by )\0"
488   /* 30 */
489   "unknown POSIX class name\0"
490   "POSIX collating elements are not supported\0"
491   "this version of PCRE is compiled without UTF support\0"
492   "spare error\0"  /** DEAD **/
493   "character value in \\x{} or \\o{} is too large\0"
494   /* 35 */
495   "invalid condition (?(0)\0"
496   "\\C not allowed in lookbehind assertion\0"
497   "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
498   "number after (?C is > 255\0"
499   "closing ) for (?C expected\0"
500   /* 40 */
501   "recursive call could loop indefinitely\0"
502   "unrecognized character after (?P\0"
503   "syntax error in subpattern name (missing terminator)\0"
504   "two named subpatterns have the same name\0"
505   "invalid UTF-8 string\0"
506   /* 45 */
507   "support for \\P, \\p, and \\X has not been compiled\0"
508   "malformed \\P or \\p sequence\0"
509   "unknown property name after \\P or \\p\0"
510   "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
511   "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
512   /* 50 */
513   "repeated subpattern is too long\0"    /** DEAD **/
514   "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
515   "internal error: overran compiling workspace\0"
516   "internal error: previously-checked referenced subpattern not found\0"
517   "DEFINE group contains more than one branch\0"
518   /* 55 */
519   "repeating a DEFINE group is not allowed\0"  /** DEAD **/
520   "inconsistent NEWLINE options\0"
521   "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
522   "a numbered reference must not be zero\0"
523   "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
524   /* 60 */
525   "(*VERB) not recognized or malformed\0"
526   "number is too big\0"
527   "subpattern name expected\0"
528   "digit expected after (?+\0"
529   "] is an invalid data character in JavaScript compatibility mode\0"
530   /* 65 */
531   "different names for subpatterns of the same number are not allowed\0"
532   "(*MARK) must have an argument\0"
533   "this version of PCRE is not compiled with Unicode property support\0"
534 #ifndef EBCDIC
535   "\\c must be followed by an ASCII character\0"
536 #else
537   "\\c must be followed by a letter or one of [\\]^_?\0"
538 #endif
539   "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
540   /* 70 */
541   "internal error: unknown opcode in find_fixedlength()\0"
542   "\\N is not supported in a class\0"
543   "too many forward references\0"
544   "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
545   "invalid UTF-16 string\0"
546   /* 75 */
547   "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
548   "character value in \\u.... sequence is too large\0"
549   "invalid UTF-32 string\0"
550   "setting UTF is disabled by the application\0"
551   "non-hex character in \\x{} (closing brace missing?)\0"
552   /* 80 */
553   "non-octal character in \\o{} (closing brace missing?)\0"
554   "missing opening brace after \\o\0"
555   "parentheses are too deeply nested\0"
556   "invalid range in character class\0"
557   "group name must start with a non-digit\0"
558   /* 85 */
559   "parentheses are too deeply nested (stack check)\0"
560   "digits missing in \\x{} or \\o{}\0"
561   ;
562 
563 /* Table to identify digits and hex digits. This is used when compiling
564 patterns. Note that the tables in chartables are dependent on the locale, and
565 may mark arbitrary characters as digits - but the PCRE compiling code expects
566 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
567 a private table here. It costs 256 bytes, but it is a lot faster than doing
568 character value tests (at least in some simple cases I timed), and in some
569 applications one wants PCRE to compile efficiently as well as match
570 efficiently.
571 
572 For convenience, we use the same bit definitions as in chartables:
573 
574   0x04   decimal digit
575   0x08   hexadecimal digit
576 
577 Then we can use ctype_digit and ctype_xdigit in the code. */
578 
/* Decimal digits are recognized with a plain range comparison instead of a
table lookup: this is much faster than a memory read and the resulting code is
simpler (the compiler turns it into a subtraction and unsigned comparison). */

#define IS_DIGIT(x) (CHAR_0 <= (x) && (x) <= CHAR_9)
584 
585 #ifndef EBCDIC
586 
587 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
588 UTF-8 mode. */
589 
590 static const pcre_uint8 digitab[] =
591   {
592   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
593   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
594   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
595   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
596   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
597   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
598   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
599   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
600   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
601   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
602   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
603   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
604   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
605   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
606   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
607   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
608   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
609   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
610   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
611   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
612   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
613   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
614   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
615   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
616   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
617   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
618   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
619   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
620   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
621   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
622   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
623   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
624 
625 #else
626 
627 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
628 
629 static const pcre_uint8 digitab[] =
630   {
631   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
632   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
633   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
634   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
635   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
636   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
637   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
638   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
639   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
640   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
641   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
642   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
643   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
644   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
645   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
646   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
647   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
648   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
649   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
650   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
651   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
652   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
653   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
654   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
655   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
656   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
657   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
658   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
659   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
660   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
661   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
662   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
663 
/* Per-character flag table for EBCDIC builds: 32 rows of 8 entries, one entry
for each code point 0-255 (the trailing comment on each row gives the range it
covers, partly as code numbers and partly as the characters at those
positions). In the rows labelled as letters the entries are 0x12, or 0x1a for
a-f and A-F; in the rows labelled as digits they are 0x1c. The EBCDIC branch
of check_escape() masks an entry with 0x0E and treats a zero result as "not
alphanumeric", so only those letter and digit entries pass that test.
NOTE(review): the individual bit meanings presumably mirror the ctype_* flags
of the main character tables ("chartable partial dup") - confirm against the
table generator before relying on any single bit. */

664 static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
665   0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
666   0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
667   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
668   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
669   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
670   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
671   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
672   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
673   0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
674   0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
675   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
676   0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
677   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
678   0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
679   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
680   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
681   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
682   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
683   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
684   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
685   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
686   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
687   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
688   0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
689   0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
690   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
691   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
692   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
693   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
694   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
695   0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
696   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
697 #endif
698 
699 
700 /* This table is used to check whether auto-possessification is possible
701 between adjacent character-type opcodes. The left-hand (repeated) opcode is
702 used to select the row, and the right-hand opcode is used to select the column.
703 A value of 1 means that auto-possessification is OK. For example, the second
704 value in the first row means that \D+\d can be turned into \D++\d.
705 
706 The Unicode property types (\P and \p) have to be present to fill out the table
707 because of what their opcode values are, but the table values should always be
708 zero because property types are handled separately in the code. The last four
709 columns apply to items that cannot be repeated, so there is no need to have
710 rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
711 *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
712 
/* Table dimensions, both measured from FIRST_AUTOTAB_OP: rows stop at the last
opcode that may appear on the left (as the repeated item), columns stop at the
last opcode that may appear on the right (as the following item). */

713 #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
714 #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
715 
/* autoposstab[left][right] is 1 when a repeat of the left-hand item can be
made possessive because it is followed by the right-hand item, 0 otherwise.
As written there are 17 rows (\D through \X) and 21 columns (\D through $M);
the column header comment and the trailing comment on each row name the item
each index stands for. The \P and \p rows and columns are all zero. */

716 static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
717 /* \D \d \S \s \W \w  . .+ \C \P \p \R \H \h \V \v \X \Z \z  $ $M */
718   { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \D */
719   { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \d */
720   { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \S */
721   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \s */
722   { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \W */
723   { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \w */
724   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .  */
725   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .+ */
726   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \C */
727   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \P */
728   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \p */
729   { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \R */
730   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \H */
731   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \h */
732   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \V */
733   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 },  /* \v */
734   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }   /* \X */
735 };
736 
737 
738 /* This table is used to check whether auto-possessification is possible
739 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
740 left-hand (repeated) opcode is used to select the row, and the right-hand
741 opcode is used to select the column. The values are as follows:
742 
743   0   Always return FALSE (never auto-possessify)
744   1   Character groups are distinct (possessify if both are OP_PROP)
745   2   Check character categories in the same group (general or particular)
746   3   TRUE if the two opcodes are not the same (PROP vs NOTPROP)
747 
748   4   Check left general category vs right particular category
749   5   Check right general category vs left particular category
750 
751   6   Left alphanum vs right general category
752   7   Left space vs right general category
753   8   Left word vs right general category
754 
755   9   Right alphanum vs left general category
756  10   Right space vs left general category
757  11   Right word vs left general category
758 
759  12   Left alphanum vs right particular category
760  13   Left space vs right particular category
761  14   Left word vs right particular category
762 
763  15   Right alphanum vs left particular category
764  16   Right space vs left particular category
765  17   Right word vs left particular category
766 */
767 
/* propposstab[left][right]: action code (0-17, as listed in the comment above
this table) chosen by the property type of the left-hand, repeated item (row)
and of the right-hand item (column). The initializer supplies 11 rows of 11
values, in the order given by the column header and the row comments.
NOTE(review): this assumes PT_TABSIZE is 11 and that the PT_* constants are
numbered in the header order - confirm where they are defined. */

768 static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {
769 /* ANY LAMP GC  PC  SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
770   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_ANY */
771   { 0,  3,  0,  0,  0,    3,    1,      1,   0,    0,   0 },  /* PT_LAMP */
772   { 0,  0,  2,  4,  0,    9,   10,     10,  11,    0,   0 },  /* PT_GC */
773   { 0,  0,  5,  2,  0,   15,   16,     16,  17,    0,   0 },  /* PT_PC */
774   { 0,  0,  0,  0,  2,    0,    0,      0,   0,    0,   0 },  /* PT_SC */
775   { 0,  3,  6, 12,  0,    3,    1,      1,   0,    0,   0 },  /* PT_ALNUM */
776   { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_SPACE */
777   { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_PXSPACE */
778   { 0,  0,  8, 14,  0,    0,    1,      1,   3,    0,   0 },  /* PT_WORD */
779   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_CLIST */
780   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   3 }   /* PT_UCNC */
781 };
782 
783 /* This table is used to check whether auto-possessification is possible
784 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
785 specifies a general category and the other specifies a particular category. The
786 row is selected by the general category and the column by the particular
787 category. The value is 1 if the particular category is not part of the general
788 category. */
789 
/* catposstab[general][particular]: 7 rows, one per general category (C, L, M,
N, P, S, Z), by 30 columns, one per particular category in the order shown in
the header comment. An entry is 0 where the particular category belongs to
the row's general category (for example Cc..Cs in row C) and 1 everywhere
else.
NOTE(review): indexing relies on the ucp_* category constants having the same
numeric order as the rows and columns here - confirm where they are defined. */

790 static const pcre_uint8 catposstab[7][30] = {
791 /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
792   { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* C */
793   { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* L */
794   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* M */
795   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* N */
796   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },  /* P */
797   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 },  /* S */
798   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 }   /* Z */
799 };
800 
801 /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
802 a general or particular category. The properties in each row are those
803 that apply to the character set in question. Duplication means that a little
804 unnecessary work is done when checking, but this keeps things much simpler
805 because they can all use the same code. For more details see the comment where
806 this table is used.
807 
808 Note: SPACE and PXSPACE used to be different because Perl excluded VT from
809 "space", but from Perl 5.18 it's included, so both categories are treated the
810 same here. */
811 
/* posspropstab: three rows (ALNUM; SPACE and PXSPACE sharing one row; WORD) of
four ucp_* values each. Every row is padded to four entries, repeating a value
where fewer are needed (see the per-row notes), so that one piece of code can
process any row.
NOTE(review): the first three entries of each row are general-category
constants and the last is a particular-category constant; how each position
is consumed is defined at the point of use, which is outside this excerpt. */

812 static const pcre_uint8 posspropstab[3][4] = {
813   { ucp_L, ucp_N, ucp_N, ucp_Nl },  /* ALNUM, 3rd and 4th values redundant */
814   { ucp_Z, ucp_Z, ucp_C, ucp_Cc },  /* SPACE and PXSPACE, 2nd value redundant */
815   { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
816 };
817 
818 /* This table is used when converting repeating opcodes into possessified
819 versions as a result of an explicit possessive quantifier such as ++. A zero
820 value means there is no possessified version - in those cases the item in
821 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
822 because all relevant opcodes are less than that. */
823 
/* opcode_possessify: one entry per opcode, in opcode order. A non-zero entry
is the possessive opcode that replaces the repeat opcode at that position; 0
means there is no possessive form. The first 32 entries are all 0. After
that, each repeat family is laid out the same way: only the greedy opcode
(STAR, PLUS, QUERY, UPTO or RANGE) maps to a possessive opcode, while the
minimizing opcode beside it, the EXACT opcode, and the opcodes that are
already possessive all map to 0. The trailing comment on each line names the
opcodes that line covers; the list ends with the entries for RECURSE and
CALLOUT. */

824 static const pcre_uint8 opcode_possessify[] = {
825   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
826   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
827 
828   0,                       /* NOTI */
829   OP_POSSTAR, 0,           /* STAR, MINSTAR */
830   OP_POSPLUS, 0,           /* PLUS, MINPLUS */
831   OP_POSQUERY, 0,          /* QUERY, MINQUERY */
832   OP_POSUPTO, 0,           /* UPTO, MINUPTO */
833   0,                       /* EXACT */
834   0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
835 
836   OP_POSSTARI, 0,          /* STARI, MINSTARI */
837   OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
838   OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
839   OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
840   0,                       /* EXACTI */
841   0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
842 
843   OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
844   OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
845   OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
846   OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
847   0,                       /* NOTEXACT */
848   0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
849 
850   OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
851   OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
852   OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
853   OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
854   0,                       /* NOTEXACTI */
855   0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
856 
857   OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
858   OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
859   OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
860   OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
861   0,                       /* TYPEEXACT */
862   0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
863 
864   OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
865   OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
866   OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
867   OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
868   0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
869 
870   0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
871   0, 0,                    /* REF, REFI */
872   0, 0,                    /* DNREF, DNREFI */
873   0, 0                     /* RECURSE, CALLOUT */
874 };
875 
876 
877 
878 /*************************************************
879 *            Find an error text                  *
880 *************************************************/
881 
882 /* The error texts are now all in one long string, to save on relocations. As
883 some of the text is of unknown length, we can't use a table of offsets.
884 Instead, just count through the strings. This is not a performance issue
885 because it happens only when there has been a compilation error.
886 
887 Argument:   the error number
888 Returns:    pointer to the error string
889 */
890 
891 static const char *
find_error_text(int n)892 find_error_text(int n)
893 {
894 const char *s = error_texts;
895 for (; n > 0; n--)
896   {
897   while (*s++ != CHAR_NULL) {};
898   if (*s == CHAR_NULL) return "Error text not found (please report)";
899   }
900 return s;
901 }
902 
903 
904 
905 /*************************************************
906 *           Expand the workspace                 *
907 *************************************************/
908 
909 /* This function is called during the second compiling phase, if the number of
910 forward references fills the existing workspace, which is originally a block on
911 the stack. A larger block is obtained from malloc() unless the ultimate limit
912 has been reached or the increase will be rather small.
913 
914 Argument: pointer to the compile data block
915 Returns:  0 if all went well, else an error number
916 */
917 
918 static int
expand_workspace(compile_data * cd)919 expand_workspace(compile_data *cd)
920 {
921 pcre_uchar *newspace;
922 int newsize = cd->workspace_size * 2;
923 
924 if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
925 if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
926     newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
927  return ERR72;
928 
929 newspace = (PUBL(malloc))(IN_UCHARS(newsize));
930 if (newspace == NULL) return ERR21;
931 memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
932 cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
933 if (cd->workspace_size > COMPILE_WORK_SIZE)
934   (PUBL(free))((void *)cd->start_workspace);
935 cd->start_workspace = newspace;
936 cd->workspace_size = newsize;
937 return 0;
938 }
939 
940 
941 
942 /*************************************************
943 *            Check for counted repeat            *
944 *************************************************/
945 
946 /* This function is called when a '{' is encountered in a place where it might
947 start a quantifier. It looks ahead to see if it really is a quantifier or not.
948 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
949 where the ddds are digits.
950 
951 Arguments:
952   p         pointer to the first char after '{'
953 
954 Returns:    TRUE or FALSE
955 */
956 
957 static BOOL
is_counted_repeat(const pcre_uchar * p)958 is_counted_repeat(const pcre_uchar *p)
959 {
960 if (!IS_DIGIT(*p)) return FALSE;
961 p++;
962 while (IS_DIGIT(*p)) p++;
963 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
964 
965 if (*p++ != CHAR_COMMA) return FALSE;
966 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
967 
968 if (!IS_DIGIT(*p)) return FALSE;
969 p++;
970 while (IS_DIGIT(*p)) p++;
971 
972 return (*p == CHAR_RIGHT_CURLY_BRACKET);
973 }
974 
975 
976 
977 /*************************************************
978 *            Handle escapes                      *
979 *************************************************/
980 
981 /* This function is called when a \ has been encountered. It either returns a
982 positive value for a simple escape such as \n, or 0 for a data character which
983 will be placed in chptr. A backreference to group n is returned as negative n.
984 When UTF-8 is enabled, a positive value greater than 255 may be returned in
985 chptr. On entry, ptr is pointing at the \. On exit, it is on the final
986 character of the escape sequence.
987 
988 Arguments:
989   ptrptr         points to the pattern position pointer
990   chptr          points to a returned data character
991   errorcodeptr   points to the errorcode variable
992   bracount       number of previous extracting brackets
993   options        the options bits
994   isclass        TRUE if inside a character class
995 
996 Returns:         zero => a data character
997                  positive => a special escape sequence
998                  negative => a back reference
999                  on error, errorcodeptr is set
1000 */
1001 
1002 static int
check_escape(const pcre_uchar ** ptrptr,pcre_uint32 * chptr,int * errorcodeptr,int bracount,int options,BOOL isclass)1003 check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
1004   int bracount, int options, BOOL isclass)
1005 {
1006 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
1007 BOOL utf = (options & PCRE_UTF8) != 0;
1008 const pcre_uchar *ptr = *ptrptr + 1;
1009 pcre_uint32 c;
1010 int escape = 0;
1011 int i;
1012 
1013 GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
1014 ptr--;                            /* Set pointer back to the last byte */
1015 
1016 /* If backslash is at the end of the pattern, it's an error. */
1017 
1018 if (c == CHAR_NULL) *errorcodeptr = ERR1;
1019 
1020 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
1021 in a table. A non-zero result is something that can be returned immediately.
1022 Otherwise further processing may be required. */
1023 
1024 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1025 /* Not alphanumeric */
1026 else if (c < CHAR_0 || c > CHAR_z) {}
1027 else if ((i = escapes[c - CHAR_0]) != 0)
1028   { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1029 
1030 #else           /* EBCDIC coding */
1031 /* Not alphanumeric */
1032 else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
1033 else if ((i = escapes[c - 0x48]) != 0)  { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1034 #endif
1035 
1036 /* Escapes that need further processing, or are illegal. */
1037 
1038 else
1039   {
1040   const pcre_uchar *oldptr;
1041   BOOL braced, negated, overflow;
1042   int s;
1043 
1044   switch (c)
1045     {
1046     /* A number of Perl escapes are not handled by PCRE. We give an explicit
1047     error. */
1048 
1049     case CHAR_l:
1050     case CHAR_L:
1051     *errorcodeptr = ERR37;
1052     break;
1053 
1054     case CHAR_u:
1055     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1056       {
1057       /* In JavaScript, \u must be followed by four hexadecimal numbers.
1058       Otherwise it is a lowercase u letter. */
1059       if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1060         && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
1061         && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
1062         && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
1063         {
1064         c = 0;
1065         for (i = 0; i < 4; ++i)
1066           {
1067           register pcre_uint32 cc = *(++ptr);
1068 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1069           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1070           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1071 #else           /* EBCDIC coding */
1072           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1073           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1074 #endif
1075           }
1076 
1077 #if defined COMPILE_PCRE8
1078         if (c > (utf ? 0x10ffffU : 0xffU))
1079 #elif defined COMPILE_PCRE16
1080         if (c > (utf ? 0x10ffffU : 0xffffU))
1081 #elif defined COMPILE_PCRE32
1082         if (utf && c > 0x10ffffU)
1083 #endif
1084           {
1085           *errorcodeptr = ERR76;
1086           }
1087         else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1088         }
1089       }
1090     else
1091       *errorcodeptr = ERR37;
1092     break;
1093 
1094     case CHAR_U:
1095     /* In JavaScript, \U is an uppercase U letter. */
1096     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
1097     break;
1098 
1099     /* In a character class, \g is just a literal "g". Outside a character
1100     class, \g must be followed by one of a number of specific things:
1101 
1102     (1) A number, either plain or braced. If positive, it is an absolute
1103     backreference. If negative, it is a relative backreference. This is a Perl
1104     5.10 feature.
1105 
1106     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1107     is part of Perl's movement towards a unified syntax for back references. As
1108     this is synonymous with \k{name}, we fudge it up by pretending it really
1109     was \k.
1110 
1111     (3) For Oniguruma compatibility we also support \g followed by a name or a
1112     number either in angle brackets or in single quotes. However, these are
1113     (possibly recursive) subroutine calls, _not_ backreferences. Just return
1114     the ESC_g code (cf \k). */
1115 
1116     case CHAR_g:
1117     if (isclass) break;
1118     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
1119       {
1120       escape = ESC_g;
1121       break;
1122       }
1123 
1124     /* Handle the Perl-compatible cases */
1125 
1126     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1127       {
1128       const pcre_uchar *p;
1129       for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
1130         if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
1131       if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
1132         {
1133         escape = ESC_k;
1134         break;
1135         }
1136       braced = TRUE;
1137       ptr++;
1138       }
1139     else braced = FALSE;
1140 
1141     if (ptr[1] == CHAR_MINUS)
1142       {
1143       negated = TRUE;
1144       ptr++;
1145       }
1146     else negated = FALSE;
1147 
1148     /* The integer range is limited by the machine's int representation. */
1149     s = 0;
1150     overflow = FALSE;
1151     while (IS_DIGIT(ptr[1]))
1152       {
1153       if (s > INT_MAX / 10 - 1) /* Integer overflow */
1154         {
1155         overflow = TRUE;
1156         break;
1157         }
1158       s = s * 10 + (int)(*(++ptr) - CHAR_0);
1159       }
1160     if (overflow) /* Integer overflow */
1161       {
1162       while (IS_DIGIT(ptr[1]))
1163         ptr++;
1164       *errorcodeptr = ERR61;
1165       break;
1166       }
1167 
1168     if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
1169       {
1170       *errorcodeptr = ERR57;
1171       break;
1172       }
1173 
1174     if (s == 0)
1175       {
1176       *errorcodeptr = ERR58;
1177       break;
1178       }
1179 
1180     if (negated)
1181       {
1182       if (s > bracount)
1183         {
1184         *errorcodeptr = ERR15;
1185         break;
1186         }
1187       s = bracount - (s - 1);
1188       }
1189 
1190     escape = -s;
1191     break;
1192 
1193     /* The handling of escape sequences consisting of a string of digits
1194     starting with one that is not zero is not straightforward. Perl has changed
1195     over the years. Nowadays \g{} for backreferences and \o{} for octal are
1196     recommended to avoid the ambiguities in the old syntax.
1197 
1198     Outside a character class, the digits are read as a decimal number. If the
1199     number is less than 8 (used to be 10), or if there are that many previous
1200     extracting left brackets, then it is a back reference. Otherwise, up to
1201     three octal digits are read to form an escaped byte. Thus \123 is likely to
1202     be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1203     the octal value is greater than 377, the least significant 8 bits are
1204     taken. \8 and \9 are treated as the literal characters 8 and 9.
1205 
1206     Inside a character class, \ followed by a digit is always either a literal
1207     8 or 9 or an octal number. */
1208 
1209     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1210     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1211 
1212     if (!isclass)
1213       {
1214       oldptr = ptr;
1215       /* The integer range is limited by the machine's int representation. */
1216       s = (int)(c -CHAR_0);
1217       overflow = FALSE;
1218       while (IS_DIGIT(ptr[1]))
1219         {
1220         if (s > INT_MAX / 10 - 1) /* Integer overflow */
1221           {
1222           overflow = TRUE;
1223           break;
1224           }
1225         s = s * 10 + (int)(*(++ptr) - CHAR_0);
1226         }
1227       if (overflow) /* Integer overflow */
1228         {
1229         while (IS_DIGIT(ptr[1]))
1230           ptr++;
1231         *errorcodeptr = ERR61;
1232         break;
1233         }
1234       if (s < 8 || s <= bracount)  /* Check for back reference */
1235         {
1236         escape = -s;
1237         break;
1238         }
1239       ptr = oldptr;      /* Put the pointer back and fall through */
1240       }
1241 
1242     /* Handle a digit following \ when the number is not a back reference. If
1243     the first digit is 8 or 9, Perl used to generate a binary zero byte and
1244     then treat the digit as a following literal. At least by Perl 5.18 this
1245     changed so as not to insert the binary zero. */
1246 
1247     if ((c = *ptr) >= CHAR_8) break;
1248 
1249     /* Fall through with a digit less than 8 */
1250 
1251     /* \0 always starts an octal number, but we may drop through to here with a
1252     larger first octal digit. The original code used just to take the least
1253     significant 8 bits of octal numbers (I think this is what early Perls used
1254     to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1255     but no more than 3 octal digits. */
1256 
1257     case CHAR_0:
1258     c -= CHAR_0;
1259     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1260         c = c * 8 + *(++ptr) - CHAR_0;
1261 #ifdef COMPILE_PCRE8
1262     if (!utf && c > 0xff) *errorcodeptr = ERR51;
1263 #endif
1264     break;
1265 
1266     /* \o is a relatively new Perl feature, supporting a more general way of
1267     specifying character codes in octal. The only supported form is \o{ddd}. */
1268 
1269     case CHAR_o:
1270     if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
1271     if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR86; else
1272       {
1273       ptr += 2;
1274       c = 0;
1275       overflow = FALSE;
1276       while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
1277         {
1278         register pcre_uint32 cc = *ptr++;
1279         if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1280 #ifdef COMPILE_PCRE32
1281         if (c >= 0x20000000l) { overflow = TRUE; break; }
1282 #endif
1283         c = (c << 3) + cc - CHAR_0 ;
1284 #if defined COMPILE_PCRE8
1285         if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1286 #elif defined COMPILE_PCRE16
1287         if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1288 #elif defined COMPILE_PCRE32
1289         if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1290 #endif
1291         }
1292       if (overflow)
1293         {
1294         while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1295         *errorcodeptr = ERR34;
1296         }
1297       else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1298         {
1299         if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1300         }
1301       else *errorcodeptr = ERR80;
1302       }
1303     break;
1304 
1305     /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
1306     numbers. Otherwise it is a lowercase x letter. */
1307 
1308     case CHAR_x:
1309     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1310       {
1311       if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1312         && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1313         {
1314         c = 0;
1315         for (i = 0; i < 2; ++i)
1316           {
1317           register pcre_uint32 cc = *(++ptr);
1318 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1319           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1320           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1321 #else           /* EBCDIC coding */
1322           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1323           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1324 #endif
1325           }
1326         }
1327       }    /* End JavaScript handling */
1328 
1329     /* Handle \x in Perl's style. \x{ddd} is a character number which can be
1330     greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
1331     digits. If not, { used to be treated as a data character. However, Perl
1332     seems to read hex digits up to the first non-such, and ignore the rest, so
1333     that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1334     now gives an error. */
1335 
1336     else
1337       {
1338       if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1339         {
1340         ptr += 2;
1341         if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1342           {
1343           *errorcodeptr = ERR86;
1344           break;
1345           }
1346         c = 0;
1347         overflow = FALSE;
1348         while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
1349           {
1350           register pcre_uint32 cc = *ptr++;
1351           if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1352 
1353 #ifdef COMPILE_PCRE32
1354           if (c >= 0x10000000l) { overflow = TRUE; break; }
1355 #endif
1356 
1357 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1358           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1359           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1360 #else           /* EBCDIC coding */
1361           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1362           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1363 #endif
1364 
1365 #if defined COMPILE_PCRE8
1366           if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1367 #elif defined COMPILE_PCRE16
1368           if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1369 #elif defined COMPILE_PCRE32
1370           if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1371 #endif
1372           }
1373 
1374         if (overflow)
1375           {
1376           while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
1377           *errorcodeptr = ERR34;
1378           }
1379 
1380         else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1381           {
1382           if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1383           }
1384 
1385         /* If the sequence of hex digits does not end with '}', give an error.
1386         We used just to recognize this construct and fall through to the normal
1387         \x handling, but nowadays Perl gives an error, which seems much more
1388         sensible, so we do too. */
1389 
1390         else *errorcodeptr = ERR79;
1391         }   /* End of \x{} processing */
1392 
1393       /* Read a single-byte hex-defined char (up to two hex digits after \x) */
1394 
1395       else
1396         {
1397         c = 0;
1398         while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1399           {
1400           pcre_uint32 cc;                          /* Some compilers don't like */
1401           cc = *(++ptr);                           /* ++ in initializers */
1402 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1403           if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
1404           c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1405 #else           /* EBCDIC coding */
1406           if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
1407           c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1408 #endif
1409           }
1410         }     /* End of \xdd handling */
1411       }       /* End of Perl-style \x handling */
1412     break;
1413 
1414     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1415     An error is given if the byte following \c is not an ASCII character. This
1416     coding is ASCII-specific, but then the whole concept of \cx is
1417     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1418 
1419     case CHAR_c:
1420     c = *(++ptr);
1421     if (c == CHAR_NULL)
1422       {
1423       *errorcodeptr = ERR2;
1424       break;
1425       }
1426 #ifndef EBCDIC    /* ASCII/UTF-8 coding */
1427     if (c > 127)  /* Excludes all non-ASCII in either mode */
1428       {
1429       *errorcodeptr = ERR68;
1430       break;
1431       }
1432     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1433     c ^= 0x40;
1434 #else             /* EBCDIC coding */
1435     if (c >= CHAR_a && c <= CHAR_z) c += 64;
1436     if (c == CHAR_QUESTION_MARK)
1437       c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
1438     else
1439       {
1440       for (i = 0; i < 32; i++)
1441         {
1442         if (c == ebcdic_escape_c[i]) break;
1443         }
1444       if (i < 32) c = i; else *errorcodeptr = ERR68;
1445       }
1446 #endif
1447     break;
1448 
1449     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1450     other alphanumeric following \ is an error if PCRE_EXTRA was set;
1451     otherwise, for Perl compatibility, it is a literal. This code looks a bit
1452     odd, but there used to be some cases other than the default, and there may
1453     be again in future, so I haven't "optimized" it. */
1454 
1455     default:
1456     if ((options & PCRE_EXTRA) != 0) switch(c)
1457       {
1458       default:
1459       *errorcodeptr = ERR3;
1460       break;
1461       }
1462     break;
1463     }
1464   }
1465 
1466 /* Perl supports \N{name} for character names, as well as plain \N for "not
1467 newline". PCRE does not support \N{name}. However, it does support
1468 quantification such as \N{2,3}. */
1469 
1470 if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1471      !is_counted_repeat(ptr+2))
1472   *errorcodeptr = ERR37;
1473 
1474 /* If PCRE_UCP is set, we change the values for \d etc. */
1475 
1476 if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
1477   escape += (ESC_DU - ESC_D);
1478 
1479 /* Set the pointer to the final character before returning. */
1480 
1481 *ptrptr = ptr;
1482 *chptr = c;
1483 return escape;
1484 }
1485 
1486 
1487 
#ifdef SUPPORT_UCP
/*************************************************
*        Parse a \p or \P property escape        *
*************************************************/

/* Called once \p or \P has been seen, when PCRE is built with support for
Unicode properties. The property name is either the single character that
follows the P/p, or a name enclosed in {} which may begin with ^ to request
negation. The name is then looked up in the table of known property names.

Arguments:
  ptrptr         pattern position pointer; points at the P or p on entry and
                   is updated to the last character examined
  negptr         set TRUE if ^ negation was given, otherwise FALSE (it is
                   not written at all if the pattern ends right after P/p)
  ptypeptr       receives the property type when the name is recognized
  pdataptr       receives the detailed property value when recognized
  errorcodeptr   receives ERR46 if the pattern ends inside the escape or the
                   name does not fit the local buffer before '}' is seen;
                   ERR47 if the name is not in the table

Returns:         TRUE if the name was found in the table, otherwise FALSE
*/

static BOOL
get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
  unsigned int *pdataptr, int *errorcodeptr)
{
pcre_uchar name[32];
const int name_max = (int)(sizeof(name) / sizeof(pcre_uchar)) - 1;
const pcre_uchar *p = *ptrptr + 1;
pcre_uchar ch = *p;
int len, lo, hi;

if (ch == CHAR_NULL) goto BAD_SYNTAX;
*negptr = FALSE;

if (ch != CHAR_LEFT_CURLY_BRACKET)
  {
  /* Short form: exactly one character names the property. */
  name[0] = ch;
  name[1] = 0;
  }
else
  {
  /* Long form: {name} or {^name}. */
  if (p[1] == CHAR_CIRCUMFLEX_ACCENT)
    {
    *negptr = TRUE;
    p++;
    }

  /* Copy characters up to the closing brace. One slot of the buffer is kept
  back for the terminating zero, so running out of room before '}' is seen is
  treated as a malformed escape. */

  for (len = 0; ; len++)
    {
    if (len >= name_max) goto BAD_SYNTAX;
    ch = *(++p);
    if (ch == CHAR_NULL) goto BAD_SYNTAX;
    if (ch == CHAR_RIGHT_CURLY_BRACKET) break;
    name[len] = ch;
    }
  name[len] = 0;
  }

/* The escape is well formed; p is at its final character. This position is
reported whether or not the name turns out to be known. */

*ptrptr = p;

/* Binary chop through the property table. */

lo = 0;
hi = PRIV(utt_size);

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[mid].name_offset);
  if (cmp > 0) lo = mid + 1;
  else if (cmp < 0) hi = mid;
  else
    {
    *ptypeptr = PRIV(utt)[mid].type;
    *pdataptr = PRIV(utt)[mid].value;
    return TRUE;
    }
  }

*errorcodeptr = ERR47;
return FALSE;

BAD_SYNTAX:
*errorcodeptr = ERR46;
*ptrptr = p;
return FALSE;
}
#endif
1582 
1583 
1584 
1585 /*************************************************
1586 *         Read repeat counts                     *
1587 *************************************************/
1588 
1589 /* Read an item of the form {n,m} and return the values. This is called only
1590 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1591 so the syntax is guaranteed to be correct, but we need to check the values.
1592 
1593 Arguments:
1594   p              pointer to first char after '{'
1595   minp           pointer to int for min
1596   maxp           pointer to int for max
1597                  returned as -1 if no max
1598   errorcodeptr   points to error code variable
1599 
1600 Returns:         pointer to '}' on success;
1601                  current ptr on error, with errorcodeptr set non-zero
1602 */
1603 
1604 static const pcre_uchar *
read_repeat_counts(const pcre_uchar * p,int * minp,int * maxp,int * errorcodeptr)1605 read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1606 {
1607 int min = 0;
1608 int max = -1;
1609 
1610 while (IS_DIGIT(*p))
1611   {
1612   min = min * 10 + (int)(*p++ - CHAR_0);
1613   if (min > 65535)
1614     {
1615     *errorcodeptr = ERR5;
1616     return p;
1617     }
1618   }
1619 
1620 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1621   {
1622   if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1623     {
1624     max = 0;
1625     while(IS_DIGIT(*p))
1626       {
1627       max = max * 10 + (int)(*p++ - CHAR_0);
1628       if (max > 65535)
1629         {
1630         *errorcodeptr = ERR5;
1631         return p;
1632         }
1633       }
1634     if (max < min)
1635       {
1636       *errorcodeptr = ERR4;
1637       return p;
1638       }
1639     }
1640   }
1641 
1642 *minp = min;
1643 *maxp = max;
1644 return p;
1645 }
1646 
1647 
1648 
1649 /*************************************************
1650 *      Find first significant op code            *
1651 *************************************************/
1652 
1653 /* This is called by several functions that scan a compiled expression looking
1654 for a fixed first character, or an anchoring op code etc. It skips over things
1655 that do not influence this. For some calls, it makes sense to skip negative
1656 forward and all backward assertions, and also the \b assertion; for others it
1657 does not.
1658 
1659 Arguments:
1660   code         pointer to the start of the group
1661   skipassert   TRUE if certain assertions are to be skipped
1662 
1663 Returns:       pointer to the first significant opcode
1664 */
1665 
1666 static const pcre_uchar*
first_significant_code(const pcre_uchar * code,BOOL skipassert)1667 first_significant_code(const pcre_uchar *code, BOOL skipassert)
1668 {
1669 for (;;)
1670   {
1671   switch ((int)*code)
1672     {
1673     case OP_ASSERT_NOT:
1674     case OP_ASSERTBACK:
1675     case OP_ASSERTBACK_NOT:
1676     if (!skipassert) return code;
1677     do code += GET(code, 1); while (*code == OP_ALT);
1678     code += PRIV(OP_lengths)[*code];
1679     break;
1680 
1681     case OP_WORD_BOUNDARY:
1682     case OP_NOT_WORD_BOUNDARY:
1683     if (!skipassert) return code;
1684     /* Fall through */
1685 
1686     case OP_CALLOUT:
1687     case OP_CREF:
1688     case OP_DNCREF:
1689     case OP_RREF:
1690     case OP_DNRREF:
1691     case OP_DEF:
1692     code += PRIV(OP_lengths)[*code];
1693     break;
1694 
1695     default:
1696     return code;
1697     }
1698   }
1699 /* Control never reaches here */
1700 }
1701 
1702 
1703 
1704 /*************************************************
1705 *        Find the fixed length of a branch       *
1706 *************************************************/
1707 
1708 /* Scan a branch and compute the fixed length of subject that will match it,
1709 if the length is fixed. This is needed for dealing with backward assertions.
1710 In UTF8 mode, the result is in characters rather than bytes. The branch is
1711 temporarily terminated with OP_END when this function is called.
1712 
1713 This function is called when a backward assertion is encountered, so that if it
1714 fails, the error message can point to the correct place in the pattern.
1715 However, we cannot do this when the assertion contains subroutine calls,
1716 because they can be forward references. We solve this by remembering this case
1717 and doing the check at the end; a flag specifies which mode we are running in.
1718 
1719 Arguments:
1720   code     points to the start of the pattern (the bracket)
1721   utf      TRUE in UTF-8 / UTF-16 / UTF-32 mode
1722   atend    TRUE if called when the pattern is complete
1723   cd       the "compile data" structure
1724   recurses    chain of recurse_check to catch mutual recursion
1725 
1726 Returns:   the fixed length,
1727              or -1 if there is no fixed length,
1728              or -2 if \C was encountered (in UTF-8 mode only)
1729              or -3 if an OP_RECURSE item was encountered and atend is FALSE
1730              or -4 if an unknown opcode was encountered (internal error)
1731 */
1732 
1733 static int
find_fixedlength(pcre_uchar * code,BOOL utf,BOOL atend,compile_data * cd,recurse_check * recurses)1734 find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd,
1735   recurse_check *recurses)
1736 {
1737 int length = -1;
1738 recurse_check this_recurse;
1739 register int branchlength = 0;
1740 register pcre_uchar *cc = code + 1 + LINK_SIZE;
1741 
1742 /* Scan along the opcodes for this branch. If we get to the end of the
1743 branch, check the length against that of the other branches. */
1744 
1745 for (;;)
1746   {
1747   int d;
1748   pcre_uchar *ce, *cs;
1749   register pcre_uchar op = *cc;
1750 
1751   switch (op)
1752     {
1753     /* We only need to continue for OP_CBRA (normal capturing bracket) and
1754     OP_BRA (normal non-capturing bracket) because the other variants of these
1755     opcodes are all concerned with unlimited repeated groups, which of course
1756     are not of fixed length. */
1757 
1758     case OP_CBRA:
1759     case OP_BRA:
1760     case OP_ONCE:
1761     case OP_ONCE_NC:
1762     case OP_COND:
1763     d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd,
1764       recurses);
1765     if (d < 0) return d;
1766     branchlength += d;
1767     do cc += GET(cc, 1); while (*cc == OP_ALT);
1768     cc += 1 + LINK_SIZE;
1769     break;
1770 
1771     /* Reached end of a branch; if it's a ket it is the end of a nested call.
1772     If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1773     an ALT. If it is END it's the end of the outer call. All can be handled by
1774     the same code. Note that we must not include the OP_KETRxxx opcodes here,
1775     because they all imply an unlimited repeat. */
1776 
1777     case OP_ALT:
1778     case OP_KET:
1779     case OP_END:
1780     case OP_ACCEPT:
1781     case OP_ASSERT_ACCEPT:
1782     if (length < 0) length = branchlength;
1783       else if (length != branchlength) return -1;
1784     if (*cc != OP_ALT) return length;
1785     cc += 1 + LINK_SIZE;
1786     branchlength = 0;
1787     break;
1788 
1789     /* A true recursion implies not fixed length, but a subroutine call may
1790     be OK. If the subroutine is a forward reference, we can't deal with
1791     it until the end of the pattern, so return -3. */
1792 
1793     case OP_RECURSE:
1794     if (!atend) return -3;
1795     cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1796     do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
1797     if (cc > cs && cc < ce) return -1;                    /* Recursion */
1798     else   /* Check for mutual recursion */
1799       {
1800       recurse_check *r = recurses;
1801       for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
1802       if (r != NULL) return -1;   /* Mutual recursion */
1803       }
1804     this_recurse.prev = recurses;
1805     this_recurse.group = cs;
1806     d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd, &this_recurse);
1807     if (d < 0) return d;
1808     branchlength += d;
1809     cc += 1 + LINK_SIZE;
1810     break;
1811 
1812     /* Skip over assertive subpatterns */
1813 
1814     case OP_ASSERT:
1815     case OP_ASSERT_NOT:
1816     case OP_ASSERTBACK:
1817     case OP_ASSERTBACK_NOT:
1818     do cc += GET(cc, 1); while (*cc == OP_ALT);
1819     cc += 1 + LINK_SIZE;
1820     break;
1821 
1822     /* Skip over things that don't match chars */
1823 
1824     case OP_MARK:
1825     case OP_PRUNE_ARG:
1826     case OP_SKIP_ARG:
1827     case OP_THEN_ARG:
1828     cc += cc[1] + PRIV(OP_lengths)[*cc];
1829     break;
1830 
1831     case OP_CALLOUT:
1832     case OP_CIRC:
1833     case OP_CIRCM:
1834     case OP_CLOSE:
1835     case OP_COMMIT:
1836     case OP_CREF:
1837     case OP_DEF:
1838     case OP_DNCREF:
1839     case OP_DNRREF:
1840     case OP_DOLL:
1841     case OP_DOLLM:
1842     case OP_EOD:
1843     case OP_EODN:
1844     case OP_FAIL:
1845     case OP_NOT_WORD_BOUNDARY:
1846     case OP_PRUNE:
1847     case OP_REVERSE:
1848     case OP_RREF:
1849     case OP_SET_SOM:
1850     case OP_SKIP:
1851     case OP_SOD:
1852     case OP_SOM:
1853     case OP_THEN:
1854     case OP_WORD_BOUNDARY:
1855     cc += PRIV(OP_lengths)[*cc];
1856     break;
1857 
1858     /* Handle literal characters */
1859 
1860     case OP_CHAR:
1861     case OP_CHARI:
1862     case OP_NOT:
1863     case OP_NOTI:
1864     branchlength++;
1865     cc += 2;
1866 #ifdef SUPPORT_UTF
1867     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1868 #endif
1869     break;
1870 
1871     /* Handle exact repetitions. The count is already in characters, but we
1872     need to skip over a multibyte character in UTF8 mode.  */
1873 
1874     case OP_EXACT:
1875     case OP_EXACTI:
1876     case OP_NOTEXACT:
1877     case OP_NOTEXACTI:
1878     branchlength += (int)GET2(cc,1);
1879     cc += 2 + IMM2_SIZE;
1880 #ifdef SUPPORT_UTF
1881     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1882 #endif
1883     break;
1884 
1885     case OP_TYPEEXACT:
1886     branchlength += GET2(cc,1);
1887     if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
1888       cc += 2;
1889     cc += 1 + IMM2_SIZE + 1;
1890     break;
1891 
1892     /* Handle single-char matchers */
1893 
1894     case OP_PROP:
1895     case OP_NOTPROP:
1896     cc += 2;
1897     /* Fall through */
1898 
1899     case OP_HSPACE:
1900     case OP_VSPACE:
1901     case OP_NOT_HSPACE:
1902     case OP_NOT_VSPACE:
1903     case OP_NOT_DIGIT:
1904     case OP_DIGIT:
1905     case OP_NOT_WHITESPACE:
1906     case OP_WHITESPACE:
1907     case OP_NOT_WORDCHAR:
1908     case OP_WORDCHAR:
1909     case OP_ANY:
1910     case OP_ALLANY:
1911     branchlength++;
1912     cc++;
1913     break;
1914 
1915     /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1916     otherwise \C is coded as OP_ALLANY. */
1917 
1918     case OP_ANYBYTE:
1919     return -2;
1920 
1921     /* Check a class for variable quantification */
1922 
1923     case OP_CLASS:
1924     case OP_NCLASS:
1925 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1926     case OP_XCLASS:
1927     /* The original code caused an unsigned overflow in 64 bit systems,
1928     so now we use a conditional statement. */
1929     if (op == OP_XCLASS)
1930       cc += GET(cc, 1);
1931     else
1932       cc += PRIV(OP_lengths)[OP_CLASS];
1933 #else
1934     cc += PRIV(OP_lengths)[OP_CLASS];
1935 #endif
1936 
1937     switch (*cc)
1938       {
1939       case OP_CRSTAR:
1940       case OP_CRMINSTAR:
1941       case OP_CRPLUS:
1942       case OP_CRMINPLUS:
1943       case OP_CRQUERY:
1944       case OP_CRMINQUERY:
1945       case OP_CRPOSSTAR:
1946       case OP_CRPOSPLUS:
1947       case OP_CRPOSQUERY:
1948       return -1;
1949 
1950       case OP_CRRANGE:
1951       case OP_CRMINRANGE:
1952       case OP_CRPOSRANGE:
1953       if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1954       branchlength += (int)GET2(cc,1);
1955       cc += 1 + 2 * IMM2_SIZE;
1956       break;
1957 
1958       default:
1959       branchlength++;
1960       }
1961     break;
1962 
1963     /* Anything else is variable length */
1964 
1965     case OP_ANYNL:
1966     case OP_BRAMINZERO:
1967     case OP_BRAPOS:
1968     case OP_BRAPOSZERO:
1969     case OP_BRAZERO:
1970     case OP_CBRAPOS:
1971     case OP_EXTUNI:
1972     case OP_KETRMAX:
1973     case OP_KETRMIN:
1974     case OP_KETRPOS:
1975     case OP_MINPLUS:
1976     case OP_MINPLUSI:
1977     case OP_MINQUERY:
1978     case OP_MINQUERYI:
1979     case OP_MINSTAR:
1980     case OP_MINSTARI:
1981     case OP_MINUPTO:
1982     case OP_MINUPTOI:
1983     case OP_NOTMINPLUS:
1984     case OP_NOTMINPLUSI:
1985     case OP_NOTMINQUERY:
1986     case OP_NOTMINQUERYI:
1987     case OP_NOTMINSTAR:
1988     case OP_NOTMINSTARI:
1989     case OP_NOTMINUPTO:
1990     case OP_NOTMINUPTOI:
1991     case OP_NOTPLUS:
1992     case OP_NOTPLUSI:
1993     case OP_NOTPOSPLUS:
1994     case OP_NOTPOSPLUSI:
1995     case OP_NOTPOSQUERY:
1996     case OP_NOTPOSQUERYI:
1997     case OP_NOTPOSSTAR:
1998     case OP_NOTPOSSTARI:
1999     case OP_NOTPOSUPTO:
2000     case OP_NOTPOSUPTOI:
2001     case OP_NOTQUERY:
2002     case OP_NOTQUERYI:
2003     case OP_NOTSTAR:
2004     case OP_NOTSTARI:
2005     case OP_NOTUPTO:
2006     case OP_NOTUPTOI:
2007     case OP_PLUS:
2008     case OP_PLUSI:
2009     case OP_POSPLUS:
2010     case OP_POSPLUSI:
2011     case OP_POSQUERY:
2012     case OP_POSQUERYI:
2013     case OP_POSSTAR:
2014     case OP_POSSTARI:
2015     case OP_POSUPTO:
2016     case OP_POSUPTOI:
2017     case OP_QUERY:
2018     case OP_QUERYI:
2019     case OP_REF:
2020     case OP_REFI:
2021     case OP_DNREF:
2022     case OP_DNREFI:
2023     case OP_SBRA:
2024     case OP_SBRAPOS:
2025     case OP_SCBRA:
2026     case OP_SCBRAPOS:
2027     case OP_SCOND:
2028     case OP_SKIPZERO:
2029     case OP_STAR:
2030     case OP_STARI:
2031     case OP_TYPEMINPLUS:
2032     case OP_TYPEMINQUERY:
2033     case OP_TYPEMINSTAR:
2034     case OP_TYPEMINUPTO:
2035     case OP_TYPEPLUS:
2036     case OP_TYPEPOSPLUS:
2037     case OP_TYPEPOSQUERY:
2038     case OP_TYPEPOSSTAR:
2039     case OP_TYPEPOSUPTO:
2040     case OP_TYPEQUERY:
2041     case OP_TYPESTAR:
2042     case OP_TYPEUPTO:
2043     case OP_UPTO:
2044     case OP_UPTOI:
2045     return -1;
2046 
2047     /* Catch unrecognized opcodes so that when new ones are added they
2048     are not forgotten, as has happened in the past. */
2049 
2050     default:
2051     return -4;
2052     }
2053   }
2054 /* Control never gets here */
2055 }
2056 
2057 
2058 
2059 /*************************************************
2060 *    Scan compiled regex for specific bracket    *
2061 *************************************************/
2062 
2063 /* This little function scans through a compiled pattern until it finds a
2064 capturing bracket with the given number, or, if the number is negative, an
2065 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
2066 so that it can be called from pcre_study() when finding the minimum matching
2067 length.
2068 
2069 Arguments:
2070   code        points to start of expression
2071   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
2072   number      the required bracket number or negative to find a lookbehind
2073 
2074 Returns:      pointer to the opcode for the bracket, or NULL if not found
2075 */
2076 
2077 const pcre_uchar *
PRIV(find_bracket)2078 PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2079 {
2080 for (;;)
2081   {
2082   register pcre_uchar c = *code;
2083 
2084   if (c == OP_END) return NULL;
2085 
2086   /* XCLASS is used for classes that cannot be represented just by a bit
2087   map. This includes negated single high-valued characters. The length in
2088   the table is zero; the actual length is stored in the compiled code. */
2089 
2090   if (c == OP_XCLASS) code += GET(code, 1);
2091 
2092   /* Handle recursion */
2093 
2094   else if (c == OP_REVERSE)
2095     {
2096     if (number < 0) return (pcre_uchar *)code;
2097     code += PRIV(OP_lengths)[c];
2098     }
2099 
2100   /* Handle capturing bracket */
2101 
2102   else if (c == OP_CBRA || c == OP_SCBRA ||
2103            c == OP_CBRAPOS || c == OP_SCBRAPOS)
2104     {
2105     int n = (int)GET2(code, 1+LINK_SIZE);
2106     if (n == number) return (pcre_uchar *)code;
2107     code += PRIV(OP_lengths)[c];
2108     }
2109 
2110   /* Otherwise, we can get the item's length from the table, except that for
2111   repeated character types, we have to test for \p and \P, which have an extra
2112   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2113   must add in its length. */
2114 
2115   else
2116     {
2117     switch(c)
2118       {
2119       case OP_TYPESTAR:
2120       case OP_TYPEMINSTAR:
2121       case OP_TYPEPLUS:
2122       case OP_TYPEMINPLUS:
2123       case OP_TYPEQUERY:
2124       case OP_TYPEMINQUERY:
2125       case OP_TYPEPOSSTAR:
2126       case OP_TYPEPOSPLUS:
2127       case OP_TYPEPOSQUERY:
2128       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2129       break;
2130 
2131       case OP_TYPEUPTO:
2132       case OP_TYPEMINUPTO:
2133       case OP_TYPEEXACT:
2134       case OP_TYPEPOSUPTO:
2135       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2136         code += 2;
2137       break;
2138 
2139       case OP_MARK:
2140       case OP_PRUNE_ARG:
2141       case OP_SKIP_ARG:
2142       case OP_THEN_ARG:
2143       code += code[1];
2144       break;
2145       }
2146 
2147     /* Add in the fixed length from the table */
2148 
2149     code += PRIV(OP_lengths)[c];
2150 
2151   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2152   a multi-byte character. The length in the table is a minimum, so we have to
2153   arrange to skip the extra bytes. */
2154 
2155 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2156     if (utf) switch(c)
2157       {
2158       case OP_CHAR:
2159       case OP_CHARI:
2160       case OP_NOT:
2161       case OP_NOTI:
2162       case OP_EXACT:
2163       case OP_EXACTI:
2164       case OP_NOTEXACT:
2165       case OP_NOTEXACTI:
2166       case OP_UPTO:
2167       case OP_UPTOI:
2168       case OP_NOTUPTO:
2169       case OP_NOTUPTOI:
2170       case OP_MINUPTO:
2171       case OP_MINUPTOI:
2172       case OP_NOTMINUPTO:
2173       case OP_NOTMINUPTOI:
2174       case OP_POSUPTO:
2175       case OP_POSUPTOI:
2176       case OP_NOTPOSUPTO:
2177       case OP_NOTPOSUPTOI:
2178       case OP_STAR:
2179       case OP_STARI:
2180       case OP_NOTSTAR:
2181       case OP_NOTSTARI:
2182       case OP_MINSTAR:
2183       case OP_MINSTARI:
2184       case OP_NOTMINSTAR:
2185       case OP_NOTMINSTARI:
2186       case OP_POSSTAR:
2187       case OP_POSSTARI:
2188       case OP_NOTPOSSTAR:
2189       case OP_NOTPOSSTARI:
2190       case OP_PLUS:
2191       case OP_PLUSI:
2192       case OP_NOTPLUS:
2193       case OP_NOTPLUSI:
2194       case OP_MINPLUS:
2195       case OP_MINPLUSI:
2196       case OP_NOTMINPLUS:
2197       case OP_NOTMINPLUSI:
2198       case OP_POSPLUS:
2199       case OP_POSPLUSI:
2200       case OP_NOTPOSPLUS:
2201       case OP_NOTPOSPLUSI:
2202       case OP_QUERY:
2203       case OP_QUERYI:
2204       case OP_NOTQUERY:
2205       case OP_NOTQUERYI:
2206       case OP_MINQUERY:
2207       case OP_MINQUERYI:
2208       case OP_NOTMINQUERY:
2209       case OP_NOTMINQUERYI:
2210       case OP_POSQUERY:
2211       case OP_POSQUERYI:
2212       case OP_NOTPOSQUERY:
2213       case OP_NOTPOSQUERYI:
2214       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2215       break;
2216       }
2217 #else
2218     (void)(utf);  /* Keep compiler happy by referencing function argument */
2219 #endif
2220     }
2221   }
2222 }
2223 
2224 
2225 
2226 /*************************************************
2227 *   Scan compiled regex for recursion reference  *
2228 *************************************************/
2229 
2230 /* This little function scans through a compiled pattern until it finds an
2231 instance of OP_RECURSE.
2232 
2233 Arguments:
2234   code        points to start of expression
2235   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
2236 
2237 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
2238 */
2239 
2240 static const pcre_uchar *
find_recurse(const pcre_uchar * code,BOOL utf)2241 find_recurse(const pcre_uchar *code, BOOL utf)
2242 {
2243 for (;;)
2244   {
2245   register pcre_uchar c = *code;
2246   if (c == OP_END) return NULL;
2247   if (c == OP_RECURSE) return code;
2248 
2249   /* XCLASS is used for classes that cannot be represented just by a bit
2250   map. This includes negated single high-valued characters. The length in
2251   the table is zero; the actual length is stored in the compiled code. */
2252 
2253   if (c == OP_XCLASS) code += GET(code, 1);
2254 
2255   /* Otherwise, we can get the item's length from the table, except that for
2256   repeated character types, we have to test for \p and \P, which have an extra
2257   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2258   must add in its length. */
2259 
2260   else
2261     {
2262     switch(c)
2263       {
2264       case OP_TYPESTAR:
2265       case OP_TYPEMINSTAR:
2266       case OP_TYPEPLUS:
2267       case OP_TYPEMINPLUS:
2268       case OP_TYPEQUERY:
2269       case OP_TYPEMINQUERY:
2270       case OP_TYPEPOSSTAR:
2271       case OP_TYPEPOSPLUS:
2272       case OP_TYPEPOSQUERY:
2273       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2274       break;
2275 
2276       case OP_TYPEPOSUPTO:
2277       case OP_TYPEUPTO:
2278       case OP_TYPEMINUPTO:
2279       case OP_TYPEEXACT:
2280       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2281         code += 2;
2282       break;
2283 
2284       case OP_MARK:
2285       case OP_PRUNE_ARG:
2286       case OP_SKIP_ARG:
2287       case OP_THEN_ARG:
2288       code += code[1];
2289       break;
2290       }
2291 
2292     /* Add in the fixed length from the table */
2293 
2294     code += PRIV(OP_lengths)[c];
2295 
2296     /* In UTF-8 mode, opcodes that are followed by a character may be followed
2297     by a multi-byte character. The length in the table is a minimum, so we have
2298     to arrange to skip the extra bytes. */
2299 
2300 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2301     if (utf) switch(c)
2302       {
2303       case OP_CHAR:
2304       case OP_CHARI:
2305       case OP_NOT:
2306       case OP_NOTI:
2307       case OP_EXACT:
2308       case OP_EXACTI:
2309       case OP_NOTEXACT:
2310       case OP_NOTEXACTI:
2311       case OP_UPTO:
2312       case OP_UPTOI:
2313       case OP_NOTUPTO:
2314       case OP_NOTUPTOI:
2315       case OP_MINUPTO:
2316       case OP_MINUPTOI:
2317       case OP_NOTMINUPTO:
2318       case OP_NOTMINUPTOI:
2319       case OP_POSUPTO:
2320       case OP_POSUPTOI:
2321       case OP_NOTPOSUPTO:
2322       case OP_NOTPOSUPTOI:
2323       case OP_STAR:
2324       case OP_STARI:
2325       case OP_NOTSTAR:
2326       case OP_NOTSTARI:
2327       case OP_MINSTAR:
2328       case OP_MINSTARI:
2329       case OP_NOTMINSTAR:
2330       case OP_NOTMINSTARI:
2331       case OP_POSSTAR:
2332       case OP_POSSTARI:
2333       case OP_NOTPOSSTAR:
2334       case OP_NOTPOSSTARI:
2335       case OP_PLUS:
2336       case OP_PLUSI:
2337       case OP_NOTPLUS:
2338       case OP_NOTPLUSI:
2339       case OP_MINPLUS:
2340       case OP_MINPLUSI:
2341       case OP_NOTMINPLUS:
2342       case OP_NOTMINPLUSI:
2343       case OP_POSPLUS:
2344       case OP_POSPLUSI:
2345       case OP_NOTPOSPLUS:
2346       case OP_NOTPOSPLUSI:
2347       case OP_QUERY:
2348       case OP_QUERYI:
2349       case OP_NOTQUERY:
2350       case OP_NOTQUERYI:
2351       case OP_MINQUERY:
2352       case OP_MINQUERYI:
2353       case OP_NOTMINQUERY:
2354       case OP_NOTMINQUERYI:
2355       case OP_POSQUERY:
2356       case OP_POSQUERYI:
2357       case OP_NOTPOSQUERY:
2358       case OP_NOTPOSQUERYI:
2359       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2360       break;
2361       }
2362 #else
2363     (void)(utf);  /* Keep compiler happy by referencing function argument */
2364 #endif
2365     }
2366   }
2367 }
2368 
2369 
2370 
2371 /*************************************************
2372 *    Scan compiled branch for non-emptiness      *
2373 *************************************************/
2374 
2375 /* This function scans through a branch of a compiled pattern to see whether it
2376 can match the empty string or not. It is called from could_be_empty()
2377 below and from compile_branch() when checking for an unlimited repeat of a
2378 group that can match nothing. Note that first_significant_code() skips over
2379 backward and negative forward assertions when its final argument is TRUE. If we
2380 hit an unclosed bracket, we return "empty" - this means we've struck an inner
2381 bracket whose current branch will already have been scanned.
2382 
2383 Arguments:
2384   code        points to start of search
2385   endcode     points to where to stop
2386   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2387   cd          contains pointers to tables etc.
2388   recurses    chain of recurse_check to catch mutual recursion
2389 
2390 Returns:      TRUE if what is matched could be empty
2391 */
2392 
2393 static BOOL
could_be_empty_branch(const pcre_uchar * code,const pcre_uchar * endcode,BOOL utf,compile_data * cd,recurse_check * recurses)2394 could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2395   BOOL utf, compile_data *cd, recurse_check *recurses)
2396 {
2397 register pcre_uchar c;
2398 recurse_check this_recurse;
2399 
2400 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2401      code < endcode;
2402      code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2403   {
2404   const pcre_uchar *ccode;
2405 
2406   c = *code;
2407 
2408   /* Skip over forward assertions; the other assertions are skipped by
2409   first_significant_code() with a TRUE final argument. */
2410 
2411   if (c == OP_ASSERT)
2412     {
2413     do code += GET(code, 1); while (*code == OP_ALT);
2414     c = *code;
2415     continue;
2416     }
2417 
2418   /* For a recursion/subroutine call, if its end has been reached, which
2419   implies a backward reference subroutine call, we can scan it. If it's a
2420   forward reference subroutine call, we can't. To detect forward reference
2421   we have to scan up the list that is kept in the workspace. This function is
2422   called only when doing the real compile, not during the pre-compile that
2423   measures the size of the compiled pattern. */
2424 
2425   if (c == OP_RECURSE)
2426     {
2427     const pcre_uchar *scode = cd->start_code + GET(code, 1);
2428     const pcre_uchar *endgroup = scode;
2429     BOOL empty_branch;
2430 
2431     /* Test for forward reference or uncompleted reference. This is disabled
2432     when called to scan a completed pattern by setting cd->start_workspace to
2433     NULL. */
2434 
2435     if (cd->start_workspace != NULL)
2436       {
2437       const pcre_uchar *tcode;
2438       for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
2439         if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2440       if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2441       }
2442 
2443     /* If the reference is to a completed group, we need to detect whether this
2444     is a recursive call, as otherwise there will be an infinite loop. If it is
2445     a recursion, just skip over it. Simple recursions are easily detected. For
2446     mutual recursions we keep a chain on the stack. */
2447 
2448     do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
2449     if (code >= scode && code <= endgroup) continue;  /* Simple recursion */
2450     else
2451       {
2452       recurse_check *r = recurses;
2453       for (r = recurses; r != NULL; r = r->prev)
2454         if (r->group == scode) break;
2455       if (r != NULL) continue;   /* Mutual recursion */
2456       }
2457 
2458     /* Completed reference; scan the referenced group, remembering it on the
2459     stack chain to detect mutual recursions. */
2460 
2461     empty_branch = FALSE;
2462     this_recurse.prev = recurses;
2463     this_recurse.group = scode;
2464 
2465     do
2466       {
2467       if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
2468         {
2469         empty_branch = TRUE;
2470         break;
2471         }
2472       scode += GET(scode, 1);
2473       }
2474     while (*scode == OP_ALT);
2475 
2476     if (!empty_branch) return FALSE;  /* All branches are non-empty */
2477     continue;
2478     }
2479 
2480   /* Groups with zero repeats can of course be empty; skip them. */
2481 
2482   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2483       c == OP_BRAPOSZERO)
2484     {
2485     code += PRIV(OP_lengths)[c];
2486     do code += GET(code, 1); while (*code == OP_ALT);
2487     c = *code;
2488     continue;
2489     }
2490 
2491   /* A nested group that is already marked as "could be empty" can just be
2492   skipped. */
2493 
2494   if (c == OP_SBRA  || c == OP_SBRAPOS ||
2495       c == OP_SCBRA || c == OP_SCBRAPOS)
2496     {
2497     do code += GET(code, 1); while (*code == OP_ALT);
2498     c = *code;
2499     continue;
2500     }
2501 
2502   /* For other groups, scan the branches. */
2503 
2504   if (c == OP_BRA  || c == OP_BRAPOS ||
2505       c == OP_CBRA || c == OP_CBRAPOS ||
2506       c == OP_ONCE || c == OP_ONCE_NC ||
2507       c == OP_COND || c == OP_SCOND)
2508     {
2509     BOOL empty_branch;
2510     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
2511 
2512     /* If a conditional group has only one branch, there is a second, implied,
2513     empty branch, so just skip over the conditional, because it could be empty.
2514     Otherwise, scan the individual branches of the group. */
2515 
2516     if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2517       code += GET(code, 1);
2518     else
2519       {
2520       empty_branch = FALSE;
2521       do
2522         {
2523         if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd,
2524           recurses)) empty_branch = TRUE;
2525         code += GET(code, 1);
2526         }
2527       while (*code == OP_ALT);
2528       if (!empty_branch) return FALSE;   /* All branches are non-empty */
2529       }
2530 
2531     c = *code;
2532     continue;
2533     }
2534 
2535   /* Handle the other opcodes */
2536 
2537   switch (c)
2538     {
2539     /* Check for quantifiers after a class. XCLASS is used for classes that
2540     cannot be represented just by a bit map. This includes negated single
2541     high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2542     actual length is stored in the compiled code, so we must update "code"
2543     here. */
2544 
2545 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2546     case OP_XCLASS:
2547     ccode = code += GET(code, 1);
2548     goto CHECK_CLASS_REPEAT;
2549 #endif
2550 
2551     case OP_CLASS:
2552     case OP_NCLASS:
2553     ccode = code + PRIV(OP_lengths)[OP_CLASS];
2554 
2555 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2556     CHECK_CLASS_REPEAT:
2557 #endif
2558 
2559     switch (*ccode)
2560       {
2561       case OP_CRSTAR:            /* These could be empty; continue */
2562       case OP_CRMINSTAR:
2563       case OP_CRQUERY:
2564       case OP_CRMINQUERY:
2565       case OP_CRPOSSTAR:
2566       case OP_CRPOSQUERY:
2567       break;
2568 
2569       default:                   /* Non-repeat => class must match */
2570       case OP_CRPLUS:            /* These repeats aren't empty */
2571       case OP_CRMINPLUS:
2572       case OP_CRPOSPLUS:
2573       return FALSE;
2574 
2575       case OP_CRRANGE:
2576       case OP_CRMINRANGE:
2577       case OP_CRPOSRANGE:
2578       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
2579       break;
2580       }
2581     break;
2582 
2583     /* Opcodes that must match a character */
2584 
2585     case OP_ANY:
2586     case OP_ALLANY:
2587     case OP_ANYBYTE:
2588 
2589     case OP_PROP:
2590     case OP_NOTPROP:
2591     case OP_ANYNL:
2592 
2593     case OP_NOT_HSPACE:
2594     case OP_HSPACE:
2595     case OP_NOT_VSPACE:
2596     case OP_VSPACE:
2597     case OP_EXTUNI:
2598 
2599     case OP_NOT_DIGIT:
2600     case OP_DIGIT:
2601     case OP_NOT_WHITESPACE:
2602     case OP_WHITESPACE:
2603     case OP_NOT_WORDCHAR:
2604     case OP_WORDCHAR:
2605 
2606     case OP_CHAR:
2607     case OP_CHARI:
2608     case OP_NOT:
2609     case OP_NOTI:
2610 
2611     case OP_PLUS:
2612     case OP_PLUSI:
2613     case OP_MINPLUS:
2614     case OP_MINPLUSI:
2615 
2616     case OP_NOTPLUS:
2617     case OP_NOTPLUSI:
2618     case OP_NOTMINPLUS:
2619     case OP_NOTMINPLUSI:
2620 
2621     case OP_POSPLUS:
2622     case OP_POSPLUSI:
2623     case OP_NOTPOSPLUS:
2624     case OP_NOTPOSPLUSI:
2625 
2626     case OP_EXACT:
2627     case OP_EXACTI:
2628     case OP_NOTEXACT:
2629     case OP_NOTEXACTI:
2630 
2631     case OP_TYPEPLUS:
2632     case OP_TYPEMINPLUS:
2633     case OP_TYPEPOSPLUS:
2634     case OP_TYPEEXACT:
2635 
2636     return FALSE;
2637 
2638     /* These are going to continue, as they may be empty, but we have to
2639     fudge the length for the \p and \P cases. */
2640 
2641     case OP_TYPESTAR:
2642     case OP_TYPEMINSTAR:
2643     case OP_TYPEPOSSTAR:
2644     case OP_TYPEQUERY:
2645     case OP_TYPEMINQUERY:
2646     case OP_TYPEPOSQUERY:
2647     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2648     break;
2649 
2650     /* Same for these */
2651 
2652     case OP_TYPEUPTO:
2653     case OP_TYPEMINUPTO:
2654     case OP_TYPEPOSUPTO:
2655     if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2656       code += 2;
2657     break;
2658 
2659     /* End of branch */
2660 
2661     case OP_KET:
2662     case OP_KETRMAX:
2663     case OP_KETRMIN:
2664     case OP_KETRPOS:
2665     case OP_ALT:
2666     return TRUE;
2667 
2668     /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2669     MINUPTO, and POSUPTO and their caseless and negative versions may be
2670     followed by a multibyte character. */
2671 
2672 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2673     case OP_STAR:
2674     case OP_STARI:
2675     case OP_NOTSTAR:
2676     case OP_NOTSTARI:
2677 
2678     case OP_MINSTAR:
2679     case OP_MINSTARI:
2680     case OP_NOTMINSTAR:
2681     case OP_NOTMINSTARI:
2682 
2683     case OP_POSSTAR:
2684     case OP_POSSTARI:
2685     case OP_NOTPOSSTAR:
2686     case OP_NOTPOSSTARI:
2687 
2688     case OP_QUERY:
2689     case OP_QUERYI:
2690     case OP_NOTQUERY:
2691     case OP_NOTQUERYI:
2692 
2693     case OP_MINQUERY:
2694     case OP_MINQUERYI:
2695     case OP_NOTMINQUERY:
2696     case OP_NOTMINQUERYI:
2697 
2698     case OP_POSQUERY:
2699     case OP_POSQUERYI:
2700     case OP_NOTPOSQUERY:
2701     case OP_NOTPOSQUERYI:
2702 
2703     if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2704     break;
2705 
2706     case OP_UPTO:
2707     case OP_UPTOI:
2708     case OP_NOTUPTO:
2709     case OP_NOTUPTOI:
2710 
2711     case OP_MINUPTO:
2712     case OP_MINUPTOI:
2713     case OP_NOTMINUPTO:
2714     case OP_NOTMINUPTOI:
2715 
2716     case OP_POSUPTO:
2717     case OP_POSUPTOI:
2718     case OP_NOTPOSUPTO:
2719     case OP_NOTPOSUPTOI:
2720 
2721     if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2722     break;
2723 #endif
2724 
2725     /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2726     string. */
2727 
2728     case OP_MARK:
2729     case OP_PRUNE_ARG:
2730     case OP_SKIP_ARG:
2731     case OP_THEN_ARG:
2732     code += code[1];
2733     break;
2734 
2735     /* None of the remaining opcodes are required to match a character. */
2736 
2737     default:
2738     break;
2739     }
2740   }
2741 
2742 return TRUE;
2743 }
2744 
2745 
2746 
2747 /*************************************************
2748 *    Scan compiled regex for non-emptiness       *
2749 *************************************************/
2750 
2751 /* This function is called to check for left recursive calls. We want to check
2752 the current branch of the current pattern to see if it could match the empty
2753 string. If it could, we must look outwards for branches at other levels,
2754 stopping when we pass beyond the bracket which is the subject of the recursion.
2755 This function is called only during the real compile, not during the
2756 pre-compile.
2757 
2758 Arguments:
2759   code        points to start of the recursion
2760   endcode     points to where to stop (current RECURSE item)
2761   bcptr       points to the chain of current (unclosed) branch starts
2762   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2763   cd          pointers to tables etc
2764 
2765 Returns:      TRUE if what is matched could be empty
2766 */
2767 
2768 static BOOL
could_be_empty(const pcre_uchar * code,const pcre_uchar * endcode,branch_chain * bcptr,BOOL utf,compile_data * cd)2769 could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2770   branch_chain *bcptr, BOOL utf, compile_data *cd)
2771 {
2772 while (bcptr != NULL && bcptr->current_branch >= code)
2773   {
2774   if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd, NULL))
2775     return FALSE;
2776   bcptr = bcptr->outer;
2777   }
2778 return TRUE;
2779 }
2780 
2781 
2782 
2783 /*************************************************
2784 *        Base opcode of repeated opcodes         *
2785 *************************************************/
2786 
2787 /* Returns the base opcode for repeated single character type opcodes. If the
2788 opcode is not a repeated character type, it returns with the original value.
2789 
2790 Arguments:  c opcode
2791 Returns:    base opcode for the type
2792 */
2793 
2794 static pcre_uchar
get_repeat_base(pcre_uchar c)2795 get_repeat_base(pcre_uchar c)
2796 {
2797 return (c > OP_TYPEPOSUPTO)? c :
2798        (c >= OP_TYPESTAR)?   OP_TYPESTAR :
2799        (c >= OP_NOTSTARI)?   OP_NOTSTARI :
2800        (c >= OP_NOTSTAR)?    OP_NOTSTAR :
2801        (c >= OP_STARI)?      OP_STARI :
2802                              OP_STAR;
2803 }
2804 
2805 
2806 
2807 #ifdef SUPPORT_UCP
2808 /*************************************************
2809 *        Check a character and a property        *
2810 *************************************************/
2811 
2812 /* This function is called by check_auto_possessive() when a property item
2813 is adjacent to a fixed character.
2814 
2815 Arguments:
2816   c            the character
2817   ptype        the property type
2818   pdata        the data for the type
2819   negated      TRUE if it's a negated property (\P or \p{^)
2820 
2821 Returns:       TRUE if auto-possessifying is OK
2822 */
2823 
2824 static BOOL
check_char_prop(pcre_uint32 c,unsigned int ptype,unsigned int pdata,BOOL negated)2825 check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
2826   BOOL negated)
2827 {
2828 const pcre_uint32 *p;
2829 const ucd_record *prop = GET_UCD(c);
2830 
2831 switch(ptype)
2832   {
2833   case PT_LAMP:
2834   return (prop->chartype == ucp_Lu ||
2835           prop->chartype == ucp_Ll ||
2836           prop->chartype == ucp_Lt) == negated;
2837 
2838   case PT_GC:
2839   return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2840 
2841   case PT_PC:
2842   return (pdata == prop->chartype) == negated;
2843 
2844   case PT_SC:
2845   return (pdata == prop->script) == negated;
2846 
2847   /* These are specials */
2848 
2849   case PT_ALNUM:
2850   return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2851           PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2852 
2853   /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
2854   means that Perl space and POSIX space are now identical. PCRE was changed
2855   at release 8.34. */
2856 
2857   case PT_SPACE:    /* Perl space */
2858   case PT_PXSPACE:  /* POSIX space */
2859   switch(c)
2860     {
2861     HSPACE_CASES:
2862     VSPACE_CASES:
2863     return negated;
2864 
2865     default:
2866     return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
2867     }
2868   break;  /* Control never reaches here */
2869 
2870   case PT_WORD:
2871   return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2872           PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2873           c == CHAR_UNDERSCORE) == negated;
2874 
2875   case PT_CLIST:
2876   p = PRIV(ucd_caseless_sets) + prop->caseset;
2877   for (;;)
2878     {
2879     if (c < *p) return !negated;
2880     if (c == *p++) return negated;
2881     }
2882   break;  /* Control never reaches here */
2883   }
2884 
2885 return FALSE;
2886 }
2887 #endif  /* SUPPORT_UCP */
2888 
2889 
2890 
2891 /*************************************************
2892 *        Fill the character property list        *
2893 *************************************************/
2894 
2895 /* Checks whether the code points to an opcode that can take part in auto-
2896 possessification, and if so, fills a list with its properties.
2897 
2898 Arguments:
2899   code        points to start of expression
2900   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2901   fcc         points to case-flipping table
2902   list        points to output list
2903               list[0] will be filled with the opcode
2904               list[1] will be non-zero if this opcode
2905                 can match an empty character string
2906               list[2..7] depends on the opcode
2907 
2908 Returns:      points to the start of the next opcode if *code is accepted
2909               NULL if *code is not accepted
2910 */
2911 
2912 static const pcre_uchar *
get_chr_property_list(const pcre_uchar * code,BOOL utf,const pcre_uint8 * fcc,pcre_uint32 * list)2913 get_chr_property_list(const pcre_uchar *code, BOOL utf,
2914   const pcre_uint8 *fcc, pcre_uint32 *list)
2915 {
2916 pcre_uchar c = *code;
2917 pcre_uchar base;
2918 const pcre_uchar *end;
2919 pcre_uint32 chr;
2920 
2921 #ifdef SUPPORT_UCP
2922 pcre_uint32 *clist_dest;
2923 const pcre_uint32 *clist_src;
2924 #else
2925 utf = utf;  /* Suppress "unused parameter" compiler warning */
2926 #endif
2927 
2928 list[0] = c;
2929 list[1] = FALSE;
2930 code++;
2931 
2932 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
2933   {
2934   base = get_repeat_base(c);
2935   c -= (base - OP_STAR);
2936 
2937   if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
2938     code += IMM2_SIZE;
2939 
2940   list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
2941 
2942   switch(base)
2943     {
2944     case OP_STAR:
2945     list[0] = OP_CHAR;
2946     break;
2947 
2948     case OP_STARI:
2949     list[0] = OP_CHARI;
2950     break;
2951 
2952     case OP_NOTSTAR:
2953     list[0] = OP_NOT;
2954     break;
2955 
2956     case OP_NOTSTARI:
2957     list[0] = OP_NOTI;
2958     break;
2959 
2960     case OP_TYPESTAR:
2961     list[0] = *code;
2962     code++;
2963     break;
2964     }
2965   c = list[0];
2966   }
2967 
2968 switch(c)
2969   {
2970   case OP_NOT_DIGIT:
2971   case OP_DIGIT:
2972   case OP_NOT_WHITESPACE:
2973   case OP_WHITESPACE:
2974   case OP_NOT_WORDCHAR:
2975   case OP_WORDCHAR:
2976   case OP_ANY:
2977   case OP_ALLANY:
2978   case OP_ANYNL:
2979   case OP_NOT_HSPACE:
2980   case OP_HSPACE:
2981   case OP_NOT_VSPACE:
2982   case OP_VSPACE:
2983   case OP_EXTUNI:
2984   case OP_EODN:
2985   case OP_EOD:
2986   case OP_DOLL:
2987   case OP_DOLLM:
2988   return code;
2989 
2990   case OP_CHAR:
2991   case OP_NOT:
2992   GETCHARINCTEST(chr, code);
2993   list[2] = chr;
2994   list[3] = NOTACHAR;
2995   return code;
2996 
2997   case OP_CHARI:
2998   case OP_NOTI:
2999   list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
3000   GETCHARINCTEST(chr, code);
3001   list[2] = chr;
3002 
3003 #ifdef SUPPORT_UCP
3004   if (chr < 128 || (chr < 256 && !utf))
3005     list[3] = fcc[chr];
3006   else
3007     list[3] = UCD_OTHERCASE(chr);
3008 #elif defined SUPPORT_UTF || !defined COMPILE_PCRE8
3009   list[3] = (chr < 256) ? fcc[chr] : chr;
3010 #else
3011   list[3] = fcc[chr];
3012 #endif
3013 
3014   /* The othercase might be the same value. */
3015 
3016   if (chr == list[3])
3017     list[3] = NOTACHAR;
3018   else
3019     list[4] = NOTACHAR;
3020   return code;
3021 
3022 #ifdef SUPPORT_UCP
3023   case OP_PROP:
3024   case OP_NOTPROP:
3025   if (code[0] != PT_CLIST)
3026     {
3027     list[2] = code[0];
3028     list[3] = code[1];
3029     return code + 2;
3030     }
3031 
3032   /* Convert only if we have enough space. */
3033 
3034   clist_src = PRIV(ucd_caseless_sets) + code[1];
3035   clist_dest = list + 2;
3036   code += 2;
3037 
3038   do {
3039      if (clist_dest >= list + 8)
3040        {
3041        /* Early return if there is not enough space. This should never
3042        happen, since all clists are shorter than 5 character now. */
3043        list[2] = code[0];
3044        list[3] = code[1];
3045        return code;
3046        }
3047      *clist_dest++ = *clist_src;
3048      }
3049   while(*clist_src++ != NOTACHAR);
3050 
3051   /* All characters are stored. The terminating NOTACHAR
3052   is copied form the clist itself. */
3053 
3054   list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
3055   return code;
3056 #endif
3057 
3058   case OP_NCLASS:
3059   case OP_CLASS:
3060 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3061   case OP_XCLASS:
3062   if (c == OP_XCLASS)
3063     end = code + GET(code, 0) - 1;
3064   else
3065 #endif
3066     end = code + 32 / sizeof(pcre_uchar);
3067 
3068   switch(*end)
3069     {
3070     case OP_CRSTAR:
3071     case OP_CRMINSTAR:
3072     case OP_CRQUERY:
3073     case OP_CRMINQUERY:
3074     case OP_CRPOSSTAR:
3075     case OP_CRPOSQUERY:
3076     list[1] = TRUE;
3077     end++;
3078     break;
3079 
3080     case OP_CRPLUS:
3081     case OP_CRMINPLUS:
3082     case OP_CRPOSPLUS:
3083     end++;
3084     break;
3085 
3086     case OP_CRRANGE:
3087     case OP_CRMINRANGE:
3088     case OP_CRPOSRANGE:
3089     list[1] = (GET2(end, 1) == 0);
3090     end += 1 + 2 * IMM2_SIZE;
3091     break;
3092     }
3093   list[2] = (pcre_uint32)(end - code);
3094   return end;
3095   }
3096 return NULL;    /* Opcode not accepted */
3097 }
3098 
3099 
3100 
3101 /*************************************************
3102 *    Scan further character sets for match       *
3103 *************************************************/
3104 
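/* Checks whether the base and the current opcode have a common character, in
which case the base cannot be possessified.

Arguments:
  code        points to the byte code
  utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
  cd          static compile data
  base_list   the data list of the base opcode
  base_end    points to the end of the base opcode; when the base is a class,
                its data is found at base_end - base_list[2]
  rec_limit   points to a count of the calls that are still allowed; it is
                decremented on entry, and FALSE is returned once it is zero

Returns:      TRUE if the auto-possessification is possible
*/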
3116 
3117 static BOOL
compare_opcodes(const pcre_uchar * code,BOOL utf,const compile_data * cd,const pcre_uint32 * base_list,const pcre_uchar * base_end,int * rec_limit)3118 compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
3119   const pcre_uint32 *base_list, const pcre_uchar *base_end, int *rec_limit)
3120 {
3121 pcre_uchar c;
3122 pcre_uint32 list[8];
3123 const pcre_uint32 *chr_ptr;
3124 const pcre_uint32 *ochr_ptr;
3125 const pcre_uint32 *list_ptr;
3126 const pcre_uchar *next_code;
3127 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3128 const pcre_uchar *xclass_flags;
3129 #endif
3130 const pcre_uint8 *class_bitset;
3131 const pcre_uint8 *set1, *set2, *set_end;
3132 pcre_uint32 chr;
3133 BOOL accepted, invert_bits;
3134 BOOL entered_a_group = FALSE;
3135 
3136 if (*rec_limit == 0) return FALSE;
3137 --(*rec_limit);
3138 
3139 /* Note: the base_list[1] contains whether the current opcode has greedy
3140 (represented by a non-zero value) quantifier. This is a different from
3141 other character type lists, which stores here that the character iterator
3142 matches to an empty string (also represented by a non-zero value). */
3143 
3144 for(;;)
3145   {
3146   /* All operations move the code pointer forward.
3147   Therefore infinite recursions are not possible. */
3148 
3149   c = *code;
3150 
3151   /* Skip over callouts */
3152 
3153   if (c == OP_CALLOUT)
3154     {
3155     code += PRIV(OP_lengths)[c];
3156     continue;
3157     }
3158 
3159   if (c == OP_ALT)
3160     {
3161     do code += GET(code, 1); while (*code == OP_ALT);
3162     c = *code;
3163     }
3164 
3165   switch(c)
3166     {
3167     case OP_END:
3168     case OP_KETRPOS:
3169     /* TRUE only in greedy case. The non-greedy case could be replaced by
3170     an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
3171     uses more memory, which we cannot get at this stage.) */
3172 
3173     return base_list[1] != 0;
3174 
3175     case OP_KET:
3176     /* If the bracket is capturing, and referenced by an OP_RECURSE, or
3177     it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
3178     cannot be converted to a possessive form. */
3179 
3180     if (base_list[1] == 0) return FALSE;
3181 
3182     switch(*(code - GET(code, 1)))
3183       {
3184       case OP_ASSERT:
3185       case OP_ASSERT_NOT:
3186       case OP_ASSERTBACK:
3187       case OP_ASSERTBACK_NOT:
3188       case OP_ONCE:
3189       case OP_ONCE_NC:
3190       /* Atomic sub-patterns and assertions can always auto-possessify their
3191       last iterator. However, if the group was entered as a result of checking
3192       a previous iterator, this is not possible. */
3193 
3194       return !entered_a_group;
3195       }
3196 
3197     code += PRIV(OP_lengths)[c];
3198     continue;
3199 
3200     case OP_ONCE:
3201     case OP_ONCE_NC:
3202     case OP_BRA:
3203     case OP_CBRA:
3204     next_code = code + GET(code, 1);
3205     code += PRIV(OP_lengths)[c];
3206 
3207     while (*next_code == OP_ALT)
3208       {
3209       if (!compare_opcodes(code, utf, cd, base_list, base_end, rec_limit))
3210         return FALSE;
3211       code = next_code + 1 + LINK_SIZE;
3212       next_code += GET(next_code, 1);
3213       }
3214 
3215     entered_a_group = TRUE;
3216     continue;
3217 
3218     case OP_BRAZERO:
3219     case OP_BRAMINZERO:
3220 
3221     next_code = code + 1;
3222     if (*next_code != OP_BRA && *next_code != OP_CBRA
3223         && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
3224 
3225     do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3226 
3227     /* The bracket content will be checked by the
3228     OP_BRA/OP_CBRA case above. */
3229     next_code += 1 + LINK_SIZE;
3230     if (!compare_opcodes(next_code, utf, cd, base_list, base_end, rec_limit))
3231       return FALSE;
3232 
3233     code += PRIV(OP_lengths)[c];
3234     continue;
3235 
3236     default:
3237     break;
3238     }
3239 
3240   /* Check for a supported opcode, and load its properties. */
3241 
3242   code = get_chr_property_list(code, utf, cd->fcc, list);
3243   if (code == NULL) return FALSE;    /* Unsupported */
3244 
3245   /* If either opcode is a small character list, set pointers for comparing
3246   characters from that list with another list, or with a property. */
3247 
3248   if (base_list[0] == OP_CHAR)
3249     {
3250     chr_ptr = base_list + 2;
3251     list_ptr = list;
3252     }
3253   else if (list[0] == OP_CHAR)
3254     {
3255     chr_ptr = list + 2;
3256     list_ptr = base_list;
3257     }
3258 
3259   /* Character bitsets can also be compared to certain opcodes. */
3260 
3261   else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
3262 #ifdef COMPILE_PCRE8
3263       /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
3264       || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
3265 #endif
3266       )
3267     {
3268 #ifdef COMPILE_PCRE8
3269     if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
3270 #else
3271     if (base_list[0] == OP_CLASS)
3272 #endif
3273       {
3274       set1 = (pcre_uint8 *)(base_end - base_list[2]);
3275       list_ptr = list;
3276       }
3277     else
3278       {
3279       set1 = (pcre_uint8 *)(code - list[2]);
3280       list_ptr = base_list;
3281       }
3282 
3283     invert_bits = FALSE;
3284     switch(list_ptr[0])
3285       {
3286       case OP_CLASS:
3287       case OP_NCLASS:
3288       set2 = (pcre_uint8 *)
3289         ((list_ptr == list ? code : base_end) - list_ptr[2]);
3290       break;
3291 
3292 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3293       case OP_XCLASS:
3294       xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE;
3295       if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;
3296       if ((*xclass_flags & XCL_MAP) == 0)
3297         {
3298         /* No bits are set for characters < 256. */
3299         if (list[1] == 0) return TRUE;
3300         /* Might be an empty repeat. */
3301         continue;
3302         }
3303       set2 = (pcre_uint8 *)(xclass_flags + 1);
3304       break;
3305 #endif
3306 
3307       case OP_NOT_DIGIT:
3308       invert_bits = TRUE;
3309       /* Fall through */
3310       case OP_DIGIT:
3311       set2 = (pcre_uint8 *)(cd->cbits + cbit_digit);
3312       break;
3313 
3314       case OP_NOT_WHITESPACE:
3315       invert_bits = TRUE;
3316       /* Fall through */
3317       case OP_WHITESPACE:
3318       set2 = (pcre_uint8 *)(cd->cbits + cbit_space);
3319       break;
3320 
3321       case OP_NOT_WORDCHAR:
3322       invert_bits = TRUE;
3323       /* Fall through */
3324       case OP_WORDCHAR:
3325       set2 = (pcre_uint8 *)(cd->cbits + cbit_word);
3326       break;
3327 
3328       default:
3329       return FALSE;
3330       }
3331 
3332     /* Because the sets are unaligned, we need
3333     to perform byte comparison here. */
3334     set_end = set1 + 32;
3335     if (invert_bits)
3336       {
3337       do
3338         {
3339         if ((*set1++ & ~(*set2++)) != 0) return FALSE;
3340         }
3341       while (set1 < set_end);
3342       }
3343     else
3344       {
3345       do
3346         {
3347         if ((*set1++ & *set2++) != 0) return FALSE;
3348         }
3349       while (set1 < set_end);
3350       }
3351 
3352     if (list[1] == 0) return TRUE;
3353     /* Might be an empty repeat. */
3354     continue;
3355     }
3356 
3357   /* Some property combinations also acceptable. Unicode property opcodes are
3358   processed specially; the rest can be handled with a lookup table. */
3359 
3360   else
3361     {
3362     pcre_uint32 leftop, rightop;
3363 
3364     leftop = base_list[0];
3365     rightop = list[0];
3366 
3367 #ifdef SUPPORT_UCP
3368     accepted = FALSE; /* Always set in non-unicode case. */
3369     if (leftop == OP_PROP || leftop == OP_NOTPROP)
3370       {
3371       if (rightop == OP_EOD)
3372         accepted = TRUE;
3373       else if (rightop == OP_PROP || rightop == OP_NOTPROP)
3374         {
3375         int n;
3376         const pcre_uint8 *p;
3377         BOOL same = leftop == rightop;
3378         BOOL lisprop = leftop == OP_PROP;
3379         BOOL risprop = rightop == OP_PROP;
3380         BOOL bothprop = lisprop && risprop;
3381 
3382         /* There's a table that specifies how each combination is to be
3383         processed:
3384           0   Always return FALSE (never auto-possessify)
3385           1   Character groups are distinct (possessify if both are OP_PROP)
3386           2   Check character categories in the same group (general or particular)
3387           3   Return TRUE if the two opcodes are not the same
3388           ... see comments below
3389         */
3390 
3391         n = propposstab[base_list[2]][list[2]];
3392         switch(n)
3393           {
3394           case 0: break;
3395           case 1: accepted = bothprop; break;
3396           case 2: accepted = (base_list[3] == list[3]) != same; break;
3397           case 3: accepted = !same; break;
3398 
3399           case 4:  /* Left general category, right particular category */
3400           accepted = risprop && catposstab[base_list[3]][list[3]] == same;
3401           break;
3402 
3403           case 5:  /* Right general category, left particular category */
3404           accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
3405           break;
3406 
3407           /* This code is logically tricky. Think hard before fiddling with it.
3408           The posspropstab table has four entries per row. Each row relates to
3409           one of PCRE's special properties such as ALNUM or SPACE or WORD.
3410           Only WORD actually needs all four entries, but using repeats for the
3411           others means they can all use the same code below.
3412 
3413           The first two entries in each row are Unicode general categories, and
3414           apply always, because all the characters they include are part of the
3415           PCRE character set. The third and fourth entries are a general and a
3416           particular category, respectively, that include one or more relevant
3417           characters. One or the other is used, depending on whether the check
3418           is for a general or a particular category. However, in both cases the
3419           category contains more characters than the specials that are defined
3420           for the property being tested against. Therefore, it cannot be used
3421           in a NOTPROP case.
3422 
3423           Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3424           Underscore is covered by ucp_P or ucp_Po. */
3425 
3426           case 6:  /* Left alphanum vs right general category */
3427           case 7:  /* Left space vs right general category */
3428           case 8:  /* Left word vs right general category */
3429           p = posspropstab[n-6];
3430           accepted = risprop && lisprop ==
3431             (list[3] != p[0] &&
3432              list[3] != p[1] &&
3433             (list[3] != p[2] || !lisprop));
3434           break;
3435 
3436           case 9:   /* Right alphanum vs left general category */
3437           case 10:  /* Right space vs left general category */
3438           case 11:  /* Right word vs left general category */
3439           p = posspropstab[n-9];
3440           accepted = lisprop && risprop ==
3441             (base_list[3] != p[0] &&
3442              base_list[3] != p[1] &&
3443             (base_list[3] != p[2] || !risprop));
3444           break;
3445 
3446           case 12:  /* Left alphanum vs right particular category */
3447           case 13:  /* Left space vs right particular category */
3448           case 14:  /* Left word vs right particular category */
3449           p = posspropstab[n-12];
3450           accepted = risprop && lisprop ==
3451             (catposstab[p[0]][list[3]] &&
3452              catposstab[p[1]][list[3]] &&
3453             (list[3] != p[3] || !lisprop));
3454           break;
3455 
3456           case 15:  /* Right alphanum vs left particular category */
3457           case 16:  /* Right space vs left particular category */
3458           case 17:  /* Right word vs left particular category */
3459           p = posspropstab[n-15];
3460           accepted = lisprop && risprop ==
3461             (catposstab[p[0]][base_list[3]] &&
3462              catposstab[p[1]][base_list[3]] &&
3463             (base_list[3] != p[3] || !risprop));
3464           break;
3465           }
3466         }
3467       }
3468 
3469     else
3470 #endif  /* SUPPORT_UCP */
3471 
3472     accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3473            rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3474            autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3475 
3476     if (!accepted) return FALSE;
3477 
3478     if (list[1] == 0) return TRUE;
3479     /* Might be an empty repeat. */
3480     continue;
3481     }
3482 
3483   /* Control reaches here only if one of the items is a small character list.
3484   All characters are checked against the other side. */
3485 
3486   do
3487     {
3488     chr = *chr_ptr;
3489 
3490     switch(list_ptr[0])
3491       {
3492       case OP_CHAR:
3493       ochr_ptr = list_ptr + 2;
3494       do
3495         {
3496         if (chr == *ochr_ptr) return FALSE;
3497         ochr_ptr++;
3498         }
3499       while(*ochr_ptr != NOTACHAR);
3500       break;
3501 
3502       case OP_NOT:
3503       ochr_ptr = list_ptr + 2;
3504       do
3505         {
3506         if (chr == *ochr_ptr)
3507           break;
3508         ochr_ptr++;
3509         }
3510       while(*ochr_ptr != NOTACHAR);
3511       if (*ochr_ptr == NOTACHAR) return FALSE;   /* Not found */
3512       break;
3513 
3514       /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not*
3515       set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3516 
3517       case OP_DIGIT:
3518       if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE;
3519       break;
3520 
3521       case OP_NOT_DIGIT:
3522       if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE;
3523       break;
3524 
3525       case OP_WHITESPACE:
3526       if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE;
3527       break;
3528 
3529       case OP_NOT_WHITESPACE:
3530       if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE;
3531       break;
3532 
3533       case OP_WORDCHAR:
3534       if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE;
3535       break;
3536 
3537       case OP_NOT_WORDCHAR:
3538       if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE;
3539       break;
3540 
3541       case OP_HSPACE:
3542       switch(chr)
3543         {
3544         HSPACE_CASES: return FALSE;
3545         default: break;
3546         }
3547       break;
3548 
3549       case OP_NOT_HSPACE:
3550       switch(chr)
3551         {
3552         HSPACE_CASES: break;
3553         default: return FALSE;
3554         }
3555       break;
3556 
3557       case OP_ANYNL:
3558       case OP_VSPACE:
3559       switch(chr)
3560         {
3561         VSPACE_CASES: return FALSE;
3562         default: break;
3563         }
3564       break;
3565 
3566       case OP_NOT_VSPACE:
3567       switch(chr)
3568         {
3569         VSPACE_CASES: break;
3570         default: return FALSE;
3571         }
3572       break;
3573 
3574       case OP_DOLL:
3575       case OP_EODN:
3576       switch (chr)
3577         {
3578         case CHAR_CR:
3579         case CHAR_LF:
3580         case CHAR_VT:
3581         case CHAR_FF:
3582         case CHAR_NEL:
3583 #ifndef EBCDIC
3584         case 0x2028:
3585         case 0x2029:
3586 #endif  /* Not EBCDIC */
3587         return FALSE;
3588         }
3589       break;
3590 
3591       case OP_EOD:    /* Can always possessify before \z */
3592       break;
3593 
3594 #ifdef SUPPORT_UCP
3595       case OP_PROP:
3596       case OP_NOTPROP:
3597       if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3598             list_ptr[0] == OP_NOTPROP))
3599         return FALSE;
3600       break;
3601 #endif
3602 
3603       case OP_NCLASS:
3604       if (chr > 255) return FALSE;
3605       /* Fall through */
3606 
3607       case OP_CLASS:
3608       if (chr > 255) break;
3609       class_bitset = (pcre_uint8 *)
3610         ((list_ptr == list ? code : base_end) - list_ptr[2]);
3611       if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE;
3612       break;
3613 
3614 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3615       case OP_XCLASS:
3616       if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
3617           list_ptr[2] + LINK_SIZE, utf)) return FALSE;
3618       break;
3619 #endif
3620 
3621       default:
3622       return FALSE;
3623       }
3624 
3625     chr_ptr++;
3626     }
3627   while(*chr_ptr != NOTACHAR);
3628 
3629   /* At least one character must be matched from this opcode. */
3630 
3631   if (list[1] == 0) return TRUE;
3632   }
3633 
/* Control never reaches here. There used to be a fail-safe return FALSE; here,
3635 but some compilers complain about an unreachable statement. */
3636 
3637 }
3638 
3639 
3640 
3641 /*************************************************
3642 *    Scan compiled regex for auto-possession     *
3643 *************************************************/
3644 
3645 /* Replaces single character iterations with their possessive alternatives
3646 if appropriate. This function modifies the compiled opcode!
3647 
3648 Arguments:
3649   code        points to start of the byte code
3650   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
3651   cd          static compile data
3652 
3653 Returns:      nothing
3654 */
3655 
3656 static void
auto_possessify(pcre_uchar * code,BOOL utf,const compile_data * cd)3657 auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)
3658 {
3659 register pcre_uchar c;
3660 const pcre_uchar *end;
3661 pcre_uchar *repeat_opcode;
3662 pcre_uint32 list[8];
3663 int rec_limit;
3664 
3665 for (;;)
3666   {
3667   c = *code;
3668 
3669   /* When a pattern with bad UTF-8 encoding is compiled with NO_UTF_CHECK,
3670   it may compile without complaining, but may get into a loop here if the code
3671   pointer points to a bad value. This is, of course a documentated possibility,
3672   when NO_UTF_CHECK is set, so it isn't a bug, but we can detect this case and
3673   just give up on this optimization. */
3674 
3675   if (c >= OP_TABLE_LENGTH) return;
3676 
3677   if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
3678     {
3679     c -= get_repeat_base(c) - OP_STAR;
3680     end = (c <= OP_MINUPTO) ?
3681       get_chr_property_list(code, utf, cd->fcc, list) : NULL;
3682     list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
3683 
3684     rec_limit = 1000;
3685     if (end != NULL && compare_opcodes(end, utf, cd, list, end, &rec_limit))
3686       {
3687       switch(c)
3688         {
3689         case OP_STAR:
3690         *code += OP_POSSTAR - OP_STAR;
3691         break;
3692 
3693         case OP_MINSTAR:
3694         *code += OP_POSSTAR - OP_MINSTAR;
3695         break;
3696 
3697         case OP_PLUS:
3698         *code += OP_POSPLUS - OP_PLUS;
3699         break;
3700 
3701         case OP_MINPLUS:
3702         *code += OP_POSPLUS - OP_MINPLUS;
3703         break;
3704 
3705         case OP_QUERY:
3706         *code += OP_POSQUERY - OP_QUERY;
3707         break;
3708 
3709         case OP_MINQUERY:
3710         *code += OP_POSQUERY - OP_MINQUERY;
3711         break;
3712 
3713         case OP_UPTO:
3714         *code += OP_POSUPTO - OP_UPTO;
3715         break;
3716 
3717         case OP_MINUPTO:
3718         *code += OP_POSUPTO - OP_MINUPTO;
3719         break;
3720         }
3721       }
3722     c = *code;
3723     }
3724   else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
3725     {
3726 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3727     if (c == OP_XCLASS)
3728       repeat_opcode = code + GET(code, 1);
3729     else
3730 #endif
3731       repeat_opcode = code + 1 + (32 / sizeof(pcre_uchar));
3732 
3733     c = *repeat_opcode;
3734     if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
3735       {
3736       /* end must not be NULL. */
3737       end = get_chr_property_list(code, utf, cd->fcc, list);
3738 
3739       list[1] = (c & 1) == 0;
3740 
3741       rec_limit = 1000;
3742       if (compare_opcodes(end, utf, cd, list, end, &rec_limit))
3743         {
3744         switch (c)
3745           {
3746           case OP_CRSTAR:
3747           case OP_CRMINSTAR:
3748           *repeat_opcode = OP_CRPOSSTAR;
3749           break;
3750 
3751           case OP_CRPLUS:
3752           case OP_CRMINPLUS:
3753           *repeat_opcode = OP_CRPOSPLUS;
3754           break;
3755 
3756           case OP_CRQUERY:
3757           case OP_CRMINQUERY:
3758           *repeat_opcode = OP_CRPOSQUERY;
3759           break;
3760 
3761           case OP_CRRANGE:
3762           case OP_CRMINRANGE:
3763           *repeat_opcode = OP_CRPOSRANGE;
3764           break;
3765           }
3766         }
3767       }
3768     c = *code;
3769     }
3770 
3771   switch(c)
3772     {
3773     case OP_END:
3774     return;
3775 
3776     case OP_TYPESTAR:
3777     case OP_TYPEMINSTAR:
3778     case OP_TYPEPLUS:
3779     case OP_TYPEMINPLUS:
3780     case OP_TYPEQUERY:
3781     case OP_TYPEMINQUERY:
3782     case OP_TYPEPOSSTAR:
3783     case OP_TYPEPOSPLUS:
3784     case OP_TYPEPOSQUERY:
3785     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
3786     break;
3787 
3788     case OP_TYPEUPTO:
3789     case OP_TYPEMINUPTO:
3790     case OP_TYPEEXACT:
3791     case OP_TYPEPOSUPTO:
3792     if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
3793       code += 2;
3794     break;
3795 
3796 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3797     case OP_XCLASS:
3798     code += GET(code, 1);
3799     break;
3800 #endif
3801 
3802     case OP_MARK:
3803     case OP_PRUNE_ARG:
3804     case OP_SKIP_ARG:
3805     case OP_THEN_ARG:
3806     code += code[1];
3807     break;
3808     }
3809 
3810   /* Add in the fixed length from the table */
3811 
3812   code += PRIV(OP_lengths)[c];
3813 
3814   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
3815   a multi-byte character. The length in the table is a minimum, so we have to
3816   arrange to skip the extra bytes. */
3817 
3818 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
3819   if (utf) switch(c)
3820     {
3821     case OP_CHAR:
3822     case OP_CHARI:
3823     case OP_NOT:
3824     case OP_NOTI:
3825     case OP_STAR:
3826     case OP_MINSTAR:
3827     case OP_PLUS:
3828     case OP_MINPLUS:
3829     case OP_QUERY:
3830     case OP_MINQUERY:
3831     case OP_UPTO:
3832     case OP_MINUPTO:
3833     case OP_EXACT:
3834     case OP_POSSTAR:
3835     case OP_POSPLUS:
3836     case OP_POSQUERY:
3837     case OP_POSUPTO:
3838     case OP_STARI:
3839     case OP_MINSTARI:
3840     case OP_PLUSI:
3841     case OP_MINPLUSI:
3842     case OP_QUERYI:
3843     case OP_MINQUERYI:
3844     case OP_UPTOI:
3845     case OP_MINUPTOI:
3846     case OP_EXACTI:
3847     case OP_POSSTARI:
3848     case OP_POSPLUSI:
3849     case OP_POSQUERYI:
3850     case OP_POSUPTOI:
3851     case OP_NOTSTAR:
3852     case OP_NOTMINSTAR:
3853     case OP_NOTPLUS:
3854     case OP_NOTMINPLUS:
3855     case OP_NOTQUERY:
3856     case OP_NOTMINQUERY:
3857     case OP_NOTUPTO:
3858     case OP_NOTMINUPTO:
3859     case OP_NOTEXACT:
3860     case OP_NOTPOSSTAR:
3861     case OP_NOTPOSPLUS:
3862     case OP_NOTPOSQUERY:
3863     case OP_NOTPOSUPTO:
3864     case OP_NOTSTARI:
3865     case OP_NOTMINSTARI:
3866     case OP_NOTPLUSI:
3867     case OP_NOTMINPLUSI:
3868     case OP_NOTQUERYI:
3869     case OP_NOTMINQUERYI:
3870     case OP_NOTUPTOI:
3871     case OP_NOTMINUPTOI:
3872     case OP_NOTEXACTI:
3873     case OP_NOTPOSSTARI:
3874     case OP_NOTPOSPLUSI:
3875     case OP_NOTPOSQUERYI:
3876     case OP_NOTPOSUPTOI:
3877     if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
3878     break;
3879     }
3880 #else
3881   (void)(utf);  /* Keep compiler happy by referencing function argument */
3882 #endif
3883   }
3884 }
3885 
3886 
3887 
3888 /*************************************************
3889 *           Check for POSIX class syntax         *
3890 *************************************************/
3891 
3892 /* This function is called when the sequence "[:" or "[." or "[=" is
3893 encountered in a character class. It checks whether this is followed by a
3894 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
3895 reach an unescaped ']' without the special preceding character, return FALSE.
3896 
3897 Originally, this function only recognized a sequence of letters between the
3898 terminators, but it seems that Perl recognizes any sequence of characters,
3899 though of course unknown POSIX names are subsequently rejected. Perl gives an
3900 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
3901 didn't consider this to be a POSIX class. Likewise for [:1234:].
3902 
3903 The problem in trying to be exactly like Perl is in the handling of escapes. We
3904 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
3905 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
3906 below handles the special cases \\ and \], but does not try to do any other
3907 escape processing. This makes it different from Perl for cases such as
3908 [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
3909 not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
3910 when Perl does, I think.
3911 
3912 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
3913 It seems that the appearance of a nested POSIX class supersedes an apparent
3914 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
3915 a digit.
3916 
3917 In Perl, unescaped square brackets may also appear as part of class names. For
3918 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
3919 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
3920 seem right at all. PCRE does not allow closing square brackets in POSIX class
3921 names.
3922 
3923 Arguments:
3924   ptr      pointer to the initial [
3925   endptr   where to return the end pointer
3926 
3927 Returns:   TRUE or FALSE
3928 */
3929 
3930 static BOOL
check_posix_syntax(const pcre_uchar * ptr,const pcre_uchar ** endptr)3931 check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
3932 {
3933 pcre_uchar terminator;          /* Don't combine these lines; the Solaris cc */
3934 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
3935 for (++ptr; *ptr != CHAR_NULL; ptr++)
3936   {
3937   if (*ptr == CHAR_BACKSLASH &&
3938       (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET ||
3939        ptr[1] == CHAR_BACKSLASH))
3940     ptr++;
3941   else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
3942             *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
3943   else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3944     {
3945     *endptr = ptr;
3946     return TRUE;
3947     }
3948   }
3949 return FALSE;
3950 }
3951 
3952 
3953 
3954 
3955 /*************************************************
3956 *          Check POSIX class name                *
3957 *************************************************/
3958 
3959 /* This function is called to check the name given in a POSIX-style class entry
3960 such as [:alnum:].
3961 
3962 Arguments:
3963   ptr        points to the first letter
3964   len        the length of the name
3965 
3966 Returns:     a value representing the name, or -1 if unknown
3967 */
3968 
3969 static int
check_posix_name(const pcre_uchar * ptr,int len)3970 check_posix_name(const pcre_uchar *ptr, int len)
3971 {
3972 const char *pn = posix_names;
3973 register int yield = 0;
3974 while (posix_name_lengths[yield] != 0)
3975   {
3976   if (len == posix_name_lengths[yield] &&
3977     STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
3978   pn += posix_name_lengths[yield] + 1;
3979   yield++;
3980   }
3981 return -1;
3982 }
3983 
3984 
3985 /*************************************************
3986 *    Adjust OP_RECURSE items in repeated group   *
3987 *************************************************/
3988 
3989 /* OP_RECURSE items contain an offset from the start of the regex to the group
3990 that is referenced. This means that groups can be replicated for fixed
3991 repetition simply by copying (because the recursion is allowed to refer to
3992 earlier groups that are outside the current group). However, when a group is
3993 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
3994 inserted before it, after it has been compiled. This means that any OP_RECURSE
3995 items within it that refer to the group itself or any contained groups have to
have their offsets adjusted. That is one of the jobs of this function. Before it
3997 is called, the partially compiled regex must be temporarily terminated with
3998 OP_END.
3999 
4000 This function has been extended to cope with forward references for recursions
4001 and subroutine calls. It must check the list of such references for the
4002 group we are dealing with. If it finds that one of the recursions in the
4003 current group is on this list, it does not adjust the value in the reference
4004 (which is a group number). After the group has been scanned, all the offsets in
4005 the forward reference list for the group are adjusted.
4006 
4007 Arguments:
4008   group      points to the start of the group
4009   adjust     the amount by which the group is to be moved
4010   utf        TRUE in UTF-8 / UTF-16 / UTF-32 mode
4011   cd         contains pointers to tables etc.
4012   save_hwm_offset   the hwm forward reference offset at the start of the group
4013 
4014 Returns:     nothing
4015 */
4016 
4017 static void
adjust_recurse(pcre_uchar * group,int adjust,BOOL utf,compile_data * cd,size_t save_hwm_offset)4018 adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
4019   size_t save_hwm_offset)
4020 {
4021 int offset;
4022 pcre_uchar *hc;
4023 pcre_uchar *ptr = group;
4024 
4025 while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
4026   {
4027   for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
4028        hc += LINK_SIZE)
4029     {
4030     offset = (int)GET(hc, 0);
4031     if (cd->start_code + offset == ptr + 1) break;
4032     }
4033 
4034   /* If we have not found this recursion on the forward reference list, adjust
4035   the recursion's offset if it's after the start of this group. */
4036 
4037   if (hc >= cd->hwm)
4038     {
4039     offset = (int)GET(ptr, 1);
4040     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
4041     }
4042 
4043   ptr += 1 + LINK_SIZE;
4044   }
4045 
4046 /* Now adjust all forward reference offsets for the group. */
4047 
4048 for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
4049      hc += LINK_SIZE)
4050   {
4051   offset = (int)GET(hc, 0);
4052   PUT(hc, 0, offset + adjust);
4053   }
4054 }
4055 
4056 
4057 
4058 /*************************************************
4059 *        Insert an automatic callout point       *
4060 *************************************************/
4061 
4062 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
4063 callout points before each pattern item.
4064 
4065 Arguments:
4066   code           current code pointer
4067   ptr            current pattern pointer
4068   cd             pointers to tables etc
4069 
4070 Returns:         new code pointer
4071 */
4072 
4073 static pcre_uchar *
auto_callout(pcre_uchar * code,const pcre_uchar * ptr,compile_data * cd)4074 auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
4075 {
4076 *code++ = OP_CALLOUT;
4077 *code++ = 255;
4078 PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
4079 PUT(code, LINK_SIZE, 0);                       /* Default length */
4080 return code + 2 * LINK_SIZE;
4081 }
4082 
4083 
4084 
4085 /*************************************************
4086 *         Complete a callout item                *
4087 *************************************************/
4088 
4089 /* A callout item contains the length of the next item in the pattern, which
4090 we can't fill in till after we have reached the relevant point. This is used
4091 for both automatic and manual callouts.
4092 
4093 Arguments:
4094   previous_callout   points to previous callout item
4095   ptr                current pattern pointer
4096   cd                 pointers to tables etc
4097 
4098 Returns:             nothing
4099 */
4100 
4101 static void
complete_callout(pcre_uchar * previous_callout,const pcre_uchar * ptr,compile_data * cd)4102 complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
4103 {
4104 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
4105 PUT(previous_callout, 2 + LINK_SIZE, length);
4106 }
4107 
4108 
4109 
4110 #ifdef SUPPORT_UCP
4111 /*************************************************
4112 *           Get othercase range                  *
4113 *************************************************/
4114 
4115 /* This function is passed the start and end of a class range, in UTF-8 mode
4116 with UCP support. It searches up the characters, looking for ranges of
4117 characters in the "other" case. Each call returns the next one, updating the
4118 start address. A character with multiple other cases is returned on its own
4119 with a special return value.
4120 
4121 Arguments:
4122   cptr        points to starting character value; updated
4123   d           end value
4124   ocptr       where to put start of othercase range
4125   odptr       where to put end of othercase range
4126 
4127 Yield:        -1 when no more
4128                0 when a range is returned
4129               >0 the CASESET offset for char with multiple other cases
4130                 in this case, ocptr contains the original
4131 */
4132 
4133 static int
get_othercase_range(pcre_uint32 * cptr,pcre_uint32 d,pcre_uint32 * ocptr,pcre_uint32 * odptr)4134 get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
4135   pcre_uint32 *odptr)
4136 {
4137 pcre_uint32 c, othercase, next;
4138 unsigned int co;
4139 
4140 /* Find the first character that has an other case. If it has multiple other
4141 cases, return its case offset value. */
4142 
4143 for (c = *cptr; c <= d; c++)
4144   {
4145   if ((co = UCD_CASESET(c)) != 0)
4146     {
4147     *ocptr = c++;   /* Character that has the set */
4148     *cptr = c;      /* Rest of input range */
4149     return (int)co;
4150     }
4151   if ((othercase = UCD_OTHERCASE(c)) != c) break;
4152   }
4153 
4154 if (c > d) return -1;  /* Reached end of range */
4155 
4156 /* Found a character that has a single other case. Search for the end of the
4157 range, which is either the end of the input range, or a character that has zero
4158 or more than one other cases. */
4159 
4160 *ocptr = othercase;
4161 next = othercase + 1;
4162 
4163 for (++c; c <= d; c++)
4164   {
4165   if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
4166   next++;
4167   }
4168 
4169 *odptr = next - 1;     /* End of othercase range */
4170 *cptr = c;             /* Rest of input range */
4171 return 0;
4172 }
4173 #endif  /* SUPPORT_UCP */
4174 
4175 
4176 
4177 /*************************************************
4178 *        Add a character or range to a class     *
4179 *************************************************/
4180 
4181 /* This function packages up the logic of adding a character or range of
4182 characters to a class. The character values in the arguments will be within the
4183 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
4184 mutually recursive with the function immediately below.
4185 
4186 Arguments:
4187   classbits     the bit map for characters < 256
4188   uchardptr     points to the pointer for extra data
4189   options       the options word
4190   cd            contains pointers to tables etc.
4191   start         start of range character
4192   end           end of range character
4193 
4194 Returns:        the number of < 256 characters added
4195                 the pointer to extra data is updated
4196 */
4197 
4198 static int
add_to_class(pcre_uint8 * classbits,pcre_uchar ** uchardptr,int options,compile_data * cd,pcre_uint32 start,pcre_uint32 end)4199 add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4200   compile_data *cd, pcre_uint32 start, pcre_uint32 end)
4201 {
4202 pcre_uint32 c;
4203 pcre_uint32 classbits_end = (end <= 0xff ? end : 0xff);
4204 int n8 = 0;
4205 
4206 /* If caseless matching is required, scan the range and process alternate
4207 cases. In Unicode, there are 8-bit characters that have alternate cases that
4208 are greater than 255 and vice-versa. Sometimes we can just extend the original
4209 range. */
4210 
4211 if ((options & PCRE_CASELESS) != 0)
4212   {
4213 #ifdef SUPPORT_UCP
4214   if ((options & PCRE_UTF8) != 0)
4215     {
4216     int rc;
4217     pcre_uint32 oc, od;
4218 
4219     options &= ~PCRE_CASELESS;   /* Remove for recursive calls */
4220     c = start;
4221 
4222     while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
4223       {
4224       /* Handle a single character that has more than one other case. */
4225 
4226       if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
4227         PRIV(ucd_caseless_sets) + rc, oc);
4228 
4229       /* Do nothing if the other case range is within the original range. */
4230 
4231       else if (oc >= start && od <= end) continue;
4232 
4233       /* Extend the original range if there is overlap, noting that if oc < c, we
4234       can't have od > end because a subrange is always shorter than the basic
4235       range. Otherwise, use a recursive call to add the additional range. */
4236 
4237       else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
4238       else if (od > end && oc <= end + 1)
4239         {
4240         end = od;       /* Extend upwards */
4241         if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
4242         }
4243       else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
4244       }
4245     }
4246   else
4247 #endif  /* SUPPORT_UCP */
4248 
4249   /* Not UTF-mode, or no UCP */
4250 
4251   for (c = start; c <= classbits_end; c++)
4252     {
4253     SETBIT(classbits, cd->fcc[c]);
4254     n8++;
4255     }
4256   }
4257 
4258 /* Now handle the original range. Adjust the final value according to the bit
4259 length - this means that the same lists of (e.g.) horizontal spaces can be used
4260 in all cases. */
4261 
4262 #if defined COMPILE_PCRE8
4263 #ifdef SUPPORT_UTF
4264   if ((options & PCRE_UTF8) == 0)
4265 #endif
4266   if (end > 0xff) end = 0xff;
4267 
4268 #elif defined COMPILE_PCRE16
4269 #ifdef SUPPORT_UTF
4270   if ((options & PCRE_UTF16) == 0)
4271 #endif
4272   if (end > 0xffff) end = 0xffff;
4273 
4274 #endif /* COMPILE_PCRE[8|16] */
4275 
4276 /* Use the bitmap for characters < 256. Otherwise use extra data.*/
4277 
4278 for (c = start; c <= classbits_end; c++)
4279   {
4280   /* Regardless of start, c will always be <= 255. */
4281   SETBIT(classbits, c);
4282   n8++;
4283   }
4284 
4285 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4286 if (start <= 0xff) start = 0xff + 1;
4287 
4288 if (end >= start)
4289   {
4290   pcre_uchar *uchardata = *uchardptr;
4291 #ifdef SUPPORT_UTF
4292   if ((options & PCRE_UTF8) != 0)  /* All UTFs use the same flag bit */
4293     {
4294     if (start < end)
4295       {
4296       *uchardata++ = XCL_RANGE;
4297       uchardata += PRIV(ord2utf)(start, uchardata);
4298       uchardata += PRIV(ord2utf)(end, uchardata);
4299       }
4300     else if (start == end)
4301       {
4302       *uchardata++ = XCL_SINGLE;
4303       uchardata += PRIV(ord2utf)(start, uchardata);
4304       }
4305     }
4306   else
4307 #endif  /* SUPPORT_UTF */
4308 
4309   /* Without UTF support, character values are constrained by the bit length,
4310   and can only be > 256 for 16-bit and 32-bit libraries. */
4311 
4312 #ifdef COMPILE_PCRE8
4313     {}
4314 #else
4315   if (start < end)
4316     {
4317     *uchardata++ = XCL_RANGE;
4318     *uchardata++ = start;
4319     *uchardata++ = end;
4320     }
4321   else if (start == end)
4322     {
4323     *uchardata++ = XCL_SINGLE;
4324     *uchardata++ = start;
4325     }
4326 #endif
4327 
4328   *uchardptr = uchardata;   /* Updata extra data pointer */
4329   }
4330 #endif /* SUPPORT_UTF || !COMPILE_PCRE8 */
4331 
4332 return n8;    /* Number of 8-bit characters */
4333 }
4334 
4335 
4336 
4337 
4338 /*************************************************
4339 *        Add a list of characters to a class     *
4340 *************************************************/
4341 
4342 /* This function is used for adding a list of case-equivalent characters to a
4343 class, and also for adding a list of horizontal or vertical whitespace. If the
4344 list is in order (which it should be), ranges of characters are detected and
4345 handled appropriately. This function is mutually recursive with the function
4346 above.
4347 
4348 Arguments:
4349   classbits     the bit map for characters < 256
4350   uchardptr     points to the pointer for extra data
4351   options       the options word
4352   cd            contains pointers to tables etc.
4353   p             points to row of 32-bit values, terminated by NOTACHAR
4354   except        character to omit; this is used when adding lists of
4355                   case-equivalent characters to avoid including the one we
4356                   already know about
4357 
4358 Returns:        the number of < 256 characters added
4359                 the pointer to extra data is updated
4360 */
4361 
4362 static int
add_list_to_class(pcre_uint8 * classbits,pcre_uchar ** uchardptr,int options,compile_data * cd,const pcre_uint32 * p,unsigned int except)4363 add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4364   compile_data *cd, const pcre_uint32 *p, unsigned int except)
4365 {
4366 int n8 = 0;
4367 while (p[0] < NOTACHAR)
4368   {
4369   int n = 0;
4370   if (p[0] != except)
4371     {
4372     while(p[n+1] == p[0] + n + 1) n++;
4373     n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
4374     }
4375   p += n + 1;
4376   }
4377 return n8;
4378 }
4379 
4380 
4381 
4382 /*************************************************
4383 *    Add characters not in a list to a class     *
4384 *************************************************/
4385 
4386 /* This function is used for adding the complement of a list of horizontal or
4387 vertical whitespace to a class. The list must be in order.
4388 
4389 Arguments:
4390   classbits     the bit map for characters < 256
4391   uchardptr     points to the pointer for extra data
4392   options       the options word
4393   cd            contains pointers to tables etc.
4394   p             points to row of 32-bit values, terminated by NOTACHAR
4395 
4396 Returns:        the number of < 256 characters added
4397                 the pointer to extra data is updated
4398 */
4399 
4400 static int
add_not_list_to_class(pcre_uint8 * classbits,pcre_uchar ** uchardptr,int options,compile_data * cd,const pcre_uint32 * p)4401 add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
4402   int options, compile_data *cd, const pcre_uint32 *p)
4403 {
4404 BOOL utf = (options & PCRE_UTF8) != 0;
4405 int n8 = 0;
4406 if (p[0] > 0)
4407   n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
4408 while (p[0] < NOTACHAR)
4409   {
4410   while (p[1] == p[0] + 1) p++;
4411   n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
4412     (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
4413   p++;
4414   }
4415 return n8;
4416 }
4417 
4418 
4419 
4420 /*************************************************
4421 *           Compile one branch                   *
4422 *************************************************/
4423 
4424 /* Scan the pattern, compiling it into a vector. If the options are
4425 changed during the branch, the pointer is used to change the external options
4426 bits. This function is used during the pre-compile phase when we are trying
4427 to find out the amount of memory needed, as well as during the real compile
4428 phase. The value of lengthptr distinguishes the two phases.
4429 
4430 Arguments:
4431   optionsptr        pointer to the option bits
4432   codeptr           points to the pointer to the current code point
4433   ptrptr            points to the current pattern pointer
4434   errorcodeptr      points to error code variable
4435   firstcharptr      place to put the first required character
4436   firstcharflagsptr place to put the first character flags, or a negative number
4437   reqcharptr        place to put the last required character
4438   reqcharflagsptr   place to put the last required character flags, or a negative number
4439   bcptr             points to current branch chain
4440   cond_depth        conditional nesting depth
4441   cd                contains pointers to tables etc.
4442   lengthptr         NULL during the real compile phase
4443                     points to length accumulator during pre-compile phase
4444 
4445 Returns:            TRUE on success
4446                     FALSE, with *errorcodeptr set non-zero on error
4447 */
4448 
4449 static BOOL
compile_branch(int * optionsptr,pcre_uchar ** codeptr,const pcre_uchar ** ptrptr,int * errorcodeptr,pcre_uint32 * firstcharptr,pcre_int32 * firstcharflagsptr,pcre_uint32 * reqcharptr,pcre_int32 * reqcharflagsptr,branch_chain * bcptr,int cond_depth,compile_data * cd,int * lengthptr)4450 compile_branch(int *optionsptr, pcre_uchar **codeptr,
4451   const pcre_uchar **ptrptr, int *errorcodeptr,
4452   pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
4453   pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
4454   branch_chain *bcptr, int cond_depth,
4455   compile_data *cd, int *lengthptr)
4456 {
4457 int repeat_type, op_type;
4458 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
4459 int bravalue = 0;
4460 int greedy_default, greedy_non_default;
4461 pcre_uint32 firstchar, reqchar;
4462 pcre_int32 firstcharflags, reqcharflags;
4463 pcre_uint32 zeroreqchar, zerofirstchar;
4464 pcre_int32 zeroreqcharflags, zerofirstcharflags;
4465 pcre_int32 req_caseopt, reqvary, tempreqvary;
4466 int options = *optionsptr;               /* May change dynamically */
4467 int after_manual_callout = 0;
4468 int length_prevgroup = 0;
4469 register pcre_uint32 c;
4470 int escape;
4471 register pcre_uchar *code = *codeptr;
4472 pcre_uchar *last_code = code;
4473 pcre_uchar *orig_code = code;
4474 pcre_uchar *tempcode;
4475 BOOL inescq = FALSE;
4476 BOOL groupsetfirstchar = FALSE;
4477 const pcre_uchar *ptr = *ptrptr;
4478 const pcre_uchar *tempptr;
4479 const pcre_uchar *nestptr = NULL;
4480 pcre_uchar *previous = NULL;
4481 pcre_uchar *previous_callout = NULL;
4482 size_t item_hwm_offset = 0;
4483 pcre_uint8 classbits[32];
4484 
4485 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
4486 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
4487 dynamically as we process the pattern. */
4488 
4489 #ifdef SUPPORT_UTF
4490 /* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
4491 BOOL utf = (options & PCRE_UTF8) != 0;
4492 #ifndef COMPILE_PCRE32
4493 pcre_uchar utf_chars[6];
4494 #endif
4495 #else
4496 BOOL utf = FALSE;
4497 #endif
4498 
4499 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
4500 class_uchardata always so that it can be passed to add_to_class() always,
4501 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
4502 alternative calls for the different cases. */
4503 
4504 pcre_uchar *class_uchardata;
4505 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4506 BOOL xclass;
4507 pcre_uchar *class_uchardata_base;
4508 #endif
4509 
4510 #ifdef PCRE_DEBUG
4511 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
4512 #endif
4513 
4514 /* Set up the default and non-default settings for greediness */
4515 
4516 greedy_default = ((options & PCRE_UNGREEDY) != 0);
4517 greedy_non_default = greedy_default ^ 1;
4518 
4519 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
4520 matching encountered yet". It gets changed to REQ_NONE if we hit something that
4521 matches a non-fixed char first char; reqchar just remains unset if we never
4522 find one.
4523 
4524 When we hit a repeat whose minimum is zero, we may have to adjust these values
4525 to take the zero repeat into account. This is implemented by setting them to
4526 zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
4527 item types that can be repeated set these backoff variables appropriately. */
4528 
4529 firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
4530 firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
4531 
4532 /* The variable req_caseopt contains either the REQ_CASELESS value
4533 or zero, according to the current setting of the caseless flag. The
4534 REQ_CASELESS leaves the lower 28 bits empty. It is added into the
4535 firstchar or reqchar variables to record the case status of the
4536 value. This is used only for ASCII characters. */
4537 
4538 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
4539 
4540 /* Switch on next character until the end of the branch */
4541 
4542 for (;; ptr++)
4543   {
4544   BOOL negate_class;
4545   BOOL should_flip_negation;
4546   BOOL possessive_quantifier;
4547   BOOL is_quantifier;
4548   BOOL is_recurse;
4549   BOOL reset_bracount;
4550   int class_has_8bitchar;
4551   int class_one_char;
4552 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4553   BOOL xclass_has_prop;
4554 #endif
4555   int newoptions;
4556   int recno;
4557   int refsign;
4558   int skipbytes;
4559   pcre_uint32 subreqchar, subfirstchar;
4560   pcre_int32 subreqcharflags, subfirstcharflags;
4561   int terminator;
4562   unsigned int mclength;
4563   unsigned int tempbracount;
4564   pcre_uint32 ec;
4565   pcre_uchar mcbuffer[8];
4566 
4567   /* Get next character in the pattern */
4568 
4569   c = *ptr;
4570 
4571   /* If we are at the end of a nested substitution, revert to the outer level
4572   string. Nesting only happens one level deep. */
4573 
4574   if (c == CHAR_NULL && nestptr != NULL)
4575     {
4576     ptr = nestptr;
4577     nestptr = NULL;
4578     c = *ptr;
4579     }
4580 
4581   /* If we are in the pre-compile phase, accumulate the length used for the
4582   previous cycle of this loop. */
4583 
4584   if (lengthptr != NULL)
4585     {
4586 #ifdef PCRE_DEBUG
4587     if (code > cd->hwm) cd->hwm = code;                 /* High water info */
4588 #endif
4589     if (code > cd->start_workspace + cd->workspace_size -
4590         WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
4591       {
4592       *errorcodeptr = ERR52;
4593       goto FAILED;
4594       }
4595 
4596     /* There is at least one situation where code goes backwards: this is the
4597     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
4598     the class is simply eliminated. However, it is created first, so we have to
4599     allow memory for it. Therefore, don't ever reduce the length at this point.
4600     */
4601 
4602     if (code < last_code) code = last_code;
4603 
4604     /* Paranoid check for integer overflow */
4605 
4606     if (OFLOW_MAX - *lengthptr < code - last_code)
4607       {
4608       *errorcodeptr = ERR20;
4609       goto FAILED;
4610       }
4611 
4612     *lengthptr += (int)(code - last_code);
4613     DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
4614       (int)(code - last_code), c, c));
4615 
4616     /* If "previous" is set and it is not at the start of the work space, move
4617     it back to there, in order to avoid filling up the work space. Otherwise,
4618     if "previous" is NULL, reset the current code pointer to the start. */
4619 
4620     if (previous != NULL)
4621       {
4622       if (previous > orig_code)
4623         {
4624         memmove(orig_code, previous, IN_UCHARS(code - previous));
4625         code -= previous - orig_code;
4626         previous = orig_code;
4627         }
4628       }
4629     else code = orig_code;
4630 
4631     /* Remember where this code item starts so we can pick up the length
4632     next time round. */
4633 
4634     last_code = code;
4635     }
4636 
4637   /* In the real compile phase, just check the workspace used by the forward
4638   reference list. */
4639 
4640   else if (cd->hwm > cd->start_workspace + cd->workspace_size)
4641     {
4642     *errorcodeptr = ERR52;
4643     goto FAILED;
4644     }
4645 
4646   /* If in \Q...\E, check for the end; if not, we have a literal */
4647 
4648   if (inescq && c != CHAR_NULL)
4649     {
4650     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4651       {
4652       inescq = FALSE;
4653       ptr++;
4654       continue;
4655       }
4656     else
4657       {
4658       if (previous_callout != NULL)
4659         {
4660         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
4661           complete_callout(previous_callout, ptr, cd);
4662         previous_callout = NULL;
4663         }
4664       if ((options & PCRE_AUTO_CALLOUT) != 0)
4665         {
4666         previous_callout = code;
4667         code = auto_callout(code, ptr, cd);
4668         }
4669       goto NORMAL_CHAR;
4670       }
4671     /* Control does not reach here. */
4672     }
4673 
4674   /* In extended mode, skip white space and comments. We need a loop in order
4675   to check for more white space and more comments after a comment. */
4676 
4677   if ((options & PCRE_EXTENDED) != 0)
4678     {
4679     for (;;)
4680       {
4681       while (MAX_255(c) && (cd->ctypes[c] & ctype_space) != 0) c = *(++ptr);
4682       if (c != CHAR_NUMBER_SIGN) break;
4683       ptr++;
4684       while (*ptr != CHAR_NULL)
4685         {
4686         if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
4687           {                          /* IS_NEWLINE sets cd->nllen. */
4688           ptr += cd->nllen;
4689           break;
4690           }
4691         ptr++;
4692 #ifdef SUPPORT_UTF
4693         if (utf) FORWARDCHAR(ptr);
4694 #endif
4695         }
4696       c = *ptr;     /* Either NULL or the char after a newline */
4697       }
4698     }
4699 
4700   /* See if the next thing is a quantifier. */
4701 
4702   is_quantifier =
4703     c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4704     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4705 
4706   /* Fill in length of a previous callout, except when the next thing is a
4707   quantifier or when processing a property substitution string in UCP mode. */
4708 
4709   if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
4710        after_manual_callout-- <= 0)
4711     {
4712     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
4713       complete_callout(previous_callout, ptr, cd);
4714     previous_callout = NULL;
4715     }
4716 
4717   /* Create auto callout, except for quantifiers, or while processing property
4718   strings that are substituted for \w etc in UCP mode. */
4719 
4720   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
4721     {
4722     previous_callout = code;
4723     code = auto_callout(code, ptr, cd);
4724     }
4725 
4726   /* Process the next pattern item. */
4727 
4728   switch(c)
4729     {
4730     /* ===================================================================*/
4731     case CHAR_NULL:                /* The branch terminates at string end */
4732     case CHAR_VERTICAL_LINE:       /* or | or ) */
4733     case CHAR_RIGHT_PARENTHESIS:
4734     *firstcharptr = firstchar;
4735     *firstcharflagsptr = firstcharflags;
4736     *reqcharptr = reqchar;
4737     *reqcharflagsptr = reqcharflags;
4738     *codeptr = code;
4739     *ptrptr = ptr;
4740     if (lengthptr != NULL)
4741       {
4742       if (OFLOW_MAX - *lengthptr < code - last_code)
4743         {
4744         *errorcodeptr = ERR20;
4745         goto FAILED;
4746         }
4747       *lengthptr += (int)(code - last_code);   /* To include callout length */
4748       DPRINTF((">> end branch\n"));
4749       }
4750     return TRUE;
4751 
4752 
4753     /* ===================================================================*/
4754     /* Handle single-character metacharacters. In multiline mode, ^ disables
4755     the setting of any following char as a first character. */
4756 
4757     case CHAR_CIRCUMFLEX_ACCENT:
4758     previous = NULL;
4759     if ((options & PCRE_MULTILINE) != 0)
4760       {
4761       if (firstcharflags == REQ_UNSET)
4762         zerofirstcharflags = firstcharflags = REQ_NONE;
4763       *code++ = OP_CIRCM;
4764       }
4765     else *code++ = OP_CIRC;
4766     break;
4767 
4768     case CHAR_DOLLAR_SIGN:
4769     previous = NULL;
4770     *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
4771     break;
4772 
4773     /* There can never be a first char if '.' is first, whatever happens about
4774     repeats. The value of reqchar doesn't change either. */
4775 
4776     case CHAR_DOT:
4777     if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4778     zerofirstchar = firstchar;
4779     zerofirstcharflags = firstcharflags;
4780     zeroreqchar = reqchar;
4781     zeroreqcharflags = reqcharflags;
4782     previous = code;
4783     item_hwm_offset = cd->hwm - cd->start_workspace;
4784     *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
4785     break;
4786 
4787 
4788     /* ===================================================================*/
4789     /* Character classes. If the included characters are all < 256, we build a
4790     32-byte bitmap of the permitted characters, except in the special case
4791     where there is only one such character. For negated classes, we build the
4792     map as usual, then invert it at the end. However, we use a different opcode
4793     so that data characters > 255 can be handled correctly.
4794 
4795     If the class contains characters outside the 0-255 range, a different
4796     opcode is compiled. It may optionally have a bit map for characters < 256,
4797     but those above are explicitly listed afterwards. A flag byte tells
4798     whether the bitmap is present, and whether this is a negated class or not.
4799 
4800     In JavaScript compatibility mode, an isolated ']' causes an error. In
4801     default (Perl) mode, it is treated as a data character. */
4802 
4803     case CHAR_RIGHT_SQUARE_BRACKET:
4804     if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4805       {
4806       *errorcodeptr = ERR64;
4807       goto FAILED;
4808       }
4809     goto NORMAL_CHAR;
4810 
4811     /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
4812     used for "start of word" and "end of word". As these are otherwise illegal
4813     sequences, we don't break anything by recognizing them. They are replaced
4814     by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
4815     erroneous and are handled by the normal code below. */
4816 
4817     case CHAR_LEFT_SQUARE_BRACKET:
4818     if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
4819       {
4820       nestptr = ptr + 7;
4821       ptr = sub_start_of_word - 1;
4822       continue;
4823       }
4824 
4825     if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
4826       {
4827       nestptr = ptr + 7;
4828       ptr = sub_end_of_word - 1;
4829       continue;
4830       }
4831 
4832     /* Handle a real character class. */
4833 
4834     previous = code;
4835     item_hwm_offset = cd->hwm - cd->start_workspace;
4836 
4837     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
4838     they are encountered at the top level, so we'll do that too. */
4839 
4840     if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4841          ptr[1] == CHAR_EQUALS_SIGN) &&
4842         check_posix_syntax(ptr, &tempptr))
4843       {
4844       *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
4845       goto FAILED;
4846       }
4847 
4848     /* If the first character is '^', set the negation flag and skip it. Also,
4849     if the first few characters (either before or after ^) are \Q\E or \E we
4850     skip them too. This makes for compatibility with Perl. */
4851 
4852     negate_class = FALSE;
4853     for (;;)
4854       {
4855       c = *(++ptr);
4856       if (c == CHAR_BACKSLASH)
4857         {
4858         if (ptr[1] == CHAR_E)
4859           ptr++;
4860         else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
4861           ptr += 3;
4862         else
4863           break;
4864         }
4865       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4866         negate_class = TRUE;
4867       else break;
4868       }
4869 
4870     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
4871     an initial ']' is taken as a data character -- the code below handles
4872     that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
4873     [^] must match any character, so generate OP_ALLANY. */
4874 
4875     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4876         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4877       {
4878       *code++ = negate_class? OP_ALLANY : OP_FAIL;
4879       if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4880       zerofirstchar = firstchar;
4881       zerofirstcharflags = firstcharflags;
4882       break;
4883       }
4884 
4885     /* If a class contains a negative special such as \S, we need to flip the
4886     negation flag at the end, so that support for characters > 255 works
4887     correctly (they are all included in the class). */
4888 
4889     should_flip_negation = FALSE;
4890 
4891     /* Extended class (xclass) will be used when characters > 255
4892     might match. */
4893 
4894 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4895     xclass = FALSE;
4896     class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
4897     class_uchardata_base = class_uchardata;   /* Save the start */
4898 #endif
4899 
4900     /* For optimization purposes, we track some properties of the class:
4901     class_has_8bitchar will be non-zero if the class contains at least one <
4902     256 character; class_one_char will be 1 if the class contains just one
4903     character; xclass_has_prop will be TRUE if unicode property checks
4904     are present in the class. */
4905 
4906     class_has_8bitchar = 0;
4907     class_one_char = 0;
4908 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4909     xclass_has_prop = FALSE;
4910 #endif
4911 
4912     /* Initialize the 32-char bit map to all zeros. We build the map in a
4913     temporary bit of memory, in case the class contains fewer than two
4914     8-bit characters because in that case the compiled code doesn't use the bit
4915     map. */
4916 
4917     memset(classbits, 0, 32 * sizeof(pcre_uint8));
4918 
4919     /* Process characters until ] is reached. By writing this as a "do" it
4920     means that an initial ] is taken as a data character. At the start of the
4921     loop, c contains the first byte of the character. */
4922 
4923     if (c != CHAR_NULL) do
4924       {
4925       const pcre_uchar *oldptr;
4926 
4927 #ifdef SUPPORT_UTF
4928       if (utf && HAS_EXTRALEN(c))
4929         {                           /* Braces are required because the */
4930         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
4931         }
4932 #endif
4933 
4934 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4935       /* In the pre-compile phase, accumulate the length of any extra
4936       data and reset the pointer. This is so that very large classes that
4937       contain a zillion > 255 characters no longer overwrite the work space
4938       (which is on the stack). We have to remember that there was XCLASS data,
4939       however. */
4940 
4941       if (class_uchardata > class_uchardata_base) xclass = TRUE;
4942 
4943       if (lengthptr != NULL && class_uchardata > class_uchardata_base)
4944         {
4945         *lengthptr += (int)(class_uchardata - class_uchardata_base);
4946         class_uchardata = class_uchardata_base;
4947         }
4948 #endif
4949 
4950       /* Inside \Q...\E everything is literal except \E */
4951 
4952       if (inescq)
4953         {
4954         if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
4955           {
4956           inescq = FALSE;                   /* Reset literal state */
4957           ptr++;                            /* Skip the 'E' */
4958           continue;                         /* Carry on with next */
4959           }
4960         goto CHECK_RANGE;                   /* Could be range if \E follows */
4961         }
4962 
4963       /* Handle POSIX class names. Perl allows a negation extension of the
4964       form [:^name:]. A square bracket that doesn't match the syntax is
4965       treated as a literal. We also recognize the POSIX constructions
4966       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
4967       5.6 and 5.8 do. */
4968 
4969       if (c == CHAR_LEFT_SQUARE_BRACKET &&
4970           (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4971            ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
4972         {
4973         BOOL local_negate = FALSE;
4974         int posix_class, taboffset, tabopt;
4975         register const pcre_uint8 *cbits = cd->cbits;
4976         pcre_uint8 pbits[32];
4977 
4978         if (ptr[1] != CHAR_COLON)
4979           {
4980           *errorcodeptr = ERR31;
4981           goto FAILED;
4982           }
4983 
4984         ptr += 2;
4985         if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
4986           {
4987           local_negate = TRUE;
4988           should_flip_negation = TRUE;  /* Note negative special */
4989           ptr++;
4990           }
4991 
4992         posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
4993         if (posix_class < 0)
4994           {
4995           *errorcodeptr = ERR30;
4996           goto FAILED;
4997           }
4998 
4999         /* If matching is caseless, upper and lower are converted to
5000         alpha. This relies on the fact that the class table starts with
5001         alpha, lower, upper as the first 3 entries. */
5002 
5003         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
5004           posix_class = 0;
5005 
5006         /* When PCRE_UCP is set, some of the POSIX classes are converted to
5007         different escape sequences that use Unicode properties \p or \P. Others
5008         that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
5009         directly. */
5010 
5011 #ifdef SUPPORT_UCP
5012         if ((options & PCRE_UCP) != 0)
5013           {
5014           unsigned int ptype = 0;
5015           int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
5016 
5017           /* The posix_substitutes table specifies which POSIX classes can be
5018           converted to \p or \P items. */
5019 
5020           if (posix_substitutes[pc] != NULL)
5021             {
5022             nestptr = tempptr + 1;
5023             ptr = posix_substitutes[pc] - 1;
5024             continue;
5025             }
5026 
5027           /* There are three other classes that generate special property calls
5028           that are recognized only in an XCLASS. */
5029 
5030           else switch(posix_class)
5031             {
5032             case PC_GRAPH:
5033             ptype = PT_PXGRAPH;
5034             /* Fall through */
5035             case PC_PRINT:
5036             if (ptype == 0) ptype = PT_PXPRINT;
5037             /* Fall through */
5038             case PC_PUNCT:
5039             if (ptype == 0) ptype = PT_PXPUNCT;
5040             *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
5041             *class_uchardata++ = ptype;
5042             *class_uchardata++ = 0;
5043             xclass_has_prop = TRUE;
5044             ptr = tempptr + 1;
5045             continue;
5046 
5047             /* For the other POSIX classes (ascii, xdigit) we are going to fall
5048             through to the non-UCP case and build a bit map for characters with
5049             code points less than 256. If we are in a negated POSIX class
5050             within a non-negated overall class, characters with code points
5051             greater than 255 must all match. In the special case where we have
5052             not yet generated any xclass data, and this is the final item in
5053             the overall class, we need do nothing: later on, the opcode
5054             OP_NCLASS will be used to indicate that characters greater than 255
5055             are acceptable. If we have already seen an xclass item or one may
5056             follow (we have to assume that it might if this is not the end of
5057             the class), explicitly match all wide codepoints. */
5058 
5059             default:
5060             if (!negate_class && local_negate &&
5061                 (xclass || tempptr[2] != CHAR_RIGHT_SQUARE_BRACKET))
5062               {
5063               *class_uchardata++ = XCL_RANGE;
5064               class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
5065               class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
5066               }
5067             break;
5068             }
5069           }
5070 #endif
5071         /* In the non-UCP case, or when UCP makes no difference, we build the
5072         bit map for the POSIX class in a chunk of local store because we may be
5073         adding and subtracting from it, and we don't want to subtract bits that
5074         may be in the main map already. At the end we or the result into the
5075         bit map that is being built. */
5076 
5077         posix_class *= 3;
5078 
5079         /* Copy in the first table (always present) */
5080 
5081         memcpy(pbits, cbits + posix_class_maps[posix_class],
5082           32 * sizeof(pcre_uint8));
5083 
5084         /* If there is a second table, add or remove it as required. */
5085 
5086         taboffset = posix_class_maps[posix_class + 1];
5087         tabopt = posix_class_maps[posix_class + 2];
5088 
5089         if (taboffset >= 0)
5090           {
5091           if (tabopt >= 0)
5092             for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
5093           else
5094             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
5095           }
5096 
5097         /* Now see if we need to remove any special characters. An option
5098         value of 1 removes vertical space and 2 removes underscore. */
5099 
5100         if (tabopt < 0) tabopt = -tabopt;
5101         if (tabopt == 1) pbits[1] &= ~0x3c;
5102           else if (tabopt == 2) pbits[11] &= 0x7f;
5103 
5104         /* Add the POSIX table or its complement into the main table that is
5105         being built and we are done. */
5106 
5107         if (local_negate)
5108           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
5109         else
5110           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
5111 
5112         ptr = tempptr + 1;
5113         /* Every class contains at least one < 256 character. */
5114         class_has_8bitchar = 1;
5115         /* Every class contains at least two characters. */
5116         class_one_char = 2;
5117         continue;    /* End of POSIX syntax handling */
5118         }
5119 
5120       /* Backslash may introduce a single character, or it may introduce one
5121       of the specials, which just set a flag. The sequence \b is a special
5122       case. Inside a class (and only there) it is treated as backspace. We
5123       assume that other escapes have more than one character in them, so
5124       speculatively set both class_has_8bitchar and class_one_char bigger
5125       than one. Unrecognized escapes fall through and are either treated
5126       as literal characters (by default), or are faulted if
5127       PCRE_EXTRA is set. */
5128 
5129       if (c == CHAR_BACKSLASH)
5130         {
5131         escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options,
5132           TRUE);
5133         if (*errorcodeptr != 0) goto FAILED;
5134         if (escape == 0) c = ec;
5135         else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
5136         else if (escape == ESC_N)          /* \N is not supported in a class */
5137           {
5138           *errorcodeptr = ERR71;
5139           goto FAILED;
5140           }
5141         else if (escape == ESC_Q)            /* Handle start of quoted string */
5142           {
5143           if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5144             {
5145             ptr += 2; /* avoid empty string */
5146             }
5147           else inescq = TRUE;
5148           continue;
5149           }
5150         else if (escape == ESC_E) continue;  /* Ignore orphan \E */
5151 
5152         else
5153           {
5154           register const pcre_uint8 *cbits = cd->cbits;
5155           /* Every class contains at least two < 256 characters. */
5156           class_has_8bitchar++;
5157           /* Every class contains at least two characters. */
5158           class_one_char += 2;
5159 
5160           switch (escape)
5161             {
5162 #ifdef SUPPORT_UCP
5163             case ESC_du:     /* These are the values given for \d etc */
5164             case ESC_DU:     /* when PCRE_UCP is set. We replace the */
5165             case ESC_wu:     /* escape sequence with an appropriate \p */
5166             case ESC_WU:     /* or \P to test Unicode properties instead */
5167             case ESC_su:     /* of the default ASCII testing. */
5168             case ESC_SU:
5169             nestptr = ptr;
5170             ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
5171             class_has_8bitchar--;                /* Undo! */
5172             continue;
5173 #endif
5174             case ESC_d:
5175             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
5176             continue;
5177 
5178             case ESC_D:
5179             should_flip_negation = TRUE;
5180             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
5181             continue;
5182 
5183             case ESC_w:
5184             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
5185             continue;
5186 
5187             case ESC_W:
5188             should_flip_negation = TRUE;
5189             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
5190             continue;
5191 
5192             /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5193             5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5194             previously set by something earlier in the character class.
5195             Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5196             we could just adjust the appropriate bit. From PCRE 8.34 we no
5197             longer treat \s and \S specially. */
5198 
5199             case ESC_s:
5200             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
5201             continue;
5202 
5203             case ESC_S:
5204             should_flip_negation = TRUE;
5205             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
5206             continue;
5207 
5208             /* The rest apply in both UCP and non-UCP cases. */
5209 
5210             case ESC_h:
5211             (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5212               PRIV(hspace_list), NOTACHAR);
5213             continue;
5214 
5215             case ESC_H:
5216             (void)add_not_list_to_class(classbits, &class_uchardata, options,
5217               cd, PRIV(hspace_list));
5218             continue;
5219 
5220             case ESC_v:
5221             (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5222               PRIV(vspace_list), NOTACHAR);
5223             continue;
5224 
5225             case ESC_V:
5226             (void)add_not_list_to_class(classbits, &class_uchardata, options,
5227               cd, PRIV(vspace_list));
5228             continue;
5229 
5230             case ESC_p:
5231             case ESC_P:
5232 #ifdef SUPPORT_UCP
5233               {
5234               BOOL negated;
5235               unsigned int ptype = 0, pdata = 0;
5236               if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
5237                 goto FAILED;
5238               *class_uchardata++ = ((escape == ESC_p) != negated)?
5239                 XCL_PROP : XCL_NOTPROP;
5240               *class_uchardata++ = ptype;
5241               *class_uchardata++ = pdata;
5242               xclass_has_prop = TRUE;
5243               class_has_8bitchar--;                /* Undo! */
5244               continue;
5245               }
5246 #else
5247             *errorcodeptr = ERR45;
5248             goto FAILED;
5249 #endif
5250             /* Unrecognized escapes are faulted if PCRE is running in its
5251             strict mode. By default, for compatibility with Perl, they are
5252             treated as literals. */
5253 
5254             default:
5255             if ((options & PCRE_EXTRA) != 0)
5256               {
5257               *errorcodeptr = ERR7;
5258               goto FAILED;
5259               }
5260             class_has_8bitchar--;    /* Undo the speculative increase. */
5261             class_one_char -= 2;     /* Undo the speculative increase. */
5262             c = *ptr;                /* Get the final character and fall through */
5263             break;
5264             }
5265           }
5266 
5267         /* Fall through if the escape just defined a single character (c >= 0).
5268         This may be greater than 256. */
5269 
5270         escape = 0;
5271 
5272         }   /* End of backslash handling */
5273 
5274       /* A character may be followed by '-' to form a range. However, Perl does
5275       not permit ']' to be the end of the range. A '-' character at the end is
5276       treated as a literal. Perl ignores orphaned \E sequences entirely. The
5277       code for handling \Q and \E is messy. */
5278 
5279       CHECK_RANGE:
5280       while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5281         {
5282         inescq = FALSE;
5283         ptr += 2;
5284         }
5285       oldptr = ptr;
5286 
5287       /* Remember if \r or \n were explicitly used */
5288 
5289       if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5290 
5291       /* Check for range */
5292 
5293       if (!inescq && ptr[1] == CHAR_MINUS)
5294         {
5295         pcre_uint32 d;
5296         ptr += 2;
5297         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
5298 
5299         /* If we hit \Q (not followed by \E) at this point, go into escaped
5300         mode. */
5301 
5302         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
5303           {
5304           ptr += 2;
5305           if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
5306             { ptr += 2; continue; }
5307           inescq = TRUE;
5308           break;
5309           }
5310 
5311         /* Minus (hyphen) at the end of a class is treated as a literal, so put
5312         back the pointer and jump to handle the character that preceded it. */
5313 
5314         if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
5315           {
5316           ptr = oldptr;
5317           goto CLASS_SINGLE_CHARACTER;
5318           }
5319 
5320         /* Otherwise, we have a potential range; pick up the next character */
5321 
5322 #ifdef SUPPORT_UTF
5323         if (utf)
5324           {                           /* Braces are required because the */
5325           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
5326           }
5327         else
5328 #endif
5329         d = *ptr;  /* Not UTF-8 mode */
5330 
5331         /* The second part of a range can be a single-character escape
5332         sequence, but not any of the other escapes. Perl treats a hyphen as a
5333         literal in such circumstances. However, in Perl's warning mode, a
5334         warning is given, so PCRE now faults it as it is almost certainly a
5335         mistake on the user's part. */
5336 
5337         if (!inescq)
5338           {
5339           if (d == CHAR_BACKSLASH)
5340             {
5341             int descape;
5342             descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
5343             if (*errorcodeptr != 0) goto FAILED;
5344 
5345             /* 0 means a character was put into d; \b is backspace; any other
5346             special causes an error. */
5347 
5348             if (descape != 0)
5349               {
5350               if (descape == ESC_b) d = CHAR_BS; else
5351                 {
5352                 *errorcodeptr = ERR83;
5353                 goto FAILED;
5354                 }
5355               }
5356             }
5357 
5358           /* A hyphen followed by a POSIX class is treated in the same way. */
5359 
5360           else if (d == CHAR_LEFT_SQUARE_BRACKET &&
5361                    (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
5362                     ptr[1] == CHAR_EQUALS_SIGN) &&
5363                    check_posix_syntax(ptr, &tempptr))
5364             {
5365             *errorcodeptr = ERR83;
5366             goto FAILED;
5367             }
5368           }
5369 
5370         /* Check that the two values are in the correct order. Optimize
5371         one-character ranges. */
5372 
5373         if (d < c)
5374           {
5375           *errorcodeptr = ERR8;
5376           goto FAILED;
5377           }
5378         if (d == c) goto CLASS_SINGLE_CHARACTER;  /* A few lines below */
5379 
5380         /* We have found a character range, so single character optimizations
5381         cannot be done anymore. Any value greater than 1 indicates that there
5382         is more than one character. */
5383 
5384         class_one_char = 2;
5385 
5386         /* Remember an explicit \r or \n, and add the range to the class. */
5387 
5388         if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5389 
5390         class_has_8bitchar +=
5391           add_to_class(classbits, &class_uchardata, options, cd, c, d);
5392 
5393         continue;   /* Go get the next char in the class */
5394         }
5395 
5396       /* Handle a single character - we can get here for a normal non-escape
5397       char, or after \ that introduces a single character or for an apparent
5398       range that isn't. Only the value 1 matters for class_one_char, so don't
5399       increase it if it is already 2 or more ... just in case there's a class
5400       with a zillion characters in it. */
5401 
5402       CLASS_SINGLE_CHARACTER:
5403       if (class_one_char < 2) class_one_char++;
5404 
5405       /* If xclass_has_prop is false and class_one_char is 1, we have the first
5406       single character in the class, and there have been no prior ranges, or
5407       XCLASS items generated by escapes. If this is the final character in the
5408       class, we can optimize by turning the item into a 1-character OP_CHAR[I]
5409       if it's positive, or OP_NOT[I] if it's negative. In the positive case, it
5410       can cause firstchar to be set. Otherwise, there can be no first char if
5411       this item is first, whatever repeat count may follow. In the case of
5412       reqchar, save the previous value for reinstating. */
5413 
5414       if (!inescq &&
5415 #ifdef SUPPORT_UCP
5416           !xclass_has_prop &&
5417 #endif
5418           class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
5419         {
5420         ptr++;
5421         zeroreqchar = reqchar;
5422         zeroreqcharflags = reqcharflags;
5423 
5424         if (negate_class)
5425           {
5426 #ifdef SUPPORT_UCP
5427           int d;
5428 #endif
5429           if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5430           zerofirstchar = firstchar;
5431           zerofirstcharflags = firstcharflags;
5432 
5433           /* For caseless UTF-8 mode when UCP support is available, check
5434           whether this character has more than one other case. If so, generate
5435           a special OP_NOTPROP item instead of OP_NOTI. */
5436 
5437 #ifdef SUPPORT_UCP
5438           if (utf && (options & PCRE_CASELESS) != 0 &&
5439               (d = UCD_CASESET(c)) != 0)
5440             {
5441             *code++ = OP_NOTPROP;
5442             *code++ = PT_CLIST;
5443             *code++ = d;
5444             }
5445           else
5446 #endif
5447           /* Char has only one other case, or UCP not available */
5448 
5449             {
5450             *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
5451 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5452             if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5453               code += PRIV(ord2utf)(c, code);
5454             else
5455 #endif
5456               *code++ = c;
5457             }
5458 
5459           /* We are finished with this character class */
5460 
5461           goto END_CLASS;
5462           }
5463 
5464         /* For a single, positive character, get the value into mcbuffer, and
5465         then we can handle this with the normal one-character code. */
5466 
5467 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5468         if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5469           mclength = PRIV(ord2utf)(c, mcbuffer);
5470         else
5471 #endif
5472           {
5473           mcbuffer[0] = c;
5474           mclength = 1;
5475           }
5476         goto ONE_CHAR;
5477         }       /* End of 1-char optimization */
5478 
5479       /* There is more than one character in the class, or an XCLASS item
5480       has been generated. Add this character to the class. */
5481 
5482       class_has_8bitchar +=
5483         add_to_class(classbits, &class_uchardata, options, cd, c, c);
5484       }
5485 
5486     /* Loop until ']' reached. This "while" is the end of the "do" far above.
5487     If we are at the end of an internal nested string, revert to the outer
5488     string. */
5489 
5490     while (((c = *(++ptr)) != CHAR_NULL ||
5491            (nestptr != NULL &&
5492              (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != CHAR_NULL)) &&
5493            (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
5494 
5495     /* Check for missing terminating ']' */
5496 
5497     if (c == CHAR_NULL)
5498       {
5499       *errorcodeptr = ERR6;
5500       goto FAILED;
5501       }
5502 
5503     /* We will need an XCLASS if data has been placed in class_uchardata. In
5504     the second phase this is a sufficient test. However, in the pre-compile
5505     phase, class_uchardata gets emptied to prevent workspace overflow, so
5506     only if the very last character in the class needs XCLASS will it contain
5507     anything at this point. For this reason, xclass gets set TRUE above when
5508     class_uchardata is emptied, and that's why this code is the way it is here
5509     instead of just doing a test on class_uchardata below. */
5510 
5511 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5512     if (class_uchardata > class_uchardata_base) xclass = TRUE;
5513 #endif
5514 
5515     /* If this is the first thing in the branch, there can be no first char
5516     setting, whatever the repeat count. Any reqchar setting must remain
5517     unchanged after any kind of repeat. */
5518 
5519     if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5520     zerofirstchar = firstchar;
5521     zerofirstcharflags = firstcharflags;
5522     zeroreqchar = reqchar;
5523     zeroreqcharflags = reqcharflags;
5524 
5525     /* If there are characters with values > 255, we have to compile an
5526     extended class, with its own opcode, unless there was a negated special
5527     such as \S in the class, and PCRE_UCP is not set, because in that case all
5528     characters > 255 are in the class, so any that were explicitly given as
5529     well can be ignored. If (when there are explicit characters > 255 that must
5530     be listed) there are no characters < 256, we can omit the bitmap in the
5531     actual compiled code. */
5532 
5533 #ifdef SUPPORT_UTF
5534     if (xclass && (xclass_has_prop || !should_flip_negation ||
5535         (options & PCRE_UCP) != 0))
5536 #elif !defined COMPILE_PCRE8
5537     if (xclass && (xclass_has_prop || !should_flip_negation))
5538 #endif
5539 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5540       {
5541       *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
5542       *code++ = OP_XCLASS;
5543       code += LINK_SIZE;
5544       *code = negate_class? XCL_NOT:0;
5545       if (xclass_has_prop) *code |= XCL_HASPROP;
5546 
5547       /* If the map is required, move up the extra data to make room for it;
5548       otherwise just move the code pointer to the end of the extra data. */
5549 
5550       if (class_has_8bitchar > 0)
5551         {
5552         *code++ |= XCL_MAP;
5553         memmove(code + (32 / sizeof(pcre_uchar)), code,
5554           IN_UCHARS(class_uchardata - code));
5555         if (negate_class && !xclass_has_prop)
5556           for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5557         memcpy(code, classbits, 32);
5558         code = class_uchardata + (32 / sizeof(pcre_uchar));
5559         }
5560       else code = class_uchardata;
5561 
5562       /* Now fill in the complete length of the item */
5563 
5564       PUT(previous, 1, (int)(code - previous));
5565       break;   /* End of class handling */
5566       }
5567 
5568     /* Even though any XCLASS list is now discarded, we must allow for
5569     its memory. */
5570 
5571     if (lengthptr != NULL)
5572       *lengthptr += (int)(class_uchardata - class_uchardata_base);
5573 #endif
5574 
5575     /* If there are no characters > 255, or they are all to be included or
5576     excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
5577     whole class was negated and whether there were negative specials such as \S
5578     (non-UCP) in the class. Then copy the 32-byte map into the code vector,
5579     negating it if necessary. */
5580 
5581     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
5582     if (lengthptr == NULL)    /* Save time in the pre-compile phase */
5583       {
5584       if (negate_class)
5585         for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5586       memcpy(code, classbits, 32);
5587       }
5588     code += 32 / sizeof(pcre_uchar);
5589 
5590     END_CLASS:
5591     break;
5592 
5593 
5594     /* ===================================================================*/
5595     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
5596     has been tested above. */
5597 
5598     case CHAR_LEFT_CURLY_BRACKET:
5599     if (!is_quantifier) goto NORMAL_CHAR;
5600     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
5601     if (*errorcodeptr != 0) goto FAILED;
5602     goto REPEAT;
5603 
5604     case CHAR_ASTERISK:
5605     repeat_min = 0;
5606     repeat_max = -1;
5607     goto REPEAT;
5608 
5609     case CHAR_PLUS:
5610     repeat_min = 1;
5611     repeat_max = -1;
5612     goto REPEAT;
5613 
5614     case CHAR_QUESTION_MARK:
5615     repeat_min = 0;
5616     repeat_max = 1;
5617 
5618     REPEAT:
5619     if (previous == NULL)
5620       {
5621       *errorcodeptr = ERR9;
5622       goto FAILED;
5623       }
5624 
5625     if (repeat_min == 0)
5626       {
5627       firstchar = zerofirstchar;    /* Adjust for zero repeat */
5628       firstcharflags = zerofirstcharflags;
5629       reqchar = zeroreqchar;        /* Ditto */
5630       reqcharflags = zeroreqcharflags;
5631       }
5632 
5633     /* Remember whether this is a variable length repeat */
5634 
5635     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
5636 
5637     op_type = 0;                    /* Default single-char op codes */
5638     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
5639 
5640     /* Save start of previous item, in case we have to move it up in order to
5641     insert something before it. */
5642 
5643     tempcode = previous;
5644 
5645     /* Before checking for a possessive quantifier, we must skip over
5646     whitespace and comments in extended mode because Perl allows white space at
5647     this point. */
5648 
5649     if ((options & PCRE_EXTENDED) != 0)
5650       {
5651       const pcre_uchar *p = ptr + 1;
5652       for (;;)
5653         {
5654         while (MAX_255(*p) && (cd->ctypes[*p] & ctype_space) != 0) p++;
5655         if (*p != CHAR_NUMBER_SIGN) break;
5656         p++;
5657         while (*p != CHAR_NULL)
5658           {
5659           if (IS_NEWLINE(p))         /* For non-fixed-length newline cases, */
5660             {                        /* IS_NEWLINE sets cd->nllen. */
5661             p += cd->nllen;
5662             break;
5663             }
5664           p++;
5665 #ifdef SUPPORT_UTF
5666           if (utf) FORWARDCHAR(p);
5667 #endif
5668           }           /* Loop for comment characters */
5669         }             /* Loop for multiple comments */
5670       ptr = p - 1;    /* Character before the next significant one. */
5671       }
5672 
5673     /* If the next character is '+', we have a possessive quantifier. This
5674     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
5675     If the next character is '?' this is a minimizing repeat, by default,
5676     but if PCRE_UNGREEDY is set, it works the other way round. We change the
5677     repeat type to the non-default. */
5678 
5679     if (ptr[1] == CHAR_PLUS)
5680       {
5681       repeat_type = 0;                  /* Force greedy */
5682       possessive_quantifier = TRUE;
5683       ptr++;
5684       }
5685     else if (ptr[1] == CHAR_QUESTION_MARK)
5686       {
5687       repeat_type = greedy_non_default;
5688       ptr++;
5689       }
5690     else repeat_type = greedy_default;
5691 
5692     /* If previous was a recursion call, wrap it in atomic brackets so that
5693     previous becomes the atomic group. All recursions were so wrapped in the
5694     past, but it no longer happens for non-repeated recursions. In fact, the
5695     repeated ones could be re-implemented independently so as not to need this,
5696     but for the moment we rely on the code for repeating groups. */
5697 
5698     if (*previous == OP_RECURSE)
5699       {
5700       memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
5701       *previous = OP_ONCE;
5702       PUT(previous, 1, 2 + 2*LINK_SIZE);
5703       previous[2 + 2*LINK_SIZE] = OP_KET;
5704       PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
5705       code += 2 + 2 * LINK_SIZE;
5706       length_prevgroup = 3 + 3*LINK_SIZE;
5707 
5708       /* When actually compiling, we need to check whether this was a forward
5709       reference, and if so, adjust the offset. */
5710 
5711       if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
5712         {
5713         int offset = GET(cd->hwm, -LINK_SIZE);
5714         if (offset == previous + 1 - cd->start_code)
5715           PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
5716         }
5717       }
5718 
5719     /* Now handle repetition for the different types of item. */
5720 
5721     /* If previous was a character or negated character match, abolish the item
5722     and generate a repeat item instead. If a char item has a minimum of more
5723     than one, ensure that it is set in reqchar - it might not be if a sequence
5724     such as x{3} is the first thing in a branch because the x will have gone
5725     into firstchar instead.  */
5726 
5727     if (*previous == OP_CHAR || *previous == OP_CHARI
5728         || *previous == OP_NOT || *previous == OP_NOTI)
5729       {
5730       switch (*previous)
5731         {
5732         default: /* Make compiler happy. */
5733         case OP_CHAR:  op_type = OP_STAR - OP_STAR; break;
5734         case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
5735         case OP_NOT:   op_type = OP_NOTSTAR - OP_STAR; break;
5736         case OP_NOTI:  op_type = OP_NOTSTARI - OP_STAR; break;
5737         }
5738 
5739       /* Deal with UTF characters that take up more than one character. It's
5740       easier to write this out separately than try to macrify it. Use c to
5741       hold the length of the character in bytes, plus UTF_LENGTH to flag that
5742       it's a length rather than a small character. */
5743 
5744 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5745       if (utf && NOT_FIRSTCHAR(code[-1]))
5746         {
5747         pcre_uchar *lastchar = code - 1;
5748         BACKCHAR(lastchar);
5749         c = (int)(code - lastchar);     /* Length of UTF-8 character */
5750         memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
5751         c |= UTF_LENGTH;                /* Flag c as a length */
5752         }
5753       else
5754 #endif /* SUPPORT_UTF */
5755 
5756       /* Handle the case of a single character - either with no UTF support, or
5757       with UTF disabled, or for a single character UTF character. */
5758         {
5759         c = code[-1];
5760         if (*previous <= OP_CHARI && repeat_min > 1)
5761           {
5762           reqchar = c;
5763           reqcharflags = req_caseopt | cd->req_varyopt;
5764           }
5765         }
5766 
5767       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
5768       }
5769 
5770     /* If previous was a character type match (\d or similar), abolish it and
5771     create a suitable repeat item. The code is shared with single-character
5772     repeats by setting op_type to add a suitable offset into repeat_type. Note
5773     that the Unicode property types will be present only when SUPPORT_UCP is
5774     defined, but we don't wrap the little bits of code here because it just
5775     makes it horribly messy. */
5776 
5777     else if (*previous < OP_EODN)
5778       {
5779       pcre_uchar *oldcode;
5780       int prop_type, prop_value;
5781       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
5782       c = *previous;
5783 
5784       OUTPUT_SINGLE_REPEAT:
5785       if (*previous == OP_PROP || *previous == OP_NOTPROP)
5786         {
5787         prop_type = previous[1];
5788         prop_value = previous[2];
5789         }
5790       else prop_type = prop_value = -1;
5791 
5792       oldcode = code;
5793       code = previous;                  /* Usually overwrite previous item */
5794 
5795       /* If the maximum is zero then the minimum must also be zero; Perl allows
5796       this case, so we do too - by simply omitting the item altogether. */
5797 
5798       if (repeat_max == 0) goto END_REPEAT;
5799 
5800       /* Combine the op_type with the repeat_type */
5801 
5802       repeat_type += op_type;
5803 
5804       /* A minimum of zero is handled either as the special case * or ?, or as
5805       an UPTO, with the maximum given. */
5806 
5807       if (repeat_min == 0)
5808         {
5809         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
5810           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
5811         else
5812           {
5813           *code++ = OP_UPTO + repeat_type;
5814           PUT2INC(code, 0, repeat_max);
5815           }
5816         }
5817 
5818       /* A repeat minimum of 1 is optimized into some special cases. If the
5819       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
5820       left in place and, if the maximum is greater than 1, we use OP_UPTO with
5821       one less than the maximum. */
5822 
5823       else if (repeat_min == 1)
5824         {
5825         if (repeat_max == -1)
5826           *code++ = OP_PLUS + repeat_type;
5827         else
5828           {
5829           code = oldcode;                 /* leave previous item in place */
5830           if (repeat_max == 1) goto END_REPEAT;
5831           *code++ = OP_UPTO + repeat_type;
5832           PUT2INC(code, 0, repeat_max - 1);
5833           }
5834         }
5835 
5836       /* The case {n,n} is just an EXACT, while the general case {n,m} is
5837       handled as an EXACT followed by an UPTO. */
5838 
5839       else
5840         {
5841         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
5842         PUT2INC(code, 0, repeat_min);
5843 
5844         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
5845         we have to insert the character for the previous code. For a repeated
5846         Unicode property match, there are two extra bytes that define the
5847         required property. In UTF-8 mode, long characters have their length in
5848         c, with the UTF_LENGTH bit as a flag. */
5849 
5850         if (repeat_max < 0)
5851           {
5852 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5853           if (utf && (c & UTF_LENGTH) != 0)
5854             {
5855             memcpy(code, utf_chars, IN_UCHARS(c & 7));
5856             code += c & 7;
5857             }
5858           else
5859 #endif
5860             {
5861             *code++ = c;
5862             if (prop_type >= 0)
5863               {
5864               *code++ = prop_type;
5865               *code++ = prop_value;
5866               }
5867             }
5868           *code++ = OP_STAR + repeat_type;
5869           }
5870 
5871         /* Else insert an UPTO if the max is greater than the min, again
5872         preceded by the character, for the previously inserted code. If the
5873         UPTO is just for 1 instance, we can use QUERY instead. */
5874 
5875         else if (repeat_max != repeat_min)
5876           {
5877 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5878           if (utf && (c & UTF_LENGTH) != 0)
5879             {
5880             memcpy(code, utf_chars, IN_UCHARS(c & 7));
5881             code += c & 7;
5882             }
5883           else
5884 #endif
5885           *code++ = c;
5886           if (prop_type >= 0)
5887             {
5888             *code++ = prop_type;
5889             *code++ = prop_value;
5890             }
5891           repeat_max -= repeat_min;
5892 
5893           if (repeat_max == 1)
5894             {
5895             *code++ = OP_QUERY + repeat_type;
5896             }
5897           else
5898             {
5899             *code++ = OP_UPTO + repeat_type;
5900             PUT2INC(code, 0, repeat_max);
5901             }
5902           }
5903         }
5904 
5905       /* The character or character type itself comes last in all cases. */
5906 
5907 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5908       if (utf && (c & UTF_LENGTH) != 0)
5909         {
5910         memcpy(code, utf_chars, IN_UCHARS(c & 7));
5911         code += c & 7;
5912         }
5913       else
5914 #endif
5915       *code++ = c;
5916 
5917       /* For a repeated Unicode property match, there are two extra bytes that
5918       define the required property. */
5919 
5920 #ifdef SUPPORT_UCP
5921       if (prop_type >= 0)
5922         {
5923         *code++ = prop_type;
5924         *code++ = prop_value;
5925         }
5926 #endif
5927       }
5928 
5929     /* If previous was a character class or a back reference, we put the repeat
5930     stuff after it, but just skip the item if the repeat was {0,0}. */
5931 
5932     else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
5933 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5934              *previous == OP_XCLASS ||
5935 #endif
5936              *previous == OP_REF   || *previous == OP_REFI ||
5937              *previous == OP_DNREF || *previous == OP_DNREFI)
5938       {
5939       if (repeat_max == 0)
5940         {
5941         code = previous;
5942         goto END_REPEAT;
5943         }
5944 
5945       if (repeat_min == 0 && repeat_max == -1)
5946         *code++ = OP_CRSTAR + repeat_type;
5947       else if (repeat_min == 1 && repeat_max == -1)
5948         *code++ = OP_CRPLUS + repeat_type;
5949       else if (repeat_min == 0 && repeat_max == 1)
5950         *code++ = OP_CRQUERY + repeat_type;
5951       else
5952         {
5953         *code++ = OP_CRRANGE + repeat_type;
5954         PUT2INC(code, 0, repeat_min);
5955         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
5956         PUT2INC(code, 0, repeat_max);
5957         }
5958       }
5959 
5960     /* If previous was a bracket group, we may have to replicate it in certain
5961     cases. Note that at this point we can encounter only the "basic" bracket
5962     opcodes such as BRA and CBRA, as this is the place where they get converted
5963     into the more special varieties such as BRAPOS and SBRA. A test for >=
5964     OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
5965     ASSERTBACK_NOT, ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND.
5966     Originally, PCRE did not allow repetition of assertions, but now it does,
5967     for Perl compatibility. */
5968 
5969     else if (*previous >= OP_ASSERT && *previous <= OP_COND)
5970       {
5971       register int i;
5972       int len = (int)(code - previous);
5973       size_t base_hwm_offset = item_hwm_offset;
5974       pcre_uchar *bralink = NULL;
5975       pcre_uchar *brazeroptr = NULL;
5976 
5977       /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
5978       we just ignore the repeat. */
5979 
5980       if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
5981         goto END_REPEAT;
5982 
5983       /* There is no sense in actually repeating assertions. The only potential
5984       use of repetition is in cases when the assertion is optional. Therefore,
5985       if the minimum is greater than zero, just ignore the repeat. If the
5986       maximum is not zero or one, set it to 1. */
5987 
5988       if (*previous < OP_ONCE)    /* Assertion */
5989         {
5990         if (repeat_min > 0) goto END_REPEAT;
5991         if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
5992         }
5993 
5994       /* The case of a zero minimum is special because of the need to stick
5995       OP_BRAZERO in front of it, and because the group appears once in the
5996       data, whereas in other cases it appears the minimum number of times. For
5997       this reason, it is simplest to treat this case separately, as otherwise
5998       the code gets far too messy. There are several special subcases when the
5999       minimum is zero. */
6000 
6001       if (repeat_min == 0)
6002         {
6003         /* If the maximum is also zero, we used to just omit the group from the
6004         output altogether, like this:
6005 
6006         ** if (repeat_max == 0)
6007         **   {
6008         **   code = previous;
6009         **   goto END_REPEAT;
6010         **   }
6011 
6012         However, that fails when a group or a subgroup within it is referenced
6013         as a subroutine from elsewhere in the pattern, so now we stick in
6014         OP_SKIPZERO in front of it so that it is skipped on execution. As we
6015         don't have a list of which groups are referenced, we cannot do this
6016         selectively.
6017 
6018         If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
6019         and do no more at this point. However, we do need to adjust any
6020         OP_RECURSE calls inside the group that refer to the group itself or any
6021         internal or forward referenced group, because the offset is from the
6022         start of the whole regex. Temporarily terminate the pattern while doing
6023         this. */
6024 
6025         if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
6026           {
6027           *code = OP_END;
6028           adjust_recurse(previous, 1, utf, cd, item_hwm_offset);
6029           memmove(previous + 1, previous, IN_UCHARS(len));
6030           code++;
6031           if (repeat_max == 0)
6032             {
6033             *previous++ = OP_SKIPZERO;
6034             goto END_REPEAT;
6035             }
6036           brazeroptr = previous;    /* Save for possessive optimizing */
6037           *previous++ = OP_BRAZERO + repeat_type;
6038           }
6039 
6040         /* If the maximum is greater than 1 and limited, we have to replicate
6041         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
6042         The first one has to be handled carefully because it's the original
6043         copy, which has to be moved up. The remainder can be handled by code
6044         that is common with the non-zero minimum case below. We have to
6045           adjust the value of repeat_max, since one less copy is required. Once
6046         again, we may have to adjust any OP_RECURSE calls inside the group. */
6047 
6048         else
6049           {
6050           int offset;
6051           *code = OP_END;
6052           adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, item_hwm_offset);
6053           memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
6054           code += 2 + LINK_SIZE;
6055           *previous++ = OP_BRAZERO + repeat_type;
6056           *previous++ = OP_BRA;
6057 
6058           /* We chain together the bracket offset fields that have to be
6059           filled in later when the ends of the brackets are reached. */
6060 
6061           offset = (bralink == NULL)? 0 : (int)(previous - bralink);
6062           bralink = previous;
6063           PUTINC(previous, 0, offset);
6064           }
6065 
6066         repeat_max--;
6067         }
6068 
6069       /* If the minimum is greater than zero, replicate the group as many
6070       times as necessary, and adjust the maximum to the number of subsequent
6071       copies that we need. If we set a first char from the group, and didn't
6072       set a required char, copy the latter from the former. If there are any
6073       forward reference subroutine calls in the group, there will be entries on
6074       the workspace list; replicate these with an appropriate increment. */
6075 
6076       else
6077         {
6078         if (repeat_min > 1)
6079           {
6080           /* In the pre-compile phase, we don't actually do the replication. We
6081           just adjust the length as if we had. Do some paranoid checks for
6082           potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
6083           integer type when available, otherwise double. */
6084 
6085           if (lengthptr != NULL)
6086             {
6087             int delta = (repeat_min - 1)*length_prevgroup;
6088             if ((INT64_OR_DOUBLE)(repeat_min - 1)*
6089                   (INT64_OR_DOUBLE)length_prevgroup >
6090                     (INT64_OR_DOUBLE)INT_MAX ||
6091                 OFLOW_MAX - *lengthptr < delta)
6092               {
6093               *errorcodeptr = ERR20;
6094               goto FAILED;
6095               }
6096             *lengthptr += delta;
6097             }
6098 
6099           /* This is compiling for real. If there is a set first byte for
6100           the group, and we have not yet set a "required byte", set it. Make
6101           sure there is enough workspace for copying forward references before
6102           doing the copy. */
6103 
6104           else
6105             {
6106             if (groupsetfirstchar && reqcharflags < 0)
6107               {
6108               reqchar = firstchar;
6109               reqcharflags = firstcharflags;
6110               }
6111 
6112             for (i = 1; i < repeat_min; i++)
6113               {
6114               pcre_uchar *hc;
6115               size_t this_hwm_offset = cd->hwm - cd->start_workspace;
6116               memcpy(code, previous, IN_UCHARS(len));
6117 
6118               while (cd->hwm > cd->start_workspace + cd->workspace_size -
6119                      WORK_SIZE_SAFETY_MARGIN -
6120                      (this_hwm_offset - base_hwm_offset))
6121                 {
6122                 *errorcodeptr = expand_workspace(cd);
6123                 if (*errorcodeptr != 0) goto FAILED;
6124                 }
6125 
6126               for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
6127                    hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
6128                    hc += LINK_SIZE)
6129                 {
6130                 PUT(cd->hwm, 0, GET(hc, 0) + len);
6131                 cd->hwm += LINK_SIZE;
6132                 }
6133               base_hwm_offset = this_hwm_offset;
6134               code += len;
6135               }
6136             }
6137           }
6138 
6139         if (repeat_max > 0) repeat_max -= repeat_min;
6140         }
6141 
6142       /* This code is common to both the zero and non-zero minimum cases. If
6143       the maximum is limited, it replicates the group in a nested fashion,
6144       remembering the bracket starts on a stack. In the case of a zero minimum,
6145       the first one was set up above. In all cases the repeat_max now specifies
6146       the number of additional copies needed. Again, we must remember to
6147       replicate entries on the forward reference list. */
6148 
6149       if (repeat_max >= 0)
6150         {
6151         /* In the pre-compile phase, we don't actually do the replication. We
6152         just adjust the length as if we had. For each repetition we must add 1
6153         to the length for BRAZERO and for all but the last repetition we must
6154         add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
6155         paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
6156         a 64-bit integer type when available, otherwise double. */
6157 
6158         if (lengthptr != NULL && repeat_max > 0)
6159           {
6160           int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
6161                       2 - 2*LINK_SIZE;   /* Last one doesn't nest */
6162           if ((INT64_OR_DOUBLE)repeat_max *
6163                 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
6164                   > (INT64_OR_DOUBLE)INT_MAX ||
6165               OFLOW_MAX - *lengthptr < delta)
6166             {
6167             *errorcodeptr = ERR20;
6168             goto FAILED;
6169             }
6170           *lengthptr += delta;
6171           }
6172 
6173         /* This is compiling for real */
6174 
6175         else for (i = repeat_max - 1; i >= 0; i--)
6176           {
6177           pcre_uchar *hc;
6178           size_t this_hwm_offset = cd->hwm - cd->start_workspace;
6179 
6180           *code++ = OP_BRAZERO + repeat_type;
6181 
6182           /* All but the final copy start a new nesting, maintaining the
6183           chain of brackets outstanding. */
6184 
6185           if (i != 0)
6186             {
6187             int offset;
6188             *code++ = OP_BRA;
6189             offset = (bralink == NULL)? 0 : (int)(code - bralink);
6190             bralink = code;
6191             PUTINC(code, 0, offset);
6192             }
6193 
6194           memcpy(code, previous, IN_UCHARS(len));
6195 
6196           /* Ensure there is enough workspace for forward references before
6197           copying them. */
6198 
6199           while (cd->hwm > cd->start_workspace + cd->workspace_size -
6200                  WORK_SIZE_SAFETY_MARGIN -
6201                  (this_hwm_offset - base_hwm_offset))
6202             {
6203             *errorcodeptr = expand_workspace(cd);
6204             if (*errorcodeptr != 0) goto FAILED;
6205             }
6206 
6207           for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
6208                hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
6209                hc += LINK_SIZE)
6210             {
6211             PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
6212             cd->hwm += LINK_SIZE;
6213             }
6214           base_hwm_offset = this_hwm_offset;
6215           code += len;
6216           }
6217 
6218         /* Now chain through the pending brackets, and fill in their length
6219         fields (which are holding the chain links pro tem). */
6220 
6221         while (bralink != NULL)
6222           {
6223           int oldlinkoffset;
6224           int offset = (int)(code - bralink + 1);
6225           pcre_uchar *bra = code - offset;
6226           oldlinkoffset = GET(bra, 1);
6227           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
6228           *code++ = OP_KET;
6229           PUTINC(code, 0, offset);
6230           PUT(bra, 1, offset);
6231           }
6232         }
6233 
6234       /* If the maximum is unlimited, set a repeater in the final copy. For
6235       ONCE brackets, that's all we need to do. However, possessively repeated
6236       ONCE brackets can be converted into non-capturing brackets, as the
6237       behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
6238       deal with possessive ONCEs specially.
6239 
6240       Otherwise, when we are doing the actual compile phase, check to see
6241       whether this group is one that could match an empty string. If so,
6242       convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
6243       that runtime checking can be done. [This check is also applied to ONCE
6244       groups at runtime, but in a different way.]
6245 
6246       Then, if the quantifier was possessive and the bracket is not a
6247       conditional, we convert the BRA code to the POS form, and the KET code to
6248       KETRPOS. (It turns out to be convenient at runtime to detect this kind of
6249       subpattern at both the start and at the end.) The use of special opcodes
6250       makes it possible to reduce greatly the stack usage in pcre_exec(). If
6251       the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
6252 
6253       Then, if the minimum number of matches is 1 or 0, cancel the possessive
6254       flag so that the default action below, of wrapping everything inside
6255       atomic brackets, does not happen. When the minimum is greater than 1,
6256       there will be earlier copies of the group, and so we still have to wrap
6257       the whole thing. */
6258 
6259       else
6260         {
6261         pcre_uchar *ketcode = code - 1 - LINK_SIZE;
6262         pcre_uchar *bracode = ketcode - GET(ketcode, 1);
6263 
6264         /* Convert possessive ONCE brackets to non-capturing */
6265 
6266         if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
6267             possessive_quantifier) *bracode = OP_BRA;
6268 
6269         /* For non-possessive ONCE brackets, all we need to do is to
6270         set the KET. */
6271 
6272         if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
6273           *ketcode = OP_KETRMAX + repeat_type;
6274 
6275         /* Handle non-ONCE brackets and possessive ONCEs (which have been
6276         converted to non-capturing above). */
6277 
6278         else
6279           {
6280           /* In the compile phase, check for empty string matching. */
6281 
6282           if (lengthptr == NULL)
6283             {
6284             pcre_uchar *scode = bracode;
6285             do
6286               {
6287               if (could_be_empty_branch(scode, ketcode, utf, cd, NULL))
6288                 {
6289                 *bracode += OP_SBRA - OP_BRA;
6290                 break;
6291                 }
6292               scode += GET(scode, 1);
6293               }
6294             while (*scode == OP_ALT);
6295             }
6296 
6297           /* A conditional group with only one branch has an implicit empty
6298           alternative branch. */
6299 
6300           if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
6301             *bracode = OP_SCOND;
6302 
6303           /* Handle possessive quantifiers. */
6304 
6305           if (possessive_quantifier)
6306             {
6307             /* For COND brackets, we wrap the whole thing in a possessively
6308             repeated non-capturing bracket, because we have not invented POS
6309             versions of the COND opcodes. Because we are moving code along, we
6310             must ensure that any pending recursive references are updated. */
6311 
6312             if (*bracode == OP_COND || *bracode == OP_SCOND)
6313               {
6314               int nlen = (int)(code - bracode);
6315               *code = OP_END;
6316               adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6317               memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
6318               code += 1 + LINK_SIZE;
6319               nlen += 1 + LINK_SIZE;
6320               *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
6321               *code++ = OP_KETRPOS;
6322               PUTINC(code, 0, nlen);
6323               PUT(bracode, 1, nlen);
6324               }
6325 
6326             /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
6327 
6328             else
6329               {
6330               *bracode += 1;              /* Switch to xxxPOS opcodes */
6331               *ketcode = OP_KETRPOS;
6332               }
6333 
6334             /* If the minimum is zero, mark it as possessive, then unset the
6335             possessive flag when the minimum is 0 or 1. */
6336 
6337             if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
6338             if (repeat_min < 2) possessive_quantifier = FALSE;
6339             }
6340 
6341           /* Non-possessive quantifier */
6342 
6343           else *ketcode = OP_KETRMAX + repeat_type;
6344           }
6345         }
6346       }
6347 
6348     /* If previous is OP_FAIL, it was generated by an empty class [] in
6349     JavaScript mode. The other ways in which OP_FAIL can be generated, that is
6350     by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
6351     error above. We can just ignore the repeat in JS case. */
6352 
6353     else if (*previous == OP_FAIL) goto END_REPEAT;
6354 
6355     /* Else there's some kind of shambles */
6356 
6357     else
6358       {
6359       *errorcodeptr = ERR11;
6360       goto FAILED;
6361       }
6362 
6363     /* If the character following a repeat is '+', possessive_quantifier is
6364     TRUE. For some opcodes, there are special alternative opcodes for this
6365     case. For anything else, we wrap the entire repeated item inside OP_ONCE
6366     brackets. Logically, the '+' notation is just syntactic sugar, taken from
6367     Sun's Java package, but the special opcodes can optimize it.
6368 
6369     Some (but not all) possessively repeated subpatterns have already been
6370     completely handled in the code just above. For them, possessive_quantifier
6371     is always FALSE at this stage. Note that the repeated item starts at
6372     tempcode, not at previous, which might be the first part of a string whose
6373     (former) last char we repeated. */
6374 
6375     if (possessive_quantifier)
6376       {
6377       int len;
6378 
6379       /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
6380       However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
6381       {5,}, or {5,10}). We skip over an EXACT item; if the length of what
6382       remains is greater than zero, there's a further opcode that can be
6383       handled. If not, do nothing, leaving the EXACT alone. */
6384 
6385       switch(*tempcode)
6386         {
6387         case OP_TYPEEXACT:
6388         tempcode += PRIV(OP_lengths)[*tempcode] +
6389           ((tempcode[1 + IMM2_SIZE] == OP_PROP
6390           || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
6391         break;
6392 
6393         /* CHAR opcodes are used for exacts whose count is 1. */
6394 
6395         case OP_CHAR:
6396         case OP_CHARI:
6397         case OP_NOT:
6398         case OP_NOTI:
6399         case OP_EXACT:
6400         case OP_EXACTI:
6401         case OP_NOTEXACT:
6402         case OP_NOTEXACTI:
6403         tempcode += PRIV(OP_lengths)[*tempcode];
6404 #ifdef SUPPORT_UTF
6405         if (utf && HAS_EXTRALEN(tempcode[-1]))
6406           tempcode += GET_EXTRALEN(tempcode[-1]);
6407 #endif
6408         break;
6409 
6410         /* For the class opcodes, the repeat operator appears at the end;
6411         adjust tempcode to point to it. */
6412 
6413         case OP_CLASS:
6414         case OP_NCLASS:
6415         tempcode += 1 + 32/sizeof(pcre_uchar);
6416         break;
6417 
6418 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6419         case OP_XCLASS:
6420         tempcode += GET(tempcode, 1);
6421         break;
6422 #endif
6423         }
6424 
6425       /* If tempcode is equal to code (which points to the end of the repeated
6426       item), it means we have skipped an EXACT item but there is no following
6427       QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
6428       all other cases, tempcode will be pointing to the repeat opcode, and will
6429       be less than code, so the value of len will be greater than 0. */
6430 
6431       len = (int)(code - tempcode);
6432       if (len > 0)
6433         {
6434         unsigned int repcode = *tempcode;
6435 
6436         /* There is a table for possessifying opcodes, all of which are less
6437         than OP_CALLOUT. A zero entry means there is no possessified version.
6438         */
6439 
6440         if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
6441           *tempcode = opcode_possessify[repcode];
6442 
6443         /* For opcodes without a special possessified version, wrap the item in
6444         ONCE brackets. Because we are moving code along, we must ensure that any
6445         pending recursive references are updated. */
6446 
6447         else
6448           {
6449           *code = OP_END;
6450           adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6451           memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6452           code += 1 + LINK_SIZE;
6453           len += 1 + LINK_SIZE;
6454           tempcode[0] = OP_ONCE;
6455           *code++ = OP_KET;
6456           PUTINC(code, 0, len);
6457           PUT(tempcode, 1, len);
6458           }
6459         }
6460 
6461 #ifdef NEVER
6462       if (len > 0) switch (*tempcode)
6463         {
6464         case OP_STAR:  *tempcode = OP_POSSTAR; break;
6465         case OP_PLUS:  *tempcode = OP_POSPLUS; break;
6466         case OP_QUERY: *tempcode = OP_POSQUERY; break;
6467         case OP_UPTO:  *tempcode = OP_POSUPTO; break;
6468 
6469         case OP_STARI:  *tempcode = OP_POSSTARI; break;
6470         case OP_PLUSI:  *tempcode = OP_POSPLUSI; break;
6471         case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
6472         case OP_UPTOI:  *tempcode = OP_POSUPTOI; break;
6473 
6474         case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
6475         case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
6476         case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
6477         case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
6478 
6479         case OP_NOTSTARI:  *tempcode = OP_NOTPOSSTARI; break;
6480         case OP_NOTPLUSI:  *tempcode = OP_NOTPOSPLUSI; break;
6481         case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
6482         case OP_NOTUPTOI:  *tempcode = OP_NOTPOSUPTOI; break;
6483 
6484         case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
6485         case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
6486         case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
6487         case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
6488 
6489         case OP_CRSTAR:   *tempcode = OP_CRPOSSTAR; break;
6490         case OP_CRPLUS:   *tempcode = OP_CRPOSPLUS; break;
6491         case OP_CRQUERY:  *tempcode = OP_CRPOSQUERY; break;
6492         case OP_CRRANGE:  *tempcode = OP_CRPOSRANGE; break;
6493 
6494         /* Because we are moving code along, we must ensure that any
6495         pending recursive references are updated. */
6496 
6497         default:
6498         *code = OP_END;
6499         adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6500         memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6501         code += 1 + LINK_SIZE;
6502         len += 1 + LINK_SIZE;
6503         tempcode[0] = OP_ONCE;
6504         *code++ = OP_KET;
6505         PUTINC(code, 0, len);
6506         PUT(tempcode, 1, len);
6507         break;
6508         }
6509 #endif
6510       }
6511 
6512     /* In all cases we no longer have a previous item. We also set the
6513     "follows varying string" flag for subsequently encountered reqchars if
6514     it isn't already set and we have just passed a varying length item. */
6515 
6516     END_REPEAT:
6517     previous = NULL;
6518     cd->req_varyopt |= reqvary;
6519     break;
6520 
6521 
6522     /* ===================================================================*/
6523     /* Start of nested parenthesized sub-expression, or comment or lookahead or
6524     lookbehind or option setting or condition or all the other extended
6525     parenthesis forms.  */
6526 
6527     case CHAR_LEFT_PARENTHESIS:
6528     ptr++;
6529 
6530     /* First deal with comments. Putting this code right at the start ensures
6531     that comments have no bad side effects. */
6532 
6533     if (ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
6534       {
6535       ptr += 2;
6536       while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6537       if (*ptr == CHAR_NULL)
6538         {
6539         *errorcodeptr = ERR18;
6540         goto FAILED;
6541         }
6542       continue;
6543       }
6544 
6545     /* Now deal with various "verbs" that can be introduced by '*'. */
6546 
6547     if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
6548          || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
6549       {
6550       int i, namelen;
6551       int arglen = 0;
6552       const char *vn = verbnames;
6553       const pcre_uchar *name = ptr + 1;
6554       const pcre_uchar *arg = NULL;
6555       previous = NULL;
6556       ptr++;
6557       while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
6558       namelen = (int)(ptr - name);
6559 
6560       /* It appears that Perl allows any characters whatsoever, other than
6561       a closing parenthesis, to appear in arguments, so we no longer insist on
6562       letters, digits, and underscores. */
6563 
6564       if (*ptr == CHAR_COLON)
6565         {
6566         arg = ++ptr;
6567         while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6568         arglen = (int)(ptr - arg);
6569         if ((unsigned int)arglen > MAX_MARK)
6570           {
6571           *errorcodeptr = ERR75;
6572           goto FAILED;
6573           }
6574         }
6575 
6576       if (*ptr != CHAR_RIGHT_PARENTHESIS)
6577         {
6578         *errorcodeptr = ERR60;
6579         goto FAILED;
6580         }
6581 
6582       /* Scan the table of verb names */
6583 
6584       for (i = 0; i < verbcount; i++)
6585         {
6586         if (namelen == verbs[i].len &&
6587             STRNCMP_UC_C8(name, vn, namelen) == 0)
6588           {
6589           int setverb;
6590 
6591           /* Check for open captures before ACCEPT and convert it to
6592           ASSERT_ACCEPT if in an assertion. */
6593 
6594           if (verbs[i].op == OP_ACCEPT)
6595             {
6596             open_capitem *oc;
6597             if (arglen != 0)
6598               {
6599               *errorcodeptr = ERR59;
6600               goto FAILED;
6601               }
6602             cd->had_accept = TRUE;
6603             for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6604               {
6605               *code++ = OP_CLOSE;
6606               PUT2INC(code, 0, oc->number);
6607               }
6608             setverb = *code++ =
6609               (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6610 
6611             /* Do not set firstchar after *ACCEPT */
6612             if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
6613             }
6614 
6615           /* Handle other cases with/without an argument */
6616 
6617           else if (arglen == 0)
6618             {
6619             if (verbs[i].op < 0)   /* Argument is mandatory */
6620               {
6621               *errorcodeptr = ERR66;
6622               goto FAILED;
6623               }
6624             setverb = *code++ = verbs[i].op;
6625             }
6626 
6627           else
6628             {
6629             if (verbs[i].op_arg < 0)   /* Argument is forbidden */
6630               {
6631               *errorcodeptr = ERR59;
6632               goto FAILED;
6633               }
6634             setverb = *code++ = verbs[i].op_arg;
6635             if (lengthptr != NULL)    /* In pass 1 just add in the length */
6636               {                       /* to avoid potential workspace */
6637               *lengthptr += arglen;   /* overflow. */
6638               *code++ = 0;
6639               }
6640             else
6641               {
6642               *code++ = arglen;
6643               memcpy(code, arg, IN_UCHARS(arglen));
6644               code += arglen;
6645               }
6646             *code++ = 0;
6647             }
6648 
6649           switch (setverb)
6650             {
6651             case OP_THEN:
6652             case OP_THEN_ARG:
6653             cd->external_flags |= PCRE_HASTHEN;
6654             break;
6655 
6656             case OP_PRUNE:
6657             case OP_PRUNE_ARG:
6658             case OP_SKIP:
6659             case OP_SKIP_ARG:
6660             cd->had_pruneorskip = TRUE;
6661             break;
6662             }
6663 
6664           break;  /* Found verb, exit loop */
6665           }
6666 
6667         vn += verbs[i].len + 1;
6668         }
6669 
6670       if (i < verbcount) continue;    /* Successfully handled a verb */
6671       *errorcodeptr = ERR60;          /* Verb not recognized */
6672       goto FAILED;
6673       }
6674 
6675     /* Initialize for "real" parentheses */
6676 
6677     newoptions = options;
6678     skipbytes = 0;
6679     bravalue = OP_CBRA;
6680     item_hwm_offset = cd->hwm - cd->start_workspace;
6681     reset_bracount = FALSE;
6682 
6683     /* Deal with the extended parentheses; all are introduced by '?', and the
6684     appearance of any of them means that this is not a capturing group. */
6685 
6686     if (*ptr == CHAR_QUESTION_MARK)
6687       {
6688       int i, set, unset, namelen;
6689       int *optset;
6690       const pcre_uchar *name;
6691       pcre_uchar *slot;
6692 
6693       switch (*(++ptr))
6694         {
6695         /* ------------------------------------------------------------ */
6696         case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
6697         reset_bracount = TRUE;
6698         cd->dupgroups = TRUE;     /* Record (?| encountered */
6699         /* Fall through */
6700 
6701         /* ------------------------------------------------------------ */
6702         case CHAR_COLON:          /* Non-capturing bracket */
6703         bravalue = OP_BRA;
6704         ptr++;
6705         break;
6706 
6707 
6708         /* ------------------------------------------------------------ */
6709         case CHAR_LEFT_PARENTHESIS:
6710         bravalue = OP_COND;       /* Conditional group */
6711         tempptr = ptr;
6712 
6713         /* A condition can be an assertion, a number (referring to a numbered
6714         group's having been set), a name (referring to a named group), or 'R',
6715         referring to recursion. R<digits> and R&name are also permitted for
6716         recursion tests.
6717 
6718         There are ways of testing a named group: (?(name)) is used by Python;
6719         Perl 5.10 onwards uses (?(<name>) or (?('name')).
6720 
6721         There is one unfortunate ambiguity, caused by history. 'R' can be the
6722         recursive thing or the name 'R' (and similarly for 'R' followed by
6723         digits). We look for a name first; if not found, we try the other case.
6724 
6725         For compatibility with auto-callouts, we allow a callout to be
6726         specified before a condition that is an assertion. First, check for the
6727         syntax of a callout; if found, adjust the temporary pointer that is
6728         used to check for an assertion condition. That's all that is needed! */
6729 
6730         if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C)
6731           {
6732           for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break;
6733           if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
6734             tempptr += i + 1;
6735           }
6736 
6737         /* For conditions that are assertions, check the syntax, and then exit
6738         the switch. This will take control down to where bracketed groups,
6739         including assertions, are processed. */
6740 
6741         if (tempptr[1] == CHAR_QUESTION_MARK &&
6742               (tempptr[2] == CHAR_EQUALS_SIGN ||
6743                tempptr[2] == CHAR_EXCLAMATION_MARK ||
6744                  (tempptr[2] == CHAR_LESS_THAN_SIGN &&
6745                    (tempptr[3] == CHAR_EQUALS_SIGN ||
6746                     tempptr[3] == CHAR_EXCLAMATION_MARK))))
6747           {
6748           cd->iscondassert = TRUE;
6749           break;
6750           }
6751 
6752         /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6753         need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6754 
6755         code[1+LINK_SIZE] = OP_CREF;
6756         skipbytes = 1+IMM2_SIZE;
6757         refsign = -1;     /* => not a number */
6758         namelen = -1;     /* => not a name; must set to avoid warning */
6759         name = NULL;      /* Always set to avoid warning */
6760         recno = 0;        /* Always set to avoid warning */
6761 
6762         /* Check for a test for recursion in a named group. */
6763 
6764         ptr++;
6765         if (*ptr == CHAR_R && ptr[1] == CHAR_AMPERSAND)
6766           {
6767           terminator = -1;
6768           ptr += 2;
6769           code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
6770           }
6771 
6772         /* Check for a test for a named group's having been set, using the Perl
6773         syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6774         syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). */
6775 
6776         else if (*ptr == CHAR_LESS_THAN_SIGN)
6777           {
6778           terminator = CHAR_GREATER_THAN_SIGN;
6779           ptr++;
6780           }
6781         else if (*ptr == CHAR_APOSTROPHE)
6782           {
6783           terminator = CHAR_APOSTROPHE;
6784           ptr++;
6785           }
6786         else
6787           {
6788           terminator = CHAR_NULL;
6789           if (*ptr == CHAR_MINUS || *ptr == CHAR_PLUS) refsign = *ptr++;
6790             else if (IS_DIGIT(*ptr)) refsign = 0;
6791           }
6792 
6793         /* Handle a number */
6794 
6795         if (refsign >= 0)
6796           {
6797           while (IS_DIGIT(*ptr))
6798             {
6799             if (recno > INT_MAX / 10 - 1)  /* Integer overflow */
6800               {
6801               while (IS_DIGIT(*ptr)) ptr++;
6802               *errorcodeptr = ERR61;
6803               goto FAILED;
6804               }
6805             recno = recno * 10 + (int)(*ptr - CHAR_0);
6806             ptr++;
6807             }
6808           }
6809 
6810         /* Otherwise we expect to read a name; anything else is an error. When
6811         a name is one of a number of duplicates, a different opcode is used and
6812         it needs more memory. Unfortunately we cannot tell whether a name is a
6813         duplicate in the first pass, so we have to allow for more memory. */
6814 
6815         else
6816           {
6817           if (IS_DIGIT(*ptr))
6818             {
6819             *errorcodeptr = ERR84;
6820             goto FAILED;
6821             }
6822           if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_word) == 0)
6823             {
6824             *errorcodeptr = ERR28;   /* Assertion expected */
6825             goto FAILED;
6826             }
6827           name = ptr++;
6828           while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
6829             {
6830             ptr++;
6831             }
6832           namelen = (int)(ptr - name);
6833           if (lengthptr != NULL) skipbytes += IMM2_SIZE;
6834           }
6835 
6836         /* Check the terminator */
6837 
6838         if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
6839             *ptr++ != CHAR_RIGHT_PARENTHESIS)
6840           {
6841           ptr--;                  /* Error offset */
6842           *errorcodeptr = ERR26;  /* Malformed number or name */
6843           goto FAILED;
6844           }
6845 
6846         /* Do no further checking in the pre-compile phase. */
6847 
6848         if (lengthptr != NULL) break;
6849 
6850         /* In the real compile we do the work of looking for the actual
6851         reference. If refsign is not negative, it means we have a number in
6852         recno. */
6853 
6854         if (refsign >= 0)
6855           {
6856           if (recno <= 0)
6857             {
6858             *errorcodeptr = ERR35;
6859             goto FAILED;
6860             }
6861           if (refsign != 0) recno = (refsign == CHAR_MINUS)?
6862             cd->bracount - recno + 1 : recno + cd->bracount;
6863           if (recno <= 0 || recno > cd->final_bracount)
6864             {
6865             *errorcodeptr = ERR15;
6866             goto FAILED;
6867             }
6868           PUT2(code, 2+LINK_SIZE, recno);
6869           if (recno > cd->top_backref) cd->top_backref = recno;
6870           break;
6871           }
6872 
6873         /* Otherwise look for the name. */
6874 
6875         slot = cd->name_table;
6876         for (i = 0; i < cd->names_found; i++)
6877           {
6878           if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break;
6879           slot += cd->name_entry_size;
6880           }
6881 
6882         /* Found the named subpattern. If the name is duplicated, add one to
6883         the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6884         appropriate data values. Otherwise, just insert the unique subpattern
6885         number. */
6886 
6887         if (i < cd->names_found)
6888           {
6889           int offset = i++;
6890           int count = 1;
6891           recno = GET2(slot, 0);   /* Number from first found */
6892           if (recno > cd->top_backref) cd->top_backref = recno;
6893           for (; i < cd->names_found; i++)
6894             {
6895             slot += cd->name_entry_size;
6896             if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0 ||
6897               (slot+IMM2_SIZE)[namelen] != 0) break;
6898             count++;
6899             }
6900 
6901           if (count > 1)
6902             {
6903             PUT2(code, 2+LINK_SIZE, offset);
6904             PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6905             skipbytes += IMM2_SIZE;
6906             code[1+LINK_SIZE]++;
6907             }
6908           else  /* Not a duplicated name */
6909             {
6910             PUT2(code, 2+LINK_SIZE, recno);
6911             }
6912           }
6913 
6914         /* If terminator == CHAR_NULL it means that the name followed directly
6915         after the opening parenthesis [e.g. (?(abc)...] and in this case there
6916         are some further alternatives to try. For the cases where terminator !=
6917         CHAR_NULL [things like (?(<name>... or (?('name')... or (?(R&name)... ]
6918         we have now checked all the possibilities, so give an error. */
6919 
6920         else if (terminator != CHAR_NULL)
6921           {
6922           *errorcodeptr = ERR15;
6923           goto FAILED;
6924           }
6925 
6926         /* Check for (?(R) for recursion. Allow digits after R to specify a
6927         specific group number. */
6928 
6929         else if (*name == CHAR_R)
6930           {
6931           recno = 0;
6932           for (i = 1; i < namelen; i++)
6933             {
6934             if (!IS_DIGIT(name[i]))
6935               {
6936               *errorcodeptr = ERR15;
6937               goto FAILED;
6938               }
6939             if (recno > INT_MAX / 10 - 1)   /* Integer overflow */
6940               {
6941               *errorcodeptr = ERR61;
6942               goto FAILED;
6943               }
6944             recno = recno * 10 + name[i] - CHAR_0;
6945             }
6946           if (recno == 0) recno = RREF_ANY;
6947           code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
6948           PUT2(code, 2+LINK_SIZE, recno);
6949           }
6950 
6951         /* Similarly, check for the (?(DEFINE) "condition", which is always
6952         false. */
6953 
6954         else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
6955           {
6956           code[1+LINK_SIZE] = OP_DEF;
6957           skipbytes = 1;
6958           }
6959 
6960         /* Reference to an unidentified subpattern. */
6961 
6962         else
6963           {
6964           *errorcodeptr = ERR15;
6965           goto FAILED;
6966           }
6967         break;
6968 
6969 
6970         /* ------------------------------------------------------------ */
6971         case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
6972         bravalue = OP_ASSERT;
6973         cd->assert_depth += 1;
6974         ptr++;
6975         break;
6976 
6977         /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6978         thing to do, but Perl allows all assertions to be quantified, and when
6979         they contain capturing parentheses there may be a potential use for
6980         this feature. Not that that applies to a quantified (?!) but we allow
6981         it for uniformity. */
6982 
6983         /* ------------------------------------------------------------ */
6984         case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
6985         ptr++;
6986         if (*ptr == CHAR_RIGHT_PARENTHESIS && ptr[1] != CHAR_ASTERISK &&
6987              ptr[1] != CHAR_PLUS && ptr[1] != CHAR_QUESTION_MARK &&
6988             (ptr[1] != CHAR_LEFT_CURLY_BRACKET || !is_counted_repeat(ptr+2)))
6989           {
6990           *code++ = OP_FAIL;
6991           previous = NULL;
6992           continue;
6993           }
6994         bravalue = OP_ASSERT_NOT;
6995         cd->assert_depth += 1;
6996         break;
6997 
6998 
6999         /* ------------------------------------------------------------ */
7000         case CHAR_LESS_THAN_SIGN:              /* Lookbehind or named define */
7001         switch (ptr[1])
7002           {
7003           case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
7004           bravalue = OP_ASSERTBACK;
7005           cd->assert_depth += 1;
7006           ptr += 2;
7007           break;
7008 
7009           case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
7010           bravalue = OP_ASSERTBACK_NOT;
7011           cd->assert_depth += 1;
7012           ptr += 2;
7013           break;
7014 
7015           default:                /* Could be name define, else bad */
7016           if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
7017             goto DEFINE_NAME;
7018           ptr++;                  /* Correct offset for error */
7019           *errorcodeptr = ERR24;
7020           goto FAILED;
7021           }
7022         break;
7023 
7024 
7025         /* ------------------------------------------------------------ */
7026         case CHAR_GREATER_THAN_SIGN:           /* One-time brackets */
7027         bravalue = OP_ONCE;
7028         ptr++;
7029         break;
7030 
7031 
7032         /* ------------------------------------------------------------ */
7033         case CHAR_C:                 /* Callout - may be followed by digits; */
7034         previous_callout = code;     /* Save for later completion */
7035         after_manual_callout = 1;    /* Skip one item before completing */
7036         *code++ = OP_CALLOUT;
7037           {
7038           int n = 0;
7039           ptr++;
7040           while(IS_DIGIT(*ptr))
7041             n = n * 10 + *ptr++ - CHAR_0;
7042           if (*ptr != CHAR_RIGHT_PARENTHESIS)
7043             {
7044             *errorcodeptr = ERR39;
7045             goto FAILED;
7046             }
7047           if (n > 255)
7048             {
7049             *errorcodeptr = ERR38;
7050             goto FAILED;
7051             }
7052           *code++ = n;
7053           PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
7054           PUT(code, LINK_SIZE, 0);                          /* Default length */
7055           code += 2 * LINK_SIZE;
7056           }
7057         previous = NULL;
7058         continue;
7059 
7060 
7061         /* ------------------------------------------------------------ */
7062         case CHAR_P:              /* Python-style named subpattern handling */
7063         if (*(++ptr) == CHAR_EQUALS_SIGN ||
7064             *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
7065           {
7066           is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
7067           terminator = CHAR_RIGHT_PARENTHESIS;
7068           goto NAMED_REF_OR_RECURSE;
7069           }
7070         else if (*ptr != CHAR_LESS_THAN_SIGN)  /* Test for Python-style defn */
7071           {
7072           *errorcodeptr = ERR41;
7073           goto FAILED;
7074           }
7075         /* Fall through to handle (?P< as (?< is handled */
7076 
7077 
7078         /* ------------------------------------------------------------ */
7079         DEFINE_NAME:    /* Come here from (?< handling */
7080         case CHAR_APOSTROPHE:
7081         terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
7082           CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7083         name = ++ptr;
7084         if (IS_DIGIT(*ptr))
7085           {
7086           *errorcodeptr = ERR84;   /* Group name must start with non-digit */
7087           goto FAILED;
7088           }
7089         while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
7090         namelen = (int)(ptr - name);
7091 
7092         /* In the pre-compile phase, do a syntax check, remember the longest
7093         name, and then remember the group in a vector, expanding it if
7094         necessary. Duplicates for the same number are skipped; other duplicates
7095         are checked for validity. In the actual compile, there is nothing to
7096         do. */
7097 
7098         if (lengthptr != NULL)
7099           {
7100           named_group *ng;
7101           pcre_uint32 number = cd->bracount + 1;
7102 
7103           if (*ptr != (pcre_uchar)terminator)
7104             {
7105             *errorcodeptr = ERR42;
7106             goto FAILED;
7107             }
7108 
7109           if (cd->names_found >= MAX_NAME_COUNT)
7110             {
7111             *errorcodeptr = ERR49;
7112             goto FAILED;
7113             }
7114 
7115           if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
7116             {
7117             cd->name_entry_size = namelen + IMM2_SIZE + 1;
7118             if (namelen > MAX_NAME_SIZE)
7119               {
7120               *errorcodeptr = ERR48;
7121               goto FAILED;
7122               }
7123             }
7124 
7125           /* Scan the list to check for duplicates. For duplicate names, if the
7126           number is the same, break the loop, which causes the name to be
7127           discarded; otherwise, if DUPNAMES is not set, give an error.
7128           If it is set, allow the name with a different number, but continue
7129           scanning in case this is a duplicate with the same number. For
7130           non-duplicate names, give an error if the number is duplicated. */
7131 
7132           ng = cd->named_groups;
7133           for (i = 0; i < cd->names_found; i++, ng++)
7134             {
7135             if (namelen == ng->length &&
7136                 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
7137               {
7138               if (ng->number == number) break;
7139               if ((options & PCRE_DUPNAMES) == 0)
7140                 {
7141                 *errorcodeptr = ERR43;
7142                 goto FAILED;
7143                 }
7144               cd->dupnames = TRUE;  /* Duplicate names exist */
7145               }
7146             else if (ng->number == number)
7147               {
7148               *errorcodeptr = ERR65;
7149               goto FAILED;
7150               }
7151             }
7152 
7153           if (i >= cd->names_found)     /* Not a duplicate with same number */
7154             {
7155             /* Increase the list size if necessary */
7156 
7157             if (cd->names_found >= cd->named_group_list_size)
7158               {
7159               int newsize = cd->named_group_list_size * 2;
7160               named_group *newspace = (PUBL(malloc))
7161                 (newsize * sizeof(named_group));
7162 
7163               if (newspace == NULL)
7164                 {
7165                 *errorcodeptr = ERR21;
7166                 goto FAILED;
7167                 }
7168 
7169               memcpy(newspace, cd->named_groups,
7170                 cd->named_group_list_size * sizeof(named_group));
7171               if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
7172                 (PUBL(free))((void *)cd->named_groups);
7173               cd->named_groups = newspace;
7174               cd->named_group_list_size = newsize;
7175               }
7176 
7177             cd->named_groups[cd->names_found].name = name;
7178             cd->named_groups[cd->names_found].length = namelen;
7179             cd->named_groups[cd->names_found].number = number;
7180             cd->names_found++;
7181             }
7182           }
7183 
7184         ptr++;                    /* Move past > or ' in both passes. */
7185         goto NUMBERED_GROUP;
7186 
7187 
7188         /* ------------------------------------------------------------ */
7189         case CHAR_AMPERSAND:            /* Perl recursion/subroutine syntax */
7190         terminator = CHAR_RIGHT_PARENTHESIS;
7191         is_recurse = TRUE;
7192         /* Fall through */
7193 
7194         /* We come here from the Python syntax above that handles both
7195         references (?P=name) and recursion (?P>name), as well as falling
7196         through from the Perl recursion syntax (?&name). We also come here from
7197         the Perl \k<name> or \k'name' back reference syntax and the \k{name}
7198         .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
7199 
7200         NAMED_REF_OR_RECURSE:
7201         name = ++ptr;
7202         if (IS_DIGIT(*ptr))
7203           {
7204           *errorcodeptr = ERR84;   /* Group name must start with non-digit */
7205           goto FAILED;
7206           }
7207         while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
7208         namelen = (int)(ptr - name);
7209 
7210         /* In the pre-compile phase, do a syntax check. We used to just set
7211         a dummy reference number, because it was not used in the first pass.
7212         However, with the change of recursive back references to be atomic,
7213         we have to look for the number so that this state can be identified, as
7214         otherwise the incorrect length is computed. If it's not a backwards
7215         reference, the dummy number will do. */
7216 
7217         if (lengthptr != NULL)
7218           {
7219           named_group *ng;
7220           recno = 0;
7221 
7222           if (namelen == 0)
7223             {
7224             *errorcodeptr = ERR62;
7225             goto FAILED;
7226             }
7227           if (*ptr != (pcre_uchar)terminator)
7228             {
7229             *errorcodeptr = ERR42;
7230             goto FAILED;
7231             }
7232           if (namelen > MAX_NAME_SIZE)
7233             {
7234             *errorcodeptr = ERR48;
7235             goto FAILED;
7236             }
7237 
7238           /* Count named back references. */
7239 
7240           if (!is_recurse) cd->namedrefcount++;
7241 
7242           /* We have to allow for a named reference to a duplicated name (this
7243           cannot be determined until the second pass). This needs an extra
7244           16-bit data item. */
7245 
7246           *lengthptr += IMM2_SIZE;
7247 
7248           /* If this is a forward reference and we are within a (?|...) group,
7249           the reference may end up as the number of a group which we are
7250           currently inside, that is, it could be a recursive reference. In the
7251           real compile this will be picked up and the reference wrapped with
7252           OP_ONCE to make it atomic, so we must allow space in case this occurs. */
7253 
7254           /* In fact, this can happen for a non-forward reference because
7255           another group with the same number might be created later. This
7256           issue is fixed "properly" in PCRE2. As PCRE1 is now in maintenance
7257           only mode, we finesse the bug by allowing more memory always. */
7258 
7259           *lengthptr += 2 + 2*LINK_SIZE;
7260 
7261           /* It is even worse than that. The current reference may be to an
7262           existing named group with a different number (so apparently not
7263           recursive) but which later on is also attached to a group with the
7264           current number. This can only happen if (?| has been previously
7265           encountered. In that case, we allow yet more memory, just in case.
7266           (Again, this is fixed "properly" in PCRE2.) */
7267 
7268           if (cd->dupgroups) *lengthptr += 4 + 4*LINK_SIZE;
7269 
7270           /* Otherwise, check for recursion here. The name table does not exist
7271           in the first pass; instead we must scan the list of names encountered
7272           so far in order to get the number. If the name is not found, leave
7273           the value of recno as 0 for a forward reference. */
7274 
7275           /* This patch (removing "else") fixes a problem when a reference is
7276           to multiple identically named nested groups from within the nest.
7277           Once again, it is not the "proper" fix, and it results in an
7278           over-allocation of memory. */
7279 
7280           /* else */
7281             {
7282             ng = cd->named_groups;
7283             for (i = 0; i < cd->names_found; i++, ng++)
7284               {
7285               if (namelen == ng->length &&
7286                   STRNCMP_UC_UC(name, ng->name, namelen) == 0)
7287                 {
7288                 open_capitem *oc;
7289                 recno = ng->number;
7290                 if (is_recurse) break;
7291                 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
7292                   {
7293                   if (oc->number == recno)
7294                     {
7295                     oc->flag = TRUE;
7296                     break;
7297                     }
7298                   }
7299                 }
7300               }
7301             }
7302           }
7303 
7304         /* In the real compile, search the name table. We check the name
7305         first, and then check that we have reached the end of the name in the
7306         table. That way, if the name is longer than any in the table, the
7307         comparison will fail without reading beyond the table entry. */
7308 
7309         else
7310           {
7311           slot = cd->name_table;
7312           for (i = 0; i < cd->names_found; i++)
7313             {
7314             if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
7315                 slot[IMM2_SIZE+namelen] == 0)
7316               break;
7317             slot += cd->name_entry_size;
7318             }
7319 
7320           if (i < cd->names_found)
7321             {
7322             recno = GET2(slot, 0);
7323             }
7324           else
7325             {
7326             *errorcodeptr = ERR15;
7327             goto FAILED;
7328             }
7329           }
7330 
7331         /* In both phases, for recursions, we can now go to the code that
7332         handles numerical recursion. */
7333 
7334         if (is_recurse) goto HANDLE_RECURSION;
7335 
7336         /* In the second pass we must see if the name is duplicated. If so, we
7337         generate a different opcode. */
7338 
7339         if (lengthptr == NULL && cd->dupnames)
7340           {
7341           int count = 1;
7342           unsigned int index = i;
7343           pcre_uchar *cslot = slot + cd->name_entry_size;
7344 
7345           for (i++; i < cd->names_found; i++)
7346             {
7347             if (STRCMP_UC_UC(slot + IMM2_SIZE, cslot + IMM2_SIZE) != 0) break;
7348             count++;
7349             cslot += cd->name_entry_size;
7350             }
7351 
7352           if (count > 1)
7353             {
7354             if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7355             previous = code;
7356             item_hwm_offset = cd->hwm - cd->start_workspace;
7357             *code++ = ((options & PCRE_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
7358             PUT2INC(code, 0, index);
7359             PUT2INC(code, 0, count);
7360 
7361             /* Process each potentially referenced group. */
7362 
7363             for (; slot < cslot; slot += cd->name_entry_size)
7364               {
7365               open_capitem *oc;
7366               recno = GET2(slot, 0);
7367               cd->backref_map |= (recno < 32)? (1 << recno) : 1;
7368               if (recno > cd->top_backref) cd->top_backref = recno;
7369 
7370               /* Check to see if this back reference is recursive, that is, it
7371               is inside the group that it references. A flag is set so that the
7372               group can be made atomic. */
7373 
7374               for (oc = cd->open_caps; oc != NULL; oc = oc->next)
7375                 {
7376                 if (oc->number == recno)
7377                   {
7378                   oc->flag = TRUE;
7379                   break;
7380                   }
7381                 }
7382               }
7383 
7384             continue;  /* End of back ref handling */
7385             }
7386           }
7387 
7388         /* First pass, or a non-duplicated name. */
7389 
7390         goto HANDLE_REFERENCE;
7391 
7392 
7393         /* ------------------------------------------------------------ */
7394         case CHAR_R:              /* Recursion, same as (?0) */
7395         recno = 0;
7396         if (*(++ptr) != CHAR_RIGHT_PARENTHESIS)
7397           {
7398           *errorcodeptr = ERR29;
7399           goto FAILED;
7400           }
7401         goto HANDLE_RECURSION;
7402 
7403 
7404         /* ------------------------------------------------------------ */
7405         case CHAR_MINUS: case CHAR_PLUS:  /* Recursion or subroutine */
7406         case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
7407         case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
7408           {
7409           const pcre_uchar *called;
7410           terminator = CHAR_RIGHT_PARENTHESIS;
7411 
7412           /* Come here from the \g<...> and \g'...' code (Oniguruma
7413           compatibility). However, the syntax has been checked to ensure that
7414           the ... are a (signed) number, so that neither ERR63 nor ERR29 will
7415           be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
7416           ever be taken. */
7417 
7418           HANDLE_NUMERICAL_RECURSION:
7419 
7420           if ((refsign = *ptr) == CHAR_PLUS)
7421             {
7422             ptr++;
7423             if (!IS_DIGIT(*ptr))
7424               {
7425               *errorcodeptr = ERR63;
7426               goto FAILED;
7427               }
7428             }
7429           else if (refsign == CHAR_MINUS)
7430             {
7431             if (!IS_DIGIT(ptr[1]))
7432               goto OTHER_CHAR_AFTER_QUERY;
7433             ptr++;
7434             }
7435 
7436           recno = 0;
7437           while(IS_DIGIT(*ptr))
7438             {
7439             if (recno > INT_MAX / 10 - 1) /* Integer overflow */
7440               {
7441               while (IS_DIGIT(*ptr)) ptr++;
7442               *errorcodeptr = ERR61;
7443               goto FAILED;
7444               }
7445             recno = recno * 10 + *ptr++ - CHAR_0;
7446             }
7447 
7448           if (*ptr != (pcre_uchar)terminator)
7449             {
7450             *errorcodeptr = ERR29;
7451             goto FAILED;
7452             }
7453 
7454           if (refsign == CHAR_MINUS)
7455             {
7456             if (recno == 0)
7457               {
7458               *errorcodeptr = ERR58;
7459               goto FAILED;
7460               }
7461             recno = cd->bracount - recno + 1;
7462             if (recno <= 0)
7463               {
7464               *errorcodeptr = ERR15;
7465               goto FAILED;
7466               }
7467             }
7468           else if (refsign == CHAR_PLUS)
7469             {
7470             if (recno == 0)
7471               {
7472               *errorcodeptr = ERR58;
7473               goto FAILED;
7474               }
7475             recno += cd->bracount;
7476             }
7477 
7478           /* Come here from code above that handles a named recursion */
7479 
7480           HANDLE_RECURSION:
7481 
7482           previous = code;
7483           item_hwm_offset = cd->hwm - cd->start_workspace;
7484           called = cd->start_code;
7485 
7486           /* When we are actually compiling, find the bracket that is being
7487           referenced. Temporarily end the regex in case it doesn't exist before
7488           this point. If we end up with a forward reference, first check that
7489           the bracket does occur later so we can give the error (and position)
7490           now. Then remember this forward reference in the workspace so it can
7491           be filled in at the end. */
7492 
7493           if (lengthptr == NULL)
7494             {
7495             *code = OP_END;
7496             if (recno != 0)
7497               called = PRIV(find_bracket)(cd->start_code, utf, recno);
7498 
7499             /* Forward reference */
7500 
7501             if (called == NULL)
7502               {
7503               if (recno > cd->final_bracount)
7504                 {
7505                 *errorcodeptr = ERR15;
7506                 goto FAILED;
7507                 }
7508 
7509               /* Fudge the value of "called" so that when it is inserted as an
7510               offset below, what is actually inserted is the reference number
7511               of the group. Then remember the forward reference. */
7512 
7513               called = cd->start_code + recno;
7514               if (cd->hwm >= cd->start_workspace + cd->workspace_size -
7515                   WORK_SIZE_SAFETY_MARGIN)
7516                 {
7517                 *errorcodeptr = expand_workspace(cd);
7518                 if (*errorcodeptr != 0) goto FAILED;
7519                 }
7520               PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
7521               }
7522 
7523             /* If not a forward reference, and the subpattern is still open,
7524             this is a recursive call. We check to see if this is a left
7525             recursion that could loop for ever, and diagnose that case. We
7526             must not, however, do this check if we are in a conditional
7527             subpattern because the condition might be testing for recursion in
7528             a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid.
7529             Forever loops are also detected at runtime, so those that occur in
7530             conditional subpatterns will be picked up then. */
7531 
7532             else if (GET(called, 1) == 0 && cond_depth <= 0 &&
7533                      could_be_empty(called, code, bcptr, utf, cd))
7534               {
7535               *errorcodeptr = ERR40;
7536               goto FAILED;
7537               }
7538             }
7539 
7540           /* Insert the recursion/subroutine item. It does not have a set first
7541           character (relevant if it is repeated, because it will then be
7542           wrapped with ONCE brackets). */
7543 
7544           *code = OP_RECURSE;
7545           PUT(code, 1, (int)(called - cd->start_code));
7546           code += 1 + LINK_SIZE;
7547           groupsetfirstchar = FALSE;
7548           }
7549 
7550         /* Can't determine a first byte now */
7551 
7552         if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7553         continue;
7554 
7555 
7556         /* ------------------------------------------------------------ */
7557         default:              /* Other characters: check option setting */
7558         OTHER_CHAR_AFTER_QUERY:
7559         set = unset = 0;
7560         optset = &set;
7561 
7562         while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
7563           {
7564           switch (*ptr++)
7565             {
7566             case CHAR_MINUS: optset = &unset; break;
7567 
7568             case CHAR_J:    /* Record that it changed in the external options */
7569             *optset |= PCRE_DUPNAMES;
7570             cd->external_flags |= PCRE_JCHANGED;
7571             break;
7572 
7573             case CHAR_i: *optset |= PCRE_CASELESS; break;
7574             case CHAR_m: *optset |= PCRE_MULTILINE; break;
7575             case CHAR_s: *optset |= PCRE_DOTALL; break;
7576             case CHAR_x: *optset |= PCRE_EXTENDED; break;
7577             case CHAR_U: *optset |= PCRE_UNGREEDY; break;
7578             case CHAR_X: *optset |= PCRE_EXTRA; break;
7579 
7580             default:  *errorcodeptr = ERR12;
7581                       ptr--;    /* Correct the offset */
7582                       goto FAILED;
7583             }
7584           }
7585 
7586         /* Set up the changed option bits, but don't change anything yet. */
7587 
7588         newoptions = (options | set) & (~unset);
7589 
7590         /* If the options ended with ')' this is not the start of a nested
7591         group with option changes, so the options change at this level. If this
7592         item is right at the start of the pattern, the options can be
7593         abstracted and made external in the pre-compile phase, and ignored in
7594         the compile phase. This can be helpful when matching -- for instance in
7595         caseless checking of required bytes.
7596 
7597         If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
7598         definitely *not* at the start of the pattern because something has been
7599         compiled. In the pre-compile phase, however, the code pointer can have
7600         that value after the start, because it gets reset as code is discarded
7601         during the pre-compile. However, this can happen only at top level - if
7602         we are within parentheses, the starting BRA will still be present. At
7603         any parenthesis level, the length value can be used to test if anything
7604         has been compiled at that level. Thus, a test for both these conditions
7605         is necessary to ensure we correctly detect the start of the pattern in
7606         both phases.
7607 
7608         If we are not at the pattern start, reset the greedy defaults and the
7609         case value for firstchar and reqchar. */
7610 
7611         if (*ptr == CHAR_RIGHT_PARENTHESIS)
7612           {
7613           if (code == cd->start_code + 1 + LINK_SIZE &&
7614                (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
7615             {
7616             cd->external_options = newoptions;
7617             }
7618           else
7619             {
7620             greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
7621             greedy_non_default = greedy_default ^ 1;
7622             req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
7623             }
7624 
7625           /* Change options at this level, and pass them back for use
7626           in subsequent branches. */
7627 
7628           *optionsptr = options = newoptions;
7629           previous = NULL;       /* This item can't be repeated */
7630           continue;              /* It is complete */
7631           }
7632 
7633         /* If the options ended with ':' we are heading into a nested group
7634         with possible change of options. Such groups are non-capturing and are
7635         not assertions of any kind. All we need to do is skip over the ':';
7636         the newoptions value is handled below. */
7637 
7638         bravalue = OP_BRA;
7639         ptr++;
7640         }     /* End of switch for character following (? */
7641       }       /* End of (? handling */
7642 
7643     /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
7644     is set, all unadorned brackets become non-capturing and behave like (?:...)
7645     brackets. */
7646 
7647     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
7648       {
7649       bravalue = OP_BRA;
7650       }
7651 
7652     /* Else we have a capturing group. */
7653 
7654     else
7655       {
7656       NUMBERED_GROUP:
7657       cd->bracount += 1;
7658       PUT2(code, 1+LINK_SIZE, cd->bracount);
7659       skipbytes = IMM2_SIZE;
7660       }
7661 
7662     /* Process nested bracketed regex. First check for parentheses nested too
7663     deeply. */
7664 
7665     if ((cd->parens_depth += 1) > PARENS_NEST_LIMIT)
7666       {
7667       *errorcodeptr = ERR82;
7668       goto FAILED;
7669       }
7670 
7671     /* All assertions used not to be repeatable, but this was changed for Perl
7672     compatibility. All kinds can now be repeated except for assertions that are
7673     conditions (Perl also forbids these to be repeated). We copy code into a
7674     non-register variable (tempcode) in order to be able to pass its address
7675     because some compilers complain otherwise. At the start of a conditional
7676     group whose condition is an assertion, cd->iscondassert is set. We unset it
7677     here so as to allow assertions later in the group to be quantified. */
7678 
7679     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT &&
7680         cd->iscondassert)
7681       {
7682       previous = NULL;
7683       cd->iscondassert = FALSE;
7684       }
7685     else
7686       {
7687       previous = code;
7688       item_hwm_offset = cd->hwm - cd->start_workspace;
7689       }
7690 
7691     *code = bravalue;
7692     tempcode = code;
7693     tempreqvary = cd->req_varyopt;        /* Save value before bracket */
7694     tempbracount = cd->bracount;          /* Save value before bracket */
7695     length_prevgroup = 0;                 /* Initialize for pre-compile phase */
7696 
7697     if (!compile_regex(
7698          newoptions,                      /* The complete new option state */
7699          &tempcode,                       /* Where to put code (updated) */
7700          &ptr,                            /* Input pointer (updated) */
7701          errorcodeptr,                    /* Where to put an error message */
7702          (bravalue == OP_ASSERTBACK ||
7703           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
7704          reset_bracount,                  /* True if (?| group */
7705          skipbytes,                       /* Skip over bracket number */
7706          cond_depth +
7707            ((bravalue == OP_COND)?1:0),   /* Depth of condition subpatterns */
7708          &subfirstchar,                   /* For possible first char */
7709          &subfirstcharflags,
7710          &subreqchar,                     /* For possible last char */
7711          &subreqcharflags,
7712          bcptr,                           /* Current branch chain */
7713          cd,                              /* Tables block */
7714          (lengthptr == NULL)? NULL :      /* Actual compile phase */
7715            &length_prevgroup              /* Pre-compile phase */
7716          ))
7717       goto FAILED;
7718 
7719     cd->parens_depth -= 1;
7720 
7721     /* If this was an atomic group and there are no capturing groups within it,
7722     generate OP_ONCE_NC instead of OP_ONCE. */
7723 
7724     if (bravalue == OP_ONCE && cd->bracount <= tempbracount)
7725       *code = OP_ONCE_NC;
7726 
7727     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
7728       cd->assert_depth -= 1;
7729 
7730     /* At the end of compiling, code is still pointing to the start of the
7731     group, while tempcode has been updated to point past the end of the group.
7732     The pattern pointer (ptr) is on the bracket.
7733 
7734     If this is a conditional bracket, check that there are no more than
7735     two branches in the group, or just one if it's a DEFINE group. We do this
7736     in the real compile phase, not in the pre-pass, where the whole group may
7737     not be available. */
7738 
7739     if (bravalue == OP_COND && lengthptr == NULL)
7740       {
7741       pcre_uchar *tc = code;
7742       int condcount = 0;
7743 
7744       do {
7745          condcount++;
7746          tc += GET(tc,1);
7747          }
7748       while (*tc != OP_KET);
7749 
7750       /* A DEFINE group is never obeyed inline (the "condition" is always
7751       false). It must have only one branch. */
7752 
7753       if (code[LINK_SIZE+1] == OP_DEF)
7754         {
7755         if (condcount > 1)
7756           {
7757           *errorcodeptr = ERR54;
7758           goto FAILED;
7759           }
7760         bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
7761         }
7762 
7763       /* A "normal" conditional group. If there is just one branch, we must not
7764       make use of its firstchar or reqchar, because this is equivalent to an
7765       empty second branch. */
7766 
7767       else
7768         {
7769         if (condcount > 2)
7770           {
7771           *errorcodeptr = ERR27;
7772           goto FAILED;
7773           }
7774         if (condcount == 1) subfirstcharflags = subreqcharflags = REQ_NONE;
7775         }
7776       }
7777 
7778     /* Error if hit end of pattern */
7779 
7780     if (*ptr != CHAR_RIGHT_PARENTHESIS)
7781       {
7782       *errorcodeptr = ERR14;
7783       goto FAILED;
7784       }
7785 
7786     /* In the pre-compile phase, update the length by the length of the group,
7787     less the brackets at either end. Then reduce the compiled code to just a
7788     set of non-capturing brackets so that it doesn't use much memory if it is
7789     duplicated by a quantifier.*/
7790 
7791     if (lengthptr != NULL)
7792       {
7793       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
7794         {
7795         *errorcodeptr = ERR20;
7796         goto FAILED;
7797         }
7798       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
7799       code++;   /* This already contains bravalue */
7800       PUTINC(code, 0, 1 + LINK_SIZE);
7801       *code++ = OP_KET;
7802       PUTINC(code, 0, 1 + LINK_SIZE);
7803       break;    /* No need to waste time with special character handling */
7804       }
7805 
7806     /* Otherwise update the main code pointer to the end of the group. */
7807 
7808     code = tempcode;
7809 
7810     /* For a DEFINE group, required and first character settings are not
7811     relevant. */
7812 
7813     if (bravalue == OP_DEF) break;
7814 
7815     /* Handle updating of the required and first characters for other types of
7816     group. Update for normal brackets of all kinds, and conditions with two
7817     branches (see code above). If the bracket is followed by a quantifier with
7818     zero repeat, we have to back off. Hence the definition of zeroreqchar and
7819     zerofirstchar outside the main loop so that they can be accessed for the
7820     back off. */
7821 
7822     zeroreqchar = reqchar;
7823     zeroreqcharflags = reqcharflags;
7824     zerofirstchar = firstchar;
7825     zerofirstcharflags = firstcharflags;
7826     groupsetfirstchar = FALSE;
7827 
7828     if (bravalue >= OP_ONCE)
7829       {
7830       /* If we have not yet set a firstchar in this branch, take it from the
7831       subpattern, remembering that it was set here so that a repeat of more
7832       than one can replicate it as reqchar if necessary. If the subpattern has
7833       no firstchar, set "none" for the whole branch. In both cases, a zero
7834       repeat forces firstchar to "none". */
7835 
7836       if (firstcharflags == REQ_UNSET)
7837         {
7838         if (subfirstcharflags >= 0)
7839           {
7840           firstchar = subfirstchar;
7841           firstcharflags = subfirstcharflags;
7842           groupsetfirstchar = TRUE;
7843           }
7844         else firstcharflags = REQ_NONE;
7845         zerofirstcharflags = REQ_NONE;
7846         }
7847 
7848       /* If firstchar was previously set, convert the subpattern's firstchar
7849       into reqchar if there wasn't one, using the vary flag that was in
7850       existence beforehand. */
7851 
7852       else if (subfirstcharflags >= 0 && subreqcharflags < 0)
7853         {
7854         subreqchar = subfirstchar;
7855         subreqcharflags = subfirstcharflags | tempreqvary;
7856         }
7857 
7858       /* If the subpattern set a required byte (or set a first byte that isn't
7859       really the first byte - see above), set it. */
7860 
7861       if (subreqcharflags >= 0)
7862         {
7863         reqchar = subreqchar;
7864         reqcharflags = subreqcharflags;
7865         }
7866       }
7867 
7868     /* For a forward assertion, we take the reqchar, if set. This can be
7869     helpful if the pattern that follows the assertion doesn't set a different
7870     char. For example, it's useful for /(?=abcde).+/. We can't set firstchar
7871     for an assertion, however, because it leads to an incorrect effect for patterns
7872     such as /(?=a)a.+/ when the "real" "a" would then become a reqchar instead
7873     of a firstchar. This is overcome by a scan at the end if there's no
7874     firstchar, looking for an asserted first char. */
7875 
7876     else if (bravalue == OP_ASSERT && subreqcharflags >= 0)
7877       {
7878       reqchar = subreqchar;
7879       reqcharflags = subreqcharflags;
7880       }
7881     break;     /* End of processing '(' */
7882 
7883 
7884     /* ===================================================================*/
7885     /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
7886     are arranged to be the negation of the corresponding OP_values in the
7887     default case when PCRE_UCP is not set. For the back references, the values
7888     are negative the reference number. Only back references and those types
7889     that consume a character may be repeated. We can test for values between
7890     ESC_b and ESC_Z for the latter; this may have to change if any new ones are
7891     ever created. */
7892 
7893     case CHAR_BACKSLASH:
7894     tempptr = ptr;
7895     escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options, FALSE);
7896     if (*errorcodeptr != 0) goto FAILED;
7897 
7898     if (escape == 0)                  /* The escape coded a single character */
7899       c = ec;
7900     else
7901       {
7902       if (escape == ESC_Q)            /* Handle start of quoted string */
7903         {
7904         if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
7905           ptr += 2;               /* avoid empty string */
7906             else inescq = TRUE;
7907         continue;
7908         }
7909 
7910       if (escape == ESC_E) continue;  /* Perl ignores an orphan \E */
7911 
7912       /* For metasequences that actually match a character, we disable the
7913       setting of a first character if it hasn't already been set. */
7914 
7915       if (firstcharflags == REQ_UNSET && escape > ESC_b && escape < ESC_Z)
7916         firstcharflags = REQ_NONE;
7917 
7918       /* Set values to reset to if this is followed by a zero repeat. */
7919 
7920       zerofirstchar = firstchar;
7921       zerofirstcharflags = firstcharflags;
7922       zeroreqchar = reqchar;
7923       zeroreqcharflags = reqcharflags;
7924 
7925       /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
7926       is a subroutine call by number (Oniguruma syntax). In fact, the value
7927       ESC_g is returned only for these cases. So we don't need to check for <
7928       or ' if the value is ESC_g. For the Perl syntax \g{n} the value is
7929       -n, and for the Perl syntax \g{name} the result is ESC_k (as
7930       that is a synonym for a named back reference). */
7931 
7932       if (escape == ESC_g)
7933         {
7934         const pcre_uchar *p;
7935         pcre_uint32 cf;
7936 
7937         item_hwm_offset = cd->hwm - cd->start_workspace;   /* Normally this is set when '(' is read */
7938         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
7939           CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7940 
7941         /* These two statements stop the compiler from warning about possibly
7942         unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
7943         fact, because we do the check for a number below, the paths that
7944         would actually be in error are never taken. */
7945 
7946         skipbytes = 0;
7947         reset_bracount = FALSE;
7948 
7949         /* If it's not a signed or unsigned number, treat it as a name. */
7950 
7951         cf = ptr[1];
7952         if (cf != CHAR_PLUS && cf != CHAR_MINUS && !IS_DIGIT(cf))
7953           {
7954           is_recurse = TRUE;
7955           goto NAMED_REF_OR_RECURSE;
7956           }
7957 
7958         /* Signed or unsigned number (cf = ptr[1]) is known to be plus or minus
7959         or a digit. */
7960 
7961         p = ptr + 2;
7962         while (IS_DIGIT(*p)) p++;
7963         if (*p != (pcre_uchar)terminator)
7964           {
7965           *errorcodeptr = ERR57;
7966           goto FAILED;
7967           }
7968         ptr++;
7969         goto HANDLE_NUMERICAL_RECURSION;
7970         }
7971 
7972       /* \k<name> or \k'name' is a back reference by name (Perl syntax).
7973       We also support \k{name} (.NET syntax).  */
7974 
7975       if (escape == ESC_k)
7976         {
7977         if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
7978           ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
7979           {
7980           *errorcodeptr = ERR69;
7981           goto FAILED;
7982           }
7983         is_recurse = FALSE;
7984         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
7985           CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
7986           CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
7987         goto NAMED_REF_OR_RECURSE;
7988         }
7989 
7990       /* Back references are handled specially; must disable firstchar if
7991       not set to cope with cases like (?=(\w+))\1: which would otherwise set
7992       ':' later. */
7993 
7994       if (escape < 0)
7995         {
7996         open_capitem *oc;
7997         recno = -escape;
7998 
7999         /* Come here from named backref handling when the reference is to a
8000         single group (i.e. not to a duplicated name. */
8001 
8002         HANDLE_REFERENCE:
8003         if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
8004         previous = code;
8005         item_hwm_offset = cd->hwm - cd->start_workspace;
8006         *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
8007         PUT2INC(code, 0, recno);
8008         cd->backref_map |= (recno < 32)? (1 << recno) : 1;
8009         if (recno > cd->top_backref) cd->top_backref = recno;
8010 
8011         /* Check to see if this back reference is recursive, that is, it
8012         is inside the group that it references. A flag is set so that the
8013         group can be made atomic. */
8014 
8015         for (oc = cd->open_caps; oc != NULL; oc = oc->next)
8016           {
8017           if (oc->number == recno)
8018             {
8019             oc->flag = TRUE;
8020             break;
8021             }
8022           }
8023         }
8024 
8025       /* So are Unicode property matches, if supported. */
8026 
8027 #ifdef SUPPORT_UCP
8028       else if (escape == ESC_P || escape == ESC_p)
8029         {
8030         BOOL negated;
8031         unsigned int ptype = 0, pdata = 0;
8032         if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
8033           goto FAILED;
8034         previous = code;
8035         item_hwm_offset = cd->hwm - cd->start_workspace;
8036         *code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
8037         *code++ = ptype;
8038         *code++ = pdata;
8039         }
8040 #else
8041 
8042       /* If Unicode properties are not supported, \X, \P, and \p are not
8043       allowed. */
8044 
8045       else if (escape == ESC_X || escape == ESC_P || escape == ESC_p)
8046         {
8047         *errorcodeptr = ERR45;
8048         goto FAILED;
8049         }
8050 #endif
8051 
8052       /* For the rest (including \X when Unicode properties are supported), we
8053       can obtain the OP value by negating the escape value in the default
8054       situation when PCRE_UCP is not set. When it *is* set, we substitute
8055       Unicode property tests. Note that \b and \B do a one-character
8056       lookbehind, and \A also behaves as if it does. */
8057 
8058       else
8059         {
8060         if ((escape == ESC_b || escape == ESC_B || escape == ESC_A) &&
8061              cd->max_lookbehind == 0)
8062           cd->max_lookbehind = 1;
8063 #ifdef SUPPORT_UCP
8064         if (escape >= ESC_DU && escape <= ESC_wu)
8065           {
8066           nestptr = ptr + 1;                   /* Where to resume */
8067           ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
8068           }
8069         else
8070 #endif
8071         /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
8072         so that it works in DFA mode and in lookbehinds. */
8073 
8074           {
8075           previous = (escape > ESC_b && escape < ESC_Z)? code : NULL;
8076           item_hwm_offset = cd->hwm - cd->start_workspace;
8077           *code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape;
8078           }
8079         }
8080       continue;
8081       }
8082 
8083     /* We have a data character whose value is in c. In UTF-8 mode it may have
8084     a value > 127. We set its representation in the length/buffer, and then
8085     handle it as a data character. */
8086 
8087 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
8088     if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
8089       mclength = PRIV(ord2utf)(c, mcbuffer);
8090     else
8091 #endif
8092 
8093      {
8094      mcbuffer[0] = c;
8095      mclength = 1;
8096      }
8097     goto ONE_CHAR;
8098 
8099 
8100     /* ===================================================================*/
8101     /* Handle a literal character. It is guaranteed not to be whitespace or #
8102     when the extended flag is set. If we are in a UTF mode, it may be a
8103     multi-unit literal character. */
8104 
8105     default:
8106     NORMAL_CHAR:
8107     mclength = 1;
8108     mcbuffer[0] = c;
8109 
8110 #ifdef SUPPORT_UTF
8111     if (utf && HAS_EXTRALEN(c))
8112       ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
8113 #endif
8114 
8115     /* At this point we have the character's bytes in mcbuffer, and the length
8116     in mclength. When not in UTF-8 mode, the length is always 1. */
8117 
8118     ONE_CHAR:
8119     previous = code;
8120     item_hwm_offset = cd->hwm - cd->start_workspace;
8121 
8122     /* For caseless UTF-8 mode when UCP support is available, check whether
8123     this character has more than one other case. If so, generate a special
8124     OP_PROP item instead of OP_CHARI. */
8125 
8126 #ifdef SUPPORT_UCP
8127     if (utf && (options & PCRE_CASELESS) != 0)
8128       {
8129       GETCHAR(c, mcbuffer);
8130       if ((c = UCD_CASESET(c)) != 0)
8131         {
8132         *code++ = OP_PROP;
8133         *code++ = PT_CLIST;
8134         *code++ = c;
8135         if (firstcharflags == REQ_UNSET)
8136           firstcharflags = zerofirstcharflags = REQ_NONE;
8137         break;
8138         }
8139       }
8140 #endif
8141 
8142     /* Caseful matches, or not one of the multicase characters. */
8143 
8144     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARI : OP_CHAR;
8145     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
8146 
8147     /* Remember if \r or \n were seen */
8148 
8149     if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
8150       cd->external_flags |= PCRE_HASCRORLF;
8151 
8152     /* Set the first and required bytes appropriately. If no previous first
8153     byte, set it from this character, but revert to none on a zero repeat.
8154     Otherwise, leave the firstchar value alone, and don't change it on a zero
8155     repeat. */
8156 
8157     if (firstcharflags == REQ_UNSET)
8158       {
8159       zerofirstcharflags = REQ_NONE;
8160       zeroreqchar = reqchar;
8161       zeroreqcharflags = reqcharflags;
8162 
8163       /* If the character is more than one byte long, we can set firstchar
8164       only if it is not to be matched caselessly. */
8165 
8166       if (mclength == 1 || req_caseopt == 0)
8167         {
8168         firstchar = mcbuffer[0] | req_caseopt;
8169         firstchar = mcbuffer[0];
8170         firstcharflags = req_caseopt;
8171 
8172         if (mclength != 1)
8173           {
8174           reqchar = code[-1];
8175           reqcharflags = cd->req_varyopt;
8176           }
8177         }
8178       else firstcharflags = reqcharflags = REQ_NONE;
8179       }
8180 
8181     /* firstchar was previously set; we can set reqchar only if the length is
8182     1 or the matching is caseful. */
8183 
8184     else
8185       {
8186       zerofirstchar = firstchar;
8187       zerofirstcharflags = firstcharflags;
8188       zeroreqchar = reqchar;
8189       zeroreqcharflags = reqcharflags;
8190       if (mclength == 1 || req_caseopt == 0)
8191         {
8192         reqchar = code[-1];
8193         reqcharflags = req_caseopt | cd->req_varyopt;
8194         }
8195       }
8196 
8197     break;            /* End of literal character handling */
8198     }
8199   }                   /* end of big loop */
8200 
8201 
8202 /* Control never reaches here by falling through, only by a goto for all the
8203 error states. Pass back the position in the pattern so that it can be displayed
8204 to the user for diagnosing the error. */
8205 
8206 FAILED:
8207 *ptrptr = ptr;
8208 return FALSE;
8209 }
8210 
8211 
8212 
8213 /*************************************************
8214 *     Compile sequence of alternatives           *
8215 *************************************************/
8216 
8217 /* On entry, ptr is pointing past the bracket character, but on return it
8218 points to the closing bracket, or vertical bar, or end of string. The code
8219 variable is pointing at the byte into which the BRA operator has been stored.
8220 This function is used during the pre-compile phase when we are trying to find
8221 out the amount of memory needed, as well as during the real compile phase. The
8222 value of lengthptr distinguishes the two phases.
8223 
8224 Arguments:
8225   options           option bits, including any changes for this subpattern
8226   codeptr           -> the address of the current code pointer
8227   ptrptr            -> the address of the current pattern pointer
8228   errorcodeptr      -> pointer to error code variable
8229   lookbehind        TRUE if this is a lookbehind assertion
8230   reset_bracount    TRUE to reset the count for each branch
8231   skipbytes         skip this many bytes at start (for brackets and OP_COND)
8232   cond_depth        depth of nesting for conditional subpatterns
8233   firstcharptr      place to put the first required character
8234   firstcharflagsptr place to put the first character flags, or a negative number
8235   reqcharptr        place to put the last required character
8236   reqcharflagsptr   place to put the last required character flags, or a negative number
8237   bcptr             pointer to the chain of currently open branches
8238   cd                points to the data block with tables pointers etc.
8239   lengthptr         NULL during the real compile phase
8240                     points to length accumulator during pre-compile phase
8241 
8242 Returns:            TRUE on success
8243 */
8244 
8245 static BOOL
compile_regex(int options,pcre_uchar ** codeptr,const pcre_uchar ** ptrptr,int * errorcodeptr,BOOL lookbehind,BOOL reset_bracount,int skipbytes,int cond_depth,pcre_uint32 * firstcharptr,pcre_int32 * firstcharflagsptr,pcre_uint32 * reqcharptr,pcre_int32 * reqcharflagsptr,branch_chain * bcptr,compile_data * cd,int * lengthptr)8246 compile_regex(int options, pcre_uchar **codeptr, const pcre_uchar **ptrptr,
8247   int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
8248   int cond_depth,
8249   pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
8250   pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
8251   branch_chain *bcptr, compile_data *cd, int *lengthptr)
8252 {
8253 const pcre_uchar *ptr = *ptrptr;
8254 pcre_uchar *code = *codeptr;
8255 pcre_uchar *last_branch = code;
8256 pcre_uchar *start_bracket = code;
8257 pcre_uchar *reverse_count = NULL;
8258 open_capitem capitem;
8259 int capnumber = 0;
8260 pcre_uint32 firstchar, reqchar;
8261 pcre_int32 firstcharflags, reqcharflags;
8262 pcre_uint32 branchfirstchar, branchreqchar;
8263 pcre_int32 branchfirstcharflags, branchreqcharflags;
8264 int length;
8265 unsigned int orig_bracount;
8266 unsigned int max_bracount;
8267 branch_chain bc;
8268 size_t save_hwm_offset;
8269 
8270 /* If set, call the external function that checks for stack availability. */
8271 
8272 if (PUBL(stack_guard) != NULL && PUBL(stack_guard)())
8273   {
8274   *errorcodeptr= ERR85;
8275   return FALSE;
8276   }
8277 
8278 /* Miscellaneous initialization */
8279 
8280 bc.outer = bcptr;
8281 bc.current_branch = code;
8282 
8283 firstchar = reqchar = 0;
8284 firstcharflags = reqcharflags = REQ_UNSET;
8285 
8286 save_hwm_offset = cd->hwm - cd->start_workspace;
8287 
8288 /* Accumulate the length for use in the pre-compile phase. Start with the
8289 length of the BRA and KET and any extra bytes that are required at the
8290 beginning. We accumulate in a local variable to save frequent testing of
8291 lengthptr for NULL. We cannot do this by looking at the value of code at the
8292 start and end of each alternative, because compiled items are discarded during
8293 the pre-compile phase so that the work space is not exceeded. */
8294 
8295 length = 2 + 2*LINK_SIZE + skipbytes;
8296 
8297 /* WARNING: If the above line is changed for any reason, you must also change
8298 the code that abstracts option settings at the start of the pattern and makes
8299 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
8300 pre-compile phase to find out whether anything has yet been compiled or not. */
8301 
8302 /* If this is a capturing subpattern, add to the chain of open capturing items
8303 so that we can detect them if (*ACCEPT) is encountered. This is also used to
8304 detect groups that contain recursive back references to themselves. Note that
8305 only OP_CBRA need be tested here; changing this opcode to one of its variants,
8306 e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */
8307 
8308 if (*code == OP_CBRA)
8309   {
8310   capnumber = GET2(code, 1 + LINK_SIZE);
8311   capitem.number = capnumber;
8312   capitem.next = cd->open_caps;
8313   capitem.flag = FALSE;
8314   cd->open_caps = &capitem;
8315   }
8316 
8317 /* Offset is set zero to mark that this bracket is still open */
8318 
8319 PUT(code, 1, 0);
8320 code += 1 + LINK_SIZE + skipbytes;
8321 
8322 /* Loop for each alternative branch */
8323 
8324 orig_bracount = max_bracount = cd->bracount;
8325 for (;;)
8326   {
8327   /* For a (?| group, reset the capturing bracket count so that each branch
8328   uses the same numbers. */
8329 
8330   if (reset_bracount) cd->bracount = orig_bracount;
8331 
8332   /* Set up dummy OP_REVERSE if lookbehind assertion */
8333 
8334   if (lookbehind)
8335     {
8336     *code++ = OP_REVERSE;
8337     reverse_count = code;
8338     PUTINC(code, 0, 0);
8339     length += 1 + LINK_SIZE;
8340     }
8341 
8342   /* Now compile the branch; in the pre-compile phase its length gets added
8343   into the length. */
8344 
8345   if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstchar,
8346         &branchfirstcharflags, &branchreqchar, &branchreqcharflags, &bc,
8347         cond_depth, cd, (lengthptr == NULL)? NULL : &length))
8348     {
8349     *ptrptr = ptr;
8350     return FALSE;
8351     }
8352 
8353   /* Keep the highest bracket count in case (?| was used and some branch
8354   has fewer than the rest. */
8355 
8356   if (cd->bracount > max_bracount) max_bracount = cd->bracount;
8357 
8358   /* In the real compile phase, there is some post-processing to be done. */
8359 
8360   if (lengthptr == NULL)
8361     {
8362     /* If this is the first branch, the firstchar and reqchar values for the
8363     branch become the values for the regex. */
8364 
8365     if (*last_branch != OP_ALT)
8366       {
8367       firstchar = branchfirstchar;
8368       firstcharflags = branchfirstcharflags;
8369       reqchar = branchreqchar;
8370       reqcharflags = branchreqcharflags;
8371       }
8372 
8373     /* If this is not the first branch, the first char and reqchar have to
8374     match the values from all the previous branches, except that if the
8375     previous value for reqchar didn't have REQ_VARY set, it can still match,
8376     and we set REQ_VARY for the regex. */
8377 
8378     else
8379       {
8380       /* If we previously had a firstchar, but it doesn't match the new branch,
8381       we have to abandon the firstchar for the regex, but if there was
8382       previously no reqchar, it takes on the value of the old firstchar. */
8383 
8384       if (firstcharflags >= 0 &&
8385           (firstcharflags != branchfirstcharflags || firstchar != branchfirstchar))
8386         {
8387         if (reqcharflags < 0)
8388           {
8389           reqchar = firstchar;
8390           reqcharflags = firstcharflags;
8391           }
8392         firstcharflags = REQ_NONE;
8393         }
8394 
8395       /* If we (now or from before) have no firstchar, a firstchar from the
8396       branch becomes a reqchar if there isn't a branch reqchar. */
8397 
8398       if (firstcharflags < 0 && branchfirstcharflags >= 0 && branchreqcharflags < 0)
8399         {
8400         branchreqchar = branchfirstchar;
8401         branchreqcharflags = branchfirstcharflags;
8402         }
8403 
8404       /* Now ensure that the reqchars match */
8405 
8406       if (((reqcharflags & ~REQ_VARY) != (branchreqcharflags & ~REQ_VARY)) ||
8407           reqchar != branchreqchar)
8408         reqcharflags = REQ_NONE;
8409       else
8410         {
8411         reqchar = branchreqchar;
8412         reqcharflags |= branchreqcharflags; /* To "or" REQ_VARY */
8413         }
8414       }
8415 
8416     /* If lookbehind, check that this branch matches a fixed-length string, and
8417     put the length into the OP_REVERSE item. Temporarily mark the end of the
8418     branch with OP_END. If the branch contains OP_RECURSE, the result is -3
8419     because there may be forward references that we can't check here. Set a
8420     flag to cause another lookbehind check at the end. Why not do it all at the
8421     end? Because common, erroneous checks are picked up here and the offset of
8422     the problem can be shown. */
8423 
8424     if (lookbehind)
8425       {
8426       int fixed_length;
8427       *code = OP_END;
8428       fixed_length = find_fixedlength(last_branch,  (options & PCRE_UTF8) != 0,
8429         FALSE, cd, NULL);
8430       DPRINTF(("fixed length = %d\n", fixed_length));
8431       if (fixed_length == -3)
8432         {
8433         cd->check_lookbehind = TRUE;
8434         }
8435       else if (fixed_length < 0)
8436         {
8437         *errorcodeptr = (fixed_length == -2)? ERR36 :
8438                         (fixed_length == -4)? ERR70: ERR25;
8439         *ptrptr = ptr;
8440         return FALSE;
8441         }
8442       else
8443         {
8444         if (fixed_length > cd->max_lookbehind)
8445           cd->max_lookbehind = fixed_length;
8446         PUT(reverse_count, 0, fixed_length);
8447         }
8448       }
8449     }
8450 
8451   /* Reached end of expression, either ')' or end of pattern. In the real
8452   compile phase, go back through the alternative branches and reverse the chain
8453   of offsets, with the field in the BRA item now becoming an offset to the
8454   first alternative. If there are no alternatives, it points to the end of the
8455   group. The length in the terminating ket is always the length of the whole
8456   bracketed item. Return leaving the pointer at the terminating char. */
8457 
8458   if (*ptr != CHAR_VERTICAL_LINE)
8459     {
8460     if (lengthptr == NULL)
8461       {
8462       int branch_length = (int)(code - last_branch);
8463       do
8464         {
8465         int prev_length = GET(last_branch, 1);
8466         PUT(last_branch, 1, branch_length);
8467         branch_length = prev_length;
8468         last_branch -= branch_length;
8469         }
8470       while (branch_length > 0);
8471       }
8472 
8473     /* Fill in the ket */
8474 
8475     *code = OP_KET;
8476     PUT(code, 1, (int)(code - start_bracket));
8477     code += 1 + LINK_SIZE;
8478 
8479     /* If it was a capturing subpattern, check to see if it contained any
8480     recursive back references. If so, we must wrap it in atomic brackets.
8481     Because we are moving code along, we must ensure that any pending recursive
8482     references are updated. In any event, remove the block from the chain. */
8483 
8484     if (capnumber > 0)
8485       {
8486       if (cd->open_caps->flag)
8487         {
8488         *code = OP_END;
8489         adjust_recurse(start_bracket, 1 + LINK_SIZE,
8490           (options & PCRE_UTF8) != 0, cd, save_hwm_offset);
8491         memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
8492           IN_UCHARS(code - start_bracket));
8493         *start_bracket = OP_ONCE;
8494         code += 1 + LINK_SIZE;
8495         PUT(start_bracket, 1, (int)(code - start_bracket));
8496         *code = OP_KET;
8497         PUT(code, 1, (int)(code - start_bracket));
8498         code += 1 + LINK_SIZE;
8499         length += 2 + 2*LINK_SIZE;
8500         }
8501       cd->open_caps = cd->open_caps->next;
8502       }
8503 
8504     /* Retain the highest bracket number, in case resetting was used. */
8505 
8506     cd->bracount = max_bracount;
8507 
8508     /* Set values to pass back */
8509 
8510     *codeptr = code;
8511     *ptrptr = ptr;
8512     *firstcharptr = firstchar;
8513     *firstcharflagsptr = firstcharflags;
8514     *reqcharptr = reqchar;
8515     *reqcharflagsptr = reqcharflags;
8516     if (lengthptr != NULL)
8517       {
8518       if (OFLOW_MAX - *lengthptr < length)
8519         {
8520         *errorcodeptr = ERR20;
8521         return FALSE;
8522         }
8523       *lengthptr += length;
8524       }
8525     return TRUE;
8526     }
8527 
8528   /* Another branch follows. In the pre-compile phase, we can move the code
8529   pointer back to where it was for the start of the first branch. (That is,
8530   pretend that each branch is the only one.)
8531 
8532   In the real compile phase, insert an ALT node. Its length field points back
8533   to the previous branch while the bracket remains open. At the end the chain
8534   is reversed. It's done like this so that the start of the bracket has a
8535   zero offset until it is closed, making it possible to detect recursion. */
8536 
8537   if (lengthptr != NULL)
8538     {
8539     code = *codeptr + 1 + LINK_SIZE + skipbytes;
8540     length += 1 + LINK_SIZE;
8541     }
8542   else
8543     {
8544     *code = OP_ALT;
8545     PUT(code, 1, (int)(code - last_branch));
8546     bc.current_branch = last_branch = code;
8547     code += 1 + LINK_SIZE;
8548     }
8549 
8550   ptr++;
8551   }
8552 /* Control never reaches here */
8553 }
8554 
8555 
8556 
8557 
8558 /*************************************************
8559 *          Check for anchored expression         *
8560 *************************************************/
8561 
8562 /* Try to find out if this is an anchored regular expression. Consider each
8563 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8564 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8565 it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8566 be found, because ^ generates OP_CIRCM in that mode.
8567 
8568 We can also consider a regex to be anchored if OP_SOM starts all its branches.
8569 This is the code for \G, which means "match at start of match position, taking
8570 into account the match offset".
8571 
8572 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8573 because that will try the rest of the pattern at all possible matching points,
8574 so there is no point trying again.... er ....
8575 
8576 .... except when the .* appears inside capturing parentheses, and there is a
8577 subsequent back reference to those parentheses. We haven't enough information
8578 to catch that case precisely.
8579 
8580 At first, the best we could do was to detect when .* was in capturing brackets
8581 and the highest back reference was greater than or equal to that level.
8582 However, by keeping a bitmap of the first 31 back references, we can catch some
8583 of the more common cases more precisely.
8584 
8585 ... A second exception is when the .* appears inside an atomic group, because
8586 this prevents the number of characters it matches from being adjusted.
8587 
8588 Arguments:
8589   code           points to start of expression (the bracket)
8590   bracket_map    a bitmap of which brackets we are inside while testing; this
8591                   handles up to substring 31; after that we just have to take
8592                   the less precise approach
8593   cd             points to the compile data block
8594   atomcount      atomic group level
8595 
8596 Returns:     TRUE or FALSE
8597 */
8598 
8599 static BOOL
is_anchored(register const pcre_uchar * code,unsigned int bracket_map,compile_data * cd,int atomcount)8600 is_anchored(register const pcre_uchar *code, unsigned int bracket_map,
8601   compile_data *cd, int atomcount)
8602 {
8603 do {
8604    const pcre_uchar *scode = first_significant_code(
8605      code + PRIV(OP_lengths)[*code], FALSE);
8606    register int op = *scode;
8607 
8608    /* Non-capturing brackets */
8609 
8610    if (op == OP_BRA  || op == OP_BRAPOS ||
8611        op == OP_SBRA || op == OP_SBRAPOS)
8612      {
8613      if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8614      }
8615 
8616    /* Capturing brackets */
8617 
8618    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8619             op == OP_SCBRA || op == OP_SCBRAPOS)
8620      {
8621      int n = GET2(scode, 1+LINK_SIZE);
8622      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
8623      if (!is_anchored(scode, new_map, cd, atomcount)) return FALSE;
8624      }
8625 
8626    /* Positive forward assertions and conditions */
8627 
8628    else if (op == OP_ASSERT || op == OP_COND)
8629      {
8630      if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8631      }
8632 
8633    /* Atomic groups */
8634 
8635    else if (op == OP_ONCE || op == OP_ONCE_NC)
8636      {
8637      if (!is_anchored(scode, bracket_map, cd, atomcount + 1))
8638        return FALSE;
8639      }
8640 
8641    /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8642    it isn't in brackets that are or may be referenced or inside an atomic
8643    group. */
8644 
8645    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8646              op == OP_TYPEPOSSTAR))
8647      {
8648      if (scode[1] != OP_ALLANY || (bracket_map & cd->backref_map) != 0 ||
8649          atomcount > 0 || cd->had_pruneorskip)
8650        return FALSE;
8651      }
8652 
8653    /* Check for explicit anchoring */
8654 
8655    else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8656 
8657    code += GET(code, 1);
8658    }
8659 while (*code == OP_ALT);   /* Loop for each alternative */
8660 return TRUE;
8661 }
8662 
8663 
8664 
8665 /*************************************************
8666 *         Check for starting with ^ or .*        *
8667 *************************************************/
8668 
8669 /* This is called to find out if every branch starts with ^ or .* so that
8670 "first char" processing can be done to speed things up in multiline
8671 matching and for non-DOTALL patterns that start with .* (which must start at
8672 the beginning or after \n). As in the case of is_anchored() (see above), we
8673 have to take account of back references to capturing brackets that contain .*
8674 because in that case we can't make the assumption. Also, the appearance of .*
8675 inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not
8676 count, because once again the assumption no longer holds.
8677 
8678 Arguments:
8679   code           points to start of expression (the bracket)
8680   bracket_map    a bitmap of which brackets we are inside while testing; this
8681                   handles up to substring 31; after that we just have to take
8682                   the less precise approach
8683   cd             points to the compile data
8684   atomcount      atomic group level
8685 
8686 Returns:         TRUE or FALSE
8687 */
8688 
8689 static BOOL
is_startline(const pcre_uchar * code,unsigned int bracket_map,compile_data * cd,int atomcount)8690 is_startline(const pcre_uchar *code, unsigned int bracket_map,
8691   compile_data *cd, int atomcount)
8692 {
8693 do {
8694    const pcre_uchar *scode = first_significant_code(
8695      code + PRIV(OP_lengths)[*code], FALSE);
8696    register int op = *scode;
8697 
8698    /* If we are at the start of a conditional assertion group, *both* the
8699    conditional assertion *and* what follows the condition must satisfy the test
8700    for start of line. Other kinds of condition fail. Note that there may be an
8701    auto-callout at the start of a condition. */
8702 
8703    if (op == OP_COND)
8704      {
8705      scode += 1 + LINK_SIZE;
8706      if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8707      switch (*scode)
8708        {
8709        case OP_CREF:
8710        case OP_DNCREF:
8711        case OP_RREF:
8712        case OP_DNRREF:
8713        case OP_DEF:
8714        case OP_FAIL:
8715        return FALSE;
8716 
8717        default:     /* Assertion */
8718        if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
8719        do scode += GET(scode, 1); while (*scode == OP_ALT);
8720        scode += 1 + LINK_SIZE;
8721        break;
8722        }
8723      scode = first_significant_code(scode, FALSE);
8724      op = *scode;
8725      }
8726 
8727    /* Non-capturing brackets */
8728 
8729    if (op == OP_BRA  || op == OP_BRAPOS ||
8730        op == OP_SBRA || op == OP_SBRAPOS)
8731      {
8732      if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
8733      }
8734 
8735    /* Capturing brackets */
8736 
8737    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8738             op == OP_SCBRA || op == OP_SCBRAPOS)
8739      {
8740      int n = GET2(scode, 1+LINK_SIZE);
8741      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
8742      if (!is_startline(scode, new_map, cd, atomcount)) return FALSE;
8743      }
8744 
8745    /* Positive forward assertions */
8746 
8747    else if (op == OP_ASSERT)
8748      {
8749      if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
8750      }
8751 
8752    /* Atomic brackets */
8753 
8754    else if (op == OP_ONCE || op == OP_ONCE_NC)
8755      {
8756      if (!is_startline(scode, bracket_map, cd, atomcount + 1)) return FALSE;
8757      }
8758 
8759    /* .* means "start at start or after \n" if it isn't in atomic brackets or
8760    brackets that may be referenced, as long as the pattern does not contain
8761    *PRUNE or *SKIP, because these break the feature. Consider, for example,
8762    /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the
8763    start of a line. */
8764 
8765    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8766      {
8767      if (scode[1] != OP_ANY || (bracket_map & cd->backref_map) != 0 ||
8768          atomcount > 0 || cd->had_pruneorskip)
8769        return FALSE;
8770      }
8771 
8772    /* Check for explicit circumflex; anything else gives a FALSE result. Note
8773    in particular that this includes atomic brackets OP_ONCE and OP_ONCE_NC
8774    because the number of characters matched by .* cannot be adjusted inside
8775    them. */
8776 
8777    else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8778 
8779    /* Move on to the next alternative */
8780 
8781    code += GET(code, 1);
8782    }
8783 while (*code == OP_ALT);  /* Loop for each alternative */
8784 return TRUE;
8785 }
8786 
8787 
8788 
8789 /*************************************************
8790 *       Check for asserted fixed first char      *
8791 *************************************************/
8792 
8793 /* During compilation, the "first char" settings from forward assertions are
8794 discarded, because they can cause conflicts with actual literals that follow.
8795 However, if we end up without a first char setting for an unanchored pattern,
8796 it is worth scanning the regex to see if there is an initial asserted first
8797 char. If all branches start with the same asserted char, or with a
8798 non-conditional bracket all of whose alternatives start with the same asserted
8799 char (recurse ad lib), then we return that char, with the flags set to zero or
8800 REQ_CASELESS; otherwise return zero with REQ_NONE in the flags.
8801 
8802 Arguments:
8803   code       points to start of expression (the bracket)
8804   flags      points to the first char flags, or to REQ_NONE
8805   inassert   TRUE if in an assertion
8806 
8807 Returns:     the fixed first char, or 0 with REQ_NONE in flags
8808 */
8809 
8810 static pcre_uint32
find_firstassertedchar(const pcre_uchar * code,pcre_int32 * flags,BOOL inassert)8811 find_firstassertedchar(const pcre_uchar *code, pcre_int32 *flags,
8812   BOOL inassert)
8813 {
8814 register pcre_uint32 c = 0;
8815 int cflags = REQ_NONE;
8816 
8817 *flags = REQ_NONE;
8818 do {
8819    pcre_uint32 d;
8820    int dflags;
8821    int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
8822              *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
8823    const pcre_uchar *scode = first_significant_code(code + 1+LINK_SIZE + xl,
8824      TRUE);
8825    register pcre_uchar op = *scode;
8826 
8827    switch(op)
8828      {
8829      default:
8830      return 0;
8831 
8832      case OP_BRA:
8833      case OP_BRAPOS:
8834      case OP_CBRA:
8835      case OP_SCBRA:
8836      case OP_CBRAPOS:
8837      case OP_SCBRAPOS:
8838      case OP_ASSERT:
8839      case OP_ONCE:
8840      case OP_ONCE_NC:
8841      d = find_firstassertedchar(scode, &dflags, op == OP_ASSERT);
8842      if (dflags < 0)
8843        return 0;
8844      if (cflags < 0) { c = d; cflags = dflags; } else if (c != d || cflags != dflags) return 0;
8845      break;
8846 
8847      case OP_EXACT:
8848      scode += IMM2_SIZE;
8849      /* Fall through */
8850 
8851      case OP_CHAR:
8852      case OP_PLUS:
8853      case OP_MINPLUS:
8854      case OP_POSPLUS:
8855      if (!inassert) return 0;
8856      if (cflags < 0) { c = scode[1]; cflags = 0; }
8857        else if (c != scode[1]) return 0;
8858      break;
8859 
8860      case OP_EXACTI:
8861      scode += IMM2_SIZE;
8862      /* Fall through */
8863 
8864      case OP_CHARI:
8865      case OP_PLUSI:
8866      case OP_MINPLUSI:
8867      case OP_POSPLUSI:
8868      if (!inassert) return 0;
8869      if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
8870        else if (c != scode[1]) return 0;
8871      break;
8872      }
8873 
8874    code += GET(code, 1);
8875    }
8876 while (*code == OP_ALT);
8877 
8878 *flags = cflags;
8879 return c;
8880 }
8881 
8882 
8883 
8884 /*************************************************
8885 *     Add an entry to the name/number table      *
8886 *************************************************/
8887 
8888 /* This function is called between compiling passes to add an entry to the
8889 name/number table, maintaining alphabetical order. Checking for permitted
8890 and forbidden duplicates has already been done.
8891 
8892 Arguments:
8893   cd           the compile data block
8894   name         the name to add
8895   length       the length of the name
8896   groupno      the group number
8897 
8898 Returns:       nothing
8899 */
8900 
8901 static void
add_name(compile_data * cd,const pcre_uchar * name,int length,unsigned int groupno)8902 add_name(compile_data *cd, const pcre_uchar *name, int length,
8903   unsigned int groupno)
8904 {
8905 int i;
8906 pcre_uchar *slot = cd->name_table;
8907 
8908 for (i = 0; i < cd->names_found; i++)
8909   {
8910   int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(length));
8911   if (crc == 0 && slot[IMM2_SIZE+length] != 0)
8912     crc = -1; /* Current name is a substring */
8913 
8914   /* Make space in the table and break the loop for an earlier name. For a
8915   duplicate or later name, carry on. We do this for duplicates so that in the
8916   simple case (when ?(| is not used) they are in order of their numbers. In all
8917   cases they are in the order in which they appear in the pattern. */
8918 
8919   if (crc < 0)
8920     {
8921     memmove(slot + cd->name_entry_size, slot,
8922       IN_UCHARS((cd->names_found - i) * cd->name_entry_size));
8923     break;
8924     }
8925 
8926   /* Continue the loop for a later or duplicate name */
8927 
8928   slot += cd->name_entry_size;
8929   }
8930 
8931 PUT2(slot, 0, groupno);
8932 memcpy(slot + IMM2_SIZE, name, IN_UCHARS(length));
8933 slot[IMM2_SIZE + length] = 0;
8934 cd->names_found++;
8935 }
8936 
8937 
8938 
/*************************************************
*        Compile a Regular Expression            *
*************************************************/

/* This function takes a string and returns a pointer to a block of store
holding a compiled version of the expression. The original API for this
function had no error code return variable; it is retained for backwards
compatibility. The new function is given a new name.

The original entry point simply forwards to the newer one, passing NULL for
the error code pointer. One definition is provided for whichever code unit
width this file is being compiled for.

Arguments:
  pattern       the regular expression
  options       various option bits
  errorcodeptr  pointer to error code variable (pcre_compile2() only)
                  can be NULL if you don't want a code value
  errorptr      pointer to pointer to error text
  erroroffset   ptr offset in pattern where error was detected
  tables        pointer to character tables or NULL

Returns:        pointer to compiled data block, or NULL on error,
                with errorptr and erroroffset set
*/

#if defined COMPILE_PCRE8

PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
pcre_compile(const char *pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}

#elif defined COMPILE_PCRE16

PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
pcre16_compile(PCRE_SPTR16 pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre16_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}

#elif defined COMPILE_PCRE32

PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
pcre32_compile(PCRE_SPTR32 pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre32_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}

#endif
8983 
8984 
8985 #if defined COMPILE_PCRE8
8986 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
pcre_compile2(const char * pattern,int options,int * errorcodeptr,const char ** errorptr,int * erroroffset,const unsigned char * tables)8987 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
8988   const char **errorptr, int *erroroffset, const unsigned char *tables)
8989 #elif defined COMPILE_PCRE16
8990 PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
8991 pcre16_compile2(PCRE_SPTR16 pattern, int options, int *errorcodeptr,
8992   const char **errorptr, int *erroroffset, const unsigned char *tables)
8993 #elif defined COMPILE_PCRE32
8994 PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
8995 pcre32_compile2(PCRE_SPTR32 pattern, int options, int *errorcodeptr,
8996   const char **errorptr, int *erroroffset, const unsigned char *tables)
8997 #endif
8998 {
8999 REAL_PCRE *re;
9000 int length = 1;  /* For final END opcode */
9001 pcre_int32 firstcharflags, reqcharflags;
9002 pcre_uint32 firstchar, reqchar;
9003 pcre_uint32 limit_match = PCRE_UINT32_MAX;
9004 pcre_uint32 limit_recursion = PCRE_UINT32_MAX;
9005 int newline;
9006 int errorcode = 0;
9007 int skipatstart = 0;
9008 BOOL utf;
9009 BOOL never_utf = FALSE;
9010 size_t size;
9011 pcre_uchar *code;
9012 const pcre_uchar *codestart;
9013 const pcre_uchar *ptr;
9014 compile_data compile_block;
9015 compile_data *cd = &compile_block;
9016 
9017 /* This space is used for "compiling" into during the first phase, when we are
9018 computing the amount of memory that is needed. Compiled items are thrown away
9019 as soon as possible, so that a fairly large buffer should be sufficient for
9020 this purpose. The same space is used in the second phase for remembering where
9021 to fill in forward references to subpatterns. That may overflow, in which case
9022 new memory is obtained from malloc(). */
9023 
9024 pcre_uchar cworkspace[COMPILE_WORK_SIZE];
9025 
9026 /* This vector is used for remembering name groups during the pre-compile. In a
9027 similar way to cworkspace, it can be expanded using malloc() if necessary. */
9028 
9029 named_group named_groups[NAMED_GROUP_LIST_SIZE];
9030 
9031 /* Set this early so that early errors get offset 0. */
9032 
9033 ptr = (const pcre_uchar *)pattern;
9034 
9035 /* We can't pass back an error message if errorptr is NULL; I guess the best we
9036 can do is just return NULL, but we can set a code value if there is a code
9037 pointer. */
9038 
9039 if (errorptr == NULL)
9040   {
9041   if (errorcodeptr != NULL) *errorcodeptr = 99;
9042   return NULL;
9043   }
9044 
9045 *errorptr = NULL;
9046 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
9047 
9048 /* However, we can give a message for this error */
9049 
9050 if (erroroffset == NULL)
9051   {
9052   errorcode = ERR16;
9053   goto PCRE_EARLY_ERROR_RETURN2;
9054   }
9055 
9056 *erroroffset = 0;
9057 
9058 /* Set up pointers to the individual character tables */
9059 
9060 if (tables == NULL) tables = PRIV(default_tables);
9061 cd->lcc = tables + lcc_offset;
9062 cd->fcc = tables + fcc_offset;
9063 cd->cbits = tables + cbits_offset;
9064 cd->ctypes = tables + ctypes_offset;
9065 
9066 /* Check that all undefined public option bits are zero */
9067 
9068 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
9069   {
9070   errorcode = ERR17;
9071   goto PCRE_EARLY_ERROR_RETURN;
9072   }
9073 
9074 /* If PCRE_NEVER_UTF is set, remember it. */
9075 
9076 if ((options & PCRE_NEVER_UTF) != 0) never_utf = TRUE;
9077 
9078 /* Check for global one-time settings at the start of the pattern, and remember
9079 the offset for later. */
9080 
9081 cd->external_flags = 0;   /* Initialize here for LIMIT_MATCH/RECURSION */
9082 
9083 while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
9084        ptr[skipatstart+1] == CHAR_ASTERISK)
9085   {
9086   int newnl = 0;
9087   int newbsr = 0;
9088 
9089 /* For completeness and backward compatibility, (*UTFn) is supported in the
9090 relevant libraries, but (*UTF) is generic and always supported. Note that
9091 PCRE_UTF8 == PCRE_UTF16 == PCRE_UTF32. */
9092 
9093 #ifdef COMPILE_PCRE8
9094   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF8_RIGHTPAR, 5) == 0)
9095     { skipatstart += 7; options |= PCRE_UTF8; continue; }
9096 #endif
9097 #ifdef COMPILE_PCRE16
9098   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF16_RIGHTPAR, 6) == 0)
9099     { skipatstart += 8; options |= PCRE_UTF16; continue; }
9100 #endif
9101 #ifdef COMPILE_PCRE32
9102   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF32_RIGHTPAR, 6) == 0)
9103     { skipatstart += 8; options |= PCRE_UTF32; continue; }
9104 #endif
9105 
9106   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 4) == 0)
9107     { skipatstart += 6; options |= PCRE_UTF8; continue; }
9108   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)
9109     { skipatstart += 6; options |= PCRE_UCP; continue; }
9110   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_AUTO_POSSESS_RIGHTPAR, 16) == 0)
9111     { skipatstart += 18; options |= PCRE_NO_AUTO_POSSESS; continue; }
9112   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
9113     { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
9114 
9115   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_MATCH_EQ, 12) == 0)
9116     {
9117     pcre_uint32 c = 0;
9118     int p = skipatstart + 14;
9119     while (isdigit(ptr[p]))
9120       {
9121       if (c > PCRE_UINT32_MAX / 10 - 1) break;   /* Integer overflow */
9122       c = c*10 + ptr[p++] - CHAR_0;
9123       }
9124     if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
9125     if (c < limit_match)
9126       {
9127       limit_match = c;
9128       cd->external_flags |= PCRE_MLSET;
9129       }
9130     skipatstart = p;
9131     continue;
9132     }
9133 
9134   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_RECURSION_EQ, 16) == 0)
9135     {
9136     pcre_uint32 c = 0;
9137     int p = skipatstart + 18;
9138     while (isdigit(ptr[p]))
9139       {
9140       if (c > PCRE_UINT32_MAX / 10 - 1) break;   /* Integer overflow check */
9141       c = c*10 + ptr[p++] - CHAR_0;
9142       }
9143     if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
9144     if (c < limit_recursion)
9145       {
9146       limit_recursion = c;
9147       cd->external_flags |= PCRE_RLSET;
9148       }
9149     skipatstart = p;
9150     continue;
9151     }
9152 
9153   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CR_RIGHTPAR, 3) == 0)
9154     { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
9155   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LF_RIGHTPAR, 3)  == 0)
9156     { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
9157   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CRLF_RIGHTPAR, 5)  == 0)
9158     { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
9159   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANY_RIGHTPAR, 4) == 0)
9160     { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
9161   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANYCRLF_RIGHTPAR, 8) == 0)
9162     { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
9163 
9164   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
9165     { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
9166   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
9167     { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
9168 
9169   if (newnl != 0)
9170     options = (options & ~PCRE_NEWLINE_BITS) | newnl;
9171   else if (newbsr != 0)
9172     options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
9173   else break;
9174   }
9175 
9176 /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
9177 utf = (options & PCRE_UTF8) != 0;
9178 if (utf && never_utf)
9179   {
9180   errorcode = ERR78;
9181   goto PCRE_EARLY_ERROR_RETURN2;
9182   }
9183 
9184 /* Can't support UTF unless PCRE has been compiled to include the code. The
9185 return of an error code from PRIV(valid_utf)() is a new feature, introduced in
9186 release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
9187 not used here. */
9188 
9189 #ifdef SUPPORT_UTF
9190 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&
9191      (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
9192   {
9193 #if defined COMPILE_PCRE8
9194   errorcode = ERR44;
9195 #elif defined COMPILE_PCRE16
9196   errorcode = ERR74;
9197 #elif defined COMPILE_PCRE32
9198   errorcode = ERR77;
9199 #endif
9200   goto PCRE_EARLY_ERROR_RETURN2;
9201   }
9202 #else
9203 if (utf)
9204   {
9205   errorcode = ERR32;
9206   goto PCRE_EARLY_ERROR_RETURN;
9207   }
9208 #endif
9209 
9210 /* Can't support UCP unless PCRE has been compiled to include the code. */
9211 
9212 #ifndef SUPPORT_UCP
9213 if ((options & PCRE_UCP) != 0)
9214   {
9215   errorcode = ERR67;
9216   goto PCRE_EARLY_ERROR_RETURN;
9217   }
9218 #endif
9219 
9220 /* Check validity of \R options. */
9221 
9222 if ((options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) ==
9223      (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
9224   {
9225   errorcode = ERR56;
9226   goto PCRE_EARLY_ERROR_RETURN;
9227   }
9228 
9229 /* Handle different types of newline. The three bits give seven cases. The
9230 current code allows for fixed one- or two-byte sequences, plus "any" and
9231 "anycrlf". */
9232 
9233 switch (options & PCRE_NEWLINE_BITS)
9234   {
9235   case 0: newline = NEWLINE; break;   /* Build-time default */
9236   case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
9237   case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
9238   case PCRE_NEWLINE_CR+
9239        PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
9240   case PCRE_NEWLINE_ANY: newline = -1; break;
9241   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
9242   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
9243   }
9244 
9245 if (newline == -2)
9246   {
9247   cd->nltype = NLTYPE_ANYCRLF;
9248   }
9249 else if (newline < 0)
9250   {
9251   cd->nltype = NLTYPE_ANY;
9252   }
9253 else
9254   {
9255   cd->nltype = NLTYPE_FIXED;
9256   if (newline > 255)
9257     {
9258     cd->nllen = 2;
9259     cd->nl[0] = (newline >> 8) & 255;
9260     cd->nl[1] = newline & 255;
9261     }
9262   else
9263     {
9264     cd->nllen = 1;
9265     cd->nl[0] = newline;
9266     }
9267   }
9268 
9269 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
9270 references to help in deciding whether (.*) can be treated as anchored or not.
9271 */
9272 
9273 cd->top_backref = 0;
9274 cd->backref_map = 0;
9275 
9276 /* Reflect pattern for debugging output */
9277 
9278 DPRINTF(("------------------------------------------------------------------\n"));
9279 #ifdef PCRE_DEBUG
9280 print_puchar(stdout, (PCRE_PUCHAR)pattern);
9281 #endif
9282 DPRINTF(("\n"));
9283 
9284 /* Pretend to compile the pattern while actually just accumulating the length
9285 of memory required. This behaviour is triggered by passing a non-NULL final
9286 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
9287 to compile parts of the pattern into; the compiled code is discarded when it is
9288 no longer needed, so hopefully this workspace will never overflow, though there
9289 is a test for its doing so. */
9290 
9291 cd->bracount = cd->final_bracount = 0;
9292 cd->names_found = 0;
9293 cd->name_entry_size = 0;
9294 cd->name_table = NULL;
9295 cd->dupnames = FALSE;
9296 cd->dupgroups = FALSE;
9297 cd->namedrefcount = 0;
9298 cd->start_code = cworkspace;
9299 cd->hwm = cworkspace;
9300 cd->iscondassert = FALSE;
9301 cd->start_workspace = cworkspace;
9302 cd->workspace_size = COMPILE_WORK_SIZE;
9303 cd->named_groups = named_groups;
9304 cd->named_group_list_size = NAMED_GROUP_LIST_SIZE;
9305 cd->start_pattern = (const pcre_uchar *)pattern;
9306 cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
9307 cd->req_varyopt = 0;
9308 cd->parens_depth = 0;
9309 cd->assert_depth = 0;
9310 cd->max_lookbehind = 0;
9311 cd->external_options = options;
9312 cd->open_caps = NULL;
9313 
9314 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
9315 don't need to look at the result of the function here. The initial options have
9316 been put into the cd block so that they can be changed if an option setting is
9317 found within the regex right at the beginning. Bringing initial option settings
9318 outside can help speed up starting point checks. */
9319 
9320 ptr += skipatstart;
9321 code = cworkspace;
9322 *code = OP_BRA;
9323 
9324 (void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
9325   FALSE, 0, 0, &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL,
9326   cd, &length);
9327 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
9328 
9329 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
9330   (int)(cd->hwm - cworkspace)));
9331 
9332 if (length > MAX_PATTERN_SIZE)
9333   {
9334   errorcode = ERR20;
9335   goto PCRE_EARLY_ERROR_RETURN;
9336   }
9337 
9338 /* Compute the size of the data block for storing the compiled pattern. Integer
9339 overflow should no longer be possible because nowadays we limit the maximum
9340 value of cd->names_found and cd->name_entry_size. */
9341 
9342 size = sizeof(REAL_PCRE) +
9343   (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
9344 
9345 /* Get the memory. */
9346 
9347 re = (REAL_PCRE *)(PUBL(malloc))(size);
9348 if (re == NULL)
9349   {
9350   errorcode = ERR21;
9351   goto PCRE_EARLY_ERROR_RETURN;
9352   }
9353 
9354 /* Put in the magic number, and save the sizes, initial options, internal
9355 flags, and character table pointer. NULL is used for the default character
9356 tables. The nullpad field is at the end; it's there to help in the case when a
9357 regex compiled on a system with 4-byte pointers is run on another with 8-byte
9358 pointers. */
9359 
9360 re->magic_number = MAGIC_NUMBER;
9361 re->size = (int)size;
9362 re->options = cd->external_options;
9363 re->flags = cd->external_flags;
9364 re->limit_match = limit_match;
9365 re->limit_recursion = limit_recursion;
9366 re->first_char = 0;
9367 re->req_char = 0;
9368 re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar);
9369 re->name_entry_size = cd->name_entry_size;
9370 re->name_count = cd->names_found;
9371 re->ref_count = 0;
9372 re->tables = (tables == PRIV(default_tables))? NULL : tables;
9373 re->nullpad = NULL;
9374 #ifdef COMPILE_PCRE32
9375 re->dummy = 0;
9376 #else
9377 re->dummy1 = re->dummy2 = re->dummy3 = 0;
9378 #endif
9379 
9380 /* The starting points of the name/number translation table and of the code are
9381 passed around in the compile data block. The start/end pattern and initial
9382 options are already set from the pre-compile phase, as is the name_entry_size
9383 field. Reset the bracket count and the names_found field. Also reset the hwm
9384 field; this time it's used for remembering forward references to subpatterns.
9385 */
9386 
9387 cd->final_bracount = cd->bracount;  /* Save for checking forward references */
9388 cd->parens_depth = 0;
9389 cd->assert_depth = 0;
9390 cd->bracount = 0;
9391 cd->max_lookbehind = 0;
9392 cd->name_table = (pcre_uchar *)re + re->name_table_offset;
9393 codestart = cd->name_table + re->name_entry_size * re->name_count;
9394 cd->start_code = codestart;
9395 cd->hwm = (pcre_uchar *)(cd->start_workspace);
9396 cd->iscondassert = FALSE;
9397 cd->req_varyopt = 0;
9398 cd->had_accept = FALSE;
9399 cd->had_pruneorskip = FALSE;
9400 cd->check_lookbehind = FALSE;
9401 cd->open_caps = NULL;
9402 
9403 /* If any named groups were found, create the name/number table from the list
9404 created in the first pass. */
9405 
9406 if (cd->names_found > 0)
9407   {
9408   int i = cd->names_found;
9409   named_group *ng = cd->named_groups;
9410   cd->names_found = 0;
9411   for (; i > 0; i--, ng++)
9412     add_name(cd, ng->name, ng->length, ng->number);
9413   if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
9414     (PUBL(free))((void *)cd->named_groups);
9415   }
9416 
9417 /* Set up a starting, non-extracting bracket, then compile the expression. On
9418 error, errorcode will be set non-zero, so we don't need to look at the result
9419 of the function here. */
9420 
9421 ptr = (const pcre_uchar *)pattern + skipatstart;
9422 code = (pcre_uchar *)codestart;
9423 *code = OP_BRA;
9424 (void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, 0, 0,
9425   &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL, cd, NULL);
9426 re->top_bracket = cd->bracount;
9427 re->top_backref = cd->top_backref;
9428 re->max_lookbehind = cd->max_lookbehind;
9429 re->flags = cd->external_flags | PCRE_MODE;
9430 
9431 if (cd->had_accept)
9432   {
9433   reqchar = 0;              /* Must disable after (*ACCEPT) */
9434   reqcharflags = REQ_NONE;
9435   }
9436 
9437 /* If not reached end of pattern on success, there's an excess bracket. */
9438 
9439 if (errorcode == 0 && *ptr != CHAR_NULL) errorcode = ERR22;
9440 
9441 /* Fill in the terminating state and check for disastrous overflow, but
9442 if debugging, leave the test till after things are printed out. */
9443 
9444 *code++ = OP_END;
9445 
9446 #ifndef PCRE_DEBUG
9447 if (code - codestart > length) errorcode = ERR23;
9448 #endif
9449 
9450 #ifdef SUPPORT_VALGRIND
9451 /* If the estimated length exceeds the really used length, mark the extra
9452 allocated memory as unaddressable, so that any out-of-bound reads can be
9453 detected. */
9454 VALGRIND_MAKE_MEM_NOACCESS(code, (length - (code - codestart)) * sizeof(pcre_uchar));
9455 #endif
9456 
9457 /* Fill in any forward references that are required. There may be repeated
9458 references; optimize for them, as searching a large regex takes time. */
9459 
9460 if (cd->hwm > cd->start_workspace)
9461   {
9462   int prev_recno = -1;
9463   const pcre_uchar *groupptr = NULL;
9464   while (errorcode == 0 && cd->hwm > cd->start_workspace)
9465     {
9466     int offset, recno;
9467     cd->hwm -= LINK_SIZE;
9468     offset = GET(cd->hwm, 0);
9469 
9470     /* Check that the hwm handling hasn't gone wrong. This whole area is
9471     rewritten in PCRE2 because there are some obscure cases. */
9472 
9473     if (offset == 0 || codestart[offset-1] != OP_RECURSE)
9474       {
9475       errorcode = ERR10;
9476       break;
9477       }
9478 
9479     recno = GET(codestart, offset);
9480     if (recno != prev_recno)
9481       {
9482       groupptr = PRIV(find_bracket)(codestart, utf, recno);
9483       prev_recno = recno;
9484       }
9485     if (groupptr == NULL) errorcode = ERR53;
9486       else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));
9487     }
9488   }
9489 
9490 /* If the workspace had to be expanded, free the new memory. Set the pointer to
9491 NULL to indicate that forward references have been filled in. */
9492 
9493 if (cd->workspace_size > COMPILE_WORK_SIZE)
9494   (PUBL(free))((void *)cd->start_workspace);
9495 cd->start_workspace = NULL;
9496 
9497 /* Give an error if there's a back reference to a non-existent capturing
9498 subpattern. */
9499 
9500 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
9501 
9502 /* Unless disabled, check whether any single character iterators can be
9503 auto-possessified. The function overwrites the appropriate opcode values, so
9504 the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
9505 used in this code because at least one compiler gives a warning about loss of
9506 "const" attribute if the cast (pcre_uchar *)codestart is used directly in the
9507 function call. */
9508 
9509 if (errorcode == 0 && (options & PCRE_NO_AUTO_POSSESS) == 0)
9510   {
9511   pcre_uchar *temp = (pcre_uchar *)codestart;
9512   auto_possessify(temp, utf, cd);
9513   }
9514 
9515 /* If there were any lookbehind assertions that contained OP_RECURSE
9516 (recursions or subroutine calls), a flag is set for them to be checked here,
9517 because they may contain forward references. Actual recursions cannot be fixed
9518 length, but subroutine calls can. It is done like this so that those without
9519 OP_RECURSE that are not fixed length get a diagnostic with a useful offset. The
9520 exceptional ones forgo this. We scan the pattern to check that they are fixed
9521 length, and set their lengths. */
9522 
9523 if (errorcode == 0 && cd->check_lookbehind)
9524   {
9525   pcre_uchar *cc = (pcre_uchar *)codestart;
9526 
9527   /* Loop, searching for OP_REVERSE items, and process those that do not have
9528   their length set. (Actually, it will also re-process any that have a length
9529   of zero, but that is a pathological case, and it does no harm.) When we find
9530   one, we temporarily terminate the branch it is in while we scan it. */
9531 
9532   for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf, -1);
9533        cc != NULL;
9534        cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf, -1))
9535     {
9536     if (GET(cc, 1) == 0)
9537       {
9538       int fixed_length;
9539       pcre_uchar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
9540       int end_op = *be;
9541       *be = OP_END;
9542       fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != 0, TRUE,
9543         cd, NULL);
9544       *be = end_op;
9545       DPRINTF(("fixed length = %d\n", fixed_length));
9546       if (fixed_length < 0)
9547         {
9548         errorcode = (fixed_length == -2)? ERR36 :
9549                     (fixed_length == -4)? ERR70 : ERR25;
9550         break;
9551         }
9552       if (fixed_length > cd->max_lookbehind) cd->max_lookbehind = fixed_length;
9553       PUT(cc, 1, fixed_length);
9554       }
9555     cc += 1 + LINK_SIZE;
9556     }
9557   }
9558 
9559 /* Failed to compile, or error while post-processing */
9560 
9561 if (errorcode != 0)
9562   {
9563   (PUBL(free))(re);
9564   PCRE_EARLY_ERROR_RETURN:
9565   *erroroffset = (int)(ptr - (const pcre_uchar *)pattern);
9566   PCRE_EARLY_ERROR_RETURN2:
9567   *errorptr = find_error_text(errorcode);
9568   if (errorcodeptr != NULL) *errorcodeptr = errorcode;
9569   return NULL;
9570   }
9571 
9572 /* If the anchored option was not passed, set the flag if we can determine that
9573 the pattern is anchored by virtue of ^ characters or \A or anything else, such
9574 as starting with non-atomic .* when DOTALL is set and there are no occurrences
9575 of *PRUNE or *SKIP.
9576 
9577 Otherwise, if we know what the first byte has to be, save it, because that
9578 speeds up unanchored matches no end. If not, see if we can set the
9579 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
9580 start with ^, and also when all branches start with non-atomic .* for
9581 non-DOTALL matches when *PRUNE and *SKIP are not present. */
9582 
9583 if ((re->options & PCRE_ANCHORED) == 0)
9584   {
9585   if (is_anchored(codestart, 0, cd, 0)) re->options |= PCRE_ANCHORED;
9586   else
9587     {
9588     if (firstcharflags < 0)
9589       firstchar = find_firstassertedchar(codestart, &firstcharflags, FALSE);
9590     if (firstcharflags >= 0)   /* Remove caseless flag for non-caseable chars */
9591       {
9592 #if defined COMPILE_PCRE8
9593       re->first_char = firstchar & 0xff;
9594 #elif defined COMPILE_PCRE16
9595       re->first_char = firstchar & 0xffff;
9596 #elif defined COMPILE_PCRE32
9597       re->first_char = firstchar;
9598 #endif
9599       if ((firstcharflags & REQ_CASELESS) != 0)
9600         {
9601 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
9602         /* We ignore non-ASCII first chars in 8 bit mode. */
9603         if (utf)
9604           {
9605           if (re->first_char < 128)
9606             {
9607             if (cd->fcc[re->first_char] != re->first_char)
9608               re->flags |= PCRE_FCH_CASELESS;
9609             }
9610           else if (UCD_OTHERCASE(re->first_char) != re->first_char)
9611             re->flags |= PCRE_FCH_CASELESS;
9612           }
9613         else
9614 #endif
9615         if (MAX_255(re->first_char)
9616             && cd->fcc[re->first_char] != re->first_char)
9617           re->flags |= PCRE_FCH_CASELESS;
9618         }
9619 
9620       re->flags |= PCRE_FIRSTSET;
9621       }
9622 
9623     else if (is_startline(codestart, 0, cd, 0)) re->flags |= PCRE_STARTLINE;
9624     }
9625   }
9626 
9627 /* For an anchored pattern, we use the "required byte" only if it follows a
9628 variable length item in the regex. Remove the caseless flag for non-caseable
9629 bytes. */
9630 
9631 if (reqcharflags >= 0 &&
9632      ((re->options & PCRE_ANCHORED) == 0 || (reqcharflags & REQ_VARY) != 0))
9633   {
9634 #if defined COMPILE_PCRE8
9635   re->req_char = reqchar & 0xff;
9636 #elif defined COMPILE_PCRE16
9637   re->req_char = reqchar & 0xffff;
9638 #elif defined COMPILE_PCRE32
9639   re->req_char = reqchar;
9640 #endif
9641   if ((reqcharflags & REQ_CASELESS) != 0)
9642     {
9643 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
9644     /* We ignore non-ASCII required chars in 8 bit mode. */
9645     if (utf)
9646       {
9647       if (re->req_char < 128)
9648         {
9649         if (cd->fcc[re->req_char] != re->req_char)
9650           re->flags |= PCRE_RCH_CASELESS;
9651         }
9652       else if (UCD_OTHERCASE(re->req_char) != re->req_char)
9653         re->flags |= PCRE_RCH_CASELESS;
9654       }
9655     else
9656 #endif
9657     if (MAX_255(re->req_char) && cd->fcc[re->req_char] != re->req_char)
9658       re->flags |= PCRE_RCH_CASELESS;
9659     }
9660 
9661   re->flags |= PCRE_REQCHSET;
9662   }
9663 
9664 /* Print out the compiled data if debugging is enabled. This is never the
9665 case when building a production library. */
9666 
9667 #ifdef PCRE_DEBUG
9668 printf("Length = %d top_bracket = %d top_backref = %d\n",
9669   length, re->top_bracket, re->top_backref);
9670 
9671 printf("Options=%08x\n", re->options);
9672 
9673 if ((re->flags & PCRE_FIRSTSET) != 0)
9674   {
9675   pcre_uchar ch = re->first_char;
9676   const char *caseless =
9677     ((re->flags & PCRE_FCH_CASELESS) == 0)? "" : " (caseless)";
9678   if (PRINTABLE(ch)) printf("First char = %c%s\n", ch, caseless);
9679     else printf("First char = \\x%02x%s\n", ch, caseless);
9680   }
9681 
9682 if ((re->flags & PCRE_REQCHSET) != 0)
9683   {
9684   pcre_uchar ch = re->req_char;
9685   const char *caseless =
9686     ((re->flags & PCRE_RCH_CASELESS) == 0)? "" : " (caseless)";
9687   if (PRINTABLE(ch)) printf("Req char = %c%s\n", ch, caseless);
9688     else printf("Req char = \\x%02x%s\n", ch, caseless);
9689   }
9690 
9691 #if defined COMPILE_PCRE8
9692 pcre_printint((pcre *)re, stdout, TRUE);
9693 #elif defined COMPILE_PCRE16
9694 pcre16_printint((pcre *)re, stdout, TRUE);
9695 #elif defined COMPILE_PCRE32
9696 pcre32_printint((pcre *)re, stdout, TRUE);
9697 #endif
9698 
9699 /* This check is done here in the debugging case so that the code that
9700 was compiled can be seen. */
9701 
9702 if (code - codestart > length)
9703   {
9704   (PUBL(free))(re);
9705   *errorptr = find_error_text(ERR23);
9706   *erroroffset = ptr - (pcre_uchar *)pattern;
9707   if (errorcodeptr != NULL) *errorcodeptr = ERR23;
9708   return NULL;
9709   }
9710 #endif   /* PCRE_DEBUG */
9711 
9712 /* Check for a pattern that can match an empty string, so that this information
9713 can be provided to applications. */
9714 
9715 do
9716   {
9717   if (could_be_empty_branch(codestart, code, utf, cd, NULL))
9718     {
9719     re->flags |= PCRE_MATCH_EMPTY;
9720     break;
9721     }
9722   codestart += GET(codestart, 1);
9723   }
9724 while (*codestart == OP_ALT);
9725 
9726 #if defined COMPILE_PCRE8
9727 return (pcre *)re;
9728 #elif defined COMPILE_PCRE16
9729 return (pcre16 *)re;
9730 #elif defined COMPILE_PCRE32
9731 return (pcre32 *)re;
9732 #endif
9733 }
9734 
9735 /* End of pcre_compile.c */
9736