xref: /PHP-7.1/ext/pcre/pcrelib/pcre_compile.c (revision 7bf1f9d5)
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9            Copyright (c) 1997-2014 University of Cambridge
10 
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14 
15     * Redistributions of source code must retain the above copyright notice,
16       this list of conditions and the following disclaimer.
17 
18     * Redistributions in binary form must reproduce the above copyright
19       notice, this list of conditions and the following disclaimer in the
20       documentation and/or other materials provided with the distribution.
21 
22     * Neither the name of the University of Cambridge nor the names of its
23       contributors may be used to endorse or promote products derived from
24       this software without specific prior written permission.
25 
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39 
40 
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43 
44 
45 #include "config.h"
46 
47 #define NLBLOCK cd             /* Block containing newline information */
48 #define PSSTART start_pattern  /* Field containing pattern start */
49 #define PSEND   end_pattern    /* Field containing pattern end */
50 
51 #include "pcre_internal.h"
52 
53 
54 /* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
55 is also used by pcretest. PCRE_DEBUG is not defined when building a production
56 library. We do not need to select pcre16_printint.c specially, because the
57 COMPILE_PCREx macro will already be appropriately set. */
58 
59 #ifdef PCRE_DEBUG
60 /* pcre_printint.c should not include any headers */
61 #define PCRE_INCLUDED
62 #include "pcre_printint.c"
63 #undef PCRE_INCLUDED
64 #endif
65 
66 
/* Macro for setting an individual bit in a class bitmap. The bitmap "a" is an
array of 8-bit units: b/8 selects the unit and b&7 selects the bit within it.
The array argument and the complete expansion are parenthesized so that the
macro behaves as one expression whatever is passed in (for example a pointer
expression such as p + n). Note that "b" is evaluated twice, so it must not
have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)&7)))
70 
/* Upper bound used when checking that the integer holding the compiled
pattern length has not overflowed. It is set a little below INT_MAX so that
the bytes that terminate a group can be added in without having to repeat the
check every time. */

#define OFLOW_MAX (INT_MAX - 20)
77 
78 /* Definitions to allow mutual recursion */
79 
80 static int
81   add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
82     const pcre_uint32 *, unsigned int);
83 
84 static BOOL
85   compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
86     pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
87     compile_data *, int *);
88 
89 
90 
91 /*************************************************
92 *      Code parameters and static tables         *
93 *************************************************/
94 
95 /* This value specifies the size of stack workspace that is used during the
96 first pre-compile phase that determines how much memory is required. The regex
97 is partly compiled into this space, but the compiled parts are discarded as
98 soon as they can be, so that hopefully there will never be an overrun. The code
99 does, however, check for an overrun. The largest amount I've seen used is 218,
100 so this number is very generous.
101 
102 The same workspace is used during the second, actual compile phase for
103 remembering forward references to groups so that they can be filled in at the
104 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
105 is 4 there is plenty of room for most patterns. However, the memory can get
106 filled up by repetitions of forward references, for example patterns like
107 /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
108 that the workspace is expanded using malloc() in this situation. The value
109 below is therefore a minimum, and we put a maximum on it for safety. The
110 minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
111 kicks in at the same number of forward references in all cases. */
112 
/* Initial (minimum) workspace size, scaled by LINK_SIZE. */
#define COMPILE_WORK_SIZE     (2048 * LINK_SIZE)

/* Ceiling on the workspace size: one hundred times the minimum. */
#define COMPILE_WORK_SIZE_MAX (100 * COMPILE_WORK_SIZE)
115 
/* Number of slots in the initial vector that remembers named groups during
the pre-compile. The vector starts out on the stack; if it turns out to be too
small it is expanded using malloc(), in the same way as the workspace. */

#define NAMED_GROUP_LIST_SIZE 20
122 
/* The overrun tests compare against a size that is smaller by this margin, so
that an overrun is detected before anything actually runs off the end of the
data block. */

#define WORK_SIZE_SAFETY_MARGIN  (100)
127 
/* Private flag bits that are added to firstchar and reqchar. */

#define REQ_CASELESS    0x00000001      /* Indicates caselessness */
#define REQ_VARY        0x00000002      /* Reqchar followed non-literal item */

/* Negative values for the firstchar and reqchar flags. */

#define REQ_UNSET       (-2)
#define REQ_NONE        (-1)
135 
/* Flag for repeated characters. The constant has type long; the suffix is
written in upper case so that it cannot be mistaken for the digit one. */

#define UTF_LENGTH     0x10000000L      /* The char contains its length. */
139 
140 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
141 are simple data values; negative values are for special things like \d and so
142 on. Zero means further processing is needed (for things like \x), or the escape
143 is invalid. */
144 
145 #ifndef EBCDIC
146 
147 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
148 in UTF-8 mode. */
149 
150 static const short int escapes[] = {
151      0,                       0,
152      0,                       0,
153      0,                       0,
154      0,                       0,
155      0,                       0,
156      CHAR_COLON,              CHAR_SEMICOLON,
157      CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
158      CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
159      CHAR_COMMERCIAL_AT,      -ESC_A,
160      -ESC_B,                  -ESC_C,
161      -ESC_D,                  -ESC_E,
162      0,                       -ESC_G,
163      -ESC_H,                  0,
164      0,                       -ESC_K,
165      0,                       0,
166      -ESC_N,                  0,
167      -ESC_P,                  -ESC_Q,
168      -ESC_R,                  -ESC_S,
169      0,                       0,
170      -ESC_V,                  -ESC_W,
171      -ESC_X,                  0,
172      -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
173      CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
174      CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
175      CHAR_GRAVE_ACCENT,       ESC_a,
176      -ESC_b,                  0,
177      -ESC_d,                  ESC_e,
178      ESC_f,                   0,
179      -ESC_h,                  0,
180      0,                       -ESC_k,
181      0,                       0,
182      ESC_n,                   0,
183      -ESC_p,                  0,
184      ESC_r,                   -ESC_s,
185      ESC_tee,                 0,
186      -ESC_v,                  -ESC_w,
187      0,                       0,
188      -ESC_z
189 };
190 
191 #else
192 
193 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
194 
195 static const short int escapes[] = {
196 /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
197 /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
198 /*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
199 /*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
200 /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
201 /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
202 /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
203 /*  80 */     0, ESC_a, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
204 /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
205 /*  90 */     0,     0, -ESC_k,       0,      0, ESC_n,      0, -ESC_p,
206 /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
207 /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
208 /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
209 /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
210 /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
211 /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
212 /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
213 /*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
214 /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
215 /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
216 /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
217 /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
218 /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
219 };
220 
/* We also need a table of characters that may follow \c in an EBCDIC
environment for characters 0-31. The table is only ever looked up, so it is
declared const, in line with the other static lookup tables in this file. */

static const unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
225 
226 #endif
227 
228 
229 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
230 searched linearly. Put all the names into a single string, in order to reduce
231 the number of relocations when a shared library is dynamically linked. The
232 string is built from string macros so that it works in UTF-8 mode on EBCDIC
233 platforms. */
234 
/* Description of one verb: the length of its name and the opcodes to use
without and with an argument. */

typedef struct verbitem {
  int len;      /* Number of characters in the verb name */
  int op;       /* Opcode when there is no argument; -1 if an argument is mandatory */
  int op_arg;   /* Opcode when an argument is present; -1 if none is allowed */
} verbitem;
240 
241 static const char verbnames[] =
242   "\0"                       /* Empty name is a shorthand for MARK */
243   STRING_MARK0
244   STRING_ACCEPT0
245   STRING_COMMIT0
246   STRING_F0
247   STRING_FAIL0
248   STRING_PRUNE0
249   STRING_SKIP0
250   STRING_THEN;
251 
252 static const verbitem verbs[] = {
253   { 0, -1,        OP_MARK },
254   { 4, -1,        OP_MARK },
255   { 6, OP_ACCEPT, -1 },
256   { 6, OP_COMMIT, -1 },
257   { 1, OP_FAIL,   -1 },
258   { 4, OP_FAIL,   -1 },
259   { 5, OP_PRUNE,  OP_PRUNE_ARG },
260   { 4, OP_SKIP,   OP_SKIP_ARG  },
261   { 4, OP_THEN,   OP_THEN_ARG  }
262 };
263 
264 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
265 
266 
267 /* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in
268 another regex library. */
269 
270 static const pcre_uchar sub_start_of_word[] = {
271   CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
272   CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' };
273 
274 static const pcre_uchar sub_end_of_word[] = {
275   CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
276   CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
277   CHAR_RIGHT_PARENTHESIS, '\0' };
278 
279 
280 /* Tables of names of POSIX character classes and their lengths. The names are
281 now all in a single string, to reduce the number of relocations when a shared
282 library is dynamically loaded. The list of lengths is terminated by a zero
283 length entry. The first three must be alpha, lower, upper, as this is assumed
284 for handling case independence. The indices for graph, print, and punct are
285 needed, so identify them. */
286 
287 static const char posix_names[] =
288   STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
289   STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
290   STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
291   STRING_word0  STRING_xdigit;
292 
293 static const pcre_uint8 posix_name_lengths[] = {
294   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
295 
/* Positions of graph, print, and punct in the list of POSIX class names. */
#define PC_GRAPH   8
#define PC_PRINT   9
#define PC_PUNCT  10
299 
300 
301 /* Table of class bit maps for each POSIX class. Each class is formed from a
302 base map, with an optional addition or removal of another map. Then, for some
303 classes, there is some additional tweaking: for [:blank:] the vertical space
304 characters are removed, and for [:alpha:] and [:alnum:] the underscore
305 character is removed. The triples in the table consist of the base map offset,
306 second map offset or -1 if no second map, and a non-negative value for map
307 addition or a negative value for map subtraction (if there are two maps). The
308 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
309 remove vertical space characters, 2 => remove underscore. */
310 
311 static const int posix_class_maps[] = {
312   cbit_word,  cbit_digit, -2,             /* alpha */
313   cbit_lower, -1,          0,             /* lower */
314   cbit_upper, -1,          0,             /* upper */
315   cbit_word,  -1,          2,             /* alnum - word without underscore */
316   cbit_print, cbit_cntrl,  0,             /* ascii */
317   cbit_space, -1,          1,             /* blank - a GNU extension */
318   cbit_cntrl, -1,          0,             /* cntrl */
319   cbit_digit, -1,          0,             /* digit */
320   cbit_graph, -1,          0,             /* graph */
321   cbit_print, -1,          0,             /* print */
322   cbit_punct, -1,          0,             /* punct */
323   cbit_space, -1,          0,             /* space */
324   cbit_word,  -1,          0,             /* word - a Perl extension */
325   cbit_xdigit,-1,          0              /* xdigit */
326 };
327 
328 /* Table of substitutes for \d etc when PCRE_UCP is set. They are replaced by
329 Unicode property escapes. */
330 
331 #ifdef SUPPORT_UCP
332 static const pcre_uchar string_PNd[]  = {
333   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
334   CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
335 static const pcre_uchar string_pNd[]  = {
336   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
337   CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
338 static const pcre_uchar string_PXsp[] = {
339   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
340   CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
341 static const pcre_uchar string_pXsp[] = {
342   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
343   CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
344 static const pcre_uchar string_PXwd[] = {
345   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
346   CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
347 static const pcre_uchar string_pXwd[] = {
348   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
349   CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
350 
351 static const pcre_uchar *substitutes[] = {
352   string_PNd,           /* \D */
353   string_pNd,           /* \d */
354   string_PXsp,          /* \S */   /* Xsp is Perl space, but from 8.34, Perl */
355   string_pXsp,          /* \s */   /* space and POSIX space are the same. */
356   string_PXwd,          /* \W */
357   string_pXwd           /* \w */
358 };
359 
360 /* The POSIX class substitutes must be in the order of the POSIX class names,
361 defined above, and there are both positive and negative cases. NULL means no
362 general substitute of a Unicode property escape (\p or \P). However, for some
363 POSIX classes (e.g. graph, print, punct) a special property code is compiled
364 directly. */
365 
366 static const pcre_uchar string_pL[] =   {
367   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
368   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
369 static const pcre_uchar string_pLl[] =  {
370   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
371   CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
372 static const pcre_uchar string_pLu[] =  {
373   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
374   CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
375 static const pcre_uchar string_pXan[] = {
376   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
377   CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
378 static const pcre_uchar string_h[] =    {
379   CHAR_BACKSLASH, CHAR_h, '\0' };
380 static const pcre_uchar string_pXps[] = {
381   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
382   CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
383 static const pcre_uchar string_PL[] =   {
384   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
385   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
386 static const pcre_uchar string_PLl[] =  {
387   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
388   CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
389 static const pcre_uchar string_PLu[] =  {
390   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
391   CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
392 static const pcre_uchar string_PXan[] = {
393   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
394   CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
395 static const pcre_uchar string_H[] =    {
396   CHAR_BACKSLASH, CHAR_H, '\0' };
397 static const pcre_uchar string_PXps[] = {
398   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
399   CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
400 
401 static const pcre_uchar *posix_substitutes[] = {
402   string_pL,            /* alpha */
403   string_pLl,           /* lower */
404   string_pLu,           /* upper */
405   string_pXan,          /* alnum */
406   NULL,                 /* ascii */
407   string_h,             /* blank */
408   NULL,                 /* cntrl */
409   string_pNd,           /* digit */
410   NULL,                 /* graph */
411   NULL,                 /* print */
412   NULL,                 /* punct */
413   string_pXps,          /* space */   /* Xps is POSIX space, but from 8.34 */
414   string_pXwd,          /* word  */   /* Perl and POSIX space are the same */
415   NULL,                 /* xdigit */
416   /* Negated cases */
417   string_PL,            /* ^alpha */
418   string_PLl,           /* ^lower */
419   string_PLu,           /* ^upper */
420   string_PXan,          /* ^alnum */
421   NULL,                 /* ^ascii */
422   string_H,             /* ^blank */
423   NULL,                 /* ^cntrl */
424   string_PNd,           /* ^digit */
425   NULL,                 /* ^graph */
426   NULL,                 /* ^print */
427   NULL,                 /* ^punct */
428   string_PXps,          /* ^space */  /* Xps is POSIX space, but from 8.34 */
429   string_PXwd,          /* ^word */   /* Perl and POSIX space are the same */
430   NULL                  /* ^xdigit */
431 };
/* Number of entries in posix_substitutes. */
#define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(posix_substitutes[0]))
433 #endif
434 
/* STRING() turns its argument into a string literal exactly as written.
XSTRING() adds a level of macro expansion, so an argument that is itself a
macro is replaced by its value before being turned into a string. */

#define STRING(a)   #a
#define XSTRING(s)  STRING(s)
437 
438 /* The texts of compile-time error messages. These are "char *" because they
439 are passed to the outside world. Do not ever re-use any error number, because
440 they are documented. Always add a new error instead. Messages marked DEAD below
441 are no longer used. This used to be a table of strings, but in order to reduce
442 the number of relocations needed when a shared library is loaded dynamically,
443 it is now one long string. We cannot use a table of offsets, because the
444 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
445 simply count through to the one we want - this isn't a performance issue
446 because these strings are used only when there is a compilation error.
447 
448 Each substring ends with \0 to insert a null character. This includes the final
449 substring, so that the whole string ends with \0\0, which can be detected when
450 counting through. */
451 
452 static const char error_texts[] =
453   "no error\0"
454   "\\ at end of pattern\0"
455   "\\c at end of pattern\0"
456   "unrecognized character follows \\\0"
457   "numbers out of order in {} quantifier\0"
458   /* 5 */
459   "number too big in {} quantifier\0"
460   "missing terminating ] for character class\0"
461   "invalid escape sequence in character class\0"
462   "range out of order in character class\0"
463   "nothing to repeat\0"
464   /* 10 */
465   "internal error: invalid forward reference offset\0"
466   "internal error: unexpected repeat\0"
467   "unrecognized character after (? or (?-\0"
468   "POSIX named classes are supported only within a class\0"
469   "missing )\0"
470   /* 15 */
471   "reference to non-existent subpattern\0"
472   "erroffset passed as NULL\0"
473   "unknown option bit(s) set\0"
474   "missing ) after comment\0"
475   "parentheses nested too deeply\0"  /** DEAD **/
476   /* 20 */
477   "regular expression is too large\0"
478   "failed to get memory\0"
479   "unmatched parentheses\0"
480   "internal error: code overflow\0"
481   "unrecognized character after (?<\0"
482   /* 25 */
483   "lookbehind assertion is not fixed length\0"
484   "malformed number or name after (?(\0"
485   "conditional group contains more than two branches\0"
486   "assertion expected after (?( or (?(?C)\0"
487   "(?R or (?[+-]digits must be followed by )\0"
488   /* 30 */
489   "unknown POSIX class name\0"
490   "POSIX collating elements are not supported\0"
491   "this version of PCRE is compiled without UTF support\0"
492   "spare error\0"  /** DEAD **/
493   "character value in \\x{} or \\o{} is too large\0"
494   /* 35 */
495   "invalid condition (?(0)\0"
496   "\\C not allowed in lookbehind assertion\0"
497   "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
498   "number after (?C is > 255\0"
499   "closing ) for (?C expected\0"
500   /* 40 */
501   "recursive call could loop indefinitely\0"
502   "unrecognized character after (?P\0"
503   "syntax error in subpattern name (missing terminator)\0"
504   "two named subpatterns have the same name\0"
505   "invalid UTF-8 string\0"
506   /* 45 */
507   "support for \\P, \\p, and \\X has not been compiled\0"
508   "malformed \\P or \\p sequence\0"
509   "unknown property name after \\P or \\p\0"
510   "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
511   "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
512   /* 50 */
513   "repeated subpattern is too long\0"    /** DEAD **/
514   "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
515   "internal error: overran compiling workspace\0"
516   "internal error: previously-checked referenced subpattern not found\0"
517   "DEFINE group contains more than one branch\0"
518   /* 55 */
519   "repeating a DEFINE group is not allowed\0"  /** DEAD **/
520   "inconsistent NEWLINE options\0"
521   "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
522   "a numbered reference must not be zero\0"
523   "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
524   /* 60 */
525   "(*VERB) not recognized or malformed\0"
526   "number is too big\0"
527   "subpattern name expected\0"
528   "digit expected after (?+\0"
529   "] is an invalid data character in JavaScript compatibility mode\0"
530   /* 65 */
531   "different names for subpatterns of the same number are not allowed\0"
532   "(*MARK) must have an argument\0"
533   "this version of PCRE is not compiled with Unicode property support\0"
534 #ifndef EBCDIC
535   "\\c must be followed by an ASCII character\0"
536 #else
537   "\\c must be followed by a letter or one of [\\]^_?\0"
538 #endif
539   "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
540   /* 70 */
541   "internal error: unknown opcode in find_fixedlength()\0"
542   "\\N is not supported in a class\0"
543   "too many forward references\0"
544   "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
545   "invalid UTF-16 string\0"
546   /* 75 */
547   "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
548   "character value in \\u.... sequence is too large\0"
549   "invalid UTF-32 string\0"
550   "setting UTF is disabled by the application\0"
551   "non-hex character in \\x{} (closing brace missing?)\0"
552   /* 80 */
553   "non-octal character in \\o{} (closing brace missing?)\0"
554   "missing opening brace after \\o\0"
555   "parentheses are too deeply nested\0"
556   "invalid range in character class\0"
557   "group name must start with a non-digit\0"
558   /* 85 */
559   "parentheses are too deeply nested (stack check)\0"
560   "digits missing in \\x{} or \\o{}\0"
561   ;
562 
563 /* Table to identify digits and hex digits. This is used when compiling
564 patterns. Note that the tables in chartables are dependent on the locale, and
565 may mark arbitrary characters as digits - but the PCRE compiling code expects
566 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
567 a private table here. It costs 256 bytes, but it is a lot faster than doing
568 character value tests (at least in some simple cases I timed), and in some
569 applications one wants PCRE to compile efficiently as well as match
570 efficiently.
571 
572 For convenience, we use the same bit definitions as in chartables:
573 
574   0x04   decimal digit
575   0x08   hexadecimal digit
576 
577 Then we can use ctype_digit and ctype_xdigit in the code. */
578 
/* Decimal digits are recognized with a pair of comparisons instead of a table
lookup: this is much faster than a memory read and the resulting code is
simpler (the compiler turns it into a subtraction and an unsigned comparison).
Note that the argument is evaluated twice. */

#define IS_DIGIT(x) (CHAR_0 <= (x) && (x) <= CHAR_9)
584 
585 #ifndef EBCDIC
586 
587 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
588 UTF-8 mode. */
589 
590 static const pcre_uint8 digitab[] =
591   {
592   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
593   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
594   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
595   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
596   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
597   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
598   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
599   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
600   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
601   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
602   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
603   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
604   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
605   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
606   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
607   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
608   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
609   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
610   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
611   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
612   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
613   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
614   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
615   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
616   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
617   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
618   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
619   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
620   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
621   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
622   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
623   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
624 
625 #else
626 
627 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
628 
629 static const pcre_uint8 digitab[] =
630   {
631   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
632   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
633   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
634   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
635   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
636   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
637   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
638   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
639   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
640   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
641   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
642   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
643   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
644   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
645   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
646   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
647   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
648   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
649   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
650   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
651   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
652   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
653   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
654   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
655   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
656   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
657   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
658   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
659   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
660   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
661   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
662   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
663 
664 static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
665   0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
666   0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
667   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
668   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
669   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
670   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
671   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
672   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
673   0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
674   0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
675   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
676   0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
677   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
678   0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
679   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
680   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
681   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
682   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
683   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
684   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
685   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
686   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
687   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
688   0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
689   0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
690   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
691   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
692   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
693   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
694   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
695   0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
696   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
697 #endif
698 
699 
700 /* This table is used to check whether auto-possessification is possible
701 between adjacent character-type opcodes. The left-hand (repeated) opcode is
702 used to select the row, and the right-hand opcode is use to select the column.
703 A value of 1 means that auto-possessification is OK. For example, the second
704 value in the first row means that \D+\d can be turned into \D++\d.
705 
706 The Unicode property types (\P and \p) have to be present to fill out the table
707 because of what their opcode values are, but the table values should always be
708 zero because property types are handled separately in the code. The last four
709 columns apply to items that cannot be repeated, so there is no need to have
710 rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
711 *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
712 
713 #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
714 #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
715 
716 static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
717 /* \D \d \S \s \W \w  . .+ \C \P \p \R \H \h \V \v \X \Z \z  $ $M */
718   { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \D */
719   { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \d */
720   { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \S */
721   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \s */
722   { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \W */
723   { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \w */
724   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .  */
725   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .+ */
726   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \C */
727   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \P */
728   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \p */
729   { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \R */
730   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \H */
731   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \h */
732   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \V */
733   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 },  /* \v */
734   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }   /* \X */
735 };
736 
737 
738 /* This table is used to check whether auto-possessification is possible
739 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
740 left-hand (repeated) opcode is used to select the row, and the right-hand
741 opcode is used to select the column. The values are as follows:
742 
743   0   Always return FALSE (never auto-possessify)
744   1   Character groups are distinct (possessify if both are OP_PROP)
745   2   Check character categories in the same group (general or particular)
746   3   TRUE if the two opcodes are not the same (PROP vs NOTPROP)
747 
748   4   Check left general category vs right particular category
749   5   Check right general category vs left particular category
750 
751   6   Left alphanum vs right general category
752   7   Left space vs right general category
753   8   Left word vs right general category
754 
755   9   Right alphanum vs left general category
756  10   Right space vs left general category
757  11   Right word vs left general category
758 
759  12   Left alphanum vs right particular category
760  13   Left space vs right particular category
761  14   Left word vs right particular category
762 
763  15   Right alphanum vs left particular category
764  16   Right space vs left particular category
765  17   Right word vs left particular category
766 */
767 
768 static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {
769 /* ANY LAMP GC  PC  SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
770   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_ANY */
771   { 0,  3,  0,  0,  0,    3,    1,      1,   0,    0,   0 },  /* PT_LAMP */
772   { 0,  0,  2,  4,  0,    9,   10,     10,  11,    0,   0 },  /* PT_GC */
773   { 0,  0,  5,  2,  0,   15,   16,     16,  17,    0,   0 },  /* PT_PC */
774   { 0,  0,  0,  0,  2,    0,    0,      0,   0,    0,   0 },  /* PT_SC */
775   { 0,  3,  6, 12,  0,    3,    1,      1,   0,    0,   0 },  /* PT_ALNUM */
776   { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_SPACE */
777   { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_PXSPACE */
778   { 0,  0,  8, 14,  0,    0,    1,      1,   3,    0,   0 },  /* PT_WORD */
779   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_CLIST */
780   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   3 }   /* PT_UCNC */
781 };
782 
783 /* This table is used to check whether auto-possessification is possible
784 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
785 specifies a general category and the other specifies a particular category. The
786 row is selected by the general category and the column by the particular
787 category. The value is 1 if the particular category is not part of the general
788 category. */
789 
790 static const pcre_uint8 catposstab[7][30] = {
791 /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
792   { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* C */
793   { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* L */
794   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* M */
795   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* N */
796   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },  /* P */
797   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 },  /* S */
798   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 }   /* Z */
799 };
800 
801 /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
802 a general or particular category. The properties in each row are those
803 that apply to the character set in question. Duplication means that a little
804 unnecessary work is done when checking, but this keeps things much simpler
805 because they can all use the same code. For more details see the comment where
806 this table is used.
807 
808 Note: SPACE and PXSPACE used to be different because Perl excluded VT from
809 "space", but from Perl 5.18 it's included, so both categories are treated the
810 same here. */
811 
812 static const pcre_uint8 posspropstab[3][4] = {
813   { ucp_L, ucp_N, ucp_N, ucp_Nl },  /* ALNUM, 3rd and 4th values redundant */
814   { ucp_Z, ucp_Z, ucp_C, ucp_Cc },  /* SPACE and PXSPACE, 2nd value redundant */
815   { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
816 };
817 
818 /* This table is used when converting repeating opcodes into possessified
819 versions as a result of an explicit possessive quantifier such as ++. A zero
820 value means there is no possessified version - in those cases the item in
821 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
822 because all relevant opcodes are less than that. */
823 
824 static const pcre_uint8 opcode_possessify[] = {
825   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
826   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
827 
828   0,                       /* NOTI */
829   OP_POSSTAR, 0,           /* STAR, MINSTAR */
830   OP_POSPLUS, 0,           /* PLUS, MINPLUS */
831   OP_POSQUERY, 0,          /* QUERY, MINQUERY */
832   OP_POSUPTO, 0,           /* UPTO, MINUPTO */
833   0,                       /* EXACT */
834   0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
835 
836   OP_POSSTARI, 0,          /* STARI, MINSTARI */
837   OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
838   OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
839   OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
840   0,                       /* EXACTI */
841   0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
842 
843   OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
844   OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
845   OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
846   OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
847   0,                       /* NOTEXACT */
848   0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
849 
850   OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
851   OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
852   OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
853   OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
854   0,                       /* NOTEXACTI */
855   0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
856 
857   OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
858   OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
859   OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
860   OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
861   0,                       /* TYPEEXACT */
862   0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
863 
864   OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
865   OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
866   OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
867   OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
868   0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
869 
870   0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
871   0, 0,                    /* REF, REFI */
872   0, 0,                    /* DNREF, DNREFI */
873   0, 0                     /* RECURSE, CALLOUT */
874 };
875 
876 
877 
878 /*************************************************
879 *            Find an error text                  *
880 *************************************************/
881 
882 /* The error texts are now all in one long string, to save on relocations. As
883 some of the text is of unknown length, we can't use a table of offsets.
884 Instead, just count through the strings. This is not a performance issue
885 because it happens only when there has been a compilation error.
886 
887 Argument:   the error number
888 Returns:    pointer to the error string
889 */
890 
891 static const char *
find_error_text(int n)892 find_error_text(int n)
893 {
894 const char *s = error_texts;
895 for (; n > 0; n--)
896   {
897   while (*s++ != CHAR_NULL) {};
898   if (*s == CHAR_NULL) return "Error text not found (please report)";
899   }
900 return s;
901 }
902 
903 
904 
905 /*************************************************
906 *           Expand the workspace                 *
907 *************************************************/
908 
909 /* This function is called during the second compiling phase, if the number of
910 forward references fills the existing workspace, which is originally a block on
911 the stack. A larger block is obtained from malloc() unless the ultimate limit
912 has been reached or the increase will be rather small.
913 
914 Argument: pointer to the compile data block
915 Returns:  0 if all went well, else an error number
916 */
917 
918 static int
expand_workspace(compile_data * cd)919 expand_workspace(compile_data *cd)
920 {
921 pcre_uchar *newspace;
922 int newsize = cd->workspace_size * 2;
923 
924 if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
925 if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
926     newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
927  return ERR72;
928 
929 newspace = (PUBL(malloc))(IN_UCHARS(newsize));
930 if (newspace == NULL) return ERR21;
931 memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
932 cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
933 if (cd->workspace_size > COMPILE_WORK_SIZE)
934   (PUBL(free))((void *)cd->start_workspace);
935 cd->start_workspace = newspace;
936 cd->workspace_size = newsize;
937 return 0;
938 }
939 
940 
941 
942 /*************************************************
943 *            Check for counted repeat            *
944 *************************************************/
945 
946 /* This function is called when a '{' is encountered in a place where it might
947 start a quantifier. It looks ahead to see if it really is a quantifier or not.
948 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
949 where the ddds are digits.
950 
951 Arguments:
952   p         pointer to the first char after '{'
953 
954 Returns:    TRUE or FALSE
955 */
956 
957 static BOOL
is_counted_repeat(const pcre_uchar * p)958 is_counted_repeat(const pcre_uchar *p)
959 {
960 if (!IS_DIGIT(*p)) return FALSE;
961 p++;
962 while (IS_DIGIT(*p)) p++;
963 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
964 
965 if (*p++ != CHAR_COMMA) return FALSE;
966 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
967 
968 if (!IS_DIGIT(*p)) return FALSE;
969 p++;
970 while (IS_DIGIT(*p)) p++;
971 
972 return (*p == CHAR_RIGHT_CURLY_BRACKET);
973 }
974 
975 
976 
977 /*************************************************
978 *            Handle escapes                      *
979 *************************************************/
980 
981 /* This function is called when a \ has been encountered. It either returns a
982 positive value for a simple escape such as \n, or 0 for a data character which
983 will be placed in chptr. A backreference to group n is returned as negative n.
984 When UTF-8 is enabled, a positive value greater than 255 may be returned in
985 chptr. On entry, ptr is pointing at the \. On exit, it is on the final
986 character of the escape sequence.
987 
988 Arguments:
989   ptrptr         points to the pattern position pointer
990   chptr          points to a returned data character
991   errorcodeptr   points to the errorcode variable
992   bracount       number of previous extracting brackets
993   options        the options bits
994   isclass        TRUE if inside a character class
995 
996 Returns:         zero => a data character
997                  positive => a special escape sequence
998                  negative => a back reference
999                  on error, errorcodeptr is set
1000 */
1001 
1002 static int
check_escape(const pcre_uchar ** ptrptr,pcre_uint32 * chptr,int * errorcodeptr,int bracount,int options,BOOL isclass)1003 check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
1004   int bracount, int options, BOOL isclass)
1005 {
1006 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
1007 BOOL utf = (options & PCRE_UTF8) != 0;
1008 const pcre_uchar *ptr = *ptrptr + 1;
1009 pcre_uint32 c;
1010 int escape = 0;
1011 int i;
1012 
1013 GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
1014 ptr--;                            /* Set pointer back to the last byte */
1015 
1016 /* If backslash is at the end of the pattern, it's an error. */
1017 
1018 if (c == CHAR_NULL) *errorcodeptr = ERR1;
1019 
1020 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
1021 in a table. A non-zero result is something that can be returned immediately.
1022 Otherwise further processing may be required. */
1023 
1024 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1025 /* Not alphanumeric */
1026 else if (c < CHAR_0 || c > CHAR_z) {}
1027 else if ((i = escapes[c - CHAR_0]) != 0)
1028   { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1029 
1030 #else           /* EBCDIC coding */
1031 /* Not alphanumeric */
1032 else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
1033 else if ((i = escapes[c - 0x48]) != 0)  { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1034 #endif
1035 
1036 /* Escapes that need further processing, or are illegal. */
1037 
1038 else
1039   {
1040   const pcre_uchar *oldptr;
1041   BOOL braced, negated, overflow;
1042   int s;
1043 
1044   switch (c)
1045     {
1046     /* A number of Perl escapes are not handled by PCRE. We give an explicit
1047     error. */
1048 
1049     case CHAR_l:
1050     case CHAR_L:
1051     *errorcodeptr = ERR37;
1052     break;
1053 
1054     case CHAR_u:
1055     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1056       {
1057       /* In JavaScript, \u must be followed by four hexadecimal numbers.
1058       Otherwise it is a lowercase u letter. */
1059       if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1060         && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
1061         && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
1062         && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
1063         {
1064         c = 0;
1065         for (i = 0; i < 4; ++i)
1066           {
1067           register pcre_uint32 cc = *(++ptr);
1068 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1069           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1070           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1071 #else           /* EBCDIC coding */
1072           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1073           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1074 #endif
1075           }
1076 
1077 #if defined COMPILE_PCRE8
1078         if (c > (utf ? 0x10ffffU : 0xffU))
1079 #elif defined COMPILE_PCRE16
1080         if (c > (utf ? 0x10ffffU : 0xffffU))
1081 #elif defined COMPILE_PCRE32
1082         if (utf && c > 0x10ffffU)
1083 #endif
1084           {
1085           *errorcodeptr = ERR76;
1086           }
1087         else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1088         }
1089       }
1090     else
1091       *errorcodeptr = ERR37;
1092     break;
1093 
1094     case CHAR_U:
1095     /* In JavaScript, \U is an uppercase U letter. */
1096     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
1097     break;
1098 
1099     /* In a character class, \g is just a literal "g". Outside a character
1100     class, \g must be followed by one of a number of specific things:
1101 
1102     (1) A number, either plain or braced. If positive, it is an absolute
1103     backreference. If negative, it is a relative backreference. This is a Perl
1104     5.10 feature.
1105 
1106     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1107     is part of Perl's movement towards a unified syntax for back references. As
1108     this is synonymous with \k{name}, we fudge it up by pretending it really
1109     was \k.
1110 
1111     (3) For Oniguruma compatibility we also support \g followed by a name or a
1112     number either in angle brackets or in single quotes. However, these are
1113     (possibly recursive) subroutine calls, _not_ backreferences. Just return
1114     the ESC_g code (cf \k). */
1115 
1116     case CHAR_g:
1117     if (isclass) break;
1118     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
1119       {
1120       escape = ESC_g;
1121       break;
1122       }
1123 
1124     /* Handle the Perl-compatible cases */
1125 
1126     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1127       {
1128       const pcre_uchar *p;
1129       for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
1130         if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
1131       if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
1132         {
1133         escape = ESC_k;
1134         break;
1135         }
1136       braced = TRUE;
1137       ptr++;
1138       }
1139     else braced = FALSE;
1140 
1141     if (ptr[1] == CHAR_MINUS)
1142       {
1143       negated = TRUE;
1144       ptr++;
1145       }
1146     else negated = FALSE;
1147 
1148     /* The integer range is limited by the machine's int representation. */
1149     s = 0;
1150     overflow = FALSE;
1151     while (IS_DIGIT(ptr[1]))
1152       {
1153       if (s > INT_MAX / 10 - 1) /* Integer overflow */
1154         {
1155         overflow = TRUE;
1156         break;
1157         }
1158       s = s * 10 + (int)(*(++ptr) - CHAR_0);
1159       }
1160     if (overflow) /* Integer overflow */
1161       {
1162       while (IS_DIGIT(ptr[1]))
1163         ptr++;
1164       *errorcodeptr = ERR61;
1165       break;
1166       }
1167 
1168     if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
1169       {
1170       *errorcodeptr = ERR57;
1171       break;
1172       }
1173 
1174     if (s == 0)
1175       {
1176       *errorcodeptr = ERR58;
1177       break;
1178       }
1179 
1180     if (negated)
1181       {
1182       if (s > bracount)
1183         {
1184         *errorcodeptr = ERR15;
1185         break;
1186         }
1187       s = bracount - (s - 1);
1188       }
1189 
1190     escape = -s;
1191     break;
1192 
1193     /* The handling of escape sequences consisting of a string of digits
1194     starting with one that is not zero is not straightforward. Perl has changed
1195     over the years. Nowadays \g{} for backreferences and \o{} for octal are
1196     recommended to avoid the ambiguities in the old syntax.
1197 
1198     Outside a character class, the digits are read as a decimal number. If the
1199     number is less than 8 (used to be 10), or if there are that many previous
1200     extracting left brackets, then it is a back reference. Otherwise, up to
1201     three octal digits are read to form an escaped byte. Thus \123 is likely to
1202     be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1203     the octal value is greater than 377, the least significant 8 bits are
1204     taken. \8 and \9 are treated as the literal characters 8 and 9.
1205 
1206     Inside a character class, \ followed by a digit is always either a literal
1207     8 or 9 or an octal number. */
1208 
1209     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1210     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1211 
1212     if (!isclass)
1213       {
1214       oldptr = ptr;
1215       /* The integer range is limited by the machine's int representation. */
1216       s = (int)(c -CHAR_0);
1217       overflow = FALSE;
1218       while (IS_DIGIT(ptr[1]))
1219         {
1220         if (s > INT_MAX / 10 - 1) /* Integer overflow */
1221           {
1222           overflow = TRUE;
1223           break;
1224           }
1225         s = s * 10 + (int)(*(++ptr) - CHAR_0);
1226         }
1227       if (overflow) /* Integer overflow */
1228         {
1229         while (IS_DIGIT(ptr[1]))
1230           ptr++;
1231         *errorcodeptr = ERR61;
1232         break;
1233         }
1234       if (s < 8 || s <= bracount)  /* Check for back reference */
1235         {
1236         escape = -s;
1237         break;
1238         }
1239       ptr = oldptr;      /* Put the pointer back and fall through */
1240       }
1241 
1242     /* Handle a digit following \ when the number is not a back reference. If
1243     the first digit is 8 or 9, Perl used to generate a binary zero byte and
1244     then treat the digit as a following literal. At least by Perl 5.18 this
1245     changed so as not to insert the binary zero. */
1246 
1247     if ((c = *ptr) >= CHAR_8) break;
1248 
1249     /* Fall through with a digit less than 8 */
1250 
1251     /* \0 always starts an octal number, but we may drop through to here with a
1252     larger first octal digit. The original code used just to take the least
1253     significant 8 bits of octal numbers (I think this is what early Perls used
1254     to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1255     but no more than 3 octal digits. */
1256 
1257     case CHAR_0:
1258     c -= CHAR_0;
1259     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1260         c = c * 8 + *(++ptr) - CHAR_0;
1261 #ifdef COMPILE_PCRE8
1262     if (!utf && c > 0xff) *errorcodeptr = ERR51;
1263 #endif
1264     break;
1265 
1266     /* \o is a relatively new Perl feature, supporting a more general way of
1267     specifying character codes in octal. The only supported form is \o{ddd}. */
1268 
1269     case CHAR_o:
1270     if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
1271     if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR86; else
1272       {
1273       ptr += 2;
1274       c = 0;
1275       overflow = FALSE;
1276       while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
1277         {
1278         register pcre_uint32 cc = *ptr++;
1279         if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1280 #ifdef COMPILE_PCRE32
1281         if (c >= 0x20000000l) { overflow = TRUE; break; }
1282 #endif
1283         c = (c << 3) + cc - CHAR_0 ;
1284 #if defined COMPILE_PCRE8
1285         if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1286 #elif defined COMPILE_PCRE16
1287         if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1288 #elif defined COMPILE_PCRE32
1289         if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1290 #endif
1291         }
1292       if (overflow)
1293         {
1294         while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1295         *errorcodeptr = ERR34;
1296         }
1297       else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1298         {
1299         if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1300         }
1301       else *errorcodeptr = ERR80;
1302       }
1303     break;
1304 
1305     /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
1306     numbers. Otherwise it is a lowercase x letter. */
1307 
1308     case CHAR_x:
1309     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1310       {
1311       if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1312         && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1313         {
1314         c = 0;
1315         for (i = 0; i < 2; ++i)
1316           {
1317           register pcre_uint32 cc = *(++ptr);
1318 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1319           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1320           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1321 #else           /* EBCDIC coding */
1322           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1323           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1324 #endif
1325           }
1326         }
1327       }    /* End JavaScript handling */
1328 
1329     /* Handle \x in Perl's style. \x{ddd} is a character number which can be
1330     greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
1331     digits. If not, { used to be treated as a data character. However, Perl
1332     seems to read hex digits up to the first non-such, and ignore the rest, so
1333     that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1334     now gives an error. */
1335 
1336     else
1337       {
1338       if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1339         {
1340         ptr += 2;
1341         if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1342           {
1343           *errorcodeptr = ERR86;
1344           break;
1345           }
1346         c = 0;
1347         overflow = FALSE;
1348         while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
1349           {
1350           register pcre_uint32 cc = *ptr++;
1351           if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1352 
1353 #ifdef COMPILE_PCRE32
1354           if (c >= 0x10000000l) { overflow = TRUE; break; }
1355 #endif
1356 
1357 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1358           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1359           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1360 #else           /* EBCDIC coding */
1361           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1362           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1363 #endif
1364 
1365 #if defined COMPILE_PCRE8
1366           if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1367 #elif defined COMPILE_PCRE16
1368           if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1369 #elif defined COMPILE_PCRE32
1370           if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1371 #endif
1372           }
1373 
1374         if (overflow)
1375           {
1376           while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
1377           *errorcodeptr = ERR34;
1378           }
1379 
1380         else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1381           {
1382           if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1383           }
1384 
1385         /* If the sequence of hex digits does not end with '}', give an error.
1386         We used just to recognize this construct and fall through to the normal
1387         \x handling, but nowadays Perl gives an error, which seems much more
1388         sensible, so we do too. */
1389 
1390         else *errorcodeptr = ERR79;
1391         }   /* End of \x{} processing */
1392 
1393       /* Read a single-byte hex-defined char (up to two hex digits after \x) */
1394 
1395       else
1396         {
1397         c = 0;
1398         while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1399           {
1400           pcre_uint32 cc;                          /* Some compilers don't like */
1401           cc = *(++ptr);                           /* ++ in initializers */
1402 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1403           if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
1404           c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1405 #else           /* EBCDIC coding */
1406           if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
1407           c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1408 #endif
1409           }
1410         }     /* End of \xdd handling */
1411       }       /* End of Perl-style \x handling */
1412     break;
1413 
1414     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1415     An error is given if the byte following \c is not an ASCII character. This
1416     coding is ASCII-specific, but then the whole concept of \cx is
1417     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1418 
1419     case CHAR_c:
1420     c = *(++ptr);
1421     if (c == CHAR_NULL)
1422       {
1423       *errorcodeptr = ERR2;
1424       break;
1425       }
1426 #ifndef EBCDIC    /* ASCII/UTF-8 coding */
1427     if (c > 127)  /* Excludes all non-ASCII in either mode */
1428       {
1429       *errorcodeptr = ERR68;
1430       break;
1431       }
1432     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1433     c ^= 0x40;
1434 #else             /* EBCDIC coding */
1435     if (c >= CHAR_a && c <= CHAR_z) c += 64;
1436     if (c == CHAR_QUESTION_MARK)
1437       c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
1438     else
1439       {
1440       for (i = 0; i < 32; i++)
1441         {
1442         if (c == ebcdic_escape_c[i]) break;
1443         }
1444       if (i < 32) c = i; else *errorcodeptr = ERR68;
1445       }
1446 #endif
1447     break;
1448 
1449     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1450     other alphanumeric following \ is an error if PCRE_EXTRA was set;
1451     otherwise, for Perl compatibility, it is a literal. This code looks a bit
1452     odd, but there used to be some cases other than the default, and there may
1453     be again in future, so I haven't "optimized" it. */
1454 
1455     default:
1456     if ((options & PCRE_EXTRA) != 0) switch(c)
1457       {
1458       default:
1459       *errorcodeptr = ERR3;
1460       break;
1461       }
1462     break;
1463     }
1464   }
1465 
1466 /* Perl supports \N{name} for character names, as well as plain \N for "not
1467 newline". PCRE does not support \N{name}. However, it does support
1468 quantification such as \N{2,3}. */
1469 
1470 if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1471      !is_counted_repeat(ptr+2))
1472   *errorcodeptr = ERR37;
1473 
1474 /* If PCRE_UCP is set, we change the values for \d etc. */
1475 
1476 if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
1477   escape += (ESC_DU - ESC_D);
1478 
1479 /* Set the pointer to the final character before returning. */
1480 
1481 *ptrptr = ptr;
1482 *chptr = c;
1483 return escape;
1484 }
1485 
1486 
1487 
1488 #ifdef SUPPORT_UCP
1489 /*************************************************
1490 *               Handle \P and \p                 *
1491 *************************************************/
1492 
1493 /* This function is called after \P or \p has been encountered, provided that
1494 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1495 pointing at the P or p. On exit, it is pointing at the final character of the
1496 escape sequence.
1497 
1498 Argument:
1499   ptrptr         points to the pattern position pointer
1500   negptr         points to a boolean that is set TRUE for negation else FALSE
1501   ptypeptr       points to an unsigned int that is set to the type value
1502   pdataptr       points to an unsigned int that is set to the detailed property value
1503   errorcodeptr   points to the error code variable
1504 
1505 Returns:         TRUE if the type value was found, or FALSE for an invalid type
1506 */
1507 
1508 static BOOL
get_ucp(const pcre_uchar ** ptrptr,BOOL * negptr,unsigned int * ptypeptr,unsigned int * pdataptr,int * errorcodeptr)1509 get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
1510   unsigned int *pdataptr, int *errorcodeptr)
1511 {
1512 pcre_uchar c;
1513 int i, bot, top;
1514 const pcre_uchar *ptr = *ptrptr;
1515 pcre_uchar name[32];
1516 
1517 c = *(++ptr);
1518 if (c == CHAR_NULL) goto ERROR_RETURN;
1519 
1520 *negptr = FALSE;
1521 
1522 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1523 negation. */
1524 
1525 if (c == CHAR_LEFT_CURLY_BRACKET)
1526   {
1527   if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1528     {
1529     *negptr = TRUE;
1530     ptr++;
1531     }
1532   for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1533     {
1534     c = *(++ptr);
1535     if (c == CHAR_NULL) goto ERROR_RETURN;
1536     if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1537     name[i] = c;
1538     }
1539   if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1540   name[i] = 0;
1541   }
1542 
1543 /* Otherwise there is just one following character */
1544 
1545 else
1546   {
1547   name[0] = c;
1548   name[1] = 0;
1549   }
1550 
1551 *ptrptr = ptr;
1552 
1553 /* Search for a recognized property name using binary chop */
1554 
1555 bot = 0;
1556 top = PRIV(utt_size);
1557 
1558 while (bot < top)
1559   {
1560   int r;
1561   i = (bot + top) >> 1;
1562   r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1563   if (r == 0)
1564     {
1565     *ptypeptr = PRIV(utt)[i].type;
1566     *pdataptr = PRIV(utt)[i].value;
1567     return TRUE;
1568     }
1569   if (r > 0) bot = i + 1; else top = i;
1570   }
1571 
1572 *errorcodeptr = ERR47;
1573 *ptrptr = ptr;
1574 return FALSE;
1575 
1576 ERROR_RETURN:
1577 *errorcodeptr = ERR46;
1578 *ptrptr = ptr;
1579 return FALSE;
1580 }
1581 #endif
1582 
1583 
1584 
1585 /*************************************************
1586 *         Read repeat counts                     *
1587 *************************************************/
1588 
1589 /* Read an item of the form {n,m} and return the values. This is called only
1590 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1591 so the syntax is guaranteed to be correct, but we need to check the values.
1592 
1593 Arguments:
1594   p              pointer to first char after '{'
1595   minp           pointer to int for min
1596   maxp           pointer to int for max
1597                  returned as -1 if no max
1598   errorcodeptr   points to error code variable
1599 
1600 Returns:         pointer to '}' on success;
1601                  current ptr on error, with errorcodeptr set non-zero
1602 */
1603 
1604 static const pcre_uchar *
read_repeat_counts(const pcre_uchar * p,int * minp,int * maxp,int * errorcodeptr)1605 read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1606 {
1607 int min = 0;
1608 int max = -1;
1609 
1610 while (IS_DIGIT(*p))
1611   {
1612   min = min * 10 + (int)(*p++ - CHAR_0);
1613   if (min > 65535)
1614     {
1615     *errorcodeptr = ERR5;
1616     return p;
1617     }
1618   }
1619 
1620 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1621   {
1622   if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1623     {
1624     max = 0;
1625     while(IS_DIGIT(*p))
1626       {
1627       max = max * 10 + (int)(*p++ - CHAR_0);
1628       if (max > 65535)
1629         {
1630         *errorcodeptr = ERR5;
1631         return p;
1632         }
1633       }
1634     if (max < min)
1635       {
1636       *errorcodeptr = ERR4;
1637       return p;
1638       }
1639     }
1640   }
1641 
1642 *minp = min;
1643 *maxp = max;
1644 return p;
1645 }
1646 
1647 
1648 
1649 /*************************************************
1650 *      Find first significant op code            *
1651 *************************************************/
1652 
1653 /* This is called by several functions that scan a compiled expression looking
1654 for a fixed first character, or an anchoring op code etc. It skips over things
1655 that do not influence this. For some calls, it makes sense to skip negative
1656 forward and all backward assertions, and also the \b assertion; for others it
1657 does not.
1658 
1659 Arguments:
1660   code         pointer to the start of the group
1661   skipassert   TRUE if certain assertions are to be skipped
1662 
1663 Returns:       pointer to the first significant opcode
1664 */
1665 
1666 static const pcre_uchar*
first_significant_code(const pcre_uchar * code,BOOL skipassert)1667 first_significant_code(const pcre_uchar *code, BOOL skipassert)
1668 {
1669 for (;;)
1670   {
1671   switch ((int)*code)
1672     {
1673     case OP_ASSERT_NOT:
1674     case OP_ASSERTBACK:
1675     case OP_ASSERTBACK_NOT:
1676     if (!skipassert) return code;
1677     do code += GET(code, 1); while (*code == OP_ALT);
1678     code += PRIV(OP_lengths)[*code];
1679     break;
1680 
1681     case OP_WORD_BOUNDARY:
1682     case OP_NOT_WORD_BOUNDARY:
1683     if (!skipassert) return code;
1684     /* Fall through */
1685 
1686     case OP_CALLOUT:
1687     case OP_CREF:
1688     case OP_DNCREF:
1689     case OP_RREF:
1690     case OP_DNRREF:
1691     case OP_DEF:
1692     code += PRIV(OP_lengths)[*code];
1693     break;
1694 
1695     default:
1696     return code;
1697     }
1698   }
1699 /* Control never reaches here */
1700 }
1701 
1702 
1703 
1704 /*************************************************
1705 *        Find the fixed length of a branch       *
1706 *************************************************/
1707 
1708 /* Scan a branch and compute the fixed length of subject that will match it,
1709 if the length is fixed. This is needed for dealing with backward assertions.
1710 In UTF8 mode, the result is in characters rather than bytes. The branch is
1711 temporarily terminated with OP_END when this function is called.
1712 
1713 This function is called when a backward assertion is encountered, so that if it
1714 fails, the error message can point to the correct place in the pattern.
1715 However, we cannot do this when the assertion contains subroutine calls,
1716 because they can be forward references. We solve this by remembering this case
1717 and doing the check at the end; a flag specifies which mode we are running in.
1718 
1719 Arguments:
1720   code     points to the start of the pattern (the bracket)
1721   utf      TRUE in UTF-8 / UTF-16 / UTF-32 mode
1722   atend    TRUE if called when the pattern is complete
1723   cd       the "compile data" structure
1724   recurses    chain of recurse_check to catch mutual recursion
1725 
1726 Returns:   the fixed length,
1727              or -1 if there is no fixed length,
1728              or -2 if \C was encountered (in UTF-8 mode only)
1729              or -3 if an OP_RECURSE item was encountered and atend is FALSE
1730              or -4 if an unknown opcode was encountered (internal error)
1731 */
1732 
1733 static int
find_fixedlength(pcre_uchar * code,BOOL utf,BOOL atend,compile_data * cd,recurse_check * recurses)1734 find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd,
1735   recurse_check *recurses)
1736 {
1737 int length = -1;
1738 recurse_check this_recurse;
1739 register int branchlength = 0;
1740 register pcre_uchar *cc = code + 1 + LINK_SIZE;
1741 
1742 /* Scan along the opcodes for this branch. If we get to the end of the
1743 branch, check the length against that of the other branches. */
1744 
1745 for (;;)
1746   {
1747   int d;
1748   pcre_uchar *ce, *cs;
1749   register pcre_uchar op = *cc;
1750 
1751   switch (op)
1752     {
1753     /* We only need to continue for OP_CBRA (normal capturing bracket) and
1754     OP_BRA (normal non-capturing bracket) because the other variants of these
1755     opcodes are all concerned with unlimited repeated groups, which of course
1756     are not of fixed length. */
1757 
1758     case OP_CBRA:
1759     case OP_BRA:
1760     case OP_ONCE:
1761     case OP_ONCE_NC:
1762     case OP_COND:
1763     d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd,
1764       recurses);
1765     if (d < 0) return d;
1766     branchlength += d;
1767     do cc += GET(cc, 1); while (*cc == OP_ALT);
1768     cc += 1 + LINK_SIZE;
1769     break;
1770 
1771     /* Reached end of a branch; if it's a ket it is the end of a nested call.
1772     If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1773     an ALT. If it is END it's the end of the outer call. All can be handled by
1774     the same code. Note that we must not include the OP_KETRxxx opcodes here,
1775     because they all imply an unlimited repeat. */
1776 
1777     case OP_ALT:
1778     case OP_KET:
1779     case OP_END:
1780     case OP_ACCEPT:
1781     case OP_ASSERT_ACCEPT:
1782     if (length < 0) length = branchlength;
1783       else if (length != branchlength) return -1;
1784     if (*cc != OP_ALT) return length;
1785     cc += 1 + LINK_SIZE;
1786     branchlength = 0;
1787     break;
1788 
1789     /* A true recursion implies not fixed length, but a subroutine call may
1790     be OK. If the subroutine is a forward reference, we can't deal with
1791     it until the end of the pattern, so return -3. */
1792 
1793     case OP_RECURSE:
1794     if (!atend) return -3;
1795     cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1796     do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
1797     if (cc > cs && cc < ce) return -1;                    /* Recursion */
1798     else   /* Check for mutual recursion */
1799       {
1800       recurse_check *r = recurses;
1801       for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
1802       if (r != NULL) return -1;   /* Mutual recursion */
1803       }
1804     this_recurse.prev = recurses;
1805     this_recurse.group = cs;
1806     d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd, &this_recurse);
1807     if (d < 0) return d;
1808     branchlength += d;
1809     cc += 1 + LINK_SIZE;
1810     break;
1811 
1812     /* Skip over assertive subpatterns */
1813 
1814     case OP_ASSERT:
1815     case OP_ASSERT_NOT:
1816     case OP_ASSERTBACK:
1817     case OP_ASSERTBACK_NOT:
1818     do cc += GET(cc, 1); while (*cc == OP_ALT);
1819     cc += 1 + LINK_SIZE;
1820     break;
1821 
1822     /* Skip over things that don't match chars */
1823 
1824     case OP_MARK:
1825     case OP_PRUNE_ARG:
1826     case OP_SKIP_ARG:
1827     case OP_THEN_ARG:
1828     cc += cc[1] + PRIV(OP_lengths)[*cc];
1829     break;
1830 
1831     case OP_CALLOUT:
1832     case OP_CIRC:
1833     case OP_CIRCM:
1834     case OP_CLOSE:
1835     case OP_COMMIT:
1836     case OP_CREF:
1837     case OP_DEF:
1838     case OP_DNCREF:
1839     case OP_DNRREF:
1840     case OP_DOLL:
1841     case OP_DOLLM:
1842     case OP_EOD:
1843     case OP_EODN:
1844     case OP_FAIL:
1845     case OP_NOT_WORD_BOUNDARY:
1846     case OP_PRUNE:
1847     case OP_REVERSE:
1848     case OP_RREF:
1849     case OP_SET_SOM:
1850     case OP_SKIP:
1851     case OP_SOD:
1852     case OP_SOM:
1853     case OP_THEN:
1854     case OP_WORD_BOUNDARY:
1855     cc += PRIV(OP_lengths)[*cc];
1856     break;
1857 
1858     /* Handle literal characters */
1859 
1860     case OP_CHAR:
1861     case OP_CHARI:
1862     case OP_NOT:
1863     case OP_NOTI:
1864     branchlength++;
1865     cc += 2;
1866 #ifdef SUPPORT_UTF
1867     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1868 #endif
1869     break;
1870 
1871     /* Handle exact repetitions. The count is already in characters, but we
1872     need to skip over a multibyte character in UTF8 mode.  */
1873 
1874     case OP_EXACT:
1875     case OP_EXACTI:
1876     case OP_NOTEXACT:
1877     case OP_NOTEXACTI:
1878     branchlength += (int)GET2(cc,1);
1879     cc += 2 + IMM2_SIZE;
1880 #ifdef SUPPORT_UTF
1881     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1882 #endif
1883     break;
1884 
1885     case OP_TYPEEXACT:
1886     branchlength += GET2(cc,1);
1887     if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
1888       cc += 2;
1889     cc += 1 + IMM2_SIZE + 1;
1890     break;
1891 
1892     /* Handle single-char matchers */
1893 
1894     case OP_PROP:
1895     case OP_NOTPROP:
1896     cc += 2;
1897     /* Fall through */
1898 
1899     case OP_HSPACE:
1900     case OP_VSPACE:
1901     case OP_NOT_HSPACE:
1902     case OP_NOT_VSPACE:
1903     case OP_NOT_DIGIT:
1904     case OP_DIGIT:
1905     case OP_NOT_WHITESPACE:
1906     case OP_WHITESPACE:
1907     case OP_NOT_WORDCHAR:
1908     case OP_WORDCHAR:
1909     case OP_ANY:
1910     case OP_ALLANY:
1911     branchlength++;
1912     cc++;
1913     break;
1914 
1915     /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1916     otherwise \C is coded as OP_ALLANY. */
1917 
1918     case OP_ANYBYTE:
1919     return -2;
1920 
1921     /* Check a class for variable quantification */
1922 
1923     case OP_CLASS:
1924     case OP_NCLASS:
1925 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1926     case OP_XCLASS:
1927     /* The original code caused an unsigned overflow in 64 bit systems,
1928     so now we use a conditional statement. */
1929     if (op == OP_XCLASS)
1930       cc += GET(cc, 1);
1931     else
1932       cc += PRIV(OP_lengths)[OP_CLASS];
1933 #else
1934     cc += PRIV(OP_lengths)[OP_CLASS];
1935 #endif
1936 
1937     switch (*cc)
1938       {
1939       case OP_CRSTAR:
1940       case OP_CRMINSTAR:
1941       case OP_CRPLUS:
1942       case OP_CRMINPLUS:
1943       case OP_CRQUERY:
1944       case OP_CRMINQUERY:
1945       case OP_CRPOSSTAR:
1946       case OP_CRPOSPLUS:
1947       case OP_CRPOSQUERY:
1948       return -1;
1949 
1950       case OP_CRRANGE:
1951       case OP_CRMINRANGE:
1952       case OP_CRPOSRANGE:
1953       if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1954       branchlength += (int)GET2(cc,1);
1955       cc += 1 + 2 * IMM2_SIZE;
1956       break;
1957 
1958       default:
1959       branchlength++;
1960       }
1961     break;
1962 
1963     /* Anything else is variable length */
1964 
1965     case OP_ANYNL:
1966     case OP_BRAMINZERO:
1967     case OP_BRAPOS:
1968     case OP_BRAPOSZERO:
1969     case OP_BRAZERO:
1970     case OP_CBRAPOS:
1971     case OP_EXTUNI:
1972     case OP_KETRMAX:
1973     case OP_KETRMIN:
1974     case OP_KETRPOS:
1975     case OP_MINPLUS:
1976     case OP_MINPLUSI:
1977     case OP_MINQUERY:
1978     case OP_MINQUERYI:
1979     case OP_MINSTAR:
1980     case OP_MINSTARI:
1981     case OP_MINUPTO:
1982     case OP_MINUPTOI:
1983     case OP_NOTMINPLUS:
1984     case OP_NOTMINPLUSI:
1985     case OP_NOTMINQUERY:
1986     case OP_NOTMINQUERYI:
1987     case OP_NOTMINSTAR:
1988     case OP_NOTMINSTARI:
1989     case OP_NOTMINUPTO:
1990     case OP_NOTMINUPTOI:
1991     case OP_NOTPLUS:
1992     case OP_NOTPLUSI:
1993     case OP_NOTPOSPLUS:
1994     case OP_NOTPOSPLUSI:
1995     case OP_NOTPOSQUERY:
1996     case OP_NOTPOSQUERYI:
1997     case OP_NOTPOSSTAR:
1998     case OP_NOTPOSSTARI:
1999     case OP_NOTPOSUPTO:
2000     case OP_NOTPOSUPTOI:
2001     case OP_NOTQUERY:
2002     case OP_NOTQUERYI:
2003     case OP_NOTSTAR:
2004     case OP_NOTSTARI:
2005     case OP_NOTUPTO:
2006     case OP_NOTUPTOI:
2007     case OP_PLUS:
2008     case OP_PLUSI:
2009     case OP_POSPLUS:
2010     case OP_POSPLUSI:
2011     case OP_POSQUERY:
2012     case OP_POSQUERYI:
2013     case OP_POSSTAR:
2014     case OP_POSSTARI:
2015     case OP_POSUPTO:
2016     case OP_POSUPTOI:
2017     case OP_QUERY:
2018     case OP_QUERYI:
2019     case OP_REF:
2020     case OP_REFI:
2021     case OP_DNREF:
2022     case OP_DNREFI:
2023     case OP_SBRA:
2024     case OP_SBRAPOS:
2025     case OP_SCBRA:
2026     case OP_SCBRAPOS:
2027     case OP_SCOND:
2028     case OP_SKIPZERO:
2029     case OP_STAR:
2030     case OP_STARI:
2031     case OP_TYPEMINPLUS:
2032     case OP_TYPEMINQUERY:
2033     case OP_TYPEMINSTAR:
2034     case OP_TYPEMINUPTO:
2035     case OP_TYPEPLUS:
2036     case OP_TYPEPOSPLUS:
2037     case OP_TYPEPOSQUERY:
2038     case OP_TYPEPOSSTAR:
2039     case OP_TYPEPOSUPTO:
2040     case OP_TYPEQUERY:
2041     case OP_TYPESTAR:
2042     case OP_TYPEUPTO:
2043     case OP_UPTO:
2044     case OP_UPTOI:
2045     return -1;
2046 
2047     /* Catch unrecognized opcodes so that when new ones are added they
2048     are not forgotten, as has happened in the past. */
2049 
2050     default:
2051     return -4;
2052     }
2053   }
2054 /* Control never gets here */
2055 }
2056 
2057 
2058 
2059 /*************************************************
2060 *    Scan compiled regex for specific bracket    *
2061 *************************************************/
2062 
2063 /* This little function scans through a compiled pattern until it finds a
2064 capturing bracket with the given number, or, if the number is negative, an
2065 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
2066 so that it can be called from pcre_study() when finding the minimum matching
2067 length.
2068 
2069 Arguments:
2070   code        points to start of expression
2071   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
2072   number      the required bracket number or negative to find a lookbehind
2073 
2074 Returns:      pointer to the opcode for the bracket, or NULL if not found
2075 */
2076 
2077 const pcre_uchar *
PRIV(find_bracket)2078 PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2079 {
2080 for (;;)
2081   {
2082   register pcre_uchar c = *code;
2083 
2084   if (c == OP_END) return NULL;
2085 
2086   /* XCLASS is used for classes that cannot be represented just by a bit
2087   map. This includes negated single high-valued characters. The length in
2088   the table is zero; the actual length is stored in the compiled code. */
2089 
2090   if (c == OP_XCLASS) code += GET(code, 1);
2091 
2092   /* Handle recursion */
2093 
2094   else if (c == OP_REVERSE)
2095     {
2096     if (number < 0) return (pcre_uchar *)code;
2097     code += PRIV(OP_lengths)[c];
2098     }
2099 
2100   /* Handle capturing bracket */
2101 
2102   else if (c == OP_CBRA || c == OP_SCBRA ||
2103            c == OP_CBRAPOS || c == OP_SCBRAPOS)
2104     {
2105     int n = (int)GET2(code, 1+LINK_SIZE);
2106     if (n == number) return (pcre_uchar *)code;
2107     code += PRIV(OP_lengths)[c];
2108     }
2109 
2110   /* Otherwise, we can get the item's length from the table, except that for
2111   repeated character types, we have to test for \p and \P, which have an extra
2112   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2113   must add in its length. */
2114 
2115   else
2116     {
2117     switch(c)
2118       {
2119       case OP_TYPESTAR:
2120       case OP_TYPEMINSTAR:
2121       case OP_TYPEPLUS:
2122       case OP_TYPEMINPLUS:
2123       case OP_TYPEQUERY:
2124       case OP_TYPEMINQUERY:
2125       case OP_TYPEPOSSTAR:
2126       case OP_TYPEPOSPLUS:
2127       case OP_TYPEPOSQUERY:
2128       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2129       break;
2130 
2131       case OP_TYPEUPTO:
2132       case OP_TYPEMINUPTO:
2133       case OP_TYPEEXACT:
2134       case OP_TYPEPOSUPTO:
2135       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2136         code += 2;
2137       break;
2138 
2139       case OP_MARK:
2140       case OP_PRUNE_ARG:
2141       case OP_SKIP_ARG:
2142       case OP_THEN_ARG:
2143       code += code[1];
2144       break;
2145       }
2146 
2147     /* Add in the fixed length from the table */
2148 
2149     code += PRIV(OP_lengths)[c];
2150 
2151   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2152   a multi-byte character. The length in the table is a minimum, so we have to
2153   arrange to skip the extra bytes. */
2154 
2155 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2156     if (utf) switch(c)
2157       {
2158       case OP_CHAR:
2159       case OP_CHARI:
2160       case OP_NOT:
2161       case OP_NOTI:
2162       case OP_EXACT:
2163       case OP_EXACTI:
2164       case OP_NOTEXACT:
2165       case OP_NOTEXACTI:
2166       case OP_UPTO:
2167       case OP_UPTOI:
2168       case OP_NOTUPTO:
2169       case OP_NOTUPTOI:
2170       case OP_MINUPTO:
2171       case OP_MINUPTOI:
2172       case OP_NOTMINUPTO:
2173       case OP_NOTMINUPTOI:
2174       case OP_POSUPTO:
2175       case OP_POSUPTOI:
2176       case OP_NOTPOSUPTO:
2177       case OP_NOTPOSUPTOI:
2178       case OP_STAR:
2179       case OP_STARI:
2180       case OP_NOTSTAR:
2181       case OP_NOTSTARI:
2182       case OP_MINSTAR:
2183       case OP_MINSTARI:
2184       case OP_NOTMINSTAR:
2185       case OP_NOTMINSTARI:
2186       case OP_POSSTAR:
2187       case OP_POSSTARI:
2188       case OP_NOTPOSSTAR:
2189       case OP_NOTPOSSTARI:
2190       case OP_PLUS:
2191       case OP_PLUSI:
2192       case OP_NOTPLUS:
2193       case OP_NOTPLUSI:
2194       case OP_MINPLUS:
2195       case OP_MINPLUSI:
2196       case OP_NOTMINPLUS:
2197       case OP_NOTMINPLUSI:
2198       case OP_POSPLUS:
2199       case OP_POSPLUSI:
2200       case OP_NOTPOSPLUS:
2201       case OP_NOTPOSPLUSI:
2202       case OP_QUERY:
2203       case OP_QUERYI:
2204       case OP_NOTQUERY:
2205       case OP_NOTQUERYI:
2206       case OP_MINQUERY:
2207       case OP_MINQUERYI:
2208       case OP_NOTMINQUERY:
2209       case OP_NOTMINQUERYI:
2210       case OP_POSQUERY:
2211       case OP_POSQUERYI:
2212       case OP_NOTPOSQUERY:
2213       case OP_NOTPOSQUERYI:
2214       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2215       break;
2216       }
2217 #else
2218     (void)(utf);  /* Keep compiler happy by referencing function argument */
2219 #endif
2220     }
2221   }
2222 }
2223 
2224 
2225 
2226 /*************************************************
2227 *   Scan compiled regex for recursion reference  *
2228 *************************************************/
2229 
2230 /* This little function scans through a compiled pattern until it finds an
2231 instance of OP_RECURSE.
2232 
2233 Arguments:
2234   code        points to start of expression
2235   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
2236 
2237 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
2238 */
2239 
2240 static const pcre_uchar *
find_recurse(const pcre_uchar * code,BOOL utf)2241 find_recurse(const pcre_uchar *code, BOOL utf)
2242 {
2243 for (;;)
2244   {
2245   register pcre_uchar c = *code;
2246   if (c == OP_END) return NULL;
2247   if (c == OP_RECURSE) return code;
2248 
2249   /* XCLASS is used for classes that cannot be represented just by a bit
2250   map. This includes negated single high-valued characters. The length in
2251   the table is zero; the actual length is stored in the compiled code. */
2252 
2253   if (c == OP_XCLASS) code += GET(code, 1);
2254 
2255   /* Otherwise, we can get the item's length from the table, except that for
2256   repeated character types, we have to test for \p and \P, which have an extra
2257   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2258   must add in its length. */
2259 
2260   else
2261     {
2262     switch(c)
2263       {
2264       case OP_TYPESTAR:
2265       case OP_TYPEMINSTAR:
2266       case OP_TYPEPLUS:
2267       case OP_TYPEMINPLUS:
2268       case OP_TYPEQUERY:
2269       case OP_TYPEMINQUERY:
2270       case OP_TYPEPOSSTAR:
2271       case OP_TYPEPOSPLUS:
2272       case OP_TYPEPOSQUERY:
2273       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2274       break;
2275 
2276       case OP_TYPEPOSUPTO:
2277       case OP_TYPEUPTO:
2278       case OP_TYPEMINUPTO:
2279       case OP_TYPEEXACT:
2280       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2281         code += 2;
2282       break;
2283 
2284       case OP_MARK:
2285       case OP_PRUNE_ARG:
2286       case OP_SKIP_ARG:
2287       case OP_THEN_ARG:
2288       code += code[1];
2289       break;
2290       }
2291 
2292     /* Add in the fixed length from the table */
2293 
2294     code += PRIV(OP_lengths)[c];
2295 
2296     /* In UTF-8 mode, opcodes that are followed by a character may be followed
2297     by a multi-byte character. The length in the table is a minimum, so we have
2298     to arrange to skip the extra bytes. */
2299 
2300 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2301     if (utf) switch(c)
2302       {
2303       case OP_CHAR:
2304       case OP_CHARI:
2305       case OP_NOT:
2306       case OP_NOTI:
2307       case OP_EXACT:
2308       case OP_EXACTI:
2309       case OP_NOTEXACT:
2310       case OP_NOTEXACTI:
2311       case OP_UPTO:
2312       case OP_UPTOI:
2313       case OP_NOTUPTO:
2314       case OP_NOTUPTOI:
2315       case OP_MINUPTO:
2316       case OP_MINUPTOI:
2317       case OP_NOTMINUPTO:
2318       case OP_NOTMINUPTOI:
2319       case OP_POSUPTO:
2320       case OP_POSUPTOI:
2321       case OP_NOTPOSUPTO:
2322       case OP_NOTPOSUPTOI:
2323       case OP_STAR:
2324       case OP_STARI:
2325       case OP_NOTSTAR:
2326       case OP_NOTSTARI:
2327       case OP_MINSTAR:
2328       case OP_MINSTARI:
2329       case OP_NOTMINSTAR:
2330       case OP_NOTMINSTARI:
2331       case OP_POSSTAR:
2332       case OP_POSSTARI:
2333       case OP_NOTPOSSTAR:
2334       case OP_NOTPOSSTARI:
2335       case OP_PLUS:
2336       case OP_PLUSI:
2337       case OP_NOTPLUS:
2338       case OP_NOTPLUSI:
2339       case OP_MINPLUS:
2340       case OP_MINPLUSI:
2341       case OP_NOTMINPLUS:
2342       case OP_NOTMINPLUSI:
2343       case OP_POSPLUS:
2344       case OP_POSPLUSI:
2345       case OP_NOTPOSPLUS:
2346       case OP_NOTPOSPLUSI:
2347       case OP_QUERY:
2348       case OP_QUERYI:
2349       case OP_NOTQUERY:
2350       case OP_NOTQUERYI:
2351       case OP_MINQUERY:
2352       case OP_MINQUERYI:
2353       case OP_NOTMINQUERY:
2354       case OP_NOTMINQUERYI:
2355       case OP_POSQUERY:
2356       case OP_POSQUERYI:
2357       case OP_NOTPOSQUERY:
2358       case OP_NOTPOSQUERYI:
2359       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2360       break;
2361       }
2362 #else
2363     (void)(utf);  /* Keep compiler happy by referencing function argument */
2364 #endif
2365     }
2366   }
2367 }
2368 
2369 
2370 
2371 /*************************************************
2372 *    Scan compiled branch for non-emptiness      *
2373 *************************************************/
2374 
2375 /* This function scans through a branch of a compiled pattern to see whether it
2376 can match the empty string or not. It is called from could_be_empty()
2377 below and from compile_branch() when checking for an unlimited repeat of a
2378 group that can match nothing. Note that first_significant_code() skips over
2379 backward and negative forward assertions when its final argument is TRUE. If we
2380 hit an unclosed bracket, we return "empty" - this means we've struck an inner
2381 bracket whose current branch will already have been scanned.
2382 
2383 Arguments:
2384   code        points to start of search
2385   endcode     points to where to stop
2386   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2387   cd          contains pointers to tables etc.
2388   recurses    chain of recurse_check to catch mutual recursion
2389 
2390 Returns:      TRUE if what is matched could be empty
2391 */
2392 
2393 static BOOL
could_be_empty_branch(const pcre_uchar * code,const pcre_uchar * endcode,BOOL utf,compile_data * cd,recurse_check * recurses)2394 could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2395   BOOL utf, compile_data *cd, recurse_check *recurses)
2396 {
2397 register pcre_uchar c;
2398 recurse_check this_recurse;
2399 
2400 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2401      code < endcode;
2402      code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2403   {
2404   const pcre_uchar *ccode;
2405 
2406   c = *code;
2407 
2408   /* Skip over forward assertions; the other assertions are skipped by
2409   first_significant_code() with a TRUE final argument. */
2410 
2411   if (c == OP_ASSERT)
2412     {
2413     do code += GET(code, 1); while (*code == OP_ALT);
2414     c = *code;
2415     continue;
2416     }
2417 
2418   /* For a recursion/subroutine call, if its end has been reached, which
2419   implies a backward reference subroutine call, we can scan it. If it's a
2420   forward reference subroutine call, we can't. To detect forward reference
2421   we have to scan up the list that is kept in the workspace. This function is
2422   called only when doing the real compile, not during the pre-compile that
2423   measures the size of the compiled pattern. */
2424 
2425   if (c == OP_RECURSE)
2426     {
2427     const pcre_uchar *scode = cd->start_code + GET(code, 1);
2428     const pcre_uchar *endgroup = scode;
2429     BOOL empty_branch;
2430 
2431     /* Test for forward reference or uncompleted reference. This is disabled
2432     when called to scan a completed pattern by setting cd->start_workspace to
2433     NULL. */
2434 
2435     if (cd->start_workspace != NULL)
2436       {
2437       const pcre_uchar *tcode;
2438       for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
2439         if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2440       if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2441       }
2442 
2443     /* If the reference is to a completed group, we need to detect whether this
2444     is a recursive call, as otherwise there will be an infinite loop. If it is
2445     a recursion, just skip over it. Simple recursions are easily detected. For
2446     mutual recursions we keep a chain on the stack. */
2447 
2448     do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
2449     if (code >= scode && code <= endgroup) continue;  /* Simple recursion */
2450     else
2451       {
2452       recurse_check *r = recurses;
2453       for (r = recurses; r != NULL; r = r->prev)
2454         if (r->group == scode) break;
2455       if (r != NULL) continue;   /* Mutual recursion */
2456       }
2457 
2458     /* Completed reference; scan the referenced group, remembering it on the
2459     stack chain to detect mutual recursions. */
2460 
2461     empty_branch = FALSE;
2462     this_recurse.prev = recurses;
2463     this_recurse.group = scode;
2464 
2465     do
2466       {
2467       if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
2468         {
2469         empty_branch = TRUE;
2470         break;
2471         }
2472       scode += GET(scode, 1);
2473       }
2474     while (*scode == OP_ALT);
2475 
2476     if (!empty_branch) return FALSE;  /* All branches are non-empty */
2477     continue;
2478     }
2479 
2480   /* Groups with zero repeats can of course be empty; skip them. */
2481 
2482   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2483       c == OP_BRAPOSZERO)
2484     {
2485     code += PRIV(OP_lengths)[c];
2486     do code += GET(code, 1); while (*code == OP_ALT);
2487     c = *code;
2488     continue;
2489     }
2490 
2491   /* A nested group that is already marked as "could be empty" can just be
2492   skipped. */
2493 
2494   if (c == OP_SBRA  || c == OP_SBRAPOS ||
2495       c == OP_SCBRA || c == OP_SCBRAPOS)
2496     {
2497     do code += GET(code, 1); while (*code == OP_ALT);
2498     c = *code;
2499     continue;
2500     }
2501 
2502   /* For other groups, scan the branches. */
2503 
2504   if (c == OP_BRA  || c == OP_BRAPOS ||
2505       c == OP_CBRA || c == OP_CBRAPOS ||
2506       c == OP_ONCE || c == OP_ONCE_NC ||
2507       c == OP_COND || c == OP_SCOND)
2508     {
2509     BOOL empty_branch;
2510     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
2511 
2512     /* If a conditional group has only one branch, there is a second, implied,
2513     empty branch, so just skip over the conditional, because it could be empty.
2514     Otherwise, scan the individual branches of the group. */
2515 
2516     if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2517       code += GET(code, 1);
2518     else
2519       {
2520       empty_branch = FALSE;
2521       do
2522         {
2523         if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd,
2524           recurses)) empty_branch = TRUE;
2525         code += GET(code, 1);
2526         }
2527       while (*code == OP_ALT);
2528       if (!empty_branch) return FALSE;   /* All branches are non-empty */
2529       }
2530 
2531     c = *code;
2532     continue;
2533     }
2534 
2535   /* Handle the other opcodes */
2536 
2537   switch (c)
2538     {
2539     /* Check for quantifiers after a class. XCLASS is used for classes that
2540     cannot be represented just by a bit map. This includes negated single
2541     high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2542     actual length is stored in the compiled code, so we must update "code"
2543     here. */
2544 
2545 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2546     case OP_XCLASS:
2547     ccode = code += GET(code, 1);
2548     goto CHECK_CLASS_REPEAT;
2549 #endif
2550 
2551     case OP_CLASS:
2552     case OP_NCLASS:
2553     ccode = code + PRIV(OP_lengths)[OP_CLASS];
2554 
2555 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2556     CHECK_CLASS_REPEAT:
2557 #endif
2558 
2559     switch (*ccode)
2560       {
2561       case OP_CRSTAR:            /* These could be empty; continue */
2562       case OP_CRMINSTAR:
2563       case OP_CRQUERY:
2564       case OP_CRMINQUERY:
2565       case OP_CRPOSSTAR:
2566       case OP_CRPOSQUERY:
2567       break;
2568 
2569       default:                   /* Non-repeat => class must match */
2570       case OP_CRPLUS:            /* These repeats aren't empty */
2571       case OP_CRMINPLUS:
2572       case OP_CRPOSPLUS:
2573       return FALSE;
2574 
2575       case OP_CRRANGE:
2576       case OP_CRMINRANGE:
2577       case OP_CRPOSRANGE:
2578       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
2579       break;
2580       }
2581     break;
2582 
2583     /* Opcodes that must match a character */
2584 
2585     case OP_ANY:
2586     case OP_ALLANY:
2587     case OP_ANYBYTE:
2588 
2589     case OP_PROP:
2590     case OP_NOTPROP:
2591     case OP_ANYNL:
2592 
2593     case OP_NOT_HSPACE:
2594     case OP_HSPACE:
2595     case OP_NOT_VSPACE:
2596     case OP_VSPACE:
2597     case OP_EXTUNI:
2598 
2599     case OP_NOT_DIGIT:
2600     case OP_DIGIT:
2601     case OP_NOT_WHITESPACE:
2602     case OP_WHITESPACE:
2603     case OP_NOT_WORDCHAR:
2604     case OP_WORDCHAR:
2605 
2606     case OP_CHAR:
2607     case OP_CHARI:
2608     case OP_NOT:
2609     case OP_NOTI:
2610 
2611     case OP_PLUS:
2612     case OP_PLUSI:
2613     case OP_MINPLUS:
2614     case OP_MINPLUSI:
2615 
2616     case OP_NOTPLUS:
2617     case OP_NOTPLUSI:
2618     case OP_NOTMINPLUS:
2619     case OP_NOTMINPLUSI:
2620 
2621     case OP_POSPLUS:
2622     case OP_POSPLUSI:
2623     case OP_NOTPOSPLUS:
2624     case OP_NOTPOSPLUSI:
2625 
2626     case OP_EXACT:
2627     case OP_EXACTI:
2628     case OP_NOTEXACT:
2629     case OP_NOTEXACTI:
2630 
2631     case OP_TYPEPLUS:
2632     case OP_TYPEMINPLUS:
2633     case OP_TYPEPOSPLUS:
2634     case OP_TYPEEXACT:
2635 
2636     return FALSE;
2637 
2638     /* These are going to continue, as they may be empty, but we have to
2639     fudge the length for the \p and \P cases. */
2640 
2641     case OP_TYPESTAR:
2642     case OP_TYPEMINSTAR:
2643     case OP_TYPEPOSSTAR:
2644     case OP_TYPEQUERY:
2645     case OP_TYPEMINQUERY:
2646     case OP_TYPEPOSQUERY:
2647     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2648     break;
2649 
2650     /* Same for these */
2651 
2652     case OP_TYPEUPTO:
2653     case OP_TYPEMINUPTO:
2654     case OP_TYPEPOSUPTO:
2655     if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2656       code += 2;
2657     break;
2658 
2659     /* End of branch */
2660 
2661     case OP_KET:
2662     case OP_KETRMAX:
2663     case OP_KETRMIN:
2664     case OP_KETRPOS:
2665     case OP_ALT:
2666     return TRUE;
2667 
2668     /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2669     MINUPTO, and POSUPTO and their caseless and negative versions may be
2670     followed by a multibyte character. */
2671 
2672 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2673     case OP_STAR:
2674     case OP_STARI:
2675     case OP_NOTSTAR:
2676     case OP_NOTSTARI:
2677 
2678     case OP_MINSTAR:
2679     case OP_MINSTARI:
2680     case OP_NOTMINSTAR:
2681     case OP_NOTMINSTARI:
2682 
2683     case OP_POSSTAR:
2684     case OP_POSSTARI:
2685     case OP_NOTPOSSTAR:
2686     case OP_NOTPOSSTARI:
2687 
2688     case OP_QUERY:
2689     case OP_QUERYI:
2690     case OP_NOTQUERY:
2691     case OP_NOTQUERYI:
2692 
2693     case OP_MINQUERY:
2694     case OP_MINQUERYI:
2695     case OP_NOTMINQUERY:
2696     case OP_NOTMINQUERYI:
2697 
2698     case OP_POSQUERY:
2699     case OP_POSQUERYI:
2700     case OP_NOTPOSQUERY:
2701     case OP_NOTPOSQUERYI:
2702 
2703     if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2704     break;
2705 
2706     case OP_UPTO:
2707     case OP_UPTOI:
2708     case OP_NOTUPTO:
2709     case OP_NOTUPTOI:
2710 
2711     case OP_MINUPTO:
2712     case OP_MINUPTOI:
2713     case OP_NOTMINUPTO:
2714     case OP_NOTMINUPTOI:
2715 
2716     case OP_POSUPTO:
2717     case OP_POSUPTOI:
2718     case OP_NOTPOSUPTO:
2719     case OP_NOTPOSUPTOI:
2720 
2721     if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2722     break;
2723 #endif
2724 
2725     /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2726     string. */
2727 
2728     case OP_MARK:
2729     case OP_PRUNE_ARG:
2730     case OP_SKIP_ARG:
2731     case OP_THEN_ARG:
2732     code += code[1];
2733     break;
2734 
2735     /* None of the remaining opcodes are required to match a character. */
2736 
2737     default:
2738     break;
2739     }
2740   }
2741 
2742 return TRUE;
2743 }
2744 
2745 
2746 
2747 /*************************************************
2748 *    Scan compiled regex for non-emptiness       *
2749 *************************************************/
2750 
2751 /* This function is called to check for left recursive calls. We want to check
2752 the current branch of the current pattern to see if it could match the empty
2753 string. If it could, we must look outwards for branches at other levels,
2754 stopping when we pass beyond the bracket which is the subject of the recursion.
2755 This function is called only during the real compile, not during the
2756 pre-compile.
2757 
2758 Arguments:
2759   code        points to start of the recursion
2760   endcode     points to where to stop (current RECURSE item)
2761   bcptr       points to the chain of current (unclosed) branch starts
2762   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2763   cd          pointers to tables etc
2764 
2765 Returns:      TRUE if what is matched could be empty
2766 */
2767 
2768 static BOOL
could_be_empty(const pcre_uchar * code,const pcre_uchar * endcode,branch_chain * bcptr,BOOL utf,compile_data * cd)2769 could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2770   branch_chain *bcptr, BOOL utf, compile_data *cd)
2771 {
2772 while (bcptr != NULL && bcptr->current_branch >= code)
2773   {
2774   if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd, NULL))
2775     return FALSE;
2776   bcptr = bcptr->outer;
2777   }
2778 return TRUE;
2779 }
2780 
2781 
2782 
2783 /*************************************************
2784 *        Base opcode of repeated opcodes         *
2785 *************************************************/
2786 
2787 /* Returns the base opcode for repeated single character type opcodes. If the
2788 opcode is not a repeated character type, it returns with the original value.
2789 
2790 Arguments:  c opcode
2791 Returns:    base opcode for the type
2792 */
2793 
2794 static pcre_uchar
get_repeat_base(pcre_uchar c)2795 get_repeat_base(pcre_uchar c)
2796 {
2797 return (c > OP_TYPEPOSUPTO)? c :
2798        (c >= OP_TYPESTAR)?   OP_TYPESTAR :
2799        (c >= OP_NOTSTARI)?   OP_NOTSTARI :
2800        (c >= OP_NOTSTAR)?    OP_NOTSTAR :
2801        (c >= OP_STARI)?      OP_STARI :
2802                              OP_STAR;
2803 }
2804 
2805 
2806 
#ifdef SUPPORT_UCP
/*************************************************
*        Check a character and a property        *
*************************************************/

/* Called by check_auto_possessive() when a property item is adjacent to a
fixed character. Each case works out whether the character has the property
and compares that with the "negated" flag.

Arguments:
  c            the character
  ptype        the property type
  pdata        the data for the type
  negated      TRUE if it's a negated property (\P or \p{^)

Returns:       TRUE if auto-possessifying is OK
*/

static BOOL
check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
  BOOL negated)
{
const ucd_record *rec = GET_UCD(c);
const pcre_uint32 *set;
BOOL has_prop;

switch(ptype)
  {
  case PT_LAMP:
  has_prop = rec->chartype == ucp_Lu ||
             rec->chartype == ucp_Ll ||
             rec->chartype == ucp_Lt;
  break;

  case PT_GC:
  has_prop = pdata == PRIV(ucp_gentype)[rec->chartype];
  break;

  case PT_PC:
  has_prop = pdata == rec->chartype;
  break;

  case PT_SC:
  has_prop = pdata == rec->script;
  break;

  /* The remaining types are PCRE specials */

  case PT_ALNUM:
  has_prop = PRIV(ucp_gentype)[rec->chartype] == ucp_L ||
             PRIV(ucp_gentype)[rec->chartype] == ucp_N;
  break;

  /* Since Perl 5.18 Perl space includes VT, which makes it identical to
  POSIX space; PCRE followed at release 8.34. */

  case PT_SPACE:    /* Perl space */
  case PT_PXSPACE:  /* POSIX space */
  switch(c)
    {
    HSPACE_CASES:
    VSPACE_CASES:
    return negated;

    default:
    has_prop = PRIV(ucp_gentype)[rec->chartype] == ucp_Z;
    break;
    }
  break;

  case PT_WORD:
  has_prop = PRIV(ucp_gentype)[rec->chartype] == ucp_L ||
             PRIV(ucp_gentype)[rec->chartype] == ucp_N ||
             c == CHAR_UNDERSCORE;
  break;

  /* Step through the caseless set until an entry that is greater than or
  equal to c is found; the loop is left only by one of the two returns. */

  case PT_CLIST:
  for (set = PRIV(ucd_caseless_sets) + rec->caseset; ; set++)
    {
    if (c < *set) return !negated;
    if (c == *set) return negated;
    }

  default:
  return FALSE;
  }

return has_prop == negated;
}
#endif  /* SUPPORT_UCP */
2888 
2889 
2890 
2891 /*************************************************
2892 *        Fill the character property list        *
2893 *************************************************/
2894 
2895 /* Checks whether the code points to an opcode that can take part in auto-
2896 possessification, and if so, fills a list with its properties.
2897 
2898 Arguments:
2899   code        points to start of expression
2900   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2901   fcc         points to case-flipping table
2902   list        points to output list
2903               list[0] will be filled with the opcode
2904               list[1] will be non-zero if this opcode
2905                 can match an empty character string
2906               list[2..7] depends on the opcode
2907 
2908 Returns:      points to the start of the next opcode if *code is accepted
2909               NULL if *code is not accepted
2910 */
2911 
2912 static const pcre_uchar *
get_chr_property_list(const pcre_uchar * code,BOOL utf,const pcre_uint8 * fcc,pcre_uint32 * list)2913 get_chr_property_list(const pcre_uchar *code, BOOL utf,
2914   const pcre_uint8 *fcc, pcre_uint32 *list)
2915 {
2916 pcre_uchar c = *code;
2917 pcre_uchar base;
2918 const pcre_uchar *end;
2919 pcre_uint32 chr;
2920 
2921 #ifdef SUPPORT_UCP
2922 pcre_uint32 *clist_dest;
2923 const pcre_uint32 *clist_src;
2924 #else
2925 utf = utf;  /* Suppress "unused parameter" compiler warning */
2926 #endif
2927 
2928 list[0] = c;
2929 list[1] = FALSE;
2930 code++;
2931 
2932 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
2933   {
2934   base = get_repeat_base(c);
2935   c -= (base - OP_STAR);
2936 
2937   if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
2938     code += IMM2_SIZE;
2939 
2940   list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
2941 
2942   switch(base)
2943     {
2944     case OP_STAR:
2945     list[0] = OP_CHAR;
2946     break;
2947 
2948     case OP_STARI:
2949     list[0] = OP_CHARI;
2950     break;
2951 
2952     case OP_NOTSTAR:
2953     list[0] = OP_NOT;
2954     break;
2955 
2956     case OP_NOTSTARI:
2957     list[0] = OP_NOTI;
2958     break;
2959 
2960     case OP_TYPESTAR:
2961     list[0] = *code;
2962     code++;
2963     break;
2964     }
2965   c = list[0];
2966   }
2967 
2968 switch(c)
2969   {
2970   case OP_NOT_DIGIT:
2971   case OP_DIGIT:
2972   case OP_NOT_WHITESPACE:
2973   case OP_WHITESPACE:
2974   case OP_NOT_WORDCHAR:
2975   case OP_WORDCHAR:
2976   case OP_ANY:
2977   case OP_ALLANY:
2978   case OP_ANYNL:
2979   case OP_NOT_HSPACE:
2980   case OP_HSPACE:
2981   case OP_NOT_VSPACE:
2982   case OP_VSPACE:
2983   case OP_EXTUNI:
2984   case OP_EODN:
2985   case OP_EOD:
2986   case OP_DOLL:
2987   case OP_DOLLM:
2988   return code;
2989 
2990   case OP_CHAR:
2991   case OP_NOT:
2992   GETCHARINCTEST(chr, code);
2993   list[2] = chr;
2994   list[3] = NOTACHAR;
2995   return code;
2996 
2997   case OP_CHARI:
2998   case OP_NOTI:
2999   list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
3000   GETCHARINCTEST(chr, code);
3001   list[2] = chr;
3002 
3003 #ifdef SUPPORT_UCP
3004   if (chr < 128 || (chr < 256 && !utf))
3005     list[3] = fcc[chr];
3006   else
3007     list[3] = UCD_OTHERCASE(chr);
3008 #elif defined SUPPORT_UTF || !defined COMPILE_PCRE8
3009   list[3] = (chr < 256) ? fcc[chr] : chr;
3010 #else
3011   list[3] = fcc[chr];
3012 #endif
3013 
3014   /* The othercase might be the same value. */
3015 
3016   if (chr == list[3])
3017     list[3] = NOTACHAR;
3018   else
3019     list[4] = NOTACHAR;
3020   return code;
3021 
3022 #ifdef SUPPORT_UCP
3023   case OP_PROP:
3024   case OP_NOTPROP:
3025   if (code[0] != PT_CLIST)
3026     {
3027     list[2] = code[0];
3028     list[3] = code[1];
3029     return code + 2;
3030     }
3031 
3032   /* Convert only if we have enough space. */
3033 
3034   clist_src = PRIV(ucd_caseless_sets) + code[1];
3035   clist_dest = list + 2;
3036   code += 2;
3037 
3038   do {
3039      if (clist_dest >= list + 8)
3040        {
3041        /* Early return if there is not enough space. This should never
3042        happen, since all clists are shorter than 5 character now. */
3043        list[2] = code[0];
3044        list[3] = code[1];
3045        return code;
3046        }
3047      *clist_dest++ = *clist_src;
3048      }
3049   while(*clist_src++ != NOTACHAR);
3050 
3051   /* All characters are stored. The terminating NOTACHAR
3052   is copied form the clist itself. */
3053 
3054   list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
3055   return code;
3056 #endif
3057 
3058   case OP_NCLASS:
3059   case OP_CLASS:
3060 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3061   case OP_XCLASS:
3062   if (c == OP_XCLASS)
3063     end = code + GET(code, 0) - 1;
3064   else
3065 #endif
3066     end = code + 32 / sizeof(pcre_uchar);
3067 
3068   switch(*end)
3069     {
3070     case OP_CRSTAR:
3071     case OP_CRMINSTAR:
3072     case OP_CRQUERY:
3073     case OP_CRMINQUERY:
3074     case OP_CRPOSSTAR:
3075     case OP_CRPOSQUERY:
3076     list[1] = TRUE;
3077     end++;
3078     break;
3079 
3080     case OP_CRPLUS:
3081     case OP_CRMINPLUS:
3082     case OP_CRPOSPLUS:
3083     end++;
3084     break;
3085 
3086     case OP_CRRANGE:
3087     case OP_CRMINRANGE:
3088     case OP_CRPOSRANGE:
3089     list[1] = (GET2(end, 1) == 0);
3090     end += 1 + 2 * IMM2_SIZE;
3091     break;
3092     }
3093   list[2] = (pcre_uint32)(end - code);
3094   return end;
3095   }
3096 return NULL;    /* Opcode not accepted */
3097 }
3098 
3099 
3100 
3101 /*************************************************
3102 *    Scan further character sets for match       *
3103 *************************************************/
3104 
3105 /* Checks whether the base and the current opcode have a common character, in
3106 which case the base cannot be possessified.
3107 
3108 Arguments:
3109   code        points to the byte code
3110   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
3111   cd          static compile data
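  base_list   the data list of the base opcode
  base_end    pointer from which base_list[2] is subtracted to locate the
              class bitmap of the base item
  rec_limit   points to the remaining recursion budget; it is decremented on
              each call, and FALSE is returned once it has reached zero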
3113 
3114 Returns:      TRUE if the auto-possessification is possible
3115 */
3116 
3117 static BOOL
compare_opcodes(const pcre_uchar * code,BOOL utf,const compile_data * cd,const pcre_uint32 * base_list,const pcre_uchar * base_end,int * rec_limit)3118 compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
3119   const pcre_uint32 *base_list, const pcre_uchar *base_end, int *rec_limit)
3120 {
3121 pcre_uchar c;
3122 pcre_uint32 list[8];
3123 const pcre_uint32 *chr_ptr;
3124 const pcre_uint32 *ochr_ptr;
3125 const pcre_uint32 *list_ptr;
3126 const pcre_uchar *next_code;
3127 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3128 const pcre_uchar *xclass_flags;
3129 #endif
3130 const pcre_uint8 *class_bitset;
3131 const pcre_uint8 *set1, *set2, *set_end;
3132 pcre_uint32 chr;
3133 BOOL accepted, invert_bits;
3134 BOOL entered_a_group = FALSE;
3135 
3136 if (*rec_limit == 0) return FALSE;
3137 --(*rec_limit);
3138 
3139 /* Note: the base_list[1] contains whether the current opcode has greedy
3140 (represented by a non-zero value) quantifier. This is a different from
3141 other character type lists, which stores here that the character iterator
3142 matches to an empty string (also represented by a non-zero value). */
3143 
3144 for(;;)
3145   {
3146   /* All operations move the code pointer forward.
3147   Therefore infinite recursions are not possible. */
3148 
3149   c = *code;
3150 
3151   /* Skip over callouts */
3152 
3153   if (c == OP_CALLOUT)
3154     {
3155     code += PRIV(OP_lengths)[c];
3156     continue;
3157     }
3158 
3159   if (c == OP_ALT)
3160     {
3161     do code += GET(code, 1); while (*code == OP_ALT);
3162     c = *code;
3163     }
3164 
3165   switch(c)
3166     {
3167     case OP_END:
3168     case OP_KETRPOS:
3169     /* TRUE only in greedy case. The non-greedy case could be replaced by
3170     an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
3171     uses more memory, which we cannot get at this stage.) */
3172 
3173     return base_list[1] != 0;
3174 
3175     case OP_KET:
3176     /* If the bracket is capturing, and referenced by an OP_RECURSE, or
3177     it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
3178     cannot be converted to a possessive form. */
3179 
3180     if (base_list[1] == 0) return FALSE;
3181 
3182     switch(*(code - GET(code, 1)))
3183       {
3184       case OP_ASSERT:
3185       case OP_ASSERT_NOT:
3186       case OP_ASSERTBACK:
3187       case OP_ASSERTBACK_NOT:
3188       case OP_ONCE:
3189       case OP_ONCE_NC:
3190       /* Atomic sub-patterns and assertions can always auto-possessify their
3191       last iterator. However, if the group was entered as a result of checking
3192       a previous iterator, this is not possible. */
3193 
3194       return !entered_a_group;
3195       }
3196 
3197     code += PRIV(OP_lengths)[c];
3198     continue;
3199 
3200     case OP_ONCE:
3201     case OP_ONCE_NC:
3202     case OP_BRA:
3203     case OP_CBRA:
3204     next_code = code + GET(code, 1);
3205     code += PRIV(OP_lengths)[c];
3206 
3207     while (*next_code == OP_ALT)
3208       {
3209       if (!compare_opcodes(code, utf, cd, base_list, base_end, rec_limit))
3210         return FALSE;
3211       code = next_code + 1 + LINK_SIZE;
3212       next_code += GET(next_code, 1);
3213       }
3214 
3215     entered_a_group = TRUE;
3216     continue;
3217 
3218     case OP_BRAZERO:
3219     case OP_BRAMINZERO:
3220 
3221     next_code = code + 1;
3222     if (*next_code != OP_BRA && *next_code != OP_CBRA
3223         && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
3224 
3225     do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3226 
3227     /* The bracket content will be checked by the
3228     OP_BRA/OP_CBRA case above. */
3229     next_code += 1 + LINK_SIZE;
3230     if (!compare_opcodes(next_code, utf, cd, base_list, base_end, rec_limit))
3231       return FALSE;
3232 
3233     code += PRIV(OP_lengths)[c];
3234     continue;
3235 
3236     default:
3237     break;
3238     }
3239 
3240   /* Check for a supported opcode, and load its properties. */
3241 
3242   code = get_chr_property_list(code, utf, cd->fcc, list);
3243   if (code == NULL) return FALSE;    /* Unsupported */
3244 
3245   /* If either opcode is a small character list, set pointers for comparing
3246   characters from that list with another list, or with a property. */
3247 
3248   if (base_list[0] == OP_CHAR)
3249     {
3250     chr_ptr = base_list + 2;
3251     list_ptr = list;
3252     }
3253   else if (list[0] == OP_CHAR)
3254     {
3255     chr_ptr = list + 2;
3256     list_ptr = base_list;
3257     }
3258 
3259   /* Character bitsets can also be compared to certain opcodes. */
3260 
3261   else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
3262 #ifdef COMPILE_PCRE8
3263       /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
3264       || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
3265 #endif
3266       )
3267     {
3268 #ifdef COMPILE_PCRE8
3269     if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
3270 #else
3271     if (base_list[0] == OP_CLASS)
3272 #endif
3273       {
3274       set1 = (pcre_uint8 *)(base_end - base_list[2]);
3275       list_ptr = list;
3276       }
3277     else
3278       {
3279       set1 = (pcre_uint8 *)(code - list[2]);
3280       list_ptr = base_list;
3281       }
3282 
3283     invert_bits = FALSE;
3284     switch(list_ptr[0])
3285       {
3286       case OP_CLASS:
3287       case OP_NCLASS:
3288       set2 = (pcre_uint8 *)
3289         ((list_ptr == list ? code : base_end) - list_ptr[2]);
3290       break;
3291 
3292 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3293       case OP_XCLASS:
3294       xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE;
3295       if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;
3296       if ((*xclass_flags & XCL_MAP) == 0)
3297         {
3298         /* No bits are set for characters < 256. */
3299         if (list[1] == 0) return TRUE;
3300         /* Might be an empty repeat. */
3301         continue;
3302         }
3303       set2 = (pcre_uint8 *)(xclass_flags + 1);
3304       break;
3305 #endif
3306 
3307       case OP_NOT_DIGIT:
3308       invert_bits = TRUE;
3309       /* Fall through */
3310       case OP_DIGIT:
3311       set2 = (pcre_uint8 *)(cd->cbits + cbit_digit);
3312       break;
3313 
3314       case OP_NOT_WHITESPACE:
3315       invert_bits = TRUE;
3316       /* Fall through */
3317       case OP_WHITESPACE:
3318       set2 = (pcre_uint8 *)(cd->cbits + cbit_space);
3319       break;
3320 
3321       case OP_NOT_WORDCHAR:
3322       invert_bits = TRUE;
3323       /* Fall through */
3324       case OP_WORDCHAR:
3325       set2 = (pcre_uint8 *)(cd->cbits + cbit_word);
3326       break;
3327 
3328       default:
3329       return FALSE;
3330       }
3331 
3332     /* Because the sets are unaligned, we need
3333     to perform byte comparison here. */
3334     set_end = set1 + 32;
3335     if (invert_bits)
3336       {
3337       do
3338         {
3339         if ((*set1++ & ~(*set2++)) != 0) return FALSE;
3340         }
3341       while (set1 < set_end);
3342       }
3343     else
3344       {
3345       do
3346         {
3347         if ((*set1++ & *set2++) != 0) return FALSE;
3348         }
3349       while (set1 < set_end);
3350       }
3351 
3352     if (list[1] == 0) return TRUE;
3353     /* Might be an empty repeat. */
3354     continue;
3355     }
3356 
3357   /* Some property combinations also acceptable. Unicode property opcodes are
3358   processed specially; the rest can be handled with a lookup table. */
3359 
3360   else
3361     {
3362     pcre_uint32 leftop, rightop;
3363 
3364     leftop = base_list[0];
3365     rightop = list[0];
3366 
3367 #ifdef SUPPORT_UCP
3368     accepted = FALSE; /* Always set in non-unicode case. */
3369     if (leftop == OP_PROP || leftop == OP_NOTPROP)
3370       {
3371       if (rightop == OP_EOD)
3372         accepted = TRUE;
3373       else if (rightop == OP_PROP || rightop == OP_NOTPROP)
3374         {
3375         int n;
3376         const pcre_uint8 *p;
3377         BOOL same = leftop == rightop;
3378         BOOL lisprop = leftop == OP_PROP;
3379         BOOL risprop = rightop == OP_PROP;
3380         BOOL bothprop = lisprop && risprop;
3381 
3382         /* There's a table that specifies how each combination is to be
3383         processed:
3384           0   Always return FALSE (never auto-possessify)
3385           1   Character groups are distinct (possessify if both are OP_PROP)
3386           2   Check character categories in the same group (general or particular)
3387           3   Return TRUE if the two opcodes are not the same
3388           ... see comments below
3389         */
3390 
3391         n = propposstab[base_list[2]][list[2]];
3392         switch(n)
3393           {
3394           case 0: break;
3395           case 1: accepted = bothprop; break;
3396           case 2: accepted = (base_list[3] == list[3]) != same; break;
3397           case 3: accepted = !same; break;
3398 
3399           case 4:  /* Left general category, right particular category */
3400           accepted = risprop && catposstab[base_list[3]][list[3]] == same;
3401           break;
3402 
3403           case 5:  /* Right general category, left particular category */
3404           accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
3405           break;
3406 
3407           /* This code is logically tricky. Think hard before fiddling with it.
3408           The posspropstab table has four entries per row. Each row relates to
3409           one of PCRE's special properties such as ALNUM or SPACE or WORD.
3410           Only WORD actually needs all four entries, but using repeats for the
3411           others means they can all use the same code below.
3412 
3413           The first two entries in each row are Unicode general categories, and
3414           apply always, because all the characters they include are part of the
3415           PCRE character set. The third and fourth entries are a general and a
3416           particular category, respectively, that include one or more relevant
3417           characters. One or the other is used, depending on whether the check
3418           is for a general or a particular category. However, in both cases the
3419           category contains more characters than the specials that are defined
3420           for the property being tested against. Therefore, it cannot be used
3421           in a NOTPROP case.
3422 
3423           Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3424           Underscore is covered by ucp_P or ucp_Po. */
3425 
3426           case 6:  /* Left alphanum vs right general category */
3427           case 7:  /* Left space vs right general category */
3428           case 8:  /* Left word vs right general category */
3429           p = posspropstab[n-6];
3430           accepted = risprop && lisprop ==
3431             (list[3] != p[0] &&
3432              list[3] != p[1] &&
3433             (list[3] != p[2] || !lisprop));
3434           break;
3435 
3436           case 9:   /* Right alphanum vs left general category */
3437           case 10:  /* Right space vs left general category */
3438           case 11:  /* Right word vs left general category */
3439           p = posspropstab[n-9];
3440           accepted = lisprop && risprop ==
3441             (base_list[3] != p[0] &&
3442              base_list[3] != p[1] &&
3443             (base_list[3] != p[2] || !risprop));
3444           break;
3445 
3446           case 12:  /* Left alphanum vs right particular category */
3447           case 13:  /* Left space vs right particular category */
3448           case 14:  /* Left word vs right particular category */
3449           p = posspropstab[n-12];
3450           accepted = risprop && lisprop ==
3451             (catposstab[p[0]][list[3]] &&
3452              catposstab[p[1]][list[3]] &&
3453             (list[3] != p[3] || !lisprop));
3454           break;
3455 
3456           case 15:  /* Right alphanum vs left particular category */
3457           case 16:  /* Right space vs left particular category */
3458           case 17:  /* Right word vs left particular category */
3459           p = posspropstab[n-15];
3460           accepted = lisprop && risprop ==
3461             (catposstab[p[0]][base_list[3]] &&
3462              catposstab[p[1]][base_list[3]] &&
3463             (base_list[3] != p[3] || !risprop));
3464           break;
3465           }
3466         }
3467       }
3468 
3469     else
3470 #endif  /* SUPPORT_UCP */
3471 
3472     accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3473            rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3474            autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3475 
3476     if (!accepted) return FALSE;
3477 
3478     if (list[1] == 0) return TRUE;
3479     /* Might be an empty repeat. */
3480     continue;
3481     }
3482 
3483   /* Control reaches here only if one of the items is a small character list.
3484   All characters are checked against the other side. */
3485 
3486   do
3487     {
3488     chr = *chr_ptr;
3489 
3490     switch(list_ptr[0])
3491       {
3492       case OP_CHAR:
3493       ochr_ptr = list_ptr + 2;
3494       do
3495         {
3496         if (chr == *ochr_ptr) return FALSE;
3497         ochr_ptr++;
3498         }
3499       while(*ochr_ptr != NOTACHAR);
3500       break;
3501 
3502       case OP_NOT:
3503       ochr_ptr = list_ptr + 2;
3504       do
3505         {
3506         if (chr == *ochr_ptr)
3507           break;
3508         ochr_ptr++;
3509         }
3510       while(*ochr_ptr != NOTACHAR);
3511       if (*ochr_ptr == NOTACHAR) return FALSE;   /* Not found */
3512       break;
3513 
3514       /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not*
3515       set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3516 
3517       case OP_DIGIT:
3518       if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE;
3519       break;
3520 
3521       case OP_NOT_DIGIT:
3522       if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE;
3523       break;
3524 
3525       case OP_WHITESPACE:
3526       if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE;
3527       break;
3528 
3529       case OP_NOT_WHITESPACE:
3530       if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE;
3531       break;
3532 
3533       case OP_WORDCHAR:
3534       if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE;
3535       break;
3536 
3537       case OP_NOT_WORDCHAR:
3538       if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE;
3539       break;
3540 
3541       case OP_HSPACE:
3542       switch(chr)
3543         {
3544         HSPACE_CASES: return FALSE;
3545         default: break;
3546         }
3547       break;
3548 
3549       case OP_NOT_HSPACE:
3550       switch(chr)
3551         {
3552         HSPACE_CASES: break;
3553         default: return FALSE;
3554         }
3555       break;
3556 
3557       case OP_ANYNL:
3558       case OP_VSPACE:
3559       switch(chr)
3560         {
3561         VSPACE_CASES: return FALSE;
3562         default: break;
3563         }
3564       break;
3565 
3566       case OP_NOT_VSPACE:
3567       switch(chr)
3568         {
3569         VSPACE_CASES: break;
3570         default: return FALSE;
3571         }
3572       break;
3573 
3574       case OP_DOLL:
3575       case OP_EODN:
3576       switch (chr)
3577         {
3578         case CHAR_CR:
3579         case CHAR_LF:
3580         case CHAR_VT:
3581         case CHAR_FF:
3582         case CHAR_NEL:
3583 #ifndef EBCDIC
3584         case 0x2028:
3585         case 0x2029:
3586 #endif  /* Not EBCDIC */
3587         return FALSE;
3588         }
3589       break;
3590 
3591       case OP_EOD:    /* Can always possessify before \z */
3592       break;
3593 
3594 #ifdef SUPPORT_UCP
3595       case OP_PROP:
3596       case OP_NOTPROP:
3597       if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3598             list_ptr[0] == OP_NOTPROP))
3599         return FALSE;
3600       break;
3601 #endif
3602 
3603       case OP_NCLASS:
3604       if (chr > 255) return FALSE;
3605       /* Fall through */
3606 
3607       case OP_CLASS:
3608       if (chr > 255) break;
3609       class_bitset = (pcre_uint8 *)
3610         ((list_ptr == list ? code : base_end) - list_ptr[2]);
3611       if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE;
3612       break;
3613 
3614 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3615       case OP_XCLASS:
3616       if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
3617           list_ptr[2] + LINK_SIZE, utf)) return FALSE;
3618       break;
3619 #endif
3620 
3621       default:
3622       return FALSE;
3623       }
3624 
3625     chr_ptr++;
3626     }
3627   while(*chr_ptr != NOTACHAR);
3628 
3629   /* At least one character must be matched from this opcode. */
3630 
3631   if (list[1] == 0) return TRUE;
3632   }
3633 
3634 /* Control never reaches here. There used to be a fail-save return FALSE; here,
3635 but some compilers complain about an unreachable statement. */
3636 
3637 }
3638 
3639 
3640 
3641 /*************************************************
3642 *    Scan compiled regex for auto-possession     *
3643 *************************************************/
3644 
3645 /* Replaces single character iterations with their possessive alternatives
3646 if appropriate. This function modifies the compiled opcode!
3647 
3648 Arguments:
3649   code        points to start of the byte code
3650   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
3651   cd          static compile data
3652 
3653 Returns:      nothing
3654 */
3655 
3656 static void
auto_possessify(pcre_uchar * code,BOOL utf,const compile_data * cd)3657 auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)
3658 {
3659 register pcre_uchar c;
3660 const pcre_uchar *end;
3661 pcre_uchar *repeat_opcode;
3662 pcre_uint32 list[8];
3663 int rec_limit;
3664 
3665 for (;;)
3666   {
3667   c = *code;
3668 
3669   /* When a pattern with bad UTF-8 encoding is compiled with NO_UTF_CHECK,
3670   it may compile without complaining, but may get into a loop here if the code
3671   pointer points to a bad value. This is, of course a documentated possibility,
3672   when NO_UTF_CHECK is set, so it isn't a bug, but we can detect this case and
3673   just give up on this optimization. */
3674 
3675   if (c >= OP_TABLE_LENGTH) return;
3676 
3677   if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
3678     {
3679     c -= get_repeat_base(c) - OP_STAR;
3680     end = (c <= OP_MINUPTO) ?
3681       get_chr_property_list(code, utf, cd->fcc, list) : NULL;
3682     list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
3683 
3684     rec_limit = 1000;
3685     if (end != NULL && compare_opcodes(end, utf, cd, list, end, &rec_limit))
3686       {
3687       switch(c)
3688         {
3689         case OP_STAR:
3690         *code += OP_POSSTAR - OP_STAR;
3691         break;
3692 
3693         case OP_MINSTAR:
3694         *code += OP_POSSTAR - OP_MINSTAR;
3695         break;
3696 
3697         case OP_PLUS:
3698         *code += OP_POSPLUS - OP_PLUS;
3699         break;
3700 
3701         case OP_MINPLUS:
3702         *code += OP_POSPLUS - OP_MINPLUS;
3703         break;
3704 
3705         case OP_QUERY:
3706         *code += OP_POSQUERY - OP_QUERY;
3707         break;
3708 
3709         case OP_MINQUERY:
3710         *code += OP_POSQUERY - OP_MINQUERY;
3711         break;
3712 
3713         case OP_UPTO:
3714         *code += OP_POSUPTO - OP_UPTO;
3715         break;
3716 
3717         case OP_MINUPTO:
3718         *code += OP_POSUPTO - OP_MINUPTO;
3719         break;
3720         }
3721       }
3722     c = *code;
3723     }
3724   else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
3725     {
3726 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3727     if (c == OP_XCLASS)
3728       repeat_opcode = code + GET(code, 1);
3729     else
3730 #endif
3731       repeat_opcode = code + 1 + (32 / sizeof(pcre_uchar));
3732 
3733     c = *repeat_opcode;
3734     if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
3735       {
3736       /* end must not be NULL. */
3737       end = get_chr_property_list(code, utf, cd->fcc, list);
3738 
3739       list[1] = (c & 1) == 0;
3740 
3741       rec_limit = 1000;
3742       if (compare_opcodes(end, utf, cd, list, end, &rec_limit))
3743         {
3744         switch (c)
3745           {
3746           case OP_CRSTAR:
3747           case OP_CRMINSTAR:
3748           *repeat_opcode = OP_CRPOSSTAR;
3749           break;
3750 
3751           case OP_CRPLUS:
3752           case OP_CRMINPLUS:
3753           *repeat_opcode = OP_CRPOSPLUS;
3754           break;
3755 
3756           case OP_CRQUERY:
3757           case OP_CRMINQUERY:
3758           *repeat_opcode = OP_CRPOSQUERY;
3759           break;
3760 
3761           case OP_CRRANGE:
3762           case OP_CRMINRANGE:
3763           *repeat_opcode = OP_CRPOSRANGE;
3764           break;
3765           }
3766         }
3767       }
3768     c = *code;
3769     }
3770 
3771   switch(c)
3772     {
3773     case OP_END:
3774     return;
3775 
3776     case OP_TYPESTAR:
3777     case OP_TYPEMINSTAR:
3778     case OP_TYPEPLUS:
3779     case OP_TYPEMINPLUS:
3780     case OP_TYPEQUERY:
3781     case OP_TYPEMINQUERY:
3782     case OP_TYPEPOSSTAR:
3783     case OP_TYPEPOSPLUS:
3784     case OP_TYPEPOSQUERY:
3785     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
3786     break;
3787 
3788     case OP_TYPEUPTO:
3789     case OP_TYPEMINUPTO:
3790     case OP_TYPEEXACT:
3791     case OP_TYPEPOSUPTO:
3792     if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
3793       code += 2;
3794     break;
3795 
3796 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3797     case OP_XCLASS:
3798     code += GET(code, 1);
3799     break;
3800 #endif
3801 
3802     case OP_MARK:
3803     case OP_PRUNE_ARG:
3804     case OP_SKIP_ARG:
3805     case OP_THEN_ARG:
3806     code += code[1];
3807     break;
3808     }
3809 
3810   /* Add in the fixed length from the table */
3811 
3812   code += PRIV(OP_lengths)[c];
3813 
3814   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
3815   a multi-byte character. The length in the table is a minimum, so we have to
3816   arrange to skip the extra bytes. */
3817 
3818 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
3819   if (utf) switch(c)
3820     {
3821     case OP_CHAR:
3822     case OP_CHARI:
3823     case OP_NOT:
3824     case OP_NOTI:
3825     case OP_STAR:
3826     case OP_MINSTAR:
3827     case OP_PLUS:
3828     case OP_MINPLUS:
3829     case OP_QUERY:
3830     case OP_MINQUERY:
3831     case OP_UPTO:
3832     case OP_MINUPTO:
3833     case OP_EXACT:
3834     case OP_POSSTAR:
3835     case OP_POSPLUS:
3836     case OP_POSQUERY:
3837     case OP_POSUPTO:
3838     case OP_STARI:
3839     case OP_MINSTARI:
3840     case OP_PLUSI:
3841     case OP_MINPLUSI:
3842     case OP_QUERYI:
3843     case OP_MINQUERYI:
3844     case OP_UPTOI:
3845     case OP_MINUPTOI:
3846     case OP_EXACTI:
3847     case OP_POSSTARI:
3848     case OP_POSPLUSI:
3849     case OP_POSQUERYI:
3850     case OP_POSUPTOI:
3851     case OP_NOTSTAR:
3852     case OP_NOTMINSTAR:
3853     case OP_NOTPLUS:
3854     case OP_NOTMINPLUS:
3855     case OP_NOTQUERY:
3856     case OP_NOTMINQUERY:
3857     case OP_NOTUPTO:
3858     case OP_NOTMINUPTO:
3859     case OP_NOTEXACT:
3860     case OP_NOTPOSSTAR:
3861     case OP_NOTPOSPLUS:
3862     case OP_NOTPOSQUERY:
3863     case OP_NOTPOSUPTO:
3864     case OP_NOTSTARI:
3865     case OP_NOTMINSTARI:
3866     case OP_NOTPLUSI:
3867     case OP_NOTMINPLUSI:
3868     case OP_NOTQUERYI:
3869     case OP_NOTMINQUERYI:
3870     case OP_NOTUPTOI:
3871     case OP_NOTMINUPTOI:
3872     case OP_NOTEXACTI:
3873     case OP_NOTPOSSTARI:
3874     case OP_NOTPOSPLUSI:
3875     case OP_NOTPOSQUERYI:
3876     case OP_NOTPOSUPTOI:
3877     if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
3878     break;
3879     }
3880 #else
3881   (void)(utf);  /* Keep compiler happy by referencing function argument */
3882 #endif
3883   }
3884 }
3885 
3886 
3887 
3888 /*************************************************
3889 *           Check for POSIX class syntax         *
3890 *************************************************/
3891 
3892 /* This function is called when the sequence "[:" or "[." or "[=" is
3893 encountered in a character class. It checks whether this is followed by a
3894 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
3895 reach an unescaped ']' without the special preceding character, return FALSE.
3896 
3897 Originally, this function only recognized a sequence of letters between the
3898 terminators, but it seems that Perl recognizes any sequence of characters,
3899 though of course unknown POSIX names are subsequently rejected. Perl gives an
3900 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
3901 didn't consider this to be a POSIX class. Likewise for [:1234:].
3902 
3903 The problem in trying to be exactly like Perl is in the handling of escapes. We
3904 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
3905 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
3906 below handles the special cases \\ and \], but does not try to do any other
3907 escape processing. This makes it different from Perl for cases such as
3908 [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
3909 not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
3910 when Perl does, I think.
3911 
3912 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
3913 It seems that the appearance of a nested POSIX class supersedes an apparent
3914 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
3915 a digit.
3916 
3917 In Perl, unescaped square brackets may also appear as part of class names. For
3918 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
3919 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
3920 seem right at all. PCRE does not allow closing square brackets in POSIX class
3921 names.
3922 
3923 Arguments:
3924   ptr      pointer to the initial [
3925   endptr   where to return the end pointer
3926 
3927 Returns:   TRUE or FALSE
3928 */
3929 
3930 static BOOL
check_posix_syntax(const pcre_uchar * ptr,const pcre_uchar ** endptr)3931 check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
3932 {
3933 pcre_uchar terminator;          /* Don't combine these lines; the Solaris cc */
3934 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
3935 for (++ptr; *ptr != CHAR_NULL; ptr++)
3936   {
3937   if (*ptr == CHAR_BACKSLASH &&
3938       (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET ||
3939        ptr[1] == CHAR_BACKSLASH))
3940     ptr++;
3941   else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
3942             *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
3943   else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3944     {
3945     *endptr = ptr;
3946     return TRUE;
3947     }
3948   }
3949 return FALSE;
3950 }
3951 
3952 
3953 
3954 
3955 /*************************************************
3956 *          Check POSIX class name                *
3957 *************************************************/
3958 
3959 /* This function is called to check the name given in a POSIX-style class entry
3960 such as [:alnum:].
3961 
3962 Arguments:
3963   ptr        points to the first letter
3964   len        the length of the name
3965 
3966 Returns:     a value representing the name, or -1 if unknown
3967 */
3968 
3969 static int
check_posix_name(const pcre_uchar * ptr,int len)3970 check_posix_name(const pcre_uchar *ptr, int len)
3971 {
3972 const char *pn = posix_names;
3973 register int yield = 0;
3974 while (posix_name_lengths[yield] != 0)
3975   {
3976   if (len == posix_name_lengths[yield] &&
3977     STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
3978   pn += posix_name_lengths[yield] + 1;
3979   yield++;
3980   }
3981 return -1;
3982 }
3983 
3984 
3985 /*************************************************
3986 *    Adjust OP_RECURSE items in repeated group   *
3987 *************************************************/
3988 
3989 /* OP_RECURSE items contain an offset from the start of the regex to the group
3990 that is referenced. This means that groups can be replicated for fixed
3991 repetition simply by copying (because the recursion is allowed to refer to
3992 earlier groups that are outside the current group). However, when a group is
3993 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
3994 inserted before it, after it has been compiled. This means that any OP_RECURSE
3995 items within it that refer to the group itself or any contained groups have to
3996 have their offsets adjusted. That one of the jobs of this function. Before it
3997 is called, the partially compiled regex must be temporarily terminated with
3998 OP_END.
3999 
4000 This function has been extended to cope with forward references for recursions
4001 and subroutine calls. It must check the list of such references for the
4002 group we are dealing with. If it finds that one of the recursions in the
4003 current group is on this list, it does not adjust the value in the reference
4004 (which is a group number). After the group has been scanned, all the offsets in
4005 the forward reference list for the group are adjusted.
4006 
4007 Arguments:
4008   group      points to the start of the group
4009   adjust     the amount by which the group is to be moved
4010   utf        TRUE in UTF-8 / UTF-16 / UTF-32 mode
4011   cd         contains pointers to tables etc.
4012   save_hwm_offset   the hwm forward reference offset at the start of the group
4013 
4014 Returns:     nothing
4015 */
4016 
4017 static void
adjust_recurse(pcre_uchar * group,int adjust,BOOL utf,compile_data * cd,size_t save_hwm_offset)4018 adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
4019   size_t save_hwm_offset)
4020 {
4021 int offset;
4022 pcre_uchar *hc;
4023 pcre_uchar *ptr = group;
4024 
4025 while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
4026   {
4027   for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
4028        hc += LINK_SIZE)
4029     {
4030     offset = (int)GET(hc, 0);
4031     if (cd->start_code + offset == ptr + 1) break;
4032     }
4033 
4034   /* If we have not found this recursion on the forward reference list, adjust
4035   the recursion's offset if it's after the start of this group. */
4036 
4037   if (hc >= cd->hwm)
4038     {
4039     offset = (int)GET(ptr, 1);
4040     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
4041     }
4042 
4043   ptr += 1 + LINK_SIZE;
4044   }
4045 
4046 /* Now adjust all forward reference offsets for the group. */
4047 
4048 for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
4049      hc += LINK_SIZE)
4050   {
4051   offset = (int)GET(hc, 0);
4052   PUT(hc, 0, offset + adjust);
4053   }
4054 }
4055 
4056 
4057 
4058 /*************************************************
4059 *        Insert an automatic callout point       *
4060 *************************************************/
4061 
4062 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
4063 callout points before each pattern item.
4064 
4065 Arguments:
4066   code           current code pointer
4067   ptr            current pattern pointer
4068   cd             pointers to tables etc
4069 
4070 Returns:         new code pointer
4071 */
4072 
4073 static pcre_uchar *
auto_callout(pcre_uchar * code,const pcre_uchar * ptr,compile_data * cd)4074 auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
4075 {
4076 *code++ = OP_CALLOUT;
4077 *code++ = 255;
4078 PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
4079 PUT(code, LINK_SIZE, 0);                       /* Default length */
4080 return code + 2 * LINK_SIZE;
4081 }
4082 
4083 
4084 
4085 /*************************************************
4086 *         Complete a callout item                *
4087 *************************************************/
4088 
4089 /* A callout item contains the length of the next item in the pattern, which
4090 we can't fill in till after we have reached the relevant point. This is used
4091 for both automatic and manual callouts.
4092 
4093 Arguments:
4094   previous_callout   points to previous callout item
4095   ptr                current pattern pointer
4096   cd                 pointers to tables etc
4097 
4098 Returns:             nothing
4099 */
4100 
4101 static void
complete_callout(pcre_uchar * previous_callout,const pcre_uchar * ptr,compile_data * cd)4102 complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
4103 {
4104 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
4105 PUT(previous_callout, 2 + LINK_SIZE, length);
4106 }
4107 
4108 
4109 
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given the start and end of a class range (UTF mode with UCP support), this
function scans upwards through the characters looking for runs of characters
in the "other" case. Every call hands back the next run and moves the start
value on. A character that has more than one other case is handed back by
itself, with a special return value.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        -1 when no more
               0 when a range is returned
              >0 the CASESET offset for char with multiple other cases
                in this case, ocptr contains the original
*/

static int
get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
  pcre_uint32 *odptr)
{
pcre_uint32 ch = *cptr;
pcre_uint32 partner;
pcre_uint32 expect;

/* Skip characters that are their own other case. Stop at the first one that
either belongs to a caseless set (return its set offset at once) or has a
different single other case. */

for (;;)
  {
  unsigned int set_offset;

  if (ch > d) return -1;        /* Nothing more in the input range */

  set_offset = UCD_CASESET(ch);
  if (set_offset != 0)
    {
    *ocptr = ch;                /* Character that has the set */
    *cptr = ch + 1;             /* Where the next call resumes */
    return (int)set_offset;
    }

  partner = UCD_OTHERCASE(ch);
  if (partner != ch) break;
  ch++;
  }

/* ch has exactly one other case. Extend the run for as long as the following
characters are within the input range, have no caseless set, and map onto
consecutive other-case values. */

*ocptr = partner;
expect = partner + 1;

ch++;
while (ch <= d && UCD_CASESET(ch) == 0 && UCD_OTHERCASE(ch) == expect)
  {
  expect++;
  ch++;
  }

*odptr = expect - 1;   /* Last value of the othercase run */
*cptr = ch;            /* Where the next call resumes */
return 0;
}
#endif  /* SUPPORT_UCP */
4174 
4175 
4176 
4177 /*************************************************
4178 *        Add a character or range to a class     *
4179 *************************************************/
4180 
4181 /* This function packages up the logic of adding a character or range of
4182 characters to a class. The character values in the arguments will be within the
4183 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
4184 mutually recursive with the function immediately below.
4185 
4186 Arguments:
4187   classbits     the bit map for characters < 256
4188   uchardptr     points to the pointer for extra data
4189   options       the options word
4190   cd            contains pointers to tables etc.
4191   start         start of range character
4192   end           end of range character
4193 
4194 Returns:        the number of < 256 characters added
4195                 the pointer to extra data is updated
4196 */
4197 
4198 static int
add_to_class(pcre_uint8 * classbits,pcre_uchar ** uchardptr,int options,compile_data * cd,pcre_uint32 start,pcre_uint32 end)4199 add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4200   compile_data *cd, pcre_uint32 start, pcre_uint32 end)
4201 {
4202 pcre_uint32 c;
4203 pcre_uint32 classbits_end = (end <= 0xff ? end : 0xff);
4204 int n8 = 0;
4205 
4206 /* If caseless matching is required, scan the range and process alternate
4207 cases. In Unicode, there are 8-bit characters that have alternate cases that
4208 are greater than 255 and vice-versa. Sometimes we can just extend the original
4209 range. */
4210 
4211 if ((options & PCRE_CASELESS) != 0)
4212   {
4213 #ifdef SUPPORT_UCP
4214   if ((options & PCRE_UTF8) != 0)
4215     {
4216     int rc;
4217     pcre_uint32 oc, od;
4218 
4219     options &= ~PCRE_CASELESS;   /* Remove for recursive calls */
4220     c = start;
4221 
4222     while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
4223       {
4224       /* Handle a single character that has more than one other case. */
4225 
4226       if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
4227         PRIV(ucd_caseless_sets) + rc, oc);
4228 
4229       /* Do nothing if the other case range is within the original range. */
4230 
4231       else if (oc >= start && od <= end) continue;
4232 
4233       /* Extend the original range if there is overlap, noting that if oc < c, we
4234       can't have od > end because a subrange is always shorter than the basic
4235       range. Otherwise, use a recursive call to add the additional range. */
4236 
4237       else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
4238       else if (od > end && oc <= end + 1)
4239         {
4240         end = od;       /* Extend upwards */
4241         if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
4242         }
4243       else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
4244       }
4245     }
4246   else
4247 #endif  /* SUPPORT_UCP */
4248 
4249   /* Not UTF-mode, or no UCP */
4250 
4251   for (c = start; c <= classbits_end; c++)
4252     {
4253     SETBIT(classbits, cd->fcc[c]);
4254     n8++;
4255     }
4256   }
4257 
4258 /* Now handle the original range. Adjust the final value according to the bit
4259 length - this means that the same lists of (e.g.) horizontal spaces can be used
4260 in all cases. */
4261 
4262 #if defined COMPILE_PCRE8
4263 #ifdef SUPPORT_UTF
4264   if ((options & PCRE_UTF8) == 0)
4265 #endif
4266   if (end > 0xff) end = 0xff;
4267 
4268 #elif defined COMPILE_PCRE16
4269 #ifdef SUPPORT_UTF
4270   if ((options & PCRE_UTF16) == 0)
4271 #endif
4272   if (end > 0xffff) end = 0xffff;
4273 
4274 #endif /* COMPILE_PCRE[8|16] */
4275 
4276 /* Use the bitmap for characters < 256. Otherwise use extra data.*/
4277 
4278 for (c = start; c <= classbits_end; c++)
4279   {
4280   /* Regardless of start, c will always be <= 255. */
4281   SETBIT(classbits, c);
4282   n8++;
4283   }
4284 
4285 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4286 if (start <= 0xff) start = 0xff + 1;
4287 
4288 if (end >= start)
4289   {
4290   pcre_uchar *uchardata = *uchardptr;
4291 #ifdef SUPPORT_UTF
4292   if ((options & PCRE_UTF8) != 0)  /* All UTFs use the same flag bit */
4293     {
4294     if (start < end)
4295       {
4296       *uchardata++ = XCL_RANGE;
4297       uchardata += PRIV(ord2utf)(start, uchardata);
4298       uchardata += PRIV(ord2utf)(end, uchardata);
4299       }
4300     else if (start == end)
4301       {
4302       *uchardata++ = XCL_SINGLE;
4303       uchardata += PRIV(ord2utf)(start, uchardata);
4304       }
4305     }
4306   else
4307 #endif  /* SUPPORT_UTF */
4308 
4309   /* Without UTF support, character values are constrained by the bit length,
4310   and can only be > 256 for 16-bit and 32-bit libraries. */
4311 
4312 #ifdef COMPILE_PCRE8
4313     {}
4314 #else
4315   if (start < end)
4316     {
4317     *uchardata++ = XCL_RANGE;
4318     *uchardata++ = start;
4319     *uchardata++ = end;
4320     }
4321   else if (start == end)
4322     {
4323     *uchardata++ = XCL_SINGLE;
4324     *uchardata++ = start;
4325     }
4326 #endif
4327 
4328   *uchardptr = uchardata;   /* Updata extra data pointer */
4329   }
4330 #endif /* SUPPORT_UTF || !COMPILE_PCRE8 */
4331 
4332 return n8;    /* Number of 8-bit characters */
4333 }
4334 
4335 
4336 
4337 
4338 /*************************************************
4339 *        Add a list of characters to a class     *
4340 *************************************************/
4341 
4342 /* This function is used for adding a list of case-equivalent characters to a
4343 class, and also for adding a list of horizontal or vertical whitespace. If the
4344 list is in order (which it should be), ranges of characters are detected and
4345 handled appropriately. This function is mutually recursive with the function
4346 above.
4347 
4348 Arguments:
4349   classbits     the bit map for characters < 256
4350   uchardptr     points to the pointer for extra data
4351   options       the options word
4352   cd            contains pointers to tables etc.
4353   p             points to row of 32-bit values, terminated by NOTACHAR
4354   except        character to omit; this is used when adding lists of
4355                   case-equivalent characters to avoid including the one we
4356                   already know about
4357 
4358 Returns:        the number of < 256 characters added
4359                 the pointer to extra data is updated
4360 */
4361 
4362 static int
add_list_to_class(pcre_uint8 * classbits,pcre_uchar ** uchardptr,int options,compile_data * cd,const pcre_uint32 * p,unsigned int except)4363 add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4364   compile_data *cd, const pcre_uint32 *p, unsigned int except)
4365 {
4366 int n8 = 0;
4367 while (p[0] < NOTACHAR)
4368   {
4369   int n = 0;
4370   if (p[0] != except)
4371     {
4372     while(p[n+1] == p[0] + n + 1) n++;
4373     n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
4374     }
4375   p += n + 1;
4376   }
4377 return n8;
4378 }
4379 
4380 
4381 
4382 /*************************************************
4383 *    Add characters not in a list to a class     *
4384 *************************************************/
4385 
4386 /* This function is used for adding the complement of a list of horizontal or
4387 vertical whitespace to a class. The list must be in order.
4388 
4389 Arguments:
4390   classbits     the bit map for characters < 256
4391   uchardptr     points to the pointer for extra data
4392   options       the options word
4393   cd            contains pointers to tables etc.
4394   p             points to row of 32-bit values, terminated by NOTACHAR
4395 
4396 Returns:        the number of < 256 characters added
4397                 the pointer to extra data is updated
4398 */
4399 
4400 static int
add_not_list_to_class(pcre_uint8 * classbits,pcre_uchar ** uchardptr,int options,compile_data * cd,const pcre_uint32 * p)4401 add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
4402   int options, compile_data *cd, const pcre_uint32 *p)
4403 {
4404 BOOL utf = (options & PCRE_UTF8) != 0;
4405 int n8 = 0;
4406 if (p[0] > 0)
4407   n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
4408 while (p[0] < NOTACHAR)
4409   {
4410   while (p[1] == p[0] + 1) p++;
4411   n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
4412     (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
4413   p++;
4414   }
4415 return n8;
4416 }
4417 
4418 
4419 
4420 /*************************************************
4421 *           Compile one branch                   *
4422 *************************************************/
4423 
4424 /* Scan the pattern, compiling it into a vector. If the options are
4425 changed during the branch, the pointer is used to change the external options
4426 bits. This function is used during the pre-compile phase when we are trying
4427 to find out the amount of memory needed, as well as during the real compile
4428 phase. The value of lengthptr distinguishes the two phases.
4429 
4430 Arguments:
4431   optionsptr        pointer to the option bits
4432   codeptr           points to the pointer to the current code point
4433   ptrptr            points to the current pattern pointer
4434   errorcodeptr      points to error code variable
4435   firstcharptr      place to put the first required character
4436   firstcharflagsptr place to put the first character flags, or a negative number
4437   reqcharptr        place to put the last required character
4438   reqcharflagsptr   place to put the last required character flags, or a negative number
4439   bcptr             points to current branch chain
4440   cond_depth        conditional nesting depth
4441   cd                contains pointers to tables etc.
4442   lengthptr         NULL during the real compile phase
4443                     points to length accumulator during pre-compile phase
4444 
4445 Returns:            TRUE on success
4446                     FALSE, with *errorcodeptr set non-zero on error
4447 */
4448 
4449 static BOOL
compile_branch(int * optionsptr,pcre_uchar ** codeptr,const pcre_uchar ** ptrptr,int * errorcodeptr,pcre_uint32 * firstcharptr,pcre_int32 * firstcharflagsptr,pcre_uint32 * reqcharptr,pcre_int32 * reqcharflagsptr,branch_chain * bcptr,int cond_depth,compile_data * cd,int * lengthptr)4450 compile_branch(int *optionsptr, pcre_uchar **codeptr,
4451   const pcre_uchar **ptrptr, int *errorcodeptr,
4452   pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
4453   pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
4454   branch_chain *bcptr, int cond_depth,
4455   compile_data *cd, int *lengthptr)
4456 {
4457 int repeat_type, op_type;
4458 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
4459 int bravalue = 0;
4460 int greedy_default, greedy_non_default;
4461 pcre_uint32 firstchar, reqchar;
4462 pcre_int32 firstcharflags, reqcharflags;
4463 pcre_uint32 zeroreqchar, zerofirstchar;
4464 pcre_int32 zeroreqcharflags, zerofirstcharflags;
4465 pcre_int32 req_caseopt, reqvary, tempreqvary;
4466 int options = *optionsptr;               /* May change dynamically */
4467 int after_manual_callout = 0;
4468 int length_prevgroup = 0;
4469 register pcre_uint32 c;
4470 int escape;
4471 register pcre_uchar *code = *codeptr;
4472 pcre_uchar *last_code = code;
4473 pcre_uchar *orig_code = code;
4474 pcre_uchar *tempcode;
4475 BOOL inescq = FALSE;
4476 BOOL groupsetfirstchar = FALSE;
4477 const pcre_uchar *ptr = *ptrptr;
4478 const pcre_uchar *tempptr;
4479 const pcre_uchar *nestptr = NULL;
4480 pcre_uchar *previous = NULL;
4481 pcre_uchar *previous_callout = NULL;
4482 size_t item_hwm_offset = 0;
4483 pcre_uint8 classbits[32];
4484 
4485 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
4486 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
4487 dynamically as we process the pattern. */
4488 
4489 #ifdef SUPPORT_UTF
4490 /* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
4491 BOOL utf = (options & PCRE_UTF8) != 0;
4492 #ifndef COMPILE_PCRE32
4493 pcre_uchar utf_chars[6];
4494 #endif
4495 #else
4496 BOOL utf = FALSE;
4497 #endif
4498 
4499 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
4500 class_uchardata always so that it can be passed to add_to_class() always,
4501 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
4502 alternative calls for the different cases. */
4503 
4504 pcre_uchar *class_uchardata;
4505 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4506 BOOL xclass;
4507 pcre_uchar *class_uchardata_base;
4508 #endif
4509 
4510 #ifdef PCRE_DEBUG
4511 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
4512 #endif
4513 
4514 /* Set up the default and non-default settings for greediness */
4515 
4516 greedy_default = ((options & PCRE_UNGREEDY) != 0);
4517 greedy_non_default = greedy_default ^ 1;
4518 
4519 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
4520 matching encountered yet". It gets changed to REQ_NONE if we hit something that
4521 matches a non-fixed char first char; reqchar just remains unset if we never
4522 find one.
4523 
4524 When we hit a repeat whose minimum is zero, we may have to adjust these values
4525 to take the zero repeat into account. This is implemented by setting them to
4526 zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
4527 item types that can be repeated set these backoff variables appropriately. */
4528 
4529 firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
4530 firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
4531 
4532 /* The variable req_caseopt contains either the REQ_CASELESS value
4533 or zero, according to the current setting of the caseless flag. The
4534 REQ_CASELESS leaves the lower 28 bit empty. It is added into the
4535 firstchar or reqchar variables to record the case status of the
4536 value. This is used only for ASCII characters. */
4537 
4538 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
4539 
4540 /* Switch on next character until the end of the branch */
4541 
4542 for (;; ptr++)
4543   {
4544   BOOL negate_class;
4545   BOOL should_flip_negation;
4546   BOOL possessive_quantifier;
4547   BOOL is_quantifier;
4548   BOOL is_recurse;
4549   BOOL reset_bracount;
4550   int class_has_8bitchar;
4551   int class_one_char;
4552 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4553   BOOL xclass_has_prop;
4554 #endif
4555   int newoptions;
4556   int recno;
4557   int refsign;
4558   int skipbytes;
4559   pcre_uint32 subreqchar, subfirstchar;
4560   pcre_int32 subreqcharflags, subfirstcharflags;
4561   int terminator;
4562   unsigned int mclength;
4563   unsigned int tempbracount;
4564   pcre_uint32 ec;
4565   pcre_uchar mcbuffer[8];
4566 
4567   /* Get next character in the pattern */
4568 
4569   c = *ptr;
4570 
4571   /* If we are at the end of a nested substitution, revert to the outer level
4572   string. Nesting only happens one level deep. */
4573 
4574   if (c == CHAR_NULL && nestptr != NULL)
4575     {
4576     ptr = nestptr;
4577     nestptr = NULL;
4578     c = *ptr;
4579     }
4580 
4581   /* If we are in the pre-compile phase, accumulate the length used for the
4582   previous cycle of this loop. */
4583 
4584   if (lengthptr != NULL)
4585     {
4586 #ifdef PCRE_DEBUG
4587     if (code > cd->hwm) cd->hwm = code;                 /* High water info */
4588 #endif
4589     if (code > cd->start_workspace + cd->workspace_size -
4590         WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
4591       {
4592       *errorcodeptr = ERR52;
4593       goto FAILED;
4594       }
4595 
4596     /* There is at least one situation where code goes backwards: this is the
4597     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
4598     the class is simply eliminated. However, it is created first, so we have to
4599     allow memory for it. Therefore, don't ever reduce the length at this point.
4600     */
4601 
4602     if (code < last_code) code = last_code;
4603 
4604     /* Paranoid check for integer overflow */
4605 
4606     if (OFLOW_MAX - *lengthptr < code - last_code)
4607       {
4608       *errorcodeptr = ERR20;
4609       goto FAILED;
4610       }
4611 
4612     *lengthptr += (int)(code - last_code);
4613     DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
4614       (int)(code - last_code), c, c));
4615 
4616     /* If "previous" is set and it is not at the start of the work space, move
4617     it back to there, in order to avoid filling up the work space. Otherwise,
4618     if "previous" is NULL, reset the current code pointer to the start. */
4619 
4620     if (previous != NULL)
4621       {
4622       if (previous > orig_code)
4623         {
4624         memmove(orig_code, previous, IN_UCHARS(code - previous));
4625         code -= previous - orig_code;
4626         previous = orig_code;
4627         }
4628       }
4629     else code = orig_code;
4630 
4631     /* Remember where this code item starts so we can pick up the length
4632     next time round. */
4633 
4634     last_code = code;
4635     }
4636 
4637   /* In the real compile phase, just check the workspace used by the forward
4638   reference list. */
4639 
4640   else if (cd->hwm > cd->start_workspace + cd->workspace_size)
4641     {
4642     *errorcodeptr = ERR52;
4643     goto FAILED;
4644     }
4645 
4646   /* If in \Q...\E, check for the end; if not, we have a literal */
4647 
4648   if (inescq && c != CHAR_NULL)
4649     {
4650     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4651       {
4652       inescq = FALSE;
4653       ptr++;
4654       continue;
4655       }
4656     else
4657       {
4658       if (previous_callout != NULL)
4659         {
4660         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
4661           complete_callout(previous_callout, ptr, cd);
4662         previous_callout = NULL;
4663         }
4664       if ((options & PCRE_AUTO_CALLOUT) != 0)
4665         {
4666         previous_callout = code;
4667         code = auto_callout(code, ptr, cd);
4668         }
4669       goto NORMAL_CHAR;
4670       }
4671     /* Control does not reach here. */
4672     }
4673 
4674   /* In extended mode, skip white space and comments. We need a loop in order
4675   to check for more white space and more comments after a comment. */
4676 
4677   if ((options & PCRE_EXTENDED) != 0)
4678     {
4679     for (;;)
4680       {
4681       while (MAX_255(c) && (cd->ctypes[c] & ctype_space) != 0) c = *(++ptr);
4682       if (c != CHAR_NUMBER_SIGN) break;
4683       ptr++;
4684       while (*ptr != CHAR_NULL)
4685         {
4686         if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
4687           {                          /* IS_NEWLINE sets cd->nllen. */
4688           ptr += cd->nllen;
4689           break;
4690           }
4691         ptr++;
4692 #ifdef SUPPORT_UTF
4693         if (utf) FORWARDCHAR(ptr);
4694 #endif
4695         }
4696       c = *ptr;     /* Either NULL or the char after a newline */
4697       }
4698     }
4699 
4700   /* See if the next thing is a quantifier. */
4701 
4702   is_quantifier =
4703     c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4704     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4705 
4706   /* Fill in length of a previous callout, except when the next thing is a
4707   quantifier or when processing a property substitution string in UCP mode. */
4708 
4709   if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
4710        after_manual_callout-- <= 0)
4711     {
4712     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
4713       complete_callout(previous_callout, ptr, cd);
4714     previous_callout = NULL;
4715     }
4716 
4717   /* Create auto callout, except for quantifiers, or while processing property
4718   strings that are substituted for \w etc in UCP mode. */
4719 
4720   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
4721     {
4722     previous_callout = code;
4723     code = auto_callout(code, ptr, cd);
4724     }
4725 
4726   /* Process the next pattern item. */
4727 
4728   switch(c)
4729     {
4730     /* ===================================================================*/
4731     case CHAR_NULL:                /* The branch terminates at string end */
4732     case CHAR_VERTICAL_LINE:       /* or | or ) */
4733     case CHAR_RIGHT_PARENTHESIS:
4734     *firstcharptr = firstchar;
4735     *firstcharflagsptr = firstcharflags;
4736     *reqcharptr = reqchar;
4737     *reqcharflagsptr = reqcharflags;
4738     *codeptr = code;
4739     *ptrptr = ptr;
4740     if (lengthptr != NULL)
4741       {
4742       if (OFLOW_MAX - *lengthptr < code - last_code)
4743         {
4744         *errorcodeptr = ERR20;
4745         goto FAILED;
4746         }
4747       *lengthptr += (int)(code - last_code);   /* To include callout length */
4748       DPRINTF((">> end branch\n"));
4749       }
4750     return TRUE;
4751 
4752 
4753     /* ===================================================================*/
4754     /* Handle single-character metacharacters. In multiline mode, ^ disables
4755     the setting of any following char as a first character. */
4756 
4757     case CHAR_CIRCUMFLEX_ACCENT:
4758     previous = NULL;
4759     if ((options & PCRE_MULTILINE) != 0)
4760       {
4761       if (firstcharflags == REQ_UNSET)
4762         zerofirstcharflags = firstcharflags = REQ_NONE;
4763       *code++ = OP_CIRCM;
4764       }
4765     else *code++ = OP_CIRC;
4766     break;
4767 
4768     case CHAR_DOLLAR_SIGN:
4769     previous = NULL;
4770     *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
4771     break;
4772 
4773     /* There can never be a first char if '.' is first, whatever happens about
4774     repeats. The value of reqchar doesn't change either. */
4775 
4776     case CHAR_DOT:
4777     if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4778     zerofirstchar = firstchar;
4779     zerofirstcharflags = firstcharflags;
4780     zeroreqchar = reqchar;
4781     zeroreqcharflags = reqcharflags;
4782     previous = code;
4783     item_hwm_offset = cd->hwm - cd->start_workspace;
4784     *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
4785     break;
4786 
4787 
4788     /* ===================================================================*/
4789     /* Character classes. If the included characters are all < 256, we build a
4790     32-byte bitmap of the permitted characters, except in the special case
4791     where there is only one such character. For negated classes, we build the
4792     map as usual, then invert it at the end. However, we use a different opcode
4793     so that data characters > 255 can be handled correctly.
4794 
4795     If the class contains characters outside the 0-255 range, a different
4796     opcode is compiled. It may optionally have a bit map for characters < 256,
4797 but those above are explicitly listed afterwards. A flag byte tells
4798     whether the bitmap is present, and whether this is a negated class or not.
4799 
4800     In JavaScript compatibility mode, an isolated ']' causes an error. In
4801     default (Perl) mode, it is treated as a data character. */
4802 
4803     case CHAR_RIGHT_SQUARE_BRACKET:
4804     if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4805       {
4806       *errorcodeptr = ERR64;
4807       goto FAILED;
4808       }
4809     goto NORMAL_CHAR;
4810 
4811     /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
4812     used for "start of word" and "end of word". As these are otherwise illegal
4813     sequences, we don't break anything by recognizing them. They are replaced
4814     by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
4815     erroneous and are handled by the normal code below. */
4816 
4817     case CHAR_LEFT_SQUARE_BRACKET:
4818     if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
4819       {
4820       nestptr = ptr + 7;
4821       ptr = sub_start_of_word - 1;
4822       continue;
4823       }
4824 
4825     if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
4826       {
4827       nestptr = ptr + 7;
4828       ptr = sub_end_of_word - 1;
4829       continue;
4830       }
4831 
4832     /* Handle a real character class. */
4833 
4834     previous = code;
4835     item_hwm_offset = cd->hwm - cd->start_workspace;
4836 
4837     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
4838     they are encountered at the top level, so we'll do that too. */
4839 
4840     if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4841          ptr[1] == CHAR_EQUALS_SIGN) &&
4842         check_posix_syntax(ptr, &tempptr))
4843       {
4844       *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
4845       goto FAILED;
4846       }
4847 
4848     /* If the first character is '^', set the negation flag and skip it. Also,
4849     if the first few characters (either before or after ^) are \Q\E or \E we
4850     skip them too. This makes for compatibility with Perl. */
4851 
4852     negate_class = FALSE;
4853     for (;;)
4854       {
4855       c = *(++ptr);
4856       if (c == CHAR_BACKSLASH)
4857         {
4858         if (ptr[1] == CHAR_E)
4859           ptr++;
4860         else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
4861           ptr += 3;
4862         else
4863           break;
4864         }
4865       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4866         negate_class = TRUE;
4867       else break;
4868       }
4869 
4870     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
4871     an initial ']' is taken as a data character -- the code below handles
4872     that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
4873     [^] must match any character, so generate OP_ALLANY. */
4874 
4875     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4876         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4877       {
4878       *code++ = negate_class? OP_ALLANY : OP_FAIL;
4879       if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4880       zerofirstchar = firstchar;
4881       zerofirstcharflags = firstcharflags;
4882       break;
4883       }
4884 
4885     /* If a class contains a negative special such as \S, we need to flip the
4886     negation flag at the end, so that support for characters > 255 works
4887     correctly (they are all included in the class). */
4888 
4889     should_flip_negation = FALSE;
4890 
4891     /* Extended class (xclass) will be used when characters > 255
4892     might match. */
4893 
4894 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4895     xclass = FALSE;
4896     class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
4897     class_uchardata_base = class_uchardata;   /* Save the start */
4898 #endif
4899 
4900     /* For optimization purposes, we track some properties of the class:
4901     class_has_8bitchar will be non-zero if the class contains at least one <
4902     256 character; class_one_char will be 1 if the class contains just one
4903     character; xclass_has_prop will be TRUE if unicode property checks
4904     are present in the class. */
4905 
4906     class_has_8bitchar = 0;
4907     class_one_char = 0;
4908 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4909     xclass_has_prop = FALSE;
4910 #endif
4911 
4912     /* Initialize the 32-char bit map to all zeros. We build the map in a
4913     temporary bit of memory, in case the class contains fewer than two
4914     8-bit characters because in that case the compiled code doesn't use the bit
4915     map. */
4916 
4917     memset(classbits, 0, 32 * sizeof(pcre_uint8));
4918 
4919     /* Process characters until ] is reached. By writing this as a "do" it
4920     means that an initial ] is taken as a data character. At the start of the
4921     loop, c contains the first byte of the character. */
4922 
4923     if (c != CHAR_NULL) do
4924       {
4925       const pcre_uchar *oldptr;
4926 
4927 #ifdef SUPPORT_UTF
4928       if (utf && HAS_EXTRALEN(c))
4929         {                           /* Braces are required because the */
4930         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
4931         }
4932 #endif
4933 
4934 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4935       /* In the pre-compile phase, accumulate the length of any extra
4936       data and reset the pointer. This is so that very large classes that
4937       contain a zillion > 255 characters no longer overwrite the work space
4938       (which is on the stack). We have to remember that there was XCLASS data,
4939       however. */
4940 
4941       if (class_uchardata > class_uchardata_base) xclass = TRUE;
4942 
4943       if (lengthptr != NULL && class_uchardata > class_uchardata_base)
4944         {
4945         *lengthptr += (int)(class_uchardata - class_uchardata_base);
4946         class_uchardata = class_uchardata_base;
4947         }
4948 #endif
4949 
4950       /* Inside \Q...\E everything is literal except \E */
4951 
4952       if (inescq)
4953         {
4954         if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
4955           {
4956           inescq = FALSE;                   /* Reset literal state */
4957           ptr++;                            /* Skip the 'E' */
4958           continue;                         /* Carry on with next */
4959           }
4960         goto CHECK_RANGE;                   /* Could be range if \E follows */
4961         }
4962 
4963       /* Handle POSIX class names. Perl allows a negation extension of the
4964       form [:^name:]. A square bracket that doesn't match the syntax is
4965       treated as a literal. We also recognize the POSIX constructions
4966       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
4967       5.6 and 5.8 do. */
4968 
4969       if (c == CHAR_LEFT_SQUARE_BRACKET &&
4970           (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4971            ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
4972         {
4973         BOOL local_negate = FALSE;
4974         int posix_class, taboffset, tabopt;
4975         register const pcre_uint8 *cbits = cd->cbits;
4976         pcre_uint8 pbits[32];
4977 
4978         if (ptr[1] != CHAR_COLON)
4979           {
4980           *errorcodeptr = ERR31;
4981           goto FAILED;
4982           }
4983 
4984         ptr += 2;
4985         if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
4986           {
4987           local_negate = TRUE;
4988           should_flip_negation = TRUE;  /* Note negative special */
4989           ptr++;
4990           }
4991 
4992         posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
4993         if (posix_class < 0)
4994           {
4995           *errorcodeptr = ERR30;
4996           goto FAILED;
4997           }
4998 
4999         /* If matching is caseless, upper and lower are converted to
5000         alpha. This relies on the fact that the class table starts with
5001         alpha, lower, upper as the first 3 entries. */
5002 
5003         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
5004           posix_class = 0;
5005 
5006         /* When PCRE_UCP is set, some of the POSIX classes are converted to
5007         different escape sequences that use Unicode properties \p or \P. Others
5008         that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
5009         directly. */
5010 
5011 #ifdef SUPPORT_UCP
5012         if ((options & PCRE_UCP) != 0)
5013           {
5014           unsigned int ptype = 0;
5015           int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
5016 
5017           /* The posix_substitutes table specifies which POSIX classes can be
5018           converted to \p or \P items. */
5019 
5020           if (posix_substitutes[pc] != NULL)
5021             {
5022             nestptr = tempptr + 1;
5023             ptr = posix_substitutes[pc] - 1;
5024             continue;
5025             }
5026 
5027           /* There are three other classes that generate special property calls
5028           that are recognized only in an XCLASS. */
5029 
5030           else switch(posix_class)
5031             {
5032             case PC_GRAPH:
5033             ptype = PT_PXGRAPH;
5034             /* Fall through */
5035             case PC_PRINT:
5036             if (ptype == 0) ptype = PT_PXPRINT;
5037             /* Fall through */
5038             case PC_PUNCT:
5039             if (ptype == 0) ptype = PT_PXPUNCT;
5040             *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
5041             *class_uchardata++ = ptype;
5042             *class_uchardata++ = 0;
5043             xclass_has_prop = TRUE;
5044             ptr = tempptr + 1;
5045             continue;
5046 
5047             /* For the other POSIX classes (ascii, xdigit) we are going to fall
5048             through to the non-UCP case and build a bit map for characters with
5049             code points less than 256. If we are in a negated POSIX class
5050             within a non-negated overall class, characters with code points
5051             greater than 255 must all match. In the special case where we have
5052             not yet generated any xclass data, and this is the final item in
5053             the overall class, we need do nothing: later on, the opcode
5054             OP_NCLASS will be used to indicate that characters greater than 255
5055             are acceptable. If we have already seen an xclass item or one may
5056             follow (we have to assume that it might if this is not the end of
5057             the class), explicitly match all wide codepoints. */
5058 
5059             default:
5060             if (!negate_class && local_negate &&
5061                 (xclass || tempptr[2] != CHAR_RIGHT_SQUARE_BRACKET))
5062               {
5063               *class_uchardata++ = XCL_RANGE;
5064               class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
5065               class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
5066               }
5067             break;
5068             }
5069           }
5070 #endif
5071         /* In the non-UCP case, or when UCP makes no difference, we build the
5072         bit map for the POSIX class in a chunk of local store because we may be
5073         adding and subtracting from it, and we don't want to subtract bits that
5074         may be in the main map already. At the end we or the result into the
5075         bit map that is being built. */
5076 
5077         posix_class *= 3;
5078 
5079         /* Copy in the first table (always present) */
5080 
5081         memcpy(pbits, cbits + posix_class_maps[posix_class],
5082           32 * sizeof(pcre_uint8));
5083 
5084         /* If there is a second table, add or remove it as required. */
5085 
5086         taboffset = posix_class_maps[posix_class + 1];
5087         tabopt = posix_class_maps[posix_class + 2];
5088 
5089         if (taboffset >= 0)
5090           {
5091           if (tabopt >= 0)
5092             for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
5093           else
5094             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
5095           }
5096 
5097         /* Now see if we need to remove any special characters. An option
5098         value of 1 removes vertical space and 2 removes underscore. */
5099 
5100         if (tabopt < 0) tabopt = -tabopt;
5101         if (tabopt == 1) pbits[1] &= ~0x3c;
5102           else if (tabopt == 2) pbits[11] &= 0x7f;
5103 
5104         /* Add the POSIX table or its complement into the main table that is
5105         being built and we are done. */
5106 
5107         if (local_negate)
5108           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
5109         else
5110           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
5111 
5112         ptr = tempptr + 1;
5113         /* Every class contains at least one < 256 character. */
5114         class_has_8bitchar = 1;
5115         /* Every class contains at least two characters. */
5116         class_one_char = 2;
5117         continue;    /* End of POSIX syntax handling */
5118         }
5119 
5120       /* Backslash may introduce a single character, or it may introduce one
5121       of the specials, which just set a flag. The sequence \b is a special
5122       case. Inside a class (and only there) it is treated as backspace. We
5123       assume that other escapes have more than one character in them, so
5124       speculatively set both class_has_8bitchar and class_one_char bigger
5125       than one. Unrecognized escapes fall through and are either treated
5126       as literal characters (by default), or are faulted if
5127       PCRE_EXTRA is set. */
5128 
5129       if (c == CHAR_BACKSLASH)
5130         {
5131         escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options,
5132           TRUE);
5133         if (*errorcodeptr != 0) goto FAILED;
5134         if (escape == 0) c = ec;
5135         else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
5136         else if (escape == ESC_N)          /* \N is not supported in a class */
5137           {
5138           *errorcodeptr = ERR71;
5139           goto FAILED;
5140           }
5141         else if (escape == ESC_Q)            /* Handle start of quoted string */
5142           {
5143           if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5144             {
5145             ptr += 2; /* avoid empty string */
5146             }
5147           else inescq = TRUE;
5148           continue;
5149           }
5150         else if (escape == ESC_E) continue;  /* Ignore orphan \E */
5151 
5152         else
5153           {
5154           register const pcre_uint8 *cbits = cd->cbits;
5155           /* Every class contains at least two < 256 characters. */
5156           class_has_8bitchar++;
5157           /* Every class contains at least two characters. */
5158           class_one_char += 2;
5159 
5160           switch (escape)
5161             {
5162 #ifdef SUPPORT_UCP
5163             case ESC_du:     /* These are the values given for \d etc */
5164             case ESC_DU:     /* when PCRE_UCP is set. We replace the */
5165             case ESC_wu:     /* escape sequence with an appropriate \p */
5166             case ESC_WU:     /* or \P to test Unicode properties instead */
5167             case ESC_su:     /* of the default ASCII testing. */
5168             case ESC_SU:
5169             nestptr = ptr;
5170             ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
5171             class_has_8bitchar--;                /* Undo! */
5172             continue;
5173 #endif
5174             case ESC_d:
5175             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
5176             continue;
5177 
5178             case ESC_D:
5179             should_flip_negation = TRUE;
5180             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
5181             continue;
5182 
5183             case ESC_w:
5184             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
5185             continue;
5186 
5187             case ESC_W:
5188             should_flip_negation = TRUE;
5189             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
5190             continue;
5191 
5192             /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5193             5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5194             previously set by something earlier in the character class.
5195             Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5196             we could just adjust the appropriate bit. From PCRE 8.34 we no
5197             longer treat \s and \S specially. */
5198 
5199             case ESC_s:
5200             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
5201             continue;
5202 
5203             case ESC_S:
5204             should_flip_negation = TRUE;
5205             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
5206             continue;
5207 
5208             /* The rest apply in both UCP and non-UCP cases. */
5209 
5210             case ESC_h:
5211             (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5212               PRIV(hspace_list), NOTACHAR);
5213             continue;
5214 
5215             case ESC_H:
5216             (void)add_not_list_to_class(classbits, &class_uchardata, options,
5217               cd, PRIV(hspace_list));
5218             continue;
5219 
5220             case ESC_v:
5221             (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5222               PRIV(vspace_list), NOTACHAR);
5223             continue;
5224 
5225             case ESC_V:
5226             (void)add_not_list_to_class(classbits, &class_uchardata, options,
5227               cd, PRIV(vspace_list));
5228             continue;
5229 
5230             case ESC_p:
5231             case ESC_P:
5232 #ifdef SUPPORT_UCP
5233               {
5234               BOOL negated;
5235               unsigned int ptype = 0, pdata = 0;
5236               if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
5237                 goto FAILED;
5238               *class_uchardata++ = ((escape == ESC_p) != negated)?
5239                 XCL_PROP : XCL_NOTPROP;
5240               *class_uchardata++ = ptype;
5241               *class_uchardata++ = pdata;
5242               xclass_has_prop = TRUE;
5243               class_has_8bitchar--;                /* Undo! */
5244               continue;
5245               }
5246 #else
5247             *errorcodeptr = ERR45;
5248             goto FAILED;
5249 #endif
5250             /* Unrecognized escapes are faulted if PCRE is running in its
5251             strict mode. By default, for compatibility with Perl, they are
5252             treated as literals. */
5253 
5254             default:
5255             if ((options & PCRE_EXTRA) != 0)
5256               {
5257               *errorcodeptr = ERR7;
5258               goto FAILED;
5259               }
5260             class_has_8bitchar--;    /* Undo the speculative increase. */
5261             class_one_char -= 2;     /* Undo the speculative increase. */
5262             c = *ptr;                /* Get the final character and fall through */
5263             break;
5264             }
5265           }
5266 
5267         /* Fall through if the escape just defined a single character (c >= 0).
5268         This may be greater than 256. */
5269 
5270         escape = 0;
5271 
5272         }   /* End of backslash handling */
5273 
5274       /* A character may be followed by '-' to form a range. However, Perl does
5275       not permit ']' to be the end of the range. A '-' character at the end is
5276       treated as a literal. Perl ignores orphaned \E sequences entirely. The
5277       code for handling \Q and \E is messy. */
5278 
5279       CHECK_RANGE:
5280       while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5281         {
5282         inescq = FALSE;
5283         ptr += 2;
5284         }
5285       oldptr = ptr;
5286 
5287       /* Remember if \r or \n were explicitly used */
5288 
5289       if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5290 
5291       /* Check for range */
5292 
5293       if (!inescq && ptr[1] == CHAR_MINUS)
5294         {
5295         pcre_uint32 d;
5296         ptr += 2;
5297         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
5298 
5299         /* If we hit \Q (not followed by \E) at this point, go into escaped
5300         mode. */
5301 
5302         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
5303           {
5304           ptr += 2;
5305           if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
5306             { ptr += 2; continue; }
5307           inescq = TRUE;
5308           break;
5309           }
5310 
5311         /* Minus (hyphen) at the end of a class is treated as a literal, so put
5312         back the pointer and jump to handle the character that preceded it. */
5313 
5314         if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
5315           {
5316           ptr = oldptr;
5317           goto CLASS_SINGLE_CHARACTER;
5318           }
5319 
5320         /* Otherwise, we have a potential range; pick up the next character */
5321 
5322 #ifdef SUPPORT_UTF
5323         if (utf)
5324           {                           /* Braces are required because the */
5325           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
5326           }
5327         else
5328 #endif
5329         d = *ptr;  /* Not UTF-8 mode */
5330 
5331         /* The second part of a range can be a single-character escape
5332         sequence, but not any of the other escapes. Perl treats a hyphen as a
5333         literal in such circumstances. However, in Perl's warning mode, a
5334         warning is given, so PCRE now faults it as it is almost certainly a
5335         mistake on the user's part. */
5336 
5337         if (!inescq)
5338           {
5339           if (d == CHAR_BACKSLASH)
5340             {
5341             int descape;
5342             descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
5343             if (*errorcodeptr != 0) goto FAILED;
5344 
5345             /* 0 means a character was put into d; \b is backspace; any other
5346             special causes an error. */
5347 
5348             if (descape != 0)
5349               {
5350               if (descape == ESC_b) d = CHAR_BS; else
5351                 {
5352                 *errorcodeptr = ERR83;
5353                 goto FAILED;
5354                 }
5355               }
5356             }
5357 
5358           /* A hyphen followed by a POSIX class is treated in the same way. */
5359 
5360           else if (d == CHAR_LEFT_SQUARE_BRACKET &&
5361                    (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
5362                     ptr[1] == CHAR_EQUALS_SIGN) &&
5363                    check_posix_syntax(ptr, &tempptr))
5364             {
5365             *errorcodeptr = ERR83;
5366             goto FAILED;
5367             }
5368           }
5369 
5370         /* Check that the two values are in the correct order. Optimize
5371         one-character ranges. */
5372 
5373         if (d < c)
5374           {
5375           *errorcodeptr = ERR8;
5376           goto FAILED;
5377           }
5378         if (d == c) goto CLASS_SINGLE_CHARACTER;  /* A few lines below */
5379 
5380         /* We have found a character range, so single character optimizations
5381         cannot be done anymore. Any value greater than 1 indicates that there
5382         is more than one character. */
5383 
5384         class_one_char = 2;
5385 
5386         /* Remember an explicit \r or \n, and add the range to the class. */
5387 
5388         if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5389 
5390         class_has_8bitchar +=
5391           add_to_class(classbits, &class_uchardata, options, cd, c, d);
5392 
5393         continue;   /* Go get the next char in the class */
5394         }
5395 
5396       /* Handle a single character - we can get here for a normal non-escape
5397       char, or after \ that introduces a single character or for an apparent
5398       range that isn't. Only the value 1 matters for class_one_char, so don't
5399       increase it if it is already 2 or more ... just in case there's a class
5400       with a zillion characters in it. */
5401 
5402       CLASS_SINGLE_CHARACTER:
5403       if (class_one_char < 2) class_one_char++;
5404 
5405       /* If xclass_has_prop is false and class_one_char is 1, we have the first
5406       single character in the class, and there have been no prior ranges, or
5407       XCLASS items generated by escapes. If this is the final character in the
5408       class, we can optimize by turning the item into a 1-character OP_CHAR[I]
5409       if it's positive, or OP_NOT[I] if it's negative. In the positive case, it
5410       can cause firstchar to be set. Otherwise, there can be no first char if
5411       this item is first, whatever repeat count may follow. In the case of
5412       reqchar, save the previous value for reinstating. */
5413 
5414       if (!inescq &&
5415 #ifdef SUPPORT_UCP
5416           !xclass_has_prop &&
5417 #endif
5418           class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
5419         {
5420         ptr++;
5421         zeroreqchar = reqchar;
5422         zeroreqcharflags = reqcharflags;
5423 
5424         if (negate_class)
5425           {
5426 #ifdef SUPPORT_UCP
5427           int d;
5428 #endif
5429           if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5430           zerofirstchar = firstchar;
5431           zerofirstcharflags = firstcharflags;
5432 
5433           /* For caseless UTF-8 mode when UCP support is available, check
5434           whether this character has more than one other case. If so, generate
5435           a special OP_NOTPROP item instead of OP_NOTI. */
5436 
5437 #ifdef SUPPORT_UCP
5438           if (utf && (options & PCRE_CASELESS) != 0 &&
5439               (d = UCD_CASESET(c)) != 0)
5440             {
5441             *code++ = OP_NOTPROP;
5442             *code++ = PT_CLIST;
5443             *code++ = d;
5444             }
5445           else
5446 #endif
5447           /* Char has only one other case, or UCP not available */
5448 
5449             {
5450             *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
5451 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5452             if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5453               code += PRIV(ord2utf)(c, code);
5454             else
5455 #endif
5456               *code++ = c;
5457             }
5458 
5459           /* We are finished with this character class */
5460 
5461           goto END_CLASS;
5462           }
5463 
5464         /* For a single, positive character, get the value into mcbuffer, and
5465         then we can handle this with the normal one-character code. */
5466 
5467 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5468         if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5469           mclength = PRIV(ord2utf)(c, mcbuffer);
5470         else
5471 #endif
5472           {
5473           mcbuffer[0] = c;
5474           mclength = 1;
5475           }
5476         goto ONE_CHAR;
5477         }       /* End of 1-char optimization */
5478 
5479       /* There is more than one character in the class, or an XCLASS item
5480       has been generated. Add this character to the class. */
5481 
5482       class_has_8bitchar +=
5483         add_to_class(classbits, &class_uchardata, options, cd, c, c);
5484       }
5485 
5486     /* Loop until ']' reached. This "while" is the end of the "do" far above.
5487     If we are at the end of an internal nested string, revert to the outer
5488     string. */
5489 
5490     while (((c = *(++ptr)) != CHAR_NULL ||
5491            (nestptr != NULL &&
5492              (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != CHAR_NULL)) &&
5493            (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
5494 
5495     /* Check for missing terminating ']' */
5496 
5497     if (c == CHAR_NULL)
5498       {
5499       *errorcodeptr = ERR6;
5500       goto FAILED;
5501       }
5502 
5503     /* We will need an XCLASS if data has been placed in class_uchardata. In
5504     the second phase this is a sufficient test. However, in the pre-compile
5505     phase, class_uchardata gets emptied to prevent workspace overflow, so
5506     only if the very last character in the class needs XCLASS will it contain
5507     anything at this point. For this reason, xclass gets set TRUE above when
5508     class_uchardata is emptied, and that's why this code is the way it is here
5509     instead of just doing a test on class_uchardata below. */
5510 
5511 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5512     if (class_uchardata > class_uchardata_base) xclass = TRUE;
5513 #endif
5514 
5515     /* If this is the first thing in the branch, there can be no first char
5516     setting, whatever the repeat count. Any reqchar setting must remain
5517     unchanged after any kind of repeat. */
5518 
5519     if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5520     zerofirstchar = firstchar;
5521     zerofirstcharflags = firstcharflags;
5522     zeroreqchar = reqchar;
5523     zeroreqcharflags = reqcharflags;
5524 
5525     /* If there are characters with values > 255, we have to compile an
5526     extended class, with its own opcode, unless there was a negated special
5527     such as \S in the class, and PCRE_UCP is not set, because in that case all
5528     characters > 255 are in the class, so any that were explicitly given as
5529     well can be ignored. If (when there are explicit characters > 255 that must
5530     be listed) there are no characters < 256, we can omit the bitmap in the
5531     actual compiled code. */
5532 
5533 #ifdef SUPPORT_UTF
5534     if (xclass && (xclass_has_prop || !should_flip_negation ||
5535         (options & PCRE_UCP) != 0))
5536 #elif !defined COMPILE_PCRE8
5537     if (xclass && (xclass_has_prop || !should_flip_negation))
5538 #endif
5539 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5540       {
5541       *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
5542       *code++ = OP_XCLASS;
5543       code += LINK_SIZE;
5544       *code = negate_class? XCL_NOT:0;
5545       if (xclass_has_prop) *code |= XCL_HASPROP;
5546 
5547       /* If the map is required, move up the extra data to make room for it;
5548       otherwise just move the code pointer to the end of the extra data. */
5549 
5550       if (class_has_8bitchar > 0)
5551         {
5552         *code++ |= XCL_MAP;
5553         memmove(code + (32 / sizeof(pcre_uchar)), code,
5554           IN_UCHARS(class_uchardata - code));
5555         if (negate_class && !xclass_has_prop)
5556           for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5557         memcpy(code, classbits, 32);
5558         code = class_uchardata + (32 / sizeof(pcre_uchar));
5559         }
5560       else code = class_uchardata;
5561 
5562       /* Now fill in the complete length of the item */
5563 
5564       PUT(previous, 1, (int)(code - previous));
5565       break;   /* End of class handling */
5566       }
5567 
5568     /* Even though any XCLASS list is now discarded, we must allow for
5569     its memory. */
5570 
5571     if (lengthptr != NULL)
5572       *lengthptr += (int)(class_uchardata - class_uchardata_base);
5573 #endif
5574 
5575     /* If there are no characters > 255, or they are all to be included or
5576     excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
5577     whole class was negated and whether there were negative specials such as \S
5578     (non-UCP) in the class. Then copy the 32-byte map into the code vector,
5579     negating it if necessary. */
5580 
5581     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
5582     if (lengthptr == NULL)    /* Save time in the pre-compile phase */
5583       {
5584       if (negate_class)
5585         for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5586       memcpy(code, classbits, 32);
5587       }
5588     code += 32 / sizeof(pcre_uchar);
5589 
5590     END_CLASS:
5591     break;
5592 
5593 
5594     /* ===================================================================*/
5595     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
5596     has been tested above. */
5597 
5598     case CHAR_LEFT_CURLY_BRACKET:
5599     if (!is_quantifier) goto NORMAL_CHAR;
5600     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
5601     if (*errorcodeptr != 0) goto FAILED;
5602     goto REPEAT;
5603 
5604     case CHAR_ASTERISK:
5605     repeat_min = 0;
5606     repeat_max = -1;
5607     goto REPEAT;
5608 
5609     case CHAR_PLUS:
5610     repeat_min = 1;
5611     repeat_max = -1;
5612     goto REPEAT;
5613 
5614     case CHAR_QUESTION_MARK:
5615     repeat_min = 0;
5616     repeat_max = 1;
5617 
5618     REPEAT:
5619     if (previous == NULL)
5620       {
5621       *errorcodeptr = ERR9;
5622       goto FAILED;
5623       }
5624 
5625     if (repeat_min == 0)
5626       {
5627       firstchar = zerofirstchar;    /* Adjust for zero repeat */
5628       firstcharflags = zerofirstcharflags;
5629       reqchar = zeroreqchar;        /* Ditto */
5630       reqcharflags = zeroreqcharflags;
5631       }
5632 
5633     /* Remember whether this is a variable length repeat */
5634 
5635     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
5636 
5637     op_type = 0;                    /* Default single-char op codes */
5638     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
5639 
5640     /* Save start of previous item, in case we have to move it up in order to
5641     insert something before it. */
5642 
5643     tempcode = previous;
5644 
5645     /* Before checking for a possessive quantifier, we must skip over
5646     whitespace and comments in extended mode because Perl allows white space at
5647     this point. */
5648 
5649     if ((options & PCRE_EXTENDED) != 0)
5650       {
5651       const pcre_uchar *p = ptr + 1;
5652       for (;;)
5653         {
5654         while (MAX_255(*p) && (cd->ctypes[*p] & ctype_space) != 0) p++;
5655         if (*p != CHAR_NUMBER_SIGN) break;
5656         p++;
5657         while (*p != CHAR_NULL)
5658           {
5659           if (IS_NEWLINE(p))         /* For non-fixed-length newline cases, */
5660             {                        /* IS_NEWLINE sets cd->nllen. */
5661             p += cd->nllen;
5662             break;
5663             }
5664           p++;
5665 #ifdef SUPPORT_UTF
5666           if (utf) FORWARDCHAR(p);
5667 #endif
5668           }           /* Loop for comment characters */
5669         }             /* Loop for multiple comments */
5670       ptr = p - 1;    /* Character before the next significant one. */
5671       }
5672 
5673     /* If the next character is '+', we have a possessive quantifier. This
5674     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
5675     If the next character is '?' this is a minimizing repeat, by default,
5676     but if PCRE_UNGREEDY is set, it works the other way round. We change the
5677     repeat type to the non-default. */
5678 
5679     if (ptr[1] == CHAR_PLUS)
5680       {
5681       repeat_type = 0;                  /* Force greedy */
5682       possessive_quantifier = TRUE;
5683       ptr++;
5684       }
5685     else if (ptr[1] == CHAR_QUESTION_MARK)
5686       {
5687       repeat_type = greedy_non_default;
5688       ptr++;
5689       }
5690     else repeat_type = greedy_default;
5691 
5692     /* If previous was a recursion call, wrap it in atomic brackets so that
5693     previous becomes the atomic group. All recursions were so wrapped in the
5694     past, but it no longer happens for non-repeated recursions. In fact, the
5695     repeated ones could be re-implemented independently so as not to need this,
5696     but for the moment we rely on the code for repeating groups. */
5697 
5698     if (*previous == OP_RECURSE)
5699       {
5700       memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
5701       *previous = OP_ONCE;
5702       PUT(previous, 1, 2 + 2*LINK_SIZE);
5703       previous[2 + 2*LINK_SIZE] = OP_KET;
5704       PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
5705       code += 2 + 2 * LINK_SIZE;
5706       length_prevgroup = 3 + 3*LINK_SIZE;
5707 
5708       /* When actually compiling, we need to check whether this was a forward
5709       reference, and if so, adjust the offset. */
5710 
5711       if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
5712         {
5713         int offset = GET(cd->hwm, -LINK_SIZE);
5714         if (offset == previous + 1 - cd->start_code)
5715           PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
5716         }
5717       }
5718 
5719     /* Now handle repetition for the different types of item. */
5720 
5721     /* If previous was a character or negated character match, abolish the item
5722     and generate a repeat item instead. If a char item has a minimum of more
5723     than one, ensure that it is set in reqchar - it might not be if a sequence
5724     such as x{3} is the first thing in a branch because the x will have gone
5725     into firstchar instead.  */
5726 
5727     if (*previous == OP_CHAR || *previous == OP_CHARI
5728         || *previous == OP_NOT || *previous == OP_NOTI)
5729       {
5730       switch (*previous)
5731         {
5732         default: /* Make compiler happy. */
5733         case OP_CHAR:  op_type = OP_STAR - OP_STAR; break;
5734         case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
5735         case OP_NOT:   op_type = OP_NOTSTAR - OP_STAR; break;
5736         case OP_NOTI:  op_type = OP_NOTSTARI - OP_STAR; break;
5737         }
5738 
5739       /* Deal with UTF characters that take up more than one character. It's
5740       easier to write this out separately than try to macrify it. Use c to
5741       hold the length of the character in bytes, plus UTF_LENGTH to flag that
5742       it's a length rather than a small character. */
5743 
5744 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5745       if (utf && NOT_FIRSTCHAR(code[-1]))
5746         {
5747         pcre_uchar *lastchar = code - 1;
5748         BACKCHAR(lastchar);
5749         c = (int)(code - lastchar);     /* Length of UTF-8 character */
5750         memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
5751         c |= UTF_LENGTH;                /* Flag c as a length */
5752         }
5753       else
5754 #endif /* SUPPORT_UTF */
5755 
5756       /* Handle the case of a single character - either with no UTF support, or
5757       with UTF disabled, or for a single character UTF character. */
5758         {
5759         c = code[-1];
5760         if (*previous <= OP_CHARI && repeat_min > 1)
5761           {
5762           reqchar = c;
5763           reqcharflags = req_caseopt | cd->req_varyopt;
5764           }
5765         }
5766 
5767       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
5768       }
5769 
5770     /* If previous was a character type match (\d or similar), abolish it and
5771     create a suitable repeat item. The code is shared with single-character
5772     repeats by setting op_type to add a suitable offset into repeat_type. Note
5773     that the Unicode property types will be present only when SUPPORT_UCP is
5774     defined, but we don't wrap the little bits of code here because it just
5775     makes it horribly messy. */
5776 
5777     else if (*previous < OP_EODN)
5778       {
5779       pcre_uchar *oldcode;
5780       int prop_type, prop_value;
5781       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
5782       c = *previous;
5783 
5784       OUTPUT_SINGLE_REPEAT:
5785       if (*previous == OP_PROP || *previous == OP_NOTPROP)
5786         {
5787         prop_type = previous[1];
5788         prop_value = previous[2];
5789         }
5790       else prop_type = prop_value = -1;
5791 
5792       oldcode = code;
5793       code = previous;                  /* Usually overwrite previous item */
5794 
5795       /* If the maximum is zero then the minimum must also be zero; Perl allows
5796       this case, so we do too - by simply omitting the item altogether. */
5797 
5798       if (repeat_max == 0) goto END_REPEAT;
5799 
5800       /* Combine the op_type with the repeat_type */
5801 
5802       repeat_type += op_type;
5803 
5804       /* A minimum of zero is handled either as the special case * or ?, or as
5805       an UPTO, with the maximum given. */
5806 
5807       if (repeat_min == 0)
5808         {
5809         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
5810           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
5811         else
5812           {
5813           *code++ = OP_UPTO + repeat_type;
5814           PUT2INC(code, 0, repeat_max);
5815           }
5816         }
5817 
5818       /* A repeat minimum of 1 is optimized into some special cases. If the
5819       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
5820       left in place and, if the maximum is greater than 1, we use OP_UPTO with
5821       one less than the maximum. */
5822 
5823       else if (repeat_min == 1)
5824         {
5825         if (repeat_max == -1)
5826           *code++ = OP_PLUS + repeat_type;
5827         else
5828           {
5829           code = oldcode;                 /* leave previous item in place */
5830           if (repeat_max == 1) goto END_REPEAT;
5831           *code++ = OP_UPTO + repeat_type;
5832           PUT2INC(code, 0, repeat_max - 1);
5833           }
5834         }
5835 
5836       /* The case {n,n} is just an EXACT, while the general case {n,m} is
5837       handled as an EXACT followed by an UPTO. */
5838 
5839       else
5840         {
5841         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
5842         PUT2INC(code, 0, repeat_min);
5843 
5844         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
5845         we have to insert the character for the previous code. For a repeated
5846         Unicode property match, there are two extra bytes that define the
5847         required property. In UTF-8 mode, long characters have their length in
5848         c, with the UTF_LENGTH bit as a flag. */
5849 
5850         if (repeat_max < 0)
5851           {
5852 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5853           if (utf && (c & UTF_LENGTH) != 0)
5854             {
5855             memcpy(code, utf_chars, IN_UCHARS(c & 7));
5856             code += c & 7;
5857             }
5858           else
5859 #endif
5860             {
5861             *code++ = c;
5862             if (prop_type >= 0)
5863               {
5864               *code++ = prop_type;
5865               *code++ = prop_value;
5866               }
5867             }
5868           *code++ = OP_STAR + repeat_type;
5869           }
5870 
5871         /* Else insert an UPTO if the max is greater than the min, again
5872         preceded by the character, for the previously inserted code. If the
5873         UPTO is just for 1 instance, we can use QUERY instead. */
5874 
5875         else if (repeat_max != repeat_min)
5876           {
5877 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5878           if (utf && (c & UTF_LENGTH) != 0)
5879             {
5880             memcpy(code, utf_chars, IN_UCHARS(c & 7));
5881             code += c & 7;
5882             }
5883           else
5884 #endif
5885           *code++ = c;
5886           if (prop_type >= 0)
5887             {
5888             *code++ = prop_type;
5889             *code++ = prop_value;
5890             }
5891           repeat_max -= repeat_min;
5892 
5893           if (repeat_max == 1)
5894             {
5895             *code++ = OP_QUERY + repeat_type;
5896             }
5897           else
5898             {
5899             *code++ = OP_UPTO + repeat_type;
5900             PUT2INC(code, 0, repeat_max);
5901             }
5902           }
5903         }
5904 
5905       /* The character or character type itself comes last in all cases. */
5906 
5907 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5908       if (utf && (c & UTF_LENGTH) != 0)
5909         {
5910         memcpy(code, utf_chars, IN_UCHARS(c & 7));
5911         code += c & 7;
5912         }
5913       else
5914 #endif
5915       *code++ = c;
5916 
5917       /* For a repeated Unicode property match, there are two extra bytes that
5918       define the required property. */
5919 
5920 #ifdef SUPPORT_UCP
5921       if (prop_type >= 0)
5922         {
5923         *code++ = prop_type;
5924         *code++ = prop_value;
5925         }
5926 #endif
5927       }
5928 
5929     /* If previous was a character class or a back reference, we put the repeat
5930     stuff after it, but just skip the item if the repeat was {0,0}. */
5931 
5932     else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
5933 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5934              *previous == OP_XCLASS ||
5935 #endif
5936              *previous == OP_REF   || *previous == OP_REFI ||
5937              *previous == OP_DNREF || *previous == OP_DNREFI)
5938       {
5939       if (repeat_max == 0)
5940         {
5941         code = previous;
5942         goto END_REPEAT;
5943         }
5944 
5945       if (repeat_min == 0 && repeat_max == -1)
5946         *code++ = OP_CRSTAR + repeat_type;
5947       else if (repeat_min == 1 && repeat_max == -1)
5948         *code++ = OP_CRPLUS + repeat_type;
5949       else if (repeat_min == 0 && repeat_max == 1)
5950         *code++ = OP_CRQUERY + repeat_type;
5951       else
5952         {
5953         *code++ = OP_CRRANGE + repeat_type;
5954         PUT2INC(code, 0, repeat_min);
5955         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
5956         PUT2INC(code, 0, repeat_max);
5957         }
5958       }
5959 
5960     /* If previous was a bracket group, we may have to replicate it in certain
5961     cases. Note that at this point we can encounter only the "basic" bracket
5962     opcodes such as BRA and CBRA, as this is the place where they get converted
5963     into the more special varieties such as BRAPOS and SBRA. A test for >=
5964     OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
5965     ASSERTBACK_NOT, ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND.
5966     Originally, PCRE did not allow repetition of assertions, but now it does,
5967     for Perl compatibility. */
5968 
5969     else if (*previous >= OP_ASSERT && *previous <= OP_COND)
5970       {
5971       register int i;
5972       int len = (int)(code - previous);
5973       size_t base_hwm_offset = item_hwm_offset;
5974       pcre_uchar *bralink = NULL;
5975       pcre_uchar *brazeroptr = NULL;
5976 
5977       /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
5978       we just ignore the repeat. */
5979 
5980       if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
5981         goto END_REPEAT;
5982 
5983       /* There is no sense in actually repeating assertions. The only potential
5984       use of repetition is in cases when the assertion is optional. Therefore,
5985       if the minimum is greater than zero, just ignore the repeat. If the
5986       maximum is not zero or one, set it to 1. */
5987 
5988       if (*previous < OP_ONCE)    /* Assertion */
5989         {
5990         if (repeat_min > 0) goto END_REPEAT;
5991         if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
5992         }
5993 
5994       /* The case of a zero minimum is special because of the need to stick
5995       OP_BRAZERO in front of it, and because the group appears once in the
5996       data, whereas in other cases it appears the minimum number of times. For
5997       this reason, it is simplest to treat this case separately, as otherwise
5998       the code gets far too messy. There are several special subcases when the
5999       minimum is zero. */
6000 
6001       if (repeat_min == 0)
6002         {
6003         /* If the maximum is also zero, we used to just omit the group from the
6004         output altogether, like this:
6005 
6006         ** if (repeat_max == 0)
6007         **   {
6008         **   code = previous;
6009         **   goto END_REPEAT;
6010         **   }
6011 
6012         However, that fails when a group or a subgroup within it is referenced
6013         as a subroutine from elsewhere in the pattern, so now we stick in
6014         OP_SKIPZERO in front of it so that it is skipped on execution. As we
6015         don't have a list of which groups are referenced, we cannot do this
6016         selectively.
6017 
6018         If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
6019         and do no more at this point. However, we do need to adjust any
6020         OP_RECURSE calls inside the group that refer to the group itself or any
6021         internal or forward referenced group, because the offset is from the
6022         start of the whole regex. Temporarily terminate the pattern while doing
6023         this. */
6024 
6025         if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
6026           {
6027           *code = OP_END;
6028           adjust_recurse(previous, 1, utf, cd, item_hwm_offset);
6029           memmove(previous + 1, previous, IN_UCHARS(len));
6030           code++;
6031           if (repeat_max == 0)
6032             {
6033             *previous++ = OP_SKIPZERO;
6034             goto END_REPEAT;
6035             }
6036           brazeroptr = previous;    /* Save for possessive optimizing */
6037           *previous++ = OP_BRAZERO + repeat_type;
6038           }
6039 
6040         /* If the maximum is greater than 1 and limited, we have to replicate
6041         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
6042         The first one has to be handled carefully because it's the original
6043         copy, which has to be moved up. The remainder can be handled by code
6044         that is common with the non-zero minimum case below. We have to
6045           adjust the value of repeat_max, since one less copy is required. Once
6046         again, we may have to adjust any OP_RECURSE calls inside the group. */
6047 
6048         else
6049           {
6050           int offset;
6051           *code = OP_END;
6052           adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, item_hwm_offset);
6053           memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
6054           code += 2 + LINK_SIZE;
6055           *previous++ = OP_BRAZERO + repeat_type;
6056           *previous++ = OP_BRA;
6057 
6058           /* We chain together the bracket offset fields that have to be
6059           filled in later when the ends of the brackets are reached. */
6060 
6061           offset = (bralink == NULL)? 0 : (int)(previous - bralink);
6062           bralink = previous;
6063           PUTINC(previous, 0, offset);
6064           }
6065 
6066         repeat_max--;
6067         }
6068 
6069       /* If the minimum is greater than zero, replicate the group as many
6070       times as necessary, and adjust the maximum to the number of subsequent
6071       copies that we need. If we set a first char from the group, and didn't
6072       set a required char, copy the latter from the former. If there are any
6073       forward reference subroutine calls in the group, there will be entries on
6074       the workspace list; replicate these with an appropriate increment. */
6075 
6076       else
6077         {
6078         if (repeat_min > 1)
6079           {
6080           /* In the pre-compile phase, we don't actually do the replication. We
6081           just adjust the length as if we had. Do some paranoid checks for
6082           potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
6083           integer type when available, otherwise double. */
6084 
6085           if (lengthptr != NULL)
6086             {
6087             int delta = (repeat_min - 1)*length_prevgroup;
6088             if ((INT64_OR_DOUBLE)(repeat_min - 1)*
6089                   (INT64_OR_DOUBLE)length_prevgroup >
6090                     (INT64_OR_DOUBLE)INT_MAX ||
6091                 OFLOW_MAX - *lengthptr < delta)
6092               {
6093               *errorcodeptr = ERR20;
6094               goto FAILED;
6095               }
6096             *lengthptr += delta;
6097             }
6098 
6099           /* This is compiling for real. If there is a set first byte for
6100           the group, and we have not yet set a "required byte", set it. Make
6101           sure there is enough workspace for copying forward references before
6102           doing the copy. */
6103 
6104           else
6105             {
6106             if (groupsetfirstchar && reqcharflags < 0)
6107               {
6108               reqchar = firstchar;
6109               reqcharflags = firstcharflags;
6110               }
6111 
6112             for (i = 1; i < repeat_min; i++)
6113               {
6114               pcre_uchar *hc;
6115               size_t this_hwm_offset = cd->hwm - cd->start_workspace;
6116               memcpy(code, previous, IN_UCHARS(len));
6117 
6118               while (cd->hwm > cd->start_workspace + cd->workspace_size -
6119                      WORK_SIZE_SAFETY_MARGIN -
6120                      (this_hwm_offset - base_hwm_offset))
6121                 {
6122                 *errorcodeptr = expand_workspace(cd);
6123                 if (*errorcodeptr != 0) goto FAILED;
6124                 }
6125 
6126               for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
6127                    hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
6128                    hc += LINK_SIZE)
6129                 {
6130                 PUT(cd->hwm, 0, GET(hc, 0) + len);
6131                 cd->hwm += LINK_SIZE;
6132                 }
6133               base_hwm_offset = this_hwm_offset;
6134               code += len;
6135               }
6136             }
6137           }
6138 
6139         if (repeat_max > 0) repeat_max -= repeat_min;
6140         }
6141 
6142       /* This code is common to both the zero and non-zero minimum cases. If
6143       the maximum is limited, it replicates the group in a nested fashion,
6144       remembering the bracket starts on a stack. In the case of a zero minimum,
6145       the first one was set up above. In all cases the repeat_max now specifies
6146       the number of additional copies needed. Again, we must remember to
6147       replicate entries on the forward reference list. */
6148 
6149       if (repeat_max >= 0)
6150         {
6151         /* In the pre-compile phase, we don't actually do the replication. We
6152         just adjust the length as if we had. For each repetition we must add 1
6153         to the length for BRAZERO and for all but the last repetition we must
6154         add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
6155         paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
6156         a 64-bit integer type when available, otherwise double. */
6157 
6158         if (lengthptr != NULL && repeat_max > 0)
6159           {
6160           int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
6161                       2 - 2*LINK_SIZE;   /* Last one doesn't nest */
6162           if ((INT64_OR_DOUBLE)repeat_max *
6163                 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
6164                   > (INT64_OR_DOUBLE)INT_MAX ||
6165               OFLOW_MAX - *lengthptr < delta)
6166             {
6167             *errorcodeptr = ERR20;
6168             goto FAILED;
6169             }
6170           *lengthptr += delta;
6171           }
6172 
6173         /* This is compiling for real */
6174 
6175         else for (i = repeat_max - 1; i >= 0; i--)
6176           {
6177           pcre_uchar *hc;
6178           size_t this_hwm_offset = cd->hwm - cd->start_workspace;
6179 
6180           *code++ = OP_BRAZERO + repeat_type;
6181 
6182           /* All but the final copy start a new nesting, maintaining the
6183           chain of brackets outstanding. */
6184 
6185           if (i != 0)
6186             {
6187             int offset;
6188             *code++ = OP_BRA;
6189             offset = (bralink == NULL)? 0 : (int)(code - bralink);
6190             bralink = code;
6191             PUTINC(code, 0, offset);
6192             }
6193 
6194           memcpy(code, previous, IN_UCHARS(len));
6195 
6196           /* Ensure there is enough workspace for forward references before
6197           copying them. */
6198 
6199           while (cd->hwm > cd->start_workspace + cd->workspace_size -
6200                  WORK_SIZE_SAFETY_MARGIN -
6201                  (this_hwm_offset - base_hwm_offset))
6202             {
6203             *errorcodeptr = expand_workspace(cd);
6204             if (*errorcodeptr != 0) goto FAILED;
6205             }
6206 
6207           for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
6208                hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
6209                hc += LINK_SIZE)
6210             {
6211             PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
6212             cd->hwm += LINK_SIZE;
6213             }
6214           base_hwm_offset = this_hwm_offset;
6215           code += len;
6216           }
6217 
6218         /* Now chain through the pending brackets, and fill in their length
6219         fields (which are holding the chain links pro tem). */
6220 
6221         while (bralink != NULL)
6222           {
6223           int oldlinkoffset;
6224           int offset = (int)(code - bralink + 1);
6225           pcre_uchar *bra = code - offset;
6226           oldlinkoffset = GET(bra, 1);
6227           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
6228           *code++ = OP_KET;
6229           PUTINC(code, 0, offset);
6230           PUT(bra, 1, offset);
6231           }
6232         }
6233 
6234       /* If the maximum is unlimited, set a repeater in the final copy. For
6235       ONCE brackets, that's all we need to do. However, possessively repeated
6236       ONCE brackets can be converted into non-capturing brackets, as the
6237       behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
6238       deal with possessive ONCEs specially.
6239 
6240       Otherwise, when we are doing the actual compile phase, check to see
6241       whether this group is one that could match an empty string. If so,
6242       convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
6243       that runtime checking can be done. [This check is also applied to ONCE
6244       groups at runtime, but in a different way.]
6245 
6246       Then, if the quantifier was possessive and the bracket is not a
6247       conditional, we convert the BRA code to the POS form, and the KET code to
6248       KETRPOS. (It turns out to be convenient at runtime to detect this kind of
6249       subpattern at both the start and at the end.) The use of special opcodes
6250       makes it possible to reduce greatly the stack usage in pcre_exec(). If
6251       the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
6252 
6253       Then, if the minimum number of matches is 1 or 0, cancel the possessive
6254       flag so that the default action below, of wrapping everything inside
6255       atomic brackets, does not happen. When the minimum is greater than 1,
6256       there will be earlier copies of the group, and so we still have to wrap
6257       the whole thing. */
6258 
6259       else
6260         {
6261         pcre_uchar *ketcode = code - 1 - LINK_SIZE;
6262         pcre_uchar *bracode = ketcode - GET(ketcode, 1);
6263 
6264         /* Convert possessive ONCE brackets to non-capturing */
6265 
6266         if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
6267             possessive_quantifier) *bracode = OP_BRA;
6268 
6269         /* For non-possessive ONCE brackets, all we need to do is to
6270         set the KET. */
6271 
6272         if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
6273           *ketcode = OP_KETRMAX + repeat_type;
6274 
6275         /* Handle non-ONCE brackets and possessive ONCEs (which have been
6276         converted to non-capturing above). */
6277 
6278         else
6279           {
6280           /* In the compile phase, check for empty string matching. */
6281 
6282           if (lengthptr == NULL)
6283             {
6284             pcre_uchar *scode = bracode;
6285             do
6286               {
6287               if (could_be_empty_branch(scode, ketcode, utf, cd, NULL))
6288                 {
6289                 *bracode += OP_SBRA - OP_BRA;
6290                 break;
6291                 }
6292               scode += GET(scode, 1);
6293               }
6294             while (*scode == OP_ALT);
6295             }
6296 
6297           /* A conditional group with only one branch has an implicit empty
6298           alternative branch. */
6299 
6300           if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
6301             *bracode = OP_SCOND;
6302 
6303           /* Handle possessive quantifiers. */
6304 
6305           if (possessive_quantifier)
6306             {
6307             /* For COND brackets, we wrap the whole thing in a possessively
6308             repeated non-capturing bracket, because we have not invented POS
6309             versions of the COND opcodes. Because we are moving code along, we
6310             must ensure that any pending recursive references are updated. */
6311 
6312             if (*bracode == OP_COND || *bracode == OP_SCOND)
6313               {
6314               int nlen = (int)(code - bracode);
6315               *code = OP_END;
6316               adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6317               memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
6318               code += 1 + LINK_SIZE;
6319               nlen += 1 + LINK_SIZE;
6320               *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
6321               *code++ = OP_KETRPOS;
6322               PUTINC(code, 0, nlen);
6323               PUT(bracode, 1, nlen);
6324               }
6325 
6326             /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
6327 
6328             else
6329               {
6330               *bracode += 1;              /* Switch to xxxPOS opcodes */
6331               *ketcode = OP_KETRPOS;
6332               }
6333 
6334             /* If the minimum is zero, mark it as possessive, then unset the
6335             possessive flag when the minimum is 0 or 1. */
6336 
6337             if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
6338             if (repeat_min < 2) possessive_quantifier = FALSE;
6339             }
6340 
6341           /* Non-possessive quantifier */
6342 
6343           else *ketcode = OP_KETRMAX + repeat_type;
6344           }
6345         }
6346       }
6347 
6348     /* If previous is OP_FAIL, it was generated by an empty class [] in
6349     JavaScript mode. The other ways in which OP_FAIL can be generated, that is
6350     by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
6351     error above. We can just ignore the repeat in the JS case. */
6352 
6353     else if (*previous == OP_FAIL) goto END_REPEAT;
6354 
6355     /* Else there's some kind of shambles */
6356 
6357     else
6358       {
6359       *errorcodeptr = ERR11;
6360       goto FAILED;
6361       }
6362 
6363     /* If the character following a repeat is '+', possessive_quantifier is
6364     TRUE. For some opcodes, there are special alternative opcodes for this
6365     case. For anything else, we wrap the entire repeated item inside OP_ONCE
6366     brackets. Logically, the '+' notation is just syntactic sugar, taken from
6367     Sun's Java package, but the special opcodes can optimize it.
6368 
6369     Some (but not all) possessively repeated subpatterns have already been
6370     completely handled in the code just above. For them, possessive_quantifier
6371     is always FALSE at this stage. Note that the repeated item starts at
6372     tempcode, not at previous, which might be the first part of a string whose
6373     (former) last char we repeated. */
6374 
6375     if (possessive_quantifier)
6376       {
6377       int len;
6378 
6379       /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
6380       However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
6381       {5,}, or {5,10}). We skip over an EXACT item; if the length of what
6382       remains is greater than zero, there's a further opcode that can be
6383       handled. If not, do nothing, leaving the EXACT alone. */
6384 
6385       switch(*tempcode)
6386         {
6387         case OP_TYPEEXACT:
6388         tempcode += PRIV(OP_lengths)[*tempcode] +
6389           ((tempcode[1 + IMM2_SIZE] == OP_PROP
6390           || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
6391         break;
6392 
6393         /* CHAR opcodes are used for exacts whose count is 1. */
6394 
6395         case OP_CHAR:
6396         case OP_CHARI:
6397         case OP_NOT:
6398         case OP_NOTI:
6399         case OP_EXACT:
6400         case OP_EXACTI:
6401         case OP_NOTEXACT:
6402         case OP_NOTEXACTI:
6403         tempcode += PRIV(OP_lengths)[*tempcode];
6404 #ifdef SUPPORT_UTF
6405         if (utf && HAS_EXTRALEN(tempcode[-1]))
6406           tempcode += GET_EXTRALEN(tempcode[-1]);
6407 #endif
6408         break;
6409 
6410         /* For the class opcodes, the repeat operator appears at the end;
6411         adjust tempcode to point to it. */
6412 
6413         case OP_CLASS:
6414         case OP_NCLASS:
6415         tempcode += 1 + 32/sizeof(pcre_uchar);
6416         break;
6417 
6418 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6419         case OP_XCLASS:
6420         tempcode += GET(tempcode, 1);
6421         break;
6422 #endif
6423         }
6424 
6425       /* If tempcode is equal to code (which points to the end of the repeated
6426       item), it means we have skipped an EXACT item but there is no following
6427       QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
6428       all other cases, tempcode will be pointing to the repeat opcode, and will
6429       be less than code, so the value of len will be greater than 0. */
6430 
6431       len = (int)(code - tempcode);
6432       if (len > 0)
6433         {
6434         unsigned int repcode = *tempcode;
6435 
6436         /* There is a table for possessifying opcodes, all of which are less
6437         than OP_CALLOUT. A zero entry means there is no possessified version.
6438         */
6439 
6440         if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
6441           *tempcode = opcode_possessify[repcode];
6442 
6443         /* For opcode without a special possessified version, wrap the item in
6444         ONCE brackets. Because we are moving code along, we must ensure that any
6445         pending recursive references are updated. */
6446 
6447         else
6448           {
6449           *code = OP_END;
6450           adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6451           memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6452           code += 1 + LINK_SIZE;
6453           len += 1 + LINK_SIZE;
6454           tempcode[0] = OP_ONCE;
6455           *code++ = OP_KET;
6456           PUTINC(code, 0, len);
6457           PUT(tempcode, 1, len);
6458           }
6459         }
6460 
6461 #ifdef NEVER
6462       if (len > 0) switch (*tempcode)
6463         {
6464         case OP_STAR:  *tempcode = OP_POSSTAR; break;
6465         case OP_PLUS:  *tempcode = OP_POSPLUS; break;
6466         case OP_QUERY: *tempcode = OP_POSQUERY; break;
6467         case OP_UPTO:  *tempcode = OP_POSUPTO; break;
6468 
6469         case OP_STARI:  *tempcode = OP_POSSTARI; break;
6470         case OP_PLUSI:  *tempcode = OP_POSPLUSI; break;
6471         case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
6472         case OP_UPTOI:  *tempcode = OP_POSUPTOI; break;
6473 
6474         case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
6475         case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
6476         case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
6477         case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
6478 
6479         case OP_NOTSTARI:  *tempcode = OP_NOTPOSSTARI; break;
6480         case OP_NOTPLUSI:  *tempcode = OP_NOTPOSPLUSI; break;
6481         case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
6482         case OP_NOTUPTOI:  *tempcode = OP_NOTPOSUPTOI; break;
6483 
6484         case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
6485         case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
6486         case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
6487         case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
6488 
6489         case OP_CRSTAR:   *tempcode = OP_CRPOSSTAR; break;
6490         case OP_CRPLUS:   *tempcode = OP_CRPOSPLUS; break;
6491         case OP_CRQUERY:  *tempcode = OP_CRPOSQUERY; break;
6492         case OP_CRRANGE:  *tempcode = OP_CRPOSRANGE; break;
6493 
6494         /* Because we are moving code along, we must ensure that any
6495         pending recursive references are updated. */
6496 
6497         default:
6498         *code = OP_END;
6499         adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6500         memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6501         code += 1 + LINK_SIZE;
6502         len += 1 + LINK_SIZE;
6503         tempcode[0] = OP_ONCE;
6504         *code++ = OP_KET;
6505         PUTINC(code, 0, len);
6506         PUT(tempcode, 1, len);
6507         break;
6508         }
6509 #endif
6510       }
6511 
6512     /* In all cases we no longer have a previous item. We also set the
6513     "follows varying string" flag for subsequently encountered reqchars if
6514     it isn't already set and we have just passed a varying length item. */
6515 
6516     END_REPEAT:
6517     previous = NULL;
6518     cd->req_varyopt |= reqvary;
6519     break;
6520 
6521 
6522     /* ===================================================================*/
6523     /* Start of nested parenthesized sub-expression, or comment or lookahead or
6524     lookbehind or option setting or condition or all the other extended
6525     parenthesis forms.  */
6526 
6527     case CHAR_LEFT_PARENTHESIS:
6528     ptr++;
6529 
6530     /* First deal with comments. Putting this code right at the start ensures
6531     that comments have no bad side effects. */
6532 
6533     if (ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
6534       {
6535       ptr += 2;
6536       while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6537       if (*ptr == CHAR_NULL)
6538         {
6539         *errorcodeptr = ERR18;
6540         goto FAILED;
6541         }
6542       continue;
6543       }
6544 
6545     /* Now deal with various "verbs" that can be introduced by '*'. */
6546 
6547     if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
6548          || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
6549       {
6550       int i, namelen;
6551       int arglen = 0;
6552       const char *vn = verbnames;
6553       const pcre_uchar *name = ptr + 1;
6554       const pcre_uchar *arg = NULL;
6555       previous = NULL;
6556       ptr++;
6557       while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
6558       namelen = (int)(ptr - name);
6559 
6560       /* It appears that Perl allows any characters whatsoever, other than
6561       a closing parenthesis, to appear in arguments, so we no longer insist on
6562       letters, digits, and underscores. */
6563 
6564       if (*ptr == CHAR_COLON)
6565         {
6566         arg = ++ptr;
6567         while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6568         arglen = (int)(ptr - arg);
6569         if ((unsigned int)arglen > MAX_MARK)
6570           {
6571           *errorcodeptr = ERR75;
6572           goto FAILED;
6573           }
6574         }
6575 
6576       if (*ptr != CHAR_RIGHT_PARENTHESIS)
6577         {
6578         *errorcodeptr = ERR60;
6579         goto FAILED;
6580         }
6581 
6582       /* Scan the table of verb names */
6583 
6584       for (i = 0; i < verbcount; i++)
6585         {
6586         if (namelen == verbs[i].len &&
6587             STRNCMP_UC_C8(name, vn, namelen) == 0)
6588           {
6589           int setverb;
6590 
6591           /* Check for open captures before ACCEPT and convert it to
6592           ASSERT_ACCEPT if in an assertion. */
6593 
6594           if (verbs[i].op == OP_ACCEPT)
6595             {
6596             open_capitem *oc;
6597             if (arglen != 0)
6598               {
6599               *errorcodeptr = ERR59;
6600               goto FAILED;
6601               }
6602             cd->had_accept = TRUE;
6603             for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6604               {
6605               *code++ = OP_CLOSE;
6606               PUT2INC(code, 0, oc->number);
6607               }
6608             setverb = *code++ =
6609               (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6610 
6611             /* Do not set firstchar after *ACCEPT */
6612             if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
6613             }
6614 
6615           /* Handle other cases with/without an argument */
6616 
6617           else if (arglen == 0)
6618             {
6619             if (verbs[i].op < 0)   /* Argument is mandatory */
6620               {
6621               *errorcodeptr = ERR66;
6622               goto FAILED;
6623               }
6624             setverb = *code++ = verbs[i].op;
6625             }
6626 
6627           else
6628             {
6629             if (verbs[i].op_arg < 0)   /* Argument is forbidden */
6630               {
6631               *errorcodeptr = ERR59;
6632               goto FAILED;
6633               }
6634             setverb = *code++ = verbs[i].op_arg;
6635             if (lengthptr != NULL)    /* In pass 1 just add in the length */
6636               {                       /* to avoid potential workspace */
6637               *lengthptr += arglen;   /* overflow. */
6638               *code++ = 0;
6639               }
6640             else
6641               {
6642               *code++ = arglen;
6643               memcpy(code, arg, IN_UCHARS(arglen));
6644               code += arglen;
6645               }
6646             *code++ = 0;
6647             }
6648 
6649           switch (setverb)
6650             {
6651             case OP_THEN:
6652             case OP_THEN_ARG:
6653             cd->external_flags |= PCRE_HASTHEN;
6654             break;
6655 
6656             case OP_PRUNE:
6657             case OP_PRUNE_ARG:
6658             case OP_SKIP:
6659             case OP_SKIP_ARG:
6660             cd->had_pruneorskip = TRUE;
6661             break;
6662             }
6663 
6664           break;  /* Found verb, exit loop */
6665           }
6666 
6667         vn += verbs[i].len + 1;
6668         }
6669 
6670       if (i < verbcount) continue;    /* Successfully handled a verb */
6671       *errorcodeptr = ERR60;          /* Verb not recognized */
6672       goto FAILED;
6673       }
6674 
6675     /* Initialize for "real" parentheses */
6676 
6677     newoptions = options;
6678     skipbytes = 0;
6679     bravalue = OP_CBRA;
6680     item_hwm_offset = cd->hwm - cd->start_workspace;
6681     reset_bracount = FALSE;
6682 
6683     /* Deal with the extended parentheses; all are introduced by '?', and the
6684     appearance of any of them means that this is not a capturing group. */
6685 
6686     if (*ptr == CHAR_QUESTION_MARK)
6687       {
6688       int i, set, unset, namelen;
6689       int *optset;
6690       const pcre_uchar *name;
6691       pcre_uchar *slot;
6692 
6693       switch (*(++ptr))
6694         {
6695         /* ------------------------------------------------------------ */
6696         case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
6697         reset_bracount = TRUE;
6698         cd->dupgroups = TRUE;     /* Record (?| encountered */
6699         /* Fall through */
6700 
6701         /* ------------------------------------------------------------ */
6702         case CHAR_COLON:          /* Non-capturing bracket */
6703         bravalue = OP_BRA;
6704         ptr++;
6705         break;
6706 
6707 
6708         /* ------------------------------------------------------------ */
6709         case CHAR_LEFT_PARENTHESIS:
6710         bravalue = OP_COND;       /* Conditional group */
6711         tempptr = ptr;
6712 
6713         /* A condition can be an assertion, a number (referring to a numbered
6714         group's having been set), a name (referring to a named group), or 'R',
6715         referring to recursion. R<digits> and R&name are also permitted for
6716         recursion tests.
6717 
6718         There are ways of testing a named group: (?(name)) is used by Python;
6719         Perl 5.10 onwards uses (?(<name>) or (?('name')).
6720 
6721         There is one unfortunate ambiguity, caused by history. 'R' can be the
6722         recursive thing or the name 'R' (and similarly for 'R' followed by
6723         digits). We look for a name first; if not found, we try the other case.
6724 
6725         For compatibility with auto-callouts, we allow a callout to be
6726         specified before a condition that is an assertion. First, check for the
6727         syntax of a callout; if found, adjust the temporary pointer that is
6728         used to check for an assertion condition. That's all that is needed! */
6729 
6730         if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C)
6731           {
6732           for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break;
6733           if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
6734             tempptr += i + 1;
6735 
6736           /* tempptr should now be pointing to the opening parenthesis of the
6737           assertion condition. */
6738 
6739           if (*tempptr != CHAR_LEFT_PARENTHESIS)
6740             {
6741             *errorcodeptr = ERR28;
6742             goto FAILED;
6743             }
6744           }
6745 
6746         /* For conditions that are assertions, check the syntax, and then exit
6747         the switch. This will take control down to where bracketed groups,
6748         including assertions, are processed. */
6749 
6750         if (tempptr[1] == CHAR_QUESTION_MARK &&
6751               (tempptr[2] == CHAR_EQUALS_SIGN ||
6752                tempptr[2] == CHAR_EXCLAMATION_MARK ||
6753                  (tempptr[2] == CHAR_LESS_THAN_SIGN &&
6754                    (tempptr[3] == CHAR_EQUALS_SIGN ||
6755                     tempptr[3] == CHAR_EXCLAMATION_MARK))))
6756           {
6757           cd->iscondassert = TRUE;
6758           break;
6759           }
6760 
6761         /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6762         need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6763 
6764         code[1+LINK_SIZE] = OP_CREF;
6765         skipbytes = 1+IMM2_SIZE;
6766         refsign = -1;     /* => not a number */
6767         namelen = -1;     /* => not a name; must set to avoid warning */
6768         name = NULL;      /* Always set to avoid warning */
6769         recno = 0;        /* Always set to avoid warning */
6770 
6771         /* Check for a test for recursion in a named group. */
6772 
6773         ptr++;
6774         if (*ptr == CHAR_R && ptr[1] == CHAR_AMPERSAND)
6775           {
6776           terminator = -1;
6777           ptr += 2;
6778           code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
6779           }
6780 
6781         /* Check for a test for a named group's having been set, using the Perl
6782         syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6783         syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). */
6784 
6785         else if (*ptr == CHAR_LESS_THAN_SIGN)
6786           {
6787           terminator = CHAR_GREATER_THAN_SIGN;
6788           ptr++;
6789           }
6790         else if (*ptr == CHAR_APOSTROPHE)
6791           {
6792           terminator = CHAR_APOSTROPHE;
6793           ptr++;
6794           }
6795         else
6796           {
6797           terminator = CHAR_NULL;
6798           if (*ptr == CHAR_MINUS || *ptr == CHAR_PLUS) refsign = *ptr++;
6799             else if (IS_DIGIT(*ptr)) refsign = 0;
6800           }
6801 
6802         /* Handle a number */
6803 
6804         if (refsign >= 0)
6805           {
6806           while (IS_DIGIT(*ptr))
6807             {
6808             if (recno > INT_MAX / 10 - 1)  /* Integer overflow */
6809               {
6810               while (IS_DIGIT(*ptr)) ptr++;
6811               *errorcodeptr = ERR61;
6812               goto FAILED;
6813               }
6814             recno = recno * 10 + (int)(*ptr - CHAR_0);
6815             ptr++;
6816             }
6817           }
6818 
6819         /* Otherwise we expect to read a name; anything else is an error. When
6820         a name is one of a number of duplicates, a different opcode is used and
6821         it needs more memory. Unfortunately we cannot tell whether a name is a
6822         duplicate in the first pass, so we have to allow for more memory. */
6823 
6824         else
6825           {
6826           if (IS_DIGIT(*ptr))
6827             {
6828             *errorcodeptr = ERR84;
6829             goto FAILED;
6830             }
6831           if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_word) == 0)
6832             {
6833             *errorcodeptr = ERR28;   /* Assertion expected */
6834             goto FAILED;
6835             }
6836           name = ptr++;
6837           while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
6838             {
6839             ptr++;
6840             }
6841           namelen = (int)(ptr - name);
6842           if (lengthptr != NULL) skipbytes += IMM2_SIZE;
6843           }
6844 
6845         /* Check the terminator */
6846 
6847         if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
6848             *ptr++ != CHAR_RIGHT_PARENTHESIS)
6849           {
6850           ptr--;                  /* Error offset */
6851           *errorcodeptr = ERR26;  /* Malformed number or name */
6852           goto FAILED;
6853           }
6854 
6855         /* Do no further checking in the pre-compile phase. */
6856 
6857         if (lengthptr != NULL) break;
6858 
6859         /* In the real compile we do the work of looking for the actual
6860         reference. If refsign is not negative, it means we have a number in
6861         recno. */
6862 
6863         if (refsign >= 0)
6864           {
6865           if (recno <= 0)
6866             {
6867             *errorcodeptr = ERR35;
6868             goto FAILED;
6869             }
6870           if (refsign != 0) recno = (refsign == CHAR_MINUS)?
6871             cd->bracount - recno + 1 : recno + cd->bracount;
6872           if (recno <= 0 || recno > cd->final_bracount)
6873             {
6874             *errorcodeptr = ERR15;
6875             goto FAILED;
6876             }
6877           PUT2(code, 2+LINK_SIZE, recno);
6878           if (recno > cd->top_backref) cd->top_backref = recno;
6879           break;
6880           }
6881 
6882         /* Otherwise look for the name. */
6883 
6884         slot = cd->name_table;
6885         for (i = 0; i < cd->names_found; i++)
6886           {
6887           if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break;
6888           slot += cd->name_entry_size;
6889           }
6890 
6891         /* Found the named subpattern. If the name is duplicated, add one to
6892         the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6893         appropriate data values. Otherwise, just insert the unique subpattern
6894         number. */
6895 
6896         if (i < cd->names_found)
6897           {
6898           int offset = i++;
6899           int count = 1;
6900           recno = GET2(slot, 0);   /* Number from first found */
6901           if (recno > cd->top_backref) cd->top_backref = recno;
6902           for (; i < cd->names_found; i++)
6903             {
6904             slot += cd->name_entry_size;
6905             if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0 ||
6906               (slot+IMM2_SIZE)[namelen] != 0) break;
6907             count++;
6908             }
6909 
6910           if (count > 1)
6911             {
6912             PUT2(code, 2+LINK_SIZE, offset);
6913             PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6914             skipbytes += IMM2_SIZE;
6915             code[1+LINK_SIZE]++;
6916             }
6917           else  /* Not a duplicated name */
6918             {
6919             PUT2(code, 2+LINK_SIZE, recno);
6920             }
6921           }
6922 
6923         /* If terminator == CHAR_NULL it means that the name followed directly
6924         after the opening parenthesis [e.g. (?(abc)...] and in this case there
6925         are some further alternatives to try. For the cases where terminator !=
6926         CHAR_NULL [things like (?(<name>... or (?('name')... or (?(R&name)... ]
6927         we have now checked all the possibilities, so give an error. */
6928 
6929         else if (terminator != CHAR_NULL)
6930           {
6931           *errorcodeptr = ERR15;
6932           goto FAILED;
6933           }
6934 
6935         /* Check for (?(R) for recursion. Allow digits after R to specify a
6936         specific group number. */
6937 
6938         else if (*name == CHAR_R)
6939           {
6940           recno = 0;
6941           for (i = 1; i < namelen; i++)
6942             {
6943             if (!IS_DIGIT(name[i]))
6944               {
6945               *errorcodeptr = ERR15;
6946               goto FAILED;
6947               }
6948             if (recno > INT_MAX / 10 - 1)   /* Integer overflow */
6949               {
6950               *errorcodeptr = ERR61;
6951               goto FAILED;
6952               }
6953             recno = recno * 10 + name[i] - CHAR_0;
6954             }
6955           if (recno == 0) recno = RREF_ANY;
6956           code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
6957           PUT2(code, 2+LINK_SIZE, recno);
6958           }
6959 
6960         /* Similarly, check for the (?(DEFINE) "condition", which is always
6961         false. */
6962 
6963         else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
6964           {
6965           code[1+LINK_SIZE] = OP_DEF;
6966           skipbytes = 1;
6967           }
6968 
6969         /* Reference to an unidentified subpattern. */
6970 
6971         else
6972           {
6973           *errorcodeptr = ERR15;
6974           goto FAILED;
6975           }
6976         break;
6977 
6978 
6979         /* ------------------------------------------------------------ */
6980         case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
6981         bravalue = OP_ASSERT;
6982         cd->assert_depth += 1;
6983         ptr++;
6984         break;
6985 
6986         /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6987         thing to do, but Perl allows all assertions to be quantified, and when
6988         they contain capturing parentheses there may be a potential use for
6989         this feature. Not that that applies to a quantified (?!) but we allow
6990         it for uniformity. */
6991 
6992         /* ------------------------------------------------------------ */
6993         case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
6994         ptr++;
6995         if (*ptr == CHAR_RIGHT_PARENTHESIS && ptr[1] != CHAR_ASTERISK &&
6996              ptr[1] != CHAR_PLUS && ptr[1] != CHAR_QUESTION_MARK &&
6997             (ptr[1] != CHAR_LEFT_CURLY_BRACKET || !is_counted_repeat(ptr+2)))
6998           {
6999           *code++ = OP_FAIL;
7000           previous = NULL;
7001           continue;
7002           }
7003         bravalue = OP_ASSERT_NOT;
7004         cd->assert_depth += 1;
7005         break;
7006 
7007 
7008         /* ------------------------------------------------------------ */
7009         case CHAR_LESS_THAN_SIGN:              /* Lookbehind or named define */
7010         switch (ptr[1])
7011           {
7012           case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
7013           bravalue = OP_ASSERTBACK;
7014           cd->assert_depth += 1;
7015           ptr += 2;
7016           break;
7017 
7018           case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
7019           bravalue = OP_ASSERTBACK_NOT;
7020           cd->assert_depth += 1;
7021           ptr += 2;
7022           break;
7023 
7024           default:                /* Could be name define, else bad */
7025           if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
7026             goto DEFINE_NAME;
7027           ptr++;                  /* Correct offset for error */
7028           *errorcodeptr = ERR24;
7029           goto FAILED;
7030           }
7031         break;
7032 
7033 
7034         /* ------------------------------------------------------------ */
7035         case CHAR_GREATER_THAN_SIGN:           /* One-time brackets */
7036         bravalue = OP_ONCE;
7037         ptr++;
7038         break;
7039 
7040 
7041         /* ------------------------------------------------------------ */
7042         case CHAR_C:                 /* Callout - may be followed by digits; */
7043         previous_callout = code;     /* Save for later completion */
7044         after_manual_callout = 1;    /* Skip one item before completing */
7045         *code++ = OP_CALLOUT;
7046           {
7047           int n = 0;
7048           ptr++;
7049           while(IS_DIGIT(*ptr))
7050             n = n * 10 + *ptr++ - CHAR_0;
7051           if (*ptr != CHAR_RIGHT_PARENTHESIS)
7052             {
7053             *errorcodeptr = ERR39;
7054             goto FAILED;
7055             }
7056           if (n > 255)
7057             {
7058             *errorcodeptr = ERR38;
7059             goto FAILED;
7060             }
7061           *code++ = n;
7062           PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
7063           PUT(code, LINK_SIZE, 0);                          /* Default length */
7064           code += 2 * LINK_SIZE;
7065           }
7066         previous = NULL;
7067         continue;
7068 
7069 
7070         /* ------------------------------------------------------------ */
7071         case CHAR_P:              /* Python-style named subpattern handling */
7072         if (*(++ptr) == CHAR_EQUALS_SIGN ||
7073             *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
7074           {
7075           is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
7076           terminator = CHAR_RIGHT_PARENTHESIS;
7077           goto NAMED_REF_OR_RECURSE;
7078           }
7079         else if (*ptr != CHAR_LESS_THAN_SIGN)  /* Test for Python-style defn */
7080           {
7081           *errorcodeptr = ERR41;
7082           goto FAILED;
7083           }
7084         /* Fall through to handle (?P< as (?< is handled */
7085 
7086 
7087         /* ------------------------------------------------------------ */
7088         DEFINE_NAME:    /* Come here from (?< handling */
7089         case CHAR_APOSTROPHE:
7090         terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
7091           CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7092         name = ++ptr;
7093         if (IS_DIGIT(*ptr))
7094           {
7095           *errorcodeptr = ERR84;   /* Group name must start with non-digit */
7096           goto FAILED;
7097           }
7098         while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
7099         namelen = (int)(ptr - name);
7100 
7101         /* In the pre-compile phase, do a syntax check, remember the longest
7102         name, and then remember the group in a vector, expanding it if
7103         necessary. Duplicates for the same number are skipped; other duplicates
7104         are checked for validity. In the actual compile, there is nothing to
7105         do. */
7106 
7107         if (lengthptr != NULL)
7108           {
7109           named_group *ng;
7110           pcre_uint32 number = cd->bracount + 1;
7111 
7112           if (*ptr != (pcre_uchar)terminator)
7113             {
7114             *errorcodeptr = ERR42;
7115             goto FAILED;
7116             }
7117 
7118           if (cd->names_found >= MAX_NAME_COUNT)
7119             {
7120             *errorcodeptr = ERR49;
7121             goto FAILED;
7122             }
7123 
7124           if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
7125             {
7126             cd->name_entry_size = namelen + IMM2_SIZE + 1;
7127             if (namelen > MAX_NAME_SIZE)
7128               {
7129               *errorcodeptr = ERR48;
7130               goto FAILED;
7131               }
7132             }
7133 
7134           /* Scan the list to check for duplicates. For duplicate names, if the
7135           number is the same, break the loop, which causes the name to be
7136           discarded; otherwise, if DUPNAMES is not set, give an error.
7137           If it is set, allow the name with a different number, but continue
7138           scanning in case this is a duplicate with the same number. For
7139           non-duplicate names, give an error if the number is duplicated. */
7140 
7141           ng = cd->named_groups;
7142           for (i = 0; i < cd->names_found; i++, ng++)
7143             {
7144             if (namelen == ng->length &&
7145                 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
7146               {
7147               if (ng->number == number) break;
7148               if ((options & PCRE_DUPNAMES) == 0)
7149                 {
7150                 *errorcodeptr = ERR43;
7151                 goto FAILED;
7152                 }
7153               cd->dupnames = TRUE;  /* Duplicate names exist */
7154               }
7155             else if (ng->number == number)
7156               {
7157               *errorcodeptr = ERR65;
7158               goto FAILED;
7159               }
7160             }
7161 
7162           if (i >= cd->names_found)     /* Not a duplicate with same number */
7163             {
7164             /* Increase the list size if necessary */
7165 
7166             if (cd->names_found >= cd->named_group_list_size)
7167               {
7168               int newsize = cd->named_group_list_size * 2;
7169               named_group *newspace = (PUBL(malloc))
7170                 (newsize * sizeof(named_group));
7171 
7172               if (newspace == NULL)
7173                 {
7174                 *errorcodeptr = ERR21;
7175                 goto FAILED;
7176                 }
7177 
7178               memcpy(newspace, cd->named_groups,
7179                 cd->named_group_list_size * sizeof(named_group));
7180               if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
7181                 (PUBL(free))((void *)cd->named_groups);
7182               cd->named_groups = newspace;
7183               cd->named_group_list_size = newsize;
7184               }
7185 
7186             cd->named_groups[cd->names_found].name = name;
7187             cd->named_groups[cd->names_found].length = namelen;
7188             cd->named_groups[cd->names_found].number = number;
7189             cd->names_found++;
7190             }
7191           }
7192 
7193         ptr++;                    /* Move past > or ' in both passes. */
7194         goto NUMBERED_GROUP;
7195 
7196 
7197         /* ------------------------------------------------------------ */
7198         case CHAR_AMPERSAND:            /* Perl recursion/subroutine syntax */
7199         terminator = CHAR_RIGHT_PARENTHESIS;
7200         is_recurse = TRUE;
7201         /* Fall through */
7202 
7203         /* We come here from the Python syntax above that handles both
7204         references (?P=name) and recursion (?P>name), as well as falling
7205         through from the Perl recursion syntax (?&name). We also come here from
7206         the Perl \k<name> or \k'name' back reference syntax and the \k{name}
7207         .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
7208 
7209         NAMED_REF_OR_RECURSE:
7210         name = ++ptr;
7211         if (IS_DIGIT(*ptr))
7212           {
7213           *errorcodeptr = ERR84;   /* Group name must start with non-digit */
7214           goto FAILED;
7215           }
7216         while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
7217         namelen = (int)(ptr - name);
7218 
7219         /* In the pre-compile phase, do a syntax check. We used to just set
7220         a dummy reference number, because it was not used in the first pass.
7221         However, with the change of recursive back references to be atomic,
7222         we have to look for the number so that this state can be identified, as
7223         otherwise the incorrect length is computed. If it's not a backwards
7224         reference, the dummy number will do. */
7225 
7226         if (lengthptr != NULL)
7227           {
7228           named_group *ng;
7229           recno = 0;
7230 
7231           if (namelen == 0)
7232             {
7233             *errorcodeptr = ERR62;
7234             goto FAILED;
7235             }
7236           if (*ptr != (pcre_uchar)terminator)
7237             {
7238             *errorcodeptr = ERR42;
7239             goto FAILED;
7240             }
7241           if (namelen > MAX_NAME_SIZE)
7242             {
7243             *errorcodeptr = ERR48;
7244             goto FAILED;
7245             }
7246 
7247           /* Count named back references. */
7248 
7249           if (!is_recurse) cd->namedrefcount++;
7250 
7251           /* We have to allow for a named reference to a duplicated name (this
7252           cannot be determined until the second pass). This needs an extra
7253           16-bit data item. */
7254 
7255           *lengthptr += IMM2_SIZE;
7256 
7257           /* If this is a forward reference and we are within a (?|...) group,
7258           the reference may end up as the number of a group which we are
7259           currently inside, that is, it could be a recursive reference. In the
7260           real compile this will be picked up and the reference wrapped with
7261           OP_ONCE to make it atomic, so we must allow space in case this occurs. */
7262 
7263           /* In fact, this can happen for a non-forward reference because
7264           another group with the same number might be created later. This
7265           issue is fixed "properly" in PCRE2. As PCRE1 is now in maintenance
7266           only mode, we finesse the bug by allowing more memory always. */
7267 
7268           *lengthptr += 2 + 2*LINK_SIZE;
7269 
7270           /* It is even worse than that. The current reference may be to an
7271           existing named group with a different number (so apparently not
7272           recursive) but which later on is also attached to a group with the
7273           current number. This can only happen if (?| has been previously
7274           encountered. In that case, we allow yet more memory, just in case.
7275           (Again, this is fixed "properly" in PCRE2.) */
7276 
7277           if (cd->dupgroups) *lengthptr += 4 + 4*LINK_SIZE;
7278 
7279           /* Otherwise, check for recursion here. The name table does not exist
7280           in the first pass; instead we must scan the list of names encountered
7281           so far in order to get the number. If the name is not found, leave
7282           the value of recno as 0 for a forward reference. */
7283 
7284           /* This patch (removing "else") fixes a problem when a reference is
7285           to multiple identically named nested groups from within the nest.
7286           Once again, it is not the "proper" fix, and it results in an
7287           over-allocation of memory. */
7288 
7289           /* else */
7290             {
7291             ng = cd->named_groups;
7292             for (i = 0; i < cd->names_found; i++, ng++)
7293               {
7294               if (namelen == ng->length &&
7295                   STRNCMP_UC_UC(name, ng->name, namelen) == 0)
7296                 {
7297                 open_capitem *oc;
7298                 recno = ng->number;
7299                 if (is_recurse) break;
7300                 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
7301                   {
7302                   if (oc->number == recno)
7303                     {
7304                     oc->flag = TRUE;
7305                     break;
7306                     }
7307                   }
7308                 }
7309               }
7310             }
7311           }
7312 
7313         /* In the real compile, search the name table. We check the name
7314         first, and then check that we have reached the end of the name in the
7315         table. That way, if the name is longer than any in the table, the
7316         comparison will fail without reading beyond the table entry. */
7317 
7318         else
7319           {
7320           slot = cd->name_table;
7321           for (i = 0; i < cd->names_found; i++)
7322             {
7323             if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
7324                 slot[IMM2_SIZE+namelen] == 0)
7325               break;
7326             slot += cd->name_entry_size;
7327             }
7328 
7329           if (i < cd->names_found)
7330             {
7331             recno = GET2(slot, 0);
7332             }
7333           else
7334             {
7335             *errorcodeptr = ERR15;
7336             goto FAILED;
7337             }
7338           }
7339 
7340         /* In both phases, for recursions, we can now go to the code that
7341         handles numerical recursion. */
7342 
7343         if (is_recurse) goto HANDLE_RECURSION;
7344 
7345         /* In the second pass we must see if the name is duplicated. If so, we
7346         generate a different opcode. */
7347 
7348         if (lengthptr == NULL && cd->dupnames)
7349           {
7350           int count = 1;
7351           unsigned int index = i;
7352           pcre_uchar *cslot = slot + cd->name_entry_size;
7353 
7354           for (i++; i < cd->names_found; i++)
7355             {
7356             if (STRCMP_UC_UC(slot + IMM2_SIZE, cslot + IMM2_SIZE) != 0) break;
7357             count++;
7358             cslot += cd->name_entry_size;
7359             }
7360 
7361           if (count > 1)
7362             {
7363             if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7364             previous = code;
7365             item_hwm_offset = cd->hwm - cd->start_workspace;
7366             *code++ = ((options & PCRE_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
7367             PUT2INC(code, 0, index);
7368             PUT2INC(code, 0, count);
7369 
7370             /* Process each potentially referenced group. */
7371 
7372             for (; slot < cslot; slot += cd->name_entry_size)
7373               {
7374               open_capitem *oc;
7375               recno = GET2(slot, 0);
7376               cd->backref_map |= (recno < 32)? (1 << recno) : 1;
7377               if (recno > cd->top_backref) cd->top_backref = recno;
7378 
7379               /* Check to see if this back reference is recursive, that is, it
7380               is inside the group that it references. A flag is set so that the
7381               group can be made atomic. */
7382 
7383               for (oc = cd->open_caps; oc != NULL; oc = oc->next)
7384                 {
7385                 if (oc->number == recno)
7386                   {
7387                   oc->flag = TRUE;
7388                   break;
7389                   }
7390                 }
7391               }
7392 
7393             continue;  /* End of back ref handling */
7394             }
7395           }
7396 
7397         /* First pass, or a non-duplicated name. */
7398 
7399         goto HANDLE_REFERENCE;
7400 
7401 
7402         /* ------------------------------------------------------------ */
7403         case CHAR_R:              /* Recursion, same as (?0) */
7404         recno = 0;
7405         if (*(++ptr) != CHAR_RIGHT_PARENTHESIS)
7406           {
7407           *errorcodeptr = ERR29;
7408           goto FAILED;
7409           }
7410         goto HANDLE_RECURSION;
7411 
7412 
7413         /* ------------------------------------------------------------ */
7414         case CHAR_MINUS: case CHAR_PLUS:  /* Recursion or subroutine */
7415         case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
7416         case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
7417           {
7418           const pcre_uchar *called;
7419           terminator = CHAR_RIGHT_PARENTHESIS;
7420 
7421           /* Come here from the \g<...> and \g'...' code (Oniguruma
7422           compatibility). However, the syntax has been checked to ensure that
7423           the ... are a (signed) number, so that neither ERR63 nor ERR29 will
7424           be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
7425           ever be taken. */
7426 
7427           HANDLE_NUMERICAL_RECURSION:
7428 
7429           if ((refsign = *ptr) == CHAR_PLUS)
7430             {
7431             ptr++;
7432             if (!IS_DIGIT(*ptr))
7433               {
7434               *errorcodeptr = ERR63;
7435               goto FAILED;
7436               }
7437             }
7438           else if (refsign == CHAR_MINUS)
7439             {
7440             if (!IS_DIGIT(ptr[1]))
7441               goto OTHER_CHAR_AFTER_QUERY;
7442             ptr++;
7443             }
7444 
7445           recno = 0;
7446           while(IS_DIGIT(*ptr))
7447             {
7448             if (recno > INT_MAX / 10 - 1) /* Integer overflow */
7449               {
7450               while (IS_DIGIT(*ptr)) ptr++;
7451               *errorcodeptr = ERR61;
7452               goto FAILED;
7453               }
7454             recno = recno * 10 + *ptr++ - CHAR_0;
7455             }
7456 
7457           if (*ptr != (pcre_uchar)terminator)
7458             {
7459             *errorcodeptr = ERR29;
7460             goto FAILED;
7461             }
7462 
7463           if (refsign == CHAR_MINUS)
7464             {
7465             if (recno == 0)
7466               {
7467               *errorcodeptr = ERR58;
7468               goto FAILED;
7469               }
7470             recno = cd->bracount - recno + 1;
7471             if (recno <= 0)
7472               {
7473               *errorcodeptr = ERR15;
7474               goto FAILED;
7475               }
7476             }
7477           else if (refsign == CHAR_PLUS)
7478             {
7479             if (recno == 0)
7480               {
7481               *errorcodeptr = ERR58;
7482               goto FAILED;
7483               }
7484             recno += cd->bracount;
7485             }
7486 
7487           /* Come here from code above that handles a named recursion */
7488 
7489           HANDLE_RECURSION:
7490 
7491           previous = code;
7492           item_hwm_offset = cd->hwm - cd->start_workspace;
7493           called = cd->start_code;
7494 
7495           /* When we are actually compiling, find the bracket that is being
7496           referenced. Temporarily end the regex in case it doesn't exist before
7497           this point. If we end up with a forward reference, first check that
7498           the bracket does occur later so we can give the error (and position)
7499           now. Then remember this forward reference in the workspace so it can
7500           be filled in at the end. */
7501 
7502           if (lengthptr == NULL)
7503             {
7504             *code = OP_END;
7505             if (recno != 0)
7506               called = PRIV(find_bracket)(cd->start_code, utf, recno);
7507 
7508             /* Forward reference */
7509 
7510             if (called == NULL)
7511               {
7512               if (recno > cd->final_bracount)
7513                 {
7514                 *errorcodeptr = ERR15;
7515                 goto FAILED;
7516                 }
7517 
7518               /* Fudge the value of "called" so that when it is inserted as an
7519               offset below, what is actually inserted is the reference number
7520               of the group. Then remember the forward reference. */
7521 
7522               called = cd->start_code + recno;
7523               if (cd->hwm >= cd->start_workspace + cd->workspace_size -
7524                   WORK_SIZE_SAFETY_MARGIN)
7525                 {
7526                 *errorcodeptr = expand_workspace(cd);
7527                 if (*errorcodeptr != 0) goto FAILED;
7528                 }
7529               PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
7530               }
7531 
7532             /* If not a forward reference, and the subpattern is still open,
7533             this is a recursive call. We check to see if this is a left
7534             recursion that could loop for ever, and diagnose that case. We
7535             must not, however, do this check if we are in a conditional
7536             subpattern because the condition might be testing for recursion in
7537             a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid.
7538             Forever loops are also detected at runtime, so those that occur in
7539             conditional subpatterns will be picked up then. */
7540 
7541             else if (GET(called, 1) == 0 && cond_depth <= 0 &&
7542                      could_be_empty(called, code, bcptr, utf, cd))
7543               {
7544               *errorcodeptr = ERR40;
7545               goto FAILED;
7546               }
7547             }
7548 
7549           /* Insert the recursion/subroutine item. It does not have a set first
7550           character (relevant if it is repeated, because it will then be
7551           wrapped with ONCE brackets). */
7552 
7553           *code = OP_RECURSE;
7554           PUT(code, 1, (int)(called - cd->start_code));
7555           code += 1 + LINK_SIZE;
7556           groupsetfirstchar = FALSE;
7557           }
7558 
7559         /* Can't determine a first byte now */
7560 
7561         if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7562         continue;
7563 
7564 
7565         /* ------------------------------------------------------------ */
7566         default:              /* Other characters: check option setting */
7567         OTHER_CHAR_AFTER_QUERY:
7568         set = unset = 0;
7569         optset = &set;
7570 
7571         while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
7572           {
7573           switch (*ptr++)
7574             {
7575             case CHAR_MINUS: optset = &unset; break;
7576 
7577             case CHAR_J:    /* Record that it changed in the external options */
7578             *optset |= PCRE_DUPNAMES;
7579             cd->external_flags |= PCRE_JCHANGED;
7580             break;
7581 
7582             case CHAR_i: *optset |= PCRE_CASELESS; break;
7583             case CHAR_m: *optset |= PCRE_MULTILINE; break;
7584             case CHAR_s: *optset |= PCRE_DOTALL; break;
7585             case CHAR_x: *optset |= PCRE_EXTENDED; break;
7586             case CHAR_U: *optset |= PCRE_UNGREEDY; break;
7587             case CHAR_X: *optset |= PCRE_EXTRA; break;
7588 
7589             default:  *errorcodeptr = ERR12;
7590                       ptr--;    /* Correct the offset */
7591                       goto FAILED;
7592             }
7593           }
7594 
7595         /* Set up the changed option bits, but don't change anything yet. */
7596 
7597         newoptions = (options | set) & (~unset);
7598 
7599         /* If the options ended with ')' this is not the start of a nested
7600         group with option changes, so the options change at this level. If this
7601         item is right at the start of the pattern, the options can be
7602         abstracted and made external in the pre-compile phase, and ignored in
7603         the compile phase. This can be helpful when matching -- for instance in
7604         caseless checking of required bytes.
7605 
7606         If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
7607         definitely *not* at the start of the pattern because something has been
7608         compiled. In the pre-compile phase, however, the code pointer can have
7609         that value after the start, because it gets reset as code is discarded
7610         during the pre-compile. However, this can happen only at top level - if
7611         we are within parentheses, the starting BRA will still be present. At
7612         any parenthesis level, the length value can be used to test if anything
7613         has been compiled at that level. Thus, a test for both these conditions
7614         is necessary to ensure we correctly detect the start of the pattern in
7615         both phases.
7616 
7617         If we are not at the pattern start, reset the greedy defaults and the
7618         case value for firstchar and reqchar. */
7619 
7620         if (*ptr == CHAR_RIGHT_PARENTHESIS)
7621           {
7622           if (code == cd->start_code + 1 + LINK_SIZE &&
7623                (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
7624             {
7625             cd->external_options = newoptions;
7626             }
7627           else
7628             {
7629             greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
7630             greedy_non_default = greedy_default ^ 1;
7631             req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
7632             }
7633 
7634           /* Change options at this level, and pass them back for use
7635           in subsequent branches. */
7636 
7637           *optionsptr = options = newoptions;
7638           previous = NULL;       /* This item can't be repeated */
7639           continue;              /* It is complete */
7640           }
7641 
7642         /* If the options ended with ':' we are heading into a nested group
7643         with possible change of options. Such groups are non-capturing and are
7644         not assertions of any kind. All we need to do is skip over the ':';
7645         the newoptions value is handled below. */
7646 
7647         bravalue = OP_BRA;
7648         ptr++;
7649         }     /* End of switch for character following (? */
7650       }       /* End of (? handling */
7651 
7652     /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
7653     is set, all unadorned brackets become non-capturing and behave like (?:...)
7654     brackets. */
7655 
7656     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
7657       {
7658       bravalue = OP_BRA;
7659       }
7660 
7661     /* Else we have a capturing group. */
7662 
7663     else
7664       {
7665       NUMBERED_GROUP:
7666       cd->bracount += 1;
7667       PUT2(code, 1+LINK_SIZE, cd->bracount);
7668       skipbytes = IMM2_SIZE;
7669       }
7670 
7671     /* Process nested bracketed regex. First check for parentheses nested too
7672     deeply. */
7673 
7674     if ((cd->parens_depth += 1) > PARENS_NEST_LIMIT)
7675       {
7676       *errorcodeptr = ERR82;
7677       goto FAILED;
7678       }
7679 
7680     /* All assertions used not to be repeatable, but this was changed for Perl
7681     compatibility. All kinds can now be repeated except for assertions that are
7682     conditions (Perl also forbids these to be repeated). We copy code into a
7683     non-register variable (tempcode) in order to be able to pass its address
7684     because some compilers complain otherwise. At the start of a conditional
7685     group whose condition is an assertion, cd->iscondassert is set. We unset it
7686     here so as to allow assertions later in the group to be quantified. */
7687 
7688     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT &&
7689         cd->iscondassert)
7690       {
7691       previous = NULL;
7692       cd->iscondassert = FALSE;
7693       }
7694     else
7695       {
7696       previous = code;
7697       item_hwm_offset = cd->hwm - cd->start_workspace;
7698       }
7699 
7700     *code = bravalue;
7701     tempcode = code;
7702     tempreqvary = cd->req_varyopt;        /* Save value before bracket */
7703     tempbracount = cd->bracount;          /* Save value before bracket */
7704     length_prevgroup = 0;                 /* Initialize for pre-compile phase */
7705 
7706     if (!compile_regex(
7707          newoptions,                      /* The complete new option state */
7708          &tempcode,                       /* Where to put code (updated) */
7709          &ptr,                            /* Input pointer (updated) */
7710          errorcodeptr,                    /* Where to put an error message */
7711          (bravalue == OP_ASSERTBACK ||
7712           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
7713          reset_bracount,                  /* True if (?| group */
7714          skipbytes,                       /* Skip over bracket number */
7715          cond_depth +
7716            ((bravalue == OP_COND)?1:0),   /* Depth of condition subpatterns */
7717          &subfirstchar,                   /* For possible first char */
7718          &subfirstcharflags,
7719          &subreqchar,                     /* For possible last char */
7720          &subreqcharflags,
7721          bcptr,                           /* Current branch chain */
7722          cd,                              /* Tables block */
7723          (lengthptr == NULL)? NULL :      /* Actual compile phase */
7724            &length_prevgroup              /* Pre-compile phase */
7725          ))
7726       goto FAILED;
7727 
7728     cd->parens_depth -= 1;
7729 
7730     /* If this was an atomic group and there are no capturing groups within it,
7731     generate OP_ONCE_NC instead of OP_ONCE. */
7732 
7733     if (bravalue == OP_ONCE && cd->bracount <= tempbracount)
7734       *code = OP_ONCE_NC;
7735 
7736     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
7737       cd->assert_depth -= 1;
7738 
7739     /* At the end of compiling, code is still pointing to the start of the
7740     group, while tempcode has been updated to point past the end of the group.
7741     The pattern pointer (ptr) is on the bracket.
7742 
7743     If this is a conditional bracket, check that there are no more than
7744     two branches in the group, or just one if it's a DEFINE group. We do this
7745     in the real compile phase, not in the pre-pass, where the whole group may
7746     not be available. */
7747 
7748     if (bravalue == OP_COND && lengthptr == NULL)
7749       {
7750       pcre_uchar *tc = code;
7751       int condcount = 0;
7752 
7753       do {
7754          condcount++;
7755          tc += GET(tc,1);
7756          }
7757       while (*tc != OP_KET);
7758 
7759       /* A DEFINE group is never obeyed inline (the "condition" is always
7760       false). It must have only one branch. */
7761 
7762       if (code[LINK_SIZE+1] == OP_DEF)
7763         {
7764         if (condcount > 1)
7765           {
7766           *errorcodeptr = ERR54;
7767           goto FAILED;
7768           }
7769         bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
7770         }
7771 
7772       /* A "normal" conditional group. If there is just one branch, we must not
7773       make use of its firstchar or reqchar, because this is equivalent to an
7774       empty second branch. */
7775 
7776       else
7777         {
7778         if (condcount > 2)
7779           {
7780           *errorcodeptr = ERR27;
7781           goto FAILED;
7782           }
7783         if (condcount == 1) subfirstcharflags = subreqcharflags = REQ_NONE;
7784         }
7785       }
7786 
7787     /* Error if hit end of pattern */
7788 
7789     if (*ptr != CHAR_RIGHT_PARENTHESIS)
7790       {
7791       *errorcodeptr = ERR14;
7792       goto FAILED;
7793       }
7794 
7795     /* In the pre-compile phase, update the length by the length of the group,
7796     less the brackets at either end. Then reduce the compiled code to just a
7797     set of non-capturing brackets so that it doesn't use much memory if it is
7798     duplicated by a quantifier.*/
7799 
7800     if (lengthptr != NULL)
7801       {
7802       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
7803         {
7804         *errorcodeptr = ERR20;
7805         goto FAILED;
7806         }
7807       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
7808       code++;   /* This already contains bravalue */
7809       PUTINC(code, 0, 1 + LINK_SIZE);
7810       *code++ = OP_KET;
7811       PUTINC(code, 0, 1 + LINK_SIZE);
7812       break;    /* No need to waste time with special character handling */
7813       }
7814 
7815     /* Otherwise update the main code pointer to the end of the group. */
7816 
7817     code = tempcode;
7818 
7819     /* For a DEFINE group, required and first character settings are not
7820     relevant. */
7821 
7822     if (bravalue == OP_DEF) break;
7823 
7824     /* Handle updating of the required and first characters for other types of
7825     group. Update for normal brackets of all kinds, and conditions with two
7826     branches (see code above). If the bracket is followed by a quantifier with
7827     zero repeat, we have to back off. Hence the definition of zeroreqchar and
7828     zerofirstchar outside the main loop so that they can be accessed for the
7829     back off. */
7830 
7831     zeroreqchar = reqchar;
7832     zeroreqcharflags = reqcharflags;
7833     zerofirstchar = firstchar;
7834     zerofirstcharflags = firstcharflags;
7835     groupsetfirstchar = FALSE;
7836 
7837     if (bravalue >= OP_ONCE)
7838       {
7839       /* If we have not yet set a firstchar in this branch, take it from the
7840       subpattern, remembering that it was set here so that a repeat of more
7841       than one can replicate it as reqchar if necessary. If the subpattern has
7842       no firstchar, set "none" for the whole branch. In both cases, a zero
7843       repeat forces firstchar to "none". */
7844 
7845       if (firstcharflags == REQ_UNSET)
7846         {
7847         if (subfirstcharflags >= 0)
7848           {
7849           firstchar = subfirstchar;
7850           firstcharflags = subfirstcharflags;
7851           groupsetfirstchar = TRUE;
7852           }
7853         else firstcharflags = REQ_NONE;
7854         zerofirstcharflags = REQ_NONE;
7855         }
7856 
7857       /* If firstchar was previously set, convert the subpattern's firstchar
7858       into reqchar if there wasn't one, using the vary flag that was in
7859       existence beforehand. */
7860 
7861       else if (subfirstcharflags >= 0 && subreqcharflags < 0)
7862         {
7863         subreqchar = subfirstchar;
7864         subreqcharflags = subfirstcharflags | tempreqvary;
7865         }
7866 
7867       /* If the subpattern set a required byte (or set a first byte that isn't
7868       really the first byte - see above), set it. */
7869 
7870       if (subreqcharflags >= 0)
7871         {
7872         reqchar = subreqchar;
7873         reqcharflags = subreqcharflags;
7874         }
7875       }
7876 
7877     /* For a forward assertion, we take the reqchar, if set. This can be
7878     helpful if the pattern that follows the assertion doesn't set a different
7879     char. For example, it's useful for /(?=abcde).+/. We can't set firstchar
7880     for an assertion, however because it leads to incorrect effect for patterns
7881     such as /(?=a)a.+/ when the "real" "a" would then become a reqchar instead
7882     of a firstchar. This is overcome by a scan at the end if there's no
7883     firstchar, looking for an asserted first char. */
7884 
7885     else if (bravalue == OP_ASSERT && subreqcharflags >= 0)
7886       {
7887       reqchar = subreqchar;
7888       reqcharflags = subreqcharflags;
7889       }
7890     break;     /* End of processing '(' */
7891 
7892 
7893     /* ===================================================================*/
7894     /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
7895     are arranged to be the negation of the corresponding OP_values in the
7896     default case when PCRE_UCP is not set. For the back references, the values
7897     are negative the reference number. Only back references and those types
7898     that consume a character may be repeated. We can test for values between
7899     ESC_b and ESC_Z for the latter; this may have to change if any new ones are
7900     ever created. */
7901 
7902     case CHAR_BACKSLASH:
7903     tempptr = ptr;
7904     escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options, FALSE);
7905     if (*errorcodeptr != 0) goto FAILED;
7906 
7907     if (escape == 0)                  /* The escape coded a single character */
7908       c = ec;
7909     else
7910       {
7911       if (escape == ESC_Q)            /* Handle start of quoted string */
7912         {
7913         if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
7914           ptr += 2;               /* avoid empty string */
7915             else inescq = TRUE;
7916         continue;
7917         }
7918 
7919       if (escape == ESC_E) continue;  /* Perl ignores an orphan \E */
7920 
7921       /* For metasequences that actually match a character, we disable the
7922       setting of a first character if it hasn't already been set. */
7923 
7924       if (firstcharflags == REQ_UNSET && escape > ESC_b && escape < ESC_Z)
7925         firstcharflags = REQ_NONE;
7926 
7927       /* Set values to reset to if this is followed by a zero repeat. */
7928 
7929       zerofirstchar = firstchar;
7930       zerofirstcharflags = firstcharflags;
7931       zeroreqchar = reqchar;
7932       zeroreqcharflags = reqcharflags;
7933 
7934       /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
7935       is a subroutine call by number (Oniguruma syntax). In fact, the value
7936       ESC_g is returned only for these cases. So we don't need to check for <
7937       or ' if the value is ESC_g. For the Perl syntax \g{n} the value is
7938       -n, and for the Perl syntax \g{name} the result is ESC_k (as
7939       that is a synonym for a named back reference). */
7940 
7941       if (escape == ESC_g)
7942         {
7943         const pcre_uchar *p;
7944         pcre_uint32 cf;
7945 
7946         item_hwm_offset = cd->hwm - cd->start_workspace;   /* Normally this is set when '(' is read */
7947         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
7948           CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7949 
7950         /* These two statements stop the compiler from warning about possibly
7951         unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
7952         fact, because we do the check for a number below, the paths that
7953         would actually be in error are never taken. */
7954 
7955         skipbytes = 0;
7956         reset_bracount = FALSE;
7957 
7958         /* If it's not a signed or unsigned number, treat it as a name. */
7959 
7960         cf = ptr[1];
7961         if (cf != CHAR_PLUS && cf != CHAR_MINUS && !IS_DIGIT(cf))
7962           {
7963           is_recurse = TRUE;
7964           goto NAMED_REF_OR_RECURSE;
7965           }
7966 
7967         /* Signed or unsigned number (cf = ptr[1]) is known to be plus or minus
7968         or a digit. */
7969 
7970         p = ptr + 2;
7971         while (IS_DIGIT(*p)) p++;
7972         if (*p != (pcre_uchar)terminator)
7973           {
7974           *errorcodeptr = ERR57;
7975           goto FAILED;
7976           }
7977         ptr++;
7978         goto HANDLE_NUMERICAL_RECURSION;
7979         }
7980 
7981       /* \k<name> or \k'name' is a back reference by name (Perl syntax).
7982       We also support \k{name} (.NET syntax).  */
7983 
7984       if (escape == ESC_k)
7985         {
7986         if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
7987           ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
7988           {
7989           *errorcodeptr = ERR69;
7990           goto FAILED;
7991           }
7992         is_recurse = FALSE;
7993         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
7994           CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
7995           CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
7996         goto NAMED_REF_OR_RECURSE;
7997         }
7998 
7999       /* Back references are handled specially; must disable firstchar if
8000       not set to cope with cases like (?=(\w+))\1: which would otherwise set
8001       ':' later. */
8002 
8003       if (escape < 0)
8004         {
8005         open_capitem *oc;
8006         recno = -escape;
8007 
8008         /* Come here from named backref handling when the reference is to a
8009         single group (i.e. not to a duplicated name). */
8010 
8011         HANDLE_REFERENCE:
8012         if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
8013         previous = code;
8014         item_hwm_offset = cd->hwm - cd->start_workspace;
8015         *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
8016         PUT2INC(code, 0, recno);
8017         cd->backref_map |= (recno < 32)? (1 << recno) : 1;
8018         if (recno > cd->top_backref) cd->top_backref = recno;
8019 
8020         /* Check to see if this back reference is recursive, that is, it
8021         is inside the group that it references. A flag is set so that the
8022         group can be made atomic. */
8023 
8024         for (oc = cd->open_caps; oc != NULL; oc = oc->next)
8025           {
8026           if (oc->number == recno)
8027             {
8028             oc->flag = TRUE;
8029             break;
8030             }
8031           }
8032         }
8033 
8034       /* So are Unicode property matches, if supported. */
8035 
8036 #ifdef SUPPORT_UCP
8037       else if (escape == ESC_P || escape == ESC_p)
8038         {
8039         BOOL negated;
8040         unsigned int ptype = 0, pdata = 0;
8041         if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
8042           goto FAILED;
8043         previous = code;
8044         item_hwm_offset = cd->hwm - cd->start_workspace;
8045         *code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
8046         *code++ = ptype;
8047         *code++ = pdata;
8048         }
8049 #else
8050 
8051       /* If Unicode properties are not supported, \X, \P, and \p are not
8052       allowed. */
8053 
8054       else if (escape == ESC_X || escape == ESC_P || escape == ESC_p)
8055         {
8056         *errorcodeptr = ERR45;
8057         goto FAILED;
8058         }
8059 #endif
8060 
8061       /* For the rest (including \X when Unicode properties are supported), we
8062       can obtain the OP value by negating the escape value in the default
8063       situation when PCRE_UCP is not set. When it *is* set, we substitute
8064       Unicode property tests. Note that \b and \B do a one-character
8065       lookbehind, and \A also behaves as if it does. */
8066 
8067       else
8068         {
8069         if ((escape == ESC_b || escape == ESC_B || escape == ESC_A) &&
8070              cd->max_lookbehind == 0)
8071           cd->max_lookbehind = 1;
8072 #ifdef SUPPORT_UCP
8073         if (escape >= ESC_DU && escape <= ESC_wu)
8074           {
8075           nestptr = ptr + 1;                   /* Where to resume */
8076           ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
8077           }
8078         else
8079 #endif
8080         /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
8081         so that it works in DFA mode and in lookbehinds. */
8082 
8083           {
8084           previous = (escape > ESC_b && escape < ESC_Z)? code : NULL;
8085           item_hwm_offset = cd->hwm - cd->start_workspace;
8086           *code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape;
8087           }
8088         }
8089       continue;
8090       }
8091 
8092     /* We have a data character whose value is in c. In UTF-8 mode it may have
8093     a value > 127. We set its representation in the length/buffer, and then
8094     handle it as a data character. */
8095 
8096 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
8097     if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
8098       mclength = PRIV(ord2utf)(c, mcbuffer);
8099     else
8100 #endif
8101 
8102      {
8103      mcbuffer[0] = c;
8104      mclength = 1;
8105      }
8106     goto ONE_CHAR;
8107 
8108 
8109     /* ===================================================================*/
8110     /* Handle a literal character. It is guaranteed not to be whitespace or #
8111     when the extended flag is set. If we are in a UTF mode, it may be a
8112     multi-unit literal character. */
8113 
8114     default:
8115     NORMAL_CHAR:
8116     mclength = 1;
8117     mcbuffer[0] = c;
8118 
8119 #ifdef SUPPORT_UTF
8120     if (utf && HAS_EXTRALEN(c))
8121       ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
8122 #endif
8123 
8124     /* At this point we have the character's bytes in mcbuffer, and the length
8125     in mclength. When not in UTF-8 mode, the length is always 1. */
8126 
8127     ONE_CHAR:
8128     previous = code;
8129     item_hwm_offset = cd->hwm - cd->start_workspace;
8130 
8131     /* For caseless UTF-8 mode when UCP support is available, check whether
8132     this character has more than one other case. If so, generate a special
8133     OP_PROP item instead of OP_CHARI. */
8134 
8135 #ifdef SUPPORT_UCP
8136     if (utf && (options & PCRE_CASELESS) != 0)
8137       {
8138       GETCHAR(c, mcbuffer);
8139       if ((c = UCD_CASESET(c)) != 0)
8140         {
8141         *code++ = OP_PROP;
8142         *code++ = PT_CLIST;
8143         *code++ = c;
8144         if (firstcharflags == REQ_UNSET)
8145           firstcharflags = zerofirstcharflags = REQ_NONE;
8146         break;
8147         }
8148       }
8149 #endif
8150 
8151     /* Caseful matches, or not one of the multicase characters. */
8152 
8153     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARI : OP_CHAR;
8154     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
8155 
8156     /* Remember if \r or \n were seen */
8157 
8158     if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
8159       cd->external_flags |= PCRE_HASCRORLF;
8160 
8161     /* Set the first and required bytes appropriately. If no previous first
8162     byte, set it from this character, but revert to none on a zero repeat.
8163     Otherwise, leave the firstchar value alone, and don't change it on a zero
8164     repeat. */
8165 
8166     if (firstcharflags == REQ_UNSET)
8167       {
8168       zerofirstcharflags = REQ_NONE;
8169       zeroreqchar = reqchar;
8170       zeroreqcharflags = reqcharflags;
8171 
8172       /* If the character is more than one byte long, we can set firstchar
8173       only if it is not to be matched caselessly. */
8174 
8175       if (mclength == 1 || req_caseopt == 0)
8176         {
8177         firstchar = mcbuffer[0] | req_caseopt;
8178         firstchar = mcbuffer[0];
8179         firstcharflags = req_caseopt;
8180 
8181         if (mclength != 1)
8182           {
8183           reqchar = code[-1];
8184           reqcharflags = cd->req_varyopt;
8185           }
8186         }
8187       else firstcharflags = reqcharflags = REQ_NONE;
8188       }
8189 
8190     /* firstchar was previously set; we can set reqchar only if the length is
8191     1 or the matching is caseful. */
8192 
8193     else
8194       {
8195       zerofirstchar = firstchar;
8196       zerofirstcharflags = firstcharflags;
8197       zeroreqchar = reqchar;
8198       zeroreqcharflags = reqcharflags;
8199       if (mclength == 1 || req_caseopt == 0)
8200         {
8201         reqchar = code[-1];
8202         reqcharflags = req_caseopt | cd->req_varyopt;
8203         }
8204       }
8205 
8206     break;            /* End of literal character handling */
8207     }
8208   }                   /* end of big loop */
8209 
8210 
8211 /* Control never reaches here by falling through, only by a goto for all the
8212 error states. Pass back the position in the pattern so that it can be displayed
8213 to the user for diagnosing the error. */
8214 
8215 FAILED:
8216 *ptrptr = ptr;
8217 return FALSE;
8218 }
8219 
8220 
8221 
8222 /*************************************************
8223 *     Compile sequence of alternatives           *
8224 *************************************************/
8225 
8226 /* On entry, ptr is pointing past the bracket character, but on return it
8227 points to the closing bracket, or vertical bar, or end of string. The code
8228 variable is pointing at the byte into which the BRA operator has been stored.
8229 This function is used during the pre-compile phase when we are trying to find
8230 out the amount of memory needed, as well as during the real compile phase. The
8231 value of lengthptr distinguishes the two phases.
8232 
8233 Arguments:
8234   options           option bits, including any changes for this subpattern
8235   codeptr           -> the address of the current code pointer
8236   ptrptr            -> the address of the current pattern pointer
8237   errorcodeptr      -> pointer to error code variable
8238   lookbehind        TRUE if this is a lookbehind assertion
8239   reset_bracount    TRUE to reset the count for each branch
8240   skipbytes         skip this many bytes at start (for brackets and OP_COND)
8241   cond_depth        depth of nesting for conditional subpatterns
8242   firstcharptr      place to put the first required character
8243   firstcharflagsptr place to put the first character flags, or a negative number
8244   reqcharptr        place to put the last required character
8245   reqcharflagsptr   place to put the last required character flags, or a negative number
8246   bcptr             pointer to the chain of currently open branches
8247   cd                points to the data block with tables pointers etc.
8248   lengthptr         NULL during the real compile phase
8249                     points to length accumulator during pre-compile phase
8250 
8251 Returns:            TRUE on success
8252 */
8253 
8254 static BOOL
8255 compile_regex(int options, pcre_uchar **codeptr, const pcre_uchar **ptrptr,
8256   int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
8257   int cond_depth,
8258   pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
8259   pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
8260   branch_chain *bcptr, compile_data *cd, int *lengthptr)
8261 {
8262 const pcre_uchar *ptr = *ptrptr;
8263 pcre_uchar *code = *codeptr;
8264 pcre_uchar *last_branch = code;
8265 pcre_uchar *start_bracket = code;
8266 pcre_uchar *reverse_count = NULL;
8267 open_capitem capitem;
8268 int capnumber = 0;
8269 pcre_uint32 firstchar, reqchar;
8270 pcre_int32 firstcharflags, reqcharflags;
8271 pcre_uint32 branchfirstchar, branchreqchar;
8272 pcre_int32 branchfirstcharflags, branchreqcharflags;
8273 int length;
8274 unsigned int orig_bracount;
8275 unsigned int max_bracount;
8276 branch_chain bc;
8277 size_t save_hwm_offset;
8278 
8279 /* If set, call the external function that checks for stack availability. */
8280 
8281 if (PUBL(stack_guard) != NULL && PUBL(stack_guard)())
8282   {
8283   *errorcodeptr= ERR85;
8284   return FALSE;
8285   }
8286 
8287 /* Miscellaneous initialization */
8288 
8289 bc.outer = bcptr;
8290 bc.current_branch = code;
8291 
8292 firstchar = reqchar = 0;
8293 firstcharflags = reqcharflags = REQ_UNSET;
8294 
8295 save_hwm_offset = cd->hwm - cd->start_workspace;
8296 
8297 /* Accumulate the length for use in the pre-compile phase. Start with the
8298 length of the BRA and KET and any extra bytes that are required at the
8299 beginning. We accumulate in a local variable to save frequent testing of
8300 lengthptr for NULL. We cannot do this by looking at the value of code at the
8301 start and end of each alternative, because compiled items are discarded during
8302 the pre-compile phase so that the work space is not exceeded. */
8303 
8304 length = 2 + 2*LINK_SIZE + skipbytes;
8305 
8306 /* WARNING: If the above line is changed for any reason, you must also change
8307 the code that abstracts option settings at the start of the pattern and makes
8308 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
8309 pre-compile phase to find out whether anything has yet been compiled or not. */
8310 
8311 /* If this is a capturing subpattern, add to the chain of open capturing items
8312 so that we can detect them if (*ACCEPT) is encountered. This is also used to
8313 detect groups that contain recursive back references to themselves. Note that
8314 only OP_CBRA need be tested here; changing this opcode to one of its variants,
8315 e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */
8316 
8317 if (*code == OP_CBRA)
8318   {
8319   capnumber = GET2(code, 1 + LINK_SIZE);
8320   capitem.number = capnumber;
8321   capitem.next = cd->open_caps;
8322   capitem.flag = FALSE;
8323   cd->open_caps = &capitem;
8324   }
8325 
8326 /* Offset is set zero to mark that this bracket is still open */
8327 
8328 PUT(code, 1, 0);
8329 code += 1 + LINK_SIZE + skipbytes;
8330 
8331 /* Loop for each alternative branch */
8332 
8333 orig_bracount = max_bracount = cd->bracount;
8334 for (;;)
8335   {
8336   /* For a (?| group, reset the capturing bracket count so that each branch
8337   uses the same numbers. */
8338 
8339   if (reset_bracount) cd->bracount = orig_bracount;
8340 
8341   /* Set up dummy OP_REVERSE if lookbehind assertion */
8342 
8343   if (lookbehind)
8344     {
8345     *code++ = OP_REVERSE;
8346     reverse_count = code;
8347     PUTINC(code, 0, 0);
8348     length += 1 + LINK_SIZE;
8349     }
8350 
8351   /* Now compile the branch; in the pre-compile phase its length gets added
8352   into the length. */
8353 
8354   if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstchar,
8355         &branchfirstcharflags, &branchreqchar, &branchreqcharflags, &bc,
8356         cond_depth, cd, (lengthptr == NULL)? NULL : &length))
8357     {
8358     *ptrptr = ptr;
8359     return FALSE;
8360     }
8361 
8362   /* Keep the highest bracket count in case (?| was used and some branch
8363   has fewer than the rest. */
8364 
8365   if (cd->bracount > max_bracount) max_bracount = cd->bracount;
8366 
8367   /* In the real compile phase, there is some post-processing to be done. */
8368 
8369   if (lengthptr == NULL)
8370     {
8371     /* If this is the first branch, the firstchar and reqchar values for the
8372     branch become the values for the regex. */
8373 
8374     if (*last_branch != OP_ALT)
8375       {
8376       firstchar = branchfirstchar;
8377       firstcharflags = branchfirstcharflags;
8378       reqchar = branchreqchar;
8379       reqcharflags = branchreqcharflags;
8380       }
8381 
8382     /* If this is not the first branch, the first char and reqchar have to
8383     match the values from all the previous branches, except that if the
8384     previous value for reqchar didn't have REQ_VARY set, it can still match,
8385     and we set REQ_VARY for the regex. */
8386 
8387     else
8388       {
8389       /* If we previously had a firstchar, but it doesn't match the new branch,
8390       we have to abandon the firstchar for the regex, but if there was
8391       previously no reqchar, it takes on the value of the old firstchar. */
8392 
8393       if (firstcharflags >= 0 &&
8394           (firstcharflags != branchfirstcharflags || firstchar != branchfirstchar))
8395         {
8396         if (reqcharflags < 0)
8397           {
8398           reqchar = firstchar;
8399           reqcharflags = firstcharflags;
8400           }
8401         firstcharflags = REQ_NONE;
8402         }
8403 
8404       /* If we (now or from before) have no firstchar, a firstchar from the
8405       branch becomes a reqchar if there isn't a branch reqchar. */
8406 
8407       if (firstcharflags < 0 && branchfirstcharflags >= 0 && branchreqcharflags < 0)
8408         {
8409         branchreqchar = branchfirstchar;
8410         branchreqcharflags = branchfirstcharflags;
8411         }
8412 
8413       /* Now ensure that the reqchars match */
8414 
8415       if (((reqcharflags & ~REQ_VARY) != (branchreqcharflags & ~REQ_VARY)) ||
8416           reqchar != branchreqchar)
8417         reqcharflags = REQ_NONE;
8418       else
8419         {
8420         reqchar = branchreqchar;
8421         reqcharflags |= branchreqcharflags; /* To "or" REQ_VARY */
8422         }
8423       }
8424 
8425     /* If lookbehind, check that this branch matches a fixed-length string, and
8426     put the length into the OP_REVERSE item. Temporarily mark the end of the
8427     branch with OP_END. If the branch contains OP_RECURSE, the result is -3
8428     because there may be forward references that we can't check here. Set a
8429     flag to cause another lookbehind check at the end. Why not do it all at the
8430     end? Because common, erroneous checks are picked up here and the offset of
8431     the problem can be shown. */
8432 
8433     if (lookbehind)
8434       {
8435       int fixed_length;
8436       *code = OP_END;
8437       fixed_length = find_fixedlength(last_branch,  (options & PCRE_UTF8) != 0,
8438         FALSE, cd, NULL);
8439       DPRINTF(("fixed length = %d\n", fixed_length));
8440       if (fixed_length == -3)
8441         {
8442         cd->check_lookbehind = TRUE;
8443         }
8444       else if (fixed_length < 0)
8445         {
8446         *errorcodeptr = (fixed_length == -2)? ERR36 :
8447                         (fixed_length == -4)? ERR70: ERR25;
8448         *ptrptr = ptr;
8449         return FALSE;
8450         }
8451       else
8452         {
8453         if (fixed_length > cd->max_lookbehind)
8454           cd->max_lookbehind = fixed_length;
8455         PUT(reverse_count, 0, fixed_length);
8456         }
8457       }
8458     }
8459 
8460   /* Reached end of expression, either ')' or end of pattern. In the real
8461   compile phase, go back through the alternative branches and reverse the chain
8462   of offsets, with the field in the BRA item now becoming an offset to the
8463   first alternative. If there are no alternatives, it points to the end of the
8464   group. The length in the terminating ket is always the length of the whole
8465   bracketed item. Return leaving the pointer at the terminating char. */
8466 
8467   if (*ptr != CHAR_VERTICAL_LINE)
8468     {
8469     if (lengthptr == NULL)
8470       {
8471       int branch_length = (int)(code - last_branch);
8472       do
8473         {
8474         int prev_length = GET(last_branch, 1);
8475         PUT(last_branch, 1, branch_length);
8476         branch_length = prev_length;
8477         last_branch -= branch_length;
8478         }
8479       while (branch_length > 0);
8480       }
8481 
8482     /* Fill in the ket */
8483 
8484     *code = OP_KET;
8485     PUT(code, 1, (int)(code - start_bracket));
8486     code += 1 + LINK_SIZE;
8487 
8488     /* If it was a capturing subpattern, check to see if it contained any
8489     recursive back references. If so, we must wrap it in atomic brackets.
8490     Because we are moving code along, we must ensure that any pending recursive
8491     references are updated. In any event, remove the block from the chain. */
8492 
8493     if (capnumber > 0)
8494       {
8495       if (cd->open_caps->flag)
8496         {
8497         *code = OP_END;
8498         adjust_recurse(start_bracket, 1 + LINK_SIZE,
8499           (options & PCRE_UTF8) != 0, cd, save_hwm_offset);
8500         memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
8501           IN_UCHARS(code - start_bracket));
8502         *start_bracket = OP_ONCE;
8503         code += 1 + LINK_SIZE;
8504         PUT(start_bracket, 1, (int)(code - start_bracket));
8505         *code = OP_KET;
8506         PUT(code, 1, (int)(code - start_bracket));
8507         code += 1 + LINK_SIZE;
8508         length += 2 + 2*LINK_SIZE;
8509         }
8510       cd->open_caps = cd->open_caps->next;
8511       }
8512 
8513     /* Retain the highest bracket number, in case resetting was used. */
8514 
8515     cd->bracount = max_bracount;
8516 
8517     /* Set values to pass back */
8518 
8519     *codeptr = code;
8520     *ptrptr = ptr;
8521     *firstcharptr = firstchar;
8522     *firstcharflagsptr = firstcharflags;
8523     *reqcharptr = reqchar;
8524     *reqcharflagsptr = reqcharflags;
8525     if (lengthptr != NULL)
8526       {
8527       if (OFLOW_MAX - *lengthptr < length)
8528         {
8529         *errorcodeptr = ERR20;
8530         return FALSE;
8531         }
8532       *lengthptr += length;
8533       }
8534     return TRUE;
8535     }
8536 
8537   /* Another branch follows. In the pre-compile phase, we can move the code
8538   pointer back to where it was for the start of the first branch. (That is,
8539   pretend that each branch is the only one.)
8540 
8541   In the real compile phase, insert an ALT node. Its length field points back
8542   to the previous branch while the bracket remains open. At the end the chain
8543   is reversed. It's done like this so that the start of the bracket has a
8544   zero offset until it is closed, making it possible to detect recursion. */
8545 
8546   if (lengthptr != NULL)
8547     {
8548     code = *codeptr + 1 + LINK_SIZE + skipbytes;
8549     length += 1 + LINK_SIZE;
8550     }
8551   else
8552     {
8553     *code = OP_ALT;
8554     PUT(code, 1, (int)(code - last_branch));
8555     bc.current_branch = last_branch = code;
8556     code += 1 + LINK_SIZE;
8557     }
8558 
8559   ptr++;
8560   }
8561 /* Control never reaches here */
8562 }
8563 
8564 
8565 
8566 
8567 /*************************************************
8568 *          Check for anchored expression         *
8569 *************************************************/
8570 
8571 /* Try to find out if this is an anchored regular expression. Consider each
8572 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8573 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8574 it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8575 be found, because ^ generates OP_CIRCM in that mode.
8576 
8577 We can also consider a regex to be anchored if OP_SOM starts all its branches.
8578 This is the code for \G, which means "match at start of match position, taking
8579 into account the match offset".
8580 
8581 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8582 because that will try the rest of the pattern at all possible matching points,
8583 so there is no point trying again.... er ....
8584 
8585 .... except when the .* appears inside capturing parentheses, and there is a
8586 subsequent back reference to those parentheses. We haven't enough information
8587 to catch that case precisely.
8588 
8589 At first, the best we could do was to detect when .* was in capturing brackets
8590 and the highest back reference was greater than or equal to that level.
8591 However, by keeping a bitmap of the first 31 back references, we can catch some
8592 of the more common cases more precisely.
8593 
8594 ... A second exception is when the .* appears inside an atomic group, because
8595 this prevents the number of characters it matches from being adjusted.
8596 
8597 Arguments:
8598   code           points to start of expression (the bracket)
8599   bracket_map    a bitmap of which brackets we are inside while testing; this
8600                   handles up to substring 31; after that we just have to take
8601                   the less precise approach
8602   cd             points to the compile data block
8603   atomcount      atomic group level
8604 
8605 Returns:     TRUE or FALSE
8606 */
8607 
8608 static BOOL
is_anchored(register const pcre_uchar * code,unsigned int bracket_map,compile_data * cd,int atomcount)8609 is_anchored(register const pcre_uchar *code, unsigned int bracket_map,
8610   compile_data *cd, int atomcount)
8611 {
8612 do {
8613    const pcre_uchar *scode = first_significant_code(
8614      code + PRIV(OP_lengths)[*code], FALSE);
8615    register int op = *scode;
8616 
8617    /* Non-capturing brackets */
8618 
8619    if (op == OP_BRA  || op == OP_BRAPOS ||
8620        op == OP_SBRA || op == OP_SBRAPOS)
8621      {
8622      if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8623      }
8624 
8625    /* Capturing brackets */
8626 
8627    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8628             op == OP_SCBRA || op == OP_SCBRAPOS)
8629      {
8630      int n = GET2(scode, 1+LINK_SIZE);
8631      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
8632      if (!is_anchored(scode, new_map, cd, atomcount)) return FALSE;
8633      }
8634 
8635    /* Positive forward assertions and conditions */
8636 
8637    else if (op == OP_ASSERT || op == OP_COND)
8638      {
8639      if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8640      }
8641 
8642    /* Atomic groups */
8643 
8644    else if (op == OP_ONCE || op == OP_ONCE_NC)
8645      {
8646      if (!is_anchored(scode, bracket_map, cd, atomcount + 1))
8647        return FALSE;
8648      }
8649 
8650    /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8651    it isn't in brackets that are or may be referenced or inside an atomic
8652    group. */
8653 
8654    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8655              op == OP_TYPEPOSSTAR))
8656      {
8657      if (scode[1] != OP_ALLANY || (bracket_map & cd->backref_map) != 0 ||
8658          atomcount > 0 || cd->had_pruneorskip)
8659        return FALSE;
8660      }
8661 
8662    /* Check for explicit anchoring */
8663 
8664    else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8665 
8666    code += GET(code, 1);
8667    }
8668 while (*code == OP_ALT);   /* Loop for each alternative */
8669 return TRUE;
8670 }
8671 
8672 
8673 
8674 /*************************************************
8675 *         Check for starting with ^ or .*        *
8676 *************************************************/
8677 
8678 /* This is called to find out if every branch starts with ^ or .* so that
8679 "first char" processing can be done to speed things up in multiline
8680 matching and for non-DOTALL patterns that start with .* (which must start at
8681 the beginning or after \n). As in the case of is_anchored() (see above), we
8682 have to take account of back references to capturing brackets that contain .*
8683 because in that case we can't make the assumption. Also, the appearance of .*
8684 inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not
8685 count, because once again the assumption no longer holds.
8686 
8687 Arguments:
8688   code           points to start of expression (the bracket)
8689   bracket_map    a bitmap of which brackets we are inside while testing; this
8690                   handles up to substring 31; after that we just have to take
8691                   the less precise approach
8692   cd             points to the compile data
8693   atomcount      atomic group level
8694 
8695 Returns:         TRUE or FALSE
8696 */
8697 
8698 static BOOL
is_startline(const pcre_uchar * code,unsigned int bracket_map,compile_data * cd,int atomcount)8699 is_startline(const pcre_uchar *code, unsigned int bracket_map,
8700   compile_data *cd, int atomcount)
8701 {
8702 do {
8703    const pcre_uchar *scode = first_significant_code(
8704      code + PRIV(OP_lengths)[*code], FALSE);
8705    register int op = *scode;
8706 
8707    /* If we are at the start of a conditional assertion group, *both* the
8708    conditional assertion *and* what follows the condition must satisfy the test
8709    for start of line. Other kinds of condition fail. Note that there may be an
8710    auto-callout at the start of a condition. */
8711 
8712    if (op == OP_COND)
8713      {
8714      scode += 1 + LINK_SIZE;
8715      if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8716      switch (*scode)
8717        {
8718        case OP_CREF:
8719        case OP_DNCREF:
8720        case OP_RREF:
8721        case OP_DNRREF:
8722        case OP_DEF:
8723        case OP_FAIL:
8724        return FALSE;
8725 
8726        default:     /* Assertion */
8727        if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
8728        do scode += GET(scode, 1); while (*scode == OP_ALT);
8729        scode += 1 + LINK_SIZE;
8730        break;
8731        }
8732      scode = first_significant_code(scode, FALSE);
8733      op = *scode;
8734      }
8735 
8736    /* Non-capturing brackets */
8737 
8738    if (op == OP_BRA  || op == OP_BRAPOS ||
8739        op == OP_SBRA || op == OP_SBRAPOS)
8740      {
8741      if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
8742      }
8743 
8744    /* Capturing brackets */
8745 
8746    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8747             op == OP_SCBRA || op == OP_SCBRAPOS)
8748      {
8749      int n = GET2(scode, 1+LINK_SIZE);
8750      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
8751      if (!is_startline(scode, new_map, cd, atomcount)) return FALSE;
8752      }
8753 
8754    /* Positive forward assertions */
8755 
8756    else if (op == OP_ASSERT)
8757      {
8758      if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
8759      }
8760 
8761    /* Atomic brackets */
8762 
8763    else if (op == OP_ONCE || op == OP_ONCE_NC)
8764      {
8765      if (!is_startline(scode, bracket_map, cd, atomcount + 1)) return FALSE;
8766      }
8767 
8768    /* .* means "start at start or after \n" if it isn't in atomic brackets or
8769    brackets that may be referenced, as long as the pattern does not contain
8770    *PRUNE or *SKIP, because these break the feature. Consider, for example,
8771    /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the
8772    start of a line. */
8773 
8774    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8775      {
8776      if (scode[1] != OP_ANY || (bracket_map & cd->backref_map) != 0 ||
8777          atomcount > 0 || cd->had_pruneorskip)
8778        return FALSE;
8779      }
8780 
8781    /* Check for explicit circumflex; anything else gives a FALSE result. Note
8782    in particular that this includes atomic brackets OP_ONCE and OP_ONCE_NC
8783    because the number of characters matched by .* cannot be adjusted inside
8784    them. */
8785 
8786    else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8787 
8788    /* Move on to the next alternative */
8789 
8790    code += GET(code, 1);
8791    }
8792 while (*code == OP_ALT);  /* Loop for each alternative */
8793 return TRUE;
8794 }
8795 
8796 
8797 
8798 /*************************************************
8799 *       Check for asserted fixed first char      *
8800 *************************************************/
8801 
8802 /* During compilation, the "first char" settings from forward assertions are
8803 discarded, because they can cause conflicts with actual literals that follow.
8804 However, if we end up without a first char setting for an unanchored pattern,
8805 it is worth scanning the regex to see if there is an initial asserted first
8806 char. If all branches start with the same asserted char, or with a
8807 non-conditional bracket all of whose alternatives start with the same asserted
8808 char (recurse ad lib), then we return that char, with the flags set to zero or
8809 REQ_CASELESS; otherwise return zero with REQ_NONE in the flags.
8810 
8811 Arguments:
8812   code       points to start of expression (the bracket)
8813   flags      points to the first char flags, or to REQ_NONE
8814   inassert   TRUE if in an assertion
8815 
8816 Returns:     the fixed first char, or 0 with REQ_NONE in flags
8817 */
8818 
8819 static pcre_uint32
find_firstassertedchar(const pcre_uchar * code,pcre_int32 * flags,BOOL inassert)8820 find_firstassertedchar(const pcre_uchar *code, pcre_int32 *flags,
8821   BOOL inassert)
8822 {
8823 register pcre_uint32 c = 0;
8824 int cflags = REQ_NONE;
8825 
8826 *flags = REQ_NONE;
8827 do {
8828    pcre_uint32 d;
8829    int dflags;
8830    int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
8831              *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
8832    const pcre_uchar *scode = first_significant_code(code + 1+LINK_SIZE + xl,
8833      TRUE);
8834    register pcre_uchar op = *scode;
8835 
8836    switch(op)
8837      {
8838      default:
8839      return 0;
8840 
8841      case OP_BRA:
8842      case OP_BRAPOS:
8843      case OP_CBRA:
8844      case OP_SCBRA:
8845      case OP_CBRAPOS:
8846      case OP_SCBRAPOS:
8847      case OP_ASSERT:
8848      case OP_ONCE:
8849      case OP_ONCE_NC:
8850      d = find_firstassertedchar(scode, &dflags, op == OP_ASSERT);
8851      if (dflags < 0)
8852        return 0;
8853      if (cflags < 0) { c = d; cflags = dflags; } else if (c != d || cflags != dflags) return 0;
8854      break;
8855 
8856      case OP_EXACT:
8857      scode += IMM2_SIZE;
8858      /* Fall through */
8859 
8860      case OP_CHAR:
8861      case OP_PLUS:
8862      case OP_MINPLUS:
8863      case OP_POSPLUS:
8864      if (!inassert) return 0;
8865      if (cflags < 0) { c = scode[1]; cflags = 0; }
8866        else if (c != scode[1]) return 0;
8867      break;
8868 
8869      case OP_EXACTI:
8870      scode += IMM2_SIZE;
8871      /* Fall through */
8872 
8873      case OP_CHARI:
8874      case OP_PLUSI:
8875      case OP_MINPLUSI:
8876      case OP_POSPLUSI:
8877      if (!inassert) return 0;
8878      if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
8879        else if (c != scode[1]) return 0;
8880      break;
8881      }
8882 
8883    code += GET(code, 1);
8884    }
8885 while (*code == OP_ALT);
8886 
8887 *flags = cflags;
8888 return c;
8889 }
8890 
8891 
8892 
8893 /*************************************************
8894 *     Add an entry to the name/number table      *
8895 *************************************************/
8896 
8897 /* This function is called between compiling passes to add an entry to the
8898 name/number table, maintaining alphabetical order. Checking for permitted
8899 and forbidden duplicates has already been done.
8900 
8901 Arguments:
8902   cd           the compile data block
8903   name         the name to add
8904   length       the length of the name
8905   groupno      the group number
8906 
8907 Returns:       nothing
8908 */
8909 
8910 static void
add_name(compile_data * cd,const pcre_uchar * name,int length,unsigned int groupno)8911 add_name(compile_data *cd, const pcre_uchar *name, int length,
8912   unsigned int groupno)
8913 {
8914 int i;
8915 pcre_uchar *slot = cd->name_table;
8916 
8917 for (i = 0; i < cd->names_found; i++)
8918   {
8919   int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(length));
8920   if (crc == 0 && slot[IMM2_SIZE+length] != 0)
8921     crc = -1; /* Current name is a substring */
8922 
8923   /* Make space in the table and break the loop for an earlier name. For a
8924   duplicate or later name, carry on. We do this for duplicates so that in the
8925   simple case (when ?(| is not used) they are in order of their numbers. In all
8926   cases they are in the order in which they appear in the pattern. */
8927 
8928   if (crc < 0)
8929     {
8930     memmove(slot + cd->name_entry_size, slot,
8931       IN_UCHARS((cd->names_found - i) * cd->name_entry_size));
8932     break;
8933     }
8934 
8935   /* Continue the loop for a later or duplicate name */
8936 
8937   slot += cd->name_entry_size;
8938   }
8939 
8940 PUT2(slot, 0, groupno);
8941 memcpy(slot + IMM2_SIZE, name, IN_UCHARS(length));
8942 slot[IMM2_SIZE + length] = 0;
8943 cd->names_found++;
8944 }
8945 
8946 
8947 
/*************************************************
*        Compile a Regular Expression            *
*************************************************/

/* This function takes a string and returns a pointer to a block of store
holding a compiled version of the expression. The original API for this
function had no error code return variable; it is retained for backwards
compatibility, and simply calls the newer function, which has a new name,
passing NULL for the error code pointer. One definition is provided for each
code unit width; the one that is compiled is selected by the COMPILE_PCREn
macro.

Arguments:
  pattern       the regular expression
  options       various option bits
  errorcodeptr  pointer to error code variable (pcre_compile2() only)
                  can be NULL if you don't want a code value
  errorptr      pointer to pointer to error text
  erroroffset   ptr offset in pattern where error was detected
  tables        pointer to character tables or NULL

Returns:        pointer to compiled data block, or NULL on error,
                with errorptr and erroroffset set
*/

#if defined COMPILE_PCRE8

PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
pcre_compile(const char *pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}

#elif defined COMPILE_PCRE16

PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
pcre16_compile(PCRE_SPTR16 pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre16_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}

#elif defined COMPILE_PCRE32

PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
pcre32_compile(PCRE_SPTR32 pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre32_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}

#endif
8992 
8993 
8994 #if defined COMPILE_PCRE8
8995 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
pcre_compile2(const char * pattern,int options,int * errorcodeptr,const char ** errorptr,int * erroroffset,const unsigned char * tables)8996 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
8997   const char **errorptr, int *erroroffset, const unsigned char *tables)
8998 #elif defined COMPILE_PCRE16
8999 PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
9000 pcre16_compile2(PCRE_SPTR16 pattern, int options, int *errorcodeptr,
9001   const char **errorptr, int *erroroffset, const unsigned char *tables)
9002 #elif defined COMPILE_PCRE32
9003 PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
9004 pcre32_compile2(PCRE_SPTR32 pattern, int options, int *errorcodeptr,
9005   const char **errorptr, int *erroroffset, const unsigned char *tables)
9006 #endif
9007 {
9008 REAL_PCRE *re;
9009 int length = 1;  /* For final END opcode */
9010 pcre_int32 firstcharflags, reqcharflags;
9011 pcre_uint32 firstchar, reqchar;
9012 pcre_uint32 limit_match = PCRE_UINT32_MAX;
9013 pcre_uint32 limit_recursion = PCRE_UINT32_MAX;
9014 int newline;
9015 int errorcode = 0;
9016 int skipatstart = 0;
9017 BOOL utf;
9018 BOOL never_utf = FALSE;
9019 size_t size;
9020 pcre_uchar *code;
9021 const pcre_uchar *codestart;
9022 const pcre_uchar *ptr;
9023 compile_data compile_block;
9024 compile_data *cd = &compile_block;
9025 
9026 /* This space is used for "compiling" into during the first phase, when we are
9027 computing the amount of memory that is needed. Compiled items are thrown away
9028 as soon as possible, so that a fairly large buffer should be sufficient for
9029 this purpose. The same space is used in the second phase for remembering where
9030 to fill in forward references to subpatterns. That may overflow, in which case
9031 new memory is obtained from malloc(). */
9032 
9033 pcre_uchar cworkspace[COMPILE_WORK_SIZE];
9034 
9035 /* This vector is used for remembering name groups during the pre-compile. In a
9036 similar way to cworkspace, it can be expanded using malloc() if necessary. */
9037 
9038 named_group named_groups[NAMED_GROUP_LIST_SIZE];
9039 
9040 /* Set this early so that early errors get offset 0. */
9041 
9042 ptr = (const pcre_uchar *)pattern;
9043 
9044 /* We can't pass back an error message if errorptr is NULL; I guess the best we
9045 can do is just return NULL, but we can set a code value if there is a code
9046 pointer. */
9047 
9048 if (errorptr == NULL)
9049   {
9050   if (errorcodeptr != NULL) *errorcodeptr = 99;
9051   return NULL;
9052   }
9053 
9054 *errorptr = NULL;
9055 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
9056 
9057 /* However, we can give a message for this error */
9058 
9059 if (erroroffset == NULL)
9060   {
9061   errorcode = ERR16;
9062   goto PCRE_EARLY_ERROR_RETURN2;
9063   }
9064 
9065 *erroroffset = 0;
9066 
9067 /* Set up pointers to the individual character tables */
9068 
9069 if (tables == NULL) tables = PRIV(default_tables);
9070 cd->lcc = tables + lcc_offset;
9071 cd->fcc = tables + fcc_offset;
9072 cd->cbits = tables + cbits_offset;
9073 cd->ctypes = tables + ctypes_offset;
9074 
9075 /* Check that all undefined public option bits are zero */
9076 
9077 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
9078   {
9079   errorcode = ERR17;
9080   goto PCRE_EARLY_ERROR_RETURN;
9081   }
9082 
9083 /* If PCRE_NEVER_UTF is set, remember it. */
9084 
9085 if ((options & PCRE_NEVER_UTF) != 0) never_utf = TRUE;
9086 
9087 /* Check for global one-time settings at the start of the pattern, and remember
9088 the offset for later. */
9089 
9090 cd->external_flags = 0;   /* Initialize here for LIMIT_MATCH/RECURSION */
9091 
9092 while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
9093        ptr[skipatstart+1] == CHAR_ASTERISK)
9094   {
9095   int newnl = 0;
9096   int newbsr = 0;
9097 
9098 /* For completeness and backward compatibility, (*UTFn) is supported in the
9099 relevant libraries, but (*UTF) is generic and always supported. Note that
9100 PCRE_UTF8 == PCRE_UTF16 == PCRE_UTF32. */
9101 
9102 #ifdef COMPILE_PCRE8
9103   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF8_RIGHTPAR, 5) == 0)
9104     { skipatstart += 7; options |= PCRE_UTF8; continue; }
9105 #endif
9106 #ifdef COMPILE_PCRE16
9107   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF16_RIGHTPAR, 6) == 0)
9108     { skipatstart += 8; options |= PCRE_UTF16; continue; }
9109 #endif
9110 #ifdef COMPILE_PCRE32
9111   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF32_RIGHTPAR, 6) == 0)
9112     { skipatstart += 8; options |= PCRE_UTF32; continue; }
9113 #endif
9114 
9115   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 4) == 0)
9116     { skipatstart += 6; options |= PCRE_UTF8; continue; }
9117   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)
9118     { skipatstart += 6; options |= PCRE_UCP; continue; }
9119   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_AUTO_POSSESS_RIGHTPAR, 16) == 0)
9120     { skipatstart += 18; options |= PCRE_NO_AUTO_POSSESS; continue; }
9121   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
9122     { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
9123 
9124   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_MATCH_EQ, 12) == 0)
9125     {
9126     pcre_uint32 c = 0;
9127     int p = skipatstart + 14;
9128     while (isdigit(ptr[p]))
9129       {
9130       if (c > PCRE_UINT32_MAX / 10 - 1) break;   /* Integer overflow */
9131       c = c*10 + ptr[p++] - CHAR_0;
9132       }
9133     if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
9134     if (c < limit_match)
9135       {
9136       limit_match = c;
9137       cd->external_flags |= PCRE_MLSET;
9138       }
9139     skipatstart = p;
9140     continue;
9141     }
9142 
9143   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_RECURSION_EQ, 16) == 0)
9144     {
9145     pcre_uint32 c = 0;
9146     int p = skipatstart + 18;
9147     while (isdigit(ptr[p]))
9148       {
9149       if (c > PCRE_UINT32_MAX / 10 - 1) break;   /* Integer overflow check */
9150       c = c*10 + ptr[p++] - CHAR_0;
9151       }
9152     if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
9153     if (c < limit_recursion)
9154       {
9155       limit_recursion = c;
9156       cd->external_flags |= PCRE_RLSET;
9157       }
9158     skipatstart = p;
9159     continue;
9160     }
9161 
9162   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CR_RIGHTPAR, 3) == 0)
9163     { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
9164   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LF_RIGHTPAR, 3)  == 0)
9165     { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
9166   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CRLF_RIGHTPAR, 5)  == 0)
9167     { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
9168   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANY_RIGHTPAR, 4) == 0)
9169     { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
9170   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANYCRLF_RIGHTPAR, 8) == 0)
9171     { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
9172 
9173   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
9174     { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
9175   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
9176     { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
9177 
9178   if (newnl != 0)
9179     options = (options & ~PCRE_NEWLINE_BITS) | newnl;
9180   else if (newbsr != 0)
9181     options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
9182   else break;
9183   }
9184 
9185 /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
9186 utf = (options & PCRE_UTF8) != 0;
9187 if (utf && never_utf)
9188   {
9189   errorcode = ERR78;
9190   goto PCRE_EARLY_ERROR_RETURN2;
9191   }
9192 
9193 /* Can't support UTF unless PCRE has been compiled to include the code. The
9194 return of an error code from PRIV(valid_utf)() is a new feature, introduced in
9195 release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
9196 not used here. */
9197 
9198 #ifdef SUPPORT_UTF
9199 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&
9200      (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
9201   {
9202 #if defined COMPILE_PCRE8
9203   errorcode = ERR44;
9204 #elif defined COMPILE_PCRE16
9205   errorcode = ERR74;
9206 #elif defined COMPILE_PCRE32
9207   errorcode = ERR77;
9208 #endif
9209   goto PCRE_EARLY_ERROR_RETURN2;
9210   }
9211 #else
9212 if (utf)
9213   {
9214   errorcode = ERR32;
9215   goto PCRE_EARLY_ERROR_RETURN;
9216   }
9217 #endif
9218 
9219 /* Can't support UCP unless PCRE has been compiled to include the code. */
9220 
9221 #ifndef SUPPORT_UCP
9222 if ((options & PCRE_UCP) != 0)
9223   {
9224   errorcode = ERR67;
9225   goto PCRE_EARLY_ERROR_RETURN;
9226   }
9227 #endif
9228 
9229 /* Check validity of \R options. */
9230 
9231 if ((options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) ==
9232      (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
9233   {
9234   errorcode = ERR56;
9235   goto PCRE_EARLY_ERROR_RETURN;
9236   }
9237 
9238 /* Handle different types of newline. The three bits give seven cases. The
9239 current code allows for fixed one- or two-byte sequences, plus "any" and
9240 "anycrlf". */
9241 
9242 switch (options & PCRE_NEWLINE_BITS)
9243   {
9244   case 0: newline = NEWLINE; break;   /* Build-time default */
9245   case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
9246   case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
9247   case PCRE_NEWLINE_CR+
9248        PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
9249   case PCRE_NEWLINE_ANY: newline = -1; break;
9250   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
9251   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
9252   }
9253 
9254 if (newline == -2)
9255   {
9256   cd->nltype = NLTYPE_ANYCRLF;
9257   }
9258 else if (newline < 0)
9259   {
9260   cd->nltype = NLTYPE_ANY;
9261   }
9262 else
9263   {
9264   cd->nltype = NLTYPE_FIXED;
9265   if (newline > 255)
9266     {
9267     cd->nllen = 2;
9268     cd->nl[0] = (newline >> 8) & 255;
9269     cd->nl[1] = newline & 255;
9270     }
9271   else
9272     {
9273     cd->nllen = 1;
9274     cd->nl[0] = newline;
9275     }
9276   }
9277 
9278 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
9279 references to help in deciding whether (.*) can be treated as anchored or not.
9280 */
9281 
9282 cd->top_backref = 0;
9283 cd->backref_map = 0;
9284 
9285 /* Reflect pattern for debugging output */
9286 
9287 DPRINTF(("------------------------------------------------------------------\n"));
9288 #ifdef PCRE_DEBUG
9289 print_puchar(stdout, (PCRE_PUCHAR)pattern);
9290 #endif
9291 DPRINTF(("\n"));
9292 
9293 /* Pretend to compile the pattern while actually just accumulating the length
9294 of memory required. This behaviour is triggered by passing a non-NULL final
9295 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
9296 to compile parts of the pattern into; the compiled code is discarded when it is
9297 no longer needed, so hopefully this workspace will never overflow, though there
9298 is a test for its doing so. */
9299 
9300 cd->bracount = cd->final_bracount = 0;
9301 cd->names_found = 0;
9302 cd->name_entry_size = 0;
9303 cd->name_table = NULL;
9304 cd->dupnames = FALSE;
9305 cd->dupgroups = FALSE;
9306 cd->namedrefcount = 0;
9307 cd->start_code = cworkspace;
9308 cd->hwm = cworkspace;
9309 cd->iscondassert = FALSE;
9310 cd->start_workspace = cworkspace;
9311 cd->workspace_size = COMPILE_WORK_SIZE;
9312 cd->named_groups = named_groups;
9313 cd->named_group_list_size = NAMED_GROUP_LIST_SIZE;
9314 cd->start_pattern = (const pcre_uchar *)pattern;
9315 cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
9316 cd->req_varyopt = 0;
9317 cd->parens_depth = 0;
9318 cd->assert_depth = 0;
9319 cd->max_lookbehind = 0;
9320 cd->external_options = options;
9321 cd->open_caps = NULL;
9322 
9323 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
9324 don't need to look at the result of the function here. The initial options have
9325 been put into the cd block so that they can be changed if an option setting is
9326 found within the regex right at the beginning. Bringing initial option settings
9327 outside can help speed up starting point checks. */
9328 
9329 ptr += skipatstart;
9330 code = cworkspace;
9331 *code = OP_BRA;
9332 
9333 (void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
9334   FALSE, 0, 0, &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL,
9335   cd, &length);
9336 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
9337 
9338 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
9339   (int)(cd->hwm - cworkspace)));
9340 
9341 if (length > MAX_PATTERN_SIZE)
9342   {
9343   errorcode = ERR20;
9344   goto PCRE_EARLY_ERROR_RETURN;
9345   }
9346 
9347 /* Compute the size of the data block for storing the compiled pattern. Integer
9348 overflow should no longer be possible because nowadays we limit the maximum
9349 value of cd->names_found and cd->name_entry_size. */
9350 
9351 size = sizeof(REAL_PCRE) +
9352   (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
9353 
9354 /* Get the memory. */
9355 
9356 re = (REAL_PCRE *)(PUBL(malloc))(size);
9357 if (re == NULL)
9358   {
9359   errorcode = ERR21;
9360   goto PCRE_EARLY_ERROR_RETURN;
9361   }
9362 
9363 /* Put in the magic number, and save the sizes, initial options, internal
9364 flags, and character table pointer. NULL is used for the default character
9365 tables. The nullpad field is at the end; it's there to help in the case when a
9366 regex compiled on a system with 4-byte pointers is run on another with 8-byte
9367 pointers. */
9368 
9369 re->magic_number = MAGIC_NUMBER;
9370 re->size = (int)size;
9371 re->options = cd->external_options;
9372 re->flags = cd->external_flags;
9373 re->limit_match = limit_match;
9374 re->limit_recursion = limit_recursion;
9375 re->first_char = 0;
9376 re->req_char = 0;
9377 re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar);
9378 re->name_entry_size = cd->name_entry_size;
9379 re->name_count = cd->names_found;
9380 re->ref_count = 0;
9381 re->tables = (tables == PRIV(default_tables))? NULL : tables;
9382 re->nullpad = NULL;
9383 #ifdef COMPILE_PCRE32
9384 re->dummy = 0;
9385 #else
9386 re->dummy1 = re->dummy2 = re->dummy3 = 0;
9387 #endif
9388 
9389 /* The starting points of the name/number translation table and of the code are
9390 passed around in the compile data block. The start/end pattern and initial
9391 options are already set from the pre-compile phase, as is the name_entry_size
9392 field. Reset the bracket count and the names_found field. Also reset the hwm
9393 field; this time it's used for remembering forward references to subpatterns.
9394 */
9395 
9396 cd->final_bracount = cd->bracount;  /* Save for checking forward references */
9397 cd->parens_depth = 0;
9398 cd->assert_depth = 0;
9399 cd->bracount = 0;
9400 cd->max_lookbehind = 0;
9401 cd->name_table = (pcre_uchar *)re + re->name_table_offset;
9402 codestart = cd->name_table + re->name_entry_size * re->name_count;
9403 cd->start_code = codestart;
9404 cd->hwm = (pcre_uchar *)(cd->start_workspace);
9405 cd->iscondassert = FALSE;
9406 cd->req_varyopt = 0;
9407 cd->had_accept = FALSE;
9408 cd->had_pruneorskip = FALSE;
9409 cd->check_lookbehind = FALSE;
9410 cd->open_caps = NULL;
9411 
9412 /* If any named groups were found, create the name/number table from the list
9413 created in the first pass. */
9414 
9415 if (cd->names_found > 0)
9416   {
9417   int i = cd->names_found;
9418   named_group *ng = cd->named_groups;
9419   cd->names_found = 0;
9420   for (; i > 0; i--, ng++)
9421     add_name(cd, ng->name, ng->length, ng->number);
9422   if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
9423     (PUBL(free))((void *)cd->named_groups);
9424   }
9425 
9426 /* Set up a starting, non-extracting bracket, then compile the expression. On
9427 error, errorcode will be set non-zero, so we don't need to look at the result
9428 of the function here. */
9429 
9430 ptr = (const pcre_uchar *)pattern + skipatstart;
9431 code = (pcre_uchar *)codestart;
9432 *code = OP_BRA;
9433 (void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, 0, 0,
9434   &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL, cd, NULL);
9435 re->top_bracket = cd->bracount;
9436 re->top_backref = cd->top_backref;
9437 re->max_lookbehind = cd->max_lookbehind;
9438 re->flags = cd->external_flags | PCRE_MODE;
9439 
9440 if (cd->had_accept)
9441   {
9442   reqchar = 0;              /* Must disable after (*ACCEPT) */
9443   reqcharflags = REQ_NONE;
9444   }
9445 
9446 /* If not reached end of pattern on success, there's an excess bracket. */
9447 
9448 if (errorcode == 0 && *ptr != CHAR_NULL) errorcode = ERR22;
9449 
9450 /* Fill in the terminating state and check for disastrous overflow, but
9451 if debugging, leave the test till after things are printed out. */
9452 
9453 *code++ = OP_END;
9454 
9455 #ifndef PCRE_DEBUG
9456 if (code - codestart > length) errorcode = ERR23;
9457 #endif
9458 
9459 #ifdef SUPPORT_VALGRIND
9460 /* If the estimated length exceeds the really used length, mark the extra
9461 allocated memory as unaddressable, so that any out-of-bound reads can be
9462 detected. */
9463 VALGRIND_MAKE_MEM_NOACCESS(code, (length - (code - codestart)) * sizeof(pcre_uchar));
9464 #endif
9465 
9466 /* Fill in any forward references that are required. There may be repeated
9467 references; optimize for them, as searching a large regex takes time. */
9468 
9469 if (cd->hwm > cd->start_workspace)
9470   {
9471   int prev_recno = -1;
9472   const pcre_uchar *groupptr = NULL;
9473   while (errorcode == 0 && cd->hwm > cd->start_workspace)
9474     {
9475     int offset, recno;
9476     cd->hwm -= LINK_SIZE;
9477     offset = GET(cd->hwm, 0);
9478 
9479     /* Check that the hwm handling hasn't gone wrong. This whole area is
9480     rewritten in PCRE2 because there are some obscure cases. */
9481 
9482     if (offset == 0 || codestart[offset-1] != OP_RECURSE)
9483       {
9484       errorcode = ERR10;
9485       break;
9486       }
9487 
9488     recno = GET(codestart, offset);
9489     if (recno != prev_recno)
9490       {
9491       groupptr = PRIV(find_bracket)(codestart, utf, recno);
9492       prev_recno = recno;
9493       }
9494     if (groupptr == NULL) errorcode = ERR53;
9495       else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));
9496     }
9497   }
9498 
9499 /* If the workspace had to be expanded, free the new memory. Set the pointer to
9500 NULL to indicate that forward references have been filled in. */
9501 
9502 if (cd->workspace_size > COMPILE_WORK_SIZE)
9503   (PUBL(free))((void *)cd->start_workspace);
9504 cd->start_workspace = NULL;
9505 
9506 /* Give an error if there's a back reference to a non-existent capturing
9507 subpattern. */
9508 
9509 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
9510 
9511 /* Unless disabled, check whether any single character iterators can be
9512 auto-possessified. The function overwrites the appropriate opcode values, so
9513 the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
9514 used in this code because at least one compiler gives a warning about loss of
9515 "const" attribute if the cast (pcre_uchar *)codestart is used directly in the
9516 function call. */
9517 
9518 if (errorcode == 0 && (options & PCRE_NO_AUTO_POSSESS) == 0)
9519   {
9520   pcre_uchar *temp = (pcre_uchar *)codestart;
9521   auto_possessify(temp, utf, cd);
9522   }
9523 
9524 /* If there were any lookbehind assertions that contained OP_RECURSE
9525 (recursions or subroutine calls), a flag is set for them to be checked here,
9526 because they may contain forward references. Actual recursions cannot be fixed
9527 length, but subroutine calls can. It is done like this so that those without
9528 OP_RECURSE that are not fixed length get a diagnostic with a useful offset. The
9529 exceptional ones forgo this. We scan the pattern to check that they are fixed
9530 length, and set their lengths. */
9531 
9532 if (errorcode == 0 && cd->check_lookbehind)
9533   {
9534   pcre_uchar *cc = (pcre_uchar *)codestart;
9535 
9536   /* Loop, searching for OP_REVERSE items, and process those that do not have
9537   their length set. (Actually, it will also re-process any that have a length
9538   of zero, but that is a pathological case, and it does no harm.) When we find
9539   one, we temporarily terminate the branch it is in while we scan it. */
9540 
9541   for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf, -1);
9542        cc != NULL;
9543        cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf, -1))
9544     {
9545     if (GET(cc, 1) == 0)
9546       {
9547       int fixed_length;
9548       pcre_uchar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
9549       int end_op = *be;
9550       *be = OP_END;
9551       fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != 0, TRUE,
9552         cd, NULL);
9553       *be = end_op;
9554       DPRINTF(("fixed length = %d\n", fixed_length));
9555       if (fixed_length < 0)
9556         {
9557         errorcode = (fixed_length == -2)? ERR36 :
9558                     (fixed_length == -4)? ERR70 : ERR25;
9559         break;
9560         }
9561       if (fixed_length > cd->max_lookbehind) cd->max_lookbehind = fixed_length;
9562       PUT(cc, 1, fixed_length);
9563       }
9564     cc += 1 + LINK_SIZE;
9565     }
9566   }
9567 
9568 /* Failed to compile, or error while post-processing */
9569 
9570 if (errorcode != 0)
9571   {
9572   (PUBL(free))(re);
9573   PCRE_EARLY_ERROR_RETURN:
9574   *erroroffset = (int)(ptr - (const pcre_uchar *)pattern);
9575   PCRE_EARLY_ERROR_RETURN2:
9576   *errorptr = find_error_text(errorcode);
9577   if (errorcodeptr != NULL) *errorcodeptr = errorcode;
9578   return NULL;
9579   }
9580 
9581 /* If the anchored option was not passed, set the flag if we can determine that
9582 the pattern is anchored by virtue of ^ characters or \A or anything else, such
9583 as starting with non-atomic .* when DOTALL is set and there are no occurrences
9584 of *PRUNE or *SKIP.
9585 
9586 Otherwise, if we know what the first byte has to be, save it, because that
9587 speeds up unanchored matches no end. If not, see if we can set the
9588 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
9589 start with ^, and also when all branches start with non-atomic .* for
9590 non-DOTALL matches when *PRUNE and *SKIP are not present. */
9591 
9592 if ((re->options & PCRE_ANCHORED) == 0)
9593   {
9594   if (is_anchored(codestart, 0, cd, 0)) re->options |= PCRE_ANCHORED;
9595   else
9596     {
9597     if (firstcharflags < 0)
9598       firstchar = find_firstassertedchar(codestart, &firstcharflags, FALSE);
9599     if (firstcharflags >= 0)   /* Remove caseless flag for non-caseable chars */
9600       {
9601 #if defined COMPILE_PCRE8
9602       re->first_char = firstchar & 0xff;
9603 #elif defined COMPILE_PCRE16
9604       re->first_char = firstchar & 0xffff;
9605 #elif defined COMPILE_PCRE32
9606       re->first_char = firstchar;
9607 #endif
9608       if ((firstcharflags & REQ_CASELESS) != 0)
9609         {
9610 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
9611         /* We ignore non-ASCII first chars in 8 bit mode. */
9612         if (utf)
9613           {
9614           if (re->first_char < 128)
9615             {
9616             if (cd->fcc[re->first_char] != re->first_char)
9617               re->flags |= PCRE_FCH_CASELESS;
9618             }
9619           else if (UCD_OTHERCASE(re->first_char) != re->first_char)
9620             re->flags |= PCRE_FCH_CASELESS;
9621           }
9622         else
9623 #endif
9624         if (MAX_255(re->first_char)
9625             && cd->fcc[re->first_char] != re->first_char)
9626           re->flags |= PCRE_FCH_CASELESS;
9627         }
9628 
9629       re->flags |= PCRE_FIRSTSET;
9630       }
9631 
9632     else if (is_startline(codestart, 0, cd, 0)) re->flags |= PCRE_STARTLINE;
9633     }
9634   }
9635 
9636 /* For an anchored pattern, we use the "required byte" only if it follows a
9637 variable length item in the regex. Remove the caseless flag for non-caseable
9638 bytes. */
9639 
9640 if (reqcharflags >= 0 &&
9641      ((re->options & PCRE_ANCHORED) == 0 || (reqcharflags & REQ_VARY) != 0))
9642   {
9643 #if defined COMPILE_PCRE8
9644   re->req_char = reqchar & 0xff;
9645 #elif defined COMPILE_PCRE16
9646   re->req_char = reqchar & 0xffff;
9647 #elif defined COMPILE_PCRE32
9648   re->req_char = reqchar;
9649 #endif
9650   if ((reqcharflags & REQ_CASELESS) != 0)
9651     {
9652 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
9653     /* We ignore non-ASCII required chars in 8 bit mode. */
9654     if (utf)
9655       {
9656       if (re->req_char < 128)
9657         {
9658         if (cd->fcc[re->req_char] != re->req_char)
9659           re->flags |= PCRE_RCH_CASELESS;
9660         }
9661       else if (UCD_OTHERCASE(re->req_char) != re->req_char)
9662         re->flags |= PCRE_RCH_CASELESS;
9663       }
9664     else
9665 #endif
9666     if (MAX_255(re->req_char) && cd->fcc[re->req_char] != re->req_char)
9667       re->flags |= PCRE_RCH_CASELESS;
9668     }
9669 
9670   re->flags |= PCRE_REQCHSET;
9671   }
9672 
9673 /* Print out the compiled data if debugging is enabled. This is never the
9674 case when building a production library. */
9675 
9676 #ifdef PCRE_DEBUG
9677 printf("Length = %d top_bracket = %d top_backref = %d\n",
9678   length, re->top_bracket, re->top_backref);
9679 
9680 printf("Options=%08x\n", re->options);
9681 
9682 if ((re->flags & PCRE_FIRSTSET) != 0)
9683   {
9684   pcre_uchar ch = re->first_char;
9685   const char *caseless =
9686     ((re->flags & PCRE_FCH_CASELESS) == 0)? "" : " (caseless)";
9687   if (PRINTABLE(ch)) printf("First char = %c%s\n", ch, caseless);
9688     else printf("First char = \\x%02x%s\n", ch, caseless);
9689   }
9690 
9691 if ((re->flags & PCRE_REQCHSET) != 0)
9692   {
9693   pcre_uchar ch = re->req_char;
9694   const char *caseless =
9695     ((re->flags & PCRE_RCH_CASELESS) == 0)? "" : " (caseless)";
9696   if (PRINTABLE(ch)) printf("Req char = %c%s\n", ch, caseless);
9697     else printf("Req char = \\x%02x%s\n", ch, caseless);
9698   }
9699 
9700 #if defined COMPILE_PCRE8
9701 pcre_printint((pcre *)re, stdout, TRUE);
9702 #elif defined COMPILE_PCRE16
9703 pcre16_printint((pcre *)re, stdout, TRUE);
9704 #elif defined COMPILE_PCRE32
9705 pcre32_printint((pcre *)re, stdout, TRUE);
9706 #endif
9707 
9708 /* This check is done here in the debugging case so that the code that
9709 was compiled can be seen. */
9710 
9711 if (code - codestart > length)
9712   {
9713   (PUBL(free))(re);
9714   *errorptr = find_error_text(ERR23);
9715   *erroroffset = ptr - (pcre_uchar *)pattern;
9716   if (errorcodeptr != NULL) *errorcodeptr = ERR23;
9717   return NULL;
9718   }
9719 #endif   /* PCRE_DEBUG */
9720 
9721 /* Check for a pattern that can match an empty string, so that this information
9722 can be provided to applications. */
9723 
9724 do
9725   {
9726   if (could_be_empty_branch(codestart, code, utf, cd, NULL))
9727     {
9728     re->flags |= PCRE_MATCH_EMPTY;
9729     break;
9730     }
9731   codestart += GET(codestart, 1);
9732   }
9733 while (*codestart == OP_ALT);
9734 
9735 #if defined COMPILE_PCRE8
9736 return (pcre *)re;
9737 #elif defined COMPILE_PCRE16
9738 return (pcre16 *)re;
9739 #elif defined COMPILE_PCRE32
9740 return (pcre32 *)re;
9741 #endif
9742 }
9743 
9744 /* End of pcre_compile.c */
9745