xref: /PHP-5.4/ext/pcre/pcrelib/pcre_compile.c (revision 95fa7279)
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9            Copyright (c) 1997-2014 University of Cambridge
10 
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14 
15     * Redistributions of source code must retain the above copyright notice,
16       this list of conditions and the following disclaimer.
17 
18     * Redistributions in binary form must reproduce the above copyright
19       notice, this list of conditions and the following disclaimer in the
20       documentation and/or other materials provided with the distribution.
21 
22     * Neither the name of the University of Cambridge nor the names of its
23       contributors may be used to endorse or promote products derived from
24       this software without specific prior written permission.
25 
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39 
40 
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43 
44 
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48 
49 #define NLBLOCK cd             /* Name of the block containing newline information */
50 #define PSSTART start_pattern  /* Name of the field containing the pattern start */
51 #define PSEND   end_pattern    /* Name of the field containing the pattern end */
52 
53 #include "pcre_internal.h"
54 
55 
56 /* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
57 is also used by pcretest. PCRE_DEBUG is not defined when building a production
58 library. We do not need to select pcre16_printint.c specially, because the
59 COMPILE_PCREx macro will already be appropriately set. */
60 
61 #ifdef PCRE_DEBUG
62 /* pcre_printint.c should not include any headers when it is pulled in here;
presumably PCRE_INCLUDED is how that file detects this case - confirm there. */
63 #define PCRE_INCLUDED
64 #include "pcre_printint.c"   /* textual inclusion of the source file */
65 #undef PCRE_INCLUDED
66 #endif  /* PCRE_DEBUG */
67 
68 
/* Macro for setting individual bits in class bitmaps. Sets bit number (b) in
the byte array (a): byte index is b/8 and the bit within that byte is b&7, with
bit 0 the least significant. The array argument and the whole expansion are
parenthesized so that the macro remains correct when (a) is an expression such
as "map + 1" and when the macro is used inside a larger expression. Note that
(b) is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)&7)))
72 
73 /* Maximum length value to check against when making sure that the integer that
74 holds the compiled pattern length does not overflow. We make it a bit less than
75 INT_MAX to allow for adding in group terminating bytes, so that we don't have
76 to check them every time. */
77 
78 #define OFLOW_MAX (INT_MAX - 20)  /* 20 = slack for group terminating bytes */
79 
80 /* Forward declarations (prototypes only) to allow mutual recursion */
81 
82 static int
83   add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
84     const pcre_uint32 *, unsigned int);
85 
86 static BOOL
87   compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
88     pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
89     compile_data *, int *);
90 
91 
92 
93 /*************************************************
94 *      Code parameters and static tables         *
95 *************************************************/
96 
97 /* This value specifies the size of stack workspace that is used during the
98 first pre-compile phase that determines how much memory is required. The regex
99 is partly compiled into this space, but the compiled parts are discarded as
100 soon as they can be, so that hopefully there will never be an overrun. The code
101 does, however, check for an overrun. The largest amount I've seen used is 218,
102 so this number is very generous.
103 
104 The same workspace is used during the second, actual compile phase for
105 remembering forward references to groups so that they can be filled in at the
106 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
107 is 4 there is plenty of room for most patterns. However, the memory can get
108 filled up by repetitions of forward references, for example patterns like
109 /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
110 that the workspace is expanded using malloc() in this situation. The value
111 below is therefore a minimum, and we put a maximum on it for safety. The
112 minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
113 kicks in at the same number of forward references in all cases. */
114 
115 #define COMPILE_WORK_SIZE (2048*LINK_SIZE)              /* minimum workspace size */
116 #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)   /* maximum size after malloc() expansion */
117 
118 /* This value determines the size of the initial vector that is used for
119 remembering named groups during the pre-compile. It is allocated on the stack,
120 but if it is too small, it is expanded using malloc(), in a similar way to the
121 workspace. The value is the number of slots in the list. */
122 
123 #define NAMED_GROUP_LIST_SIZE  20   /* initial number of slots (not bytes) */
124 
125 /* The overrun tests check for a slightly smaller size so that they detect the
126 overrun before it actually does run off the end of the data block. */
127 
128 #define WORK_SIZE_SAFETY_MARGIN (100)   /* how much smaller than the real size the overrun tests check for */
129 
130 /* Private flags added to firstchar and reqchar. */
131 
132 #define REQ_CASELESS    (1 << 0)        /* Indicates caselessness */
133 #define REQ_VARY        (1 << 1)        /* Reqchar followed non-literal item */
134 /* Negative values for the firstchar and reqchar flags; being negative they
cannot equal any combination (0..3) of the two bit flags above. */
135 #define REQ_UNSET       (-2)            /* presumably "not yet determined" - confirm at use sites */
136 #define REQ_NONE        (-1)            /* presumably "no such character" - confirm at use sites */
137 
/* Repeated character flags. The constant has type long; its suffix is written
as an upper-case L because a lower-case l is easily misread as the digit 1
(which would make the value look like 0x100000001). */

#define UTF_LENGTH     0x10000000L      /* The char contains its length. */
141 
142 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
143 are simple data values; negative values are for special things like \d and so
144 on. Zero means further processing is needed (for things like \x), or the escape
145 is invalid. */
146 
147 #ifndef EBCDIC
148 
149 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
150 in UTF-8 mode. */
151 
152 static const short int escapes[] = {         /* one entry per character, '0' up to 'z' */
153      0,                       0,              /* 0  1 */
154      0,                       0,              /* 2  3 */
155      0,                       0,              /* 4  5 */
156      0,                       0,              /* 6  7 */
157      0,                       0,              /* 8  9 */
158      CHAR_COLON,              CHAR_SEMICOLON, /* :  ; */
159      CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,   /* <  = */
160      CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK, /* >  ? */
161      CHAR_COMMERCIAL_AT,      -ESC_A,         /* @  A */
162      -ESC_B,                  -ESC_C,         /* B  C */
163      -ESC_D,                  -ESC_E,         /* D  E */
164      0,                       -ESC_G,         /* F  G */
165      -ESC_H,                  0,              /* H  I */
166      0,                       -ESC_K,         /* J  K */
167      0,                       0,              /* L  M */
168      -ESC_N,                  0,              /* N  O */
169      -ESC_P,                  -ESC_Q,         /* P  Q */
170      -ESC_R,                  -ESC_S,         /* R  S */
171      0,                       0,              /* T  U */
172      -ESC_V,                  -ESC_W,         /* V  W */
173      -ESC_X,                  0,              /* X  Y */
174      -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,   /* Z  [ */
175      CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,  /* backslash  ] */
176      CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,            /* ^  _ */
177      CHAR_GRAVE_ACCENT,       7,              /* `  a (data value 7) */
178      -ESC_b,                  0,              /* b  c */
179      -ESC_d,                  ESC_e,          /* d  e */
180      ESC_f,                   0,              /* f  g */
181      -ESC_h,                  0,              /* h  i */
182      0,                       -ESC_k,         /* j  k */
183      0,                       0,              /* l  m */
184      ESC_n,                   0,              /* n  o */
185      -ESC_p,                  0,              /* p  q */
186      ESC_r,                   -ESC_s,         /* r  s */
187      ESC_tee,                 0,              /* t  u */
188      -ESC_v,                  -ESC_w,         /* v  w */
189      0,                       0,              /* x  y */
190      -ESC_z                                   /* z */
191 };
192 
193 #else
194 
195 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
196 
197 static const short int escapes[] = {   /* each row is labelled with the hex code of its first entry */
198 /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
199 /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
200 /*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
201 /*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
202 /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
203 /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
204 /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
205 /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
206 /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
207 /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
208 /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
209 /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
210 /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
211 /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
212 /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
213 /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
214 /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
215 /*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
216 /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
217 /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
218 /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
219 /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
220 /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
221 };
222 #endif
223 
224 
225 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
226 searched linearly. Put all the names into a single string, in order to reduce
227 the number of relocations when a shared library is dynamically linked. The
228 string is built from string macros so that it works in UTF-8 mode on EBCDIC
229 platforms. */
230 
231 typedef struct verbitem {    /* One entry per verb name; used by verbs[] */
232   int   len;                 /* Length of verb name, not counting its terminating zero */
233   int   op;                  /* Op when no arg, or -1 if arg mandatory */
234   int   op_arg;              /* Op when arg present, or -1 if not allowed */
235 } verbitem;
236 
237 static const char verbnames[] =   /* names appear in the same order as the rows of verbs[] */
238   "\0"                       /* Empty name is a shorthand for MARK */
239   STRING_MARK0
240   STRING_ACCEPT0
241   STRING_COMMIT0
242   STRING_F0
243   STRING_FAIL0
244   STRING_PRUNE0
245   STRING_SKIP0
246   STRING_THEN;               /* last name: the only macro used without the 0 suffix */
247 
248 static const verbitem verbs[] = {     /* { len, op, op_arg } for each name in verbnames */
249   { 0, -1,        OP_MARK },          /* empty name: argument mandatory */
250   { 4, -1,        OP_MARK },          /* MARK: argument mandatory */
251   { 6, OP_ACCEPT, -1 },               /* ACCEPT: argument not allowed */
252   { 6, OP_COMMIT, -1 },               /* COMMIT: argument not allowed */
253   { 1, OP_FAIL,   -1 },               /* F: argument not allowed */
254   { 4, OP_FAIL,   -1 },               /* FAIL: argument not allowed */
255   { 5, OP_PRUNE,  OP_PRUNE_ARG },     /* PRUNE: argument optional */
256   { 4, OP_SKIP,   OP_SKIP_ARG  },     /* SKIP: argument optional */
257   { 4, OP_THEN,   OP_THEN_ARG  }      /* THEN: argument optional */
258 };
259 
260 static const int verbcount = sizeof(verbs)/sizeof(verbitem);   /* number of rows in verbs[] */
261 
262 
263 /* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in
264 another regex library. */
265 
266 static const pcre_uchar sub_start_of_word[] = {   /* spells \b(?=\w) */
267   CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
268   CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' };
269 
270 static const pcre_uchar sub_end_of_word[] = {   /* spells \b(?<=\w) */
271   CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
272   CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
273   CHAR_RIGHT_PARENTHESIS, '\0' };
274 
275 
276 /* Tables of names of POSIX character classes and their lengths. The names are
277 now all in a single string, to reduce the number of relocations when a shared
278 library is dynamically loaded. The list of lengths is terminated by a zero
279 length entry. The first three must be alpha, lower, upper, as this is assumed
280 for handling case independence. The indices for graph, print, and punct are
281 needed, so identify them. */
282 
283 static const char posix_names[] =   /* 14 class names in a single string */
284   STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0   /* indices 0-3 */
285   STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0   /* indices 4-7 */
286   STRING_graph0 STRING_print0 STRING_punct0 STRING_space0   /* indices 8-11 */
287   STRING_word0  STRING_xdigit;                              /* indices 12-13 */
288 
289 static const pcre_uint8 posix_name_lengths[] = {   /* same order as posix_names */
290   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };   /* twelve 5-letter names, word = 4, xdigit = 6, 0 ends the list */
291 
292 #define PC_GRAPH  8    /* position of graph in posix_names, counting from 0 */
293 #define PC_PRINT  9    /* position of print in posix_names, counting from 0 */
294 #define PC_PUNCT 10    /* position of punct in posix_names, counting from 0 */
295 
296 
297 /* Table of class bit maps for each POSIX class. Each class is formed from a
298 base map, with an optional addition or removal of another map. Then, for some
299 classes, there is some additional tweaking: for [:blank:] the vertical space
300 characters are removed, and for [:alpha:] and [:alnum:] the underscore
301 character is removed. The triples in the table consist of the base map offset,
302 second map offset or -1 if no second map, and a non-negative value for map
303 addition or a negative value for map subtraction (if there are two maps). The
304 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
305 remove vertical space characters, 2 => remove underscore. */
306 
307 static const int posix_class_maps[] = {    /* triples: base map, second map or -1, add/subtract + tweak */
308   cbit_word,  cbit_digit, -2,             /* alpha - word minus digit, underscore removed */
309   cbit_lower, -1,          0,             /* lower */
310   cbit_upper, -1,          0,             /* upper */
311   cbit_word,  -1,          2,             /* alnum - word without underscore */
312   cbit_print, cbit_cntrl,  0,             /* ascii - print plus cntrl */
313   cbit_space, -1,          1,             /* blank - a GNU extension; space without vertical space */
314   cbit_cntrl, -1,          0,             /* cntrl */
315   cbit_digit, -1,          0,             /* digit */
316   cbit_graph, -1,          0,             /* graph */
317   cbit_print, -1,          0,             /* print */
318   cbit_punct, -1,          0,             /* punct */
319   cbit_space, -1,          0,             /* space */
320   cbit_word,  -1,          0,             /* word - a Perl extension */
321   cbit_xdigit,-1,          0              /* xdigit */
322 };
323 
324 /* Table of substitutes for \d etc when PCRE_UCP is set. They are replaced by
325 Unicode property escapes. */
326 
327 #ifdef SUPPORT_UCP
328 static const pcre_uchar string_PNd[]  = {      /* spells \P{Nd} */
329   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
330   CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
331 static const pcre_uchar string_pNd[]  = {      /* spells \p{Nd} */
332   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
333   CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
334 static const pcre_uchar string_PXsp[] = {      /* spells \P{Xsp} */
335   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
336   CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
337 static const pcre_uchar string_pXsp[] = {      /* spells \p{Xsp} */
338   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
339   CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
340 static const pcre_uchar string_PXwd[] = {      /* spells \P{Xwd} */
341   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
342   CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
343 static const pcre_uchar string_pXwd[] = {      /* spells \p{Xwd} */
344   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
345   CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
346 
347 static const pcre_uchar *substitutes[] = {   /* property-escape replacements, in the order listed */
348   string_PNd,           /* \D */
349   string_pNd,           /* \d */
350   string_PXsp,          /* \S (Xsp is Perl space; from 8.34 Perl space and POSIX space are the same) */
351   string_pXsp,          /* \s (Xsp is Perl space; from 8.34 Perl space and POSIX space are the same) */
352   string_PXwd,          /* \W */
353   string_pXwd           /* \w */
354 };
355 
356 /* The POSIX class substitutes must be in the order of the POSIX class names,
357 defined above, and there are both positive and negative cases. NULL means no
358 general substitute of a Unicode property escape (\p or \P). However, for some
359 POSIX classes (e.g. graph, print, punct) a special property code is compiled
360 directly. */
361 
362 static const pcre_uchar string_pL[] =   {      /* spells \p{L} */
363   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
364   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
365 static const pcre_uchar string_pLl[] =  {      /* spells \p{Ll} */
366   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
367   CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
368 static const pcre_uchar string_pLu[] =  {      /* spells \p{Lu} */
369   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
370   CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
371 static const pcre_uchar string_pXan[] = {      /* spells \p{Xan} */
372   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
373   CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
374 static const pcre_uchar string_h[] =    {      /* spells \h */
375   CHAR_BACKSLASH, CHAR_h, '\0' };
376 static const pcre_uchar string_pXps[] = {      /* spells \p{Xps} */
377   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
378   CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
379 static const pcre_uchar string_PL[] =   {      /* spells \P{L} */
380   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
381   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
382 static const pcre_uchar string_PLl[] =  {      /* spells \P{Ll} */
383   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
384   CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
385 static const pcre_uchar string_PLu[] =  {      /* spells \P{Lu} */
386   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
387   CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
388 static const pcre_uchar string_PXan[] = {      /* spells \P{Xan} */
389   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
390   CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
391 static const pcre_uchar string_H[] =    {      /* spells \H */
392   CHAR_BACKSLASH, CHAR_H, '\0' };
393 static const pcre_uchar string_PXps[] = {      /* spells \P{Xps} */
394   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
395   CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
396 
397 static const pcre_uchar *posix_substitutes[] = {   /* 14 positive entries, then 14 negated entries */
398   string_pL,            /* alpha */
399   string_pLl,           /* lower */
400   string_pLu,           /* upper */
401   string_pXan,          /* alnum */
402   NULL,                 /* ascii */
403   string_h,             /* blank */
404   NULL,                 /* cntrl */
405   string_pNd,           /* digit */
406   NULL,                 /* graph */
407   NULL,                 /* print */
408   NULL,                 /* punct */
409   string_pXps,          /* space (Xps is POSIX space; from 8.34 Perl and POSIX space are the same) */
410   string_pXwd,          /* word  */
411   NULL,                 /* xdigit */
412   /* Negated cases */
413   string_PL,            /* ^alpha */
414   string_PLl,           /* ^lower */
415   string_PLu,           /* ^upper */
416   string_PXan,          /* ^alnum */
417   NULL,                 /* ^ascii */
418   string_H,             /* ^blank */
419   NULL,                 /* ^cntrl */
420   string_PNd,           /* ^digit */
421   NULL,                 /* ^graph */
422   NULL,                 /* ^print */
423   NULL,                 /* ^punct */
424   string_PXps,          /* ^space (Xps is POSIX space; from 8.34 Perl and POSIX space are the same) */
425   string_PXwd,          /* ^word */
426   NULL                  /* ^xdigit */
427 };
428 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))   /* number of entries, positive plus negated */
429 #endif
430 
431 #define STRING(a)  # a          /* stringify the argument exactly as written */
432 #define XSTRING(s) STRING(s)    /* macro-expand the argument first, then stringify it */
433 
434 /* The texts of compile-time error messages. These are "char *" because they
435 are passed to the outside world. Do not ever re-use any error number, because
436 they are documented. Always add a new error instead. Messages marked DEAD below
437 are no longer used. This used to be a table of strings, but in order to reduce
438 the number of relocations needed when a shared library is loaded dynamically,
439 it is now one long string. We cannot use a table of offsets, because the
440 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
441 simply count through to the one we want - this isn't a performance issue
442 because these strings are used only when there is a compilation error.
443 
444 Each substring ends with \0 to insert a null character. This includes the final
445 substring, so that the whole string ends with \0\0, which can be detected when
446 counting through. */
447 
448 static const char error_texts[] =   /* message n is the n-th zero-terminated substring, counting from 0 */
449   "no error\0"
450   "\\ at end of pattern\0"
451   "\\c at end of pattern\0"
452   "unrecognized character follows \\\0"
453   "numbers out of order in {} quantifier\0"
454   /* 5 */
455   "number too big in {} quantifier\0"
456   "missing terminating ] for character class\0"
457   "invalid escape sequence in character class\0"
458   "range out of order in character class\0"
459   "nothing to repeat\0"
460   /* 10 */
461   "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
462   "internal error: unexpected repeat\0"
463   "unrecognized character after (? or (?-\0"
464   "POSIX named classes are supported only within a class\0"
465   "missing )\0"
466   /* 15 */
467   "reference to non-existent subpattern\0"
468   "erroffset passed as NULL\0"
469   "unknown option bit(s) set\0"
470   "missing ) after comment\0"
471   "parentheses nested too deeply\0"  /** DEAD **/
472   /* 20 */
473   "regular expression is too large\0"
474   "failed to get memory\0"
475   "unmatched parentheses\0"
476   "internal error: code overflow\0"
477   "unrecognized character after (?<\0"
478   /* 25 */
479   "lookbehind assertion is not fixed length\0"
480   "malformed number or name after (?(\0"
481   "conditional group contains more than two branches\0"
482   "assertion expected after (?(\0"
483   "(?R or (?[+-]digits must be followed by )\0"
484   /* 30 */
485   "unknown POSIX class name\0"
486   "POSIX collating elements are not supported\0"
487   "this version of PCRE is compiled without UTF support\0"
488   "spare error\0"  /** DEAD **/
489   "character value in \\x{} or \\o{} is too large\0"
490   /* 35 */
491   "invalid condition (?(0)\0"
492   "\\C not allowed in lookbehind assertion\0"
493   "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
494   "number after (?C is > 255\0"
495   "closing ) for (?C expected\0"
496   /* 40 */
497   "recursive call could loop indefinitely\0"
498   "unrecognized character after (?P\0"
499   "syntax error in subpattern name (missing terminator)\0"
500   "two named subpatterns have the same name\0"
501   "invalid UTF-8 string\0"
502   /* 45 */
503   "support for \\P, \\p, and \\X has not been compiled\0"
504   "malformed \\P or \\p sequence\0"
505   "unknown property name after \\P or \\p\0"
506   "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
507   "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
508   /* 50 */
509   "repeated subpattern is too long\0"    /** DEAD **/
510   "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
511   "internal error: overran compiling workspace\0"
512   "internal error: previously-checked referenced subpattern not found\0"
513   "DEFINE group contains more than one branch\0"
514   /* 55 */
515   "repeating a DEFINE group is not allowed\0"  /** DEAD **/
516   "inconsistent NEWLINE options\0"
517   "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
518   "a numbered reference must not be zero\0"
519   "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
520   /* 60 */
521   "(*VERB) not recognized or malformed\0"
522   "number is too big\0"
523   "subpattern name expected\0"
524   "digit expected after (?+\0"
525   "] is an invalid data character in JavaScript compatibility mode\0"
526   /* 65 */
527   "different names for subpatterns of the same number are not allowed\0"
528   "(*MARK) must have an argument\0"
529   "this version of PCRE is not compiled with Unicode property support\0"
530   "\\c must be followed by an ASCII character\0"
531   "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
532   /* 70 */
533   "internal error: unknown opcode in find_fixedlength()\0"
534   "\\N is not supported in a class\0"
535   "too many forward references\0"
536   "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
537   "invalid UTF-16 string\0"
538   /* 75 */
539   "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
540   "character value in \\u.... sequence is too large\0"
541   "invalid UTF-32 string\0"
542   "setting UTF is disabled by the application\0"
543   "non-hex character in \\x{} (closing brace missing?)\0"
544   /* 80 */
545   "non-octal character in \\o{} (closing brace missing?)\0"
546   "missing opening brace after \\o\0"
547   "parentheses are too deeply nested\0"
548   "invalid range in character class\0"
549   "group name must start with a non-digit\0"
550   /* 85 */
551   "parentheses are too deeply nested (stack check)\0"
552   "digits missing in \\x{} or \\o{}\0"
553   ;
554 
555 /* Table to identify digits and hex digits. This is used when compiling
556 patterns. Note that the tables in chartables are dependent on the locale, and
557 may mark arbitrary characters as digits - but the PCRE compiling code expects
558 to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
559 a private table here. It costs 256 bytes, but it is a lot faster than doing
560 character value tests (at least in some simple cases I timed), and in some
561 applications one wants PCRE to compile efficiently as well as match
562 efficiently.
563 
564 For convenience, we use the same bit definitions as in chartables:
565 
566   0x04   decimal digit
567   0x08   hexadecimal digit
568 
569 Then we can use ctype_digit and ctype_xdigit in the code. */
570 
571 /* Using a simple comparison for decimal numbers rather than a memory read
572 is much faster, and the resulting code is simpler (the compiler turns it
573 into a subtraction and unsigned comparison). */
574 
575 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)   /* NB: may evaluate x twice - no side effects in the argument */
576 
577 #ifndef EBCDIC
578 
579 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
580 UTF-8 mode. */
581 
582 static const pcre_uint8 digitab[] =   /* one byte per character code: 0x0c = decimal and hex digit, 0x08 = hex digit only */
583   {
584   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
585   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
586   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
587   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
588   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
589   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
590   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
591   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
592   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
593   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
594   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
595   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
596   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
597   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
598   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
599   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
600   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
601   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
602   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
603   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
604   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
605   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
606   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
607   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
608   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
609   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
610   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
611   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
612   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
613   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
614   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
615   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
616 
617 #else
618 
619 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
620 
621 static const pcre_uint8 digitab[] =   /* one byte per character code: 0x0c = decimal and hex digit, 0x08 = hex digit only */
622   {
623   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
624   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
625   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
626   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
627   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
628   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
629   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
630   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
631   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
632   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
633   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
634   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
635   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
636   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
637   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
638   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
639   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
640   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
641   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
642   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
643   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
644   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
645   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
646   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
647   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
648   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
649   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
650   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
651   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
652   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
653   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
654   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
655 
656 static const pcre_uint8 ebcdic_chartab[] = { /* partial duplicate of the chartables data, one byte per character code; the bit meanings are not defined in this part of the file */
657   0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
658   0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
659   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
660   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
661   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
662   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
663   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
664   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
665   0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
666   0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
667   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
668   0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
669   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
670   0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
671   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
672   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
673   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
674   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
675   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
676   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
677   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
678   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
679   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
680   0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
681   0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
682   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
683   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
684   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
685   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
686   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
687   0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
688   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
689 #endif
690 
691 
692 /* This table is used to check whether auto-possessification is possible
693 between adjacent character-type opcodes. The left-hand (repeated) opcode is
694 used to select the row, and the right-hand opcode is used to select the column.
695 A value of 1 means that auto-possessification is OK. For example, the second
696 value in the first row means that \D+\d can be turned into \D++\d.
697 
698 The Unicode property types (\P and \p) have to be present to fill out the table
699 because of what their opcode values are, but the table values should always be
700 zero because property types are handled separately in the code. The last four
701 columns apply to items that cannot be repeated, so there is no need to have
702 rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
703 *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
704 
/* Table dimensions. There is one row for each opcode from FIRST_AUTOTAB_OP up
to and including LAST_AUTOTAB_LEFT_OP (the items that can be the repeated,
left-hand side), and one column for each opcode from FIRST_AUTOTAB_OP up to and
including LAST_AUTOTAB_RIGHT_OP. */

705 #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
706 #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
707 

/* The initializer supplies 17 rows of 21 values. The column labels are in the
comment on the first line and the row label is at the end of each row. Rows and
columns must stay in opcode order because the table is addressed by opcode
offset from FIRST_AUTOTAB_OP. */

708 static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
709 /* \D \d \S \s \W \w  . .+ \C \P \p \R \H \h \V \v \X \Z \z  $ $M */
710   { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \D */
711   { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \d */
712   { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \S */
713   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \s */
714   { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \W */
715   { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \w */
716   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .  */
717   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .+ */
718   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \C */
719   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \P */
720   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \p */
721   { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \R */
722   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \H */
723   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \h */
724   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \V */
725   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 },  /* \v */
726   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }   /* \X */
727 };
728 
729 
730 /* This table is used to check whether auto-possessification is possible
731 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
732 left-hand (repeated) opcode is used to select the row, and the right-hand
733 opcode is used to select the column. The values are as follows:
734 
735   0   Always return FALSE (never auto-possessify)
736   1   Character groups are distinct (possessify if both are OP_PROP)
737   2   Check character categories in the same group (general or particular)
738   3   TRUE if the two opcodes are not the same (PROP vs NOTPROP)
739 
740   4   Check left general category vs right particular category
741   5   Check right general category vs left particular category
742 
743   6   Left alphanum vs right general category
744   7   Left space vs right general category
745   8   Left word vs right general category
746 
747   9   Right alphanum vs left general category
748  10   Right space vs left general category
749  11   Right word vs left general category
750 
751  12   Left alphanum vs right particular category
752  13   Left space vs right particular category
753  14   Left word vs right particular category
754 
755  15   Right alphanum vs left particular category
756  16   Right space vs left particular category
757  17   Right word vs left particular category
758 */
759 
/* Addressed as propposstab[left property type][right property type]. The
initializer supplies 11 rows of 11 action codes; the column labels are in the
comment on the first line and the row label is at the end of each row.
NOTE(review): PT_TABSIZE is defined elsewhere and is presumably 11, matching
the data below - confirm when adding a property type. */

760 static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {
761 /* ANY LAMP GC  PC  SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
762   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_ANY */
763   { 0,  3,  0,  0,  0,    3,    1,      1,   0,    0,   0 },  /* PT_LAMP */
764   { 0,  0,  2,  4,  0,    9,   10,     10,  11,    0,   0 },  /* PT_GC */
765   { 0,  0,  5,  2,  0,   15,   16,     16,  17,    0,   0 },  /* PT_PC */
766   { 0,  0,  0,  0,  2,    0,    0,      0,   0,    0,   0 },  /* PT_SC */
767   { 0,  3,  6, 12,  0,    3,    1,      1,   0,    0,   0 },  /* PT_ALNUM */
768   { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_SPACE */
769   { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_PXSPACE */
770   { 0,  0,  8, 14,  0,    0,    1,      1,   3,    0,   0 },  /* PT_WORD */
771   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_CLIST */
772   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   3 }   /* PT_UCNC */
773 };
774 
775 /* This table is used to check whether auto-possessification is possible
776 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
777 specifies a general category and the other specifies a particular category. The
778 row is selected by the general category and the column by the particular
779 category. The value is 1 if the particular category is not part of the general
780 category. */
781 
/* 7 rows (general categories C, L, M, N, P, S, Z) by 30 columns (particular
categories, labelled in the comment on the first line). In the data below a 0
appears exactly where the column's first letter equals the row's letter, i.e.
where the particular category belongs to the general category; every other
entry is 1. */

782 static const pcre_uint8 catposstab[7][30] = {
783 /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
784   { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* C */
785   { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* L */
786   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* M */
787   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* N */
788   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },  /* P */
789   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 },  /* S */
790   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 }   /* Z */
791 };
792 
793 /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
794 a general or particular category. The properties in each row are those
795 that apply to the character set in question. Duplication means that a little
796 unnecessary work is done when checking, but this keeps things much simpler
797 because they can all use the same code. For more details see the comment where
798 this table is used.
799 
800 Note: SPACE and PXSPACE used to be different because Perl excluded VT from
801 "space", but from Perl 5.18 it's included, so both categories are treated the
802 same here. */
803 
/* Three rows of four ucp_ property codes: row 0 is ALNUM, row 1 serves both
SPACE and PXSPACE, row 2 is WORD. Judging by the names, the first three values
in each row are general categories and the fourth is a particular category -
NOTE(review): confirm against the code that reads this table. */

804 static const pcre_uint8 posspropstab[3][4] = {
805   { ucp_L, ucp_N, ucp_N, ucp_Nl },  /* ALNUM, 3rd and 4th values redundant */
806   { ucp_Z, ucp_Z, ucp_C, ucp_Cc },  /* SPACE and PXSPACE, 2nd value redundant */
807   { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
808 };
809 
810 /* This table is used when converting repeating opcodes into possessified
811 versions as a result of an explicit possessive quantifier such as ++. A zero
812 value means there is no possessified version - in those cases the item in
813 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
814 because all relevant opcodes are less than that. */
815 
/* Positional table: each entry corresponds to the opcode named in the comment
beside it, so the order must track the opcode numbering. The first 32 entries
are all zero. After that, only the greedy repeat forms (STAR, PLUS, QUERY and
UPTO or RANGE) map to a possessive opcode; the minimizing, exact and
already-possessive forms, and the non-repeat opcodes at the end, are zero. */

816 static const pcre_uint8 opcode_possessify[] = {
817   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
818   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
819 
820   0,                       /* NOTI */
821   OP_POSSTAR, 0,           /* STAR, MINSTAR */
822   OP_POSPLUS, 0,           /* PLUS, MINPLUS */
823   OP_POSQUERY, 0,          /* QUERY, MINQUERY */
824   OP_POSUPTO, 0,           /* UPTO, MINUPTO */
825   0,                       /* EXACT */
826   0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
827 
828   OP_POSSTARI, 0,          /* STARI, MINSTARI */
829   OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
830   OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
831   OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
832   0,                       /* EXACTI */
833   0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
834 
835   OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
836   OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
837   OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
838   OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
839   0,                       /* NOTEXACT */
840   0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
841 
842   OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
843   OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
844   OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
845   OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
846   0,                       /* NOTEXACTI */
847   0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
848 
849   OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
850   OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
851   OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
852   OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
853   0,                       /* TYPEEXACT */
854   0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
855 
856   OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
857   OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
858   OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
859   OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
860   0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
861 
862   0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
863   0, 0,                    /* REF, REFI */
864   0, 0,                    /* DNREF, DNREFI */
865   0, 0                     /* RECURSE, CALLOUT */
866 };
867 
868 
869 
870 /*************************************************
871 *            Find an error text                  *
872 *************************************************/
873 
874 /* The error texts are now all in one long string, to save on relocations. As
875 some of the text is of unknown length, we can't use a table of offsets.
876 Instead, just count through the strings. This is not a performance issue
877 because it happens only when there has been a compilation error.
878 
879 Argument:   the error number
880 Returns:    pointer to the error string
881 */
882 
883 static const char *
find_error_text(int n)884 find_error_text(int n)
885 {
886 const char *s = error_texts;
887 for (; n > 0; n--)
888   {
889   while (*s++ != CHAR_NULL) {};
890   if (*s == CHAR_NULL) return "Error text not found (please report)";
891   }
892 return s;
893 }
894 
895 
896 
897 /*************************************************
898 *           Expand the workspace                 *
899 *************************************************/
900 
901 /* This function is called during the second compiling phase, if the number of
902 forward references fills the existing workspace, which is originally a block on
903 the stack. A larger block is obtained from malloc() unless the ultimate limit
904 has been reached or the increase will be rather small.
905 
906 Argument: pointer to the compile data block
907 Returns:  0 if all went well, else an error number
908 */
909 
910 static int
expand_workspace(compile_data * cd)911 expand_workspace(compile_data *cd)
912 {
913 pcre_uchar *newspace;
914 int newsize = cd->workspace_size * 2;
915 
916 if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
917 if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
918     newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
919  return ERR72;
920 
921 newspace = (PUBL(malloc))(IN_UCHARS(newsize));
922 if (newspace == NULL) return ERR21;
923 memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
924 cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
925 if (cd->workspace_size > COMPILE_WORK_SIZE)
926   (PUBL(free))((void *)cd->start_workspace);
927 cd->start_workspace = newspace;
928 cd->workspace_size = newsize;
929 return 0;
930 }
931 
932 
933 
934 /*************************************************
935 *            Check for counted repeat            *
936 *************************************************/
937 
938 /* This function is called when a '{' is encountered in a place where it might
939 start a quantifier. It looks ahead to see if it really is a quantifier or not.
940 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
941 where the ddds are digits.
942 
943 Arguments:
944   p         pointer to the first char after '{'
945 
946 Returns:    TRUE or FALSE
947 */
948 
949 static BOOL
is_counted_repeat(const pcre_uchar * p)950 is_counted_repeat(const pcre_uchar *p)
951 {
952 if (!IS_DIGIT(*p)) return FALSE;
953 p++;
954 while (IS_DIGIT(*p)) p++;
955 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
956 
957 if (*p++ != CHAR_COMMA) return FALSE;
958 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
959 
960 if (!IS_DIGIT(*p)) return FALSE;
961 p++;
962 while (IS_DIGIT(*p)) p++;
963 
964 return (*p == CHAR_RIGHT_CURLY_BRACKET);
965 }
966 
967 
968 
969 /*************************************************
970 *            Handle escapes                      *
971 *************************************************/
972 
973 /* This function is called when a \ has been encountered. It either returns a
974 positive value for a simple escape such as \n, or 0 for a data character which
975 will be placed in chptr. A backreference to group n is returned as negative n.
976 When UTF-8 is enabled, a positive value greater than 255 may be returned in
977 chptr. On entry, ptr is pointing at the \. On exit, it is on the final
978 character of the escape sequence.
979 
980 Arguments:
981   ptrptr         points to the pattern position pointer
982   chptr          points to a returned data character
983   errorcodeptr   points to the errorcode variable
984   bracount       number of previous extracting brackets
985   options        the options bits
986   isclass        TRUE if inside a character class
987 
988 Returns:         zero => a data character
989                  positive => a special escape sequence
990                  negative => a back reference
991                  on error, errorcodeptr is set
992 */
993 
994 static int
check_escape(const pcre_uchar ** ptrptr,pcre_uint32 * chptr,int * errorcodeptr,int bracount,int options,BOOL isclass)995 check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
996   int bracount, int options, BOOL isclass)
997 {
998 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
999 BOOL utf = (options & PCRE_UTF8) != 0;
1000 const pcre_uchar *ptr = *ptrptr + 1;
1001 pcre_uint32 c;
1002 int escape = 0;
1003 int i;

/* Note that i has no initializer. It is assigned by the escapes[] lookup
below, and it is zero whenever control reaches the switch, because the switch
is entered only when that lookup yielded zero. The \0 octal loop and the \xhh
loop both count up from that zero, so the lookup must stay ahead of them. */

1004 
1005 GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
1006 ptr--;                            /* Set pointer back to the last byte */
1007 
1008 /* If backslash is at the end of the pattern, it's an error. */
1009 
1010 if (c == CHAR_NULL) *errorcodeptr = ERR1;
1011 
1012 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
1013 in a table. A non-zero result is something that can be returned immediately.
1014 Otherwise further processing may be required. */

/* A positive table entry is a data character, which replaces c; a negative
entry is the negated escape code, which becomes the return value. */

1015 
1016 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1017 /* Not alphanumeric */
1018 else if (c < CHAR_0 || c > CHAR_z) {}
1019 else if ((i = escapes[c - CHAR_0]) != 0)
1020   { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1021 
1022 #else           /* EBCDIC coding */
1023 /* Not alphanumeric */
1024 else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
1025 else if ((i = escapes[c - 0x48]) != 0)  { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1026 #endif
1027 
1028 /* Escapes that need further processing, or are illegal. */
1029 
1030 else
1031   {
1032   const pcre_uchar *oldptr;
1033   BOOL braced, negated, overflow;
1034   int s;
1035 
1036   switch (c)
1037     {
1038     /* A number of Perl escapes are not handled by PCRE. We give an explicit
1039     error. */
1040 
1041     case CHAR_l:
1042     case CHAR_L:
1043     *errorcodeptr = ERR37;
1044     break;
1045 
1046     case CHAR_u:
1047     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1048       {
1049       /* In JavaScript, \u must be followed by four hexadecimal numbers.
1050       Otherwise it is a lowercase u letter. */
1051       if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1052         && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
1053         && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
1054         && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
1055         {
1056         c = 0;
1057         for (i = 0; i < 4; ++i)
1058           {
1059           register pcre_uint32 cc = *(++ptr);
1060 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1061           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1062           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1063 #else           /* EBCDIC coding */
1064           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1065           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1066 #endif
1067           }
1068 
1069 #if defined COMPILE_PCRE8
1070         if (c > (utf ? 0x10ffffU : 0xffU))
1071 #elif defined COMPILE_PCRE16
1072         if (c > (utf ? 0x10ffffU : 0xffffU))
1073 #elif defined COMPILE_PCRE32
1074         if (utf && c > 0x10ffffU)
1075 #endif
1076           {
1077           *errorcodeptr = ERR76;
1078           }
1079         else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1080         }
1081       }
1082     else
1083       *errorcodeptr = ERR37;
1084     break;
1085 
1086     case CHAR_U:
1087     /* In JavaScript, \U is an uppercase U letter. */
1088     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
1089     break;
1090 
1091     /* In a character class, \g is just a literal "g". Outside a character
1092     class, \g must be followed by one of a number of specific things:
1093 
1094     (1) A number, either plain or braced. If positive, it is an absolute
1095     backreference. If negative, it is a relative backreference. This is a Perl
1096     5.10 feature.
1097 
1098     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1099     is part of Perl's movement towards a unified syntax for back references. As
1100     this is synonymous with \k{name}, we fudge it up by pretending it really
1101     was \k.
1102 
1103     (3) For Oniguruma compatibility we also support \g followed by a name or a
1104     number either in angle brackets or in single quotes. However, these are
1105     (possibly recursive) subroutine calls, _not_ backreferences. Just return
1106     the ESC_g code (cf \k). */
1107 
1108     case CHAR_g:
1109     if (isclass) break;
1110     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
1111       {
1112       escape = ESC_g;
1113       break;
1114       }
1115 
1116     /* Handle the Perl-compatible cases */
1117 
1118     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1119       {
1120       const pcre_uchar *p;
      /* Scan the braced text: if anything other than '-' or a digit appears
      before the closing bracket, treat it as a name and hand it on as \k. */
1121       for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
1122         if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
1123       if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
1124         {
1125         escape = ESC_k;
1126         break;
1127         }
1128       braced = TRUE;
1129       ptr++;
1130       }
1131     else braced = FALSE;
1132 
1133     if (ptr[1] == CHAR_MINUS)
1134       {
1135       negated = TRUE;
1136       ptr++;
1137       }
1138     else negated = FALSE;
1139 
1140     /* The integer range is limited by the machine's int representation. */
    /* The limit tested in the loop is conservative: while s does not exceed
    INT_MAX / 10 - 1, s * 10 + 9 still fits in an int. */
1141     s = 0;
1142     overflow = FALSE;
1143     while (IS_DIGIT(ptr[1]))
1144       {
1145       if (s > INT_MAX / 10 - 1) /* Integer overflow */
1146         {
1147         overflow = TRUE;
1148         break;
1149         }
1150       s = s * 10 + (int)(*(++ptr) - CHAR_0);
1151       }
1152     if (overflow) /* Integer overflow */
1153       {
      /* Skip the remaining digits so that ptr is left at the end of the
      number when the error is reported. */
1154       while (IS_DIGIT(ptr[1]))
1155         ptr++;
1156       *errorcodeptr = ERR61;
1157       break;
1158       }
1159 
1160     if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
1161       {
1162       *errorcodeptr = ERR57;
1163       break;
1164       }
1165 
1166     if (s == 0)
1167       {
1168       *errorcodeptr = ERR58;
1169       break;
1170       }
1171 
    /* Turn a relative reference into an absolute group number: -1 maps to
    group number bracount, -2 to bracount - 1, and so on. A distance greater
    than bracount is an error. */
1172     if (negated)
1173       {
1174       if (s > bracount)
1175         {
1176         *errorcodeptr = ERR15;
1177         break;
1178         }
1179       s = bracount - (s - 1);
1180       }
1181 
1182     escape = -s;
1183     break;
1184 
1185     /* The handling of escape sequences consisting of a string of digits
1186     starting with one that is not zero is not straightforward. Perl has changed
1187     over the years. Nowadays \g{} for backreferences and \o{} for octal are
1188     recommended to avoid the ambiguities in the old syntax.
1189 
1190     Outside a character class, the digits are read as a decimal number. If the
1191     number is less than 8 (used to be 10), or if there are that many previous
1192     extracting left brackets, then it is a back reference. Otherwise, up to
1193     three octal digits are read to form an escaped byte. Thus \123 is likely to
1194     be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1195     the octal value is greater than 377, the least significant 8 bits are
1196     taken. \8 and \9 are treated as the literal characters 8 and 9.
1197 
1198     Inside a character class, \ followed by a digit is always either a literal
1199     8 or 9 or an octal number. */
1200 
1201     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1202     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1203 
1204     if (!isclass)
1205       {
1206       oldptr = ptr;
1207       /* The integer range is limited by the machine's int representation. */
1208       s = (int)(c -CHAR_0);
1209       overflow = FALSE;
1210       while (IS_DIGIT(ptr[1]))
1211         {
1212         if (s > INT_MAX / 10 - 1) /* Integer overflow */
1213           {
1214           overflow = TRUE;
1215           break;
1216           }
1217         s = s * 10 + (int)(*(++ptr) - CHAR_0);
1218         }
1219       if (overflow) /* Integer overflow */
1220         {
1221         while (IS_DIGIT(ptr[1]))
1222           ptr++;
1223         *errorcodeptr = ERR61;
1224         break;
1225         }
1226       if (s < 8 || s <= bracount)  /* Check for back reference */
1227         {
1228         escape = -s;
1229         break;
1230         }
1231       ptr = oldptr;      /* Put the pointer back and fall through */
1232       }
1233 
1234     /* Handle a digit following \ when the number is not a back reference. If
1235     the first digit is 8 or 9, Perl used to generate a binary zero byte and
1236     then treat the digit as a following literal. At least by Perl 5.18 this
1237     changed so as not to insert the binary zero. */
1238 
1239     if ((c = *ptr) >= CHAR_8) break;
1240 
1241     /* Fall through with a digit less than 8 */
1242 
1243     /* \0 always starts an octal number, but we may drop through to here with a
1244     larger first octal digit. The original code used just to take the least
1245     significant 8 bits of octal numbers (I think this is what early Perls used
1246     to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1247     but no more than 3 octal digits. */
1248 
1249     case CHAR_0:
1250     c -= CHAR_0;
    /* i is zero on arrival here, so at most two further octal digits are
    consumed. */
1251     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1252         c = c * 8 + *(++ptr) - CHAR_0;
1253 #ifdef COMPILE_PCRE8
1254     if (!utf && c > 0xff) *errorcodeptr = ERR51;
1255 #endif
1256     break;
1257 
1258     /* \o is a relatively new Perl feature, supporting a more general way of
1259     specifying character codes in octal. The only supported form is \o{ddd}. */
1260 
1261     case CHAR_o:
1262     if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
1263     if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR86; else
1264       {
1265       ptr += 2;
1266       c = 0;
1267       overflow = FALSE;
1268       while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
1269         {
1270         register pcre_uint32 cc = *ptr++;
1271         if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1272 #ifdef COMPILE_PCRE32
        /* Checked before shifting so that c << 3 cannot lose high bits. */
1273         if (c >= 0x20000000l) { overflow = TRUE; break; }
1274 #endif
1275         c = (c << 3) + cc - CHAR_0 ;
1276 #if defined COMPILE_PCRE8
1277         if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1278 #elif defined COMPILE_PCRE16
1279         if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1280 #elif defined COMPILE_PCRE32
1281         if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1282 #endif
1283         }
1284       if (overflow)
1285         {
1286         while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1287         *errorcodeptr = ERR34;
1288         }
1289       else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1290         {
1291         if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1292         }
1293       else *errorcodeptr = ERR80;
1294       }
1295     break;
1296 
1297     /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
1298     numbers. Otherwise it is a lowercase x letter. */
1299 
1300     case CHAR_x:
1301     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1302       {
1303       if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1304         && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1305         {
1306         c = 0;
1307         for (i = 0; i < 2; ++i)
1308           {
1309           register pcre_uint32 cc = *(++ptr);
1310 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1311           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1312           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1313 #else           /* EBCDIC coding */
1314           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1315           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1316 #endif
1317           }
1318         }
1319       }    /* End JavaScript handling */
1320 
1321     /* Handle \x in Perl's style. \x{ddd} is a character number which can be
1322     greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
1323     digits. If not, { used to be treated as a data character. However, Perl
1324     seems to read hex digits up to the first non-such, and ignore the rest, so
1325     that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1326     now gives an error. */
1327 
1328     else
1329       {
1330       if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1331         {
1332         ptr += 2;
1333         if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1334           {
1335           *errorcodeptr = ERR86;
1336           break;
1337           }
1338         c = 0;
1339         overflow = FALSE;
1340         while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
1341           {
1342           register pcre_uint32 cc = *ptr++;
1343           if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1344 
1345 #ifdef COMPILE_PCRE32
          /* Checked before shifting so that c << 4 cannot lose high bits. */
1346           if (c >= 0x10000000l) { overflow = TRUE; break; }
1347 #endif
1348 
1349 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1350           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1351           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1352 #else           /* EBCDIC coding */
1353           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1354           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1355 #endif
1356 
1357 #if defined COMPILE_PCRE8
1358           if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1359 #elif defined COMPILE_PCRE16
1360           if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1361 #elif defined COMPILE_PCRE32
1362           if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1363 #endif
1364           }
1365 
1366         if (overflow)
1367           {
1368           while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
1369           *errorcodeptr = ERR34;
1370           }
1371 
1372         else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1373           {
1374           if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1375           }
1376 
1377         /* If the sequence of hex digits does not end with '}', give an error.
1378         We used just to recognize this construct and fall through to the normal
1379         \x handling, but nowadays Perl gives an error, which seems much more
1380         sensible, so we do too. */
1381 
1382         else *errorcodeptr = ERR79;
1383         }   /* End of \x{} processing */
1384 
1385       /* Read a single-byte hex-defined char (up to two hex digits after \x) */
1386 
1387       else
1388         {
1389         c = 0;
        /* i is zero on arrival here, so at most two hex digits are consumed. */
1390         while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1391           {
1392           pcre_uint32 cc;                          /* Some compilers don't like */
1393           cc = *(++ptr);                           /* ++ in initializers */
1394 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1395           if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
1396           c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1397 #else           /* EBCDIC coding */
1398           if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
1399           c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1400 #endif
1401           }
1402         }     /* End of \xdd handling */
1403       }       /* End of Perl-style \x handling */
1404     break;
1405 
1406     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1407     An error is given if the byte following \c is not an ASCII character. This
1408     coding is ASCII-specific, but then the whole concept of \cx is
1409     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1410 
1411     case CHAR_c:
1412     c = *(++ptr);
1413     if (c == CHAR_NULL)
1414       {
1415       *errorcodeptr = ERR2;
1416       break;
1417       }
1418 #ifndef EBCDIC    /* ASCII/UTF-8 coding */
1419     if (c > 127)  /* Excludes all non-ASCII in either mode */
1420       {
1421       *errorcodeptr = ERR68;
1422       break;
1423       }
1424     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1425     c ^= 0x40;
1426 #else             /* EBCDIC coding */
1427     if (c >= CHAR_a && c <= CHAR_z) c += 64;
1428     c ^= 0xC0;
1429 #endif
1430     break;
1431 
1432     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1433     other alphanumeric following \ is an error if PCRE_EXTRA was set;
1434     otherwise, for Perl compatibility, it is a literal. This code looks a bit
1435     odd, but there used to be some cases other than the default, and there may
1436     be again in future, so I haven't "optimized" it. */
1437 
1438     default:
1439     if ((options & PCRE_EXTRA) != 0) switch(c)
1440       {
1441       default:
1442       *errorcodeptr = ERR3;
1443       break;
1444       }
1445     break;
1446     }
1447   }
1448 
1449 /* Perl supports \N{name} for character names, as well as plain \N for "not
1450 newline". PCRE does not support \N{name}. However, it does support
1451 quantification such as \N{2,3}. */
1452 
1453 if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1454      !is_counted_repeat(ptr+2))
1455   *errorcodeptr = ERR37;
1456 
1457 /* If PCRE_UCP is set, we change the values for \d etc. */
1458 
1459 if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
1460   escape += (ESC_DU - ESC_D);
1461 
1462 /* Set the pointer to the final character before returning. */

/* Both outputs are written on every path, including the error paths, so the
caller must test the error code before trusting the character value. */

1463 
1464 *ptrptr = ptr;
1465 *chptr = c;
1466 return escape;
1467 }
1468 
1469 
1470 
1471 #ifdef SUPPORT_UCP
1472 /*************************************************
1473 *               Handle \P and \p                 *
1474 *************************************************/
1475 
1476 /* This function is called after \P or \p has been encountered, provided that
1477 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1478 pointing at the P or p. On exit, it is pointing at the final character of the
1479 escape sequence.
1480 
1481 Argument:
1482   ptrptr         points to the pattern position pointer
1483   negptr         points to a boolean that is set TRUE for negation else FALSE
1484   ptypeptr       points to an unsigned int that is set to the type value
1485   pdataptr       points to an unsigned int that is set to the detailed property value
1486   errorcodeptr   points to the error code variable
1487 
1488 Returns:         TRUE if the type value was found, or FALSE for an invalid type
1489 */
1490 
1491 static BOOL
get_ucp(const pcre_uchar ** ptrptr,BOOL * negptr,unsigned int * ptypeptr,unsigned int * pdataptr,int * errorcodeptr)1492 get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
1493   unsigned int *pdataptr, int *errorcodeptr)
1494 {
1495 pcre_uchar c;
1496 int i, bot, top;
1497 const pcre_uchar *ptr = *ptrptr;
1498 pcre_uchar name[32];
1499 
1500 c = *(++ptr);
1501 if (c == CHAR_NULL) goto ERROR_RETURN;
1502 
1503 *negptr = FALSE;
1504 
1505 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1506 negation. */
1507 
1508 if (c == CHAR_LEFT_CURLY_BRACKET)
1509   {
1510   if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1511     {
1512     *negptr = TRUE;
1513     ptr++;
1514     }
1515   for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1516     {
1517     c = *(++ptr);
1518     if (c == CHAR_NULL) goto ERROR_RETURN;
1519     if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1520     name[i] = c;
1521     }
1522   if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1523   name[i] = 0;
1524   }
1525 
1526 /* Otherwise there is just one following character */
1527 
1528 else
1529   {
1530   name[0] = c;
1531   name[1] = 0;
1532   }
1533 
1534 *ptrptr = ptr;
1535 
1536 /* Search for a recognized property name using binary chop */
1537 
1538 bot = 0;
1539 top = PRIV(utt_size);
1540 
1541 while (bot < top)
1542   {
1543   int r;
1544   i = (bot + top) >> 1;
1545   r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1546   if (r == 0)
1547     {
1548     *ptypeptr = PRIV(utt)[i].type;
1549     *pdataptr = PRIV(utt)[i].value;
1550     return TRUE;
1551     }
1552   if (r > 0) bot = i + 1; else top = i;
1553   }
1554 
1555 *errorcodeptr = ERR47;
1556 *ptrptr = ptr;
1557 return FALSE;
1558 
1559 ERROR_RETURN:
1560 *errorcodeptr = ERR46;
1561 *ptrptr = ptr;
1562 return FALSE;
1563 }
1564 #endif
1565 
1566 
1567 
1568 /*************************************************
1569 *         Read repeat counts                     *
1570 *************************************************/
1571 
1572 /* Read an item of the form {n,m} and return the values. This is called only
1573 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1574 so the syntax is guaranteed to be correct, but we need to check the values.
1575 
1576 Arguments:
1577   p              pointer to first char after '{'
1578   minp           pointer to int for min
1579   maxp           pointer to int for max
1580                  returned as -1 if no max
1581   errorcodeptr   points to error code variable
1582 
1583 Returns:         pointer to '}' on success;
1584                  current ptr on error, with errorcodeptr set non-zero
1585 */
1586 
1587 static const pcre_uchar *
read_repeat_counts(const pcre_uchar * p,int * minp,int * maxp,int * errorcodeptr)1588 read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1589 {
1590 int min = 0;
1591 int max = -1;
1592 
1593 while (IS_DIGIT(*p))
1594   {
1595   min = min * 10 + (int)(*p++ - CHAR_0);
1596   if (min > 65535)
1597     {
1598     *errorcodeptr = ERR5;
1599     return p;
1600     }
1601   }
1602 
1603 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1604   {
1605   if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1606     {
1607     max = 0;
1608     while(IS_DIGIT(*p))
1609       {
1610       max = max * 10 + (int)(*p++ - CHAR_0);
1611       if (max > 65535)
1612         {
1613         *errorcodeptr = ERR5;
1614         return p;
1615         }
1616       }
1617     if (max < min)
1618       {
1619       *errorcodeptr = ERR4;
1620       return p;
1621       }
1622     }
1623   }
1624 
1625 *minp = min;
1626 *maxp = max;
1627 return p;
1628 }
1629 
1630 
1631 
1632 /*************************************************
1633 *      Find first significant op code            *
1634 *************************************************/
1635 
1636 /* This is called by several functions that scan a compiled expression looking
1637 for a fixed first character, or an anchoring op code etc. It skips over things
1638 that do not influence this. For some calls, it makes sense to skip negative
1639 forward and all backward assertions, and also the \b assertion; for others it
1640 does not.
1641 
1642 Arguments:
1643   code         pointer to the start of the group
1644   skipassert   TRUE if certain assertions are to be skipped
1645 
1646 Returns:       pointer to the first significant opcode
1647 */
1648 
1649 static const pcre_uchar*
first_significant_code(const pcre_uchar * code,BOOL skipassert)1650 first_significant_code(const pcre_uchar *code, BOOL skipassert)
1651 {
1652 for (;;)
1653   {
1654   switch ((int)*code)
1655     {
1656     case OP_ASSERT_NOT:
1657     case OP_ASSERTBACK:
1658     case OP_ASSERTBACK_NOT:
1659     if (!skipassert) return code;
1660     do code += GET(code, 1); while (*code == OP_ALT);
1661     code += PRIV(OP_lengths)[*code];
1662     break;
1663 
1664     case OP_WORD_BOUNDARY:
1665     case OP_NOT_WORD_BOUNDARY:
1666     if (!skipassert) return code;
1667     /* Fall through */
1668 
1669     case OP_CALLOUT:
1670     case OP_CREF:
1671     case OP_DNCREF:
1672     case OP_RREF:
1673     case OP_DNRREF:
1674     case OP_DEF:
1675     code += PRIV(OP_lengths)[*code];
1676     break;
1677 
1678     default:
1679     return code;
1680     }
1681   }
1682 /* Control never reaches here */
1683 }
1684 
1685 
1686 
1687 /*************************************************
1688 *        Find the fixed length of a branch       *
1689 *************************************************/
1690 
1691 /* Scan a branch and compute the fixed length of subject that will match it,
1692 if the length is fixed. This is needed for dealing with backward assertions.
1693 In UTF8 mode, the result is in characters rather than bytes. The branch is
1694 temporarily terminated with OP_END when this function is called.
1695 
1696 This function is called when a backward assertion is encountered, so that if it
1697 fails, the error message can point to the correct place in the pattern.
1698 However, we cannot do this when the assertion contains subroutine calls,
1699 because they can be forward references. We solve this by remembering this case
1700 and doing the check at the end; a flag specifies which mode we are running in.
1701 
1702 Arguments:
1703   code     points to the start of the pattern (the bracket)
1704   utf      TRUE in UTF-8 / UTF-16 / UTF-32 mode
1705   atend    TRUE if called when the pattern is complete
1706   cd       the "compile data" structure
1707   recurses    chain of recurse_check to catch mutual recursion
1708 
1709 Returns:   the fixed length,
1710              or -1 if there is no fixed length,
1711              or -2 if \C was encountered (in UTF-8 mode only)
1712              or -3 if an OP_RECURSE item was encountered and atend is FALSE
1713              or -4 if an unknown opcode was encountered (internal error)
1714 */
1715 
1716 static int
find_fixedlength(pcre_uchar * code,BOOL utf,BOOL atend,compile_data * cd,recurse_check * recurses)1717 find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd,
1718   recurse_check *recurses)
1719 {
1720 int length = -1;
1721 recurse_check this_recurse;
1722 register int branchlength = 0;
1723 register pcre_uchar *cc = code + 1 + LINK_SIZE;
1724 
1725 /* Scan along the opcodes for this branch. If we get to the end of the
1726 branch, check the length against that of the other branches. */
1727 
1728 for (;;)
1729   {
1730   int d;
1731   pcre_uchar *ce, *cs;
1732   register pcre_uchar op = *cc;
1733 
1734   switch (op)
1735     {
1736     /* We only need to continue for OP_CBRA (normal capturing bracket) and
1737     OP_BRA (normal non-capturing bracket) because the other variants of these
1738     opcodes are all concerned with unlimited repeated groups, which of course
1739     are not of fixed length. */
1740 
1741     case OP_CBRA:
1742     case OP_BRA:
1743     case OP_ONCE:
1744     case OP_ONCE_NC:
1745     case OP_COND:
1746     d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd,
1747       recurses);
1748     if (d < 0) return d;
1749     branchlength += d;
1750     do cc += GET(cc, 1); while (*cc == OP_ALT);
1751     cc += 1 + LINK_SIZE;
1752     break;
1753 
1754     /* Reached end of a branch; if it's a ket it is the end of a nested call.
1755     If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1756     an ALT. If it is END it's the end of the outer call. All can be handled by
1757     the same code. Note that we must not include the OP_KETRxxx opcodes here,
1758     because they all imply an unlimited repeat. */
1759 
1760     case OP_ALT:
1761     case OP_KET:
1762     case OP_END:
1763     case OP_ACCEPT:
1764     case OP_ASSERT_ACCEPT:
1765     if (length < 0) length = branchlength;
1766       else if (length != branchlength) return -1;
1767     if (*cc != OP_ALT) return length;
1768     cc += 1 + LINK_SIZE;
1769     branchlength = 0;
1770     break;
1771 
1772     /* A true recursion implies not fixed length, but a subroutine call may
1773     be OK. If the subroutine is a forward reference, we can't deal with
1774     it until the end of the pattern, so return -3. */
1775 
1776     case OP_RECURSE:
1777     if (!atend) return -3;
1778     cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1779     do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
1780     if (cc > cs && cc < ce) return -1;                    /* Recursion */
1781     else   /* Check for mutual recursion */
1782       {
1783       recurse_check *r = recurses;
1784       for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
1785       if (r != NULL) return -1;   /* Mutual recursion */
1786       }
1787     this_recurse.prev = recurses;
1788     this_recurse.group = cs;
1789     d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd, &this_recurse);
1790     if (d < 0) return d;
1791     branchlength += d;
1792     cc += 1 + LINK_SIZE;
1793     break;
1794 
1795     /* Skip over assertive subpatterns */
1796 
1797     case OP_ASSERT:
1798     case OP_ASSERT_NOT:
1799     case OP_ASSERTBACK:
1800     case OP_ASSERTBACK_NOT:
1801     do cc += GET(cc, 1); while (*cc == OP_ALT);
1802     cc += PRIV(OP_lengths)[*cc];
1803     break;
1804 
1805     /* Skip over things that don't match chars */
1806 
1807     case OP_MARK:
1808     case OP_PRUNE_ARG:
1809     case OP_SKIP_ARG:
1810     case OP_THEN_ARG:
1811     cc += cc[1] + PRIV(OP_lengths)[*cc];
1812     break;
1813 
1814     case OP_CALLOUT:
1815     case OP_CIRC:
1816     case OP_CIRCM:
1817     case OP_CLOSE:
1818     case OP_COMMIT:
1819     case OP_CREF:
1820     case OP_DEF:
1821     case OP_DNCREF:
1822     case OP_DNRREF:
1823     case OP_DOLL:
1824     case OP_DOLLM:
1825     case OP_EOD:
1826     case OP_EODN:
1827     case OP_FAIL:
1828     case OP_NOT_WORD_BOUNDARY:
1829     case OP_PRUNE:
1830     case OP_REVERSE:
1831     case OP_RREF:
1832     case OP_SET_SOM:
1833     case OP_SKIP:
1834     case OP_SOD:
1835     case OP_SOM:
1836     case OP_THEN:
1837     case OP_WORD_BOUNDARY:
1838     cc += PRIV(OP_lengths)[*cc];
1839     break;
1840 
1841     /* Handle literal characters */
1842 
1843     case OP_CHAR:
1844     case OP_CHARI:
1845     case OP_NOT:
1846     case OP_NOTI:
1847     branchlength++;
1848     cc += 2;
1849 #ifdef SUPPORT_UTF
1850     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1851 #endif
1852     break;
1853 
1854     /* Handle exact repetitions. The count is already in characters, but we
1855     need to skip over a multibyte character in UTF8 mode.  */
1856 
1857     case OP_EXACT:
1858     case OP_EXACTI:
1859     case OP_NOTEXACT:
1860     case OP_NOTEXACTI:
1861     branchlength += (int)GET2(cc,1);
1862     cc += 2 + IMM2_SIZE;
1863 #ifdef SUPPORT_UTF
1864     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1865 #endif
1866     break;
1867 
1868     case OP_TYPEEXACT:
1869     branchlength += GET2(cc,1);
1870     if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
1871       cc += 2;
1872     cc += 1 + IMM2_SIZE + 1;
1873     break;
1874 
1875     /* Handle single-char matchers */
1876 
1877     case OP_PROP:
1878     case OP_NOTPROP:
1879     cc += 2;
1880     /* Fall through */
1881 
1882     case OP_HSPACE:
1883     case OP_VSPACE:
1884     case OP_NOT_HSPACE:
1885     case OP_NOT_VSPACE:
1886     case OP_NOT_DIGIT:
1887     case OP_DIGIT:
1888     case OP_NOT_WHITESPACE:
1889     case OP_WHITESPACE:
1890     case OP_NOT_WORDCHAR:
1891     case OP_WORDCHAR:
1892     case OP_ANY:
1893     case OP_ALLANY:
1894     branchlength++;
1895     cc++;
1896     break;
1897 
1898     /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1899     otherwise \C is coded as OP_ALLANY. */
1900 
1901     case OP_ANYBYTE:
1902     return -2;
1903 
1904     /* Check a class for variable quantification */
1905 
1906     case OP_CLASS:
1907     case OP_NCLASS:
1908 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1909     case OP_XCLASS:
1910     /* The original code caused an unsigned overflow in 64 bit systems,
1911     so now we use a conditional statement. */
1912     if (op == OP_XCLASS)
1913       cc += GET(cc, 1);
1914     else
1915       cc += PRIV(OP_lengths)[OP_CLASS];
1916 #else
1917     cc += PRIV(OP_lengths)[OP_CLASS];
1918 #endif
1919 
1920     switch (*cc)
1921       {
1922       case OP_CRSTAR:
1923       case OP_CRMINSTAR:
1924       case OP_CRPLUS:
1925       case OP_CRMINPLUS:
1926       case OP_CRQUERY:
1927       case OP_CRMINQUERY:
1928       case OP_CRPOSSTAR:
1929       case OP_CRPOSPLUS:
1930       case OP_CRPOSQUERY:
1931       return -1;
1932 
1933       case OP_CRRANGE:
1934       case OP_CRMINRANGE:
1935       case OP_CRPOSRANGE:
1936       if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1937       branchlength += (int)GET2(cc,1);
1938       cc += 1 + 2 * IMM2_SIZE;
1939       break;
1940 
1941       default:
1942       branchlength++;
1943       }
1944     break;
1945 
1946     /* Anything else is variable length */
1947 
1948     case OP_ANYNL:
1949     case OP_BRAMINZERO:
1950     case OP_BRAPOS:
1951     case OP_BRAPOSZERO:
1952     case OP_BRAZERO:
1953     case OP_CBRAPOS:
1954     case OP_EXTUNI:
1955     case OP_KETRMAX:
1956     case OP_KETRMIN:
1957     case OP_KETRPOS:
1958     case OP_MINPLUS:
1959     case OP_MINPLUSI:
1960     case OP_MINQUERY:
1961     case OP_MINQUERYI:
1962     case OP_MINSTAR:
1963     case OP_MINSTARI:
1964     case OP_MINUPTO:
1965     case OP_MINUPTOI:
1966     case OP_NOTMINPLUS:
1967     case OP_NOTMINPLUSI:
1968     case OP_NOTMINQUERY:
1969     case OP_NOTMINQUERYI:
1970     case OP_NOTMINSTAR:
1971     case OP_NOTMINSTARI:
1972     case OP_NOTMINUPTO:
1973     case OP_NOTMINUPTOI:
1974     case OP_NOTPLUS:
1975     case OP_NOTPLUSI:
1976     case OP_NOTPOSPLUS:
1977     case OP_NOTPOSPLUSI:
1978     case OP_NOTPOSQUERY:
1979     case OP_NOTPOSQUERYI:
1980     case OP_NOTPOSSTAR:
1981     case OP_NOTPOSSTARI:
1982     case OP_NOTPOSUPTO:
1983     case OP_NOTPOSUPTOI:
1984     case OP_NOTQUERY:
1985     case OP_NOTQUERYI:
1986     case OP_NOTSTAR:
1987     case OP_NOTSTARI:
1988     case OP_NOTUPTO:
1989     case OP_NOTUPTOI:
1990     case OP_PLUS:
1991     case OP_PLUSI:
1992     case OP_POSPLUS:
1993     case OP_POSPLUSI:
1994     case OP_POSQUERY:
1995     case OP_POSQUERYI:
1996     case OP_POSSTAR:
1997     case OP_POSSTARI:
1998     case OP_POSUPTO:
1999     case OP_POSUPTOI:
2000     case OP_QUERY:
2001     case OP_QUERYI:
2002     case OP_REF:
2003     case OP_REFI:
2004     case OP_DNREF:
2005     case OP_DNREFI:
2006     case OP_SBRA:
2007     case OP_SBRAPOS:
2008     case OP_SCBRA:
2009     case OP_SCBRAPOS:
2010     case OP_SCOND:
2011     case OP_SKIPZERO:
2012     case OP_STAR:
2013     case OP_STARI:
2014     case OP_TYPEMINPLUS:
2015     case OP_TYPEMINQUERY:
2016     case OP_TYPEMINSTAR:
2017     case OP_TYPEMINUPTO:
2018     case OP_TYPEPLUS:
2019     case OP_TYPEPOSPLUS:
2020     case OP_TYPEPOSQUERY:
2021     case OP_TYPEPOSSTAR:
2022     case OP_TYPEPOSUPTO:
2023     case OP_TYPEQUERY:
2024     case OP_TYPESTAR:
2025     case OP_TYPEUPTO:
2026     case OP_UPTO:
2027     case OP_UPTOI:
2028     return -1;
2029 
2030     /* Catch unrecognized opcodes so that when new ones are added they
2031     are not forgotten, as has happened in the past. */
2032 
2033     default:
2034     return -4;
2035     }
2036   }
2037 /* Control never gets here */
2038 }
2039 
2040 
2041 
2042 /*************************************************
2043 *    Scan compiled regex for specific bracket    *
2044 *************************************************/
2045 
2046 /* This little function scans through a compiled pattern until it finds a
2047 capturing bracket with the given number, or, if the number is negative, an
2048 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
2049 so that it can be called from pcre_study() when finding the minimum matching
2050 length.
2051 
2052 Arguments:
2053   code        points to start of expression
2054   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
2055   number      the required bracket number or negative to find a lookbehind
2056 
2057 Returns:      pointer to the opcode for the bracket, or NULL if not found
2058 */
2059 
2060 const pcre_uchar *
PRIV(find_bracket)2061 PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2062 {
2063 for (;;)
2064   {
2065   register pcre_uchar c = *code;
2066 
2067   if (c == OP_END) return NULL;
2068 
2069   /* XCLASS is used for classes that cannot be represented just by a bit
2070   map. This includes negated single high-valued characters. The length in
2071   the table is zero; the actual length is stored in the compiled code. */
2072 
2073   if (c == OP_XCLASS) code += GET(code, 1);
2074 
2075   /* Handle recursion */
2076 
2077   else if (c == OP_REVERSE)
2078     {
2079     if (number < 0) return (pcre_uchar *)code;
2080     code += PRIV(OP_lengths)[c];
2081     }
2082 
2083   /* Handle capturing bracket */
2084 
2085   else if (c == OP_CBRA || c == OP_SCBRA ||
2086            c == OP_CBRAPOS || c == OP_SCBRAPOS)
2087     {
2088     int n = (int)GET2(code, 1+LINK_SIZE);
2089     if (n == number) return (pcre_uchar *)code;
2090     code += PRIV(OP_lengths)[c];
2091     }
2092 
2093   /* Otherwise, we can get the item's length from the table, except that for
2094   repeated character types, we have to test for \p and \P, which have an extra
2095   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2096   must add in its length. */
2097 
2098   else
2099     {
2100     switch(c)
2101       {
2102       case OP_TYPESTAR:
2103       case OP_TYPEMINSTAR:
2104       case OP_TYPEPLUS:
2105       case OP_TYPEMINPLUS:
2106       case OP_TYPEQUERY:
2107       case OP_TYPEMINQUERY:
2108       case OP_TYPEPOSSTAR:
2109       case OP_TYPEPOSPLUS:
2110       case OP_TYPEPOSQUERY:
2111       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2112       break;
2113 
2114       case OP_TYPEUPTO:
2115       case OP_TYPEMINUPTO:
2116       case OP_TYPEEXACT:
2117       case OP_TYPEPOSUPTO:
2118       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2119         code += 2;
2120       break;
2121 
2122       case OP_MARK:
2123       case OP_PRUNE_ARG:
2124       case OP_SKIP_ARG:
2125       case OP_THEN_ARG:
2126       code += code[1];
2127       break;
2128       }
2129 
2130     /* Add in the fixed length from the table */
2131 
2132     code += PRIV(OP_lengths)[c];
2133 
2134   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2135   a multi-byte character. The length in the table is a minimum, so we have to
2136   arrange to skip the extra bytes. */
2137 
2138 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2139     if (utf) switch(c)
2140       {
2141       case OP_CHAR:
2142       case OP_CHARI:
2143       case OP_NOT:
2144       case OP_NOTI:
2145       case OP_EXACT:
2146       case OP_EXACTI:
2147       case OP_NOTEXACT:
2148       case OP_NOTEXACTI:
2149       case OP_UPTO:
2150       case OP_UPTOI:
2151       case OP_NOTUPTO:
2152       case OP_NOTUPTOI:
2153       case OP_MINUPTO:
2154       case OP_MINUPTOI:
2155       case OP_NOTMINUPTO:
2156       case OP_NOTMINUPTOI:
2157       case OP_POSUPTO:
2158       case OP_POSUPTOI:
2159       case OP_NOTPOSUPTO:
2160       case OP_NOTPOSUPTOI:
2161       case OP_STAR:
2162       case OP_STARI:
2163       case OP_NOTSTAR:
2164       case OP_NOTSTARI:
2165       case OP_MINSTAR:
2166       case OP_MINSTARI:
2167       case OP_NOTMINSTAR:
2168       case OP_NOTMINSTARI:
2169       case OP_POSSTAR:
2170       case OP_POSSTARI:
2171       case OP_NOTPOSSTAR:
2172       case OP_NOTPOSSTARI:
2173       case OP_PLUS:
2174       case OP_PLUSI:
2175       case OP_NOTPLUS:
2176       case OP_NOTPLUSI:
2177       case OP_MINPLUS:
2178       case OP_MINPLUSI:
2179       case OP_NOTMINPLUS:
2180       case OP_NOTMINPLUSI:
2181       case OP_POSPLUS:
2182       case OP_POSPLUSI:
2183       case OP_NOTPOSPLUS:
2184       case OP_NOTPOSPLUSI:
2185       case OP_QUERY:
2186       case OP_QUERYI:
2187       case OP_NOTQUERY:
2188       case OP_NOTQUERYI:
2189       case OP_MINQUERY:
2190       case OP_MINQUERYI:
2191       case OP_NOTMINQUERY:
2192       case OP_NOTMINQUERYI:
2193       case OP_POSQUERY:
2194       case OP_POSQUERYI:
2195       case OP_NOTPOSQUERY:
2196       case OP_NOTPOSQUERYI:
2197       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2198       break;
2199       }
2200 #else
2201     (void)(utf);  /* Keep compiler happy by referencing function argument */
2202 #endif
2203     }
2204   }
2205 }
2206 
2207 
2208 
2209 /*************************************************
2210 *   Scan compiled regex for recursion reference  *
2211 *************************************************/
2212 
2213 /* This little function scans through a compiled pattern until it finds an
2214 instance of OP_RECURSE.
2215 
2216 Arguments:
2217   code        points to start of expression
2218   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
2219 
2220 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
2221 */
2222 
2223 static const pcre_uchar *
find_recurse(const pcre_uchar * code,BOOL utf)2224 find_recurse(const pcre_uchar *code, BOOL utf)
2225 {
2226 for (;;)
2227   {
2228   register pcre_uchar c = *code;
2229   if (c == OP_END) return NULL;
2230   if (c == OP_RECURSE) return code;
2231 
2232   /* XCLASS is used for classes that cannot be represented just by a bit
2233   map. This includes negated single high-valued characters. The length in
2234   the table is zero; the actual length is stored in the compiled code. */
2235 
2236   if (c == OP_XCLASS) code += GET(code, 1);
2237 
2238   /* Otherwise, we can get the item's length from the table, except that for
2239   repeated character types, we have to test for \p and \P, which have an extra
2240   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2241   must add in its length. */
2242 
2243   else
2244     {
2245     switch(c)
2246       {
2247       case OP_TYPESTAR:
2248       case OP_TYPEMINSTAR:
2249       case OP_TYPEPLUS:
2250       case OP_TYPEMINPLUS:
2251       case OP_TYPEQUERY:
2252       case OP_TYPEMINQUERY:
2253       case OP_TYPEPOSSTAR:
2254       case OP_TYPEPOSPLUS:
2255       case OP_TYPEPOSQUERY:
2256       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2257       break;
2258 
2259       case OP_TYPEPOSUPTO:
2260       case OP_TYPEUPTO:
2261       case OP_TYPEMINUPTO:
2262       case OP_TYPEEXACT:
2263       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2264         code += 2;
2265       break;
2266 
2267       case OP_MARK:
2268       case OP_PRUNE_ARG:
2269       case OP_SKIP_ARG:
2270       case OP_THEN_ARG:
2271       code += code[1];
2272       break;
2273       }
2274 
2275     /* Add in the fixed length from the table */
2276 
2277     code += PRIV(OP_lengths)[c];
2278 
2279     /* In UTF-8 mode, opcodes that are followed by a character may be followed
2280     by a multi-byte character. The length in the table is a minimum, so we have
2281     to arrange to skip the extra bytes. */
2282 
2283 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2284     if (utf) switch(c)
2285       {
2286       case OP_CHAR:
2287       case OP_CHARI:
2288       case OP_NOT:
2289       case OP_NOTI:
2290       case OP_EXACT:
2291       case OP_EXACTI:
2292       case OP_NOTEXACT:
2293       case OP_NOTEXACTI:
2294       case OP_UPTO:
2295       case OP_UPTOI:
2296       case OP_NOTUPTO:
2297       case OP_NOTUPTOI:
2298       case OP_MINUPTO:
2299       case OP_MINUPTOI:
2300       case OP_NOTMINUPTO:
2301       case OP_NOTMINUPTOI:
2302       case OP_POSUPTO:
2303       case OP_POSUPTOI:
2304       case OP_NOTPOSUPTO:
2305       case OP_NOTPOSUPTOI:
2306       case OP_STAR:
2307       case OP_STARI:
2308       case OP_NOTSTAR:
2309       case OP_NOTSTARI:
2310       case OP_MINSTAR:
2311       case OP_MINSTARI:
2312       case OP_NOTMINSTAR:
2313       case OP_NOTMINSTARI:
2314       case OP_POSSTAR:
2315       case OP_POSSTARI:
2316       case OP_NOTPOSSTAR:
2317       case OP_NOTPOSSTARI:
2318       case OP_PLUS:
2319       case OP_PLUSI:
2320       case OP_NOTPLUS:
2321       case OP_NOTPLUSI:
2322       case OP_MINPLUS:
2323       case OP_MINPLUSI:
2324       case OP_NOTMINPLUS:
2325       case OP_NOTMINPLUSI:
2326       case OP_POSPLUS:
2327       case OP_POSPLUSI:
2328       case OP_NOTPOSPLUS:
2329       case OP_NOTPOSPLUSI:
2330       case OP_QUERY:
2331       case OP_QUERYI:
2332       case OP_NOTQUERY:
2333       case OP_NOTQUERYI:
2334       case OP_MINQUERY:
2335       case OP_MINQUERYI:
2336       case OP_NOTMINQUERY:
2337       case OP_NOTMINQUERYI:
2338       case OP_POSQUERY:
2339       case OP_POSQUERYI:
2340       case OP_NOTPOSQUERY:
2341       case OP_NOTPOSQUERYI:
2342       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2343       break;
2344       }
2345 #else
2346     (void)(utf);  /* Keep compiler happy by referencing function argument */
2347 #endif
2348     }
2349   }
2350 }
2351 
2352 
2353 
2354 /*************************************************
2355 *    Scan compiled branch for non-emptiness      *
2356 *************************************************/
2357 
2358 /* This function scans through a branch of a compiled pattern to see whether it
2359 can match the empty string or not. It is called from could_be_empty()
2360 below and from compile_branch() when checking for an unlimited repeat of a
2361 group that can match nothing. Note that first_significant_code() skips over
2362 backward and negative forward assertions when its final argument is TRUE. If we
2363 hit an unclosed bracket, we return "empty" - this means we've struck an inner
2364 bracket whose current branch will already have been scanned.
2365 
2366 Arguments:
2367   code        points to start of search
2368   endcode     points to where to stop
2369   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2370   cd          contains pointers to tables etc.
2371   recurses    chain of recurse_check to catch mutual recursion
2372 
2373 Returns:      TRUE if what is matched could be empty
2374 */
2375 
2376 static BOOL
could_be_empty_branch(const pcre_uchar * code,const pcre_uchar * endcode,BOOL utf,compile_data * cd,recurse_check * recurses)2377 could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2378   BOOL utf, compile_data *cd, recurse_check *recurses)
2379 {
2380 register pcre_uchar c;
2381 recurse_check this_recurse;
2382 
2383 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2384      code < endcode;
2385      code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2386   {
2387   const pcre_uchar *ccode;
2388 
2389   c = *code;
2390 
2391   /* Skip over forward assertions; the other assertions are skipped by
2392   first_significant_code() with a TRUE final argument. */
2393 
2394   if (c == OP_ASSERT)
2395     {
2396     do code += GET(code, 1); while (*code == OP_ALT);
2397     c = *code;
2398     continue;
2399     }
2400 
2401   /* For a recursion/subroutine call, if its end has been reached, which
2402   implies a backward reference subroutine call, we can scan it. If it's a
2403   forward reference subroutine call, we can't. To detect forward reference
2404   we have to scan up the list that is kept in the workspace. This function is
2405   called only when doing the real compile, not during the pre-compile that
2406   measures the size of the compiled pattern. */
2407 
2408   if (c == OP_RECURSE)
2409     {
2410     const pcre_uchar *scode = cd->start_code + GET(code, 1);
2411     const pcre_uchar *endgroup = scode;
2412     BOOL empty_branch;
2413 
2414     /* Test for forward reference or uncompleted reference. This is disabled
2415     when called to scan a completed pattern by setting cd->start_workspace to
2416     NULL. */
2417 
2418     if (cd->start_workspace != NULL)
2419       {
2420       const pcre_uchar *tcode;
2421       for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
2422         if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2423       if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2424       }
2425 
2426     /* If the reference is to a completed group, we need to detect whether this
2427     is a recursive call, as otherwise there will be an infinite loop. If it is
2428     a recursion, just skip over it. Simple recursions are easily detected. For
2429     mutual recursions we keep a chain on the stack. */
2430 
2431     do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
2432     if (code >= scode && code <= endgroup) continue;  /* Simple recursion */
2433     else
2434       {
2435       recurse_check *r = recurses;
2436       for (r = recurses; r != NULL; r = r->prev)
2437         if (r->group == scode) break;
2438       if (r != NULL) continue;   /* Mutual recursion */
2439       }
2440 
2441     /* Completed reference; scan the referenced group, remembering it on the
2442     stack chain to detect mutual recursions. */
2443 
2444     empty_branch = FALSE;
2445     this_recurse.prev = recurses;
2446     this_recurse.group = scode;
2447 
2448     do
2449       {
2450       if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
2451         {
2452         empty_branch = TRUE;
2453         break;
2454         }
2455       scode += GET(scode, 1);
2456       }
2457     while (*scode == OP_ALT);
2458 
2459     if (!empty_branch) return FALSE;  /* All branches are non-empty */
2460     continue;
2461     }
2462 
2463   /* Groups with zero repeats can of course be empty; skip them. */
2464 
2465   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2466       c == OP_BRAPOSZERO)
2467     {
2468     code += PRIV(OP_lengths)[c];
2469     do code += GET(code, 1); while (*code == OP_ALT);
2470     c = *code;
2471     continue;
2472     }
2473 
2474   /* A nested group that is already marked as "could be empty" can just be
2475   skipped. */
2476 
2477   if (c == OP_SBRA  || c == OP_SBRAPOS ||
2478       c == OP_SCBRA || c == OP_SCBRAPOS)
2479     {
2480     do code += GET(code, 1); while (*code == OP_ALT);
2481     c = *code;
2482     continue;
2483     }
2484 
2485   /* For other groups, scan the branches. */
2486 
2487   if (c == OP_BRA  || c == OP_BRAPOS ||
2488       c == OP_CBRA || c == OP_CBRAPOS ||
2489       c == OP_ONCE || c == OP_ONCE_NC ||
2490       c == OP_COND)
2491     {
2492     BOOL empty_branch;
2493     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
2494 
2495     /* If a conditional group has only one branch, there is a second, implied,
2496     empty branch, so just skip over the conditional, because it could be empty.
2497     Otherwise, scan the individual branches of the group. */
2498 
2499     if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2500       code += GET(code, 1);
2501     else
2502       {
2503       empty_branch = FALSE;
2504       do
2505         {
2506         if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd,
2507           recurses)) empty_branch = TRUE;
2508         code += GET(code, 1);
2509         }
2510       while (*code == OP_ALT);
2511       if (!empty_branch) return FALSE;   /* All branches are non-empty */
2512       }
2513 
2514     c = *code;
2515     continue;
2516     }
2517 
2518   /* Handle the other opcodes */
2519 
2520   switch (c)
2521     {
2522     /* Check for quantifiers after a class. XCLASS is used for classes that
2523     cannot be represented just by a bit map. This includes negated single
2524     high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2525     actual length is stored in the compiled code, so we must update "code"
2526     here. */
2527 
2528 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2529     case OP_XCLASS:
2530     ccode = code += GET(code, 1);
2531     goto CHECK_CLASS_REPEAT;
2532 #endif
2533 
2534     case OP_CLASS:
2535     case OP_NCLASS:
2536     ccode = code + PRIV(OP_lengths)[OP_CLASS];
2537 
2538 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2539     CHECK_CLASS_REPEAT:
2540 #endif
2541 
2542     switch (*ccode)
2543       {
2544       case OP_CRSTAR:            /* These could be empty; continue */
2545       case OP_CRMINSTAR:
2546       case OP_CRQUERY:
2547       case OP_CRMINQUERY:
2548       case OP_CRPOSSTAR:
2549       case OP_CRPOSQUERY:
2550       break;
2551 
2552       default:                   /* Non-repeat => class must match */
2553       case OP_CRPLUS:            /* These repeats aren't empty */
2554       case OP_CRMINPLUS:
2555       case OP_CRPOSPLUS:
2556       return FALSE;
2557 
2558       case OP_CRRANGE:
2559       case OP_CRMINRANGE:
2560       case OP_CRPOSRANGE:
2561       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
2562       break;
2563       }
2564     break;
2565 
2566     /* Opcodes that must match a character */
2567 
2568     case OP_ANY:
2569     case OP_ALLANY:
2570     case OP_ANYBYTE:
2571 
2572     case OP_PROP:
2573     case OP_NOTPROP:
2574     case OP_ANYNL:
2575 
2576     case OP_NOT_HSPACE:
2577     case OP_HSPACE:
2578     case OP_NOT_VSPACE:
2579     case OP_VSPACE:
2580     case OP_EXTUNI:
2581 
2582     case OP_NOT_DIGIT:
2583     case OP_DIGIT:
2584     case OP_NOT_WHITESPACE:
2585     case OP_WHITESPACE:
2586     case OP_NOT_WORDCHAR:
2587     case OP_WORDCHAR:
2588 
2589     case OP_CHAR:
2590     case OP_CHARI:
2591     case OP_NOT:
2592     case OP_NOTI:
2593 
2594     case OP_PLUS:
2595     case OP_PLUSI:
2596     case OP_MINPLUS:
2597     case OP_MINPLUSI:
2598 
2599     case OP_NOTPLUS:
2600     case OP_NOTPLUSI:
2601     case OP_NOTMINPLUS:
2602     case OP_NOTMINPLUSI:
2603 
2604     case OP_POSPLUS:
2605     case OP_POSPLUSI:
2606     case OP_NOTPOSPLUS:
2607     case OP_NOTPOSPLUSI:
2608 
2609     case OP_EXACT:
2610     case OP_EXACTI:
2611     case OP_NOTEXACT:
2612     case OP_NOTEXACTI:
2613 
2614     case OP_TYPEPLUS:
2615     case OP_TYPEMINPLUS:
2616     case OP_TYPEPOSPLUS:
2617     case OP_TYPEEXACT:
2618 
2619     return FALSE;
2620 
2621     /* These are going to continue, as they may be empty, but we have to
2622     fudge the length for the \p and \P cases. */
2623 
2624     case OP_TYPESTAR:
2625     case OP_TYPEMINSTAR:
2626     case OP_TYPEPOSSTAR:
2627     case OP_TYPEQUERY:
2628     case OP_TYPEMINQUERY:
2629     case OP_TYPEPOSQUERY:
2630     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2631     break;
2632 
2633     /* Same for these */
2634 
2635     case OP_TYPEUPTO:
2636     case OP_TYPEMINUPTO:
2637     case OP_TYPEPOSUPTO:
2638     if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2639       code += 2;
2640     break;
2641 
2642     /* End of branch */
2643 
2644     case OP_KET:
2645     case OP_KETRMAX:
2646     case OP_KETRMIN:
2647     case OP_KETRPOS:
2648     case OP_ALT:
2649     return TRUE;
2650 
2651     /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2652     MINUPTO, and POSUPTO and their caseless and negative versions may be
2653     followed by a multibyte character. */
2654 
2655 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2656     case OP_STAR:
2657     case OP_STARI:
2658     case OP_NOTSTAR:
2659     case OP_NOTSTARI:
2660 
2661     case OP_MINSTAR:
2662     case OP_MINSTARI:
2663     case OP_NOTMINSTAR:
2664     case OP_NOTMINSTARI:
2665 
2666     case OP_POSSTAR:
2667     case OP_POSSTARI:
2668     case OP_NOTPOSSTAR:
2669     case OP_NOTPOSSTARI:
2670 
2671     case OP_QUERY:
2672     case OP_QUERYI:
2673     case OP_NOTQUERY:
2674     case OP_NOTQUERYI:
2675 
2676     case OP_MINQUERY:
2677     case OP_MINQUERYI:
2678     case OP_NOTMINQUERY:
2679     case OP_NOTMINQUERYI:
2680 
2681     case OP_POSQUERY:
2682     case OP_POSQUERYI:
2683     case OP_NOTPOSQUERY:
2684     case OP_NOTPOSQUERYI:
2685 
2686     if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2687     break;
2688 
2689     case OP_UPTO:
2690     case OP_UPTOI:
2691     case OP_NOTUPTO:
2692     case OP_NOTUPTOI:
2693 
2694     case OP_MINUPTO:
2695     case OP_MINUPTOI:
2696     case OP_NOTMINUPTO:
2697     case OP_NOTMINUPTOI:
2698 
2699     case OP_POSUPTO:
2700     case OP_POSUPTOI:
2701     case OP_NOTPOSUPTO:
2702     case OP_NOTPOSUPTOI:
2703 
2704     if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2705     break;
2706 #endif
2707 
2708     /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2709     string. */
2710 
2711     case OP_MARK:
2712     case OP_PRUNE_ARG:
2713     case OP_SKIP_ARG:
2714     case OP_THEN_ARG:
2715     code += code[1];
2716     break;
2717 
2718     /* None of the remaining opcodes are required to match a character. */
2719 
2720     default:
2721     break;
2722     }
2723   }
2724 
2725 return TRUE;
2726 }
2727 
2728 
2729 
2730 /*************************************************
2731 *    Scan compiled regex for non-emptiness       *
2732 *************************************************/
2733 
2734 /* This function is called to check for left recursive calls. We want to check
2735 the current branch of the current pattern to see if it could match the empty
2736 string. If it could, we must look outwards for branches at other levels,
2737 stopping when we pass beyond the bracket which is the subject of the recursion.
2738 This function is called only during the real compile, not during the
2739 pre-compile.
2740 
2741 Arguments:
2742   code        points to start of the recursion
2743   endcode     points to where to stop (current RECURSE item)
2744   bcptr       points to the chain of current (unclosed) branch starts
2745   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2746   cd          pointers to tables etc
2747 
2748 Returns:      TRUE if what is matched could be empty
2749 */
2750 
2751 static BOOL
could_be_empty(const pcre_uchar * code,const pcre_uchar * endcode,branch_chain * bcptr,BOOL utf,compile_data * cd)2752 could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2753   branch_chain *bcptr, BOOL utf, compile_data *cd)
2754 {
2755 while (bcptr != NULL && bcptr->current_branch >= code)
2756   {
2757   if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd, NULL))
2758     return FALSE;
2759   bcptr = bcptr->outer;
2760   }
2761 return TRUE;
2762 }
2763 
2764 
2765 
2766 /*************************************************
2767 *        Base opcode of repeated opcodes         *
2768 *************************************************/
2769 
2770 /* Returns the base opcode for repeated single character type opcodes. If the
2771 opcode is not a repeated character type, it returns with the original value.
2772 
2773 Arguments:  c opcode
2774 Returns:    base opcode for the type
2775 */
2776 
2777 static pcre_uchar
get_repeat_base(pcre_uchar c)2778 get_repeat_base(pcre_uchar c)
2779 {
2780 return (c > OP_TYPEPOSUPTO)? c :
2781        (c >= OP_TYPESTAR)?   OP_TYPESTAR :
2782        (c >= OP_NOTSTARI)?   OP_NOTSTARI :
2783        (c >= OP_NOTSTAR)?    OP_NOTSTAR :
2784        (c >= OP_STARI)?      OP_STARI :
2785                              OP_STAR;
2786 }
2787 
2788 
2789 
#ifdef SUPPORT_UCP
/*************************************************
*        Check a character and a property        *
*************************************************/

/* This function is called by check_auto_possessive() when a property item
is adjacent to a fixed character. For most property types the character is
tested for membership of the property, and the outcome is compared with the
"negated" flag. An unrecognized property type gives FALSE.

Arguments:
  c            the character
  ptype        the property type
  pdata        the data for the type
  negated      TRUE if it's a negated property (\P or \p{^)

Returns:       TRUE if auto-possessifying is OK
*/

static BOOL
check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
  BOOL negated)
{
const ucd_record *rec = GET_UCD(c);
const pcre_uint32 *entry;
BOOL member;

switch(ptype)
  {
  case PT_LAMP:       /* Lu, Ll or Lt */
  member = rec->chartype == ucp_Lu ||
           rec->chartype == ucp_Ll ||
           rec->chartype == ucp_Lt;
  break;

  case PT_GC:         /* General category */
  member = pdata == PRIV(ucp_gentype)[rec->chartype];
  break;

  case PT_PC:         /* Particular category */
  member = pdata == rec->chartype;
  break;

  case PT_SC:         /* Script */
  member = pdata == rec->script;
  break;

  /* The remaining types are PCRE specials. */

  case PT_ALNUM:
  member = PRIV(ucp_gentype)[rec->chartype] == ucp_L ||
           PRIV(ucp_gentype)[rec->chartype] == ucp_N;
  break;

  /* Perl space used to exclude VT, but from Perl 5.18 it is included, so
  Perl space and POSIX space are now identical. PCRE was changed at release
  8.34. The explicitly listed horizontal and vertical space characters give
  "negated" directly; anything else is judged on the Z general category. */

  case PT_SPACE:      /* Perl space */
  case PT_PXSPACE:    /* POSIX space */
  switch(c)
    {
    HSPACE_CASES:
    VSPACE_CASES:
    return negated;

    default:
    break;
    }
  member = PRIV(ucp_gentype)[rec->chartype] == ucp_Z;
  break;

  case PT_WORD:
  member = PRIV(ucp_gentype)[rec->chartype] == ucp_L ||
           PRIV(ucp_gentype)[rec->chartype] == ucp_N ||
           c == CHAR_UNDERSCORE;
  break;

  /* Caseless set: step along the set, starting at this character's caseset
  offset, past every entry smaller than c, then see whether the entry
  reached is c itself. */

  case PT_CLIST:
  entry = PRIV(ucd_caseless_sets) + rec->caseset;
  while (c > *entry) entry++;
  return (c == *entry)? negated : !negated;

  default:
  return FALSE;
  }

return member == negated;
}
#endif  /* SUPPORT_UCP */
2871 
2872 
2873 
2874 /*************************************************
2875 *        Fill the character property list        *
2876 *************************************************/
2877 
2878 /* Checks whether the code points to an opcode that can take part in auto-
2879 possessification, and if so, fills a list with its properties.
2880 
2881 Arguments:
2882   code        points to start of expression
2883   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2884   fcc         points to case-flipping table
2885   list        points to output list
2886               list[0] will be filled with the opcode
2887               list[1] will be non-zero if this opcode
2888                 can match an empty character string
2889               list[2..7] depends on the opcode
2890 
2891 Returns:      points to the start of the next opcode if *code is accepted
2892               NULL if *code is not accepted
2893 */
2894 
2895 static const pcre_uchar *
get_chr_property_list(const pcre_uchar * code,BOOL utf,const pcre_uint8 * fcc,pcre_uint32 * list)2896 get_chr_property_list(const pcre_uchar *code, BOOL utf,
2897   const pcre_uint8 *fcc, pcre_uint32 *list)
2898 {
2899 pcre_uchar c = *code;
2900 pcre_uchar base;
2901 const pcre_uchar *end;
2902 pcre_uint32 chr;
2903 
2904 #ifdef SUPPORT_UCP
2905 pcre_uint32 *clist_dest;
2906 const pcre_uint32 *clist_src;
2907 #else
2908 utf = utf;  /* Suppress "unused parameter" compiler warning */
2909 #endif
2910 
2911 list[0] = c;
2912 list[1] = FALSE;
2913 code++;
2914 
2915 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
2916   {
2917   base = get_repeat_base(c);
2918   c -= (base - OP_STAR);
2919 
2920   if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
2921     code += IMM2_SIZE;
2922 
2923   list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
2924 
2925   switch(base)
2926     {
2927     case OP_STAR:
2928     list[0] = OP_CHAR;
2929     break;
2930 
2931     case OP_STARI:
2932     list[0] = OP_CHARI;
2933     break;
2934 
2935     case OP_NOTSTAR:
2936     list[0] = OP_NOT;
2937     break;
2938 
2939     case OP_NOTSTARI:
2940     list[0] = OP_NOTI;
2941     break;
2942 
2943     case OP_TYPESTAR:
2944     list[0] = *code;
2945     code++;
2946     break;
2947     }
2948   c = list[0];
2949   }
2950 
2951 switch(c)
2952   {
2953   case OP_NOT_DIGIT:
2954   case OP_DIGIT:
2955   case OP_NOT_WHITESPACE:
2956   case OP_WHITESPACE:
2957   case OP_NOT_WORDCHAR:
2958   case OP_WORDCHAR:
2959   case OP_ANY:
2960   case OP_ALLANY:
2961   case OP_ANYNL:
2962   case OP_NOT_HSPACE:
2963   case OP_HSPACE:
2964   case OP_NOT_VSPACE:
2965   case OP_VSPACE:
2966   case OP_EXTUNI:
2967   case OP_EODN:
2968   case OP_EOD:
2969   case OP_DOLL:
2970   case OP_DOLLM:
2971   return code;
2972 
2973   case OP_CHAR:
2974   case OP_NOT:
2975   GETCHARINCTEST(chr, code);
2976   list[2] = chr;
2977   list[3] = NOTACHAR;
2978   return code;
2979 
2980   case OP_CHARI:
2981   case OP_NOTI:
2982   list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
2983   GETCHARINCTEST(chr, code);
2984   list[2] = chr;
2985 
2986 #ifdef SUPPORT_UCP
2987   if (chr < 128 || (chr < 256 && !utf))
2988     list[3] = fcc[chr];
2989   else
2990     list[3] = UCD_OTHERCASE(chr);
2991 #elif defined SUPPORT_UTF || !defined COMPILE_PCRE8
2992   list[3] = (chr < 256) ? fcc[chr] : chr;
2993 #else
2994   list[3] = fcc[chr];
2995 #endif
2996 
2997   /* The othercase might be the same value. */
2998 
2999   if (chr == list[3])
3000     list[3] = NOTACHAR;
3001   else
3002     list[4] = NOTACHAR;
3003   return code;
3004 
3005 #ifdef SUPPORT_UCP
3006   case OP_PROP:
3007   case OP_NOTPROP:
3008   if (code[0] != PT_CLIST)
3009     {
3010     list[2] = code[0];
3011     list[3] = code[1];
3012     return code + 2;
3013     }
3014 
3015   /* Convert only if we have enough space. */
3016 
3017   clist_src = PRIV(ucd_caseless_sets) + code[1];
3018   clist_dest = list + 2;
3019   code += 2;
3020 
3021   do {
3022      if (clist_dest >= list + 8)
3023        {
3024        /* Early return if there is not enough space. This should never
3025        happen, since all clists are shorter than 5 character now. */
3026        list[2] = code[0];
3027        list[3] = code[1];
3028        return code;
3029        }
3030      *clist_dest++ = *clist_src;
3031      }
3032   while(*clist_src++ != NOTACHAR);
3033 
3034   /* All characters are stored. The terminating NOTACHAR
3035   is copied form the clist itself. */
3036 
3037   list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
3038   return code;
3039 #endif
3040 
3041   case OP_NCLASS:
3042   case OP_CLASS:
3043 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3044   case OP_XCLASS:
3045   if (c == OP_XCLASS)
3046     end = code + GET(code, 0) - 1;
3047   else
3048 #endif
3049     end = code + 32 / sizeof(pcre_uchar);
3050 
3051   switch(*end)
3052     {
3053     case OP_CRSTAR:
3054     case OP_CRMINSTAR:
3055     case OP_CRQUERY:
3056     case OP_CRMINQUERY:
3057     case OP_CRPOSSTAR:
3058     case OP_CRPOSQUERY:
3059     list[1] = TRUE;
3060     end++;
3061     break;
3062 
3063     case OP_CRPLUS:
3064     case OP_CRMINPLUS:
3065     case OP_CRPOSPLUS:
3066     end++;
3067     break;
3068 
3069     case OP_CRRANGE:
3070     case OP_CRMINRANGE:
3071     case OP_CRPOSRANGE:
3072     list[1] = (GET2(end, 1) == 0);
3073     end += 1 + 2 * IMM2_SIZE;
3074     break;
3075     }
3076   list[2] = (pcre_uint32)(end - code);
3077   return end;
3078   }
3079 return NULL;    /* Opcode not accepted */
3080 }
3081 
3082 
3083 
3084 /*************************************************
3085 *    Scan further character sets for match       *
3086 *************************************************/
3087 
3088 /* Checks whether the base and the current opcode have a common character, in
3089 which case the base cannot be possessified.
3090 
3091 Arguments:
3092   code        points to the byte code
3093   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
3094   cd          static compile data
3095   base_list   the data list of the base opcode
3096 
3097 Returns:      TRUE if the auto-possessification is possible
3098 */
3099 
3100 static BOOL
compare_opcodes(const pcre_uchar * code,BOOL utf,const compile_data * cd,const pcre_uint32 * base_list,const pcre_uchar * base_end,int * rec_limit)3101 compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
3102   const pcre_uint32 *base_list, const pcre_uchar *base_end, int *rec_limit)
3103 {
3104 pcre_uchar c;
3105 pcre_uint32 list[8];
3106 const pcre_uint32 *chr_ptr;
3107 const pcre_uint32 *ochr_ptr;
3108 const pcre_uint32 *list_ptr;
3109 const pcre_uchar *next_code;
3110 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3111 const pcre_uchar *xclass_flags;
3112 #endif
3113 const pcre_uint8 *class_bitset;
3114 const pcre_uint8 *set1, *set2, *set_end;
3115 pcre_uint32 chr;
3116 BOOL accepted, invert_bits;
3117 BOOL entered_a_group = FALSE;
3118 
3119 if (*rec_limit == 0) return FALSE;
3120 --(*rec_limit);
3121 
3122 /* Note: the base_list[1] contains whether the current opcode has greedy
3123 (represented by a non-zero value) quantifier. This is a different from
3124 other character type lists, which stores here that the character iterator
3125 matches to an empty string (also represented by a non-zero value). */
3126 
3127 for(;;)
3128   {
3129   /* All operations move the code pointer forward.
3130   Therefore infinite recursions are not possible. */
3131 
3132   c = *code;
3133 
3134   /* Skip over callouts */
3135 
3136   if (c == OP_CALLOUT)
3137     {
3138     code += PRIV(OP_lengths)[c];
3139     continue;
3140     }
3141 
3142   if (c == OP_ALT)
3143     {
3144     do code += GET(code, 1); while (*code == OP_ALT);
3145     c = *code;
3146     }
3147 
3148   switch(c)
3149     {
3150     case OP_END:
3151     case OP_KETRPOS:
3152     /* TRUE only in greedy case. The non-greedy case could be replaced by
3153     an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
3154     uses more memory, which we cannot get at this stage.) */
3155 
3156     return base_list[1] != 0;
3157 
3158     case OP_KET:
3159     /* If the bracket is capturing, and referenced by an OP_RECURSE, or
3160     it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
3161     cannot be converted to a possessive form. */
3162 
3163     if (base_list[1] == 0) return FALSE;
3164 
3165     switch(*(code - GET(code, 1)))
3166       {
3167       case OP_ASSERT:
3168       case OP_ASSERT_NOT:
3169       case OP_ASSERTBACK:
3170       case OP_ASSERTBACK_NOT:
3171       case OP_ONCE:
3172       case OP_ONCE_NC:
3173       /* Atomic sub-patterns and assertions can always auto-possessify their
3174       last iterator. However, if the group was entered as a result of checking
3175       a previous iterator, this is not possible. */
3176 
3177       return !entered_a_group;
3178       }
3179 
3180     code += PRIV(OP_lengths)[c];
3181     continue;
3182 
3183     case OP_ONCE:
3184     case OP_ONCE_NC:
3185     case OP_BRA:
3186     case OP_CBRA:
3187     next_code = code + GET(code, 1);
3188     code += PRIV(OP_lengths)[c];
3189 
3190     while (*next_code == OP_ALT)
3191       {
3192       if (!compare_opcodes(code, utf, cd, base_list, base_end, rec_limit))
3193         return FALSE;
3194       code = next_code + 1 + LINK_SIZE;
3195       next_code += GET(next_code, 1);
3196       }
3197 
3198     entered_a_group = TRUE;
3199     continue;
3200 
3201     case OP_BRAZERO:
3202     case OP_BRAMINZERO:
3203 
3204     next_code = code + 1;
3205     if (*next_code != OP_BRA && *next_code != OP_CBRA
3206         && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
3207 
3208     do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3209 
3210     /* The bracket content will be checked by the
3211     OP_BRA/OP_CBRA case above. */
3212     next_code += 1 + LINK_SIZE;
3213     if (!compare_opcodes(next_code, utf, cd, base_list, base_end, rec_limit))
3214       return FALSE;
3215 
3216     code += PRIV(OP_lengths)[c];
3217     continue;
3218 
3219     default:
3220     break;
3221     }
3222 
3223   /* Check for a supported opcode, and load its properties. */
3224 
3225   code = get_chr_property_list(code, utf, cd->fcc, list);
3226   if (code == NULL) return FALSE;    /* Unsupported */
3227 
3228   /* If either opcode is a small character list, set pointers for comparing
3229   characters from that list with another list, or with a property. */
3230 
3231   if (base_list[0] == OP_CHAR)
3232     {
3233     chr_ptr = base_list + 2;
3234     list_ptr = list;
3235     }
3236   else if (list[0] == OP_CHAR)
3237     {
3238     chr_ptr = list + 2;
3239     list_ptr = base_list;
3240     }
3241 
3242   /* Character bitsets can also be compared to certain opcodes. */
3243 
3244   else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
3245 #ifdef COMPILE_PCRE8
3246       /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
3247       || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
3248 #endif
3249       )
3250     {
3251 #ifdef COMPILE_PCRE8
3252     if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
3253 #else
3254     if (base_list[0] == OP_CLASS)
3255 #endif
3256       {
3257       set1 = (pcre_uint8 *)(base_end - base_list[2]);
3258       list_ptr = list;
3259       }
3260     else
3261       {
3262       set1 = (pcre_uint8 *)(code - list[2]);
3263       list_ptr = base_list;
3264       }
3265 
3266     invert_bits = FALSE;
3267     switch(list_ptr[0])
3268       {
3269       case OP_CLASS:
3270       case OP_NCLASS:
3271       set2 = (pcre_uint8 *)
3272         ((list_ptr == list ? code : base_end) - list_ptr[2]);
3273       break;
3274 
3275 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3276       case OP_XCLASS:
3277       xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE;
3278       if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;
3279       if ((*xclass_flags & XCL_MAP) == 0)
3280         {
3281         /* No bits are set for characters < 256. */
3282         if (list[1] == 0) return TRUE;
3283         /* Might be an empty repeat. */
3284         continue;
3285         }
3286       set2 = (pcre_uint8 *)(xclass_flags + 1);
3287       break;
3288 #endif
3289 
3290       case OP_NOT_DIGIT:
3291       invert_bits = TRUE;
3292       /* Fall through */
3293       case OP_DIGIT:
3294       set2 = (pcre_uint8 *)(cd->cbits + cbit_digit);
3295       break;
3296 
3297       case OP_NOT_WHITESPACE:
3298       invert_bits = TRUE;
3299       /* Fall through */
3300       case OP_WHITESPACE:
3301       set2 = (pcre_uint8 *)(cd->cbits + cbit_space);
3302       break;
3303 
3304       case OP_NOT_WORDCHAR:
3305       invert_bits = TRUE;
3306       /* Fall through */
3307       case OP_WORDCHAR:
3308       set2 = (pcre_uint8 *)(cd->cbits + cbit_word);
3309       break;
3310 
3311       default:
3312       return FALSE;
3313       }
3314 
3315     /* Because the sets are unaligned, we need
3316     to perform byte comparison here. */
3317     set_end = set1 + 32;
3318     if (invert_bits)
3319       {
3320       do
3321         {
3322         if ((*set1++ & ~(*set2++)) != 0) return FALSE;
3323         }
3324       while (set1 < set_end);
3325       }
3326     else
3327       {
3328       do
3329         {
3330         if ((*set1++ & *set2++) != 0) return FALSE;
3331         }
3332       while (set1 < set_end);
3333       }
3334 
3335     if (list[1] == 0) return TRUE;
3336     /* Might be an empty repeat. */
3337     continue;
3338     }
3339 
3340   /* Some property combinations also acceptable. Unicode property opcodes are
3341   processed specially; the rest can be handled with a lookup table. */
3342 
3343   else
3344     {
3345     pcre_uint32 leftop, rightop;
3346 
3347     leftop = base_list[0];
3348     rightop = list[0];
3349 
3350 #ifdef SUPPORT_UCP
3351     accepted = FALSE; /* Always set in non-unicode case. */
3352     if (leftop == OP_PROP || leftop == OP_NOTPROP)
3353       {
3354       if (rightop == OP_EOD)
3355         accepted = TRUE;
3356       else if (rightop == OP_PROP || rightop == OP_NOTPROP)
3357         {
3358         int n;
3359         const pcre_uint8 *p;
3360         BOOL same = leftop == rightop;
3361         BOOL lisprop = leftop == OP_PROP;
3362         BOOL risprop = rightop == OP_PROP;
3363         BOOL bothprop = lisprop && risprop;
3364 
3365         /* There's a table that specifies how each combination is to be
3366         processed:
3367           0   Always return FALSE (never auto-possessify)
3368           1   Character groups are distinct (possessify if both are OP_PROP)
3369           2   Check character categories in the same group (general or particular)
3370           3   Return TRUE if the two opcodes are not the same
3371           ... see comments below
3372         */
3373 
3374         n = propposstab[base_list[2]][list[2]];
3375         switch(n)
3376           {
3377           case 0: break;
3378           case 1: accepted = bothprop; break;
3379           case 2: accepted = (base_list[3] == list[3]) != same; break;
3380           case 3: accepted = !same; break;
3381 
3382           case 4:  /* Left general category, right particular category */
3383           accepted = risprop && catposstab[base_list[3]][list[3]] == same;
3384           break;
3385 
3386           case 5:  /* Right general category, left particular category */
3387           accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
3388           break;
3389 
3390           /* This code is logically tricky. Think hard before fiddling with it.
3391           The posspropstab table has four entries per row. Each row relates to
3392           one of PCRE's special properties such as ALNUM or SPACE or WORD.
3393           Only WORD actually needs all four entries, but using repeats for the
3394           others means they can all use the same code below.
3395 
3396           The first two entries in each row are Unicode general categories, and
3397           apply always, because all the characters they include are part of the
3398           PCRE character set. The third and fourth entries are a general and a
3399           particular category, respectively, that include one or more relevant
3400           characters. One or the other is used, depending on whether the check
3401           is for a general or a particular category. However, in both cases the
3402           category contains more characters than the specials that are defined
3403           for the property being tested against. Therefore, it cannot be used
3404           in a NOTPROP case.
3405 
3406           Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3407           Underscore is covered by ucp_P or ucp_Po. */
3408 
3409           case 6:  /* Left alphanum vs right general category */
3410           case 7:  /* Left space vs right general category */
3411           case 8:  /* Left word vs right general category */
3412           p = posspropstab[n-6];
3413           accepted = risprop && lisprop ==
3414             (list[3] != p[0] &&
3415              list[3] != p[1] &&
3416             (list[3] != p[2] || !lisprop));
3417           break;
3418 
3419           case 9:   /* Right alphanum vs left general category */
3420           case 10:  /* Right space vs left general category */
3421           case 11:  /* Right word vs left general category */
3422           p = posspropstab[n-9];
3423           accepted = lisprop && risprop ==
3424             (base_list[3] != p[0] &&
3425              base_list[3] != p[1] &&
3426             (base_list[3] != p[2] || !risprop));
3427           break;
3428 
3429           case 12:  /* Left alphanum vs right particular category */
3430           case 13:  /* Left space vs right particular category */
3431           case 14:  /* Left word vs right particular category */
3432           p = posspropstab[n-12];
3433           accepted = risprop && lisprop ==
3434             (catposstab[p[0]][list[3]] &&
3435              catposstab[p[1]][list[3]] &&
3436             (list[3] != p[3] || !lisprop));
3437           break;
3438 
3439           case 15:  /* Right alphanum vs left particular category */
3440           case 16:  /* Right space vs left particular category */
3441           case 17:  /* Right word vs left particular category */
3442           p = posspropstab[n-15];
3443           accepted = lisprop && risprop ==
3444             (catposstab[p[0]][base_list[3]] &&
3445              catposstab[p[1]][base_list[3]] &&
3446             (base_list[3] != p[3] || !risprop));
3447           break;
3448           }
3449         }
3450       }
3451 
3452     else
3453 #endif  /* SUPPORT_UCP */
3454 
3455     accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3456            rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3457            autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3458 
3459     if (!accepted) return FALSE;
3460 
3461     if (list[1] == 0) return TRUE;
3462     /* Might be an empty repeat. */
3463     continue;
3464     }
3465 
3466   /* Control reaches here only if one of the items is a small character list.
3467   All characters are checked against the other side. */
3468 
3469   do
3470     {
3471     chr = *chr_ptr;
3472 
3473     switch(list_ptr[0])
3474       {
3475       case OP_CHAR:
3476       ochr_ptr = list_ptr + 2;
3477       do
3478         {
3479         if (chr == *ochr_ptr) return FALSE;
3480         ochr_ptr++;
3481         }
3482       while(*ochr_ptr != NOTACHAR);
3483       break;
3484 
3485       case OP_NOT:
3486       ochr_ptr = list_ptr + 2;
3487       do
3488         {
3489         if (chr == *ochr_ptr)
3490           break;
3491         ochr_ptr++;
3492         }
3493       while(*ochr_ptr != NOTACHAR);
3494       if (*ochr_ptr == NOTACHAR) return FALSE;   /* Not found */
3495       break;
3496 
3497       /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not*
3498       set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3499 
3500       case OP_DIGIT:
3501       if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE;
3502       break;
3503 
3504       case OP_NOT_DIGIT:
3505       if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE;
3506       break;
3507 
3508       case OP_WHITESPACE:
3509       if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE;
3510       break;
3511 
3512       case OP_NOT_WHITESPACE:
3513       if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE;
3514       break;
3515 
3516       case OP_WORDCHAR:
3517       if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE;
3518       break;
3519 
3520       case OP_NOT_WORDCHAR:
3521       if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE;
3522       break;
3523 
3524       case OP_HSPACE:
3525       switch(chr)
3526         {
3527         HSPACE_CASES: return FALSE;
3528         default: break;
3529         }
3530       break;
3531 
3532       case OP_NOT_HSPACE:
3533       switch(chr)
3534         {
3535         HSPACE_CASES: break;
3536         default: return FALSE;
3537         }
3538       break;
3539 
3540       case OP_ANYNL:
3541       case OP_VSPACE:
3542       switch(chr)
3543         {
3544         VSPACE_CASES: return FALSE;
3545         default: break;
3546         }
3547       break;
3548 
3549       case OP_NOT_VSPACE:
3550       switch(chr)
3551         {
3552         VSPACE_CASES: break;
3553         default: return FALSE;
3554         }
3555       break;
3556 
3557       case OP_DOLL:
3558       case OP_EODN:
3559       switch (chr)
3560         {
3561         case CHAR_CR:
3562         case CHAR_LF:
3563         case CHAR_VT:
3564         case CHAR_FF:
3565         case CHAR_NEL:
3566 #ifndef EBCDIC
3567         case 0x2028:
3568         case 0x2029:
3569 #endif  /* Not EBCDIC */
3570         return FALSE;
3571         }
3572       break;
3573 
3574       case OP_EOD:    /* Can always possessify before \z */
3575       break;
3576 
3577 #ifdef SUPPORT_UCP
3578       case OP_PROP:
3579       case OP_NOTPROP:
3580       if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3581             list_ptr[0] == OP_NOTPROP))
3582         return FALSE;
3583       break;
3584 #endif
3585 
3586       case OP_NCLASS:
3587       if (chr > 255) return FALSE;
3588       /* Fall through */
3589 
3590       case OP_CLASS:
3591       if (chr > 255) break;
3592       class_bitset = (pcre_uint8 *)
3593         ((list_ptr == list ? code : base_end) - list_ptr[2]);
3594       if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE;
3595       break;
3596 
3597 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3598       case OP_XCLASS:
3599       if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
3600           list_ptr[2] + LINK_SIZE, utf)) return FALSE;
3601       break;
3602 #endif
3603 
3604       default:
3605       return FALSE;
3606       }
3607 
3608     chr_ptr++;
3609     }
3610   while(*chr_ptr != NOTACHAR);
3611 
3612   /* At least one character must be matched from this opcode. */
3613 
3614   if (list[1] == 0) return TRUE;
3615   }
3616 
3617 /* Control never reaches here. There used to be a fail-save return FALSE; here,
3618 but some compilers complain about an unreachable statement. */
3619 
3620 }
3621 
3622 
3623 
3624 /*************************************************
3625 *    Scan compiled regex for auto-possession     *
3626 *************************************************/
3627 
3628 /* Replaces single character iterations with their possessive alternatives
3629 if appropriate. This function modifies the compiled opcode!
3630 
3631 Arguments:
3632   code        points to start of the byte code
3633   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
3634   cd          static compile data
3635 
3636 Returns:      nothing
3637 */
3638 
3639 static void
auto_possessify(pcre_uchar * code,BOOL utf,const compile_data * cd)3640 auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)
3641 {
3642 register pcre_uchar c;
3643 const pcre_uchar *end;
3644 pcre_uchar *repeat_opcode;
3645 pcre_uint32 list[8];
3646 int rec_limit;
3647 
3648 for (;;)
3649   {
3650   c = *code;
3651 
3652   /* When a pattern with bad UTF-8 encoding is compiled with NO_UTF_CHECK,
3653   it may compile without complaining, but may get into a loop here if the code
3654   pointer points to a bad value. This is, of course a documentated possibility,
3655   when NO_UTF_CHECK is set, so it isn't a bug, but we can detect this case and
3656   just give up on this optimization. */
3657 
3658   if (c >= OP_TABLE_LENGTH) return;
3659 
3660   if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
3661     {
3662     c -= get_repeat_base(c) - OP_STAR;
3663     end = (c <= OP_MINUPTO) ?
3664       get_chr_property_list(code, utf, cd->fcc, list) : NULL;
3665     list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
3666 
3667     rec_limit = 1000;
3668     if (end != NULL && compare_opcodes(end, utf, cd, list, end, &rec_limit))
3669       {
3670       switch(c)
3671         {
3672         case OP_STAR:
3673         *code += OP_POSSTAR - OP_STAR;
3674         break;
3675 
3676         case OP_MINSTAR:
3677         *code += OP_POSSTAR - OP_MINSTAR;
3678         break;
3679 
3680         case OP_PLUS:
3681         *code += OP_POSPLUS - OP_PLUS;
3682         break;
3683 
3684         case OP_MINPLUS:
3685         *code += OP_POSPLUS - OP_MINPLUS;
3686         break;
3687 
3688         case OP_QUERY:
3689         *code += OP_POSQUERY - OP_QUERY;
3690         break;
3691 
3692         case OP_MINQUERY:
3693         *code += OP_POSQUERY - OP_MINQUERY;
3694         break;
3695 
3696         case OP_UPTO:
3697         *code += OP_POSUPTO - OP_UPTO;
3698         break;
3699 
3700         case OP_MINUPTO:
3701         *code += OP_POSUPTO - OP_MINUPTO;
3702         break;
3703         }
3704       }
3705     c = *code;
3706     }
3707   else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
3708     {
3709 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3710     if (c == OP_XCLASS)
3711       repeat_opcode = code + GET(code, 1);
3712     else
3713 #endif
3714       repeat_opcode = code + 1 + (32 / sizeof(pcre_uchar));
3715 
3716     c = *repeat_opcode;
3717     if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
3718       {
3719       /* end must not be NULL. */
3720       end = get_chr_property_list(code, utf, cd->fcc, list);
3721 
3722       list[1] = (c & 1) == 0;
3723 
3724       rec_limit = 1000;
3725       if (compare_opcodes(end, utf, cd, list, end, &rec_limit))
3726         {
3727         switch (c)
3728           {
3729           case OP_CRSTAR:
3730           case OP_CRMINSTAR:
3731           *repeat_opcode = OP_CRPOSSTAR;
3732           break;
3733 
3734           case OP_CRPLUS:
3735           case OP_CRMINPLUS:
3736           *repeat_opcode = OP_CRPOSPLUS;
3737           break;
3738 
3739           case OP_CRQUERY:
3740           case OP_CRMINQUERY:
3741           *repeat_opcode = OP_CRPOSQUERY;
3742           break;
3743 
3744           case OP_CRRANGE:
3745           case OP_CRMINRANGE:
3746           *repeat_opcode = OP_CRPOSRANGE;
3747           break;
3748           }
3749         }
3750       }
3751     c = *code;
3752     }
3753 
3754   switch(c)
3755     {
3756     case OP_END:
3757     return;
3758 
3759     case OP_TYPESTAR:
3760     case OP_TYPEMINSTAR:
3761     case OP_TYPEPLUS:
3762     case OP_TYPEMINPLUS:
3763     case OP_TYPEQUERY:
3764     case OP_TYPEMINQUERY:
3765     case OP_TYPEPOSSTAR:
3766     case OP_TYPEPOSPLUS:
3767     case OP_TYPEPOSQUERY:
3768     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
3769     break;
3770 
3771     case OP_TYPEUPTO:
3772     case OP_TYPEMINUPTO:
3773     case OP_TYPEEXACT:
3774     case OP_TYPEPOSUPTO:
3775     if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
3776       code += 2;
3777     break;
3778 
3779 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3780     case OP_XCLASS:
3781     code += GET(code, 1);
3782     break;
3783 #endif
3784 
3785     case OP_MARK:
3786     case OP_PRUNE_ARG:
3787     case OP_SKIP_ARG:
3788     case OP_THEN_ARG:
3789     code += code[1];
3790     break;
3791     }
3792 
3793   /* Add in the fixed length from the table */
3794 
3795   code += PRIV(OP_lengths)[c];
3796 
3797   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
3798   a multi-byte character. The length in the table is a minimum, so we have to
3799   arrange to skip the extra bytes. */
3800 
3801 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
3802   if (utf) switch(c)
3803     {
3804     case OP_CHAR:
3805     case OP_CHARI:
3806     case OP_NOT:
3807     case OP_NOTI:
3808     case OP_STAR:
3809     case OP_MINSTAR:
3810     case OP_PLUS:
3811     case OP_MINPLUS:
3812     case OP_QUERY:
3813     case OP_MINQUERY:
3814     case OP_UPTO:
3815     case OP_MINUPTO:
3816     case OP_EXACT:
3817     case OP_POSSTAR:
3818     case OP_POSPLUS:
3819     case OP_POSQUERY:
3820     case OP_POSUPTO:
3821     case OP_STARI:
3822     case OP_MINSTARI:
3823     case OP_PLUSI:
3824     case OP_MINPLUSI:
3825     case OP_QUERYI:
3826     case OP_MINQUERYI:
3827     case OP_UPTOI:
3828     case OP_MINUPTOI:
3829     case OP_EXACTI:
3830     case OP_POSSTARI:
3831     case OP_POSPLUSI:
3832     case OP_POSQUERYI:
3833     case OP_POSUPTOI:
3834     case OP_NOTSTAR:
3835     case OP_NOTMINSTAR:
3836     case OP_NOTPLUS:
3837     case OP_NOTMINPLUS:
3838     case OP_NOTQUERY:
3839     case OP_NOTMINQUERY:
3840     case OP_NOTUPTO:
3841     case OP_NOTMINUPTO:
3842     case OP_NOTEXACT:
3843     case OP_NOTPOSSTAR:
3844     case OP_NOTPOSPLUS:
3845     case OP_NOTPOSQUERY:
3846     case OP_NOTPOSUPTO:
3847     case OP_NOTSTARI:
3848     case OP_NOTMINSTARI:
3849     case OP_NOTPLUSI:
3850     case OP_NOTMINPLUSI:
3851     case OP_NOTQUERYI:
3852     case OP_NOTMINQUERYI:
3853     case OP_NOTUPTOI:
3854     case OP_NOTMINUPTOI:
3855     case OP_NOTEXACTI:
3856     case OP_NOTPOSSTARI:
3857     case OP_NOTPOSPLUSI:
3858     case OP_NOTPOSQUERYI:
3859     case OP_NOTPOSUPTOI:
3860     if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
3861     break;
3862     }
3863 #else
3864   (void)(utf);  /* Keep compiler happy by referencing function argument */
3865 #endif
3866   }
3867 }
3868 
3869 
3870 
3871 /*************************************************
3872 *           Check for POSIX class syntax         *
3873 *************************************************/
3874 
3875 /* This function is called when the sequence "[:" or "[." or "[=" is
3876 encountered in a character class. It checks whether this is followed by a
3877 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
3878 reach an unescaped ']' without the special preceding character, return FALSE.
3879 
3880 Originally, this function only recognized a sequence of letters between the
3881 terminators, but it seems that Perl recognizes any sequence of characters,
3882 though of course unknown POSIX names are subsequently rejected. Perl gives an
3883 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
3884 didn't consider this to be a POSIX class. Likewise for [:1234:].
3885 
3886 The problem in trying to be exactly like Perl is in the handling of escapes. We
3887 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
3888 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
3889 below handles the special case of \], but does not try to do any other escape
3890 processing. This makes it different from Perl for cases such as [:l\ower:]
3891 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
3892 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
3893 I think.
3894 
3895 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
3896 It seems that the appearance of a nested POSIX class supersedes an apparent
3897 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
3898 a digit.
3899 
3900 In Perl, unescaped square brackets may also appear as part of class names. For
3901 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
3902 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
3903 seem right at all. PCRE does not allow closing square brackets in POSIX class
3904 names.
3905 
3906 Arguments:
3907   ptr      pointer to the initial [
3908   endptr   where to return the end pointer
3909 
3910 Returns:   TRUE or FALSE
3911 */
3912 
3913 static BOOL
check_posix_syntax(const pcre_uchar * ptr,const pcre_uchar ** endptr)3914 check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
3915 {
3916 pcre_uchar terminator;          /* Don't combine these lines; the Solaris cc */
3917 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
3918 for (++ptr; *ptr != CHAR_NULL; ptr++)
3919   {
3920   if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3921     ptr++;
3922   else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
3923   else
3924     {
3925     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3926       {
3927       *endptr = ptr;
3928       return TRUE;
3929       }
3930     if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
3931          (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3932           ptr[1] == CHAR_EQUALS_SIGN) &&
3933         check_posix_syntax(ptr, endptr))
3934       return FALSE;
3935     }
3936   }
3937 return FALSE;
3938 }
3939 
3940 
3941 
3942 
3943 /*************************************************
3944 *          Check POSIX class name                *
3945 *************************************************/
3946 
3947 /* This function is called to check the name given in a POSIX-style class entry
3948 such as [:alnum:].
3949 
3950 Arguments:
3951   ptr        points to the first letter
3952   len        the length of the name
3953 
3954 Returns:     a value representing the name, or -1 if unknown
3955 */
3956 
3957 static int
check_posix_name(const pcre_uchar * ptr,int len)3958 check_posix_name(const pcre_uchar *ptr, int len)
3959 {
3960 const char *pn = posix_names;
3961 register int yield = 0;
3962 while (posix_name_lengths[yield] != 0)
3963   {
3964   if (len == posix_name_lengths[yield] &&
3965     STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
3966   pn += posix_name_lengths[yield] + 1;
3967   yield++;
3968   }
3969 return -1;
3970 }
3971 
3972 
3973 /*************************************************
3974 *    Adjust OP_RECURSE items in repeated group   *
3975 *************************************************/
3976 
3977 /* OP_RECURSE items contain an offset from the start of the regex to the group
3978 that is referenced. This means that groups can be replicated for fixed
3979 repetition simply by copying (because the recursion is allowed to refer to
3980 earlier groups that are outside the current group). However, when a group is
3981 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
3982 inserted before it, after it has been compiled. This means that any OP_RECURSE
3983 items within it that refer to the group itself or any contained groups have to
3984 have their offsets adjusted. That one of the jobs of this function. Before it
3985 is called, the partially compiled regex must be temporarily terminated with
3986 OP_END.
3987 
3988 This function has been extended with the possibility of forward references for
3989 recursions and subroutine calls. It must also check the list of such references
3990 for the group we are dealing with. If it finds that one of the recursions in
3991 the current group is on this list, it adjusts the offset in the list, not the
3992 value in the reference (which is a group number).
3993 
3994 Arguments:
3995   group      points to the start of the group
3996   adjust     the amount by which the group is to be moved
3997   utf        TRUE in UTF-8 / UTF-16 / UTF-32 mode
3998   cd         contains pointers to tables etc.
3999   save_hwm_offset   the hwm forward reference offset at the start of the group
4000 
4001 Returns:     nothing
4002 */
4003 
4004 static void
adjust_recurse(pcre_uchar * group,int adjust,BOOL utf,compile_data * cd,size_t save_hwm_offset)4005 adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
4006   size_t save_hwm_offset)
4007 {
4008 pcre_uchar *ptr = group;
4009 
4010 while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
4011   {
4012   int offset;
4013   pcre_uchar *hc;
4014 
4015   /* See if this recursion is on the forward reference list. If so, adjust the
4016   reference. */
4017 
4018   for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
4019        hc += LINK_SIZE)
4020     {
4021     offset = (int)GET(hc, 0);
4022     if (cd->start_code + offset == ptr + 1)
4023       {
4024       PUT(hc, 0, offset + adjust);
4025       break;
4026       }
4027     }
4028 
4029   /* Otherwise, adjust the recursion offset if it's after the start of this
4030   group. */
4031 
4032   if (hc >= cd->hwm)
4033     {
4034     offset = (int)GET(ptr, 1);
4035     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
4036     }
4037 
4038   ptr += 1 + LINK_SIZE;
4039   }
4040 }
4041 
4042 
4043 
4044 /*************************************************
4045 *        Insert an automatic callout point       *
4046 *************************************************/
4047 
4048 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
4049 callout points before each pattern item.
4050 
4051 Arguments:
4052   code           current code pointer
4053   ptr            current pattern pointer
4054   cd             pointers to tables etc
4055 
4056 Returns:         new code pointer
4057 */
4058 
4059 static pcre_uchar *
auto_callout(pcre_uchar * code,const pcre_uchar * ptr,compile_data * cd)4060 auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
4061 {
4062 *code++ = OP_CALLOUT;
4063 *code++ = 255;
4064 PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
4065 PUT(code, LINK_SIZE, 0);                       /* Default length */
4066 return code + 2 * LINK_SIZE;
4067 }
4068 
4069 
4070 
4071 /*************************************************
4072 *         Complete a callout item                *
4073 *************************************************/
4074 
4075 /* A callout item contains the length of the next item in the pattern, which
4076 we can't fill in till after we have reached the relevant point. This is used
4077 for both automatic and manual callouts.
4078 
4079 Arguments:
4080   previous_callout   points to previous callout item
4081   ptr                current pattern pointer
4082   cd                 pointers to tables etc
4083 
4084 Returns:             nothing
4085 */
4086 
4087 static void
complete_callout(pcre_uchar * previous_callout,const pcre_uchar * ptr,compile_data * cd)4088 complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
4089 {
4090 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
4091 PUT(previous_callout, 2 + LINK_SIZE, length);
4092 }
4093 
4094 
4095 
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range, in UTF-8 mode
with UCP support. It searches up the characters, looking for ranges of
characters in the "other" case. Each call returns the next one, updating the
start address. A character with multiple other cases is returned on its own
with a special return value.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        -1 when no more
               0 when a range is returned
              >0 the CASESET offset for char with multiple other cases
                in this case, ocptr contains the original

When -1 is returned, nothing is written through cptr, ocptr or odptr. When a
value greater than zero is returned, odptr is not written.
*/

static int
get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
  pcre_uint32 *odptr)
{
pcre_uint32 c, othercase, next;
unsigned int co;

/* Find the first character that has an other case. If it has multiple other
cases, return its case offset value. */

for (c = *cptr; c <= d; c++)
  {
  if ((co = UCD_CASESET(c)) != 0)
    {
    *ocptr = c++;   /* Character that has the set */
    *cptr = c;      /* Rest of input range */
    return (int)co;
    }
  if ((othercase = UCD_OTHERCASE(c)) != c) break;
  }

if (c > d) return -1;  /* Reached end of range */

/* Found a character that has a single other case. Search for the end of the
range, which is either the end of the input range, or a character that has zero
or more than one other cases. The run continues only while the other case of
each successive character is exactly one greater than the previous one. */

*ocptr = othercase;
next = othercase + 1;

for (++c; c <= d; c++)
  {
  if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
  next++;
  }

*odptr = next - 1;     /* End of othercase range */
*cptr = c;             /* Rest of input range */
return 0;
}
#endif  /* SUPPORT_UCP */
4160 
4161 
4162 
4163 /*************************************************
4164 *        Add a character or range to a class     *
4165 *************************************************/
4166 
4167 /* This function packages up the logic of adding a character or range of
4168 characters to a class. The character values in the arguments will be within the
4169 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
4170 mutually recursive with the function immediately below.
4171 
4172 Arguments:
4173   classbits     the bit map for characters < 256
4174   uchardptr     points to the pointer for extra data
4175   options       the options word
4176   cd            contains pointers to tables etc.
4177   start         start of range character
4178   end           end of range character
4179 
4180 Returns:        the number of < 256 characters added
4181                 the pointer to extra data is updated
4182 */
4183 
4184 static int
add_to_class(pcre_uint8 * classbits,pcre_uchar ** uchardptr,int options,compile_data * cd,pcre_uint32 start,pcre_uint32 end)4185 add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4186   compile_data *cd, pcre_uint32 start, pcre_uint32 end)
4187 {
4188 pcre_uint32 c;
4189 pcre_uint32 classbits_end = (end <= 0xff ? end : 0xff);
4190 int n8 = 0;
4191 
4192 /* If caseless matching is required, scan the range and process alternate
4193 cases. In Unicode, there are 8-bit characters that have alternate cases that
4194 are greater than 255 and vice-versa. Sometimes we can just extend the original
4195 range. */
4196 
4197 if ((options & PCRE_CASELESS) != 0)
4198   {
4199 #ifdef SUPPORT_UCP
4200   if ((options & PCRE_UTF8) != 0)
4201     {
4202     int rc;
4203     pcre_uint32 oc, od;
4204 
4205     options &= ~PCRE_CASELESS;   /* Remove for recursive calls */
4206     c = start;
4207 
4208     while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
4209       {
4210       /* Handle a single character that has more than one other case. */
4211 
4212       if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
4213         PRIV(ucd_caseless_sets) + rc, oc);
4214 
4215       /* Do nothing if the other case range is within the original range. */
4216 
4217       else if (oc >= start && od <= end) continue;
4218 
4219       /* Extend the original range if there is overlap, noting that if oc < c, we
4220       can't have od > end because a subrange is always shorter than the basic
4221       range. Otherwise, use a recursive call to add the additional range. */
4222 
4223       else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
4224       else if (od > end && oc <= end + 1)
4225         {
4226         end = od;       /* Extend upwards */
4227         if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
4228         }
4229       else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
4230       }
4231     }
4232   else
4233 #endif  /* SUPPORT_UCP */
4234 
4235   /* Not UTF-mode, or no UCP */
4236 
4237   for (c = start; c <= classbits_end; c++)
4238     {
4239     SETBIT(classbits, cd->fcc[c]);
4240     n8++;
4241     }
4242   }
4243 
4244 /* Now handle the original range. Adjust the final value according to the bit
4245 length - this means that the same lists of (e.g.) horizontal spaces can be used
4246 in all cases. */
4247 
4248 #if defined COMPILE_PCRE8
4249 #ifdef SUPPORT_UTF
4250   if ((options & PCRE_UTF8) == 0)
4251 #endif
4252   if (end > 0xff) end = 0xff;
4253 
4254 #elif defined COMPILE_PCRE16
4255 #ifdef SUPPORT_UTF
4256   if ((options & PCRE_UTF16) == 0)
4257 #endif
4258   if (end > 0xffff) end = 0xffff;
4259 
4260 #endif /* COMPILE_PCRE[8|16] */
4261 
4262 /* Use the bitmap for characters < 256. Otherwise use extra data.*/
4263 
4264 for (c = start; c <= classbits_end; c++)
4265   {
4266   /* Regardless of start, c will always be <= 255. */
4267   SETBIT(classbits, c);
4268   n8++;
4269   }
4270 
4271 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4272 if (start <= 0xff) start = 0xff + 1;
4273 
4274 if (end >= start)
4275   {
4276   pcre_uchar *uchardata = *uchardptr;
4277 #ifdef SUPPORT_UTF
4278   if ((options & PCRE_UTF8) != 0)  /* All UTFs use the same flag bit */
4279     {
4280     if (start < end)
4281       {
4282       *uchardata++ = XCL_RANGE;
4283       uchardata += PRIV(ord2utf)(start, uchardata);
4284       uchardata += PRIV(ord2utf)(end, uchardata);
4285       }
4286     else if (start == end)
4287       {
4288       *uchardata++ = XCL_SINGLE;
4289       uchardata += PRIV(ord2utf)(start, uchardata);
4290       }
4291     }
4292   else
4293 #endif  /* SUPPORT_UTF */
4294 
4295   /* Without UTF support, character values are constrained by the bit length,
4296   and can only be > 256 for 16-bit and 32-bit libraries. */
4297 
4298 #ifdef COMPILE_PCRE8
4299     {}
4300 #else
4301   if (start < end)
4302     {
4303     *uchardata++ = XCL_RANGE;
4304     *uchardata++ = start;
4305     *uchardata++ = end;
4306     }
4307   else if (start == end)
4308     {
4309     *uchardata++ = XCL_SINGLE;
4310     *uchardata++ = start;
4311     }
4312 #endif
4313 
4314   *uchardptr = uchardata;   /* Updata extra data pointer */
4315   }
4316 #endif /* SUPPORT_UTF || !COMPILE_PCRE8 */
4317 
4318 return n8;    /* Number of 8-bit characters */
4319 }
4320 
4321 
4322 
4323 
4324 /*************************************************
4325 *        Add a list of characters to a class     *
4326 *************************************************/
4327 
4328 /* This function is used for adding a list of case-equivalent characters to a
4329 class, and also for adding a list of horizontal or vertical whitespace. If the
4330 list is in order (which it should be), ranges of characters are detected and
4331 handled appropriately. This function is mutually recursive with the function
4332 above.
4333 
4334 Arguments:
4335   classbits     the bit map for characters < 256
4336   uchardptr     points to the pointer for extra data
4337   options       the options word
4338   cd            contains pointers to tables etc.
4339   p             points to row of 32-bit values, terminated by NOTACHAR
4340   except        character to omit; this is used when adding lists of
4341                   case-equivalent characters to avoid including the one we
4342                   already know about
4343 
4344 Returns:        the number of < 256 characters added
4345                 the pointer to extra data is updated
4346 */
4347 
4348 static int
add_list_to_class(pcre_uint8 * classbits,pcre_uchar ** uchardptr,int options,compile_data * cd,const pcre_uint32 * p,unsigned int except)4349 add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4350   compile_data *cd, const pcre_uint32 *p, unsigned int except)
4351 {
4352 int n8 = 0;
4353 while (p[0] < NOTACHAR)
4354   {
4355   int n = 0;
4356   if (p[0] != except)
4357     {
4358     while(p[n+1] == p[0] + n + 1) n++;
4359     n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
4360     }
4361   p += n + 1;
4362   }
4363 return n8;
4364 }
4365 
4366 
4367 
4368 /*************************************************
4369 *    Add characters not in a list to a class     *
4370 *************************************************/
4371 
4372 /* This function is used for adding the complement of a list of horizontal or
4373 vertical whitespace to a class. The list must be in order.
4374 
4375 Arguments:
4376   classbits     the bit map for characters < 256
4377   uchardptr     points to the pointer for extra data
4378   options       the options word
4379   cd            contains pointers to tables etc.
4380   p             points to row of 32-bit values, terminated by NOTACHAR
4381 
4382 Returns:        the number of < 256 characters added
4383                 the pointer to extra data is updated
4384 */
4385 
4386 static int
add_not_list_to_class(pcre_uint8 * classbits,pcre_uchar ** uchardptr,int options,compile_data * cd,const pcre_uint32 * p)4387 add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
4388   int options, compile_data *cd, const pcre_uint32 *p)
4389 {
4390 BOOL utf = (options & PCRE_UTF8) != 0;
4391 int n8 = 0;
4392 if (p[0] > 0)
4393   n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
4394 while (p[0] < NOTACHAR)
4395   {
4396   while (p[1] == p[0] + 1) p++;
4397   n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
4398     (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
4399   p++;
4400   }
4401 return n8;
4402 }
4403 
4404 
4405 
4406 /*************************************************
4407 *           Compile one branch                   *
4408 *************************************************/
4409 
4410 /* Scan the pattern, compiling it into a vector. If the options are
4411 changed during the branch, the pointer is used to change the external options
4412 bits. This function is used during the pre-compile phase when we are trying
4413 to find out the amount of memory needed, as well as during the real compile
4414 phase. The value of lengthptr distinguishes the two phases.
4415 
4416 Arguments:
4417   optionsptr        pointer to the option bits
4418   codeptr           points to the pointer to the current code point
4419   ptrptr            points to the current pattern pointer
4420   errorcodeptr      points to error code variable
4421   firstcharptr      place to put the first required character
4422   firstcharflagsptr place to put the first character flags, or a negative number
4423   reqcharptr        place to put the last required character
4424   reqcharflagsptr   place to put the last required character flags, or a negative number
4425   bcptr             points to current branch chain
4426   cond_depth        conditional nesting depth
4427   cd                contains pointers to tables etc.
4428   lengthptr         NULL during the real compile phase
4429                     points to length accumulator during pre-compile phase
4430 
4431 Returns:            TRUE on success
4432                     FALSE, with *errorcodeptr set non-zero on error
4433 */
4434 
4435 static BOOL
compile_branch(int * optionsptr,pcre_uchar ** codeptr,const pcre_uchar ** ptrptr,int * errorcodeptr,pcre_uint32 * firstcharptr,pcre_int32 * firstcharflagsptr,pcre_uint32 * reqcharptr,pcre_int32 * reqcharflagsptr,branch_chain * bcptr,int cond_depth,compile_data * cd,int * lengthptr)4436 compile_branch(int *optionsptr, pcre_uchar **codeptr,
4437   const pcre_uchar **ptrptr, int *errorcodeptr,
4438   pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
4439   pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
4440   branch_chain *bcptr, int cond_depth,
4441   compile_data *cd, int *lengthptr)
4442 {
4443 int repeat_type, op_type;
4444 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
4445 int bravalue = 0;
4446 int greedy_default, greedy_non_default;
4447 pcre_uint32 firstchar, reqchar;
4448 pcre_int32 firstcharflags, reqcharflags;
4449 pcre_uint32 zeroreqchar, zerofirstchar;
4450 pcre_int32 zeroreqcharflags, zerofirstcharflags;
4451 pcre_int32 req_caseopt, reqvary, tempreqvary;
4452 int options = *optionsptr;               /* May change dynamically */
4453 int after_manual_callout = 0;
4454 int length_prevgroup = 0;
4455 register pcre_uint32 c;
4456 int escape;
4457 register pcre_uchar *code = *codeptr;
4458 pcre_uchar *last_code = code;
4459 pcre_uchar *orig_code = code;
4460 pcre_uchar *tempcode;
4461 BOOL inescq = FALSE;
4462 BOOL groupsetfirstchar = FALSE;
4463 const pcre_uchar *ptr = *ptrptr;
4464 const pcre_uchar *tempptr;
4465 const pcre_uchar *nestptr = NULL;
4466 pcre_uchar *previous = NULL;
4467 pcre_uchar *previous_callout = NULL;
4468 size_t save_hwm_offset = 0;
4469 pcre_uint8 classbits[32];
4470 
4471 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
4472 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
4473 dynamically as we process the pattern. */
4474 
4475 #ifdef SUPPORT_UTF
4476 /* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
4477 BOOL utf = (options & PCRE_UTF8) != 0;
4478 #ifndef COMPILE_PCRE32
4479 pcre_uchar utf_chars[6];
4480 #endif
4481 #else
4482 BOOL utf = FALSE;
4483 #endif
4484 
4485 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
4486 class_uchardata always so that it can be passed to add_to_class() always,
4487 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
4488 alternative calls for the different cases. */
4489 
4490 pcre_uchar *class_uchardata;
4491 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4492 BOOL xclass;
4493 pcre_uchar *class_uchardata_base;
4494 #endif
4495 
4496 #ifdef PCRE_DEBUG
4497 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
4498 #endif
4499 
4500 /* Set up the default and non-default settings for greediness */
4501 
4502 greedy_default = ((options & PCRE_UNGREEDY) != 0);
4503 greedy_non_default = greedy_default ^ 1;
4504 
4505 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
4506 matching encountered yet". It gets changed to REQ_NONE if we hit something that
4507 matches a non-fixed char first char; reqchar just remains unset if we never
4508 find one.
4509 
4510 When we hit a repeat whose minimum is zero, we may have to adjust these values
4511 to take the zero repeat into account. This is implemented by setting them to
4512 zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
4513 item types that can be repeated set these backoff variables appropriately. */
4514 
4515 firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
4516 firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
4517 
4518 /* The variable req_caseopt contains either the REQ_CASELESS value
4519 or zero, according to the current setting of the caseless flag. The
4520 REQ_CASELESS leaves the lower 28 bits empty. It is added into the
4521 firstchar or reqchar variables to record the case status of the
4522 value. This is used only for ASCII characters. */
4523 
4524 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
4525 
4526 /* Switch on next character until the end of the branch */
4527 
4528 for (;; ptr++)
4529   {
4530   BOOL negate_class;
4531   BOOL should_flip_negation;
4532   BOOL possessive_quantifier;
4533   BOOL is_quantifier;
4534   BOOL is_recurse;
4535   BOOL reset_bracount;
4536   int class_has_8bitchar;
4537   int class_one_char;
4538 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4539   BOOL xclass_has_prop;
4540 #endif
4541   int newoptions;
4542   int recno;
4543   int refsign;
4544   int skipbytes;
4545   pcre_uint32 subreqchar, subfirstchar;
4546   pcre_int32 subreqcharflags, subfirstcharflags;
4547   int terminator;
4548   unsigned int mclength;
4549   unsigned int tempbracount;
4550   pcre_uint32 ec;
4551   pcre_uchar mcbuffer[8];
4552 
4553   /* Get next character in the pattern */
4554 
4555   c = *ptr;
4556 
4557   /* If we are at the end of a nested substitution, revert to the outer level
4558   string. Nesting only happens one level deep. */
4559 
4560   if (c == CHAR_NULL && nestptr != NULL)
4561     {
4562     ptr = nestptr;
4563     nestptr = NULL;
4564     c = *ptr;
4565     }
4566 
4567   /* If we are in the pre-compile phase, accumulate the length used for the
4568   previous cycle of this loop. */
4569 
4570   if (lengthptr != NULL)
4571     {
4572 #ifdef PCRE_DEBUG
4573     if (code > cd->hwm) cd->hwm = code;                 /* High water info */
4574 #endif
4575     if (code > cd->start_workspace + cd->workspace_size -
4576         WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
4577       {
4578       *errorcodeptr = ERR52;
4579       goto FAILED;
4580       }
4581 
4582     /* There is at least one situation where code goes backwards: this is the
4583     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
4584     the class is simply eliminated. However, it is created first, so we have to
4585     allow memory for it. Therefore, don't ever reduce the length at this point.
4586     */
4587 
4588     if (code < last_code) code = last_code;
4589 
4590     /* Paranoid check for integer overflow */
4591 
4592     if (OFLOW_MAX - *lengthptr < code - last_code)
4593       {
4594       *errorcodeptr = ERR20;
4595       goto FAILED;
4596       }
4597 
4598     *lengthptr += (int)(code - last_code);
4599     DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
4600       (int)(code - last_code), c, c));
4601 
4602     /* If "previous" is set and it is not at the start of the work space, move
4603     it back to there, in order to avoid filling up the work space. Otherwise,
4604     if "previous" is NULL, reset the current code pointer to the start. */
4605 
4606     if (previous != NULL)
4607       {
4608       if (previous > orig_code)
4609         {
4610         memmove(orig_code, previous, IN_UCHARS(code - previous));
4611         code -= previous - orig_code;
4612         previous = orig_code;
4613         }
4614       }
4615     else code = orig_code;
4616 
4617     /* Remember where this code item starts so we can pick up the length
4618     next time round. */
4619 
4620     last_code = code;
4621     }
4622 
4623   /* In the real compile phase, just check the workspace used by the forward
4624   reference list. */
4625 
4626   else if (cd->hwm > cd->start_workspace + cd->workspace_size -
4627            WORK_SIZE_SAFETY_MARGIN)
4628     {
4629     *errorcodeptr = ERR52;
4630     goto FAILED;
4631     }
4632 
4633   /* If in \Q...\E, check for the end; if not, we have a literal */
4634 
4635   if (inescq && c != CHAR_NULL)
4636     {
4637     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4638       {
4639       inescq = FALSE;
4640       ptr++;
4641       continue;
4642       }
4643     else
4644       {
4645       if (previous_callout != NULL)
4646         {
4647         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
4648           complete_callout(previous_callout, ptr, cd);
4649         previous_callout = NULL;
4650         }
4651       if ((options & PCRE_AUTO_CALLOUT) != 0)
4652         {
4653         previous_callout = code;
4654         code = auto_callout(code, ptr, cd);
4655         }
4656       goto NORMAL_CHAR;
4657       }
4658     /* Control does not reach here. */
4659     }
4660 
4661   /* In extended mode, skip white space and comments. We need a loop in order
4662   to check for more white space and more comments after a comment. */
4663 
4664   if ((options & PCRE_EXTENDED) != 0)
4665     {
4666     for (;;)
4667       {
4668       while (MAX_255(c) && (cd->ctypes[c] & ctype_space) != 0) c = *(++ptr);
4669       if (c != CHAR_NUMBER_SIGN) break;
4670       ptr++;
4671       while (*ptr != CHAR_NULL)
4672         {
4673         if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
4674           {                          /* IS_NEWLINE sets cd->nllen. */
4675           ptr += cd->nllen;
4676           break;
4677           }
4678         ptr++;
4679 #ifdef SUPPORT_UTF
4680         if (utf) FORWARDCHAR(ptr);
4681 #endif
4682         }
4683       c = *ptr;     /* Either NULL or the char after a newline */
4684       }
4685     }
4686 
4687   /* See if the next thing is a quantifier. */
4688 
4689   is_quantifier =
4690     c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4691     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4692 
4693   /* Fill in length of a previous callout, except when the next thing is a
4694   quantifier or when processing a property substitution string in UCP mode. */
4695 
4696   if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
4697        after_manual_callout-- <= 0)
4698     {
4699     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
4700       complete_callout(previous_callout, ptr, cd);
4701     previous_callout = NULL;
4702     }
4703 
4704   /* Create auto callout, except for quantifiers, or while processing property
4705   strings that are substituted for \w etc in UCP mode. */
4706 
4707   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
4708     {
4709     previous_callout = code;
4710     code = auto_callout(code, ptr, cd);
4711     }
4712 
4713   /* Process the next pattern item. */
4714 
4715   switch(c)
4716     {
4717     /* ===================================================================*/
4718     case CHAR_NULL:                /* The branch terminates at string end */
4719     case CHAR_VERTICAL_LINE:       /* or | or ) */
4720     case CHAR_RIGHT_PARENTHESIS:
4721     *firstcharptr = firstchar;
4722     *firstcharflagsptr = firstcharflags;
4723     *reqcharptr = reqchar;
4724     *reqcharflagsptr = reqcharflags;
4725     *codeptr = code;
4726     *ptrptr = ptr;
4727     if (lengthptr != NULL)
4728       {
4729       if (OFLOW_MAX - *lengthptr < code - last_code)
4730         {
4731         *errorcodeptr = ERR20;
4732         goto FAILED;
4733         }
4734       *lengthptr += (int)(code - last_code);   /* To include callout length */
4735       DPRINTF((">> end branch\n"));
4736       }
4737     return TRUE;
4738 
4739 
4740     /* ===================================================================*/
4741     /* Handle single-character metacharacters. In multiline mode, ^ disables
4742     the setting of any following char as a first character. */
4743 
4744     case CHAR_CIRCUMFLEX_ACCENT:
4745     previous = NULL;
4746     if ((options & PCRE_MULTILINE) != 0)
4747       {
4748       if (firstcharflags == REQ_UNSET)
4749         zerofirstcharflags = firstcharflags = REQ_NONE;
4750       *code++ = OP_CIRCM;
4751       }
4752     else *code++ = OP_CIRC;
4753     break;
4754 
4755     case CHAR_DOLLAR_SIGN:
4756     previous = NULL;
4757     *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
4758     break;
4759 
4760     /* There can never be a first char if '.' is first, whatever happens about
4761     repeats. The value of reqchar doesn't change either. */
4762 
4763     case CHAR_DOT:
4764     if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4765     zerofirstchar = firstchar;
4766     zerofirstcharflags = firstcharflags;
4767     zeroreqchar = reqchar;
4768     zeroreqcharflags = reqcharflags;
4769     previous = code;
4770     *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
4771     break;
4772 
4773 
4774     /* ===================================================================*/
4775     /* Character classes. If the included characters are all < 256, we build a
4776     32-byte bitmap of the permitted characters, except in the special case
4777     where there is only one such character. For negated classes, we build the
4778     map as usual, then invert it at the end. However, we use a different opcode
4779     so that data characters > 255 can be handled correctly.
4780 
4781     If the class contains characters outside the 0-255 range, a different
4782     opcode is compiled. It may optionally have a bit map for characters < 256,
4783 but those above are explicitly listed afterwards. A flag byte tells
4784     whether the bitmap is present, and whether this is a negated class or not.
4785 
4786     In JavaScript compatibility mode, an isolated ']' causes an error. In
4787     default (Perl) mode, it is treated as a data character. */
4788 
4789     case CHAR_RIGHT_SQUARE_BRACKET:
4790     if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4791       {
4792       *errorcodeptr = ERR64;
4793       goto FAILED;
4794       }
4795     goto NORMAL_CHAR;
4796 
4797     /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
4798     used for "start of word" and "end of word". As these are otherwise illegal
4799     sequences, we don't break anything by recognizing them. They are replaced
4800     by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
4801     erroneous and are handled by the normal code below. */
4802 
4803     case CHAR_LEFT_SQUARE_BRACKET:
4804     if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
4805       {
4806       nestptr = ptr + 7;
4807       ptr = sub_start_of_word - 1;
4808       continue;
4809       }
4810 
4811     if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
4812       {
4813       nestptr = ptr + 7;
4814       ptr = sub_end_of_word - 1;
4815       continue;
4816       }
4817 
4818     /* Handle a real character class. */
4819 
4820     previous = code;
4821 
4822     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
4823     they are encountered at the top level, so we'll do that too. */
4824 
4825     if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4826          ptr[1] == CHAR_EQUALS_SIGN) &&
4827         check_posix_syntax(ptr, &tempptr))
4828       {
4829       *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
4830       goto FAILED;
4831       }
4832 
4833     /* If the first character is '^', set the negation flag and skip it. Also,
4834     if the first few characters (either before or after ^) are \Q\E or \E we
4835     skip them too. This makes for compatibility with Perl. */
4836 
4837     negate_class = FALSE;
4838     for (;;)
4839       {
4840       c = *(++ptr);
4841       if (c == CHAR_BACKSLASH)
4842         {
4843         if (ptr[1] == CHAR_E)
4844           ptr++;
4845         else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
4846           ptr += 3;
4847         else
4848           break;
4849         }
4850       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4851         negate_class = TRUE;
4852       else break;
4853       }
4854 
4855     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
4856     an initial ']' is taken as a data character -- the code below handles
4857     that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
4858     [^] must match any character, so generate OP_ALLANY. */
4859 
4860     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4861         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4862       {
4863       *code++ = negate_class? OP_ALLANY : OP_FAIL;
4864       if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4865       zerofirstchar = firstchar;
4866       zerofirstcharflags = firstcharflags;
4867       break;
4868       }
4869 
4870     /* If a class contains a negative special such as \S, we need to flip the
4871     negation flag at the end, so that support for characters > 255 works
4872     correctly (they are all included in the class). */
4873 
4874     should_flip_negation = FALSE;
4875 
4876     /* Extended class (xclass) will be used when characters > 255
4877     might match. */
4878 
4879 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4880     xclass = FALSE;
4881     class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
4882     class_uchardata_base = class_uchardata;   /* Save the start */
4883 #endif
4884 
4885     /* For optimization purposes, we track some properties of the class:
4886     class_has_8bitchar will be non-zero if the class contains at least one <
4887     256 character; class_one_char will be 1 if the class contains just one
4888     character; xclass_has_prop will be TRUE if unicode property checks
4889     are present in the class. */
4890 
4891     class_has_8bitchar = 0;
4892     class_one_char = 0;
4893 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4894     xclass_has_prop = FALSE;
4895 #endif
4896 
4897     /* Initialize the 32-char bit map to all zeros. We build the map in a
4898     temporary bit of memory, in case the class contains fewer than two
4899     8-bit characters because in that case the compiled code doesn't use the bit
4900     map. */
4901 
4902     memset(classbits, 0, 32 * sizeof(pcre_uint8));
4903 
4904     /* Process characters until ] is reached. By writing this as a "do" it
4905     means that an initial ] is taken as a data character. At the start of the
4906     loop, c contains the first byte of the character. */
4907 
4908     if (c != CHAR_NULL) do
4909       {
4910       const pcre_uchar *oldptr;
4911 
4912 #ifdef SUPPORT_UTF
4913       if (utf && HAS_EXTRALEN(c))
4914         {                           /* Braces are required because the */
4915         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
4916         }
4917 #endif
4918 
4919 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4920       /* In the pre-compile phase, accumulate the length of any extra
4921       data and reset the pointer. This is so that very large classes that
4922       contain a zillion > 255 characters no longer overwrite the work space
4923       (which is on the stack). We have to remember that there was XCLASS data,
4924       however. */
4925 
4926       if (lengthptr != NULL && class_uchardata > class_uchardata_base)
4927         {
4928         xclass = TRUE;
4929         *lengthptr += (int)(class_uchardata - class_uchardata_base);
4930         class_uchardata = class_uchardata_base;
4931         }
4932 #endif
4933 
4934       /* Inside \Q...\E everything is literal except \E */
4935 
4936       if (inescq)
4937         {
4938         if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
4939           {
4940           inescq = FALSE;                   /* Reset literal state */
4941           ptr++;                            /* Skip the 'E' */
4942           continue;                         /* Carry on with next */
4943           }
4944         goto CHECK_RANGE;                   /* Could be range if \E follows */
4945         }
4946 
4947       /* Handle POSIX class names. Perl allows a negation extension of the
4948       form [:^name:]. A square bracket that doesn't match the syntax is
4949       treated as a literal. We also recognize the POSIX constructions
4950       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
4951       5.6 and 5.8 do. */
4952 
4953       if (c == CHAR_LEFT_SQUARE_BRACKET &&
4954           (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4955            ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
4956         {
4957         BOOL local_negate = FALSE;
4958         int posix_class, taboffset, tabopt;
4959         register const pcre_uint8 *cbits = cd->cbits;
4960         pcre_uint8 pbits[32];
4961 
4962         if (ptr[1] != CHAR_COLON)
4963           {
4964           *errorcodeptr = ERR31;
4965           goto FAILED;
4966           }
4967 
4968         ptr += 2;
4969         if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
4970           {
4971           local_negate = TRUE;
4972           should_flip_negation = TRUE;  /* Note negative special */
4973           ptr++;
4974           }
4975 
4976         posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
4977         if (posix_class < 0)
4978           {
4979           *errorcodeptr = ERR30;
4980           goto FAILED;
4981           }
4982 
4983         /* If matching is caseless, upper and lower are converted to
4984         alpha. This relies on the fact that the class table starts with
4985         alpha, lower, upper as the first 3 entries. */
4986 
4987         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
4988           posix_class = 0;
4989 
4990         /* When PCRE_UCP is set, some of the POSIX classes are converted to
4991         different escape sequences that use Unicode properties \p or \P. Others
4992         that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
4993         directly. */
4994 
4995 #ifdef SUPPORT_UCP
4996         if ((options & PCRE_UCP) != 0)
4997           {
4998           unsigned int ptype = 0;
4999           int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
5000 
5001           /* The posix_substitutes table specifies which POSIX classes can be
5002           converted to \p or \P items. */
5003 
5004           if (posix_substitutes[pc] != NULL)
5005             {
5006             nestptr = tempptr + 1;
5007             ptr = posix_substitutes[pc] - 1;
5008             continue;
5009             }
5010 
5011           /* There are three other classes that generate special property calls
5012           that are recognized only in an XCLASS. */
5013 
5014           else switch(posix_class)
5015             {
5016             case PC_GRAPH:
5017             ptype = PT_PXGRAPH;
5018             /* Fall through */
5019             case PC_PRINT:
5020             if (ptype == 0) ptype = PT_PXPRINT;
5021             /* Fall through */
5022             case PC_PUNCT:
5023             if (ptype == 0) ptype = PT_PXPUNCT;
5024             *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
5025             *class_uchardata++ = ptype;
5026             *class_uchardata++ = 0;
5027             xclass_has_prop = TRUE;
5028             ptr = tempptr + 1;
5029             continue;
5030 
5031             /* For all other POSIX classes, no special action is taken in UCP
5032             mode. Fall through to the non_UCP case. */
5033 
5034             default:
5035             break;
5036             }
5037           }
5038 #endif
5039         /* In the non-UCP case, or when UCP makes no difference, we build the
5040         bit map for the POSIX class in a chunk of local store because we may be
5041         adding and subtracting from it, and we don't want to subtract bits that
5042         may be in the main map already. At the end we or the result into the
5043         bit map that is being built. */
5044 
5045         posix_class *= 3;
5046 
5047         /* Copy in the first table (always present) */
5048 
5049         memcpy(pbits, cbits + posix_class_maps[posix_class],
5050           32 * sizeof(pcre_uint8));
5051 
5052         /* If there is a second table, add or remove it as required. */
5053 
5054         taboffset = posix_class_maps[posix_class + 1];
5055         tabopt = posix_class_maps[posix_class + 2];
5056 
5057         if (taboffset >= 0)
5058           {
5059           if (tabopt >= 0)
5060             for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
5061           else
5062             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
5063           }
5064 
5065         /* Now see if we need to remove any special characters. An option
5066         value of 1 removes vertical space and 2 removes underscore. */
5067 
5068         if (tabopt < 0) tabopt = -tabopt;
5069         if (tabopt == 1) pbits[1] &= ~0x3c;
5070           else if (tabopt == 2) pbits[11] &= 0x7f;
5071 
5072         /* Add the POSIX table or its complement into the main table that is
5073         being built and we are done. */
5074 
5075         if (local_negate)
5076           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
5077         else
5078           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
5079 
5080         ptr = tempptr + 1;
5081         /* Every class contains at least one < 256 character. */
5082         class_has_8bitchar = 1;
5083         /* Every class contains at least two characters. */
5084         class_one_char = 2;
5085         continue;    /* End of POSIX syntax handling */
5086         }
5087 
5088       /* Backslash may introduce a single character, or it may introduce one
5089       of the specials, which just set a flag. The sequence \b is a special
5090       case. Inside a class (and only there) it is treated as backspace. We
5091       assume that other escapes have more than one character in them, so
5092       speculatively set both class_has_8bitchar and class_one_char bigger
5093       than one. Unrecognized escapes fall through and are either treated
5094       as literal characters (by default), or are faulted if
5095       PCRE_EXTRA is set. */
5096 
5097       if (c == CHAR_BACKSLASH)
5098         {
5099         escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options,
5100           TRUE);
5101         if (*errorcodeptr != 0) goto FAILED;
5102         if (escape == 0) c = ec;
5103         else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
5104         else if (escape == ESC_N)          /* \N is not supported in a class */
5105           {
5106           *errorcodeptr = ERR71;
5107           goto FAILED;
5108           }
5109         else if (escape == ESC_Q)            /* Handle start of quoted string */
5110           {
5111           if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5112             {
5113             ptr += 2; /* avoid empty string */
5114             }
5115           else inescq = TRUE;
5116           continue;
5117           }
5118         else if (escape == ESC_E) continue;  /* Ignore orphan \E */
5119 
5120         else
5121           {
5122           register const pcre_uint8 *cbits = cd->cbits;
5123           /* Every class contains at least two < 256 characters. */
5124           class_has_8bitchar++;
5125           /* Every class contains at least two characters. */
5126           class_one_char += 2;
5127 
5128           switch (escape)
5129             {
5130 #ifdef SUPPORT_UCP
5131             case ESC_du:     /* These are the values given for \d etc */
5132             case ESC_DU:     /* when PCRE_UCP is set. We replace the */
5133             case ESC_wu:     /* escape sequence with an appropriate \p */
5134             case ESC_WU:     /* or \P to test Unicode properties instead */
5135             case ESC_su:     /* of the default ASCII testing. */
5136             case ESC_SU:
5137             nestptr = ptr;
5138             ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
5139             class_has_8bitchar--;                /* Undo! */
5140             continue;
5141 #endif
5142             case ESC_d:
5143             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
5144             continue;
5145 
5146             case ESC_D:
5147             should_flip_negation = TRUE;
5148             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
5149             continue;
5150 
5151             case ESC_w:
5152             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
5153             continue;
5154 
5155             case ESC_W:
5156             should_flip_negation = TRUE;
5157             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
5158             continue;
5159 
5160             /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5161             5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5162             previously set by something earlier in the character class.
5163             Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5164             we could just adjust the appropriate bit. From PCRE 8.34 we no
5165             longer treat \s and \S specially. */
5166 
5167             case ESC_s:
5168             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
5169             continue;
5170 
5171             case ESC_S:
5172             should_flip_negation = TRUE;
5173             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
5174             continue;
5175 
5176             /* The rest apply in both UCP and non-UCP cases. */
5177 
5178             case ESC_h:
5179             (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5180               PRIV(hspace_list), NOTACHAR);
5181             continue;
5182 
5183             case ESC_H:
5184             (void)add_not_list_to_class(classbits, &class_uchardata, options,
5185               cd, PRIV(hspace_list));
5186             continue;
5187 
5188             case ESC_v:
5189             (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5190               PRIV(vspace_list), NOTACHAR);
5191             continue;
5192 
5193             case ESC_V:
5194             (void)add_not_list_to_class(classbits, &class_uchardata, options,
5195               cd, PRIV(vspace_list));
5196             continue;
5197 
5198 #ifdef SUPPORT_UCP
5199             case ESC_p:
5200             case ESC_P:
5201               {
5202               BOOL negated;
5203               unsigned int ptype = 0, pdata = 0;
5204               if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
5205                 goto FAILED;
5206               *class_uchardata++ = ((escape == ESC_p) != negated)?
5207                 XCL_PROP : XCL_NOTPROP;
5208               *class_uchardata++ = ptype;
5209               *class_uchardata++ = pdata;
5210               xclass_has_prop = TRUE;
5211               class_has_8bitchar--;                /* Undo! */
5212               continue;
5213               }
5214 #endif
5215             /* Unrecognized escapes are faulted if PCRE is running in its
5216             strict mode. By default, for compatibility with Perl, they are
5217             treated as literals. */
5218 
5219             default:
5220             if ((options & PCRE_EXTRA) != 0)
5221               {
5222               *errorcodeptr = ERR7;
5223               goto FAILED;
5224               }
5225             class_has_8bitchar--;    /* Undo the speculative increase. */
5226             class_one_char -= 2;     /* Undo the speculative increase. */
5227             c = *ptr;                /* Get the final character and fall through */
5228             break;
5229             }
5230           }
5231 
5232         /* Fall through if the escape just defined a single character (c >= 0).
5233         This may be greater than 256. */
5234 
5235         escape = 0;
5236 
5237         }   /* End of backslash handling */
5238 
5239       /* A character may be followed by '-' to form a range. However, Perl does
5240       not permit ']' to be the end of the range. A '-' character at the end is
5241       treated as a literal. Perl ignores orphaned \E sequences entirely. The
5242       code for handling \Q and \E is messy. */
5243 
5244       CHECK_RANGE:
5245       while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5246         {
5247         inescq = FALSE;
5248         ptr += 2;
5249         }
5250       oldptr = ptr;
5251 
5252       /* Remember if \r or \n were explicitly used */
5253 
5254       if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5255 
5256       /* Check for range */
5257 
5258       if (!inescq && ptr[1] == CHAR_MINUS)
5259         {
5260         pcre_uint32 d;
5261         ptr += 2;
5262         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
5263 
5264         /* If we hit \Q (not followed by \E) at this point, go into escaped
5265         mode. */
5266 
5267         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
5268           {
5269           ptr += 2;
5270           if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
5271             { ptr += 2; continue; }
5272           inescq = TRUE;
5273           break;
5274           }
5275 
5276         /* Minus (hyphen) at the end of a class is treated as a literal, so put
5277         back the pointer and jump to handle the character that preceded it. */
5278 
5279         if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
5280           {
5281           ptr = oldptr;
5282           goto CLASS_SINGLE_CHARACTER;
5283           }
5284 
5285         /* Otherwise, we have a potential range; pick up the next character */
5286 
5287 #ifdef SUPPORT_UTF
5288         if (utf)
5289           {                           /* Braces are required because the */
5290           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
5291           }
5292         else
5293 #endif
5294         d = *ptr;  /* Not UTF-8 mode */
5295 
5296         /* The second part of a range can be a single-character escape
5297         sequence, but not any of the other escapes. Perl treats a hyphen as a
5298         literal in such circumstances. However, in Perl's warning mode, a
5299         warning is given, so PCRE now faults it as it is almost certainly a
5300         mistake on the user's part. */
5301 
5302         if (!inescq)
5303           {
5304           if (d == CHAR_BACKSLASH)
5305             {
5306             int descape;
5307             descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
5308             if (*errorcodeptr != 0) goto FAILED;
5309 
5310             /* 0 means a character was put into d; \b is backspace; any other
5311             special causes an error. */
5312 
5313             if (descape != 0)
5314               {
5315               if (descape == ESC_b) d = CHAR_BS; else
5316                 {
5317                 *errorcodeptr = ERR83;
5318                 goto FAILED;
5319                 }
5320               }
5321             }
5322 
5323           /* A hyphen followed by a POSIX class is treated in the same way. */
5324 
5325           else if (d == CHAR_LEFT_SQUARE_BRACKET &&
5326                    (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
5327                     ptr[1] == CHAR_EQUALS_SIGN) &&
5328                    check_posix_syntax(ptr, &tempptr))
5329             {
5330             *errorcodeptr = ERR83;
5331             goto FAILED;
5332             }
5333           }
5334 
5335         /* Check that the two values are in the correct order. Optimize
5336         one-character ranges. */
5337 
5338         if (d < c)
5339           {
5340           *errorcodeptr = ERR8;
5341           goto FAILED;
5342           }
5343         if (d == c) goto CLASS_SINGLE_CHARACTER;  /* A few lines below */
5344 
5345         /* We have found a character range, so single character optimizations
5346         cannot be done anymore. Any value greater than 1 indicates that there
5347         is more than one character. */
5348 
5349         class_one_char = 2;
5350 
5351         /* Remember an explicit \r or \n, and add the range to the class. */
5352 
5353         if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5354 
5355         class_has_8bitchar +=
5356           add_to_class(classbits, &class_uchardata, options, cd, c, d);
5357 
5358         continue;   /* Go get the next char in the class */
5359         }
5360 
5361       /* Handle a single character - we can get here for a normal non-escape
5362       char, or after \ that introduces a single character or for an apparent
5363       range that isn't. Only the value 1 matters for class_one_char, so don't
5364       increase it if it is already 2 or more ... just in case there's a class
5365       with a zillion characters in it. */
5366 
5367       CLASS_SINGLE_CHARACTER:
5368       if (class_one_char < 2) class_one_char++;
5369 
5370       /* If class_one_char is 1, we have the first single character in the
5371       class, and there have been no prior ranges, or XCLASS items generated by
5372       escapes. If this is the final character in the class, we can optimize by
5373       turning the item into a 1-character OP_CHAR[I] if it's positive, or
5374       OP_NOT[I] if it's negative. In the positive case, it can cause firstchar
5375       to be set. Otherwise, there can be no first char if this item is first,
5376       whatever repeat count may follow. In the case of reqchar, save the
5377       previous value for reinstating. */
5378 
5379       if (!inescq && class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
5380         {
5381         ptr++;
5382         zeroreqchar = reqchar;
5383         zeroreqcharflags = reqcharflags;
5384 
5385         if (negate_class)
5386           {
5387 #ifdef SUPPORT_UCP
5388           int d;
5389 #endif
5390           if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5391           zerofirstchar = firstchar;
5392           zerofirstcharflags = firstcharflags;
5393 
5394           /* For caseless UTF-8 mode when UCP support is available, check
5395           whether this character has more than one other case. If so, generate
5396           a special OP_NOTPROP item instead of OP_NOTI. */
5397 
5398 #ifdef SUPPORT_UCP
5399           if (utf && (options & PCRE_CASELESS) != 0 &&
5400               (d = UCD_CASESET(c)) != 0)
5401             {
5402             *code++ = OP_NOTPROP;
5403             *code++ = PT_CLIST;
5404             *code++ = d;
5405             }
5406           else
5407 #endif
5408           /* Char has only one other case, or UCP not available */
5409 
5410             {
5411             *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
5412 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5413             if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5414               code += PRIV(ord2utf)(c, code);
5415             else
5416 #endif
5417               *code++ = c;
5418             }
5419 
5420           /* We are finished with this character class */
5421 
5422           goto END_CLASS;
5423           }
5424 
5425         /* For a single, positive character, get the value into mcbuffer, and
5426         then we can handle this with the normal one-character code. */
5427 
5428 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5429         if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5430           mclength = PRIV(ord2utf)(c, mcbuffer);
5431         else
5432 #endif
5433           {
5434           mcbuffer[0] = c;
5435           mclength = 1;
5436           }
5437         goto ONE_CHAR;
5438         }       /* End of 1-char optimization */
5439 
5440       /* There is more than one character in the class, or an XCLASS item
5441       has been generated. Add this character to the class. */
5442 
5443       class_has_8bitchar +=
5444         add_to_class(classbits, &class_uchardata, options, cd, c, c);
5445       }
5446 
5447     /* Loop until ']' reached. This "while" is the end of the "do" far above.
5448     If we are at the end of an internal nested string, revert to the outer
5449     string. */
5450 
5451     while (((c = *(++ptr)) != CHAR_NULL ||
5452            (nestptr != NULL &&
5453              (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != CHAR_NULL)) &&
5454            (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
5455 
5456     /* Check for missing terminating ']' */
5457 
5458     if (c == CHAR_NULL)
5459       {
5460       *errorcodeptr = ERR6;
5461       goto FAILED;
5462       }
5463 
5464     /* We will need an XCLASS if data has been placed in class_uchardata. In
5465     the second phase this is a sufficient test. However, in the pre-compile
5466     phase, class_uchardata gets emptied to prevent workspace overflow, so
5467     only if the very last character in the class needs XCLASS will it contain
5468     anything at this point. For this reason, xclass gets set TRUE above when
5469     class_uchardata is emptied, and that's why this code is the way it is here
5470     instead of just doing a test on class_uchardata below. */
5471 
5472 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5473     if (class_uchardata > class_uchardata_base) xclass = TRUE;
5474 #endif
5475 
5476     /* If this is the first thing in the branch, there can be no first char
5477     setting, whatever the repeat count. Any reqchar setting must remain
5478     unchanged after any kind of repeat. */
5479 
5480     if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5481     zerofirstchar = firstchar;
5482     zerofirstcharflags = firstcharflags;
5483     zeroreqchar = reqchar;
5484     zeroreqcharflags = reqcharflags;
5485 
5486     /* If there are characters with values > 255, we have to compile an
5487     extended class, with its own opcode, unless there was a negated special
5488     such as \S in the class, and PCRE_UCP is not set, because in that case all
5489     characters > 255 are in the class, so any that were explicitly given as
5490     well can be ignored. If (when there are explicit characters > 255 that must
5491     be listed) there are no characters < 256, we can omit the bitmap in the
5492     actual compiled code. */
5493 
5494 #ifdef SUPPORT_UTF
5495     if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
5496 #elif !defined COMPILE_PCRE8
5497     if (xclass && !should_flip_negation)
5498 #endif
5499 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5500       {
5501       *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
5502       *code++ = OP_XCLASS;
5503       code += LINK_SIZE;
5504       *code = negate_class? XCL_NOT:0;
5505       if (xclass_has_prop) *code |= XCL_HASPROP;
5506 
5507       /* If the map is required, move up the extra data to make room for it;
5508       otherwise just move the code pointer to the end of the extra data. */
5509 
5510       if (class_has_8bitchar > 0)
5511         {
5512         *code++ |= XCL_MAP;
5513         memmove(code + (32 / sizeof(pcre_uchar)), code,
5514           IN_UCHARS(class_uchardata - code));
5515         if (negate_class && !xclass_has_prop)
5516           for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5517         memcpy(code, classbits, 32);
5518         code = class_uchardata + (32 / sizeof(pcre_uchar));
5519         }
5520       else code = class_uchardata;
5521 
5522       /* Now fill in the complete length of the item */
5523 
5524       PUT(previous, 1, (int)(code - previous));
5525       break;   /* End of class handling */
5526       }
5527 
5528     /* Even though any XCLASS list is now discarded, we must allow for
5529     its memory. */
5530 
5531     if (lengthptr != NULL)
5532       *lengthptr += (int)(class_uchardata - class_uchardata_base);
5533 #endif
5534 
5535     /* If there are no characters > 255, or they are all to be included or
5536     excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
5537     whole class was negated and whether there were negative specials such as \S
5538     (non-UCP) in the class. Then copy the 32-byte map into the code vector,
5539     negating it if necessary. */
5540 
5541     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
5542     if (lengthptr == NULL)    /* Save time in the pre-compile phase */
5543       {
5544       if (negate_class)
5545         for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5546       memcpy(code, classbits, 32);
5547       }
5548     code += 32 / sizeof(pcre_uchar);
5549 
5550     END_CLASS:
5551     break;
5552 
5553 
5554     /* ===================================================================*/
5555     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
5556     has been tested above. */
5557 
5558     case CHAR_LEFT_CURLY_BRACKET:
5559     if (!is_quantifier) goto NORMAL_CHAR;
5560     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
5561     if (*errorcodeptr != 0) goto FAILED;
5562     goto REPEAT;
5563 
5564     case CHAR_ASTERISK:
5565     repeat_min = 0;
5566     repeat_max = -1;
5567     goto REPEAT;
5568 
5569     case CHAR_PLUS:
5570     repeat_min = 1;
5571     repeat_max = -1;
5572     goto REPEAT;
5573 
5574     case CHAR_QUESTION_MARK:
5575     repeat_min = 0;
5576     repeat_max = 1;
5577 
5578     REPEAT:
5579     if (previous == NULL)
5580       {
5581       *errorcodeptr = ERR9;
5582       goto FAILED;
5583       }
5584 
5585     if (repeat_min == 0)
5586       {
5587       firstchar = zerofirstchar;    /* Adjust for zero repeat */
5588       firstcharflags = zerofirstcharflags;
5589       reqchar = zeroreqchar;        /* Ditto */
5590       reqcharflags = zeroreqcharflags;
5591       }
5592 
5593     /* Remember whether this is a variable length repeat */
5594 
5595     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
5596 
5597     op_type = 0;                    /* Default single-char op codes */
5598     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
5599 
5600     /* Save start of previous item, in case we have to move it up in order to
5601     insert something before it. */
5602 
5603     tempcode = previous;
5604 
5605     /* Before checking for a possessive quantifier, we must skip over
5606     whitespace and comments in extended mode because Perl allows white space at
5607     this point. */
5608 
5609     if ((options & PCRE_EXTENDED) != 0)
5610       {
5611       const pcre_uchar *p = ptr + 1;
5612       for (;;)
5613         {
5614         while (MAX_255(*p) && (cd->ctypes[*p] & ctype_space) != 0) p++;
5615         if (*p != CHAR_NUMBER_SIGN) break;
5616         p++;
5617         while (*p != CHAR_NULL)
5618           {
5619           if (IS_NEWLINE(p))         /* For non-fixed-length newline cases, */
5620             {                        /* IS_NEWLINE sets cd->nllen. */
5621             p += cd->nllen;
5622             break;
5623             }
5624           p++;
5625 #ifdef SUPPORT_UTF
5626           if (utf) FORWARDCHAR(p);
5627 #endif
5628           }           /* Loop for comment characters */
5629         }             /* Loop for multiple comments */
5630       ptr = p - 1;    /* Character before the next significant one. */
5631       }
5632 
5633     /* If the next character is '+', we have a possessive quantifier. This
5634     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
5635     If the next character is '?' this is a minimizing repeat, by default,
5636     but if PCRE_UNGREEDY is set, it works the other way round. We change the
5637     repeat type to the non-default. */
5638 
5639     if (ptr[1] == CHAR_PLUS)
5640       {
5641       repeat_type = 0;                  /* Force greedy */
5642       possessive_quantifier = TRUE;
5643       ptr++;
5644       }
5645     else if (ptr[1] == CHAR_QUESTION_MARK)
5646       {
5647       repeat_type = greedy_non_default;
5648       ptr++;
5649       }
5650     else repeat_type = greedy_default;
5651 
5652     /* If previous was a recursion call, wrap it in atomic brackets so that
5653     previous becomes the atomic group. All recursions were so wrapped in the
5654     past, but it no longer happens for non-repeated recursions. In fact, the
5655     repeated ones could be re-implemented independently so as not to need this,
5656     but for the moment we rely on the code for repeating groups. */
5657 
5658     if (*previous == OP_RECURSE)
5659       {
5660       memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
5661       *previous = OP_ONCE;
5662       PUT(previous, 1, 2 + 2*LINK_SIZE);
5663       previous[2 + 2*LINK_SIZE] = OP_KET;
5664       PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
5665       code += 2 + 2 * LINK_SIZE;
5666       length_prevgroup = 3 + 3*LINK_SIZE;
5667 
5668       /* When actually compiling, we need to check whether this was a forward
5669       reference, and if so, adjust the offset. */
5670 
5671       if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
5672         {
5673         int offset = GET(cd->hwm, -LINK_SIZE);
5674         if (offset == previous + 1 - cd->start_code)
5675           PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
5676         }
5677       }
5678 
5679     /* Now handle repetition for the different types of item. */
5680 
5681     /* If previous was a character or negated character match, abolish the item
5682     and generate a repeat item instead. If a char item has a minimum of more
5683     than one, ensure that it is set in reqchar - it might not be if a sequence
5684     such as x{3} is the first thing in a branch because the x will have gone
5685     into firstchar instead.  */
5686 
5687     if (*previous == OP_CHAR || *previous == OP_CHARI
5688         || *previous == OP_NOT || *previous == OP_NOTI)
5689       {
5690       switch (*previous)
5691         {
5692         default: /* Make compiler happy. */
5693         case OP_CHAR:  op_type = OP_STAR - OP_STAR; break;
5694         case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
5695         case OP_NOT:   op_type = OP_NOTSTAR - OP_STAR; break;
5696         case OP_NOTI:  op_type = OP_NOTSTARI - OP_STAR; break;
5697         }
5698 
5699       /* Deal with UTF characters that take up more than one character. It's
5700       easier to write this out separately than try to macrify it. Use c to
5701       hold the length of the character in bytes, plus UTF_LENGTH to flag that
5702       it's a length rather than a small character. */
5703 
5704 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5705       if (utf && NOT_FIRSTCHAR(code[-1]))
5706         {
5707         pcre_uchar *lastchar = code - 1;
5708         BACKCHAR(lastchar);
5709         c = (int)(code - lastchar);     /* Length of UTF-8 character */
5710         memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
5711         c |= UTF_LENGTH;                /* Flag c as a length */
5712         }
5713       else
5714 #endif /* SUPPORT_UTF */
5715 
5716       /* Handle the case of a single character - either with no UTF support, or
5717       with UTF disabled, or for a single character UTF character. */
5718         {
5719         c = code[-1];
5720         if (*previous <= OP_CHARI && repeat_min > 1)
5721           {
5722           reqchar = c;
5723           reqcharflags = req_caseopt | cd->req_varyopt;
5724           }
5725         }
5726 
5727       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
5728       }
5729 
5730     /* If previous was a character type match (\d or similar), abolish it and
5731     create a suitable repeat item. The code is shared with single-character
5732     repeats by setting op_type to add a suitable offset into repeat_type. Note
5733     that the Unicode property types will be present only when SUPPORT_UCP is
5734     defined, but we don't wrap the little bits of code here because it just
5735     makes it horribly messy. */
5736 
5737     else if (*previous < OP_EODN)
5738       {
5739       pcre_uchar *oldcode;
5740       int prop_type, prop_value;
5741       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
5742       c = *previous;
5743 
5744       OUTPUT_SINGLE_REPEAT:
5745       if (*previous == OP_PROP || *previous == OP_NOTPROP)
5746         {
5747         prop_type = previous[1];
5748         prop_value = previous[2];
5749         }
5750       else prop_type = prop_value = -1;
5751 
5752       oldcode = code;
5753       code = previous;                  /* Usually overwrite previous item */
5754 
5755       /* If the maximum is zero then the minimum must also be zero; Perl allows
5756       this case, so we do too - by simply omitting the item altogether. */
5757 
5758       if (repeat_max == 0) goto END_REPEAT;
5759 
5760       /* Combine the op_type with the repeat_type */
5761 
5762       repeat_type += op_type;
5763 
5764       /* A minimum of zero is handled either as the special case * or ?, or as
5765       an UPTO, with the maximum given. */
5766 
5767       if (repeat_min == 0)
5768         {
5769         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
5770           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
5771         else
5772           {
5773           *code++ = OP_UPTO + repeat_type;
5774           PUT2INC(code, 0, repeat_max);
5775           }
5776         }
5777 
5778       /* A repeat minimum of 1 is optimized into some special cases. If the
5779       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
5780       left in place and, if the maximum is greater than 1, we use OP_UPTO with
5781       one less than the maximum. */
5782 
5783       else if (repeat_min == 1)
5784         {
5785         if (repeat_max == -1)
5786           *code++ = OP_PLUS + repeat_type;
5787         else
5788           {
5789           code = oldcode;                 /* leave previous item in place */
5790           if (repeat_max == 1) goto END_REPEAT;
5791           *code++ = OP_UPTO + repeat_type;
5792           PUT2INC(code, 0, repeat_max - 1);
5793           }
5794         }
5795 
5796       /* The case {n,n} is just an EXACT, while the general case {n,m} is
5797       handled as an EXACT followed by an UPTO. */
5798 
5799       else
5800         {
5801         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
5802         PUT2INC(code, 0, repeat_min);
5803 
5804         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
5805         we have to insert the character for the previous code. For a repeated
5806         Unicode property match, there are two extra bytes that define the
5807         required property. In UTF-8 mode, long characters have their length in
5808         c, with the UTF_LENGTH bit as a flag. */
5809 
5810         if (repeat_max < 0)
5811           {
5812 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5813           if (utf && (c & UTF_LENGTH) != 0)
5814             {
5815             memcpy(code, utf_chars, IN_UCHARS(c & 7));
5816             code += c & 7;
5817             }
5818           else
5819 #endif
5820             {
5821             *code++ = c;
5822             if (prop_type >= 0)
5823               {
5824               *code++ = prop_type;
5825               *code++ = prop_value;
5826               }
5827             }
5828           *code++ = OP_STAR + repeat_type;
5829           }
5830 
5831         /* Else insert an UPTO if the max is greater than the min, again
5832         preceded by the character, for the previously inserted code. If the
5833         UPTO is just for 1 instance, we can use QUERY instead. */
5834 
5835         else if (repeat_max != repeat_min)
5836           {
5837 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5838           if (utf && (c & UTF_LENGTH) != 0)
5839             {
5840             memcpy(code, utf_chars, IN_UCHARS(c & 7));
5841             code += c & 7;
5842             }
5843           else
5844 #endif
5845           *code++ = c;
5846           if (prop_type >= 0)
5847             {
5848             *code++ = prop_type;
5849             *code++ = prop_value;
5850             }
5851           repeat_max -= repeat_min;
5852 
5853           if (repeat_max == 1)
5854             {
5855             *code++ = OP_QUERY + repeat_type;
5856             }
5857           else
5858             {
5859             *code++ = OP_UPTO + repeat_type;
5860             PUT2INC(code, 0, repeat_max);
5861             }
5862           }
5863         }
5864 
5865       /* The character or character type itself comes last in all cases. */
5866 
5867 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5868       if (utf && (c & UTF_LENGTH) != 0)
5869         {
5870         memcpy(code, utf_chars, IN_UCHARS(c & 7));
5871         code += c & 7;
5872         }
5873       else
5874 #endif
5875       *code++ = c;
5876 
5877       /* For a repeated Unicode property match, there are two extra bytes that
5878       define the required property. */
5879 
5880 #ifdef SUPPORT_UCP
5881       if (prop_type >= 0)
5882         {
5883         *code++ = prop_type;
5884         *code++ = prop_value;
5885         }
5886 #endif
5887       }
5888 
5889     /* If previous was a character class or a back reference, we put the repeat
5890     stuff after it, but just skip the item if the repeat was {0,0}. */
5891 
5892     else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
5893 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5894              *previous == OP_XCLASS ||
5895 #endif
5896              *previous == OP_REF   || *previous == OP_REFI ||
5897              *previous == OP_DNREF || *previous == OP_DNREFI)
5898       {
5899       if (repeat_max == 0)
5900         {
5901         code = previous;
5902         goto END_REPEAT;
5903         }
5904 
5905       if (repeat_min == 0 && repeat_max == -1)
5906         *code++ = OP_CRSTAR + repeat_type;
5907       else if (repeat_min == 1 && repeat_max == -1)
5908         *code++ = OP_CRPLUS + repeat_type;
5909       else if (repeat_min == 0 && repeat_max == 1)
5910         *code++ = OP_CRQUERY + repeat_type;
5911       else
5912         {
5913         *code++ = OP_CRRANGE + repeat_type;
5914         PUT2INC(code, 0, repeat_min);
5915         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
5916         PUT2INC(code, 0, repeat_max);
5917         }
5918       }
5919 
5920     /* If previous was a bracket group, we may have to replicate it in certain
5921     cases. Note that at this point we can encounter only the "basic" bracket
5922     opcodes such as BRA and CBRA, as this is the place where they get converted
5923     into the more special varieties such as BRAPOS and SBRA. A test for >=
5924     OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
5925     ASSERTBACK_NOT, ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND.
5926     Originally, PCRE did not allow repetition of assertions, but now it does,
5927     for Perl compatibility. */
5928 
5929     else if (*previous >= OP_ASSERT && *previous <= OP_COND)
5930       {
5931       register int i;
5932       int len = (int)(code - previous);
5933       size_t base_hwm_offset = save_hwm_offset;
5934       pcre_uchar *bralink = NULL;
5935       pcre_uchar *brazeroptr = NULL;
5936 
5937       /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
5938       we just ignore the repeat. */
5939 
5940       if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
5941         goto END_REPEAT;
5942 
5943       /* There is no sense in actually repeating assertions. The only potential
5944       use of repetition is in cases when the assertion is optional. Therefore,
5945       if the minimum is greater than zero, just ignore the repeat. If the
5946       maximum is not zero or one, set it to 1. */
5947 
5948       if (*previous < OP_ONCE)    /* Assertion */
5949         {
5950         if (repeat_min > 0) goto END_REPEAT;
5951         if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
5952         }
5953 
5954       /* The case of a zero minimum is special because of the need to stick
5955       OP_BRAZERO in front of it, and because the group appears once in the
5956       data, whereas in other cases it appears the minimum number of times. For
5957       this reason, it is simplest to treat this case separately, as otherwise
5958       the code gets far too messy. There are several special subcases when the
5959       minimum is zero. */
5960 
5961       if (repeat_min == 0)
5962         {
5963         /* If the maximum is also zero, we used to just omit the group from the
5964         output altogether, like this:
5965 
5966         ** if (repeat_max == 0)
5967         **   {
5968         **   code = previous;
5969         **   goto END_REPEAT;
5970         **   }
5971 
5972         However, that fails when a group or a subgroup within it is referenced
5973         as a subroutine from elsewhere in the pattern, so now we stick in
5974         OP_SKIPZERO in front of it so that it is skipped on execution. As we
5975         don't have a list of which groups are referenced, we cannot do this
5976         selectively.
5977 
5978         If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
5979         and do no more at this point. However, we do need to adjust any
5980         OP_RECURSE calls inside the group that refer to the group itself or any
5981         internal or forward referenced group, because the offset is from the
5982         start of the whole regex. Temporarily terminate the pattern while doing
5983         this. */
5984 
5985         if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
5986           {
5987           *code = OP_END;
5988           adjust_recurse(previous, 1, utf, cd, save_hwm_offset);
5989           memmove(previous + 1, previous, IN_UCHARS(len));
5990           code++;
5991           if (repeat_max == 0)
5992             {
5993             *previous++ = OP_SKIPZERO;
5994             goto END_REPEAT;
5995             }
5996           brazeroptr = previous;    /* Save for possessive optimizing */
5997           *previous++ = OP_BRAZERO + repeat_type;
5998           }
5999 
6000         /* If the maximum is greater than 1 and limited, we have to replicate
6001         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
6002         The first one has to be handled carefully because it's the original
6003         copy, which has to be moved up. The remainder can be handled by code
6004         that is common with the non-zero minimum case below. We have to
6005           adjust the value of repeat_max, since one less copy is required. Once
6006         again, we may have to adjust any OP_RECURSE calls inside the group. */
6007 
6008         else
6009           {
6010           int offset;
6011           *code = OP_END;
6012           adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm_offset);
6013           memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
6014           code += 2 + LINK_SIZE;
6015           *previous++ = OP_BRAZERO + repeat_type;
6016           *previous++ = OP_BRA;
6017 
6018           /* We chain together the bracket offset fields that have to be
6019           filled in later when the ends of the brackets are reached. */
6020 
6021           offset = (bralink == NULL)? 0 : (int)(previous - bralink);
6022           bralink = previous;
6023           PUTINC(previous, 0, offset);
6024           }
6025 
6026         repeat_max--;
6027         }
6028 
6029       /* If the minimum is greater than zero, replicate the group as many
6030       times as necessary, and adjust the maximum to the number of subsequent
6031       copies that we need. If we set a first char from the group, and didn't
6032       set a required char, copy the latter from the former. If there are any
6033       forward reference subroutine calls in the group, there will be entries on
6034       the workspace list; replicate these with an appropriate increment. */
6035 
6036       else
6037         {
6038         if (repeat_min > 1)
6039           {
6040           /* In the pre-compile phase, we don't actually do the replication. We
6041           just adjust the length as if we had. Do some paranoid checks for
6042           potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
6043           integer type when available, otherwise double. */
6044 
6045           if (lengthptr != NULL)
6046             {
6047             int delta = (repeat_min - 1)*length_prevgroup;
6048             if ((INT64_OR_DOUBLE)(repeat_min - 1)*
6049                   (INT64_OR_DOUBLE)length_prevgroup >
6050                     (INT64_OR_DOUBLE)INT_MAX ||
6051                 OFLOW_MAX - *lengthptr < delta)
6052               {
6053               *errorcodeptr = ERR20;
6054               goto FAILED;
6055               }
6056             *lengthptr += delta;
6057             }
6058 
6059           /* This is compiling for real. If there is a set first byte for
6060           the group, and we have not yet set a "required byte", set it. Make
6061           sure there is enough workspace for copying forward references before
6062           doing the copy. */
6063 
6064           else
6065             {
6066             if (groupsetfirstchar && reqcharflags < 0)
6067               {
6068               reqchar = firstchar;
6069               reqcharflags = firstcharflags;
6070               }
6071 
6072             for (i = 1; i < repeat_min; i++)
6073               {
6074               pcre_uchar *hc;
6075               size_t this_hwm_offset = cd->hwm - cd->start_workspace;
6076               memcpy(code, previous, IN_UCHARS(len));
6077 
6078               while (cd->hwm > cd->start_workspace + cd->workspace_size -
6079                      WORK_SIZE_SAFETY_MARGIN -
6080                      (this_hwm_offset - base_hwm_offset))
6081                 {
6082                 *errorcodeptr = expand_workspace(cd);
6083                 if (*errorcodeptr != 0) goto FAILED;
6084                 }
6085 
6086               for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
6087                    hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
6088                    hc += LINK_SIZE)
6089                 {
6090                 PUT(cd->hwm, 0, GET(hc, 0) + len);
6091                 cd->hwm += LINK_SIZE;
6092                 }
6093               base_hwm_offset = this_hwm_offset;
6094               code += len;
6095               }
6096             }
6097           }
6098 
6099         if (repeat_max > 0) repeat_max -= repeat_min;
6100         }
6101 
6102       /* This code is common to both the zero and non-zero minimum cases. If
6103       the maximum is limited, it replicates the group in a nested fashion,
6104       remembering the bracket starts on a stack. In the case of a zero minimum,
6105       the first one was set up above. In all cases the repeat_max now specifies
6106       the number of additional copies needed. Again, we must remember to
6107       replicate entries on the forward reference list. */
6108 
6109       if (repeat_max >= 0)
6110         {
6111         /* In the pre-compile phase, we don't actually do the replication. We
6112         just adjust the length as if we had. For each repetition we must add 1
6113         to the length for BRAZERO and for all but the last repetition we must
6114         add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
6115         paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
6116         a 64-bit integer type when available, otherwise double. */
6117 
6118         if (lengthptr != NULL && repeat_max > 0)
6119           {
6120           int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
6121                       2 - 2*LINK_SIZE;   /* Last one doesn't nest */
6122           if ((INT64_OR_DOUBLE)repeat_max *
6123                 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
6124                   > (INT64_OR_DOUBLE)INT_MAX ||
6125               OFLOW_MAX - *lengthptr < delta)
6126             {
6127             *errorcodeptr = ERR20;
6128             goto FAILED;
6129             }
6130           *lengthptr += delta;
6131           }
6132 
6133         /* This is compiling for real */
6134 
6135         else for (i = repeat_max - 1; i >= 0; i--)
6136           {
6137           pcre_uchar *hc;
6138           size_t this_hwm_offset = cd->hwm - cd->start_workspace;
6139 
6140           *code++ = OP_BRAZERO + repeat_type;
6141 
6142           /* All but the final copy start a new nesting, maintaining the
6143           chain of brackets outstanding. */
6144 
6145           if (i != 0)
6146             {
6147             int offset;
6148             *code++ = OP_BRA;
6149             offset = (bralink == NULL)? 0 : (int)(code - bralink);
6150             bralink = code;
6151             PUTINC(code, 0, offset);
6152             }
6153 
6154           memcpy(code, previous, IN_UCHARS(len));
6155 
6156           /* Ensure there is enough workspace for forward references before
6157           copying them. */
6158 
6159           while (cd->hwm > cd->start_workspace + cd->workspace_size -
6160                  WORK_SIZE_SAFETY_MARGIN -
6161                  (this_hwm_offset - base_hwm_offset))
6162             {
6163             *errorcodeptr = expand_workspace(cd);
6164             if (*errorcodeptr != 0) goto FAILED;
6165             }
6166 
6167           for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
6168                hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
6169                hc += LINK_SIZE)
6170             {
6171             PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
6172             cd->hwm += LINK_SIZE;
6173             }
6174           base_hwm_offset = this_hwm_offset;
6175           code += len;
6176           }
6177 
6178         /* Now chain through the pending brackets, and fill in their length
6179         fields (which are holding the chain links pro tem). */
6180 
6181         while (bralink != NULL)
6182           {
6183           int oldlinkoffset;
6184           int offset = (int)(code - bralink + 1);
6185           pcre_uchar *bra = code - offset;
6186           oldlinkoffset = GET(bra, 1);
6187           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
6188           *code++ = OP_KET;
6189           PUTINC(code, 0, offset);
6190           PUT(bra, 1, offset);
6191           }
6192         }
6193 
6194       /* If the maximum is unlimited, set a repeater in the final copy. For
6195       ONCE brackets, that's all we need to do. However, possessively repeated
6196       ONCE brackets can be converted into non-capturing brackets, as the
6197       behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
6198       deal with possessive ONCEs specially.
6199 
6200       Otherwise, when we are doing the actual compile phase, check to see
6201       whether this group is one that could match an empty string. If so,
6202       convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
6203       that runtime checking can be done. [This check is also applied to ONCE
6204       groups at runtime, but in a different way.]
6205 
6206       Then, if the quantifier was possessive and the bracket is not a
6207       conditional, we convert the BRA code to the POS form, and the KET code to
6208       KETRPOS. (It turns out to be convenient at runtime to detect this kind of
6209       subpattern at both the start and at the end.) The use of special opcodes
6210       makes it possible to reduce greatly the stack usage in pcre_exec(). If
6211       the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
6212 
6213       Then, if the minimum number of matches is 1 or 0, cancel the possessive
6214       flag so that the default action below, of wrapping everything inside
6215       atomic brackets, does not happen. When the minimum is greater than 1,
6216       there will be earlier copies of the group, and so we still have to wrap
6217       the whole thing. */
6218 
6219       else
6220         {
6221         pcre_uchar *ketcode = code - 1 - LINK_SIZE;
6222         pcre_uchar *bracode = ketcode - GET(ketcode, 1);
6223 
6224         /* Convert possessive ONCE brackets to non-capturing */
6225 
6226         if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
6227             possessive_quantifier) *bracode = OP_BRA;
6228 
6229         /* For non-possessive ONCE brackets, all we need to do is to
6230         set the KET. */
6231 
6232         if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
6233           *ketcode = OP_KETRMAX + repeat_type;
6234 
6235         /* Handle non-ONCE brackets and possessive ONCEs (which have been
6236         converted to non-capturing above). */
6237 
6238         else
6239           {
6240           /* In the compile phase, check for empty string matching. */
6241 
6242           if (lengthptr == NULL)
6243             {
6244             pcre_uchar *scode = bracode;
6245             do
6246               {
6247               if (could_be_empty_branch(scode, ketcode, utf, cd, NULL))
6248                 {
6249                 *bracode += OP_SBRA - OP_BRA;
6250                 break;
6251                 }
6252               scode += GET(scode, 1);
6253               }
6254             while (*scode == OP_ALT);
6255             }
6256 
6257           /* Handle possessive quantifiers. */
6258 
6259           if (possessive_quantifier)
6260             {
6261             /* For COND brackets, we wrap the whole thing in a possessively
6262             repeated non-capturing bracket, because we have not invented POS
6263             versions of the COND opcodes. Because we are moving code along, we
6264             must ensure that any pending recursive references are updated. */
6265 
6266             if (*bracode == OP_COND || *bracode == OP_SCOND)
6267               {
6268               int nlen = (int)(code - bracode);
6269               *code = OP_END;
6270               adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm_offset);
6271               memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
6272               code += 1 + LINK_SIZE;
6273               nlen += 1 + LINK_SIZE;
6274               *bracode = OP_BRAPOS;
6275               *code++ = OP_KETRPOS;
6276               PUTINC(code, 0, nlen);
6277               PUT(bracode, 1, nlen);
6278               }
6279 
6280             /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
6281 
6282             else
6283               {
6284               *bracode += 1;              /* Switch to xxxPOS opcodes */
6285               *ketcode = OP_KETRPOS;
6286               }
6287 
6288             /* If the minimum is zero, mark it as possessive, then unset the
6289             possessive flag when the minimum is 0 or 1. */
6290 
6291             if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
6292             if (repeat_min < 2) possessive_quantifier = FALSE;
6293             }
6294 
6295           /* Non-possessive quantifier */
6296 
6297           else *ketcode = OP_KETRMAX + repeat_type;
6298           }
6299         }
6300       }
6301 
6302     /* If previous is OP_FAIL, it was generated by an empty class [] in
6303     JavaScript mode. The other ways in which OP_FAIL can be generated, that is
6304     by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
6305     error above. We can just ignore the repeat in JS case. */
6306 
6307     else if (*previous == OP_FAIL) goto END_REPEAT;
6308 
6309     /* Else there's some kind of shambles */
6310 
6311     else
6312       {
6313       *errorcodeptr = ERR11;
6314       goto FAILED;
6315       }
6316 
6317     /* If the character following a repeat is '+', possessive_quantifier is
6318     TRUE. For some opcodes, there are special alternative opcodes for this
6319     case. For anything else, we wrap the entire repeated item inside OP_ONCE
6320     brackets. Logically, the '+' notation is just syntactic sugar, taken from
6321     Sun's Java package, but the special opcodes can optimize it.
6322 
6323     Some (but not all) possessively repeated subpatterns have already been
6324     completely handled in the code just above. For them, possessive_quantifier
6325     is always FALSE at this stage. Note that the repeated item starts at
6326     tempcode, not at previous, which might be the first part of a string whose
6327     (former) last char we repeated. */
6328 
6329     if (possessive_quantifier)
6330       {
6331       int len;
6332 
6333       /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
6334       However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
6335       {5,}, or {5,10}). We skip over an EXACT item; if the length of what
6336       remains is greater than zero, there's a further opcode that can be
6337       handled. If not, do nothing, leaving the EXACT alone. */
6338 
6339       switch(*tempcode)
6340         {
6341         case OP_TYPEEXACT:
6342         tempcode += PRIV(OP_lengths)[*tempcode] +
6343           ((tempcode[1 + IMM2_SIZE] == OP_PROP
6344           || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
6345         break;
6346 
6347         /* CHAR opcodes are used for exacts whose count is 1. */
6348 
6349         case OP_CHAR:
6350         case OP_CHARI:
6351         case OP_NOT:
6352         case OP_NOTI:
6353         case OP_EXACT:
6354         case OP_EXACTI:
6355         case OP_NOTEXACT:
6356         case OP_NOTEXACTI:
6357         tempcode += PRIV(OP_lengths)[*tempcode];
6358 #ifdef SUPPORT_UTF
6359         if (utf && HAS_EXTRALEN(tempcode[-1]))
6360           tempcode += GET_EXTRALEN(tempcode[-1]);
6361 #endif
6362         break;
6363 
6364         /* For the class opcodes, the repeat operator appears at the end;
6365         adjust tempcode to point to it. */
6366 
6367         case OP_CLASS:
6368         case OP_NCLASS:
6369         tempcode += 1 + 32/sizeof(pcre_uchar);
6370         break;
6371 
6372 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6373         case OP_XCLASS:
6374         tempcode += GET(tempcode, 1);
6375         break;
6376 #endif
6377         }
6378 
6379       /* If tempcode is equal to code (which points to the end of the repeated
6380       item), it means we have skipped an EXACT item but there is no following
6381       QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
6382       all other cases, tempcode will be pointing to the repeat opcode, and will
6383       be less than code, so the value of len will be greater than 0. */
6384 
6385       len = (int)(code - tempcode);
6386       if (len > 0)
6387         {
6388         unsigned int repcode = *tempcode;
6389 
6390         /* There is a table for possessifying opcodes, all of which are less
6391         than OP_CALLOUT. A zero entry means there is no possessified version.
6392         */
6393 
6394         if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
6395           *tempcode = opcode_possessify[repcode];
6396 
6397         /* For opcode without a special possessified version, wrap the item in
6398         ONCE brackets. Because we are moving code along, we must ensure that any
6399         pending recursive references are updated. */
6400 
6401         else
6402           {
6403           *code = OP_END;
6404           adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm_offset);
6405           memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6406           code += 1 + LINK_SIZE;
6407           len += 1 + LINK_SIZE;
6408           tempcode[0] = OP_ONCE;
6409           *code++ = OP_KET;
6410           PUTINC(code, 0, len);
6411           PUT(tempcode, 1, len);
6412           }
6413         }
6414 
6415 #ifdef NEVER
6416       if (len > 0) switch (*tempcode)
6417         {
6418         case OP_STAR:  *tempcode = OP_POSSTAR; break;
6419         case OP_PLUS:  *tempcode = OP_POSPLUS; break;
6420         case OP_QUERY: *tempcode = OP_POSQUERY; break;
6421         case OP_UPTO:  *tempcode = OP_POSUPTO; break;
6422 
6423         case OP_STARI:  *tempcode = OP_POSSTARI; break;
6424         case OP_PLUSI:  *tempcode = OP_POSPLUSI; break;
6425         case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
6426         case OP_UPTOI:  *tempcode = OP_POSUPTOI; break;
6427 
6428         case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
6429         case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
6430         case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
6431         case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
6432 
6433         case OP_NOTSTARI:  *tempcode = OP_NOTPOSSTARI; break;
6434         case OP_NOTPLUSI:  *tempcode = OP_NOTPOSPLUSI; break;
6435         case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
6436         case OP_NOTUPTOI:  *tempcode = OP_NOTPOSUPTOI; break;
6437 
6438         case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
6439         case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
6440         case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
6441         case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
6442 
6443         case OP_CRSTAR:   *tempcode = OP_CRPOSSTAR; break;
6444         case OP_CRPLUS:   *tempcode = OP_CRPOSPLUS; break;
6445         case OP_CRQUERY:  *tempcode = OP_CRPOSQUERY; break;
6446         case OP_CRRANGE:  *tempcode = OP_CRPOSRANGE; break;
6447 
6448         /* Because we are moving code along, we must ensure that any
6449         pending recursive references are updated. */
6450 
6451         default:
6452         *code = OP_END;
6453         adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm_offset);
6454         memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6455         code += 1 + LINK_SIZE;
6456         len += 1 + LINK_SIZE;
6457         tempcode[0] = OP_ONCE;
6458         *code++ = OP_KET;
6459         PUTINC(code, 0, len);
6460         PUT(tempcode, 1, len);
6461         break;
6462         }
6463 #endif
6464       }
6465 
6466     /* In all cases we no longer have a previous item. We also set the
6467     "follows varying string" flag for subsequently encountered reqchars if
6468     it isn't already set and we have just passed a varying length item. */
6469 
6470     END_REPEAT:
6471     previous = NULL;
6472     cd->req_varyopt |= reqvary;
6473     break;
6474 
6475 
6476     /* ===================================================================*/
6477     /* Start of nested parenthesized sub-expression, or comment or lookahead or
6478     lookbehind or option setting or condition or all the other extended
6479     parenthesis forms.  */
6480 
6481     case CHAR_LEFT_PARENTHESIS:
6482     ptr++;
6483 
6484     /* First deal with comments. Putting this code right at the start ensures
6485     that comments have no bad side effects. */
6486 
6487     if (ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
6488       {
6489       ptr += 2;
6490       while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6491       if (*ptr == CHAR_NULL)
6492         {
6493         *errorcodeptr = ERR18;
6494         goto FAILED;
6495         }
6496       continue;
6497       }
6498 
6499     /* Now deal with various "verbs" that can be introduced by '*'. */
6500 
6501     if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
6502          || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
6503       {
6504       int i, namelen;
6505       int arglen = 0;
6506       const char *vn = verbnames;
6507       const pcre_uchar *name = ptr + 1;
6508       const pcre_uchar *arg = NULL;
6509       previous = NULL;
6510       ptr++;
6511       while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
6512       namelen = (int)(ptr - name);
6513 
6514       /* It appears that Perl allows any characters whatsoever, other than
6515       a closing parenthesis, to appear in arguments, so we no longer insist on
6516       letters, digits, and underscores. */
6517 
6518       if (*ptr == CHAR_COLON)
6519         {
6520         arg = ++ptr;
6521         while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6522         arglen = (int)(ptr - arg);
6523         if ((unsigned int)arglen > MAX_MARK)
6524           {
6525           *errorcodeptr = ERR75;
6526           goto FAILED;
6527           }
6528         }
6529 
6530       if (*ptr != CHAR_RIGHT_PARENTHESIS)
6531         {
6532         *errorcodeptr = ERR60;
6533         goto FAILED;
6534         }
6535 
6536       /* Scan the table of verb names */
6537 
6538       for (i = 0; i < verbcount; i++)
6539         {
6540         if (namelen == verbs[i].len &&
6541             STRNCMP_UC_C8(name, vn, namelen) == 0)
6542           {
6543           int setverb;
6544 
6545           /* Check for open captures before ACCEPT and convert it to
6546           ASSERT_ACCEPT if in an assertion. */
6547 
6548           if (verbs[i].op == OP_ACCEPT)
6549             {
6550             open_capitem *oc;
6551             if (arglen != 0)
6552               {
6553               *errorcodeptr = ERR59;
6554               goto FAILED;
6555               }
6556             cd->had_accept = TRUE;
6557             for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6558               {
6559               *code++ = OP_CLOSE;
6560               PUT2INC(code, 0, oc->number);
6561               }
6562             setverb = *code++ =
6563               (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6564 
6565             /* Do not set firstchar after *ACCEPT */
6566             if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
6567             }
6568 
6569           /* Handle other cases with/without an argument */
6570 
6571           else if (arglen == 0)
6572             {
6573             if (verbs[i].op < 0)   /* Argument is mandatory */
6574               {
6575               *errorcodeptr = ERR66;
6576               goto FAILED;
6577               }
6578             setverb = *code++ = verbs[i].op;
6579             }
6580 
6581           else
6582             {
6583             if (verbs[i].op_arg < 0)   /* Argument is forbidden */
6584               {
6585               *errorcodeptr = ERR59;
6586               goto FAILED;
6587               }
6588             setverb = *code++ = verbs[i].op_arg;
6589             *code++ = arglen;
6590             memcpy(code, arg, IN_UCHARS(arglen));
6591             code += arglen;
6592             *code++ = 0;
6593             }
6594 
6595           switch (setverb)
6596             {
6597             case OP_THEN:
6598             case OP_THEN_ARG:
6599             cd->external_flags |= PCRE_HASTHEN;
6600             break;
6601 
6602             case OP_PRUNE:
6603             case OP_PRUNE_ARG:
6604             case OP_SKIP:
6605             case OP_SKIP_ARG:
6606             cd->had_pruneorskip = TRUE;
6607             break;
6608             }
6609 
6610           break;  /* Found verb, exit loop */
6611           }
6612 
6613         vn += verbs[i].len + 1;
6614         }
6615 
6616       if (i < verbcount) continue;    /* Successfully handled a verb */
6617       *errorcodeptr = ERR60;          /* Verb not recognized */
6618       goto FAILED;
6619       }
6620 
6621     /* Initialize for "real" parentheses */
6622 
6623     newoptions = options;
6624     skipbytes = 0;
6625     bravalue = OP_CBRA;
6626     save_hwm_offset = cd->hwm - cd->start_workspace;
6627     reset_bracount = FALSE;
6628 
6629     /* Deal with the extended parentheses; all are introduced by '?', and the
6630     appearance of any of them means that this is not a capturing group. */
6631 
6632     if (*ptr == CHAR_QUESTION_MARK)
6633       {
6634       int i, set, unset, namelen;
6635       int *optset;
6636       const pcre_uchar *name;
6637       pcre_uchar *slot;
6638 
6639       switch (*(++ptr))
6640         {
6641         /* ------------------------------------------------------------ */
6642         case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
6643         reset_bracount = TRUE;
6644         /* Fall through */
6645 
6646         /* ------------------------------------------------------------ */
6647         case CHAR_COLON:          /* Non-capturing bracket */
6648         bravalue = OP_BRA;
6649         ptr++;
6650         break;
6651 
6652 
6653         /* ------------------------------------------------------------ */
6654         case CHAR_LEFT_PARENTHESIS:
6655         bravalue = OP_COND;       /* Conditional group */
6656         tempptr = ptr;
6657 
6658         /* A condition can be an assertion, a number (referring to a numbered
6659         group's having been set), a name (referring to a named group), or 'R',
6660         referring to recursion. R<digits> and R&name are also permitted for
6661         recursion tests.
6662 
6663         There are ways of testing a named group: (?(name)) is used by Python;
6664         Perl 5.10 onwards uses (?(<name>) or (?('name')).
6665 
6666         There is one unfortunate ambiguity, caused by history. 'R' can be the
6667         recursive thing or the name 'R' (and similarly for 'R' followed by
6668         digits). We look for a name first; if not found, we try the other case.
6669 
6670         For compatibility with auto-callouts, we allow a callout to be
6671         specified before a condition that is an assertion. First, check for the
6672         syntax of a callout; if found, adjust the temporary pointer that is
6673         used to check for an assertion condition. That's all that is needed! */
6674 
6675         if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C)
6676           {
6677           for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break;
6678           if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
6679             tempptr += i + 1;
6680           }
6681 
6682         /* For conditions that are assertions, check the syntax, and then exit
6683         the switch. This will take control down to where bracketed groups,
6684         including assertions, are processed. */
6685 
6686         if (tempptr[1] == CHAR_QUESTION_MARK &&
6687               (tempptr[2] == CHAR_EQUALS_SIGN ||
6688                tempptr[2] == CHAR_EXCLAMATION_MARK ||
6689                  (tempptr[2] == CHAR_LESS_THAN_SIGN &&
6690                    (tempptr[3] == CHAR_EQUALS_SIGN ||
6691                     tempptr[3] == CHAR_EXCLAMATION_MARK))))
6692           {
6693           cd->iscondassert = TRUE;
6694           break;
6695           }
6696 
6697         /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6698         need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6699 
6700         code[1+LINK_SIZE] = OP_CREF;
6701         skipbytes = 1+IMM2_SIZE;
6702         refsign = -1;     /* => not a number */
6703         namelen = -1;     /* => not a name; must set to avoid warning */
6704         name = NULL;      /* Always set to avoid warning */
6705         recno = 0;        /* Always set to avoid warning */
6706 
6707         /* Check for a test for recursion in a named group. */
6708 
6709         ptr++;
6710         if (*ptr == CHAR_R && ptr[1] == CHAR_AMPERSAND)
6711           {
6712           terminator = -1;
6713           ptr += 2;
6714           code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
6715           }
6716 
6717         /* Check for a test for a named group's having been set, using the Perl
6718         syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6719         syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). */
6720 
6721         else if (*ptr == CHAR_LESS_THAN_SIGN)
6722           {
6723           terminator = CHAR_GREATER_THAN_SIGN;
6724           ptr++;
6725           }
6726         else if (*ptr == CHAR_APOSTROPHE)
6727           {
6728           terminator = CHAR_APOSTROPHE;
6729           ptr++;
6730           }
6731         else
6732           {
6733           terminator = CHAR_NULL;
6734           if (*ptr == CHAR_MINUS || *ptr == CHAR_PLUS) refsign = *ptr++;
6735             else if (IS_DIGIT(*ptr)) refsign = 0;
6736           }
6737 
6738         /* Handle a number */
6739 
6740         if (refsign >= 0)
6741           {
6742           while (IS_DIGIT(*ptr))
6743             {
6744             recno = recno * 10 + (int)(*ptr - CHAR_0);
6745             ptr++;
6746             }
6747           }
6748 
6749         /* Otherwise we expect to read a name; anything else is an error. When
6750         a name is one of a number of duplicates, a different opcode is used and
6751         it needs more memory. Unfortunately we cannot tell whether a name is a
6752         duplicate in the first pass, so we have to allow for more memory. */
6753 
6754         else
6755           {
6756           if (IS_DIGIT(*ptr))
6757             {
6758             *errorcodeptr = ERR84;
6759             goto FAILED;
6760             }
6761           if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_word) == 0)
6762             {
6763             *errorcodeptr = ERR28;   /* Assertion expected */
6764             goto FAILED;
6765             }
6766           name = ptr++;
6767           while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
6768             {
6769             ptr++;
6770             }
6771           namelen = (int)(ptr - name);
6772           if (lengthptr != NULL) *lengthptr += IMM2_SIZE;
6773           }
6774 
6775         /* Check the terminator */
6776 
6777         if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
6778             *ptr++ != CHAR_RIGHT_PARENTHESIS)
6779           {
6780           ptr--;                  /* Error offset */
6781           *errorcodeptr = ERR26;  /* Malformed number or name */
6782           goto FAILED;
6783           }
6784 
6785         /* Do no further checking in the pre-compile phase. */
6786 
6787         if (lengthptr != NULL) break;
6788 
6789         /* In the real compile we do the work of looking for the actual
6790         reference. If refsign is not negative, it means we have a number in
6791         recno. */
6792 
6793         if (refsign >= 0)
6794           {
6795           if (recno <= 0)
6796             {
6797             *errorcodeptr = ERR35;
6798             goto FAILED;
6799             }
6800           if (refsign != 0) recno = (refsign == CHAR_MINUS)?
6801             cd->bracount - recno + 1 : recno + cd->bracount;
6802           if (recno <= 0 || recno > cd->final_bracount)
6803             {
6804             *errorcodeptr = ERR15;
6805             goto FAILED;
6806             }
6807           PUT2(code, 2+LINK_SIZE, recno);
6808           if (recno > cd->top_backref) cd->top_backref = recno;
6809           break;
6810           }
6811 
6812         /* Otherwise look for the name. */
6813 
6814         slot = cd->name_table;
6815         for (i = 0; i < cd->names_found; i++)
6816           {
6817           if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break;
6818           slot += cd->name_entry_size;
6819           }
6820 
6821         /* Found the named subpattern. If the name is duplicated, add one to
6822         the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6823         appropriate data values. Otherwise, just insert the unique subpattern
6824         number. */
6825 
6826         if (i < cd->names_found)
6827           {
6828           int offset = i++;
6829           int count = 1;
6830           recno = GET2(slot, 0);   /* Number from first found */
6831           if (recno > cd->top_backref) cd->top_backref = recno;
6832           for (; i < cd->names_found; i++)
6833             {
6834             slot += cd->name_entry_size;
6835             if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0 ||
6836               (slot+IMM2_SIZE)[namelen] != 0) break;
6837             count++;
6838             }
6839 
6840           if (count > 1)
6841             {
6842             PUT2(code, 2+LINK_SIZE, offset);
6843             PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6844             skipbytes += IMM2_SIZE;
6845             code[1+LINK_SIZE]++;
6846             }
6847           else  /* Not a duplicated name */
6848             {
6849             PUT2(code, 2+LINK_SIZE, recno);
6850             }
6851           }
6852 
6853         /* If terminator == CHAR_NULL it means that the name followed directly
6854         after the opening parenthesis [e.g. (?(abc)...] and in this case there
6855         are some further alternatives to try. For the cases where terminator !=
6856         CHAR_NULL [things like (?(<name>... or (?('name')... or (?(R&name)... ]
6857         we have now checked all the possibilities, so give an error. */
6858 
6859         else if (terminator != CHAR_NULL)
6860           {
6861           *errorcodeptr = ERR15;
6862           goto FAILED;
6863           }
6864 
6865         /* Check for (?(R) for recursion. Allow digits after R to specify a
6866         specific group number. */
6867 
6868         else if (*name == CHAR_R)
6869           {
6870           recno = 0;
6871           for (i = 1; i < namelen; i++)
6872             {
6873             if (!IS_DIGIT(name[i]))
6874               {
6875               *errorcodeptr = ERR15;
6876               goto FAILED;
6877               }
6878             recno = recno * 10 + name[i] - CHAR_0;
6879             }
6880           if (recno == 0) recno = RREF_ANY;
6881           code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
6882           PUT2(code, 2+LINK_SIZE, recno);
6883           }
6884 
6885         /* Similarly, check for the (?(DEFINE) "condition", which is always
6886         false. */
6887 
6888         else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
6889           {
6890           code[1+LINK_SIZE] = OP_DEF;
6891           skipbytes = 1;
6892           }
6893 
6894         /* Reference to an unidentified subpattern. */
6895 
6896         else
6897           {
6898           *errorcodeptr = ERR15;
6899           goto FAILED;
6900           }
6901         break;
6902 
6903 
6904         /* ------------------------------------------------------------ */
6905         case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
6906         bravalue = OP_ASSERT;
6907         cd->assert_depth += 1;
6908         ptr++;
6909         break;
6910 
6911         /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6912         thing to do, but Perl allows all assertions to be quantified, and when
6913         they contain capturing parentheses there may be a potential use for
6914         this feature. Not that that applies to a quantified (?!) but we allow
6915         it for uniformity. */
6916 
6917         /* ------------------------------------------------------------ */
6918         case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
6919         ptr++;
6920         if (*ptr == CHAR_RIGHT_PARENTHESIS && ptr[1] != CHAR_ASTERISK &&
6921              ptr[1] != CHAR_PLUS && ptr[1] != CHAR_QUESTION_MARK &&
6922             (ptr[1] != CHAR_LEFT_CURLY_BRACKET || !is_counted_repeat(ptr+2)))
6923           {
6924           *code++ = OP_FAIL;
6925           previous = NULL;
6926           continue;
6927           }
6928         bravalue = OP_ASSERT_NOT;
6929         cd->assert_depth += 1;
6930         break;
6931 
6932 
6933         /* ------------------------------------------------------------ */
6934         case CHAR_LESS_THAN_SIGN:              /* Lookbehind or named define */
6935         switch (ptr[1])
6936           {
6937           case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
6938           bravalue = OP_ASSERTBACK;
6939           cd->assert_depth += 1;
6940           ptr += 2;
6941           break;
6942 
6943           case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
6944           bravalue = OP_ASSERTBACK_NOT;
6945           cd->assert_depth += 1;
6946           ptr += 2;
6947           break;
6948 
6949           default:                /* Could be name define, else bad */
6950           if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
6951             goto DEFINE_NAME;
6952           ptr++;                  /* Correct offset for error */
6953           *errorcodeptr = ERR24;
6954           goto FAILED;
6955           }
6956         break;
6957 
6958 
6959         /* ------------------------------------------------------------ */
6960         case CHAR_GREATER_THAN_SIGN:           /* One-time brackets */
6961         bravalue = OP_ONCE;
6962         ptr++;
6963         break;
6964 
6965 
6966         /* ------------------------------------------------------------ */
6967         case CHAR_C:                 /* Callout - may be followed by digits; */
6968         previous_callout = code;     /* Save for later completion */
6969         after_manual_callout = 1;    /* Skip one item before completing */
6970         *code++ = OP_CALLOUT;
6971           {
6972           int n = 0;
6973           ptr++;
6974           while(IS_DIGIT(*ptr))
6975             n = n * 10 + *ptr++ - CHAR_0;
6976           if (*ptr != CHAR_RIGHT_PARENTHESIS)
6977             {
6978             *errorcodeptr = ERR39;
6979             goto FAILED;
6980             }
6981           if (n > 255)
6982             {
6983             *errorcodeptr = ERR38;
6984             goto FAILED;
6985             }
6986           *code++ = n;
6987           PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
6988           PUT(code, LINK_SIZE, 0);                          /* Default length */
6989           code += 2 * LINK_SIZE;
6990           }
6991         previous = NULL;
6992         continue;
6993 
6994 
6995         /* ------------------------------------------------------------ */
6996         case CHAR_P:              /* Python-style named subpattern handling */
6997         if (*(++ptr) == CHAR_EQUALS_SIGN ||
6998             *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
6999           {
7000           is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
7001           terminator = CHAR_RIGHT_PARENTHESIS;
7002           goto NAMED_REF_OR_RECURSE;
7003           }
7004         else if (*ptr != CHAR_LESS_THAN_SIGN)  /* Test for Python-style defn */
7005           {
7006           *errorcodeptr = ERR41;
7007           goto FAILED;
7008           }
7009         /* Fall through to handle (?P< as (?< is handled */
7010 
7011 
7012         /* ------------------------------------------------------------ */
7013         DEFINE_NAME:    /* Come here from (?< handling */
7014         case CHAR_APOSTROPHE:
7015         terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
7016           CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7017         name = ++ptr;
7018         if (IS_DIGIT(*ptr))
7019           {
7020           *errorcodeptr = ERR84;   /* Group name must start with non-digit */
7021           goto FAILED;
7022           }
7023         while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
7024         namelen = (int)(ptr - name);
7025 
7026         /* In the pre-compile phase, do a syntax check, remember the longest
7027         name, and then remember the group in a vector, expanding it if
7028         necessary. Duplicates for the same number are skipped; other duplicates
7029         are checked for validity. In the actual compile, there is nothing to
7030         do. */
7031 
7032         if (lengthptr != NULL)
7033           {
7034           named_group *ng;
7035           pcre_uint32 number = cd->bracount + 1;
7036 
7037           if (*ptr != (pcre_uchar)terminator)
7038             {
7039             *errorcodeptr = ERR42;
7040             goto FAILED;
7041             }
7042 
7043           if (cd->names_found >= MAX_NAME_COUNT)
7044             {
7045             *errorcodeptr = ERR49;
7046             goto FAILED;
7047             }
7048 
7049           if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
7050             {
7051             cd->name_entry_size = namelen + IMM2_SIZE + 1;
7052             if (namelen > MAX_NAME_SIZE)
7053               {
7054               *errorcodeptr = ERR48;
7055               goto FAILED;
7056               }
7057             }
7058 
7059           /* Scan the list to check for duplicates. For duplicate names, if the
7060           number is the same, break the loop, which causes the name to be
7061           discarded; otherwise, if DUPNAMES is not set, give an error.
7062           If it is set, allow the name with a different number, but continue
7063           scanning in case this is a duplicate with the same number. For
7064           non-duplicate names, give an error if the number is duplicated. */
7065 
7066           ng = cd->named_groups;
7067           for (i = 0; i < cd->names_found; i++, ng++)
7068             {
7069             if (namelen == ng->length &&
7070                 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
7071               {
7072               if (ng->number == number) break;
7073               if ((options & PCRE_DUPNAMES) == 0)
7074                 {
7075                 *errorcodeptr = ERR43;
7076                 goto FAILED;
7077                 }
7078               cd->dupnames = TRUE;  /* Duplicate names exist */
7079               }
7080             else if (ng->number == number)
7081               {
7082               *errorcodeptr = ERR65;
7083               goto FAILED;
7084               }
7085             }
7086 
7087           if (i >= cd->names_found)     /* Not a duplicate with same number */
7088             {
7089             /* Increase the list size if necessary */
7090 
7091             if (cd->names_found >= cd->named_group_list_size)
7092               {
7093               int newsize = cd->named_group_list_size * 2;
7094               named_group *newspace = (PUBL(malloc))
7095                 (newsize * sizeof(named_group));
7096 
7097               if (newspace == NULL)
7098                 {
7099                 *errorcodeptr = ERR21;
7100                 goto FAILED;
7101                 }
7102 
7103               memcpy(newspace, cd->named_groups,
7104                 cd->named_group_list_size * sizeof(named_group));
7105               if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
7106                 (PUBL(free))((void *)cd->named_groups);
7107               cd->named_groups = newspace;
7108               cd->named_group_list_size = newsize;
7109               }
7110 
7111             cd->named_groups[cd->names_found].name = name;
7112             cd->named_groups[cd->names_found].length = namelen;
7113             cd->named_groups[cd->names_found].number = number;
7114             cd->names_found++;
7115             }
7116           }
7117 
7118         ptr++;                    /* Move past > or ' in both passes. */
7119         goto NUMBERED_GROUP;
7120 
7121 
7122         /* ------------------------------------------------------------ */
7123         case CHAR_AMPERSAND:            /* Perl recursion/subroutine syntax */
7124         terminator = CHAR_RIGHT_PARENTHESIS;
7125         is_recurse = TRUE;
7126         /* Fall through */
7127 
7128         /* We come here from the Python syntax above that handles both
7129         references (?P=name) and recursion (?P>name), as well as falling
7130         through from the Perl recursion syntax (?&name). We also come here from
7131         the Perl \k<name> or \k'name' back reference syntax and the \k{name}
7132         .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
7133 
7134         NAMED_REF_OR_RECURSE:
7135         name = ++ptr;
7136         if (IS_DIGIT(*ptr))
7137           {
7138           *errorcodeptr = ERR84;   /* Group name must start with non-digit */
7139           goto FAILED;
7140           }
7141         while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
7142         namelen = (int)(ptr - name);
7143 
7144         /* In the pre-compile phase, do a syntax check. We used to just set
7145         a dummy reference number, because it was not used in the first pass.
7146         However, with the change of recursive back references to be atomic,
7147         we have to look for the number so that this state can be identified, as
7148         otherwise the incorrect length is computed. If it's not a backwards
7149         reference, the dummy number will do. */
7150 
7151         if (lengthptr != NULL)
7152           {
7153           named_group *ng;
7154 
7155           if (namelen == 0)
7156             {
7157             *errorcodeptr = ERR62;
7158             goto FAILED;
7159             }
7160           if (*ptr != (pcre_uchar)terminator)
7161             {
7162             *errorcodeptr = ERR42;
7163             goto FAILED;
7164             }
7165           if (namelen > MAX_NAME_SIZE)
7166             {
7167             *errorcodeptr = ERR48;
7168             goto FAILED;
7169             }
7170 
7171           /* The name table does not exist in the first pass; instead we must
7172           scan the list of names encountered so far in order to get the
7173           number. If the name is not found, set the value to 0 for a forward
7174           reference. */
7175 
7176           ng = cd->named_groups;
7177           for (i = 0; i < cd->names_found; i++, ng++)
7178             {
7179             if (namelen == ng->length &&
7180                 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
7181               break;
7182             }
7183           recno = (i < cd->names_found)? ng->number : 0;
7184 
7185           /* Count named back references. */
7186 
7187           if (!is_recurse) cd->namedrefcount++;
7188 
7189           /* We have to allow for a named reference to a duplicated name (this
7190           cannot be determined until the second pass). This needs an extra
7191           16-bit data item. */
7192 
7193           *lengthptr += IMM2_SIZE;
7194           }
7195 
7196         /* In the real compile, search the name table. We check the name
7197         first, and then check that we have reached the end of the name in the
7198         table. That way, if the name is longer than any in the table, the
7199         comparison will fail without reading beyond the table entry. */
7200 
7201         else
7202           {
7203           slot = cd->name_table;
7204           for (i = 0; i < cd->names_found; i++)
7205             {
7206             if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
7207                 slot[IMM2_SIZE+namelen] == 0)
7208               break;
7209             slot += cd->name_entry_size;
7210             }
7211 
7212           if (i < cd->names_found)
7213             {
7214             recno = GET2(slot, 0);
7215             }
7216           else
7217             {
7218             *errorcodeptr = ERR15;
7219             goto FAILED;
7220             }
7221           }
7222 
7223         /* In both phases, for recursions, we can now go to the code that
7224         handles numerical recursion. */
7225 
7226         if (is_recurse) goto HANDLE_RECURSION;
7227 
7228         /* In the second pass we must see if the name is duplicated. If so, we
7229         generate a different opcode. */
7230 
7231         if (lengthptr == NULL && cd->dupnames)
7232           {
7233           int count = 1;
7234           unsigned int index = i;
7235           pcre_uchar *cslot = slot + cd->name_entry_size;
7236 
7237           for (i++; i < cd->names_found; i++)
7238             {
7239             if (STRCMP_UC_UC(slot + IMM2_SIZE, cslot + IMM2_SIZE) != 0) break;
7240 
7241 
7242             count++;
7243             cslot += cd->name_entry_size;
7244             }
7245 
7246           if (count > 1)
7247             {
7248             if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7249             previous = code;
7250             *code++ = ((options & PCRE_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
7251             PUT2INC(code, 0, index);
7252             PUT2INC(code, 0, count);
7253 
7254             /* Process each potentially referenced group. */
7255 
7256             for (; slot < cslot; slot += cd->name_entry_size)
7257               {
7258               open_capitem *oc;
7259               recno = GET2(slot, 0);
7260               cd->backref_map |= (recno < 32)? (1 << recno) : 1;
7261               if (recno > cd->top_backref) cd->top_backref = recno;
7262 
7263               /* Check to see if this back reference is recursive, that is, it
7264               is inside the group that it references. A flag is set so that the
7265               group can be made atomic. */
7266 
7267               for (oc = cd->open_caps; oc != NULL; oc = oc->next)
7268                 {
7269                 if (oc->number == recno)
7270                   {
7271                   oc->flag = TRUE;
7272                   break;
7273                   }
7274                 }
7275               }
7276 
7277             continue;  /* End of back ref handling */
7278             }
7279           }
7280 
7281         /* First pass, or a non-duplicated name. */
7282 
7283         goto HANDLE_REFERENCE;
7284 
7285 
7286         /* ------------------------------------------------------------ */
7287         case CHAR_R:              /* Recursion */
7288         ptr++;                    /* Same as (?0)      */
7289         /* Fall through */
7290 
7291 
7292         /* ------------------------------------------------------------ */
7293         case CHAR_MINUS: case CHAR_PLUS:  /* Recursion or subroutine */
7294         case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
7295         case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
7296           {
7297           const pcre_uchar *called;
7298           terminator = CHAR_RIGHT_PARENTHESIS;
7299 
7300           /* Come here from the \g<...> and \g'...' code (Oniguruma
7301           compatibility). However, the syntax has been checked to ensure that
7302           the ... are a (signed) number, so that neither ERR63 nor ERR29 will
7303           be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
7304           ever be taken. */
7305 
7306           HANDLE_NUMERICAL_RECURSION:
7307 
7308           if ((refsign = *ptr) == CHAR_PLUS)
7309             {
7310             ptr++;
7311             if (!IS_DIGIT(*ptr))
7312               {
7313               *errorcodeptr = ERR63;
7314               goto FAILED;
7315               }
7316             }
7317           else if (refsign == CHAR_MINUS)
7318             {
7319             if (!IS_DIGIT(ptr[1]))
7320               goto OTHER_CHAR_AFTER_QUERY;
7321             ptr++;
7322             }
7323 
7324           recno = 0;
7325           while(IS_DIGIT(*ptr))
7326             recno = recno * 10 + *ptr++ - CHAR_0;
7327 
7328           if (*ptr != (pcre_uchar)terminator)
7329             {
7330             *errorcodeptr = ERR29;
7331             goto FAILED;
7332             }
7333 
7334           if (refsign == CHAR_MINUS)
7335             {
7336             if (recno == 0)
7337               {
7338               *errorcodeptr = ERR58;
7339               goto FAILED;
7340               }
7341             recno = cd->bracount - recno + 1;
7342             if (recno <= 0)
7343               {
7344               *errorcodeptr = ERR15;
7345               goto FAILED;
7346               }
7347             }
7348           else if (refsign == CHAR_PLUS)
7349             {
7350             if (recno == 0)
7351               {
7352               *errorcodeptr = ERR58;
7353               goto FAILED;
7354               }
7355             recno += cd->bracount;
7356             }
7357 
7358           /* Come here from code above that handles a named recursion */
7359 
7360           HANDLE_RECURSION:
7361 
7362           previous = code;
7363           called = cd->start_code;
7364 
7365           /* When we are actually compiling, find the bracket that is being
7366           referenced. Temporarily end the regex in case it doesn't exist before
7367           this point. If we end up with a forward reference, first check that
7368           the bracket does occur later so we can give the error (and position)
7369           now. Then remember this forward reference in the workspace so it can
7370           be filled in at the end. */
7371 
7372           if (lengthptr == NULL)
7373             {
7374             *code = OP_END;
7375             if (recno != 0)
7376               called = PRIV(find_bracket)(cd->start_code, utf, recno);
7377 
7378             /* Forward reference */
7379 
7380             if (called == NULL)
7381               {
7382               if (recno > cd->final_bracount)
7383                 {
7384                 *errorcodeptr = ERR15;
7385                 goto FAILED;
7386                 }
7387 
7388               /* Fudge the value of "called" so that when it is inserted as an
7389               offset below, what is actually inserted is the reference number
7390               of the group. Then remember the forward reference. */
7391 
7392               called = cd->start_code + recno;
7393               if (cd->hwm >= cd->start_workspace + cd->workspace_size -
7394                   WORK_SIZE_SAFETY_MARGIN)
7395                 {
7396                 *errorcodeptr = expand_workspace(cd);
7397                 if (*errorcodeptr != 0) goto FAILED;
7398                 }
7399               PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
7400               }
7401 
7402             /* If not a forward reference, and the subpattern is still open,
7403             this is a recursive call. We check to see if this is a left
7404             recursion that could loop for ever, and diagnose that case. We
7405             must not, however, do this check if we are in a conditional
7406             subpattern because the condition might be testing for recursion in
7407             a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid.
7408             Forever loops are also detected at runtime, so those that occur in
7409             conditional subpatterns will be picked up then. */
7410 
7411             else if (GET(called, 1) == 0 && cond_depth <= 0 &&
7412                      could_be_empty(called, code, bcptr, utf, cd))
7413               {
7414               *errorcodeptr = ERR40;
7415               goto FAILED;
7416               }
7417             }
7418 
7419           /* Insert the recursion/subroutine item. It does not have a set first
7420           character (relevant if it is repeated, because it will then be
7421           wrapped with ONCE brackets). */
7422 
7423           *code = OP_RECURSE;
7424           PUT(code, 1, (int)(called - cd->start_code));
7425           code += 1 + LINK_SIZE;
7426           groupsetfirstchar = FALSE;
7427           }
7428 
7429         /* Can't determine a first byte now */
7430 
7431         if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7432         continue;
7433 
7434 
7435         /* ------------------------------------------------------------ */
7436         default:              /* Other characters: check option setting */
7437         OTHER_CHAR_AFTER_QUERY:
7438         set = unset = 0;
7439         optset = &set;
7440 
7441         while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
7442           {
7443           switch (*ptr++)
7444             {
7445             case CHAR_MINUS: optset = &unset; break;
7446 
7447             case CHAR_J:    /* Record that it changed in the external options */
7448             *optset |= PCRE_DUPNAMES;
7449             cd->external_flags |= PCRE_JCHANGED;
7450             break;
7451 
7452             case CHAR_i: *optset |= PCRE_CASELESS; break;
7453             case CHAR_m: *optset |= PCRE_MULTILINE; break;
7454             case CHAR_s: *optset |= PCRE_DOTALL; break;
7455             case CHAR_x: *optset |= PCRE_EXTENDED; break;
7456             case CHAR_U: *optset |= PCRE_UNGREEDY; break;
7457             case CHAR_X: *optset |= PCRE_EXTRA; break;
7458 
7459             default:  *errorcodeptr = ERR12;
7460                       ptr--;    /* Correct the offset */
7461                       goto FAILED;
7462             }
7463           }
7464 
7465         /* Set up the changed option bits, but don't change anything yet. */
7466 
7467         newoptions = (options | set) & (~unset);
7468 
7469         /* If the options ended with ')' this is not the start of a nested
7470         group with option changes, so the options change at this level. If this
7471         item is right at the start of the pattern, the options can be
7472         abstracted and made external in the pre-compile phase, and ignored in
7473         the compile phase. This can be helpful when matching -- for instance in
7474         caseless checking of required bytes.
7475 
7476         If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
7477         definitely *not* at the start of the pattern because something has been
7478         compiled. In the pre-compile phase, however, the code pointer can have
7479         that value after the start, because it gets reset as code is discarded
7480         during the pre-compile. However, this can happen only at top level - if
7481         we are within parentheses, the starting BRA will still be present. At
7482         any parenthesis level, the length value can be used to test if anything
7483         has been compiled at that level. Thus, a test for both these conditions
7484         is necessary to ensure we correctly detect the start of the pattern in
7485         both phases.
7486 
7487         If we are not at the pattern start, reset the greedy defaults and the
7488         case value for firstchar and reqchar. */
7489 
7490         if (*ptr == CHAR_RIGHT_PARENTHESIS)
7491           {
7492           if (code == cd->start_code + 1 + LINK_SIZE &&
7493                (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
7494             {
7495             cd->external_options = newoptions;
7496             }
7497           else
7498             {
7499             greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
7500             greedy_non_default = greedy_default ^ 1;
7501             req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
7502             }
7503 
7504           /* Change options at this level, and pass them back for use
7505           in subsequent branches. */
7506 
7507           *optionsptr = options = newoptions;
7508           previous = NULL;       /* This item can't be repeated */
7509           continue;              /* It is complete */
7510           }
7511 
7512         /* If the options ended with ':' we are heading into a nested group
7513         with possible change of options. Such groups are non-capturing and are
7514         not assertions of any kind. All we need to do is skip over the ':';
7515         the newoptions value is handled below. */
7516 
7517         bravalue = OP_BRA;
7518         ptr++;
7519         }     /* End of switch for character following (? */
7520       }       /* End of (? handling */
7521 
7522     /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
7523     is set, all unadorned brackets become non-capturing and behave like (?:...)
7524     brackets. */
7525 
7526     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
7527       {
7528       bravalue = OP_BRA;
7529       }
7530 
7531     /* Else we have a capturing group. */
7532 
7533     else
7534       {
7535       NUMBERED_GROUP:
7536       cd->bracount += 1;
7537       PUT2(code, 1+LINK_SIZE, cd->bracount);
7538       skipbytes = IMM2_SIZE;
7539       }
7540 
7541     /* Process nested bracketed regex. First check for parentheses nested too
7542     deeply. */
7543 
7544     if ((cd->parens_depth += 1) > PARENS_NEST_LIMIT)
7545       {
7546       *errorcodeptr = ERR82;
7547       goto FAILED;
7548       }
7549 
7550     /* All assertions used not to be repeatable, but this was changed for Perl
7551     compatibility. All kinds can now be repeated except for assertions that are
7552     conditions (Perl also forbids these to be repeated). We copy code into a
7553     non-register variable (tempcode) in order to be able to pass its address
7554     because some compilers complain otherwise. At the start of a conditional
7555     group whose condition is an assertion, cd->iscondassert is set. We unset it
7556     here so as to allow assertions later in the group to be quantified. */
7557 
7558     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT &&
7559         cd->iscondassert)
7560       {
7561       previous = NULL;
7562       cd->iscondassert = FALSE;
7563       }
7564     else previous = code;
7565 
7566     *code = bravalue;
7567     tempcode = code;
7568     tempreqvary = cd->req_varyopt;        /* Save value before bracket */
7569     tempbracount = cd->bracount;          /* Save value before bracket */
7570     length_prevgroup = 0;                 /* Initialize for pre-compile phase */
7571 
7572     if (!compile_regex(
7573          newoptions,                      /* The complete new option state */
7574          &tempcode,                       /* Where to put code (updated) */
7575          &ptr,                            /* Input pointer (updated) */
7576          errorcodeptr,                    /* Where to put an error message */
7577          (bravalue == OP_ASSERTBACK ||
7578           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
7579          reset_bracount,                  /* True if (?| group */
7580          skipbytes,                       /* Skip over bracket number */
7581          cond_depth +
7582            ((bravalue == OP_COND)?1:0),   /* Depth of condition subpatterns */
7583          &subfirstchar,                   /* For possible first char */
7584          &subfirstcharflags,
7585          &subreqchar,                     /* For possible last char */
7586          &subreqcharflags,
7587          bcptr,                           /* Current branch chain */
7588          cd,                              /* Tables block */
7589          (lengthptr == NULL)? NULL :      /* Actual compile phase */
7590            &length_prevgroup              /* Pre-compile phase */
7591          ))
7592       goto FAILED;
7593 
7594     cd->parens_depth -= 1;
7595 
7596     /* If this was an atomic group and there are no capturing groups within it,
7597     generate OP_ONCE_NC instead of OP_ONCE. */
7598 
7599     if (bravalue == OP_ONCE && cd->bracount <= tempbracount)
7600       *code = OP_ONCE_NC;
7601 
7602     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
7603       cd->assert_depth -= 1;
7604 
7605     /* At the end of compiling, code is still pointing to the start of the
7606     group, while tempcode has been updated to point past the end of the group.
7607     The pattern pointer (ptr) is on the bracket.
7608 
7609     If this is a conditional bracket, check that there are no more than
7610     two branches in the group, or just one if it's a DEFINE group. We do this
7611     in the real compile phase, not in the pre-pass, where the whole group may
7612     not be available. */
7613 
7614     if (bravalue == OP_COND && lengthptr == NULL)
7615       {
7616       pcre_uchar *tc = code;
7617       int condcount = 0;
7618 
7619       do {
7620          condcount++;
7621          tc += GET(tc,1);
7622          }
7623       while (*tc != OP_KET);
7624 
7625       /* A DEFINE group is never obeyed inline (the "condition" is always
7626       false). It must have only one branch. */
7627 
7628       if (code[LINK_SIZE+1] == OP_DEF)
7629         {
7630         if (condcount > 1)
7631           {
7632           *errorcodeptr = ERR54;
7633           goto FAILED;
7634           }
7635         bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
7636         }
7637 
7638       /* A "normal" conditional group. If there is just one branch, we must not
7639       make use of its firstchar or reqchar, because this is equivalent to an
7640       empty second branch. */
7641 
7642       else
7643         {
7644         if (condcount > 2)
7645           {
7646           *errorcodeptr = ERR27;
7647           goto FAILED;
7648           }
7649         if (condcount == 1) subfirstcharflags = subreqcharflags = REQ_NONE;
7650         }
7651       }
7652 
7653     /* Error if hit end of pattern */
7654 
7655     if (*ptr != CHAR_RIGHT_PARENTHESIS)
7656       {
7657       *errorcodeptr = ERR14;
7658       goto FAILED;
7659       }
7660 
7661     /* In the pre-compile phase, update the length by the length of the group,
7662     less the brackets at either end. Then reduce the compiled code to just a
7663     set of non-capturing brackets so that it doesn't use much memory if it is
7664     duplicated by a quantifier.*/
7665 
7666     if (lengthptr != NULL)
7667       {
7668       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
7669         {
7670         *errorcodeptr = ERR20;
7671         goto FAILED;
7672         }
7673       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
7674       code++;   /* This already contains bravalue */
7675       PUTINC(code, 0, 1 + LINK_SIZE);
7676       *code++ = OP_KET;
7677       PUTINC(code, 0, 1 + LINK_SIZE);
7678       break;    /* No need to waste time with special character handling */
7679       }
7680 
7681     /* Otherwise update the main code pointer to the end of the group. */
7682 
7683     code = tempcode;
7684 
7685     /* For a DEFINE group, required and first character settings are not
7686     relevant. */
7687 
7688     if (bravalue == OP_DEF) break;
7689 
7690     /* Handle updating of the required and first characters for other types of
7691     group. Update for normal brackets of all kinds, and conditions with two
7692     branches (see code above). If the bracket is followed by a quantifier with
7693     zero repeat, we have to back off. Hence the definition of zeroreqchar and
7694     zerofirstchar outside the main loop so that they can be accessed for the
7695     back off. */
7696 
7697     zeroreqchar = reqchar;
7698     zeroreqcharflags = reqcharflags;
7699     zerofirstchar = firstchar;
7700     zerofirstcharflags = firstcharflags;
7701     groupsetfirstchar = FALSE;
7702 
7703     if (bravalue >= OP_ONCE)
7704       {
7705       /* If we have not yet set a firstchar in this branch, take it from the
7706       subpattern, remembering that it was set here so that a repeat of more
7707       than one can replicate it as reqchar if necessary. If the subpattern has
7708       no firstchar, set "none" for the whole branch. In both cases, a zero
7709       repeat forces firstchar to "none". */
7710 
7711       if (firstcharflags == REQ_UNSET)
7712         {
7713         if (subfirstcharflags >= 0)
7714           {
7715           firstchar = subfirstchar;
7716           firstcharflags = subfirstcharflags;
7717           groupsetfirstchar = TRUE;
7718           }
7719         else firstcharflags = REQ_NONE;
7720         zerofirstcharflags = REQ_NONE;
7721         }
7722 
7723       /* If firstchar was previously set, convert the subpattern's firstchar
7724       into reqchar if there wasn't one, using the vary flag that was in
7725       existence beforehand. */
7726 
7727       else if (subfirstcharflags >= 0 && subreqcharflags < 0)
7728         {
7729         subreqchar = subfirstchar;
7730         subreqcharflags = subfirstcharflags | tempreqvary;
7731         }
7732 
7733       /* If the subpattern set a required byte (or set a first byte that isn't
7734       really the first byte - see above), set it. */
7735 
7736       if (subreqcharflags >= 0)
7737         {
7738         reqchar = subreqchar;
7739         reqcharflags = subreqcharflags;
7740         }
7741       }
7742 
7743     /* For a forward assertion, we take the reqchar, if set. This can be
7744     helpful if the pattern that follows the assertion doesn't set a different
7745     char. For example, it's useful for /(?=abcde).+/. We can't set firstchar
7746     for an assertion, however because it leads to incorrect effect for patterns
7747     such as /(?=a)a.+/ when the "real" "a" would then become a reqchar instead
7748     of a firstchar. This is overcome by a scan at the end if there's no
7749     firstchar, looking for an asserted first char. */
7750 
7751     else if (bravalue == OP_ASSERT && subreqcharflags >= 0)
7752       {
7753       reqchar = subreqchar;
7754       reqcharflags = subreqcharflags;
7755       }
7756     break;     /* End of processing '(' */
7757 
7758 
7759     /* ===================================================================*/
7760     /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
7761     are arranged to be the negation of the corresponding OP_values in the
7762     default case when PCRE_UCP is not set. For the back references, the values
7763     are negative the reference number. Only back references and those types
7764     that consume a character may be repeated. We can test for values between
7765     ESC_b and ESC_Z for the latter; this may have to change if any new ones are
7766     ever created. */
7767 
7768     case CHAR_BACKSLASH:
7769     tempptr = ptr;
7770     escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options, FALSE);
7771     if (*errorcodeptr != 0) goto FAILED;
7772 
7773     if (escape == 0)                  /* The escape coded a single character */
7774       c = ec;
7775     else
7776       {
7777       if (escape == ESC_Q)            /* Handle start of quoted string */
7778         {
7779         if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
7780           ptr += 2;               /* avoid empty string */
7781             else inescq = TRUE;
7782         continue;
7783         }
7784 
7785       if (escape == ESC_E) continue;  /* Perl ignores an orphan \E */
7786 
7787       /* For metasequences that actually match a character, we disable the
7788       setting of a first character if it hasn't already been set. */
7789 
7790       if (firstcharflags == REQ_UNSET && escape > ESC_b && escape < ESC_Z)
7791         firstcharflags = REQ_NONE;
7792 
7793       /* Set values to reset to if this is followed by a zero repeat. */
7794 
7795       zerofirstchar = firstchar;
7796       zerofirstcharflags = firstcharflags;
7797       zeroreqchar = reqchar;
7798       zeroreqcharflags = reqcharflags;
7799 
7800       /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
7801       is a subroutine call by number (Oniguruma syntax). In fact, the value
7802       ESC_g is returned only for these cases. So we don't need to check for <
7803       or ' if the value is ESC_g. For the Perl syntax \g{n} the value is
7804       -n, and for the Perl syntax \g{name} the result is ESC_k (as
7805       that is a synonym for a named back reference). */
7806 
7807       if (escape == ESC_g)
7808         {
7809         const pcre_uchar *p;
7810         pcre_uint32 cf;
7811 
7812         save_hwm_offset = cd->hwm - cd->start_workspace;   /* Normally this is set when '(' is read */
7813         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
7814           CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7815 
7816         /* These two statements stop the compiler from warning about possibly
7817         unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
7818         fact, because we do the check for a number below, the paths that
7819         would actually be in error are never taken. */
7820 
7821         skipbytes = 0;
7822         reset_bracount = FALSE;
7823 
7824         /* If it's not a signed or unsigned number, treat it as a name. */
7825 
7826         cf = ptr[1];
7827         if (cf != CHAR_PLUS && cf != CHAR_MINUS && !IS_DIGIT(cf))
7828           {
7829           is_recurse = TRUE;
7830           goto NAMED_REF_OR_RECURSE;
7831           }
7832 
7833         /* Signed or unsigned number (cf = ptr[1]) is known to be plus or minus
7834         or a digit. */
7835 
7836         p = ptr + 2;
7837         while (IS_DIGIT(*p)) p++;
7838         if (*p != (pcre_uchar)terminator)
7839           {
7840           *errorcodeptr = ERR57;
7841           break;
7842           }
7843         ptr++;
7844         goto HANDLE_NUMERICAL_RECURSION;
7845         }
7846 
7847       /* \k<name> or \k'name' is a back reference by name (Perl syntax).
7848       We also support \k{name} (.NET syntax).  */
7849 
7850       if (escape == ESC_k)
7851         {
7852         if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
7853           ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
7854           {
7855           *errorcodeptr = ERR69;
7856           break;
7857           }
7858         is_recurse = FALSE;
7859         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
7860           CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
7861           CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
7862         goto NAMED_REF_OR_RECURSE;
7863         }
7864 
7865       /* Back references are handled specially; must disable firstchar if
7866       not set to cope with cases like (?=(\w+))\1: which would otherwise set
7867       ':' later. */
7868 
7869       if (escape < 0)
7870         {
7871         open_capitem *oc;
7872         recno = -escape;
7873 
7874         /* Come here from named backref handling when the reference is to a
7875         single group (i.e. not to a duplicated name). */
7876 
7877         HANDLE_REFERENCE:
7878         if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7879         previous = code;
7880         *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
7881         PUT2INC(code, 0, recno);
7882         cd->backref_map |= (recno < 32)? (1 << recno) : 1;
7883         if (recno > cd->top_backref) cd->top_backref = recno;
7884 
7885         /* Check to see if this back reference is recursive, that is, it
7886         is inside the group that it references. A flag is set so that the
7887         group can be made atomic. */
7888 
7889         for (oc = cd->open_caps; oc != NULL; oc = oc->next)
7890           {
7891           if (oc->number == recno)
7892             {
7893             oc->flag = TRUE;
7894             break;
7895             }
7896           }
7897         }
7898 
7899       /* So are Unicode property matches, if supported. */
7900 
7901 #ifdef SUPPORT_UCP
7902       else if (escape == ESC_P || escape == ESC_p)
7903         {
7904         BOOL negated;
7905         unsigned int ptype = 0, pdata = 0;
7906         if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
7907           goto FAILED;
7908         previous = code;
7909         *code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
7910         *code++ = ptype;
7911         *code++ = pdata;
7912         }
7913 #else
7914 
7915       /* If Unicode properties are not supported, \X, \P, and \p are not
7916       allowed. */
7917 
7918       else if (escape == ESC_X || escape == ESC_P || escape == ESC_p)
7919         {
7920         *errorcodeptr = ERR45;
7921         goto FAILED;
7922         }
7923 #endif
7924 
7925       /* For the rest (including \X when Unicode properties are supported), we
7926       can obtain the OP value by negating the escape value in the default
7927       situation when PCRE_UCP is not set. When it *is* set, we substitute
7928       Unicode property tests. Note that \b and \B do a one-character
7929       lookbehind, and \A also behaves as if it does. */
7930 
7931       else
7932         {
7933         if ((escape == ESC_b || escape == ESC_B || escape == ESC_A) &&
7934              cd->max_lookbehind == 0)
7935           cd->max_lookbehind = 1;
7936 #ifdef SUPPORT_UCP
7937         if (escape >= ESC_DU && escape <= ESC_wu)
7938           {
7939           nestptr = ptr + 1;                   /* Where to resume */
7940           ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
7941           }
7942         else
7943 #endif
7944         /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
7945         so that it works in DFA mode and in lookbehinds. */
7946 
7947           {
7948           previous = (escape > ESC_b && escape < ESC_Z)? code : NULL;
7949           *code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape;
7950           }
7951         }
7952       continue;
7953       }
7954 
7955     /* We have a data character whose value is in c. In UTF-8 mode it may have
7956     a value > 127. We set its representation in the length/buffer, and then
7957     handle it as a data character. */
7958 
7959 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
7960     if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
7961       mclength = PRIV(ord2utf)(c, mcbuffer);
7962     else
7963 #endif
7964 
7965      {
7966      mcbuffer[0] = c;
7967      mclength = 1;
7968      }
7969     goto ONE_CHAR;
7970 
7971 
7972     /* ===================================================================*/
7973     /* Handle a literal character. It is guaranteed not to be whitespace or #
7974     when the extended flag is set. If we are in a UTF mode, it may be a
7975     multi-unit literal character. */
7976 
7977     default:
7978     NORMAL_CHAR:
7979     mclength = 1;
7980     mcbuffer[0] = c;
7981 
7982 #ifdef SUPPORT_UTF
7983     if (utf && HAS_EXTRALEN(c))
7984       ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
7985 #endif
7986 
7987     /* At this point we have the character's bytes in mcbuffer, and the length
7988     in mclength. When not in UTF-8 mode, the length is always 1. */
7989 
7990     ONE_CHAR:
7991     previous = code;
7992 
7993     /* For caseless UTF-8 mode when UCP support is available, check whether
7994     this character has more than one other case. If so, generate a special
7995     OP_PROP item instead of OP_CHARI. */
7996 
7997 #ifdef SUPPORT_UCP
7998     if (utf && (options & PCRE_CASELESS) != 0)
7999       {
8000       GETCHAR(c, mcbuffer);
8001       if ((c = UCD_CASESET(c)) != 0)
8002         {
8003         *code++ = OP_PROP;
8004         *code++ = PT_CLIST;
8005         *code++ = c;
8006         if (firstcharflags == REQ_UNSET)
8007           firstcharflags = zerofirstcharflags = REQ_NONE;
8008         break;
8009         }
8010       }
8011 #endif
8012 
8013     /* Caseful matches, or not one of the multicase characters. */
8014 
8015     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARI : OP_CHAR;
8016     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
8017 
8018     /* Remember if \r or \n were seen */
8019 
8020     if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
8021       cd->external_flags |= PCRE_HASCRORLF;
8022 
8023     /* Set the first and required bytes appropriately. If no previous first
8024     byte, set it from this character, but revert to none on a zero repeat.
8025     Otherwise, leave the firstchar value alone, and don't change it on a zero
8026     repeat. */
8027 
8028     if (firstcharflags == REQ_UNSET)
8029       {
8030       zerofirstcharflags = REQ_NONE;
8031       zeroreqchar = reqchar;
8032       zeroreqcharflags = reqcharflags;
8033 
8034       /* If the character is more than one byte long, we can set firstchar
8035       only if it is not to be matched caselessly. */
8036 
8037       if (mclength == 1 || req_caseopt == 0)
8038         {
8039         firstchar = mcbuffer[0] | req_caseopt;
8040         firstchar = mcbuffer[0];
8041         firstcharflags = req_caseopt;
8042 
8043         if (mclength != 1)
8044           {
8045           reqchar = code[-1];
8046           reqcharflags = cd->req_varyopt;
8047           }
8048         }
8049       else firstcharflags = reqcharflags = REQ_NONE;
8050       }
8051 
8052     /* firstchar was previously set; we can set reqchar only if the length is
8053     1 or the matching is caseful. */
8054 
8055     else
8056       {
8057       zerofirstchar = firstchar;
8058       zerofirstcharflags = firstcharflags;
8059       zeroreqchar = reqchar;
8060       zeroreqcharflags = reqcharflags;
8061       if (mclength == 1 || req_caseopt == 0)
8062         {
8063         reqchar = code[-1];
8064         reqcharflags = req_caseopt | cd->req_varyopt;
8065         }
8066       }
8067 
8068     break;            /* End of literal character handling */
8069     }
8070   }                   /* end of big loop */
8071 
8072 
8073 /* Control never reaches here by falling through, only by a goto for all the
8074 error states. Pass back the position in the pattern so that it can be displayed
8075 to the user for diagnosing the error. */
8076 
8077 FAILED:
8078 *ptrptr = ptr;
8079 return FALSE;
8080 }
8081 
8082 
8083 
8084 /*************************************************
8085 *     Compile sequence of alternatives           *
8086 *************************************************/
8087 
8088 /* On entry, ptr is pointing past the bracket character, but on return it
8089 points to the closing bracket, or vertical bar, or end of string. The code
8090 variable is pointing at the byte into which the BRA operator has been stored.
8091 This function is used during the pre-compile phase when we are trying to find
8092 out the amount of memory needed, as well as during the real compile phase. The
8093 value of lengthptr distinguishes the two phases.
8094 
8095 Arguments:
8096   options           option bits, including any changes for this subpattern
8097   codeptr           -> the address of the current code pointer
8098   ptrptr            -> the address of the current pattern pointer
8099   errorcodeptr      -> pointer to error code variable
8100   lookbehind        TRUE if this is a lookbehind assertion
8101   reset_bracount    TRUE to reset the count for each branch
8102   skipbytes         skip this many bytes at start (for brackets and OP_COND)
8103   cond_depth        depth of nesting for conditional subpatterns
8104   firstcharptr      place to put the first required character
8105   firstcharflagsptr place to put the first character flags, or a negative number
8106   reqcharptr        place to put the last required character
8107   reqcharflagsptr   place to put the last required character flags, or a negative number
8108   bcptr             pointer to the chain of currently open branches
8109   cd                points to the data block with tables pointers etc.
8110   lengthptr         NULL during the real compile phase
8111                     points to length accumulator during pre-compile phase
8112 
8113 Returns:            TRUE on success
8114 */
8115 
8116 static BOOL
compile_regex(int options,pcre_uchar ** codeptr,const pcre_uchar ** ptrptr,int * errorcodeptr,BOOL lookbehind,BOOL reset_bracount,int skipbytes,int cond_depth,pcre_uint32 * firstcharptr,pcre_int32 * firstcharflagsptr,pcre_uint32 * reqcharptr,pcre_int32 * reqcharflagsptr,branch_chain * bcptr,compile_data * cd,int * lengthptr)8117 compile_regex(int options, pcre_uchar **codeptr, const pcre_uchar **ptrptr,
8118   int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
8119   int cond_depth,
8120   pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
8121   pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
8122   branch_chain *bcptr, compile_data *cd, int *lengthptr)
8123 {
8124 const pcre_uchar *ptr = *ptrptr;
8125 pcre_uchar *code = *codeptr;
8126 pcre_uchar *last_branch = code;
8127 pcre_uchar *start_bracket = code;
8128 pcre_uchar *reverse_count = NULL;
8129 open_capitem capitem;
8130 int capnumber = 0;
8131 pcre_uint32 firstchar, reqchar;
8132 pcre_int32 firstcharflags, reqcharflags;
8133 pcre_uint32 branchfirstchar, branchreqchar;
8134 pcre_int32 branchfirstcharflags, branchreqcharflags;
8135 int length;
8136 unsigned int orig_bracount;
8137 unsigned int max_bracount;
8138 branch_chain bc;
8139 size_t save_hwm_offset;
8140 
8141 /* If set, call the external function that checks for stack availability. */
8142 
8143 if (PUBL(stack_guard) != NULL && PUBL(stack_guard)())
8144   {
8145   *errorcodeptr= ERR85;
8146   return FALSE;
8147   }
8148 
8149 /* Miscellaneous initialization */
8150 
8151 bc.outer = bcptr;
8152 bc.current_branch = code;
8153 
8154 firstchar = reqchar = 0;
8155 firstcharflags = reqcharflags = REQ_UNSET;
8156 
8157 save_hwm_offset = cd->hwm - cd->start_workspace;
8158 
8159 /* Accumulate the length for use in the pre-compile phase. Start with the
8160 length of the BRA and KET and any extra bytes that are required at the
8161 beginning. We accumulate in a local variable to save frequent testing of
8162 lenthptr for NULL. We cannot do this by looking at the value of code at the
8163 start and end of each alternative, because compiled items are discarded during
8164 the pre-compile phase so that the work space is not exceeded. */
8165 
8166 length = 2 + 2*LINK_SIZE + skipbytes;
8167 
8168 /* WARNING: If the above line is changed for any reason, you must also change
8169 the code that abstracts option settings at the start of the pattern and makes
8170 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
8171 pre-compile phase to find out whether anything has yet been compiled or not. */
8172 
8173 /* If this is a capturing subpattern, add to the chain of open capturing items
8174 so that we can detect them if (*ACCEPT) is encountered. This is also used to
8175 detect groups that contain recursive back references to themselves. Note that
8176 only OP_CBRA need be tested here; changing this opcode to one of its variants,
8177 e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */
8178 
8179 if (*code == OP_CBRA)
8180   {
8181   capnumber = GET2(code, 1 + LINK_SIZE);
8182   capitem.number = capnumber;
8183   capitem.next = cd->open_caps;
8184   capitem.flag = FALSE;
8185   cd->open_caps = &capitem;
8186   }
8187 
8188 /* Offset is set zero to mark that this bracket is still open */
8189 
8190 PUT(code, 1, 0);
8191 code += 1 + LINK_SIZE + skipbytes;
8192 
8193 /* Loop for each alternative branch */
8194 
8195 orig_bracount = max_bracount = cd->bracount;
8196 for (;;)
8197   {
8198   /* For a (?| group, reset the capturing bracket count so that each branch
8199   uses the same numbers. */
8200 
8201   if (reset_bracount) cd->bracount = orig_bracount;
8202 
8203   /* Set up dummy OP_REVERSE if lookbehind assertion */
8204 
8205   if (lookbehind)
8206     {
8207     *code++ = OP_REVERSE;
8208     reverse_count = code;
8209     PUTINC(code, 0, 0);
8210     length += 1 + LINK_SIZE;
8211     }
8212 
8213   /* Now compile the branch; in the pre-compile phase its length gets added
8214   into the length. */
8215 
8216   if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstchar,
8217         &branchfirstcharflags, &branchreqchar, &branchreqcharflags, &bc,
8218         cond_depth, cd, (lengthptr == NULL)? NULL : &length))
8219     {
8220     *ptrptr = ptr;
8221     return FALSE;
8222     }
8223 
8224   /* Keep the highest bracket count in case (?| was used and some branch
8225   has fewer than the rest. */
8226 
8227   if (cd->bracount > max_bracount) max_bracount = cd->bracount;
8228 
8229   /* In the real compile phase, there is some post-processing to be done. */
8230 
8231   if (lengthptr == NULL)
8232     {
8233     /* If this is the first branch, the firstchar and reqchar values for the
8234     branch become the values for the regex. */
8235 
8236     if (*last_branch != OP_ALT)
8237       {
8238       firstchar = branchfirstchar;
8239       firstcharflags = branchfirstcharflags;
8240       reqchar = branchreqchar;
8241       reqcharflags = branchreqcharflags;
8242       }
8243 
8244     /* If this is not the first branch, the first char and reqchar have to
8245     match the values from all the previous branches, except that if the
8246     previous value for reqchar didn't have REQ_VARY set, it can still match,
8247     and we set REQ_VARY for the regex. */
8248 
8249     else
8250       {
8251       /* If we previously had a firstchar, but it doesn't match the new branch,
8252       we have to abandon the firstchar for the regex, but if there was
8253       previously no reqchar, it takes on the value of the old firstchar. */
8254 
8255       if (firstcharflags >= 0 &&
8256           (firstcharflags != branchfirstcharflags || firstchar != branchfirstchar))
8257         {
8258         if (reqcharflags < 0)
8259           {
8260           reqchar = firstchar;
8261           reqcharflags = firstcharflags;
8262           }
8263         firstcharflags = REQ_NONE;
8264         }
8265 
8266       /* If we (now or from before) have no firstchar, a firstchar from the
8267       branch becomes a reqchar if there isn't a branch reqchar. */
8268 
8269       if (firstcharflags < 0 && branchfirstcharflags >= 0 && branchreqcharflags < 0)
8270         {
8271         branchreqchar = branchfirstchar;
8272         branchreqcharflags = branchfirstcharflags;
8273         }
8274 
8275       /* Now ensure that the reqchars match */
8276 
8277       if (((reqcharflags & ~REQ_VARY) != (branchreqcharflags & ~REQ_VARY)) ||
8278           reqchar != branchreqchar)
8279         reqcharflags = REQ_NONE;
8280       else
8281         {
8282         reqchar = branchreqchar;
8283         reqcharflags |= branchreqcharflags; /* To "or" REQ_VARY */
8284         }
8285       }
8286 
8287     /* If lookbehind, check that this branch matches a fixed-length string, and
8288     put the length into the OP_REVERSE item. Temporarily mark the end of the
8289     branch with OP_END. If the branch contains OP_RECURSE, the result is -3
8290     because there may be forward references that we can't check here. Set a
8291     flag to cause another lookbehind check at the end. Why not do it all at the
8292     end? Because common, erroneous checks are picked up here and the offset of
8293     the problem can be shown. */
8294 
8295     if (lookbehind)
8296       {
8297       int fixed_length;
8298       *code = OP_END;
8299       fixed_length = find_fixedlength(last_branch,  (options & PCRE_UTF8) != 0,
8300         FALSE, cd, NULL);
8301       DPRINTF(("fixed length = %d\n", fixed_length));
8302       if (fixed_length == -3)
8303         {
8304         cd->check_lookbehind = TRUE;
8305         }
8306       else if (fixed_length < 0)
8307         {
8308         *errorcodeptr = (fixed_length == -2)? ERR36 :
8309                         (fixed_length == -4)? ERR70: ERR25;
8310         *ptrptr = ptr;
8311         return FALSE;
8312         }
8313       else
8314         {
8315         if (fixed_length > cd->max_lookbehind)
8316           cd->max_lookbehind = fixed_length;
8317         PUT(reverse_count, 0, fixed_length);
8318         }
8319       }
8320     }
8321 
8322   /* Reached end of expression, either ')' or end of pattern. In the real
8323   compile phase, go back through the alternative branches and reverse the chain
8324   of offsets, with the field in the BRA item now becoming an offset to the
8325   first alternative. If there are no alternatives, it points to the end of the
8326   group. The length in the terminating ket is always the length of the whole
8327   bracketed item. Return leaving the pointer at the terminating char. */
8328 
8329   if (*ptr != CHAR_VERTICAL_LINE)
8330     {
8331     if (lengthptr == NULL)
8332       {
8333       int branch_length = (int)(code - last_branch);
8334       do
8335         {
8336         int prev_length = GET(last_branch, 1);
8337         PUT(last_branch, 1, branch_length);
8338         branch_length = prev_length;
8339         last_branch -= branch_length;
8340         }
8341       while (branch_length > 0);
8342       }
8343 
8344     /* Fill in the ket */
8345 
8346     *code = OP_KET;
8347     PUT(code, 1, (int)(code - start_bracket));
8348     code += 1 + LINK_SIZE;
8349 
8350     /* If it was a capturing subpattern, check to see if it contained any
8351     recursive back references. If so, we must wrap it in atomic brackets.
8352     Because we are moving code along, we must ensure that any pending recursive
8353     references are updated. In any event, remove the block from the chain. */
8354 
8355     if (capnumber > 0)
8356       {
8357       if (cd->open_caps->flag)
8358         {
8359         *code = OP_END;
8360         adjust_recurse(start_bracket, 1 + LINK_SIZE,
8361           (options & PCRE_UTF8) != 0, cd, save_hwm_offset);
8362         memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
8363           IN_UCHARS(code - start_bracket));
8364         *start_bracket = OP_ONCE;
8365         code += 1 + LINK_SIZE;
8366         PUT(start_bracket, 1, (int)(code - start_bracket));
8367         *code = OP_KET;
8368         PUT(code, 1, (int)(code - start_bracket));
8369         code += 1 + LINK_SIZE;
8370         length += 2 + 2*LINK_SIZE;
8371         }
8372       cd->open_caps = cd->open_caps->next;
8373       }
8374 
8375     /* Retain the highest bracket number, in case resetting was used. */
8376 
8377     cd->bracount = max_bracount;
8378 
8379     /* Set values to pass back */
8380 
8381     *codeptr = code;
8382     *ptrptr = ptr;
8383     *firstcharptr = firstchar;
8384     *firstcharflagsptr = firstcharflags;
8385     *reqcharptr = reqchar;
8386     *reqcharflagsptr = reqcharflags;
8387     if (lengthptr != NULL)
8388       {
8389       if (OFLOW_MAX - *lengthptr < length)
8390         {
8391         *errorcodeptr = ERR20;
8392         return FALSE;
8393         }
8394       *lengthptr += length;
8395       }
8396     return TRUE;
8397     }
8398 
8399   /* Another branch follows. In the pre-compile phase, we can move the code
8400   pointer back to where it was for the start of the first branch. (That is,
8401   pretend that each branch is the only one.)
8402 
8403   In the real compile phase, insert an ALT node. Its length field points back
8404   to the previous branch while the bracket remains open. At the end the chain
8405   is reversed. It's done like this so that the start of the bracket has a
8406   zero offset until it is closed, making it possible to detect recursion. */
8407 
8408   if (lengthptr != NULL)
8409     {
8410     code = *codeptr + 1 + LINK_SIZE + skipbytes;
8411     length += 1 + LINK_SIZE;
8412     }
8413   else
8414     {
8415     *code = OP_ALT;
8416     PUT(code, 1, (int)(code - last_branch));
8417     bc.current_branch = last_branch = code;
8418     code += 1 + LINK_SIZE;
8419     }
8420 
8421   ptr++;
8422   }
8423 /* Control never reaches here */
8424 }
8425 
8426 
8427 
8428 
8429 /*************************************************
8430 *          Check for anchored expression         *
8431 *************************************************/
8432 
8433 /* Try to find out if this is an anchored regular expression. Consider each
8434 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8435 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8436 it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8437 be found, because ^ generates OP_CIRCM in that mode.
8438 
8439 We can also consider a regex to be anchored if OP_SOM starts all its branches.
8440 This is the code for \G, which means "match at start of match position, taking
8441 into account the match offset".
8442 
8443 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8444 because that will try the rest of the pattern at all possible matching points,
8445 so there is no point trying again.... er ....
8446 
8447 .... except when the .* appears inside capturing parentheses, and there is a
8448 subsequent back reference to those parentheses. We haven't enough information
8449 to catch that case precisely.
8450 
8451 At first, the best we could do was to detect when .* was in capturing brackets
8452 and the highest back reference was greater than or equal to that level.
8453 However, by keeping a bitmap of the first 31 back references, we can catch some
8454 of the more common cases more precisely.
8455 
8456 ... A second exception is when the .* appears inside an atomic group, because
8457 this prevents the number of characters it matches from being adjusted.
8458 
8459 Arguments:
8460   code           points to start of expression (the bracket)
8461   bracket_map    a bitmap of which brackets we are inside while testing; this
8462                   handles up to substring 31; after that we just have to take
8463                   the less precise approach
8464   cd             points to the compile data block
8465   atomcount      atomic group level
8466 
8467 Returns:     TRUE or FALSE
8468 */
8469 
8470 static BOOL
is_anchored(register const pcre_uchar * code,unsigned int bracket_map,compile_data * cd,int atomcount)8471 is_anchored(register const pcre_uchar *code, unsigned int bracket_map,
8472   compile_data *cd, int atomcount)
8473 {
8474 do {
8475    const pcre_uchar *scode = first_significant_code(
8476      code + PRIV(OP_lengths)[*code], FALSE);
8477    register int op = *scode;
8478 
8479    /* Non-capturing brackets */
8480 
8481    if (op == OP_BRA  || op == OP_BRAPOS ||
8482        op == OP_SBRA || op == OP_SBRAPOS)
8483      {
8484      if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8485      }
8486 
8487    /* Capturing brackets */
8488 
8489    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8490             op == OP_SCBRA || op == OP_SCBRAPOS)
8491      {
8492      int n = GET2(scode, 1+LINK_SIZE);
8493      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
8494      if (!is_anchored(scode, new_map, cd, atomcount)) return FALSE;
8495      }
8496 
8497    /* Positive forward assertions and conditions */
8498 
8499    else if (op == OP_ASSERT || op == OP_COND)
8500      {
8501      if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8502      }
8503 
8504    /* Atomic groups */
8505 
8506    else if (op == OP_ONCE || op == OP_ONCE_NC)
8507      {
8508      if (!is_anchored(scode, bracket_map, cd, atomcount + 1))
8509        return FALSE;
8510      }
8511 
8512    /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8513    it isn't in brackets that are or may be referenced or inside an atomic
8514    group. */
8515 
8516    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8517              op == OP_TYPEPOSSTAR))
8518      {
8519      if (scode[1] != OP_ALLANY || (bracket_map & cd->backref_map) != 0 ||
8520          atomcount > 0 || cd->had_pruneorskip)
8521        return FALSE;
8522      }
8523 
8524    /* Check for explicit anchoring */
8525 
8526    else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8527 
8528    code += GET(code, 1);
8529    }
8530 while (*code == OP_ALT);   /* Loop for each alternative */
8531 return TRUE;
8532 }
8533 
8534 
8535 
8536 /*************************************************
8537 *         Check for starting with ^ or .*        *
8538 *************************************************/
8539 
8540 /* This is called to find out if every branch starts with ^ or .* so that
8541 "first char" processing can be done to speed things up in multiline
8542 matching and for non-DOTALL patterns that start with .* (which must start at
8543 the beginning or after \n). As in the case of is_anchored() (see above), we
8544 have to take account of back references to capturing brackets that contain .*
8545 because in that case we can't make the assumption. Also, the appearance of .*
8546 inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not
8547 count, because once again the assumption no longer holds.
8548 
8549 Arguments:
8550   code           points to start of expression (the bracket)
8551   bracket_map    a bitmap of which brackets we are inside while testing; this
8552                   handles up to substring 31; after that we just have to take
8553                   the less precise approach
8554   cd             points to the compile data
8555   atomcount      atomic group level
8556 
8557 Returns:         TRUE or FALSE
8558 */
8559 
8560 static BOOL
is_startline(const pcre_uchar * code,unsigned int bracket_map,compile_data * cd,int atomcount)8561 is_startline(const pcre_uchar *code, unsigned int bracket_map,
8562   compile_data *cd, int atomcount)
8563 {
8564 do {
8565    const pcre_uchar *scode = first_significant_code(
8566      code + PRIV(OP_lengths)[*code], FALSE);
8567    register int op = *scode;
8568 
8569    /* If we are at the start of a conditional assertion group, *both* the
8570    conditional assertion *and* what follows the condition must satisfy the test
8571    for start of line. Other kinds of condition fail. Note that there may be an
8572    auto-callout at the start of a condition. */
8573 
8574    if (op == OP_COND)
8575      {
8576      scode += 1 + LINK_SIZE;
8577      if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8578      switch (*scode)
8579        {
8580        case OP_CREF:
8581        case OP_DNCREF:
8582        case OP_RREF:
8583        case OP_DNRREF:
8584        case OP_DEF:
8585        case OP_FAIL:
8586        return FALSE;
8587 
8588        default:     /* Assertion */
8589        if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
8590        do scode += GET(scode, 1); while (*scode == OP_ALT);
8591        scode += 1 + LINK_SIZE;
8592        break;
8593        }
8594      scode = first_significant_code(scode, FALSE);
8595      op = *scode;
8596      }
8597 
8598    /* Non-capturing brackets */
8599 
8600    if (op == OP_BRA  || op == OP_BRAPOS ||
8601        op == OP_SBRA || op == OP_SBRAPOS)
8602      {
8603      if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
8604      }
8605 
8606    /* Capturing brackets */
8607 
8608    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8609             op == OP_SCBRA || op == OP_SCBRAPOS)
8610      {
8611      int n = GET2(scode, 1+LINK_SIZE);
8612      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
8613      if (!is_startline(scode, new_map, cd, atomcount)) return FALSE;
8614      }
8615 
8616    /* Positive forward assertions */
8617 
8618    else if (op == OP_ASSERT)
8619      {
8620      if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
8621      }
8622 
8623    /* Atomic brackets */
8624 
8625    else if (op == OP_ONCE || op == OP_ONCE_NC)
8626      {
8627      if (!is_startline(scode, bracket_map, cd, atomcount + 1)) return FALSE;
8628      }
8629 
8630    /* .* means "start at start or after \n" if it isn't in atomic brackets or
8631    brackets that may be referenced, as long as the pattern does not contain
8632    *PRUNE or *SKIP, because these break the feature. Consider, for example,
8633    /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the
8634    start of a line. */
8635 
8636    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8637      {
8638      if (scode[1] != OP_ANY || (bracket_map & cd->backref_map) != 0 ||
8639          atomcount > 0 || cd->had_pruneorskip)
8640        return FALSE;
8641      }
8642 
8643    /* Check for explicit circumflex; anything else gives a FALSE result. Note
8644    in particular that this includes atomic brackets OP_ONCE and OP_ONCE_NC
8645    because the number of characters matched by .* cannot be adjusted inside
8646    them. */
8647 
8648    else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8649 
8650    /* Move on to the next alternative */
8651 
8652    code += GET(code, 1);
8653    }
8654 while (*code == OP_ALT);  /* Loop for each alternative */
8655 return TRUE;
8656 }
8657 
8658 
8659 
8660 /*************************************************
8661 *       Check for asserted fixed first char      *
8662 *************************************************/
8663 
8664 /* During compilation, the "first char" settings from forward assertions are
8665 discarded, because they can cause conflicts with actual literals that follow.
8666 However, if we end up without a first char setting for an unanchored pattern,
8667 it is worth scanning the regex to see if there is an initial asserted first
8668 char. If all branches start with the same asserted char, or with a
8669 non-conditional bracket all of whose alternatives start with the same asserted
8670 char (recurse ad lib), then we return that char, with the flags set to zero or
8671 REQ_CASELESS; otherwise return zero with REQ_NONE in the flags.
8672 
8673 Arguments:
8674   code       points to start of expression (the bracket)
8675   flags      points to the first char flags, or to REQ_NONE
8676   inassert   TRUE if in an assertion
8677 
8678 Returns:     the fixed first char, or 0 with REQ_NONE in flags
8679 */
8680 
8681 static pcre_uint32
find_firstassertedchar(const pcre_uchar * code,pcre_int32 * flags,BOOL inassert)8682 find_firstassertedchar(const pcre_uchar *code, pcre_int32 *flags,
8683   BOOL inassert)
8684 {
8685 register pcre_uint32 c = 0;
8686 int cflags = REQ_NONE;
8687 
8688 *flags = REQ_NONE;
8689 do {
8690    pcre_uint32 d;
8691    int dflags;
8692    int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
8693              *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
8694    const pcre_uchar *scode = first_significant_code(code + 1+LINK_SIZE + xl,
8695      TRUE);
8696    register pcre_uchar op = *scode;
8697 
8698    switch(op)
8699      {
8700      default:
8701      return 0;
8702 
8703      case OP_BRA:
8704      case OP_BRAPOS:
8705      case OP_CBRA:
8706      case OP_SCBRA:
8707      case OP_CBRAPOS:
8708      case OP_SCBRAPOS:
8709      case OP_ASSERT:
8710      case OP_ONCE:
8711      case OP_ONCE_NC:
8712      d = find_firstassertedchar(scode, &dflags, op == OP_ASSERT);
8713      if (dflags < 0)
8714        return 0;
8715      if (cflags < 0) { c = d; cflags = dflags; } else if (c != d || cflags != dflags) return 0;
8716      break;
8717 
8718      case OP_EXACT:
8719      scode += IMM2_SIZE;
8720      /* Fall through */
8721 
8722      case OP_CHAR:
8723      case OP_PLUS:
8724      case OP_MINPLUS:
8725      case OP_POSPLUS:
8726      if (!inassert) return 0;
8727      if (cflags < 0) { c = scode[1]; cflags = 0; }
8728        else if (c != scode[1]) return 0;
8729      break;
8730 
8731      case OP_EXACTI:
8732      scode += IMM2_SIZE;
8733      /* Fall through */
8734 
8735      case OP_CHARI:
8736      case OP_PLUSI:
8737      case OP_MINPLUSI:
8738      case OP_POSPLUSI:
8739      if (!inassert) return 0;
8740      if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
8741        else if (c != scode[1]) return 0;
8742      break;
8743      }
8744 
8745    code += GET(code, 1);
8746    }
8747 while (*code == OP_ALT);
8748 
8749 *flags = cflags;
8750 return c;
8751 }
8752 
8753 
8754 
8755 /*************************************************
8756 *     Add an entry to the name/number table      *
8757 *************************************************/
8758 
8759 /* This function is called between compiling passes to add an entry to the
8760 name/number table, maintaining alphabetical order. Checking for permitted
8761 and forbidden duplicates has already been done.
8762 
8763 Arguments:
8764   cd           the compile data block
8765   name         the name to add
8766   length       the length of the name
8767   groupno      the group number
8768 
8769 Returns:       nothing
8770 */
8771 
8772 static void
add_name(compile_data * cd,const pcre_uchar * name,int length,unsigned int groupno)8773 add_name(compile_data *cd, const pcre_uchar *name, int length,
8774   unsigned int groupno)
8775 {
8776 int i;
8777 pcre_uchar *slot = cd->name_table;
8778 
8779 for (i = 0; i < cd->names_found; i++)
8780   {
8781   int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(length));
8782   if (crc == 0 && slot[IMM2_SIZE+length] != 0)
8783     crc = -1; /* Current name is a substring */
8784 
8785   /* Make space in the table and break the loop for an earlier name. For a
8786   duplicate or later name, carry on. We do this for duplicates so that in the
8787   simple case (when ?(| is not used) they are in order of their numbers. In all
8788   cases they are in the order in which they appear in the pattern. */
8789 
8790   if (crc < 0)
8791     {
8792     memmove(slot + cd->name_entry_size, slot,
8793       IN_UCHARS((cd->names_found - i) * cd->name_entry_size));
8794     break;
8795     }
8796 
8797   /* Continue the loop for a later or duplicate name */
8798 
8799   slot += cd->name_entry_size;
8800   }
8801 
8802 PUT2(slot, 0, groupno);
8803 memcpy(slot + IMM2_SIZE, name, IN_UCHARS(length));
8804 slot[IMM2_SIZE + length] = 0;
8805 cd->names_found++;
8806 }
8807 
8808 
8809 
/*************************************************
*        Compile a Regular Expression            *
*************************************************/

/* This is the original compile entry point, which has no way of returning a
numerical error code. It is kept for backwards compatibility and simply hands
everything on to the newer *_compile2() function of the same code unit width,
passing NULL for the error code pointer. One definition is provided for each
library width; the COMPILE_PCREnn macro selects which of them is built.

Arguments:
  pattern       the regular expression
  options       various option bits
  errorptr      pointer to pointer to error text
  erroroffset   ptr offset in pattern where error was detected
  tables        pointer to character tables or NULL

Returns:        pointer to compiled data block, or NULL on error,
                with errorptr and erroroffset set
*/

#if defined(COMPILE_PCRE8)

PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
pcre_compile(const char *pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}

#elif defined(COMPILE_PCRE16)

PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
pcre16_compile(PCRE_SPTR16 pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre16_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}

#elif defined(COMPILE_PCRE32)

PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
pcre32_compile(PCRE_SPTR32 pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre32_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}

#endif
8854 
8855 
8856 #if defined COMPILE_PCRE8
8857 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
pcre_compile2(const char * pattern,int options,int * errorcodeptr,const char ** errorptr,int * erroroffset,const unsigned char * tables)8858 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
8859   const char **errorptr, int *erroroffset, const unsigned char *tables)
8860 #elif defined COMPILE_PCRE16
8861 PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
8862 pcre16_compile2(PCRE_SPTR16 pattern, int options, int *errorcodeptr,
8863   const char **errorptr, int *erroroffset, const unsigned char *tables)
8864 #elif defined COMPILE_PCRE32
8865 PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
8866 pcre32_compile2(PCRE_SPTR32 pattern, int options, int *errorcodeptr,
8867   const char **errorptr, int *erroroffset, const unsigned char *tables)
8868 #endif
8869 {
8870 REAL_PCRE *re;
8871 int length = 1;  /* For final END opcode */
8872 pcre_int32 firstcharflags, reqcharflags;
8873 pcre_uint32 firstchar, reqchar;
8874 pcre_uint32 limit_match = PCRE_UINT32_MAX;
8875 pcre_uint32 limit_recursion = PCRE_UINT32_MAX;
8876 int newline;
8877 int errorcode = 0;
8878 int skipatstart = 0;
8879 BOOL utf;
8880 BOOL never_utf = FALSE;
8881 size_t size;
8882 pcre_uchar *code;
8883 const pcre_uchar *codestart;
8884 const pcre_uchar *ptr;
8885 compile_data compile_block;
8886 compile_data *cd = &compile_block;
8887 
8888 /* This space is used for "compiling" into during the first phase, when we are
8889 computing the amount of memory that is needed. Compiled items are thrown away
8890 as soon as possible, so that a fairly large buffer should be sufficient for
8891 this purpose. The same space is used in the second phase for remembering where
8892 to fill in forward references to subpatterns. That may overflow, in which case
8893 new memory is obtained from malloc(). */
8894 
8895 pcre_uchar cworkspace[COMPILE_WORK_SIZE];
8896 
8897 /* This vector is used for remembering name groups during the pre-compile. In a
8898 similar way to cworkspace, it can be expanded using malloc() if necessary. */
8899 
8900 named_group named_groups[NAMED_GROUP_LIST_SIZE];
8901 
8902 /* Set this early so that early errors get offset 0. */
8903 
8904 ptr = (const pcre_uchar *)pattern;
8905 
8906 /* We can't pass back an error message if errorptr is NULL; I guess the best we
8907 can do is just return NULL, but we can set a code value if there is a code
8908 pointer. */
8909 
8910 if (errorptr == NULL)
8911   {
8912   if (errorcodeptr != NULL) *errorcodeptr = 99;
8913   return NULL;
8914   }
8915 
8916 *errorptr = NULL;
8917 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
8918 
8919 /* However, we can give a message for this error */
8920 
8921 if (erroroffset == NULL)
8922   {
8923   errorcode = ERR16;
8924   goto PCRE_EARLY_ERROR_RETURN2;
8925   }
8926 
8927 *erroroffset = 0;
8928 
8929 /* Set up pointers to the individual character tables */
8930 
8931 if (tables == NULL) tables = PRIV(default_tables);
8932 cd->lcc = tables + lcc_offset;
8933 cd->fcc = tables + fcc_offset;
8934 cd->cbits = tables + cbits_offset;
8935 cd->ctypes = tables + ctypes_offset;
8936 
8937 /* Check that all undefined public option bits are zero */
8938 
8939 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
8940   {
8941   errorcode = ERR17;
8942   goto PCRE_EARLY_ERROR_RETURN;
8943   }
8944 
8945 /* If PCRE_NEVER_UTF is set, remember it. */
8946 
8947 if ((options & PCRE_NEVER_UTF) != 0) never_utf = TRUE;
8948 
8949 /* Check for global one-time settings at the start of the pattern, and remember
8950 the offset for later. */
8951 
8952 cd->external_flags = 0;   /* Initialize here for LIMIT_MATCH/RECURSION */
8953 
8954 while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
8955        ptr[skipatstart+1] == CHAR_ASTERISK)
8956   {
8957   int newnl = 0;
8958   int newbsr = 0;
8959 
8960 /* For completeness and backward compatibility, (*UTFn) is supported in the
8961 relevant libraries, but (*UTF) is generic and always supported. Note that
8962 PCRE_UTF8 == PCRE_UTF16 == PCRE_UTF32. */
8963 
8964 #ifdef COMPILE_PCRE8
8965   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF8_RIGHTPAR, 5) == 0)
8966     { skipatstart += 7; options |= PCRE_UTF8; continue; }
8967 #endif
8968 #ifdef COMPILE_PCRE16
8969   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF16_RIGHTPAR, 6) == 0)
8970     { skipatstart += 8; options |= PCRE_UTF16; continue; }
8971 #endif
8972 #ifdef COMPILE_PCRE32
8973   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF32_RIGHTPAR, 6) == 0)
8974     { skipatstart += 8; options |= PCRE_UTF32; continue; }
8975 #endif
8976 
8977   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 4) == 0)
8978     { skipatstart += 6; options |= PCRE_UTF8; continue; }
8979   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)
8980     { skipatstart += 6; options |= PCRE_UCP; continue; }
8981   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_AUTO_POSSESS_RIGHTPAR, 16) == 0)
8982     { skipatstart += 18; options |= PCRE_NO_AUTO_POSSESS; continue; }
8983   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
8984     { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
8985 
8986   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_MATCH_EQ, 12) == 0)
8987     {
8988     pcre_uint32 c = 0;
8989     int p = skipatstart + 14;
8990     while (isdigit(ptr[p]))
8991       {
8992       if (c > PCRE_UINT32_MAX / 10 - 1) break;   /* Integer overflow */
8993       c = c*10 + ptr[p++] - CHAR_0;
8994       }
8995     if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
8996     if (c < limit_match)
8997       {
8998       limit_match = c;
8999       cd->external_flags |= PCRE_MLSET;
9000       }
9001     skipatstart = p;
9002     continue;
9003     }
9004 
9005   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_RECURSION_EQ, 16) == 0)
9006     {
9007     pcre_uint32 c = 0;
9008     int p = skipatstart + 18;
9009     while (isdigit(ptr[p]))
9010       {
9011       if (c > PCRE_UINT32_MAX / 10 - 1) break;   /* Integer overflow check */
9012       c = c*10 + ptr[p++] - CHAR_0;
9013       }
9014     if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
9015     if (c < limit_recursion)
9016       {
9017       limit_recursion = c;
9018       cd->external_flags |= PCRE_RLSET;
9019       }
9020     skipatstart = p;
9021     continue;
9022     }
9023 
9024   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CR_RIGHTPAR, 3) == 0)
9025     { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
9026   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LF_RIGHTPAR, 3)  == 0)
9027     { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
9028   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CRLF_RIGHTPAR, 5)  == 0)
9029     { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
9030   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANY_RIGHTPAR, 4) == 0)
9031     { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
9032   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANYCRLF_RIGHTPAR, 8) == 0)
9033     { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
9034 
9035   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
9036     { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
9037   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
9038     { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
9039 
9040   if (newnl != 0)
9041     options = (options & ~PCRE_NEWLINE_BITS) | newnl;
9042   else if (newbsr != 0)
9043     options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
9044   else break;
9045   }
9046 
9047 /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
9048 utf = (options & PCRE_UTF8) != 0;
9049 if (utf && never_utf)
9050   {
9051   errorcode = ERR78;
9052   goto PCRE_EARLY_ERROR_RETURN2;
9053   }
9054 
9055 /* Can't support UTF unless PCRE has been compiled to include the code. The
9056 return of an error code from PRIV(valid_utf)() is a new feature, introduced in
9057 release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
9058 not used here. */
9059 
9060 #ifdef SUPPORT_UTF
9061 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&
9062      (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
9063   {
9064 #if defined COMPILE_PCRE8
9065   errorcode = ERR44;
9066 #elif defined COMPILE_PCRE16
9067   errorcode = ERR74;
9068 #elif defined COMPILE_PCRE32
9069   errorcode = ERR77;
9070 #endif
9071   goto PCRE_EARLY_ERROR_RETURN2;
9072   }
9073 #else
9074 if (utf)
9075   {
9076   errorcode = ERR32;
9077   goto PCRE_EARLY_ERROR_RETURN;
9078   }
9079 #endif
9080 
9081 /* Can't support UCP unless PCRE has been compiled to include the code. */
9082 
9083 #ifndef SUPPORT_UCP
9084 if ((options & PCRE_UCP) != 0)
9085   {
9086   errorcode = ERR67;
9087   goto PCRE_EARLY_ERROR_RETURN;
9088   }
9089 #endif
9090 
9091 /* Check validity of \R options. */
9092 
9093 if ((options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) ==
9094      (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
9095   {
9096   errorcode = ERR56;
9097   goto PCRE_EARLY_ERROR_RETURN;
9098   }
9099 
9100 /* Handle different types of newline. The three bits give seven cases. The
9101 current code allows for fixed one- or two-byte sequences, plus "any" and
9102 "anycrlf". */
9103 
9104 switch (options & PCRE_NEWLINE_BITS)
9105   {
9106   case 0: newline = NEWLINE; break;   /* Build-time default */
9107   case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
9108   case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
9109   case PCRE_NEWLINE_CR+
9110        PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
9111   case PCRE_NEWLINE_ANY: newline = -1; break;
9112   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
9113   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
9114   }
9115 
9116 if (newline == -2)
9117   {
9118   cd->nltype = NLTYPE_ANYCRLF;
9119   }
9120 else if (newline < 0)
9121   {
9122   cd->nltype = NLTYPE_ANY;
9123   }
9124 else
9125   {
9126   cd->nltype = NLTYPE_FIXED;
9127   if (newline > 255)
9128     {
9129     cd->nllen = 2;
9130     cd->nl[0] = (newline >> 8) & 255;
9131     cd->nl[1] = newline & 255;
9132     }
9133   else
9134     {
9135     cd->nllen = 1;
9136     cd->nl[0] = newline;
9137     }
9138   }
9139 
9140 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
9141 references to help in deciding whether (.*) can be treated as anchored or not.
9142 */
9143 
9144 cd->top_backref = 0;
9145 cd->backref_map = 0;
9146 
9147 /* Reflect pattern for debugging output */
9148 
9149 DPRINTF(("------------------------------------------------------------------\n"));
9150 #ifdef PCRE_DEBUG
9151 print_puchar(stdout, (PCRE_PUCHAR)pattern);
9152 #endif
9153 DPRINTF(("\n"));
9154 
9155 /* Pretend to compile the pattern while actually just accumulating the length
9156 of memory required. This behaviour is triggered by passing a non-NULL final
9157 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
9158 to compile parts of the pattern into; the compiled code is discarded when it is
9159 no longer needed, so hopefully this workspace will never overflow, though there
9160 is a test for its doing so. */
9161 
9162 cd->bracount = cd->final_bracount = 0;
9163 cd->names_found = 0;
9164 cd->name_entry_size = 0;
9165 cd->name_table = NULL;
9166 cd->dupnames = FALSE;
9167 cd->namedrefcount = 0;
9168 cd->start_code = cworkspace;
9169 cd->hwm = cworkspace;
9170 cd->iscondassert = FALSE;
9171 cd->start_workspace = cworkspace;
9172 cd->workspace_size = COMPILE_WORK_SIZE;
9173 cd->named_groups = named_groups;
9174 cd->named_group_list_size = NAMED_GROUP_LIST_SIZE;
9175 cd->start_pattern = (const pcre_uchar *)pattern;
9176 cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
9177 cd->req_varyopt = 0;
9178 cd->parens_depth = 0;
9179 cd->assert_depth = 0;
9180 cd->max_lookbehind = 0;
9181 cd->external_options = options;
9182 cd->open_caps = NULL;
9183 
9184 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
9185 don't need to look at the result of the function here. The initial options have
9186 been put into the cd block so that they can be changed if an option setting is
9187 found within the regex right at the beginning. Bringing initial option settings
9188 outside can help speed up starting point checks. */
9189 
9190 ptr += skipatstart;
9191 code = cworkspace;
9192 *code = OP_BRA;
9193 
9194 (void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
9195   FALSE, 0, 0, &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL,
9196   cd, &length);
9197 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
9198 
9199 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
9200   (int)(cd->hwm - cworkspace)));
9201 
9202 if (length > MAX_PATTERN_SIZE)
9203   {
9204   errorcode = ERR20;
9205   goto PCRE_EARLY_ERROR_RETURN;
9206   }
9207 
9208 /* Compute the size of the data block for storing the compiled pattern. Integer
9209 overflow should no longer be possible because nowadays we limit the maximum
9210 value of cd->names_found and cd->name_entry_size. */
9211 
9212 size = sizeof(REAL_PCRE) +
9213   (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
9214 
9215 /* Get the memory. */
9216 
9217 re = (REAL_PCRE *)(PUBL(malloc))(size);
9218 if (re == NULL)
9219   {
9220   errorcode = ERR21;
9221   goto PCRE_EARLY_ERROR_RETURN;
9222   }
9223 
9224 /* Put in the magic number, and save the sizes, initial options, internal
9225 flags, and character table pointer. NULL is used for the default character
9226 tables. The nullpad field is at the end; it's there to help in the case when a
9227 regex compiled on a system with 4-byte pointers is run on another with 8-byte
9228 pointers. */
9229 
9230 re->magic_number = MAGIC_NUMBER;
9231 re->size = (int)size;
9232 re->options = cd->external_options;
9233 re->flags = cd->external_flags;
9234 re->limit_match = limit_match;
9235 re->limit_recursion = limit_recursion;
9236 re->first_char = 0;
9237 re->req_char = 0;
9238 re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar);
9239 re->name_entry_size = cd->name_entry_size;
9240 re->name_count = cd->names_found;
9241 re->ref_count = 0;
9242 re->tables = (tables == PRIV(default_tables))? NULL : tables;
9243 re->nullpad = NULL;
9244 #ifdef COMPILE_PCRE32
9245 re->dummy = 0;
9246 #else
9247 re->dummy1 = re->dummy2 = re->dummy3 = 0;
9248 #endif
9249 
9250 /* The starting points of the name/number translation table and of the code are
9251 passed around in the compile data block. The start/end pattern and initial
9252 options are already set from the pre-compile phase, as is the name_entry_size
9253 field. Reset the bracket count and the names_found field. Also reset the hwm
9254 field; this time it's used for remembering forward references to subpatterns.
9255 */
9256 
9257 cd->final_bracount = cd->bracount;  /* Save for checking forward references */
9258 cd->parens_depth = 0;
9259 cd->assert_depth = 0;
9260 cd->bracount = 0;
9261 cd->max_lookbehind = 0;
9262 cd->name_table = (pcre_uchar *)re + re->name_table_offset;
9263 codestart = cd->name_table + re->name_entry_size * re->name_count;
9264 cd->start_code = codestart;
9265 cd->hwm = (pcre_uchar *)(cd->start_workspace);
9266 cd->iscondassert = FALSE;
9267 cd->req_varyopt = 0;
9268 cd->had_accept = FALSE;
9269 cd->had_pruneorskip = FALSE;
9270 cd->check_lookbehind = FALSE;
9271 cd->open_caps = NULL;
9272 
9273 /* If any named groups were found, create the name/number table from the list
9274 created in the first pass. */
9275 
9276 if (cd->names_found > 0)
9277   {
9278   int i = cd->names_found;
9279   named_group *ng = cd->named_groups;
9280   cd->names_found = 0;
9281   for (; i > 0; i--, ng++)
9282     add_name(cd, ng->name, ng->length, ng->number);
9283   if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
9284     (PUBL(free))((void *)cd->named_groups);
9285   }
9286 
9287 /* Set up a starting, non-extracting bracket, then compile the expression. On
9288 error, errorcode will be set non-zero, so we don't need to look at the result
9289 of the function here. */
9290 
9291 ptr = (const pcre_uchar *)pattern + skipatstart;
9292 code = (pcre_uchar *)codestart;
9293 *code = OP_BRA;
9294 (void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, 0, 0,
9295   &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL, cd, NULL);
9296 re->top_bracket = cd->bracount;
9297 re->top_backref = cd->top_backref;
9298 re->max_lookbehind = cd->max_lookbehind;
9299 re->flags = cd->external_flags | PCRE_MODE;
9300 
9301 if (cd->had_accept)
9302   {
9303   reqchar = 0;              /* Must disable after (*ACCEPT) */
9304   reqcharflags = REQ_NONE;
9305   }
9306 
9307 /* If not reached end of pattern on success, there's an excess bracket. */
9308 
9309 if (errorcode == 0 && *ptr != CHAR_NULL) errorcode = ERR22;
9310 
9311 /* Fill in the terminating state and check for disastrous overflow, but
9312 if debugging, leave the test till after things are printed out. */
9313 
9314 *code++ = OP_END;
9315 
9316 #ifndef PCRE_DEBUG
9317 if (code - codestart > length) errorcode = ERR23;
9318 #endif
9319 
9320 #ifdef SUPPORT_VALGRIND
9321 /* If the estimated length exceeds the really used length, mark the extra
9322 allocated memory as unaddressable, so that any out-of-bound reads can be
9323 detected. */
9324 VALGRIND_MAKE_MEM_NOACCESS(code, (length - (code - codestart)) * sizeof(pcre_uchar));
9325 #endif
9326 
9327 /* Fill in any forward references that are required. There may be repeated
9328 references; optimize for them, as searching a large regex takes time. */
9329 
9330 if (cd->hwm > cd->start_workspace)
9331   {
9332   int prev_recno = -1;
9333   const pcre_uchar *groupptr = NULL;
9334   while (errorcode == 0 && cd->hwm > cd->start_workspace)
9335     {
9336     int offset, recno;
9337     cd->hwm -= LINK_SIZE;
9338     offset = GET(cd->hwm, 0);
9339     recno = GET(codestart, offset);
9340     if (recno != prev_recno)
9341       {
9342       groupptr = PRIV(find_bracket)(codestart, utf, recno);
9343       prev_recno = recno;
9344       }
9345     if (groupptr == NULL) errorcode = ERR53;
9346       else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));
9347     }
9348   }
9349 
9350 /* If the workspace had to be expanded, free the new memory. Set the pointer to
9351 NULL to indicate that forward references have been filled in. */
9352 
9353 if (cd->workspace_size > COMPILE_WORK_SIZE)
9354   (PUBL(free))((void *)cd->start_workspace);
9355 cd->start_workspace = NULL;
9356 
9357 /* Give an error if there's a back reference to a non-existent capturing
9358 subpattern. */
9359 
9360 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
9361 
9362 /* Unless disabled, check whether any single character iterators can be
9363 auto-possessified. The function overwrites the appropriate opcode values, so
9364 the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
9365 used in this code because at least one compiler gives a warning about loss of
9366 "const" attribute if the cast (pcre_uchar *)codestart is used directly in the
9367 function call. */
9368 
9369 if ((options & PCRE_NO_AUTO_POSSESS) == 0)
9370   {
9371   pcre_uchar *temp = (pcre_uchar *)codestart;
9372   auto_possessify(temp, utf, cd);
9373   }
9374 
9375 /* If there were any lookbehind assertions that contained OP_RECURSE
9376 (recursions or subroutine calls), a flag is set for them to be checked here,
9377 because they may contain forward references. Actual recursions cannot be fixed
9378 length, but subroutine calls can. It is done like this so that those without
9379 OP_RECURSE that are not fixed length get a diagnostic with a useful offset. The
9380 exceptional ones forgo this. We scan the pattern to check that they are fixed
9381 length, and set their lengths. */
9382 
9383 if (cd->check_lookbehind)
9384   {
9385   pcre_uchar *cc = (pcre_uchar *)codestart;
9386 
9387   /* Loop, searching for OP_REVERSE items, and process those that do not have
9388   their length set. (Actually, it will also re-process any that have a length
9389   of zero, but that is a pathological case, and it does no harm.) When we find
9390   one, we temporarily terminate the branch it is in while we scan it. */
9391 
9392   for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf, -1);
9393        cc != NULL;
9394        cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf, -1))
9395     {
9396     if (GET(cc, 1) == 0)
9397       {
9398       int fixed_length;
9399       pcre_uchar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
9400       int end_op = *be;
9401       *be = OP_END;
9402       fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != 0, TRUE,
9403         cd, NULL);
9404       *be = end_op;
9405       DPRINTF(("fixed length = %d\n", fixed_length));
9406       if (fixed_length < 0)
9407         {
9408         errorcode = (fixed_length == -2)? ERR36 :
9409                     (fixed_length == -4)? ERR70 : ERR25;
9410         break;
9411         }
9412       if (fixed_length > cd->max_lookbehind) cd->max_lookbehind = fixed_length;
9413       PUT(cc, 1, fixed_length);
9414       }
9415     cc += 1 + LINK_SIZE;
9416     }
9417   }
9418 
9419 /* Failed to compile, or error while post-processing */
9420 
9421 if (errorcode != 0)
9422   {
9423   (PUBL(free))(re);
9424   PCRE_EARLY_ERROR_RETURN:
9425   *erroroffset = (int)(ptr - (const pcre_uchar *)pattern);
9426   PCRE_EARLY_ERROR_RETURN2:
9427   *errorptr = find_error_text(errorcode);
9428   if (errorcodeptr != NULL) *errorcodeptr = errorcode;
9429   return NULL;
9430   }
9431 
9432 /* If the anchored option was not passed, set the flag if we can determine that
9433 the pattern is anchored by virtue of ^ characters or \A or anything else, such
9434 as starting with non-atomic .* when DOTALL is set and there are no occurrences
9435 of *PRUNE or *SKIP.
9436 
9437 Otherwise, if we know what the first byte has to be, save it, because that
9438 speeds up unanchored matches no end. If not, see if we can set the
9439 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
9440 start with ^, and also when all branches start with non-atomic .* for
9441 non-DOTALL matches when *PRUNE and *SKIP are not present. */
9442 
9443 if ((re->options & PCRE_ANCHORED) == 0)
9444   {
9445   if (is_anchored(codestart, 0, cd, 0)) re->options |= PCRE_ANCHORED;
9446   else
9447     {
9448     if (firstcharflags < 0)
9449       firstchar = find_firstassertedchar(codestart, &firstcharflags, FALSE);
9450     if (firstcharflags >= 0)   /* Remove caseless flag for non-caseable chars */
9451       {
9452 #if defined COMPILE_PCRE8
9453       re->first_char = firstchar & 0xff;
9454 #elif defined COMPILE_PCRE16
9455       re->first_char = firstchar & 0xffff;
9456 #elif defined COMPILE_PCRE32
9457       re->first_char = firstchar;
9458 #endif
9459       if ((firstcharflags & REQ_CASELESS) != 0)
9460         {
9461 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
9462         /* We ignore non-ASCII first chars in 8 bit mode. */
9463         if (utf)
9464           {
9465           if (re->first_char < 128)
9466             {
9467             if (cd->fcc[re->first_char] != re->first_char)
9468               re->flags |= PCRE_FCH_CASELESS;
9469             }
9470           else if (UCD_OTHERCASE(re->first_char) != re->first_char)
9471             re->flags |= PCRE_FCH_CASELESS;
9472           }
9473         else
9474 #endif
9475         if (MAX_255(re->first_char)
9476             && cd->fcc[re->first_char] != re->first_char)
9477           re->flags |= PCRE_FCH_CASELESS;
9478         }
9479 
9480       re->flags |= PCRE_FIRSTSET;
9481       }
9482 
9483     else if (is_startline(codestart, 0, cd, 0)) re->flags |= PCRE_STARTLINE;
9484     }
9485   }
9486 
9487 /* For an anchored pattern, we use the "required byte" only if it follows a
9488 variable length item in the regex. Remove the caseless flag for non-caseable
9489 bytes. */
9490 
9491 if (reqcharflags >= 0 &&
9492      ((re->options & PCRE_ANCHORED) == 0 || (reqcharflags & REQ_VARY) != 0))
9493   {
9494 #if defined COMPILE_PCRE8
9495   re->req_char = reqchar & 0xff;
9496 #elif defined COMPILE_PCRE16
9497   re->req_char = reqchar & 0xffff;
9498 #elif defined COMPILE_PCRE32
9499   re->req_char = reqchar;
9500 #endif
9501   if ((reqcharflags & REQ_CASELESS) != 0)
9502     {
9503 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
9504     /* We ignore non-ASCII required chars in 8 bit mode. */
9505     if (utf)
9506       {
9507       if (re->req_char < 128)
9508         {
9509         if (cd->fcc[re->req_char] != re->req_char)
9510           re->flags |= PCRE_RCH_CASELESS;
9511         }
9512       else if (UCD_OTHERCASE(re->req_char) != re->req_char)
9513         re->flags |= PCRE_RCH_CASELESS;
9514       }
9515     else
9516 #endif
9517     if (MAX_255(re->req_char) && cd->fcc[re->req_char] != re->req_char)
9518       re->flags |= PCRE_RCH_CASELESS;
9519     }
9520 
9521   re->flags |= PCRE_REQCHSET;
9522   }
9523 
9524 /* Print out the compiled data if debugging is enabled. This is never the
9525 case when building a production library. */
9526 
9527 #ifdef PCRE_DEBUG
9528 printf("Length = %d top_bracket = %d top_backref = %d\n",
9529   length, re->top_bracket, re->top_backref);
9530 
9531 printf("Options=%08x\n", re->options);
9532 
9533 if ((re->flags & PCRE_FIRSTSET) != 0)
9534   {
9535   pcre_uchar ch = re->first_char;
9536   const char *caseless =
9537     ((re->flags & PCRE_FCH_CASELESS) == 0)? "" : " (caseless)";
9538   if (PRINTABLE(ch)) printf("First char = %c%s\n", ch, caseless);
9539     else printf("First char = \\x%02x%s\n", ch, caseless);
9540   }
9541 
9542 if ((re->flags & PCRE_REQCHSET) != 0)
9543   {
9544   pcre_uchar ch = re->req_char;
9545   const char *caseless =
9546     ((re->flags & PCRE_RCH_CASELESS) == 0)? "" : " (caseless)";
9547   if (PRINTABLE(ch)) printf("Req char = %c%s\n", ch, caseless);
9548     else printf("Req char = \\x%02x%s\n", ch, caseless);
9549   }
9550 
9551 #if defined COMPILE_PCRE8
9552 pcre_printint((pcre *)re, stdout, TRUE);
9553 #elif defined COMPILE_PCRE16
9554 pcre16_printint((pcre *)re, stdout, TRUE);
9555 #elif defined COMPILE_PCRE32
9556 pcre32_printint((pcre *)re, stdout, TRUE);
9557 #endif
9558 
9559 /* This check is done here in the debugging case so that the code that
9560 was compiled can be seen. */
9561 
9562 if (code - codestart > length)
9563   {
9564   (PUBL(free))(re);
9565   *errorptr = find_error_text(ERR23);
9566   *erroroffset = ptr - (pcre_uchar *)pattern;
9567   if (errorcodeptr != NULL) *errorcodeptr = ERR23;
9568   return NULL;
9569   }
9570 #endif   /* PCRE_DEBUG */
9571 
9572 /* Check for a pattern that can match an empty string, so that this information
9573 can be provided to applications. */
9574 
9575 do
9576   {
9577   if (could_be_empty_branch(codestart, code, utf, cd, NULL))
9578     {
9579     re->flags |= PCRE_MATCH_EMPTY;
9580     break;
9581     }
9582   codestart += GET(codestart, 1);
9583   }
9584 while (*codestart == OP_ALT);
9585 
9586 #if defined COMPILE_PCRE8
9587 return (pcre *)re;
9588 #elif defined COMPILE_PCRE16
9589 return (pcre16 *)re;
9590 #elif defined COMPILE_PCRE32
9591 return (pcre32 *)re;
9592 #endif
9593 }
9594 
9595 /* End of pcre_compile.c */
9596 
9597