xref: /PHP-8.2/ext/pcre/pcre2lib/pcre2_compile.c (revision 32cceb75)
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10           New API code Copyright (c) 2016-2022 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45 
46 #define NLBLOCK cb             /* Block containing newline information */
47 #define PSSTART start_pattern  /* Field containing processed string start */
48 #define PSEND   end_pattern    /* Field containing processed string end */
49 
50 #include "pcre2_internal.h"
51 
52 /* In rare error cases debugging might require calling pcre2_printint(). */
53 
54 #if 0
55 #ifdef EBCDIC
56 #define PRINTABLE(c) ((c) >= 64 && (c) < 255)
57 #else
58 #define PRINTABLE(c) ((c) >= 32 && (c) < 127)
59 #endif
60 #include "pcre2_printint.c"
61 #define DEBUG_CALL_PRINTINT
62 #endif
63 
64 /* Other debugging code can be enabled by these defines. */
65 
66 /* #define DEBUG_SHOW_CAPTURES */
67 /* #define DEBUG_SHOW_PARSED */
68 
69 /* There are a few things that vary with different code unit sizes. Handle them
70 by defining macros in order to minimize #if usage. */
71 
72 #if PCRE2_CODE_UNIT_WIDTH == 8
73 #define STRING_UTFn_RIGHTPAR     STRING_UTF8_RIGHTPAR, 5  /* Expands to two comma-separated items; the number is presumably the string length - confirm */
74 #define XDIGIT(c)                xdigitab[c]  /* Direct table lookup; c is not range-checked here */
75 
76 #else  /* Either 16-bit or 32-bit */
77 #define XDIGIT(c)                (MAX_255(c)? xdigitab[c] : 0xff)  /* 0xff (the non-hex-digit marker) unless MAX_255(c) holds - presumably c <= 255 */
78 
79 #if PCRE2_CODE_UNIT_WIDTH == 16
80 #define STRING_UTFn_RIGHTPAR     STRING_UTF16_RIGHTPAR, 6  /* Same two-item shape as the 8-bit variant */
81 
82 #else  /* 32-bit */
83 #define STRING_UTFn_RIGHTPAR     STRING_UTF32_RIGHTPAR, 6  /* Same two-item shape as the 8-bit variant */
84 #endif  /* PCRE2_CODE_UNIT_WIDTH == 16 */
85 #endif  /* PCRE2_CODE_UNIT_WIDTH == 8 */
86 
87 /* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which
88 consists of uint32_t elements. Assume that if uint32_t can't hold it, two of
89 them will be able to (i.e. assume a 64-bit world). */
90 
/* Offset accessors for the parsed pattern. In every macro "s" is the offset
value (or the lvalue that receives it) and "p" is a uint32_t pointer lvalue
into the parsed pattern. SIZEOFFSET is the number of uint32_t elements that one
stored offset occupies.

  PUTOFFSET      store s at p and advance p past it
  GETOFFSET      load s from p and advance p past it
  GETPLUSOFFSET  load s from the element(s) after p, leaving p on the last one
  READPLUSOFFSET load s from the element(s) after p without moving p
  SKIPOFFSET     advance p past one stored offset

All arguments are parenthesized so that expressions can be passed safely, and
the multi-statement forms are wrapped in do { } while (0) so that they behave
as a single statement. Note that "s" and "p" are evaluated more than once in
the two-element forms, so they must not have side effects. */

#if PCRE2_SIZE_MAX <= UINT32_MAX
/* One element is enough to hold an offset. */
#define PUTOFFSET(s,p) (*(p)++ = (s))
#define GETOFFSET(s,p) ((s) = *(p)++)
#define GETPLUSOFFSET(s,p) ((s) = *(++(p)))
#define READPLUSOFFSET(s,p) ((s) = (p)[1])
#define SKIPOFFSET(p) ((p)++)
#define SIZEOFFSET 1
#else
/* Two elements per offset: high 32 bits first, then low 32 bits. */
#define PUTOFFSET(s,p) \
  do { *(p)++ = (uint32_t)((s) >> 32); \
       *(p)++ = (uint32_t)((s) & 0xffffffff); } while (0)
#define GETOFFSET(s,p) \
  do { (s) = ((PCRE2_SIZE)(p)[0] << 32) | (PCRE2_SIZE)(p)[1]; \
       (p) += 2; } while (0)
#define GETPLUSOFFSET(s,p) \
  do { (s) = ((PCRE2_SIZE)(p)[1] << 32) | (PCRE2_SIZE)(p)[2]; \
       (p) += 2; } while (0)
#define READPLUSOFFSET(s,p) \
  do { (s) = ((PCRE2_SIZE)(p)[1] << 32) | (PCRE2_SIZE)(p)[2]; } while (0)
#define SKIPOFFSET(p) ((p) += 2)
#define SIZEOFFSET 2
#endif
110 
111 /* Macros for manipulating elements of the parsed pattern vector. */
112 
/* META_CODE(x) keeps the top 16 bits of a parsed-pattern element (the item
code) and META_DATA(x) keeps the bottom 16 bits (the item's data).
META_DIFF(x,y) is the distance between two meta codes, counted in table slots
(the codes are spaced 0x10000 apart). The arguments are parenthesized so that
expressions such as "code | data" or "base + offset" are handled correctly. */

#define META_CODE(x)   ((x) & 0xffff0000u)
#define META_DATA(x)   ((x) & 0x0000ffffu)
#define META_DIFF(x,y) (((x)-(y))>>16)
116 
117 /* Forward declarations of static functions, to allow mutual recursion */
118 
119 #ifdef SUPPORT_UNICODE
120 static unsigned int
121   add_list_to_class_internal(uint8_t *, PCRE2_UCHAR **, uint32_t,
122     compile_block *, const uint32_t *, unsigned int);
123 #endif
124 
125 static int
126   compile_regex(uint32_t, PCRE2_UCHAR **, uint32_t **, int *, uint32_t,
127     uint32_t *, uint32_t *, uint32_t *, uint32_t *, branch_chain *,
128     compile_block *, PCRE2_SIZE *);
129 
130 static int
131   get_branchlength(uint32_t **, int *, int *, parsed_recurse_check *,
132     compile_block *);
133 
134 static BOOL
135   set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,
136     compile_block *);
137 
138 static int
139   check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *,
140     compile_block *, int *);
141 
142 
143 /*************************************************
144 *      Code parameters and static tables         *
145 *************************************************/
146 
147 #define MAX_GROUP_NUMBER   65535u
148 #define MAX_REPEAT_COUNT   65535u
149 #define REPEAT_UNLIMITED   (MAX_REPEAT_COUNT+1)
150 
151 /* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in
152 different ways in the different pattern scans. The parsing and group-
153 identifying pre-scan uses it to handle nesting, and needs it to be 16-bit
154 aligned for this. Having defined the size in code units, we set up
155 C16_WORK_SIZE as the number of elements in the 16-bit vector.
156 
157 During the first compiling phase, when determining how much memory is required,
158 the regex is partly compiled into this space, but the compiled parts are
159 discarded as soon as they can be, so that hopefully there will never be an
160 overrun. The code does, however, check for an overrun, which can occur for
161 pathological patterns. The size of the workspace depends on LINK_SIZE because
162 the length of compiled items varies with this.
163 
164 In the real compile phase, this workspace is not currently used. */
165 
166 #define COMPILE_WORK_SIZE (3000*LINK_SIZE)   /* Size in code units */
167 
168 #define C16_WORK_SIZE \
169   ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t))
170 
171 /* A uint32_t vector is used for caching information about the size of
172 capturing groups, to improve performance. A default is created on the stack of
173 this size. */
174 
175 #define GROUPINFO_DEFAULT_SIZE 256
176 
177 /* The overrun tests check for a slightly smaller size so that they detect the
178 overrun before it actually does run off the end of the data block. */
179 
180 #define WORK_SIZE_SAFETY_MARGIN (100)
181 
182 /* This value determines the size of the initial vector that is used for
183 remembering named groups during the pre-compile. It is allocated on the stack,
184 but if it is too small, it is expanded, in a similar way to the workspace. The
185 value is the number of slots in the list. */
186 
187 #define NAMED_GROUP_LIST_SIZE  20
188 
189 /* The pre-compiling pass over the pattern creates a parsed pattern in a vector
190 of uint32_t. For short patterns this lives on the stack, with this size. Heap
191 memory is used for longer patterns. */
192 
193 #define PARSED_PATTERN_DEFAULT_SIZE 1024
194 
195 /* Maximum length value to check against when making sure that the variable
196 that holds the compiled pattern length does not overflow. We make it a bit less
197 than INT_MAX to allow for adding in group terminating code units, so that we
198 don't have to check them every time. */
199 
200 #define OFLOW_MAX (INT_MAX - 20)
201 
202 /* Code values for parsed patterns, which are stored in a vector of 32-bit
203 unsigned ints. Values less than META_END are literal data values. The coding
204 for identifying the item is in the top 16-bits, leaving 16 bits for the
205 additional data that some of them need. The META_CODE, META_DATA, and META_DIFF
206 macros are used to manipulate parsed pattern elements.
207 
208 NOTE: When these definitions are changed, the table of extra lengths for each
209 code (meta_extra_lengths, just below) must be updated to remain in step. */
210 
211 #define META_END              0x80000000u  /* End of pattern */
212 
213 #define META_ALT              0x80010000u  /* alternation */
214 #define META_ATOMIC           0x80020000u  /* atomic group */
215 #define META_BACKREF          0x80030000u  /* Back ref */
216 #define META_BACKREF_BYNAME   0x80040000u  /* \k'name' */
217 #define META_BIGVALUE         0x80050000u  /* Next is a literal > META_END */
218 #define META_CALLOUT_NUMBER   0x80060000u  /* (?C with numerical argument */
219 #define META_CALLOUT_STRING   0x80070000u  /* (?C with string argument */
220 #define META_CAPTURE          0x80080000u  /* Capturing parenthesis */
221 #define META_CIRCUMFLEX       0x80090000u  /* ^ metacharacter */
222 #define META_CLASS            0x800a0000u  /* start non-empty class */
223 #define META_CLASS_EMPTY      0x800b0000u  /* empty class */
224 #define META_CLASS_EMPTY_NOT  0x800c0000u  /* negative empty class */
225 #define META_CLASS_END        0x800d0000u  /* end of non-empty class */
226 #define META_CLASS_NOT        0x800e0000u  /* start non-empty negative class */
227 #define META_COND_ASSERT      0x800f0000u  /* (?(?assertion)... */
228 #define META_COND_DEFINE      0x80100000u  /* (?(DEFINE)... */
229 #define META_COND_NAME        0x80110000u  /* (?(<name>)... */
230 #define META_COND_NUMBER      0x80120000u  /* (?(digits)... */
231 #define META_COND_RNAME       0x80130000u  /* (?(R&name)... */
232 #define META_COND_RNUMBER     0x80140000u  /* (?(Rdigits)... */
233 #define META_COND_VERSION     0x80150000u  /* (?(VERSION<op>x.y)... */
234 #define META_DOLLAR           0x80160000u  /* $ metacharacter */
235 #define META_DOT              0x80170000u  /* . metacharacter */
236 #define META_ESCAPE           0x80180000u  /* \d and friends */
237 #define META_KET              0x80190000u  /* closing parenthesis */
238 #define META_NOCAPTURE        0x801a0000u  /* no capture parens */
239 #define META_OPTIONS          0x801b0000u  /* (?i) and friends */
240 #define META_POSIX            0x801c0000u  /* POSIX class item */
241 #define META_POSIX_NEG        0x801d0000u  /* negative POSIX class item */
242 #define META_RANGE_ESCAPED    0x801e0000u  /* range with at least one escape */
243 #define META_RANGE_LITERAL    0x801f0000u  /* range defined literally */
244 #define META_RECURSE          0x80200000u  /* Recursion */
245 #define META_RECURSE_BYNAME   0x80210000u  /* (?&name) */
246 #define META_SCRIPT_RUN       0x80220000u  /* (*script_run:...) */
247 
248 /* These must be kept together to make it easy to check that an assertion
249 is present where expected in a conditional group. */
250 
251 #define META_LOOKAHEAD        0x80230000u  /* (?= */
252 #define META_LOOKAHEADNOT     0x80240000u  /* (?! */
253 #define META_LOOKBEHIND       0x80250000u  /* (?<= */
254 #define META_LOOKBEHINDNOT    0x80260000u  /* (?<! */
255 
256 /* These cannot be conditions */
257 
258 #define META_LOOKAHEAD_NA     0x80270000u  /* (*napla: */
259 #define META_LOOKBEHIND_NA    0x80280000u  /* (*naplb: */
260 
261 /* These must be kept in this order, with consecutive values, and the _ARG
262 versions of COMMIT, PRUNE, SKIP, and THEN immediately after their non-argument
263 versions. */
264 
265 #define META_MARK             0x80290000u  /* (*MARK) */
266 #define META_ACCEPT           0x802a0000u  /* (*ACCEPT) */
267 #define META_FAIL             0x802b0000u  /* (*FAIL) */
268 #define META_COMMIT           0x802c0000u  /* These               */
269 #define META_COMMIT_ARG       0x802d0000u  /*   pairs             */
270 #define META_PRUNE            0x802e0000u  /*     must            */
271 #define META_PRUNE_ARG        0x802f0000u  /*       be            */
272 #define META_SKIP             0x80300000u  /*         kept        */
273 #define META_SKIP_ARG         0x80310000u  /*           in        */
274 #define META_THEN             0x80320000u  /*             this    */
275 #define META_THEN_ARG         0x80330000u  /*               order */
276 
277 /* These must be kept in groups of adjacent 3 values, and all together. */
278 
279 #define META_ASTERISK         0x80340000u  /* *  */
280 #define META_ASTERISK_PLUS    0x80350000u  /* *+ */
281 #define META_ASTERISK_QUERY   0x80360000u  /* *? */
282 #define META_PLUS             0x80370000u  /* +  */
283 #define META_PLUS_PLUS        0x80380000u  /* ++ */
284 #define META_PLUS_QUERY       0x80390000u  /* +? */
285 #define META_QUERY            0x803a0000u  /* ?  */
286 #define META_QUERY_PLUS       0x803b0000u  /* ?+ */
287 #define META_QUERY_QUERY      0x803c0000u  /* ?? */
288 #define META_MINMAX           0x803d0000u  /* {n,m}  repeat */
289 #define META_MINMAX_PLUS      0x803e0000u  /* {n,m}+ repeat */
290 #define META_MINMAX_QUERY     0x803f0000u  /* {n,m}? repeat */
291 
292 #define META_FIRST_QUANTIFIER META_ASTERISK
293 #define META_LAST_QUANTIFIER  META_MINMAX_QUERY
294 
295 /* This is a special "meta code" that is used only to distinguish (*asr: from
296 (*sr: in the table of alphabetic assertions. It is never stored in the parsed
297 pattern because (*asr: is turned into (*sr:(*atomic: at that stage. There is
298 therefore no need for it to have a length entry, so use a high value. */
299 
300 #define META_ATOMIC_SCRIPT_RUN 0x8fff0000u
301 
302 /* Table of extra lengths for each of the meta codes. Must be kept in step with
303 the definitions above. For some items these values are a basic length to which
304 a variable amount has to be added. */
305 
306 static unsigned char meta_extra_lengths[] = {
307   0,             /* META_END */
308   0,             /* META_ALT */
309   0,             /* META_ATOMIC */
310   0,             /* META_BACKREF - more if group is >= 10 */
311   1+SIZEOFFSET,  /* META_BACKREF_BYNAME */
312   1,             /* META_BIGVALUE */
313   3,             /* META_CALLOUT_NUMBER */
314   3+SIZEOFFSET,  /* META_CALLOUT_STRING */
315   0,             /* META_CAPTURE */
316   0,             /* META_CIRCUMFLEX */
317   0,             /* META_CLASS */
318   0,             /* META_CLASS_EMPTY */
319   0,             /* META_CLASS_EMPTY_NOT */
320   0,             /* META_CLASS_END */
321   0,             /* META_CLASS_NOT */
322   0,             /* META_COND_ASSERT */
323   SIZEOFFSET,    /* META_COND_DEFINE */
324   1+SIZEOFFSET,  /* META_COND_NAME */
325   1+SIZEOFFSET,  /* META_COND_NUMBER */
326   1+SIZEOFFSET,  /* META_COND_RNAME */
327   1+SIZEOFFSET,  /* META_COND_RNUMBER */
328   3,             /* META_COND_VERSION */
329   0,             /* META_DOLLAR */
330   0,             /* META_DOT */
331   0,             /* META_ESCAPE - more for ESC_P, ESC_p, ESC_g, ESC_k */
332   0,             /* META_KET */
333   0,             /* META_NOCAPTURE */
334   1,             /* META_OPTIONS */
335   1,             /* META_POSIX */
336   1,             /* META_POSIX_NEG */
337   0,             /* META_RANGE_ESCAPED */
338   0,             /* META_RANGE_LITERAL */
339   SIZEOFFSET,    /* META_RECURSE */
340   1+SIZEOFFSET,  /* META_RECURSE_BYNAME */
341   0,             /* META_SCRIPT_RUN */
342   0,             /* META_LOOKAHEAD */
343   0,             /* META_LOOKAHEADNOT */
344   SIZEOFFSET,    /* META_LOOKBEHIND */
345   SIZEOFFSET,    /* META_LOOKBEHINDNOT */
346   0,             /* META_LOOKAHEAD_NA */
347   SIZEOFFSET,    /* META_LOOKBEHIND_NA */
348   1,             /* META_MARK - plus the string length */
349   0,             /* META_ACCEPT */
350   0,             /* META_FAIL */
351   0,             /* META_COMMIT */
352   1,             /* META_COMMIT_ARG - plus the string length */
353   0,             /* META_PRUNE */
354   1,             /* META_PRUNE_ARG - plus the string length */
355   0,             /* META_SKIP */
356   1,             /* META_SKIP_ARG - plus the string length */
357   0,             /* META_THEN */
358   1,             /* META_THEN_ARG - plus the string length */
359   0,             /* META_ASTERISK */
360   0,             /* META_ASTERISK_PLUS */
361   0,             /* META_ASTERISK_QUERY */
362   0,             /* META_PLUS */
363   0,             /* META_PLUS_PLUS */
364   0,             /* META_PLUS_QUERY */
365   0,             /* META_QUERY */
366   0,             /* META_QUERY_PLUS */
367   0,             /* META_QUERY_QUERY */
368   2,             /* META_MINMAX */
369   2,             /* META_MINMAX_PLUS */
370   2              /* META_MINMAX_QUERY */
371 };
372 
373 /* Types for skipping parts of a parsed pattern. */
374 
375 enum { PSKIP_ALT, PSKIP_CLASS, PSKIP_KET };
376 
377 /* Macro for setting individual bits in class bitmaps. It took some
378 experimenting to figure out how to stop gcc 5.3.0 from warning with
379 -Wconversion. This version gets a warning:
380 
381   #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1u << ((b)&7))
382 
383 Let's hope the apparently less efficient version isn't actually so bad if the
384 compiler is clever with identical subexpressions. */
385 
386 #define SETBIT(a,b) a[(b)/8] = (uint8_t)(a[(b)/8] | (1u << ((b)&7)))
387 
388 /* Values and flags for the unsigned xxcuflags variables that accompany xxcu
389 variables, which are concerned with first and required code units. A value
390 greater than or equal to REQ_NONE means "no code unit set"; otherwise the
391 matching xxcu variable is set, and the low valued bits are relevant. */
392 
393 #define REQ_UNSET     0xffffffffu  /* Not yet found anything */
394 #define REQ_NONE      0xfffffffeu  /* Found not fixed character */
395 #define REQ_CASELESS  0x00000001u  /* Code unit in xxcu is caseless */
396 #define REQ_VARY      0x00000002u  /* Code unit is followed by non-literal */
397 
398 /* These flags are used in the groupinfo vector. */
399 
400 #define GI_SET_FIXED_LENGTH    0x80000000u
401 #define GI_NOT_FIXED_LENGTH    0x40000000u
402 #define GI_FIXED_LENGTH_MASK   0x0000ffffu
403 
404 /* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
405 and is fast (a good compiler can turn it into a subtraction and unsigned
406 comparison). */
407 
408 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
409 
410 /* Table to identify hex digits. The tables in chartables are dependent on the
411 locale, and may mark arbitrary characters as digits. We want to recognize only
412 0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
413 costs 256 bytes, but it is a lot faster than doing character value tests (at
414 least in some simple cases I timed), and in some applications one wants PCRE2
415 to compile efficiently as well as match efficiently. The value in the table is
416 the binary hex digit value, or 0xff for non-hex digits. */
417 
418 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
419 UTF-8 mode. */
420 
421 #ifndef EBCDIC
422 static const uint8_t xdigitab[] =  /* 256 entries, one per code value: hex digit value 0x00-0x0f, or 0xff if not a hex digit */
423   {
424   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7 */
425   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15 */
426   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 */
427   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31 */
428   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - '  */
429   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ( - /  */
430   0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  */
431   0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /*  8 - ?  */
432   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  @ - G  (A-F are hex digits) */
433   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H - O  */
434   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  P - W  */
435   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  X - _  */
436   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  ` - g  (a-f are hex digits) */
437   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h - o  */
438   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  p - w  */
439   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  x -127 */
440   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */
441   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */
442   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */
443   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */
444   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */
445   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */
446   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */
447   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
448   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */
449   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 200-207 */
450   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */
451   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */
452   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */
453   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */
454   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */
455   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */
456 
457 #else
458 
459 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
460 
461 static const uint8_t xdigitab[] =
462   {
463   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7  0 */
464   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15    */
465   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 10 */
466   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31    */
467   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  32- 39 20 */
468   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  40- 47    */
469   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  48- 55 30 */
470   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  56- 63    */
471   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - 71 40 */
472   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  72- |     */
473   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  & - 87 50 */
474   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  88- 95    */
475   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  - -103 60 */
476   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ?     */
477   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */
478   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- "     */
479   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g  80 */
480   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h -143    */
481   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p  90 */
482   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  q -159    */
483   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x  A0 */
484   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  y -175    */
485   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ^ -183 B0 */
486   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191    */
487   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  { - G  C0 */
488   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H -207    */
489   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  } - P  D0 */
490   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Q -223    */
491   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  \ - X  E0 */
492   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Y -239    */
493   0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  F0 */
494   0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/*  8 -255    */
495 #endif  /* EBCDIC */
496 
497 
498 /* Table for handling alphanumeric escaped characters. Positive returns are
499 simple data values; negative values are for special things like \d and so on.
500 Zero means further processing is needed (for things like \x), or the escape is
501 invalid. */
502 
503 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
504 in UTF-8 mode. It runs from '0' to 'z'. */
505 
506 #ifndef EBCDIC
/* ESCAPES_FIRST and ESCAPES_LAST are the first and last characters covered by
the escapes[] table that follows. UPPER_CASE maps a lower case ASCII letter to
its upper case form; the argument is parenthesized so that an expression can be
passed safely. */

#define ESCAPES_FIRST       CHAR_0
#define ESCAPES_LAST        CHAR_z
#define UPPER_CASE(c)       ((c)-32)
510 
511 static const short int escapes[] = {
512      0,                       0,
513      0,                       0,
514      0,                       0,
515      0,                       0,
516      0,                       0,
517      CHAR_COLON,              CHAR_SEMICOLON,
518      CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
519      CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
520      CHAR_COMMERCIAL_AT,      -ESC_A,
521      -ESC_B,                  -ESC_C,
522      -ESC_D,                  -ESC_E,
523      0,                       -ESC_G,
524      -ESC_H,                  0,
525      0,                       -ESC_K,
526      0,                       0,
527      -ESC_N,                  0,
528      -ESC_P,                  -ESC_Q,
529      -ESC_R,                  -ESC_S,
530      0,                       0,
531      -ESC_V,                  -ESC_W,
532      -ESC_X,                  0,
533      -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
534      CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
535      CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
536      CHAR_GRAVE_ACCENT,       CHAR_BEL,
537      -ESC_b,                  0,
538      -ESC_d,                  CHAR_ESC,
539      CHAR_FF,                 0,
540      -ESC_h,                  0,
541      0,                       -ESC_k,
542      0,                       0,
543      CHAR_LF,                 0,
544      -ESC_p,                  0,
545      CHAR_CR,                 -ESC_s,
546      CHAR_HT,                 0,
547      -ESC_v,                  -ESC_w,
548      0,                       0,
549      -ESC_z
550 };
551 
552 #else
553 
554 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
555 It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code
556 is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a
557 because it is defined as 'a', which of course picks up the ASCII value. */
558 
/* Bounds of the EBCDIC escapes[] table, and the lower-to-upper case mapping.
In a real EBCDIC environment upper case letters are 64 above lower case ones;
when EBCDIC code is compiled on an ASCII host for testing, the ASCII distance
of 32 applies instead and the table bounds are given as raw EBCDIC values. The
UPPER_CASE argument is parenthesized so that an expression can be passed
safely. */

#if 'a' == 0x81                    /* Check for a real EBCDIC environment */
#define ESCAPES_FIRST       CHAR_a
#define ESCAPES_LAST        CHAR_9
#define UPPER_CASE(c)       ((c)+64)
#else                              /* Testing in an ASCII environment */
#define ESCAPES_FIRST  ((unsigned char)'\x81')   /* EBCDIC 'a' */
#define ESCAPES_LAST   ((unsigned char)'\xf9')   /* EBCDIC '9' */
#define UPPER_CASE(c)  ((c)-32)
#endif
568 
569 static const short int escapes[] = {
570 /*  80 */         CHAR_BEL, -ESC_b,       0, -ESC_d, CHAR_ESC, CHAR_FF,      0,
571 /*  88 */ -ESC_h,        0,      0,     '{',      0,        0,       0,      0,
572 /*  90 */      0,        0, -ESC_k,       0,      0,  CHAR_LF,       0, -ESC_p,
573 /*  98 */      0,  CHAR_CR,      0,     '}',      0,        0,       0,      0,
574 /*  A0 */      0,      '~', -ESC_s, CHAR_HT,      0,   -ESC_v,  -ESC_w,      0,
575 /*  A8 */      0,   -ESC_z,      0,       0,      0,      '[',       0,      0,
576 /*  B0 */      0,        0,      0,       0,      0,        0,       0,      0,
577 /*  B8 */      0,        0,      0,       0,      0,      ']',     '=',    '-',
578 /*  C0 */    '{',   -ESC_A, -ESC_B,  -ESC_C, -ESC_D,   -ESC_E,       0, -ESC_G,
579 /*  C8 */ -ESC_H,        0,      0,       0,      0,        0,       0,      0,
580 /*  D0 */    '}',        0, -ESC_K,       0,      0,   -ESC_N,       0, -ESC_P,
581 /*  D8 */ -ESC_Q,   -ESC_R,      0,       0,      0,        0,       0,      0,
582 /*  E0 */   '\\',        0, -ESC_S,       0,      0,   -ESC_V,  -ESC_W, -ESC_X,
583 /*  E8 */      0,   -ESC_Z,      0,       0,      0,        0,       0,      0,
584 /*  F0 */      0,        0,      0,       0,      0,        0,       0,      0,
585 /*  F8 */      0,        0
586 };
587 
588 /* We also need a table of characters that may follow \c in an EBCDIC
589 environment for characters 0-31. */
590 
/* Read-only list of the 32 characters accepted after \c, one for each of the
characters 0-31 (presumably position i corresponds to character value i -
confirm at the use site). */

static const unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
592 
593 #endif   /* EBCDIC */
594 
595 
596 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
597 searched linearly. Put all the names into a single string, in order to reduce
598 the number of relocations when a shared library is dynamically linked. The
599 string is built from string macros so that it works in UTF-8 mode on EBCDIC
600 platforms. */
601 
602 typedef struct verbitem {  /* One entry per verb name, in the same order as the names in verbnames */
603   unsigned int len;          /* Length of verb name (0 for the empty-name shorthand for MARK) */
604   uint32_t meta;             /* Base META_ code */
605   int has_arg;               /* Argument requirement: > 0 required; < 0 optional, converted to pre-MARK; 0 optional, META code bumped if found */
606 } verbitem;
607 
608 static const char verbnames[] =
609   "\0"                       /* Empty name is a shorthand for MARK */
610   STRING_MARK0
611   STRING_ACCEPT0
612   STRING_F0
613   STRING_FAIL0
614   STRING_COMMIT0
615   STRING_PRUNE0
616   STRING_SKIP0
617   STRING_THEN;
618 
619 static const verbitem verbs[] = {
620   { 0, META_MARK,   +1 },  /* > 0 => must have an argument */
621   { 4, META_MARK,   +1 },
622   { 6, META_ACCEPT, -1 },  /* < 0 => Optional argument, convert to pre-MARK */
623   { 1, META_FAIL,   -1 },
624   { 4, META_FAIL,   -1 },
625   { 6, META_COMMIT,  0 },
626   { 5, META_PRUNE,   0 },  /* Optional argument; bump META code if found */
627   { 4, META_SKIP,    0 },
628   { 4, META_THEN,    0 }
629 };
630 
631 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
632 
633 /* Verb opcodes, indexed by their META code offset from META_MARK. */
634 
635 static const uint32_t verbops[] = {
636   OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,
637   OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };
638 
639 /* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. */
640 
641 typedef struct alasitem {  /* One entry per assertion name, in the same order as the names in alasnames */
642   unsigned int len;          /* Length of name */
643   uint32_t meta;             /* Base META_ code; short and long spellings of an assertion share one code */
644 } alasitem;
645 
646 static const char alasnames[] =
647   STRING_pla0
648   STRING_plb0
649   STRING_napla0
650   STRING_naplb0
651   STRING_nla0
652   STRING_nlb0
653   STRING_positive_lookahead0
654   STRING_positive_lookbehind0
655   STRING_non_atomic_positive_lookahead0
656   STRING_non_atomic_positive_lookbehind0
657   STRING_negative_lookahead0
658   STRING_negative_lookbehind0
659   STRING_atomic0
660   STRING_sr0
661   STRING_asr0
662   STRING_script_run0
663   STRING_atomic_script_run;
664 
665 static const alasitem alasmeta[] = {
666   {  3, META_LOOKAHEAD         },
667   {  3, META_LOOKBEHIND        },
668   {  5, META_LOOKAHEAD_NA      },
669   {  5, META_LOOKBEHIND_NA     },
670   {  3, META_LOOKAHEADNOT      },
671   {  3, META_LOOKBEHINDNOT     },
672   { 18, META_LOOKAHEAD         },
673   { 19, META_LOOKBEHIND        },
674   { 29, META_LOOKAHEAD_NA      },
675   { 30, META_LOOKBEHIND_NA     },
676   { 18, META_LOOKAHEADNOT      },
677   { 19, META_LOOKBEHINDNOT     },
678   {  6, META_ATOMIC            },
679   {  2, META_SCRIPT_RUN        }, /* sr = script run */
680   {  3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */
681   { 10, META_SCRIPT_RUN        }, /* script run */
682   { 17, META_ATOMIC_SCRIPT_RUN }  /* atomic script run */
683 };
684 
685 static const int alascount = sizeof(alasmeta)/sizeof(alasitem);
686 
687 /* Offsets from OP_STAR for case-independent and negative repeat opcodes. */
688 
689 static uint32_t chartypeoffset[] = {
690   OP_STAR - OP_STAR,    OP_STARI - OP_STAR,
691   OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };
692 
693 /* Tables of names of POSIX character classes and their lengths. The names are
694 now all in a single string, to reduce the number of relocations when a shared
695 library is dynamically loaded. The list of lengths is terminated by a zero
696 length entry. The first three must be alpha, lower, upper, as this is assumed
697 for handling case independence. The indices for graph, print, and punct are
698 needed, so identify them. */
699 
700 static const char posix_names[] =
701   STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
702   STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
703   STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
704   STRING_word0  STRING_xdigit;
705 
706 static const uint8_t posix_name_lengths[] = {
707   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
708 
709 #define PC_GRAPH  8
710 #define PC_PRINT  9
711 #define PC_PUNCT 10
712 
713 /* Table of class bit maps for each POSIX class. Each class is formed from a
714 base map, with an optional addition or removal of another map. Then, for some
715 classes, there is some additional tweaking: for [:blank:] the vertical space
716 characters are removed, and for [:alpha:] and [:alnum:] the underscore
717 character is removed. The triples in the table consist of the base map offset,
718 second map offset or -1 if no second map, and a non-negative value for map
719 addition or a negative value for map subtraction (if there are two maps). The
720 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
721 remove vertical space characters, 2 => remove underscore. */
722 
723 static const int posix_class_maps[] = {
724   cbit_word,  cbit_digit, -2,             /* alpha */
725   cbit_lower, -1,          0,             /* lower */
726   cbit_upper, -1,          0,             /* upper */
727   cbit_word,  -1,          2,             /* alnum - word without underscore */
728   cbit_print, cbit_cntrl,  0,             /* ascii */
729   cbit_space, -1,          1,             /* blank - a GNU extension */
730   cbit_cntrl, -1,          0,             /* cntrl */
731   cbit_digit, -1,          0,             /* digit */
732   cbit_graph, -1,          0,             /* graph */
733   cbit_print, -1,          0,             /* print */
734   cbit_punct, -1,          0,             /* punct */
735   cbit_space, -1,          0,             /* space */
736   cbit_word,  -1,          0,             /* word - a Perl extension */
737   cbit_xdigit,-1,          0              /* xdigit */
738 };
739 
740 #ifdef SUPPORT_UNICODE
741 
742 /* The POSIX class Unicode property substitutes that are used in UCP mode must
743 be in the order of the POSIX class names, defined above. */
744 
745 static int posix_substitutes[] = {
746   PT_GC, ucp_L,     /* alpha */
747   PT_PC, ucp_Ll,    /* lower */
748   PT_PC, ucp_Lu,    /* upper */
749   PT_ALNUM, 0,      /* alnum */
750   -1, 0,            /* ascii, treat as non-UCP */
751   -1, 1,            /* blank, treat as \h */
752   PT_PC, ucp_Cc,    /* cntrl */
753   PT_PC, ucp_Nd,    /* digit */
754   PT_PXGRAPH, 0,    /* graph */
755   PT_PXPRINT, 0,    /* print */
756   PT_PXPUNCT, 0,    /* punct */
757   PT_PXSPACE, 0,    /* space */   /* Xps is POSIX space, but from 8.34 */
758   PT_WORD, 0,       /* word  */   /* Perl and POSIX space are the same */
759   -1, 0             /* xdigit, treat as non-UCP */
760 };
761 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / (2*sizeof(uint32_t)))
762 #endif  /* SUPPORT_UNICODE */
763 
764 /* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset
765 are allowed. */
766 
/* Main options that remain valid when PCRE2_LITERAL is set. */

#define PUBLIC_LITERAL_COMPILE_OPTIONS \
  (PCRE2_ANCHORED          | \
   PCRE2_AUTO_CALLOUT      | \
   PCRE2_CASELESS          | \
   PCRE2_ENDANCHORED       | \
   PCRE2_FIRSTLINE         | \
   PCRE2_LITERAL           | \
   PCRE2_MATCH_INVALID_UTF | \
   PCRE2_NO_START_OPTIMIZE | \
   PCRE2_NO_UTF_CHECK      | \
   PCRE2_USE_OFFSET_LIMIT  | \
   PCRE2_UTF)

/* The full set of main options: the literal subset plus the rest. */

#define PUBLIC_COMPILE_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_OPTIONS | \
   PCRE2_ALLOW_EMPTY_CLASS        | \
   PCRE2_ALT_BSUX                 | \
   PCRE2_ALT_CIRCUMFLEX           | \
   PCRE2_ALT_VERBNAMES            | \
   PCRE2_DOLLAR_ENDONLY           | \
   PCRE2_DOTALL                   | \
   PCRE2_DUPNAMES                 | \
   PCRE2_EXTENDED                 | \
   PCRE2_EXTENDED_MORE            | \
   PCRE2_MATCH_UNSET_BACKREF      | \
   PCRE2_MULTILINE                | \
   PCRE2_NEVER_BACKSLASH_C        | \
   PCRE2_NEVER_UCP                | \
   PCRE2_NEVER_UTF                | \
   PCRE2_NO_AUTO_CAPTURE          | \
   PCRE2_NO_AUTO_POSSESS          | \
   PCRE2_NO_DOTSTAR_ANCHOR        | \
   PCRE2_UCP                      | \
   PCRE2_UNGREEDY)

/* Extra options that remain valid when PCRE2_LITERAL is set. */

#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
  (PCRE2_EXTRA_MATCH_LINE | \
   PCRE2_EXTRA_MATCH_WORD)

/* The full set of extra options. */

#define PUBLIC_COMPILE_EXTRA_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS  | \
   PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES   | \
   PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL     | \
   PCRE2_EXTRA_ESCAPED_CR_IS_LF          | \
   PCRE2_EXTRA_ALT_BSUX                  | \
   PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK)
789 
790 /* Compile time error code numbers. They are given names so that they can more
791 easily be tracked. When a new number is added, the tables called eint1 and
792 eint2 in pcre2posix.c may need to be updated, and a new error text must be
793 added to compile_error_texts in pcre2_error.c. Also, the error codes in
794 pcre2.h.in must be updated - their values are exactly 100 greater than these
795 values. */
796 
797 enum { ERR0 = COMPILE_ERROR_BASE,
798        ERR1,  ERR2,  ERR3,  ERR4,  ERR5,  ERR6,  ERR7,  ERR8,  ERR9,  ERR10,
799        ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
800        ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30,
801        ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
802        ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
803        ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
804        ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
805        ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
806        ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90,
807        ERR91, ERR92, ERR93, ERR94, ERR95, ERR96, ERR97, ERR98, ERR99 };
808 
809 /* This is a table of start-of-pattern options such as (*UTF) and settings such
810 as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
811 compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
812 generic and always supported. */
813 
814 enum { PSO_OPT,     /* Value is an option bit */
815        PSO_FLG,     /* Value is a flag bit */
816        PSO_NL,      /* Value is a newline type */
817        PSO_BSR,     /* Value is a \R type */
818        PSO_LIMH,    /* Read integer value for heap limit */
819        PSO_LIMM,    /* Read integer value for match limit */
820        PSO_LIMD };  /* Read integer value for depth limit */
821 
822 typedef struct pso {
823   const uint8_t *name;
824   uint16_t length;
825   uint16_t type;
826   uint32_t value;
827 } pso;
828 
829 /* NB: STRING_UTFn_RIGHTPAR contains the length as well */
830 
831 static pso pso_list[] = {
832   { (uint8_t *)STRING_UTFn_RIGHTPAR,                  PSO_OPT, PCRE2_UTF },
833   { (uint8_t *)STRING_UTF_RIGHTPAR,                4, PSO_OPT, PCRE2_UTF },
834   { (uint8_t *)STRING_UCP_RIGHTPAR,                4, PSO_OPT, PCRE2_UCP },
835   { (uint8_t *)STRING_NOTEMPTY_RIGHTPAR,           9, PSO_FLG, PCRE2_NOTEMPTY_SET },
836   { (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR,  17, PSO_FLG, PCRE2_NE_ATST_SET },
837   { (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR,   16, PSO_OPT, PCRE2_NO_AUTO_POSSESS },
838   { (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR },
839   { (uint8_t *)STRING_NO_JIT_RIGHTPAR,             7, PSO_FLG, PCRE2_NOJIT },
840   { (uint8_t *)STRING_NO_START_OPT_RIGHTPAR,      13, PSO_OPT, PCRE2_NO_START_OPTIMIZE },
841   { (uint8_t *)STRING_LIMIT_HEAP_EQ,              11, PSO_LIMH, 0 },
842   { (uint8_t *)STRING_LIMIT_MATCH_EQ,             12, PSO_LIMM, 0 },
843   { (uint8_t *)STRING_LIMIT_DEPTH_EQ,             12, PSO_LIMD, 0 },
844   { (uint8_t *)STRING_LIMIT_RECURSION_EQ,         16, PSO_LIMD, 0 },
845   { (uint8_t *)STRING_CR_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_CR },
846   { (uint8_t *)STRING_LF_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_LF },
847   { (uint8_t *)STRING_CRLF_RIGHTPAR,               5, PSO_NL,  PCRE2_NEWLINE_CRLF },
848   { (uint8_t *)STRING_ANY_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_ANY },
849   { (uint8_t *)STRING_NUL_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_NUL },
850   { (uint8_t *)STRING_ANYCRLF_RIGHTPAR,            8, PSO_NL,  PCRE2_NEWLINE_ANYCRLF },
851   { (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_ANYCRLF },
852   { (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_UNICODE }
853 };
854 
855 /* This table is used when converting repeating opcodes into possessified
856 versions as a result of an explicit possessive quantifier such as ++. A zero
857 value means there is no possessified version - in those cases the item in
858 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
859 because all relevant opcodes are less than that. */
860 
861 static const uint8_t opcode_possessify[] = {
862   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
863   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
864 
865   0,                       /* NOTI */
866   OP_POSSTAR, 0,           /* STAR, MINSTAR */
867   OP_POSPLUS, 0,           /* PLUS, MINPLUS */
868   OP_POSQUERY, 0,          /* QUERY, MINQUERY */
869   OP_POSUPTO, 0,           /* UPTO, MINUPTO */
870   0,                       /* EXACT */
871   0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
872 
873   OP_POSSTARI, 0,          /* STARI, MINSTARI */
874   OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
875   OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
876   OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
877   0,                       /* EXACTI */
878   0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
879 
880   OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
881   OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
882   OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
883   OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
884   0,                       /* NOTEXACT */
885   0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
886 
887   OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
888   OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
889   OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
890   OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
891   0,                       /* NOTEXACTI */
892   0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
893 
894   OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
895   OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
896   OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
897   OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
898   0,                       /* TYPEEXACT */
899   0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
900 
901   OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
902   OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
903   OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
904   OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
905   0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
906 
907   0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
908   0, 0,                    /* REF, REFI */
909   0, 0,                    /* DNREF, DNREFI */
910   0, 0                     /* RECURSE, CALLOUT */
911 };
912 
913 
914 #ifdef DEBUG_SHOW_PARSED
915 /*************************************************
916 *     Show the parsed pattern for debugging      *
917 *************************************************/
918 
919 /* For debugging the pre-scan, this code, which outputs the parsed data vector,
920 can be enabled. */
921 
static void show_parsed(compile_block *cb)
{
/* Walks cb->parsed_pattern from its start and writes one line per element to
stderr. The walk ends at META_END, or at the first META code for which there
is no case below. Unsigned values are printed with unsigned conversions and
PCRE2_SIZE offsets with %zu, so that each conversion specifier matches the
type of its argument. */

uint32_t *pptr = cb->parsed_pattern;

for (;;)
  {
  int max, min;
  PCRE2_SIZE offset;
  uint32_t i;
  uint32_t length;
  uint32_t meta_arg = META_DATA(*pptr);

  fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr);

  /* Values below META_END are literal characters; only those in the range
  33 - 127 are echoed. */

  if (*pptr < META_END)
    {
    if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", (int)(*pptr));
    pptr++;
    }

  else switch (META_CODE(*pptr++))
    {
    default:
    fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n");
    return;

    case META_END:
    fprintf(stderr, "META_END\n");
    return;

    case META_CAPTURE:
    fprintf(stderr, "META_CAPTURE %u", meta_arg);
    break;

    case META_RECURSE:
    GETOFFSET(offset, pptr);
    fprintf(stderr, "META_RECURSE %u %zu", meta_arg, offset);
    break;

    /* Small back reference numbers have their offset stored in the compile
    block instead of in the parsed pattern. */

    case META_BACKREF:
    if (meta_arg < 10)
      offset = cb->small_ref_offset[meta_arg];
    else
      GETOFFSET(offset, pptr);
    fprintf(stderr, "META_BACKREF %u %zu", meta_arg, offset);
    break;

    case META_ESCAPE:
    if (meta_arg == ESC_P || meta_arg == ESC_p)
      {
      uint32_t ptype = *pptr >> 16;
      uint32_t pvalue = *pptr++ & 0xffff;
      fprintf(stderr, "META \\%c %u %u", (meta_arg == ESC_P)? 'P':'p',
        ptype, pvalue);
      }
    else
      {
      uint32_t cc;
      /* There's just one escape we might have here that isn't negated in the
      escapes table. */
      if (meta_arg == ESC_g) cc = CHAR_g;
      else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++)
        {
        if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break;
        }
      if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;
      fprintf(stderr, "META \\%c", (int)cc);
      }
    break;

    case META_MINMAX:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}", min, max);
    else
      fprintf(stderr, "META {%d,}", min);
    break;

    case META_MINMAX_QUERY:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}?", min, max);
    else
      fprintf(stderr, "META {%d,}?", min);
    break;

    case META_MINMAX_PLUS:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}+", min, max);
    else
      fprintf(stderr, "META {%d,}+", min);
    break;

    case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break;
    case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break;
    case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break;
    case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break;
    case META_DOT: fprintf(stderr, "META_DOT"); break;
    case META_ASTERISK: fprintf(stderr, "META *"); break;
    case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break;
    case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break;
    case META_PLUS: fprintf(stderr, "META +"); break;
    case META_PLUS_QUERY: fprintf(stderr, "META +?"); break;
    case META_PLUS_PLUS: fprintf(stderr, "META ++"); break;
    case META_QUERY: fprintf(stderr, "META ?"); break;
    case META_QUERY_QUERY: fprintf(stderr, "META ??"); break;
    case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break;

    case META_ATOMIC: fprintf(stderr, "META (?>"); break;
    case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
    case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
    case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
    case META_LOOKAHEAD_NA: fprintf(stderr, "META (*napla:"); break;
    case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;
    case META_KET: fprintf(stderr, "META )"); break;
    case META_ALT: fprintf(stderr, "META | %u", meta_arg); break;

    case META_CLASS: fprintf(stderr, "META ["); break;
    case META_CLASS_NOT: fprintf(stderr, "META [^"); break;
    case META_CLASS_END: fprintf(stderr, "META ]"); break;
    case META_CLASS_EMPTY: fprintf(stderr, "META []"); break;
    case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break;

    case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;
    case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;

    case META_POSIX: fprintf(stderr, "META_POSIX %u", *pptr++); break;
    case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %u", *pptr++); break;

    case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;
    case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;
    case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break;
    case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break;
    case META_SKIP: fprintf(stderr, "META (*SKIP)"); break;
    case META_THEN: fprintf(stderr, "META (*THEN)"); break;

    case META_OPTIONS: fprintf(stderr, "META_OPTIONS 0x%02x", *pptr++); break;

    case META_LOOKBEHIND:
    fprintf(stderr, "META (?<= %u offset=", meta_arg);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_LOOKBEHIND_NA:
    fprintf(stderr, "META (*naplb: %u offset=", meta_arg);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_LOOKBEHINDNOT:
    fprintf(stderr, "META (?<! %u offset=", meta_arg);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_CALLOUT_NUMBER:
    fprintf(stderr, "META (?C%u) next=%u/%u", pptr[2], pptr[0],
       pptr[1]);
    pptr += 3;
    break;

    case META_CALLOUT_STRING:
      {
      uint32_t patoffset = *pptr++;    /* Offset of next pattern item */
      uint32_t patlength = *pptr++;    /* Length of next pattern item */
      fprintf(stderr, "META (?Cstring) length=%u offset=", *pptr++);
      GETOFFSET(offset, pptr);
      fprintf(stderr, "%zu next=%u/%u", offset, patoffset, patlength);
      }
    break;

    case META_RECURSE_BYNAME:
    fprintf(stderr, "META (?(&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_BACKREF_BYNAME:
    fprintf(stderr, "META_BACKREF_BYNAME length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* The number is stored after the offset, so it is fetched by index
    before GETOFFSET() moves the pointer, and skipped afterwards. */

    case META_COND_NUMBER:
    fprintf(stderr, "META_COND_NUMBER %u offset=", pptr[SIZEOFFSET]);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    pptr++;
    break;

    case META_COND_DEFINE:
    fprintf(stderr, "META (?(DEFINE) offset=");
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_COND_VERSION:
    fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
    fprintf(stderr, "%u.", *pptr++);
    fprintf(stderr, "%u)", *pptr++);
    break;

    case META_COND_NAME:
    fprintf(stderr, "META (?(<name>) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_COND_RNAME:
    fprintf(stderr, "META (?(R&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* This is kept as a name, because it might be. */

    case META_COND_RNUMBER:
    fprintf(stderr, "META (?(Rnumber) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* The verbs that take an argument share the code that prints it. */

    case META_MARK:
    fprintf(stderr, "META (*MARK:");
    goto SHOWARG;

    case META_COMMIT_ARG:
    fprintf(stderr, "META (*COMMIT:");
    goto SHOWARG;

    case META_PRUNE_ARG:
    fprintf(stderr, "META (*PRUNE:");
    goto SHOWARG;

    case META_SKIP_ARG:
    fprintf(stderr, "META (*SKIP:");
    goto SHOWARG;

    case META_THEN_ARG:
    fprintf(stderr, "META (*THEN:");
    SHOWARG:
    length = *pptr++;
    for (i = 0; i < length; i++)
      {
      uint32_t cc = *pptr++;
      if (cc > 32 && cc < 128) fprintf(stderr, "%c", (int)cc);
        else fprintf(stderr, "\\x{%x}", cc);
      }
    fprintf(stderr, ") length=%u", length);
    break;
    }
  fprintf(stderr, "\n");
  }

/* Not reached: the loop above is left only by the return statements in it. */
}
1182 #endif  /* DEBUG_SHOW_PARSED */
1183 
1184 
1185 
1186 /*************************************************
1187 *               Copy compiled code               *
1188 *************************************************/
1189 
1190 /* Compiled JIT code cannot be copied, so the new compiled block has no
1191 associated JIT data. */
1192 
1193 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy(const pcre2_code * code)1194 pcre2_code_copy(const pcre2_code *code)
1195 {
1196 PCRE2_SIZE* ref_count;
1197 pcre2_code *newcode;
1198 
1199 if (code == NULL) return NULL;
1200 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1201 if (newcode == NULL) return NULL;
1202 memcpy(newcode, code, code->blocksize);
1203 newcode->executable_jit = NULL;
1204 
1205 /* If the code is one that has been deserialized, increment the reference count
1206 in the decoded tables. */
1207 
1208 if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1209   {
1210   ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1211   (*ref_count)++;
1212   }
1213 
1214 return newcode;
1215 }
1216 
1217 
1218 
1219 /*************************************************
1220 *     Copy compiled code and character tables    *
1221 *************************************************/
1222 
1223 /* Compiled JIT code cannot be copied, so the new compiled block has no
1224 associated JIT data. This version of code_copy also makes a separate copy of
1225 the character tables. */
1226 
1227 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy_with_tables(const pcre2_code * code)1228 pcre2_code_copy_with_tables(const pcre2_code *code)
1229 {
1230 PCRE2_SIZE* ref_count;
1231 pcre2_code *newcode;
1232 uint8_t *newtables;
1233 
1234 if (code == NULL) return NULL;
1235 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1236 if (newcode == NULL) return NULL;
1237 memcpy(newcode, code, code->blocksize);
1238 newcode->executable_jit = NULL;
1239 
1240 newtables = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE),
1241   code->memctl.memory_data);
1242 if (newtables == NULL)
1243   {
1244   code->memctl.free((void *)newcode, code->memctl.memory_data);
1245   return NULL;
1246   }
1247 memcpy(newtables, code->tables, TABLES_LENGTH);
1248 ref_count = (PCRE2_SIZE *)(newtables + TABLES_LENGTH);
1249 *ref_count = 1;
1250 
1251 newcode->tables = newtables;
1252 newcode->flags |= PCRE2_DEREF_TABLES;
1253 return newcode;
1254 }
1255 
1256 
1257 
1258 /*************************************************
1259 *               Free compiled code               *
1260 *************************************************/
1261 
1262 PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_code_free(pcre2_code * code)1263 pcre2_code_free(pcre2_code *code)
1264 {
1265 PCRE2_SIZE* ref_count;
1266 
1267 if (code != NULL)
1268   {
1269   if (code->executable_jit != NULL)
1270     PRIV(jit_free)(code->executable_jit, &code->memctl);
1271 
1272   if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1273     {
1274     /* Decoded tables belong to the codes after deserialization, and they must
1275     be freed when there are no more references to them. The *ref_count should
1276     always be > 0. */
1277 
1278     ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1279     if (*ref_count > 0)
1280       {
1281       (*ref_count)--;
1282       if (*ref_count == 0)
1283         code->memctl.free((void *)code->tables, code->memctl.memory_data);
1284       }
1285     }
1286 
1287   code->memctl.free(code, code->memctl.memory_data);
1288   }
1289 }
1290 
1291 
1292 
1293 /*************************************************
1294 *         Read a number, possibly signed         *
1295 *************************************************/
1296 
1297 /* This function is used to read numbers in the pattern. The initial pointer
1298 must be the sign or first digit of the number. When relative values (introduced
1299 by + or -) are allowed, they are relative group numbers, and the result must be
1300 greater than zero.
1301 
1302 Arguments:
1303   ptrptr      points to the character pointer variable
1304   ptrend      points to the end of the input string
1305   allow_sign  if < 0, sign not allowed; if >= 0, sign is relative to this
1306   max_value   the largest number allowed
1307   max_error   the error to give for an over-large number
1308   intptr      where to put the result
  errorcodeptr  where to put an error code

Returns:      TRUE  - a number was read
              FALSE - *errorcodeptr == 0 => no number was found
                      *errorcodeptr != 0 => an error occurred
1314 */
1315 
1316 static BOOL
read_number(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,int32_t allow_sign,uint32_t max_value,uint32_t max_error,int * intptr,int * errorcodeptr)1317 read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,
1318   uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr)
1319 {
1320 int sign = 0;
1321 uint32_t n = 0;
1322 PCRE2_SPTR ptr = *ptrptr;
1323 BOOL yield = FALSE;
1324 
1325 *errorcodeptr = 0;
1326 
1327 if (allow_sign >= 0 && ptr < ptrend)
1328   {
1329   if (*ptr == CHAR_PLUS)
1330     {
1331     sign = +1;
1332     max_value -= allow_sign;
1333     ptr++;
1334     }
1335   else if (*ptr == CHAR_MINUS)
1336     {
1337     sign = -1;
1338     ptr++;
1339     }
1340   }
1341 
1342 if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE;
1343 while (ptr < ptrend && IS_DIGIT(*ptr))
1344   {
1345   n = n * 10 + *ptr++ - CHAR_0;
1346   if (n > max_value)
1347     {
1348     *errorcodeptr = max_error;
1349     goto EXIT;
1350     }
1351   }
1352 
1353 if (allow_sign >= 0 && sign != 0)
1354   {
1355   if (n == 0)
1356     {
1357     *errorcodeptr = ERR26;  /* +0 and -0 are not allowed */
1358     goto EXIT;
1359     }
1360 
1361   if (sign > 0) n += allow_sign;
1362   else if ((int)n > allow_sign)
1363     {
1364     *errorcodeptr = ERR15;  /* Non-existent subpattern */
1365     goto EXIT;
1366     }
1367   else n = allow_sign + 1 - n;
1368   }
1369 
1370 yield = TRUE;
1371 
1372 EXIT:
1373 *intptr = n;
1374 *ptrptr = ptr;
1375 return yield;
1376 }
1377 
1378 
1379 
1380 /*************************************************
1381 *         Read repeat counts                     *
1382 *************************************************/
1383 
1384 /* Read an item of the form {n,m} and return the values if non-NULL pointers
1385 are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a
1386 larger value is used for "unlimited". We have to use signed arguments for
1387 read_number() because it is capable of returning a signed value.
1388 
1389 Arguments:
  ptrptr         points to pointer to character after '{'
  ptrend         pointer to end of input
  minp           if not NULL, pointer to uint32_t for min
  maxp           if not NULL, pointer to uint32_t for max; set to
                 REPEAT_UNLIMITED if no max is given
1395   errorcodeptr   points to error code variable
1396 
1397 Returns:         FALSE if not a repeat quantifier, errorcode set zero
1398                  FALSE on error, with errorcode set non-zero
1399                  TRUE on success, with pointer updated to point after '}'
1400 */
1401 
1402 static BOOL
read_repeat_counts(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,uint32_t * minp,uint32_t * maxp,int * errorcodeptr)1403 read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp,
1404   uint32_t *maxp, int *errorcodeptr)
1405 {
1406 PCRE2_SPTR p;
1407 BOOL yield = FALSE;
1408 BOOL had_comma = FALSE;
1409 int32_t min = 0;
1410 int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */
1411 
1412 /* Check the syntax */
1413 
1414 *errorcodeptr = 0;
1415 for (p = *ptrptr;; p++)
1416   {
1417   uint32_t c;
1418   if (p >= ptrend) return FALSE;
1419   c = *p;
1420   if (IS_DIGIT(c)) continue;
1421   if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1422   if (c == CHAR_COMMA)
1423     {
1424     if (had_comma) return FALSE;
1425     had_comma = TRUE;
1426     }
1427   else return FALSE;
1428   }
1429 
1430 /* The only error from read_number() is for a number that is too big. */
1431 
1432 p = *ptrptr;
1433 if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))
1434   goto EXIT;
1435 
1436 if (*p == CHAR_RIGHT_CURLY_BRACKET)
1437   {
1438   p++;
1439   max = min;
1440   }
1441 else
1442   {
1443   if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1444     {
1445     if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max,
1446         errorcodeptr))
1447       goto EXIT;
1448     if (max < min)
1449       {
1450       *errorcodeptr = ERR4;
1451       goto EXIT;
1452       }
1453     }
1454   p++;
1455   }
1456 
1457 yield = TRUE;
1458 if (minp != NULL) *minp = (uint32_t)min;
1459 if (maxp != NULL) *maxp = (uint32_t)max;
1460 
1461 /* Update the pattern pointer */
1462 
1463 EXIT:
1464 *ptrptr = p;
1465 return yield;
1466 }
1467 
1468 
1469 
1470 /*************************************************
1471 *            Handle escapes                      *
1472 *************************************************/
1473 
1474 /* This function is called when a \ has been encountered. It either returns a
1475 positive value for a simple escape such as \d, or 0 for a data character, which
1476 is placed in chptr. A backreference to group n is returned as negative n. On
1477 entry, ptr is pointing at the character after \. On exit, it points after the
1478 final code unit of the escape sequence.
1479 
1480 This function is also called from pcre2_substitute() to handle escape sequences
1481 in replacement strings. In this case, the cb argument is NULL, and in the case
1482 of escapes that have further processing, only sequences that define a data
1483 character are recognised. The isclass argument is not relevant; the options
1484 argument is the final value of the compiled pattern's options.
1485 
1486 Arguments:
1487   ptrptr         points to the input position pointer
1488   ptrend         points to the end of the input
1489   chptr          points to a returned data character
1490   errorcodeptr   points to the errorcode variable (containing zero)
  options        the current options bits
  extra_options  the current extra options bits (PCRE2_EXTRA_xxx)
1492   isclass        TRUE if inside a character class
1493   cb             compile data block or NULL when called from pcre2_substitute()
1494 
1495 Returns:         zero => a data character
1496                  positive => a special escape sequence
1497                  negative => a numerical back reference
1498                  on error, errorcodeptr is set non-zero
1499 */
1500 
1501 int
PRIV(check_escape)1502 PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
1503   int *errorcodeptr, uint32_t options, uint32_t extra_options, BOOL isclass,
1504   compile_block *cb)
1505 {
1506 BOOL utf = (options & PCRE2_UTF) != 0;
1507 PCRE2_SPTR ptr = *ptrptr;
1508 uint32_t c, cc;
1509 int escape = 0;
1510 int i;
1511 
1512 /* If backslash is at the end of the string, it's an error. */
1513 
1514 if (ptr >= ptrend)
1515   {
1516   *errorcodeptr = ERR1;
1517   return 0;
1518   }
1519 
1520 GETCHARINCTEST(c, ptr);         /* Get character value, increment pointer */
1521 *errorcodeptr = 0;              /* Be optimistic */
1522 
1523 /* Non-alphanumerics are literals, so we just leave the value in c. An initial
1524 value test saves a memory lookup for code points outside the alphanumeric
1525 range. */
1526 
1527 if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {}  /* Definitely literal */
1528 
1529 /* Otherwise, do a table lookup. Non-zero values need little processing here. A
1530 positive value is a literal value for something like \n. A negative value is
1531 the negation of one of the ESC_ macros that is passed back for handling by the
1532 calling function. Some extra checking is needed for \N because only \N{U+dddd}
1533 is supported. If the value is zero, further processing is handled below. */
1534 
1535 else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
1536   {
1537   if (i > 0)
1538     {
1539     c = (uint32_t)i;
1540     if (c == CHAR_CR && (extra_options & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
1541       c = CHAR_LF;
1542     }
1543   else  /* Negative table entry */
1544     {
1545     escape = -i;                    /* Else return a special escape */
1546     if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))
1547       cb->external_flags |= PCRE2_HASBKPORX;   /* Note \P, \p, or \X */
1548 
1549     /* Perl supports \N{name} for character names and \N{U+dddd} for numerical
1550     Unicode code points, as well as plain \N for "not newline". PCRE does not
1551     support \N{name}. However, it does support quantification such as \N{2,3},
1552     so if \N{ is not followed by U+dddd we check for a quantifier. */
1553 
1554     if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1555       {
1556       PCRE2_SPTR p = ptr + 1;
1557 
1558       /* \N{U+ can be handled by the \x{ code. However, this construction is
1559       not valid in EBCDIC environments because it specifies a Unicode
1560       character, not a codepoint in the local code. For example \N{U+0041}
1561       must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode
1562       casing semantics for the entire pattern, so allow it only in UTF (i.e.
1563       Unicode) mode. */
1564 
1565       if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)
1566         {
1567 #ifdef EBCDIC
1568         *errorcodeptr = ERR93;
1569 #else
1570         if (utf)
1571           {
1572           ptr = p + 1;
1573           escape = 0;   /* Not a fancy escape after all */
1574           goto COME_FROM_NU;
1575           }
1576         else *errorcodeptr = ERR93;
1577 #endif
1578         }
1579 
1580       /* Give an error if what follows is not a quantifier, but don't override
1581       an error set by the quantifier reader (e.g. number overflow). */
1582 
1583       else
1584         {
1585         if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&
1586              *errorcodeptr == 0)
1587           *errorcodeptr = ERR37;
1588         }
1589       }
1590     }
1591   }
1592 
1593 /* Escapes that need further processing, including those that are unknown, have
1594 a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
1595 \o, and \x are recognized (\u and \U can never appear as they are used for case
1596 forcing). */
1597 
1598 else
1599   {
1600   int s;
1601   PCRE2_SPTR oldptr;
1602   BOOL overflow;
1603   BOOL alt_bsux =
1604     ((options & PCRE2_ALT_BSUX) | (extra_options & PCRE2_EXTRA_ALT_BSUX)) != 0;
1605 
1606   /* Filter calls from pcre2_substitute(). */
1607 
1608   if (cb == NULL)
1609     {
1610     if (c != CHAR_c && c != CHAR_o && c != CHAR_x)
1611       {
1612       *errorcodeptr = ERR3;
1613       return 0;
1614       }
1615     alt_bsux = FALSE;   /* Do not modify \x handling */
1616     }
1617 
1618   switch (c)
1619     {
1620     /* A number of Perl escapes are not handled by PCRE. We give an explicit
1621     error. */
1622 
1623     case CHAR_F:
1624     case CHAR_l:
1625     case CHAR_L:
1626     *errorcodeptr = ERR37;
1627     break;
1628 
1629     /* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX
1630     is set. Otherwise, \u must be followed by exactly four hex digits or, if
1631     PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.
1632     Otherwise it is a lowercase u letter. This gives some compatibility with
1633     ECMAScript (aka JavaScript). */
1634 
1635     case CHAR_u:
1636     if (!alt_bsux) *errorcodeptr = ERR37; else
1637       {
1638       uint32_t xc;
1639 
1640       if (ptr >= ptrend) break;
1641       if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
1642           (extra_options & PCRE2_EXTRA_ALT_BSUX) != 0)
1643         {
1644         PCRE2_SPTR hptr = ptr + 1;
1645         cc = 0;
1646 
1647         while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
1648           {
1649           if ((cc & 0xf0000000) != 0)  /* Test for 32-bit overflow */
1650             {
1651             *errorcodeptr = ERR77;
1652             ptr = hptr;   /* Show where */
1653             break;        /* *hptr != } will cause another break below */
1654             }
1655           cc = (cc << 4) | xc;
1656           hptr++;
1657           }
1658 
1659         if (hptr == ptr + 1 ||   /* No hex digits */
1660             hptr >= ptrend ||    /* Hit end of input */
1661             *hptr != CHAR_RIGHT_CURLY_BRACKET)  /* No } terminator */
1662           break;         /* Hex escape not recognized */
1663 
1664         c = cc;          /* Accept the code point */
1665         ptr = hptr + 1;
1666         }
1667 
1668       else  /* Must be exactly 4 hex digits */
1669         {
1670         if (ptrend - ptr < 4) break;               /* Less than 4 chars */
1671         if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
1672         if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
1673         cc = (cc << 4) | xc;
1674         if ((xc = XDIGIT(ptr[2])) == 0xff) break;  /* Not a hex digit */
1675         cc = (cc << 4) | xc;
1676         if ((xc = XDIGIT(ptr[3])) == 0xff) break;  /* Not a hex digit */
1677         c = (cc << 4) | xc;
1678         ptr += 4;
1679         }
1680 
1681       if (utf)
1682         {
1683         if (c > 0x10ffffU) *errorcodeptr = ERR77;
1684         else
1685           if (c >= 0xd800 && c <= 0xdfff &&
1686               (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1687                 *errorcodeptr = ERR73;
1688         }
1689       else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
1690       }
1691     break;
1692 
1693     /* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set,
1694     in which case it is an upper case letter. */
1695 
1696     case CHAR_U:
1697     if (!alt_bsux) *errorcodeptr = ERR37;
1698     break;
1699 
1700     /* In a character class, \g is just a literal "g". Outside a character
1701     class, \g must be followed by one of a number of specific things:
1702 
1703     (1) A number, either plain or braced. If positive, it is an absolute
1704     backreference. If negative, it is a relative backreference. This is a Perl
1705     5.10 feature.
1706 
1707     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1708     is part of Perl's movement towards a unified syntax for back references. As
1709     this is synonymous with \k{name}, we fudge it up by pretending it really
1710     was \k{name}.
1711 
1712     (3) For Oniguruma compatibility we also support \g followed by a name or a
1713     number either in angle brackets or in single quotes. However, these are
1714     (possibly recursive) subroutine calls, _not_ backreferences. We return
1715     the ESC_g code.
1716 
1717     Summary: Return a negative number for a numerical back reference, ESC_k for
1718     a named back reference, and ESC_g for a named or numbered subroutine call.
1719     */
1720 
1721     case CHAR_g:
1722     if (isclass) break;
1723 
1724     if (ptr >= ptrend)
1725       {
1726       *errorcodeptr = ERR57;
1727       break;
1728       }
1729 
1730     if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE)
1731       {
1732       escape = ESC_g;
1733       break;
1734       }
1735 
1736     /* If there is a brace delimiter, try to read a numerical reference. If
1737     there isn't one, assume we have a name and treat it as \k. */
1738 
1739     if (*ptr == CHAR_LEFT_CURLY_BRACKET)
1740       {
1741       PCRE2_SPTR p = ptr + 1;
1742       if (!read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1743           errorcodeptr))
1744         {
1745         if (*errorcodeptr == 0) escape = ESC_k;  /* No number found */
1746         break;
1747         }
1748       if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1749         {
1750         *errorcodeptr = ERR57;
1751         break;
1752         }
1753       ptr = p + 1;
1754       }
1755 
1756     /* Read an undelimited number */
1757 
1758     else
1759       {
1760       if (!read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1761           errorcodeptr))
1762         {
1763         if (*errorcodeptr == 0) *errorcodeptr = ERR57;  /* No number found */
1764         break;
1765         }
1766       }
1767 
1768     if (s <= 0)
1769       {
1770       *errorcodeptr = ERR15;
1771       break;
1772       }
1773 
1774     escape = -s;
1775     break;
1776 
1777     /* The handling of escape sequences consisting of a string of digits
1778     starting with one that is not zero is not straightforward. Perl has changed
1779     over the years. Nowadays \g{} for backreferences and \o{} for octal are
1780     recommended to avoid the ambiguities in the old syntax.
1781 
1782     Outside a character class, the digits are read as a decimal number. If the
1783     number is less than 10, or if there are that many previous extracting left
1784     brackets, it is a back reference. Otherwise, up to three octal digits are
1785     read to form an escaped character code. Thus \123 is likely to be octal 123
1786     (cf \0123, which is octal 012 followed by the literal 3).
1787 
1788     Inside a character class, \ followed by a digit is always either a literal
1789     8 or 9 or an octal number. */
1790 
1791     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1792     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1793 
1794     if (!isclass)
1795       {
1796       oldptr = ptr;
1797       ptr--;   /* Back to the digit */
1798 
1799       /* As we know we are at a digit, the only possible error from
1800       read_number() is a number that is too large to be a group number. In this
1801       case we fall through handle this as not a group reference. If we have
1802       read a small enough number, check for a back reference.
1803 
1804       \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
1805       are octal escapes if there are not that many previous captures. */
1806 
1807       if (read_number(&ptr, ptrend, -1, INT_MAX/10 - 1, 0, &s, errorcodeptr) &&
1808           (s < 10 || oldptr[-1] >= CHAR_8 || s <= (int)cb->bracount))
1809         {
1810         if (s > (int)MAX_GROUP_NUMBER) *errorcodeptr = ERR61;
1811           else escape = -s;     /* Indicates a back reference */
1812         break;
1813         }
1814 
1815       ptr = oldptr;      /* Put the pointer back and fall through */
1816       }
1817 
1818     /* Handle a digit following \ when the number is not a back reference, or
1819     we are within a character class. If the first digit is 8 or 9, Perl used to
1820     generate a binary zero and then treat the digit as a following literal. At
1821     least by Perl 5.18 this changed so as not to insert the binary zero. */
1822 
1823     if (c >= CHAR_8) break;
1824 
1825     /* Fall through */
1826 
1827     /* \0 always starts an octal number, but we may drop through to here with a
1828     larger first octal digit. The original code used just to take the least
1829     significant 8 bits of octal numbers (I think this is what early Perls used
1830     to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1831     but no more than 3 octal digits. */
1832 
1833     case CHAR_0:
1834     c -= CHAR_0;
1835     while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1836         c = c * 8 + *ptr++ - CHAR_0;
1837 #if PCRE2_CODE_UNIT_WIDTH == 8
1838     if (!utf && c > 0xff) *errorcodeptr = ERR51;
1839 #endif
1840     break;
1841 
1842     /* \o is a relatively new Perl feature, supporting a more general way of
1843     specifying character codes in octal. The only supported form is \o{ddd}. */
1844 
1845     case CHAR_o:
1846     if (ptr >= ptrend || *ptr++ != CHAR_LEFT_CURLY_BRACKET)
1847       {
1848       ptr--;
1849       *errorcodeptr = ERR55;
1850       }
1851     else if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1852       *errorcodeptr = ERR78;
1853     else
1854       {
1855       c = 0;
1856       overflow = FALSE;
1857       while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1858         {
1859         cc = *ptr++;
1860         if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1861 #if PCRE2_CODE_UNIT_WIDTH == 32
1862         if (c >= 0x20000000l) { overflow = TRUE; break; }
1863 #endif
1864         c = (c << 3) + (cc - CHAR_0);
1865 #if PCRE2_CODE_UNIT_WIDTH == 8
1866         if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1867 #elif PCRE2_CODE_UNIT_WIDTH == 16
1868         if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1869 #elif PCRE2_CODE_UNIT_WIDTH == 32
1870         if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1871 #endif
1872         }
1873       if (overflow)
1874         {
1875         while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1876         *errorcodeptr = ERR34;
1877         }
1878       else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1879         {
1880         if (utf && c >= 0xd800 && c <= 0xdfff &&
1881             (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1882           {
1883           ptr--;
1884           *errorcodeptr = ERR73;
1885           }
1886         }
1887       else
1888         {
1889         ptr--;
1890         *errorcodeptr = ERR64;
1891         }
1892       }
1893     break;
1894 
1895     /* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed
1896     by two hexadecimal digits. Otherwise it is a lowercase x letter. */
1897 
1898     case CHAR_x:
1899     if (alt_bsux)
1900       {
1901       uint32_t xc;
1902       if (ptrend - ptr < 2) break;               /* Less than 2 characters */
1903       if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
1904       if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
1905       c = (cc << 4) | xc;
1906       ptr += 2;
1907       }
1908 
1909     /* Handle \x in Perl's style. \x{ddd} is a character code which can be
1910     greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
1911     digits. If not, { used to be treated as a data character. However, Perl
1912     seems to read hex digits up to the first non-such, and ignore the rest, so
1913     that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1914     now gives an error. */
1915 
1916     else
1917       {
1918       if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1919         {
1920 #ifndef EBCDIC
1921         COME_FROM_NU:
1922 #endif
1923         if (++ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1924           {
1925           *errorcodeptr = ERR78;
1926           break;
1927           }
1928         c = 0;
1929         overflow = FALSE;
1930 
1931         while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff)
1932           {
1933           ptr++;
1934           if (c == 0 && cc == 0) continue;   /* Leading zeroes */
1935 #if PCRE2_CODE_UNIT_WIDTH == 32
1936           if (c >= 0x10000000l) { overflow = TRUE; break; }
1937 #endif
1938           c = (c << 4) | cc;
1939           if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
1940             {
1941             overflow = TRUE;
1942             break;
1943             }
1944           }
1945 
1946         if (overflow)
1947           {
1948           while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
1949           *errorcodeptr = ERR34;
1950           }
1951         else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1952           {
1953           if (utf && c >= 0xd800 && c <= 0xdfff &&
1954               (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1955             {
1956             ptr--;
1957             *errorcodeptr = ERR73;
1958             }
1959           }
1960 
1961         /* If the sequence of hex digits does not end with '}', give an error.
1962         We used just to recognize this construct and fall through to the normal
1963         \x handling, but nowadays Perl gives an error, which seems much more
1964         sensible, so we do too. */
1965 
1966         else
1967           {
1968           ptr--;
1969           *errorcodeptr = ERR67;
1970           }
1971         }   /* End of \x{} processing */
1972 
1973       /* Read a up to two hex digits after \x */
1974 
1975       else
1976         {
1977         c = 0;
1978         if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break;  /* Not a hex digit */
1979         ptr++;
1980         c = cc;
1981         if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break;  /* Not a hex digit */
1982         ptr++;
1983         c = (c << 4) | cc;
1984         }     /* End of \xdd handling */
1985       }       /* End of Perl-style \x handling */
1986     break;
1987 
1988     /* The handling of \c is different in ASCII and EBCDIC environments. In an
1989     ASCII (or Unicode) environment, an error is given if the character
1990     following \c is not a printable ASCII character. Otherwise, the following
1991     character is upper-cased if it is a letter, and after that the 0x40 bit is
1992     flipped. The result is the value of the escape.
1993 
1994     In an EBCDIC environment the handling of \c is compatible with the
1995     specification in the perlebcdic document. The following character must be
1996     a letter or one of small number of special characters. These provide a
1997     means of defining the character values 0-31.
1998 
1999     For testing the EBCDIC handling of \c in an ASCII environment, recognize
2000     the EBCDIC value of 'c' explicitly. */
2001 
2002 #if defined EBCDIC && 'a' != 0x81
2003     case 0x83:
2004 #else
2005     case CHAR_c:
2006 #endif
2007     if (ptr >= ptrend)
2008       {
2009       *errorcodeptr = ERR2;
2010       break;
2011       }
2012     c = *ptr;
2013     if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
2014 
2015     /* Handle \c in an ASCII/Unicode environment. */
2016 
2017 #ifndef EBCDIC    /* ASCII/UTF-8 coding */
2018     if (c < 32 || c > 126)  /* Excludes all non-printable ASCII */
2019       {
2020       *errorcodeptr = ERR68;
2021       break;
2022       }
2023     c ^= 0x40;
2024 
2025     /* Handle \c in an EBCDIC environment. The special case \c? is converted to
2026     255 (0xff) or 95 (0x5f) if other characters suggest we are using the
2027     POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.)
2028     The other valid sequences correspond to a list of specific characters. */
2029 
2030 #else
2031     if (c == CHAR_QUESTION_MARK)
2032       c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
2033     else
2034       {
2035       for (i = 0; i < 32; i++)
2036         {
2037         if (c == ebcdic_escape_c[i]) break;
2038         }
2039       if (i < 32) c = i; else *errorcodeptr = ERR68;
2040       }
2041 #endif  /* EBCDIC */
2042 
2043     ptr++;
2044     break;
2045 
2046     /* Any other alphanumeric following \ is an error. Perl gives an error only
2047     if in warning mode, but PCRE doesn't have a warning mode. */
2048 
2049     default:
2050     *errorcodeptr = ERR3;
2051     *ptrptr = ptr - 1;     /* Point to the character at fault */
2052     return 0;
2053     }
2054   }
2055 
2056 /* Set the pointer to the next character before returning. */
2057 
2058 *ptrptr = ptr;
2059 *chptr = c;
2060 return escape;
2061 }
2062 
2063 
2064 
2065 #ifdef SUPPORT_UNICODE
2066 /*************************************************
2067 *               Handle \P and \p                 *
2068 *************************************************/
2069 
2070 /* This function is called after \P or \p has been encountered, provided that
2071 PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
2072 contents of ptrptr are pointing after the P or p. On exit, it is left pointing
2073 after the final code unit of the escape sequence.
2074 
2075 Arguments:
2076   ptrptr         the pattern position pointer
2077   negptr         a boolean that is set TRUE for negation else FALSE
2078   ptypeptr       an unsigned int that is set to the type value
2079   pdataptr       an unsigned int that is set to the detailed property value
2080   errorcodeptr   the error code variable
2081   cb             the compile data
2082 
2083 Returns:         TRUE if the type value was found, or FALSE for an invalid type
2084 */
2085 
2086 static BOOL
get_ucp(PCRE2_SPTR * ptrptr,BOOL * negptr,uint16_t * ptypeptr,uint16_t * pdataptr,int * errorcodeptr,compile_block * cb)2087 get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr,
2088   uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)
2089 {
2090 PCRE2_UCHAR c;
2091 PCRE2_SIZE i, bot, top;
2092 PCRE2_SPTR ptr = *ptrptr;
2093 PCRE2_UCHAR name[50];
2094 PCRE2_UCHAR *vptr = NULL;
2095 uint16_t ptscript = PT_NOTSCRIPT;
2096 
2097 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2098 c = *ptr++;
2099 *negptr = FALSE;
2100 
2101 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
2102 negation. */
2103 
2104 if (c == CHAR_LEFT_CURLY_BRACKET)
2105   {
2106   if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2107 
2108   if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
2109     {
2110     *negptr = TRUE;
2111     ptr++;
2112     }
2113 
2114   for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
2115     {
2116     if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2117     c = *ptr++;
2118     while (c == '_' || c == '-' || isspace(c))
2119       {
2120       if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2121       c = *ptr++;
2122       }
2123     if (c == CHAR_NUL) goto ERROR_RETURN;
2124     if (c == CHAR_RIGHT_CURLY_BRACKET) break;
2125     name[i] = tolower(c);
2126     if ((c == ':' || c == '=') && vptr == NULL) vptr = name + i;
2127     }
2128 
2129   if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
2130   name[i] = 0;
2131   }
2132 
2133 /* If { doesn't follow \p or \P there is just one following character, which
2134 must be an ASCII letter. */
2135 
2136 else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0)
2137   {
2138   name[0] = tolower(c);
2139   name[1] = 0;
2140   }
2141 else goto ERROR_RETURN;
2142 
2143 *ptrptr = ptr;
2144 
2145 /* If the property contains ':' or '=' we have class name and value separately
2146 specified. The following are supported:
2147 
2148   . Bidi_Class (synonym bc), for which the property names are "bidi<name>".
2149   . Script (synonym sc) for which the property name is the script name
2150   . Script_Extensions (synonym scx), ditto
2151 
2152 As this is a small number, we currently just check the names directly. If this
2153 grows, a sorted table and a switch will be neater.
2154 
2155 For both the script properties, set a PT_xxx value so that (1) they can be
2156 distinguished and (2) invalid script names that happen to be the name of
2157 another property can be diagnosed. */
2158 
2159 if (vptr != NULL)
2160   {
2161   int offset = 0;
2162   PCRE2_UCHAR sname[8];
2163 
2164   *vptr = 0;   /* Terminate property name */
2165   if (PRIV(strcmp_c8)(name, STRING_bidiclass) == 0 ||
2166       PRIV(strcmp_c8)(name, STRING_bc) == 0)
2167     {
2168     offset = 4;
2169     sname[0] = CHAR_b;
2170     sname[1] = CHAR_i;  /* There is no strcpy_c8 function */
2171     sname[2] = CHAR_d;
2172     sname[3] = CHAR_i;
2173     }
2174 
2175   else if (PRIV(strcmp_c8)(name, STRING_script) == 0 ||
2176            PRIV(strcmp_c8)(name, STRING_sc) == 0)
2177     ptscript = PT_SC;
2178 
2179   else if (PRIV(strcmp_c8)(name, STRING_scriptextensions) == 0 ||
2180            PRIV(strcmp_c8)(name, STRING_scx) == 0)
2181     ptscript = PT_SCX;
2182 
2183   else
2184     {
2185     *errorcodeptr = ERR47;
2186     return FALSE;
2187     }
2188 
2189   /* Adjust the string in name[] as needed */
2190 
2191   memmove(name + offset, vptr + 1, (name + i - vptr)*sizeof(PCRE2_UCHAR));
2192   if (offset != 0) memmove(name, sname, offset*sizeof(PCRE2_UCHAR));
2193   }
2194 
2195 /* Search for a recognized property using binary chop. */
2196 
2197 bot = 0;
2198 top = PRIV(utt_size);
2199 
2200 while (bot < top)
2201   {
2202   int r;
2203   i = (bot + top) >> 1;
2204   r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
2205 
2206   /* When a matching property is found, some extra checking is needed when the
2207   \p{xx:yy} syntax is used and xx is either sc or scx. */
2208 
2209   if (r == 0)
2210     {
2211     *pdataptr = PRIV(utt)[i].value;
2212     if (vptr == NULL || ptscript == PT_NOTSCRIPT)
2213       {
2214       *ptypeptr = PRIV(utt)[i].type;
2215       return TRUE;
2216       }
2217 
2218     switch (PRIV(utt)[i].type)
2219       {
2220       case PT_SC:
2221       *ptypeptr = PT_SC;
2222       return TRUE;
2223 
2224       case PT_SCX:
2225       *ptypeptr = ptscript;
2226       return TRUE;
2227       }
2228 
2229     break;  /* Non-script found */
2230     }
2231 
2232   if (r > 0) bot = i + 1; else top = i;
2233   }
2234 
2235 *errorcodeptr = ERR47;   /* Unrecognized property */
2236 return FALSE;
2237 
2238 ERROR_RETURN:            /* Malformed \P or \p */
2239 *errorcodeptr = ERR46;
2240 *ptrptr = ptr;
2241 return FALSE;
2242 }
2243 #endif
2244 
2245 
2246 
2247 /*************************************************
2248 *           Check for POSIX class syntax         *
2249 *************************************************/
2250 
2251 /* This function is called when the sequence "[:" or "[." or "[=" is
2252 encountered in a character class. It checks whether this is followed by a
2253 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2254 reach an unescaped ']' without the special preceding character, return FALSE.
2255 
2256 Originally, this function only recognized a sequence of letters between the
2257 terminators, but it seems that Perl recognizes any sequence of characters,
2258 though of course unknown POSIX names are subsequently rejected. Perl gives an
2259 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2260 didn't consider this to be a POSIX class. Likewise for [:1234:].
2261 
2262 The problem in trying to be exactly like Perl is in the handling of escapes. We
2263 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2264 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2265 below handles the special cases \\ and \], but does not try to do any other
2266 escape processing. This makes it different from Perl for cases such as
2267 [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
2268 not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
2269 when Perl does, I think.
2270 
2271 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2272 It seems that the appearance of a nested POSIX class supersedes an apparent
2273 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2274 a digit. This is handled by returning FALSE if the start of a new group with
2275 the same terminator is encountered, since the next closing sequence must close
2276 the nested group, not the outer one.
2277 
2278 In Perl, unescaped square brackets may also appear as part of class names. For
2279 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2280 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2281 seem right at all. PCRE does not allow closing square brackets in POSIX class
2282 names.
2283 
2284 Arguments:
2285   ptr      pointer to the character after the initial [ (colon, dot, equals)
2286   ptrend   pointer to the end of the pattern
2287   endptr   where to return a pointer to the terminating ':', '.', or '='
2288 
2289 Returns:   TRUE or FALSE
2290 */
2291 
2292 static BOOL
check_posix_syntax(PCRE2_SPTR ptr,PCRE2_SPTR ptrend,PCRE2_SPTR * endptr)2293 check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)
2294 {
2295 PCRE2_UCHAR terminator;  /* Don't combine these lines; the Solaris cc */
2296 terminator = *ptr++;     /* compiler warns about "non-constant" initializer. */
2297 
2298 for (; ptrend - ptr >= 2; ptr++)
2299   {
2300   if (*ptr == CHAR_BACKSLASH &&
2301       (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
2302     ptr++;
2303 
2304   else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
2305             *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2306 
2307   else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2308     {
2309     *endptr = ptr;
2310     return TRUE;
2311     }
2312   }
2313 
2314 return FALSE;
2315 }
2316 
2317 
2318 
2319 /*************************************************
2320 *          Check POSIX class name                *
2321 *************************************************/
2322 
2323 /* This function is called to check the name given in a POSIX-style class entry
2324 such as [:alnum:].
2325 
2326 Arguments:
2327   ptr        points to the first letter
2328   len        the length of the name
2329 
2330 Returns:     a value representing the name, or -1 if unknown
2331 */
2332 
2333 static int
check_posix_name(PCRE2_SPTR ptr,int len)2334 check_posix_name(PCRE2_SPTR ptr, int len)
2335 {
2336 const char *pn = posix_names;
2337 int yield = 0;
2338 while (posix_name_lengths[yield] != 0)
2339   {
2340   if (len == posix_name_lengths[yield] &&
2341     PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
2342   pn += posix_name_lengths[yield] + 1;
2343   yield++;
2344   }
2345 return -1;
2346 }
2347 
2348 
2349 
2350 /*************************************************
2351 *       Read a subpattern or VERB name           *
2352 *************************************************/
2353 
2354 /* This function is called from parse_regex() below whenever it needs to read
2355 the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial
2356 pointer must be to the character before the name. If that character is '*' we
2357 are reading a verb or alpha assertion name. The pointer is updated to point
2358 after the name, for a VERB or alpha assertion name, or after the name's
2359 terminator for a subpattern name. Returning both the offset and the name
2360 pointer is redundant information, but some callers use one and some the other,
2361 so it is simplest just to return both.
2362 
2363 Arguments:
2364   ptrptr      points to the character pointer variable
2365   ptrend      points to the end of the input string
2366   utf         true if the input is UTF-encoded
2367   terminator  the terminator of a subpattern name must be this
2368   offsetptr   where to put the offset from the start of the pattern
2369   nameptr     where to put a pointer to the name in the input
2370   namelenptr  where to put the length of the name
2371   errcodeptr  where to put an error code
2372   cb          pointer to the compile data block
2373 
2374 Returns:    TRUE if a name was read
2375             FALSE otherwise, with error code set
2376 */
2377 
2378 static BOOL
read_name(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,BOOL utf,uint32_t terminator,PCRE2_SIZE * offsetptr,PCRE2_SPTR * nameptr,uint32_t * namelenptr,int * errorcodeptr,compile_block * cb)2379 read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator,
2380   PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,
2381   int *errorcodeptr, compile_block *cb)
2382 {
2383 PCRE2_SPTR ptr = *ptrptr;
2384 BOOL is_group = (*ptr != CHAR_ASTERISK);
2385 
2386 if (++ptr >= ptrend)               /* No characters in name */
2387   {
2388   *errorcodeptr = is_group? ERR62: /* Subpattern name expected */
2389                             ERR60; /* Verb not recognized or malformed */
2390   goto FAILED;
2391   }
2392 
2393 *nameptr = ptr;
2394 *offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
2395 
2396 /* In UTF mode, a group name may contain letters and decimal digits as defined
2397 by Unicode properties, and underscores, but must not start with a digit. */
2398 
2399 #ifdef SUPPORT_UNICODE
2400 if (utf && is_group)
2401   {
2402   uint32_t c, type;
2403 
2404   GETCHAR(c, ptr);
2405   type = UCD_CHARTYPE(c);
2406 
2407   if (type == ucp_Nd)
2408     {
2409     *errorcodeptr = ERR44;
2410     goto FAILED;
2411     }
2412 
2413   for(;;)
2414     {
2415     if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
2416         c != CHAR_UNDERSCORE) break;
2417     ptr++;
2418     FORWARDCHARTEST(ptr, ptrend);
2419     if (ptr >= ptrend) break;
2420     GETCHAR(c, ptr);
2421     type = UCD_CHARTYPE(c);
2422     }
2423   }
2424 else
2425 #else
2426 (void)utf;  /* Avoid compiler warning */
2427 #endif      /* SUPPORT_UNICODE */
2428 
2429 /* Handle non-group names and group names in non-UTF modes. A group name must
2430 not start with a digit. If either of the others start with a digit it just
2431 won't be recognized. */
2432 
2433   {
2434   if (is_group && IS_DIGIT(*ptr))
2435     {
2436     *errorcodeptr = ERR44;
2437     goto FAILED;
2438     }
2439 
2440   while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0)
2441     {
2442     ptr++;
2443     }
2444   }
2445 
2446 /* Check name length */
2447 
2448 if (ptr > *nameptr + MAX_NAME_SIZE)
2449   {
2450   *errorcodeptr = ERR48;
2451   goto FAILED;
2452   }
2453 *namelenptr = (uint32_t)(ptr - *nameptr);
2454 
2455 /* Subpattern names must not be empty, and their terminator is checked here.
2456 (What follows a verb or alpha assertion name is checked separately.) */
2457 
2458 if (is_group)
2459   {
2460   if (ptr == *nameptr)
2461     {
2462     *errorcodeptr = ERR62;   /* Subpattern name expected */
2463     goto FAILED;
2464     }
2465   if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator)
2466     {
2467     *errorcodeptr = ERR42;
2468     goto FAILED;
2469     }
2470   ptr++;
2471   }
2472 
2473 *ptrptr = ptr;
2474 return TRUE;
2475 
2476 FAILED:
2477 *ptrptr = ptr;
2478 return FALSE;
2479 }
2480 
2481 
2482 
2483 /*************************************************
2484 *          Manage callouts at start of cycle     *
2485 *************************************************/
2486 
2487 /* At the start of a new item in parse_regex() we are able to record the
2488 details of the previous item in a prior callout, and also to set up an
2489 automatic callout if enabled. Avoid having two adjacent automatic callouts,
2490 which would otherwise happen for items such as \Q that contribute nothing to
2491 the parsed pattern.
2492 
2493 Arguments:
2494   ptr              current pattern pointer
2495   pcalloutptr      points to a pointer to previous callout, or NULL
2496   auto_callout     TRUE if auto_callouts are enabled
2497   parsed_pattern   the parsed pattern pointer
2498   cb               compile block
2499 
2500 Returns: possibly updated parsed_pattern pointer.
2501 */
2502 
2503 static uint32_t *
manage_callouts(PCRE2_SPTR ptr,uint32_t ** pcalloutptr,BOOL auto_callout,uint32_t * parsed_pattern,compile_block * cb)2504 manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,
2505   uint32_t *parsed_pattern, compile_block *cb)
2506 {
2507 uint32_t *previous_callout = *pcalloutptr;
2508 
2509 if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr -
2510   cb->start_pattern - (PCRE2_SIZE)previous_callout[1]);
2511 
2512 if (!auto_callout) previous_callout = NULL; else
2513   {
2514   if (previous_callout == NULL ||
2515       previous_callout != parsed_pattern - 4 ||
2516       previous_callout[3] != 255)
2517     {
2518     previous_callout = parsed_pattern;  /* Set up new automatic callout */
2519     parsed_pattern += 4;
2520     previous_callout[0] = META_CALLOUT_NUMBER;
2521     previous_callout[2] = 0;
2522     previous_callout[3] = 255;
2523     }
2524   previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
2525   }
2526 
2527 *pcalloutptr = previous_callout;
2528 return parsed_pattern;
2529 }
2530 
2531 
2532 
2533 /*************************************************
2534 *      Parse regex and identify named groups     *
2535 *************************************************/
2536 
2537 /* This function is called first of all. It scans the pattern and does two
2538 things: (1) It identifies capturing groups and makes a table of named capturing
2539 groups so that information about them is fully available to both the compiling
2540 scans. (2) It writes a parsed version of the pattern with comments omitted and
2541 escapes processed into the parsed_pattern vector.
2542 
2543 Arguments:
2544   ptr             points to the start of the pattern
2545   options         compiling dynamic options (may change during the scan)
2546   has_lookbehind  points to a boolean, set TRUE if a lookbehind is found
2547   cb              pointer to the compile data block
2548 
2549 Returns:   zero on success or a non-zero error code, with the
2550              error offset placed in the cb field
2551 */
2552 
2553 /* A structure and some flags for dealing with nested groups. */
2554 
typedef struct nest_save nest_save;

/* NOTE(review): the code that fills in these fields is in parse_regex() and is
not shown alongside this definition, so the field notes are inferred from the
names and should be confirmed against that code. */

struct nest_save {
  uint16_t  nest_depth;    /* presumably the nesting depth of the saved group */
  uint16_t  reset_group;   /* presumably a saved capture group number */
  uint16_t  max_group;     /* presumably the highest group number noted */
  uint16_t  flags;         /* presumably a combination of the NSF_ bits below */
  uint32_t  options;       /* presumably the saved options word */
};

/* Flag bits; each one is a distinct single bit. */

#define NSF_RESET          0x0001u
#define NSF_CONDASSERT     0x0002u
#define NSF_ATOMICSR       0x0004u
2566 
2567 /* Options that are changeable within the pattern must be tracked during
2568 parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing,
2569 but all must be tracked so that META_OPTIONS items set the correct values for
2570 the main compiling phase. */
2571 
#define PARSE_TRACKED_OPTIONS \
  (PCRE2_CASELESS        | \
   PCRE2_DOTALL          | \
   PCRE2_DUPNAMES        | \
   PCRE2_EXTENDED        | \
   PCRE2_EXTENDED_MORE   | \
   PCRE2_MULTILINE       | \
   PCRE2_NO_AUTO_CAPTURE | \
   PCRE2_UNGREEDY)

/* States used for analyzing ranges in character classes. The order matters:
the two OK values must be last, so they have the two highest values. */

enum {
  RANGE_NO,
  RANGE_STARTED,
  RANGE_OK_ESCAPED,
  RANGE_OK_LITERAL
};
2580 
/* Only in 32-bit mode can there be literals > META_END. A macro encapsulates
the storing of literal values in the main parsed pattern, where they can always
be quantified. Both variants expand to a single brace-enclosed block, so that
the macro acts as one statement whatever the code unit width. If the 8-bit and
16-bit variant were two bare statements, a use such as

  if (condition) PARSED_LITERAL(c, p);

would set okquantifier unconditionally in those widths, while behaving as
expected in 32-bit mode. The arguments are expanded as written, so they should
be simple variable names. */

#if PCRE2_CODE_UNIT_WIDTH == 32
#define PARSED_LITERAL(c, p) \
  { \
  if (c >= META_END) *p++ = META_BIGVALUE; \
  *p++ = c; \
  okquantifier = TRUE; \
  }
#else
#define PARSED_LITERAL(c, p) \
  { \
  *p++ = c; \
  okquantifier = TRUE; \
  }
#endif
2595 
2596 /* Here's the actual function. */
2597 
parse_regex(PCRE2_SPTR ptr,uint32_t options,BOOL * has_lookbehind,compile_block * cb)2598 static int parse_regex(PCRE2_SPTR ptr, uint32_t options, BOOL *has_lookbehind,
2599   compile_block *cb)
2600 {
2601 uint32_t c;
2602 uint32_t delimiter;
2603 uint32_t namelen;
2604 uint32_t class_range_state;
2605 uint32_t *verblengthptr = NULL;     /* Value avoids compiler warning */
2606 uint32_t *verbstartptr = NULL;
2607 uint32_t *previous_callout = NULL;
2608 uint32_t *parsed_pattern = cb->parsed_pattern;
2609 uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
2610 uint32_t meta_quantifier = 0;
2611 uint32_t add_after_mark = 0;
2612 uint32_t extra_options = cb->cx->extra_options;
2613 uint16_t nest_depth = 0;
2614 int after_manual_callout = 0;
2615 int expect_cond_assert = 0;
2616 int errorcode = 0;
2617 int escape;
2618 int i;
2619 BOOL inescq = FALSE;
2620 BOOL inverbname = FALSE;
2621 BOOL utf = (options & PCRE2_UTF) != 0;
2622 BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;
2623 BOOL isdupname;
2624 BOOL negate_class;
2625 BOOL okquantifier = FALSE;
2626 PCRE2_SPTR thisptr;
2627 PCRE2_SPTR name;
2628 PCRE2_SPTR ptrend = cb->end_pattern;
2629 PCRE2_SPTR verbnamestart = NULL;    /* Value avoids compiler warning */
2630 named_group *ng;
2631 nest_save *top_nest, *end_nests;
2632 
2633 /* Insert leading items for word and line matching (features provided for the
2634 benefit of pcre2grep). */
2635 
2636 if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
2637   {
2638   *parsed_pattern++ = META_CIRCUMFLEX;
2639   *parsed_pattern++ = META_NOCAPTURE;
2640   }
2641 else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
2642   {
2643   *parsed_pattern++ = META_ESCAPE + ESC_b;
2644   *parsed_pattern++ = META_NOCAPTURE;
2645   }
2646 
2647 /* If the pattern is actually a literal string, process it separately to avoid
2648 cluttering up the main loop. */
2649 
2650 if ((options & PCRE2_LITERAL) != 0)
2651   {
2652   while (ptr < ptrend)
2653     {
2654     if (parsed_pattern >= parsed_pattern_end)
2655       {
2656       errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
2657       goto FAILED;
2658       }
2659     thisptr = ptr;
2660     GETCHARINCTEST(c, ptr);
2661     if (auto_callout)
2662       parsed_pattern = manage_callouts(thisptr, &previous_callout,
2663         auto_callout, parsed_pattern, cb);
2664     PARSED_LITERAL(c, parsed_pattern);
2665     }
2666   goto PARSED_END;
2667   }
2668 
2669 /* Process a real regex which may contain meta-characters. */
2670 
2671 top_nest = NULL;
2672 end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
2673 
2674 /* The size of the nest_save structure might not be a factor of the size of the
2675 workspace. Therefore we must round down end_nests so as to correctly avoid
2676 creating a nest_save that spans the end of the workspace. */
2677 
2678 end_nests = (nest_save *)((char *)end_nests -
2679   ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
2680 
2681 /* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */
2682 
2683 if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;
2684 
2685 /* Now scan the pattern */
2686 
2687 while (ptr < ptrend)
2688   {
2689   int prev_expect_cond_assert;
2690   uint32_t min_repeat, max_repeat;
2691   uint32_t set, unset, *optset;
2692   uint32_t terminator;
2693   uint32_t prev_meta_quantifier;
2694   BOOL prev_okquantifier;
2695   PCRE2_SPTR tempptr;
2696   PCRE2_SIZE offset;
2697 
2698   if (parsed_pattern >= parsed_pattern_end)
2699     {
2700     errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
2701     goto FAILED;
2702     }
2703 
2704   if (nest_depth > cb->cx->parens_nest_limit)
2705     {
2706     errorcode = ERR19;
2707     goto FAILED;        /* Parentheses too deeply nested */
2708     }
2709 
2710   /* Get next input character, save its position for callout handling. */
2711 
2712   thisptr = ptr;
2713   GETCHARINCTEST(c, ptr);
2714 
2715   /* Copy quoted literals until \E, allowing for the possibility of automatic
2716   callouts, except when processing a (*VERB) "name".  */
2717 
2718   if (inescq)
2719     {
2720     if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
2721       {
2722       inescq = FALSE;
2723       ptr++;   /* Skip E */
2724       }
2725     else
2726       {
2727       if (expect_cond_assert > 0)   /* A literal is not allowed if we are */
2728         {                           /* expecting a conditional assertion, */
2729         ptr--;                      /* but an empty \Q\E sequence is OK.  */
2730         errorcode = ERR28;
2731         goto FAILED;
2732         }
2733       if (inverbname)
2734         {                          /* Don't use PARSED_LITERAL() because it */
2735 #if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
2736         if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2737 #endif
2738         *parsed_pattern++ = c;
2739         }
2740       else
2741         {
2742         if (after_manual_callout-- <= 0)
2743           parsed_pattern = manage_callouts(thisptr, &previous_callout,
2744             auto_callout, parsed_pattern, cb);
2745         PARSED_LITERAL(c, parsed_pattern);
2746         }
2747       meta_quantifier = 0;
2748       }
2749     continue;  /* Next character */
2750     }
2751 
2752   /* If we are processing the "name" part of a (*VERB:NAME) item, all
2753   characters up to the closing parenthesis are literals except when
2754   PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q
2755   and \E and escaped characters are allowed (no character types such as \d). If
2756   PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do
2757   this by not entering the special (*VERB:NAME) processing - they are then
2758   picked up below. Note that c is a character, not a code unit, so we must not
2759   use MAX_255 to test its size because MAX_255 tests code units and is assumed
2760   TRUE in 8-bit mode. */
2761 
2762   if (inverbname &&
2763        (
2764         /* EITHER: not both options set */
2765         ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) !=
2766                     (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) ||
2767 #ifdef SUPPORT_UNICODE
2768         /* OR: character > 255 AND not Unicode Pattern White Space */
2769         (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) ||
2770 #endif
2771         /* OR: not a # comment or isspace() white space */
2772         (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0
2773 #ifdef SUPPORT_UNICODE
2774         /* and not CHAR_NEL when Unicode is supported */
2775           && c != CHAR_NEL
2776 #endif
2777        )))
2778     {
2779     PCRE2_SIZE verbnamelength;
2780 
2781     switch(c)
2782       {
2783       default:                     /* Don't use PARSED_LITERAL() because it */
2784 #if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
2785       if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2786 #endif
2787       *parsed_pattern++ = c;
2788       break;
2789 
2790       case CHAR_RIGHT_PARENTHESIS:
2791       inverbname = FALSE;
2792       /* This is the length in characters */
2793       verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);
2794       /* But the limit on the length is in code units */
2795       if (ptr - verbnamestart - 1 > (int)MAX_MARK)
2796         {
2797         ptr--;
2798         errorcode = ERR76;
2799         goto FAILED;
2800         }
2801       *verblengthptr = (uint32_t)verbnamelength;
2802 
2803       /* If this name was on a verb such as (*ACCEPT) which does not continue,
2804       a (*MARK) was generated for the name. We now add the original verb as the
2805       next item. */
2806 
2807       if (add_after_mark != 0)
2808         {
2809         *parsed_pattern++ = add_after_mark;
2810         add_after_mark = 0;
2811         }
2812       break;
2813 
2814       case CHAR_BACKSLASH:
2815       if ((options & PCRE2_ALT_VERBNAMES) != 0)
2816         {
2817         escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
2818           cb->cx->extra_options, FALSE, cb);
2819         if (errorcode != 0) goto FAILED;
2820         }
2821       else escape = 0;   /* Treat all as literal */
2822 
2823       switch(escape)
2824         {
2825         case 0:                    /* Don't use PARSED_LITERAL() because it */
2826 #if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
2827         if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2828 #endif
2829         *parsed_pattern++ = c;
2830         break;
2831 
2832         case ESC_Q:
2833         inescq = TRUE;
2834         break;
2835 
2836         case ESC_E:           /* Ignore */
2837         break;
2838 
2839         default:
2840         errorcode = ERR40;    /* Invalid in verb name */
2841         goto FAILED;
2842         }
2843       }
2844     continue;   /* Next character in pattern */
2845     }
2846 
2847   /* Not a verb name character. At this point we must process everything that
2848   must not change the quantification state. This is mainly comments, but we
2849   handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as
2850   A+, as in Perl. An isolated \E is ignored. */
2851 
2852   if (c == CHAR_BACKSLASH && ptr < ptrend)
2853     {
2854     if (*ptr == CHAR_Q || *ptr == CHAR_E)
2855       {
2856       inescq = *ptr == CHAR_Q;
2857       ptr++;
2858       continue;
2859       }
2860     }
2861 
2862   /* Skip over whitespace and # comments in extended mode. Note that c is a
2863   character, not a code unit, so we must not use MAX_255 to test its size
2864   because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The
2865   whitespace characters are those designated as "Pattern White Space" by
2866   Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is
2867   U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a
2868   subset of space characters that match \h and \v. */
2869 
2870   if ((options & PCRE2_EXTENDED) != 0)
2871     {
2872     if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;
2873 #ifdef SUPPORT_UNICODE
2874     if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue;
2875 #endif
2876     if (c == CHAR_NUMBER_SIGN)
2877       {
2878       while (ptr < ptrend)
2879         {
2880         if (IS_NEWLINE(ptr))      /* For non-fixed-length newline cases, */
2881           {                       /* IS_NEWLINE sets cb->nllen. */
2882           ptr += cb->nllen;
2883           break;
2884           }
2885         ptr++;
2886 #ifdef SUPPORT_UNICODE
2887         if (utf) FORWARDCHARTEST(ptr, ptrend);
2888 #endif
2889         }
2890       continue;  /* Next character in pattern */
2891       }
2892     }
2893 
2894   /* Skip over bracketed comments */
2895 
2896   if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 &&
2897       ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
2898     {
2899     while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);
2900     if (ptr >= ptrend)
2901       {
2902       errorcode = ERR18;  /* A special error for missing ) in a comment */
2903       goto FAILED;        /* to make it easier to debug. */
2904       }
2905     ptr++;
2906     continue;  /* Next character in pattern */
2907     }
2908 
2909   /* If the next item is not a quantifier, fill in length of any previous
2910   callout and create an auto callout if required. */
2911 
2912   if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&
2913        (c != CHAR_LEFT_CURLY_BRACKET ||
2914          (tempptr = ptr,
2915          !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))
2916     {
2917     if (after_manual_callout-- <= 0)
2918       parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,
2919         parsed_pattern, cb);
2920     }
2921 
2922   /* If expect_cond_assert is 2, we have just passed (?( and are expecting an
2923   assertion, possibly preceded by a callout. If the value is 1, we have just
2924   had the callout and expect an assertion. There must be at least 3 more
2925   characters in all cases. When expect_cond_assert is 2, we know that the
2926   current character is an opening parenthesis, as otherwise we wouldn't be
2927   here. However, when it is 1, we need to check, and it's easiest just to check
2928   always. Note that expect_cond_assert may be negative, since all callouts just
2929   decrement it. */
2930 
2931   if (expect_cond_assert > 0)
2932     {
2933     BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
2934               (ptr[0] == CHAR_QUESTION_MARK || ptr[0] == CHAR_ASTERISK);
2935     if (ok)
2936       {
2937       if (ptr[0] == CHAR_ASTERISK)  /* New alpha assertion format, possibly */
2938         {
2939         ok = MAX_255(ptr[1]) && (cb->ctypes[ptr[1]] & ctype_lcletter) != 0;
2940         }
2941       else switch(ptr[1])  /* Traditional symbolic format */
2942         {
2943         case CHAR_C:
2944         ok = expect_cond_assert == 2;
2945         break;
2946 
2947         case CHAR_EQUALS_SIGN:
2948         case CHAR_EXCLAMATION_MARK:
2949         break;
2950 
2951         case CHAR_LESS_THAN_SIGN:
2952         ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;
2953         break;
2954 
2955         default:
2956         ok = FALSE;
2957         }
2958       }
2959 
2960     if (!ok)
2961       {
2962       ptr--;   /* Adjust error offset */
2963       errorcode = ERR28;
2964       goto FAILED;
2965       }
2966     }
2967 
2968   /* Remember whether we are expecting a conditional assertion, and set the
2969   default for this item. */
2970 
2971   prev_expect_cond_assert = expect_cond_assert;
2972   expect_cond_assert = 0;
2973 
2974   /* Remember quantification status for the previous significant item, then set
2975   default for this item. */
2976 
2977   prev_okquantifier = okquantifier;
2978   prev_meta_quantifier = meta_quantifier;
2979   okquantifier = FALSE;
2980   meta_quantifier = 0;
2981 
2982   /* If the previous significant item was a quantifier, adjust the parsed code
2983   if there is a following modifier. The base meta value is always followed by
2984   the PLUS and QUERY values, in that order. We do this here rather than after
2985   reading a quantifier so that intervening comments and /x whitespace can be
2986   ignored without having to replicate code. */
2987 
2988   if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS))
2989     {
2990     parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] =
2991       prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?
2992         0x00020000u : 0x00010000u);
2993     continue;  /* Next character in pattern */
2994     }
2995 
2996 
2997   /* Process the next item in the main part of a pattern. */
2998 
2999   switch(c)
3000     {
3001     default:              /* Non-special character */
3002     PARSED_LITERAL(c, parsed_pattern);
3003     break;
3004 
3005 
3006     /* ---- Escape sequence ---- */
3007 
3008     case CHAR_BACKSLASH:
3009     tempptr = ptr;
3010     escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3011       cb->cx->extra_options, FALSE, cb);
3012     if (errorcode != 0)
3013       {
3014       ESCAPE_FAILED:
3015       if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3016         goto FAILED;
3017       ptr = tempptr;
3018       if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3019         {
3020         GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
3021         }
3022       escape = 0;                 /* Treat as literal character */
3023       }
3024 
3025     /* The escape was a data escape or literal character. */
3026 
3027     if (escape == 0)
3028       {
3029       PARSED_LITERAL(c, parsed_pattern);
3030       }
3031 
3032     /* The escape was a back (or forward) reference. We keep the offset in
3033     order to give a more useful diagnostic for a bad forward reference. For
3034     references to groups numbered less than 10 we can't use more than two items
3035     in parsed_pattern because they may be just two characters in the input (and
3036     in a 64-bit world an offset may need two elements). So for them, the offset
3037     of the first occurrence is held in a special vector. */
3038 
3039     else if (escape < 0)
3040       {
3041       offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 1);
3042       escape = -escape;
3043       *parsed_pattern++ = META_BACKREF | (uint32_t)escape;
3044       if (escape < 10)
3045         {
3046         if (cb->small_ref_offset[escape] == PCRE2_UNSET)
3047           cb->small_ref_offset[escape] = offset;
3048         }
3049       else
3050         {
3051         PUTOFFSET(offset, parsed_pattern);
3052         }
3053       okquantifier = TRUE;
3054       }
3055 
3056     /* The escape was a character class such as \d etc. or other special
3057     escape indicator such as \A or \X. Most of them generate just a single
3058     parsed item, but \P and \p are followed by a 16-bit type and a 16-bit
3059     value. They are supported only when Unicode is available. The type and
3060     value are packed into a single 32-bit value so that the whole sequence
3061     uses only two elements in the parsed_vector. This is because the same
3062     coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is
3063     set.
3064 
3065     There are also some cases where the escape sequence is followed by a name:
3066     \k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>
3067     and \g'name' are subroutine calls by name; \g{name} is a synonym for
3068     \k{name}. Note that \g<number> and \g'number' are handled by check_escape()
3069     and returned as a negative value (handled above). A name is coded as an
3070     offset into the pattern and a length. */
3071 
3072     else switch (escape)
3073       {
3074       case ESC_C:
3075 #ifdef NEVER_BACKSLASH_C
3076       errorcode = ERR85;
3077       goto ESCAPE_FAILED;
3078 #else
3079       if ((options & PCRE2_NEVER_BACKSLASH_C) != 0)
3080         {
3081         errorcode = ERR83;
3082         goto ESCAPE_FAILED;
3083         }
3084 #endif
3085       okquantifier = TRUE;
3086       *parsed_pattern++ = META_ESCAPE + escape;
3087       break;
3088 
3089       case ESC_X:
3090 #ifndef SUPPORT_UNICODE
3091       errorcode = ERR45;   /* Supported only with Unicode support */
3092       goto ESCAPE_FAILED;
3093 #endif
3094       case ESC_H:
3095       case ESC_h:
3096       case ESC_N:
3097       case ESC_R:
3098       case ESC_V:
3099       case ESC_v:
3100       okquantifier = TRUE;
3101       *parsed_pattern++ = META_ESCAPE + escape;
3102       break;
3103 
3104       default:  /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */
3105       *parsed_pattern++ = META_ESCAPE + escape;
3106       break;
3107 
3108       /* Escapes that change in UCP mode. Note that PCRE2_UCP will never be set
3109       without Unicode support because it is checked when pcre2_compile() is
3110       called. */
3111 
3112       case ESC_d:
3113       case ESC_D:
3114       case ESC_s:
3115       case ESC_S:
3116       case ESC_w:
3117       case ESC_W:
3118       okquantifier = TRUE;
3119       if ((options & PCRE2_UCP) == 0)
3120         {
3121         *parsed_pattern++ = META_ESCAPE + escape;
3122         }
3123       else
3124         {
3125         *parsed_pattern++ = META_ESCAPE +
3126           ((escape == ESC_d || escape == ESC_s || escape == ESC_w)?
3127             ESC_p : ESC_P);
3128         switch(escape)
3129           {
3130           case ESC_d:
3131           case ESC_D:
3132           *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
3133           break;
3134 
3135           case ESC_s:
3136           case ESC_S:
3137           *parsed_pattern++ = PT_SPACE << 16;
3138           break;
3139 
3140           case ESC_w:
3141           case ESC_W:
3142           *parsed_pattern++ = PT_WORD << 16;
3143           break;
3144           }
3145         }
3146       break;
3147 
3148       /* Unicode property matching */
3149 
3150       case ESC_P:
3151       case ESC_p:
3152 #ifdef SUPPORT_UNICODE
3153         {
3154         BOOL negated;
3155         uint16_t ptype = 0, pdata = 0;
3156         if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3157           goto ESCAPE_FAILED;
3158         if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3159         *parsed_pattern++ = META_ESCAPE + escape;
3160         *parsed_pattern++ = (ptype << 16) | pdata;
3161         okquantifier = TRUE;
3162         }
3163 #else
3164       errorcode = ERR45;
3165       goto ESCAPE_FAILED;
3166 #endif
3167       break;  /* End \P and \p */
3168 
3169       /* When \g is used with quotes or angle brackets as delimiters, it is a
3170       numerical or named subroutine call, and control comes here. When used
3171       with brace delimiters it is a numerical back reference and does not come
3172       here because check_escape() returns it directly as a reference. \k is
3173       always a named back reference. */
3174 
3175       case ESC_g:
3176       case ESC_k:
3177       if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET &&
3178           *ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE))
3179         {
3180         errorcode = (escape == ESC_g)? ERR57 : ERR69;
3181         goto ESCAPE_FAILED;
3182         }
3183       terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
3184         CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
3185         CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
3186 
3187       /* For a non-braced \g, check for a numerical recursion. */
3188 
3189       if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)
3190         {
3191         PCRE2_SPTR p = ptr + 1;
3192 
3193         if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
3194             &errorcode))
3195           {
3196           if (p >= ptrend || *p != terminator)
3197             {
3198             errorcode = ERR57;
3199             goto ESCAPE_FAILED;
3200             }
3201           ptr = p;
3202           goto SET_RECURSION;
3203           }
3204         if (errorcode != 0) goto ESCAPE_FAILED;
3205         }
3206 
3207       /* Not a numerical recursion */
3208 
3209       if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
3210           &errorcode, cb)) goto ESCAPE_FAILED;
3211 
3212       /* \k and \g when used with braces are back references, whereas \g used
3213       with quotes or angle brackets is a recursion */
3214 
3215       *parsed_pattern++ =
3216         (escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)?
3217           META_BACKREF_BYNAME : META_RECURSE_BYNAME;
3218       *parsed_pattern++ = namelen;
3219 
3220       PUTOFFSET(offset, parsed_pattern);
3221       okquantifier = TRUE;
3222       break;  /* End special escape processing */
3223       }
3224     break;    /* End escape sequence processing */
3225 
3226 
3227     /* ---- Single-character special items ---- */
3228 
3229     case CHAR_CIRCUMFLEX_ACCENT:
3230     *parsed_pattern++ = META_CIRCUMFLEX;
3231     break;
3232 
3233     case CHAR_DOLLAR_SIGN:
3234     *parsed_pattern++ = META_DOLLAR;
3235     break;
3236 
3237     case CHAR_DOT:
3238     *parsed_pattern++ = META_DOT;
3239     okquantifier = TRUE;
3240     break;
3241 
3242 
3243     /* ---- Single-character quantifiers ---- */
3244 
3245     case CHAR_ASTERISK:
3246     meta_quantifier = META_ASTERISK;
3247     goto CHECK_QUANTIFIER;
3248 
3249     case CHAR_PLUS:
3250     meta_quantifier = META_PLUS;
3251     goto CHECK_QUANTIFIER;
3252 
3253     case CHAR_QUESTION_MARK:
3254     meta_quantifier = META_QUERY;
3255     goto CHECK_QUANTIFIER;
3256 
3257 
3258     /* ---- Potential {n,m} quantifier ---- */
3259 
3260     case CHAR_LEFT_CURLY_BRACKET:
3261     if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,
3262         &errorcode))
3263       {
3264       if (errorcode != 0) goto FAILED;     /* Error in quantifier. */
3265       PARSED_LITERAL(c, parsed_pattern);   /* Not a quantifier */
3266       break;                               /* No more quantifier processing */
3267       }
3268     meta_quantifier = META_MINMAX;
3269     /* Fall through */
3270 
3271 
3272     /* ---- Quantifier post-processing ---- */
3273 
3274     /* Check that a quantifier is allowed after the previous item. */
3275 
3276     CHECK_QUANTIFIER:
3277     if (!prev_okquantifier)
3278       {
3279       errorcode = ERR9;
3280       goto FAILED_BACK;
3281       }
3282 
3283     /* Most (*VERB)s are not allowed to be quantified, but an ungreedy
3284     quantifier can be useful for (*ACCEPT) - meaning "succeed on backtrack", a
3285     sort of negated (*COMMIT). We therefore allow (*ACCEPT) to be quantified by
3286     wrapping it in non-capturing brackets, but we have to allow for a preceding
3287     (*MARK) for when (*ACCEPT) has an argument. */
3288 
3289     if (parsed_pattern[-1] == META_ACCEPT)
3290       {
3291       uint32_t *p;
3292       for (p = parsed_pattern - 1; p >= verbstartptr; p--) p[1] = p[0];
3293       *verbstartptr = META_NOCAPTURE;
3294       parsed_pattern[1] = META_KET;
3295       parsed_pattern += 2;
3296       }
3297 
3298     /* Now we can put the quantifier into the parsed pattern vector. At this
3299     stage, we have only the basic quantifier. The check for a following + or ?
3300     modifier happens at the top of the loop, after any intervening comments
3301     have been removed. */
3302 
3303     *parsed_pattern++ = meta_quantifier;
3304     if (c == CHAR_LEFT_CURLY_BRACKET)
3305       {
3306       *parsed_pattern++ = min_repeat;
3307       *parsed_pattern++ = max_repeat;
3308       }
3309     break;
3310 
3311 
3312     /* ---- Character class ---- */
3313 
3314     case CHAR_LEFT_SQUARE_BRACKET:
3315     okquantifier = TRUE;
3316 
3317     /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
3318     used for "start of word" and "end of word". As these are otherwise illegal
3319     sequences, we don't break anything by recognizing them. They are replaced
3320     by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
3321     erroneous and are handled by the normal code below. */
3322 
3323     if (ptrend - ptr >= 6 &&
3324          (PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 ||
3325           PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0))
3326       {
3327       *parsed_pattern++ = META_ESCAPE + ESC_b;
3328 
3329       if (ptr[2] == CHAR_LESS_THAN_SIGN)
3330         {
3331         *parsed_pattern++ = META_LOOKAHEAD;
3332         }
3333       else
3334         {
3335         *parsed_pattern++ = META_LOOKBEHIND;
3336         *has_lookbehind = TRUE;
3337 
3338         /* The offset is used only for the "non-fixed length" error; this won't
3339         occur here, so just store zero. */
3340 
3341         PUTOFFSET((PCRE2_SIZE)0, parsed_pattern);
3342         }
3343 
3344       if ((options & PCRE2_UCP) == 0)
3345         *parsed_pattern++ = META_ESCAPE + ESC_w;
3346       else
3347         {
3348         *parsed_pattern++ = META_ESCAPE + ESC_p;
3349         *parsed_pattern++ = PT_WORD << 16;
3350         }
3351       *parsed_pattern++ = META_KET;
3352       ptr += 6;
3353       break;
3354       }
3355 
3356     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3357     they are encountered at the top level, so we'll do that too. */
3358 
3359     if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3360          *ptr == CHAR_EQUALS_SIGN) &&
3361         check_posix_syntax(ptr, ptrend, &tempptr))
3362       {
3363       errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;
3364       goto FAILED;
3365       }
3366 
3367     /* Process a regular character class. If the first character is '^', set
3368     the negation flag. If the first few characters (either before or after ^)
3369     are \Q\E or \E or space or tab in extended-more mode, we skip them too.
3370     This makes for compatibility with Perl. */
3371 
3372     negate_class = FALSE;
3373     while (ptr < ptrend)
3374       {
3375       GETCHARINCTEST(c, ptr);
3376       if (c == CHAR_BACKSLASH)
3377         {
3378         if (ptr < ptrend && *ptr == CHAR_E) ptr++;
3379         else if (ptrend - ptr >= 3 &&
3380              PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3381           ptr += 3;
3382         else
3383           break;
3384         }
3385       else if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3386                (c == CHAR_SPACE || c == CHAR_HT))  /* Note: just these two */
3387         continue;
3388       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3389         negate_class = TRUE;
3390       else break;
3391       }
3392 
3393     /* Now the real contents of the class; c has the first "real" character.
3394     Empty classes are permitted only if the option is set. */
3395 
3396     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3397         (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)
3398       {
3399       *parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;
3400       break;  /* End of class processing */
3401       }
3402 
3403     /* Process a non-empty class. */
3404 
3405     *parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;
3406     class_range_state = RANGE_NO;
3407 
3408     /* In an EBCDIC environment, Perl treats alphabetic ranges specially
3409     because there are holes in the encoding, and simply using the range A-Z
3410     (for example) would include the characters in the holes. This applies only
3411     to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]
3412     in this respect. In order to accommodate this, we keep track of whether
3413     character values are literal or not, and a state variable for handling
3414     ranges. */
3415 
3416     /* Loop for the contents of the class */
3417 
3418     for (;;)
3419       {
3420       BOOL char_is_literal = TRUE;
3421 
3422       /* Inside \Q...\E everything is literal except \E */
3423 
3424       if (inescq)
3425         {
3426         if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3427           {
3428           inescq = FALSE;                   /* Reset literal state */
3429           ptr++;                            /* Skip the 'E' */
3430           goto CLASS_CONTINUE;
3431           }
3432         goto CLASS_LITERAL;
3433         }
3434 
3435       /* Skip over space and tab (only) in extended-more mode. */
3436 
3437       if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3438           (c == CHAR_SPACE || c == CHAR_HT))
3439         goto CLASS_CONTINUE;
3440 
3441       /* Handle POSIX class names. Perl allows a negation extension of the
3442       form [:^name:]. A square bracket that doesn't match the syntax is
3443       treated as a literal. We also recognize the POSIX constructions
3444       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3445       5.6 and 5.8 do. */
3446 
3447       if (c == CHAR_LEFT_SQUARE_BRACKET &&
3448           ptrend - ptr >= 3 &&
3449           (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3450            *ptr == CHAR_EQUALS_SIGN) &&
3451           check_posix_syntax(ptr, ptrend, &tempptr))
3452         {
3453         BOOL posix_negate = FALSE;
3454         int posix_class;
3455 
3456         /* Perl treats a hyphen before a POSIX class as a literal, not the
3457         start of a range. However, it gives a warning in its warning mode. PCRE
3458         does not have a warning mode, so we give an error, because this is
3459         likely an error on the user's part. */
3460 
3461         if (class_range_state == RANGE_STARTED)
3462           {
3463           errorcode = ERR50;
3464           goto FAILED;
3465           }
3466 
3467         if (*ptr != CHAR_COLON)
3468           {
3469           errorcode = ERR13;
3470           goto FAILED_BACK;
3471           }
3472 
3473         if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
3474           {
3475           posix_negate = TRUE;
3476           ptr++;
3477           }
3478 
3479         posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3480         if (posix_class < 0)
3481           {
3482           errorcode = ERR30;
3483           goto FAILED;
3484           }
3485         ptr = tempptr + 2;
3486 
3487         /* Perl treats a hyphen after a POSIX class as a literal, not the
3488         start of a range. However, it gives a warning in its warning mode
3489         unless the hyphen is the last character in the class. PCRE does not
3490         have a warning mode, so we give an error, because this is likely an
3491         error on the user's part. */
3492 
3493         if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3494             ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3495           {
3496           errorcode = ERR50;
3497           goto FAILED;
3498           }
3499 
3500         /* Set "a hyphen is not the start of a range" for the -] case, and also
3501         in case the POSIX class is followed by \E or \Q\E (possibly repeated -
3502         fuzzers do that kind of thing) and *then* a hyphen. This causes that
3503         hyphen to be treated as a literal. I don't think it's worth setting up
3504         special apparatus to do otherwise. */
3505 
3506         class_range_state = RANGE_NO;
3507 
3508         /* When PCRE2_UCP is set, some of the POSIX classes are converted to
3509         use Unicode properties \p or \P or, in one case, \h or \H. The
3510         substitutes table has two values per class, containing the type and
3511         value of a \p or \P item. The special cases are specified with a
3512         negative type: a non-zero value causes \h or \H to be used, and a zero
3513         value falls through to behave like a non-UCP POSIX class. */
3514 
3515 #ifdef SUPPORT_UNICODE
3516         if ((options & PCRE2_UCP) != 0)
3517           {
3518           int ptype = posix_substitutes[2*posix_class];
3519           int pvalue = posix_substitutes[2*posix_class + 1];
3520           if (ptype >= 0)
3521             {
3522             *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);
3523             *parsed_pattern++ = (ptype << 16) | pvalue;
3524             goto CLASS_CONTINUE;
3525             }
3526 
3527           if (pvalue != 0)
3528             {
3529             *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h);
3530             goto CLASS_CONTINUE;
3531             }
3532 
3533           /* Fall through */
3534           }
3535 #endif  /* SUPPORT_UNICODE */
3536 
3537         /* Non-UCP POSIX class */
3538 
3539         *parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;
3540         *parsed_pattern++ = posix_class;
3541         }
3542 
3543       /* Handle potential start of range */
3544 
3545       else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)
3546         {
3547         *parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?
3548           META_RANGE_LITERAL : META_RANGE_ESCAPED;
3549         class_range_state = RANGE_STARTED;
3550         }
3551 
3552       /* Handle a literal character */
3553 
3554       else if (c != CHAR_BACKSLASH)
3555         {
3556         CLASS_LITERAL:
3557         if (class_range_state == RANGE_STARTED)
3558           {
3559           if (c == parsed_pattern[-2])       /* Optimize one-char range */
3560             parsed_pattern--;
3561           else if (parsed_pattern[-2] > c)   /* Check range is in order */
3562             {
3563             errorcode = ERR8;
3564             goto FAILED_BACK;
3565             }
3566           else
3567             {
3568             if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL)
3569               parsed_pattern[-1] = META_RANGE_ESCAPED;
3570             PARSED_LITERAL(c, parsed_pattern);
3571             }
3572           class_range_state = RANGE_NO;
3573           }
3574         else  /* Potential start of range */
3575           {
3576           class_range_state = char_is_literal?
3577             RANGE_OK_LITERAL : RANGE_OK_ESCAPED;
3578           PARSED_LITERAL(c, parsed_pattern);
3579           }
3580         }
3581 
3582       /* Handle escapes in a class */
3583 
3584       else
3585         {
3586         tempptr = ptr;
3587         escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3588           cb->cx->extra_options, TRUE, cb);
3589 
3590         if (errorcode != 0)
3591           {
3592           if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3593             goto FAILED;
3594           ptr = tempptr;
3595           if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3596             {
3597             GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
3598             }
3599           escape = 0;                 /* Treat as literal character */
3600           }
3601 
3602         switch(escape)
3603           {
3604           case 0:  /* Escaped character code point is in c */
3605           char_is_literal = FALSE;
3606           goto CLASS_LITERAL;
3607 
3608           case ESC_b:
3609           c = CHAR_BS;    /* \b is backspace in a class */
3610           char_is_literal = FALSE;
3611           goto CLASS_LITERAL;
3612 
3613           case ESC_Q:
3614           inescq = TRUE;  /* Enter literal mode */
3615           goto CLASS_CONTINUE;
3616 
3617           case ESC_E:     /* Ignore orphan \E */
3618           goto CLASS_CONTINUE;
3619 
3620           case ESC_B:     /* Always an error in a class */
3621           case ESC_R:
3622           case ESC_X:
3623           errorcode = ERR7;
3624           ptr--;
3625           goto FAILED;
3626           }
3627 
3628         /* The second part of a range can be a single-character escape
3629         sequence (detected above), but not any of the other escapes. Perl
3630         treats a hyphen as a literal in such circumstances. However, in Perl's
3631         warning mode, a warning is given, so PCRE now faults it, as it is
3632         almost certainly a mistake on the user's part. */
3633 
3634         if (class_range_state == RANGE_STARTED)
3635           {
3636           errorcode = ERR50;
3637           goto FAILED;  /* Not CLASS_ESCAPE_FAILED; always an error */
3638           }
3639 
3640         /* Of the remaining escapes, only those that define characters are
3641         allowed in a class. None may start a range. */
3642 
3643         class_range_state = RANGE_NO;
3644         switch(escape)
3645           {
3646           case ESC_N:
3647           errorcode = ERR71;
3648           goto FAILED;
3649 
3650           case ESC_H:
3651           case ESC_h:
3652           case ESC_V:
3653           case ESC_v:
3654           *parsed_pattern++ = META_ESCAPE + escape;
3655           break;
3656 
3657           /* These escapes are converted to Unicode property tests when
3658           PCRE2_UCP is set. */
3659 
3660           case ESC_d:
3661           case ESC_D:
3662           case ESC_s:
3663           case ESC_S:
3664           case ESC_w:
3665           case ESC_W:
3666           if ((options & PCRE2_UCP) == 0)
3667             {
3668             *parsed_pattern++ = META_ESCAPE + escape;
3669             }
3670           else
3671             {
3672             *parsed_pattern++ = META_ESCAPE +
3673               ((escape == ESC_d || escape == ESC_s || escape == ESC_w)?
3674                 ESC_p : ESC_P);
3675             switch(escape)
3676               {
3677               case ESC_d:
3678               case ESC_D:
3679               *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
3680               break;
3681 
3682               case ESC_s:
3683               case ESC_S:
3684               *parsed_pattern++ = PT_SPACE << 16;
3685               break;
3686 
3687               case ESC_w:
3688               case ESC_W:
3689               *parsed_pattern++ = PT_WORD << 16;
3690               break;
3691               }
3692             }
3693           break;
3694 
3695           /* Explicit Unicode property matching */
3696 
3697           case ESC_P:
3698           case ESC_p:
3699 #ifdef SUPPORT_UNICODE
3700             {
3701             BOOL negated;
3702             uint16_t ptype = 0, pdata = 0;
3703             if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3704               goto FAILED;
3705             if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3706             *parsed_pattern++ = META_ESCAPE + escape;
3707             *parsed_pattern++ = (ptype << 16) | pdata;
3708             }
3709 #else
3710           errorcode = ERR45;
3711           goto FAILED;
3712 #endif
3713           break;  /* End \P and \p */
3714 
3715           default:    /* All others are not allowed in a class */
3716           errorcode = ERR7;
3717           ptr--;
3718           goto FAILED;
3719           }
3720 
3721         /* Perl gives a warning unless a following hyphen is the last character
3722         in the class. PCRE throws an error. */
3723 
3724         if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3725             ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3726           {
3727           errorcode = ERR50;
3728           goto FAILED;
3729           }
3730         }
3731 
3732       /* Proceed to next thing in the class. */
3733 
3734       CLASS_CONTINUE:
3735       if (ptr >= ptrend)
3736         {
3737         errorcode = ERR6;  /* Missing terminating ']' */
3738         goto FAILED;
3739         }
3740       GETCHARINCTEST(c, ptr);
3741       if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
3742       }     /* End of class-processing loop */
3743 
3744     /* -] at the end of a class is a literal '-' */
3745 
3746     if (class_range_state == RANGE_STARTED)
3747       {
3748       parsed_pattern[-1] = CHAR_MINUS;
3749       class_range_state = RANGE_NO;
3750       }
3751 
3752     *parsed_pattern++ = META_CLASS_END;
3753     break;  /* End of character class */
3754 
3755 
3756     /* ---- Opening parenthesis ---- */
3757 
3758     case CHAR_LEFT_PARENTHESIS:
3759     if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3760 
3761     /* If ( is not followed by ? it is either a capture or a special verb or an
3762     alpha assertion or a positive non-atomic lookahead. */
3763 
3764     if (*ptr != CHAR_QUESTION_MARK)
3765       {
3766       const char *vn;
3767 
3768       /* Handle capturing brackets (or non-capturing if auto-capture is turned
3769       off). */
3770 
3771       if (*ptr != CHAR_ASTERISK)
3772         {
3773         nest_depth++;
3774         if ((options & PCRE2_NO_AUTO_CAPTURE) == 0)
3775           {
3776           if (cb->bracount >= MAX_GROUP_NUMBER)
3777             {
3778             errorcode = ERR97;
3779             goto FAILED;
3780             }
3781           cb->bracount++;
3782           *parsed_pattern++ = META_CAPTURE | cb->bracount;
3783           }
3784         else *parsed_pattern++ = META_NOCAPTURE;
3785         }
3786 
3787       /* Do nothing for (* followed by end of pattern or ) so it gives a "bad
3788       quantifier" error rather than "(*MARK) must have an argument". */
3789 
3790       else if (ptrend - ptr <= 1 || (c = ptr[1]) == CHAR_RIGHT_PARENTHESIS)
3791         break;
3792 
3793       /* Handle "alpha assertions" such as (*pla:...). Most of these are
3794       synonyms for the historical symbolic assertions, but the script run and
3795       non-atomic lookaround ones are new. They are distinguished by starting
3796       with a lower case letter. Checking both ends of the alphabet makes this
3797       work in all character codes. */
3798 
3799       else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
3800         {
3801         uint32_t meta;
3802 
3803         vn = alasnames;
3804         if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
3805           &errorcode, cb)) goto FAILED;
3806         if (ptr >= ptrend || *ptr != CHAR_COLON)
3807           {
3808           errorcode = ERR95;  /* Malformed */
3809           goto FAILED;
3810           }
3811 
3812         /* Scan the table of alpha assertion names */
3813 
3814         for (i = 0; i < alascount; i++)
3815           {
3816           if (namelen == alasmeta[i].len &&
3817               PRIV(strncmp_c8)(name, vn, namelen) == 0)
3818             break;
3819           vn += alasmeta[i].len + 1;
3820           }
3821 
3822         if (i >= alascount)
3823           {
3824           errorcode = ERR95;  /* Alpha assertion not recognized */
3825           goto FAILED;
3826           }
3827 
3828         /* Check for expecting an assertion condition. If so, only atomic
3829         lookaround assertions are valid. */
3830 
3831         meta = alasmeta[i].meta;
3832         if (prev_expect_cond_assert > 0 &&
3833             (meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT))
3834           {
3835           errorcode = (meta == META_LOOKAHEAD_NA || meta == META_LOOKBEHIND_NA)?
3836             ERR98 : ERR28;  /* (Atomic) assertion expected */
3837           goto FAILED;
3838           }
3839 
3840         /* The lookaround alphabetic synonyms can mostly be handled by jumping
3841         to the code that handles the traditional symbolic forms. */
3842 
3843         switch(meta)
3844           {
3845           default:
3846           errorcode = ERR89;  /* Unknown code; should never occur because */
3847           goto FAILED;        /* the meta values come from a table above. */
3848 
3849           case META_ATOMIC:
3850           goto ATOMIC_GROUP;
3851 
3852           case META_LOOKAHEAD:
3853           goto POSITIVE_LOOK_AHEAD;
3854 
3855           case META_LOOKAHEAD_NA:
3856           goto POSITIVE_NONATOMIC_LOOK_AHEAD;
3857 
3858           case META_LOOKAHEADNOT:
3859           goto NEGATIVE_LOOK_AHEAD;
3860 
3861           case META_LOOKBEHIND:
3862           case META_LOOKBEHINDNOT:
3863           case META_LOOKBEHIND_NA:
3864           *parsed_pattern++ = meta;
3865           ptr--;
3866           goto POST_LOOKBEHIND;
3867 
3868           /* The script run facilities are handled here. Unicode support is
3869           required (give an error if not, as this is a security issue). Always
3870           record a META_SCRIPT_RUN item. Then, for the atomic version, insert
3871           META_ATOMIC and remember that we need two META_KETs at the end. */
3872 
3873           case META_SCRIPT_RUN:
3874           case META_ATOMIC_SCRIPT_RUN:
3875 #ifdef SUPPORT_UNICODE
3876           *parsed_pattern++ = META_SCRIPT_RUN;
3877           nest_depth++;
3878           ptr++;
3879           if (meta == META_ATOMIC_SCRIPT_RUN)
3880             {
3881             *parsed_pattern++ = META_ATOMIC;
3882             if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
3883             else if (++top_nest >= end_nests)
3884               {
3885               errorcode = ERR84;
3886               goto FAILED;
3887               }
3888             top_nest->nest_depth = nest_depth;
3889             top_nest->flags = NSF_ATOMICSR;
3890             top_nest->options = options & PARSE_TRACKED_OPTIONS;
3891             }
3892           break;
3893 #else  /* SUPPORT_UNICODE */
3894           errorcode = ERR96;
3895           goto FAILED;
3896 #endif
3897           }
3898         }
3899 
3900 
3901       /* ---- Handle (*VERB) and (*VERB:NAME) ---- */
3902 
3903       else
3904         {
3905         vn = verbnames;
3906         if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
3907           &errorcode, cb)) goto FAILED;
3908         if (ptr >= ptrend || (*ptr != CHAR_COLON &&
3909                               *ptr != CHAR_RIGHT_PARENTHESIS))
3910           {
3911           errorcode = ERR60;  /* Malformed */
3912           goto FAILED;
3913           }
3914 
3915         /* Scan the table of verb names */
3916 
3917         for (i = 0; i < verbcount; i++)
3918           {
3919           if (namelen == verbs[i].len &&
3920               PRIV(strncmp_c8)(name, vn, namelen) == 0)
3921             break;
3922           vn += verbs[i].len + 1;
3923           }
3924 
3925         if (i >= verbcount)
3926           {
3927           errorcode = ERR60;  /* Verb not recognized */
3928           goto FAILED;
3929           }
3930 
3931         /* An empty argument is treated as no argument. */
3932 
3933         if (*ptr == CHAR_COLON && ptr + 1 < ptrend &&
3934              ptr[1] == CHAR_RIGHT_PARENTHESIS)
3935           ptr++;    /* Advance to the closing parens */
3936 
3937         /* Check for mandatory non-empty argument; this is (*MARK) */
3938 
3939         if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON)
3940           {
3941           errorcode = ERR66;
3942           goto FAILED;
3943           }
3944 
3945         /* Remember where this verb, possibly with a preceding (*MARK), starts,
3946         for handling quantified (*ACCEPT). */
3947 
3948         verbstartptr = parsed_pattern;
3949         okquantifier = (verbs[i].meta == META_ACCEPT);
3950 
3951         /* It appears that Perl allows any characters whatsoever, other than a
3952         closing parenthesis, to appear in arguments ("names"), so we no longer
3953         insist on letters, digits, and underscores. Perl does not, however, do
3954         any interpretation within arguments, and has no means of including a
3955         closing parenthesis. PCRE supports escape processing but only when it
3956         is requested by an option. We set inverbname TRUE here, and let the
3957         main loop take care of this so that escape and \x processing is done by
3958         the main code above. */
3959 
3960         if (*ptr++ == CHAR_COLON)   /* Skip past : or ) */
3961           {
3962           /* Some optional arguments can be treated as a preceding (*MARK) */
3963 
3964           if (verbs[i].has_arg < 0)
3965             {
3966             add_after_mark = verbs[i].meta;
3967             *parsed_pattern++ = META_MARK;
3968             }
3969 
3970           /* The remaining verbs with arguments (except *MARK) need a different
3971           opcode. */
3972 
3973           else
3974             {
3975             *parsed_pattern++ = verbs[i].meta +
3976               ((verbs[i].meta != META_MARK)? 0x00010000u:0);
3977             }
3978 
3979           /* Set up for reading the name in the main loop. */
3980 
3981           verblengthptr = parsed_pattern++;
3982           verbnamestart = ptr;
3983           inverbname = TRUE;
3984           }
3985         else  /* No verb "name" argument */
3986           {
3987           *parsed_pattern++ = verbs[i].meta;
3988           }
3989         }     /* End of (*VERB) handling */
3990       break;  /* Done with this parenthesis */
3991       }       /* End of groups that don't start with (? */
3992 
3993 
3994     /* ---- Items starting (? ---- */
3995 
3996     /* The type of item is determined by what follows (?. Handle (?| and option
3997     changes under "default" because both need a new block on the nest stack.
3998     Comments starting with (?# are handled above. Note that there is some
3999     ambiguity about the sequence (?- because if a digit follows it's a relative
4000     recursion or subroutine call whereas otherwise it's an option unsetting. */
4001 
4002     if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4003 
4004     switch(*ptr)
4005       {
4006       default:
4007       if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1]))
4008         goto RECURSION_BYNUMBER;  /* The + case is handled by CHAR_PLUS */
4009 
4010       /* We now have either (?| or a (possibly empty) option setting,
4011       optionally followed by a non-capturing group. */
4012 
4013       nest_depth++;
4014       if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4015       else if (++top_nest >= end_nests)
4016         {
4017         errorcode = ERR84;
4018         goto FAILED;
4019         }
4020       top_nest->nest_depth = nest_depth;
4021       top_nest->flags = 0;
4022       top_nest->options = options & PARSE_TRACKED_OPTIONS;
4023 
4024       /* Start of non-capturing group that resets the capture count for each
4025       branch. */
4026 
4027       if (*ptr == CHAR_VERTICAL_LINE)
4028         {
4029         top_nest->reset_group = (uint16_t)cb->bracount;
4030         top_nest->max_group = (uint16_t)cb->bracount;
4031         top_nest->flags |= NSF_RESET;
4032         cb->external_flags |= PCRE2_DUPCAPUSED;
4033         *parsed_pattern++ = META_NOCAPTURE;
4034         ptr++;
4035         }
4036 
4037       /* Scan for options imnsxJU to be set or unset. */
4038 
4039       else
4040         {
4041         BOOL hyphenok = TRUE;
4042         uint32_t oldoptions = options;
4043 
4044         top_nest->reset_group = 0;
4045         top_nest->max_group = 0;
4046         set = unset = 0;
4047         optset = &set;
4048 
4049         /* ^ at the start unsets imnsx and disables the subsequent use of - */
4050 
4051         if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)
4052           {
4053           options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE|
4054                        PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE);
4055           hyphenok = FALSE;
4056           ptr++;
4057           }
4058 
4059         while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&
4060                                *ptr != CHAR_COLON)
4061           {
4062           switch (*ptr++)
4063             {
4064             case CHAR_MINUS:
4065             if (!hyphenok)
4066               {
4067               errorcode = ERR94;
4068               ptr--;  /* Correct the offset */
4069               goto FAILED;
4070               }
4071             optset = &unset;
4072             hyphenok = FALSE;
4073             break;
4074 
4075             case CHAR_J:  /* Record that it changed in the external options */
4076             *optset |= PCRE2_DUPNAMES;
4077             cb->external_flags |= PCRE2_JCHANGED;
4078             break;
4079 
4080             case CHAR_i: *optset |= PCRE2_CASELESS; break;
4081             case CHAR_m: *optset |= PCRE2_MULTILINE; break;
4082             case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break;
4083             case CHAR_s: *optset |= PCRE2_DOTALL; break;
4084             case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
4085 
4086             /* If x appears twice it sets the extended extended option. */
4087 
4088             case CHAR_x:
4089             *optset |= PCRE2_EXTENDED;
4090             if (ptr < ptrend && *ptr == CHAR_x)
4091               {
4092               *optset |= PCRE2_EXTENDED_MORE;
4093               ptr++;
4094               }
4095             break;
4096 
4097             default:
4098             errorcode = ERR11;
4099             ptr--;    /* Correct the offset */
4100             goto FAILED;
4101             }
4102           }
4103 
4104         /* If we are setting extended without extended-more, ensure that any
4105         existing extended-more gets unset. Also, unsetting extended must also
4106         unset extended-more. */
4107 
4108         if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED ||
4109             (unset & PCRE2_EXTENDED) != 0)
4110           unset |= PCRE2_EXTENDED_MORE;
4111 
4112         options = (options | set) & (~unset);
4113 
4114         /* If the options ended with ')' this is not the start of a nested
4115         group with option changes, so the options change at this level.
4116         In this case, if the previous level set up a nest block, discard the
4117         one we have just created. Otherwise adjust it for the previous level.
4118         If the options ended with ':' we are starting a non-capturing group,
4119         possibly with an options setting. */
4120 
4121         if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4122         if (*ptr++ == CHAR_RIGHT_PARENTHESIS)
4123           {
4124           nest_depth--;  /* This is not a nested group after all. */
4125           if (top_nest > (nest_save *)(cb->start_workspace) &&
4126               (top_nest-1)->nest_depth == nest_depth) top_nest--;
4127           else top_nest->nest_depth = nest_depth;
4128           }
4129         else *parsed_pattern++ = META_NOCAPTURE;
4130 
4131         /* If nothing changed, no need to record. */
4132 
4133         if (options != oldoptions)
4134           {
4135           *parsed_pattern++ = META_OPTIONS;
4136           *parsed_pattern++ = options;
4137           }
4138         }     /* End options processing */
4139       break;  /* End default case after (? */
4140 
4141 
4142       /* ---- Python syntax support ---- */
4143 
4144       case CHAR_P:
4145       if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4146 
4147       /* (?P<name> is the same as (?<name>, which defines a named group. */
4148 
4149       if (*ptr == CHAR_LESS_THAN_SIGN)
4150         {
4151         terminator = CHAR_GREATER_THAN_SIGN;
4152         goto DEFINE_NAME;
4153         }
4154 
4155       /* (?P>name) is the same as (?&name), which is a recursion or subroutine
4156       call. */
4157 
4158       if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME;
4159 
4160       /* (?P=name) is the same as \k<name>, a back reference by name. Anything
4161       else after (?P is an error. */
4162 
4163       if (*ptr != CHAR_EQUALS_SIGN)
4164         {
4165         errorcode = ERR41;
4166         goto FAILED;
4167         }
4168       if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4169           &namelen, &errorcode, cb)) goto FAILED;
4170       *parsed_pattern++ = META_BACKREF_BYNAME;
4171       *parsed_pattern++ = namelen;
4172       PUTOFFSET(offset, parsed_pattern);
4173       okquantifier = TRUE;
4174       break;   /* End of (?P processing */
4175 
4176 
4177       /* ---- Recursion/subroutine calls by number ---- */
4178 
4179       case CHAR_R:
4180       i = 0;         /* (?R) == (?R0) */
4181       ptr++;
4182       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4183         {
4184         errorcode = ERR58;
4185         goto FAILED;
4186         }
4187       goto SET_RECURSION;
4188 
4189       /* An item starting (?- followed by a digit comes here via the "default"
4190       case because (?- followed by a non-digit is an options setting. */
4191 
4192       case CHAR_PLUS:
4193       if (ptrend - ptr < 2 || !IS_DIGIT(ptr[1]))
4194         {
4195         errorcode = ERR29;   /* Missing number */
4196         goto FAILED;
4197         }
4198       /* Fall through */
4199 
4200       case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
4201       case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
4202       RECURSION_BYNUMBER:
4203       if (!read_number(&ptr, ptrend,
4204           (IS_DIGIT(*ptr))? -1:(int)(cb->bracount), /* + and - are relative */
4205           MAX_GROUP_NUMBER, ERR61,
4206           &i, &errorcode)) goto FAILED;
4207       if (i < 0)  /* NB (?0) is permitted */
4208         {
4209         errorcode = ERR15;   /* Unknown group */
4210         goto FAILED_BACK;
4211         }
4212       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4213         goto UNCLOSED_PARENTHESIS;
4214 
4215       SET_RECURSION:
4216       *parsed_pattern++ = META_RECURSE | (uint32_t)i;
4217       offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4218       ptr++;
4219       PUTOFFSET(offset, parsed_pattern);
4220       okquantifier = TRUE;
4221       break;  /* End of recursive call by number handling */
4222 
4223 
4224       /* ---- Recursion/subroutine calls by name ---- */
4225 
4226       case CHAR_AMPERSAND:
4227       RECURSE_BY_NAME:
4228       if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4229           &namelen, &errorcode, cb)) goto FAILED;
4230       *parsed_pattern++ = META_RECURSE_BYNAME;
4231       *parsed_pattern++ = namelen;
4232       PUTOFFSET(offset, parsed_pattern);
4233       okquantifier = TRUE;
4234       break;
4235 
4236       /* ---- Callout with numerical or string argument ---- */
4237 
4238       case CHAR_C:
4239       if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4240 
4241       /* If the previous item was a condition starting (?(? an assertion,
4242       optionally preceded by a callout, is expected. This is checked later on,
4243       during actual compilation. However we need to identify this kind of
4244       assertion in this pass because it must not be qualified. The value of
4245       expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4246       for a callout - still leaving a positive value that identifies the
4247       assertion. Multiple callouts or any other items will make it zero or
4248       less, which doesn't matter because they will cause an error later. */
4249 
4250       expect_cond_assert = prev_expect_cond_assert - 1;
4251 
4252       /* If previous_callout is not NULL, it means this follows a previous
4253       callout. If it was a manual callout, do nothing; this means its "length
4254       of next pattern item" field will remain zero. If it was an automatic
4255       callout, abolish it. */
4256 
4257       if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 &&
4258           previous_callout == parsed_pattern - 4 &&
4259           parsed_pattern[-1] == 255)
4260         parsed_pattern = previous_callout;
4261 
4262       /* Save for updating next pattern item length, and skip one item before
4263       completing. */
4264 
4265       previous_callout = parsed_pattern;
4266       after_manual_callout = 1;
4267 
4268       /* Handle a string argument; specific delimiter is required. */
4269 
4270       if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
4271         {
4272         PCRE2_SIZE calloutlength;
4273         PCRE2_SPTR startptr = ptr;
4274 
4275         delimiter = 0;
4276         for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
4277           {
4278           if (*ptr == PRIV(callout_start_delims)[i])
4279             {
4280             delimiter = PRIV(callout_end_delims)[i];
4281             break;
4282             }
4283           }
4284         if (delimiter == 0)
4285           {
4286           errorcode = ERR82;
4287           goto FAILED;
4288           }
4289 
4290         *parsed_pattern = META_CALLOUT_STRING;
4291         parsed_pattern += 3;   /* Skip pattern info */
4292 
4293         for (;;)
4294           {
4295           if (++ptr >= ptrend)
4296             {
4297             errorcode = ERR81;
4298             ptr = startptr;   /* To give a more useful message */
4299             goto FAILED;
4300             }
4301           if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter))
4302             break;
4303           }
4304 
4305         calloutlength = (PCRE2_SIZE)(ptr - startptr);
4306         if (calloutlength > UINT32_MAX)
4307           {
4308           errorcode = ERR72;
4309           goto FAILED;
4310           }
4311         *parsed_pattern++ = (uint32_t)calloutlength;
4312         offset = (PCRE2_SIZE)(startptr - cb->start_pattern);
4313         PUTOFFSET(offset, parsed_pattern);
4314         }
4315 
4316       /* Handle a callout with an optional numerical argument, which must be
4317       less than or equal to 255. A missing argument gives 0. */
4318 
4319       else
4320         {
4321         int n = 0;
4322         *parsed_pattern = META_CALLOUT_NUMBER;     /* Numerical callout */
4323         parsed_pattern += 3;                       /* Skip pattern info */
4324         while (ptr < ptrend && IS_DIGIT(*ptr))
4325           {
4326           n = n * 10 + *ptr++ - CHAR_0;
4327           if (n > 255)
4328             {
4329             errorcode = ERR38;
4330             goto FAILED;
4331             }
4332           }
4333         *parsed_pattern++ = n;
4334         }
4335 
4336       /* Both formats must have a closing parenthesis */
4337 
4338       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4339         {
4340         errorcode = ERR39;
4341         goto FAILED;
4342         }
4343       ptr++;
4344 
4345       /* Remember the offset to the next item in the pattern, and set a default
4346       length. This should get updated after the next item is read. */
4347 
4348       previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
4349       previous_callout[2] = 0;
4350       break;                  /* End callout */
4351 
4352 
4353       /* ---- Conditional group ---- */
4354 
4355       /* A condition can be an assertion, a number (referring to a numbered
4356       group's having been set), a name (referring to a named group), or 'R',
4357       referring to overall recursion. R<digits> and R&name are also permitted
4358       for recursion state tests. Numbers may be preceded by + or - to specify a
4359       relative group number.
4360 
4361       There are several syntaxes for testing a named group: (?(name)) is used
4362       by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4363 
4364       There are two unfortunate ambiguities. 'R' can be the recursive thing or
4365       the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be
4366       the Perl DEFINE feature or the Python named test. We look for a name
4367       first; if not found, we try the other case.
4368 
4369       For compatibility with auto-callouts, we allow a callout to be specified
4370       before a condition that is an assertion. */
4371 
4372       case CHAR_LEFT_PARENTHESIS:
4373       if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4374       nest_depth++;
4375 
4376       /* If the next character is ? or * there must be an assertion next
4377       (optionally preceded by a callout). We do not check this here, but
4378       instead we set expect_cond_assert to 2. If this is still greater than
4379       zero (callouts decrement it) when the next assertion is read, it will be
4380       marked as a condition that must not be repeated. A value greater than
4381       zero also causes checking that an assertion (possibly with callout)
4382       follows. */
4383 
4384       if (*ptr == CHAR_QUESTION_MARK || *ptr == CHAR_ASTERISK)
4385         {
4386         *parsed_pattern++ = META_COND_ASSERT;
4387         ptr--;   /* Pull pointer back to the opening parenthesis. */
4388         expect_cond_assert = 2;
4389         break;  /* End of conditional */
4390         }
4391 
4392       /* Handle (?([+-]number)... */
4393 
4394       if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
4395           &errorcode))
4396         {
4397         if (i <= 0)
4398           {
4399           errorcode = ERR15;
4400           goto FAILED;
4401           }
4402         *parsed_pattern++ = META_COND_NUMBER;
4403         offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4404         PUTOFFSET(offset, parsed_pattern);
4405         *parsed_pattern++ = i;
4406         }
4407       else if (errorcode != 0) goto FAILED;   /* Number too big */
4408 
4409       /* No number found. Handle the special case (?(VERSION[>]=n.m)... */
4410 
4411       else if (ptrend - ptr >= 10 &&
4412                PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
4413                ptr[7] != CHAR_RIGHT_PARENTHESIS)
4414         {
4415         uint32_t ge = 0;
4416         int major = 0;
4417         int minor = 0;
4418 
4419         ptr += 7;
4420         if (*ptr == CHAR_GREATER_THAN_SIGN)
4421           {
4422           ge = 1;
4423           ptr++;
4424           }
4425 
4426         /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
4427         references its argument twice. */
4428 
4429         if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
4430           goto BAD_VERSION_CONDITION;
4431 
4432         if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode))
4433           goto FAILED;
4434 
4435         if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4436         if (*ptr == CHAR_DOT)
4437           {
4438           if (++ptr >= ptrend || !IS_DIGIT(*ptr)) goto BAD_VERSION_CONDITION;
4439           minor = (*ptr++ - CHAR_0) * 10;
4440           if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4441           if (IS_DIGIT(*ptr)) minor += *ptr++ - CHAR_0;
4442           if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4443             goto BAD_VERSION_CONDITION;
4444           }
4445 
4446         *parsed_pattern++ = META_COND_VERSION;
4447         *parsed_pattern++ = ge;
4448         *parsed_pattern++ = major;
4449         *parsed_pattern++ = minor;
4450         }
4451 
4452       /* All the remaining cases now require us to read a name. We cannot at
4453       this stage distinguish ambiguous cases such as (?(R12) which might be a
4454       recursion test by number or a name, because the named groups have not yet
4455       all been identified. Those cases are treated as names, but given a
4456       different META code. */
4457 
4458       else
4459         {
4460         BOOL was_r_ampersand = FALSE;
4461 
4462         if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND)
4463           {
4464           terminator = CHAR_RIGHT_PARENTHESIS;
4465           was_r_ampersand = TRUE;
4466           ptr++;
4467           }
4468         else if (*ptr == CHAR_LESS_THAN_SIGN)
4469           terminator = CHAR_GREATER_THAN_SIGN;
4470         else if (*ptr == CHAR_APOSTROPHE)
4471           terminator = CHAR_APOSTROPHE;
4472         else
4473           {
4474           terminator = CHAR_RIGHT_PARENTHESIS;
4475           ptr--;   /* Point to char before name */
4476           }
4477         if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4478             &errorcode, cb)) goto FAILED;
4479 
4480         /* Handle (?(R&name) */
4481 
4482         if (was_r_ampersand)
4483           {
4484           *parsed_pattern = META_COND_RNAME;
4485           ptr--;   /* Back to closing parens */
4486           }
4487 
4488         /* Handle (?(name). If the name is "DEFINE" we identify it with a
4489         special code. Likewise if the name consists of R followed only by
4490         digits. Otherwise, handle it like a quoted name. */
4491 
4492         else if (terminator == CHAR_RIGHT_PARENTHESIS)
4493           {
4494           if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
4495             *parsed_pattern = META_COND_DEFINE;
4496           else
4497             {
4498             for (i = 1; i < (int)namelen; i++)
4499               if (!IS_DIGIT(name[i])) break;
4500             *parsed_pattern = (*name == CHAR_R && i >= (int)namelen)?
4501               META_COND_RNUMBER : META_COND_NAME;
4502             }
4503           ptr--;   /* Back to closing parens */
4504           }
4505 
4506         /* Handle (?('name') or (?(<name>) */
4507 
4508         else *parsed_pattern = META_COND_NAME;
4509 
4510         /* All these cases except DEFINE end with the name length and offset;
4511         DEFINE just has an offset (for the "too many branches" error). */
4512 
4513         if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen;
4514         PUTOFFSET(offset, parsed_pattern);
4515         }  /* End cases that read a name */
4516 
4517       /* Check the closing parenthesis of the condition */
4518 
4519       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4520         {
4521         errorcode = ERR24;
4522         goto FAILED;
4523         }
4524       ptr++;
4525       break;  /* End of condition processing */
4526 
4527 
4528       /* ---- Atomic group ---- */
4529 
4530       case CHAR_GREATER_THAN_SIGN:
4531       ATOMIC_GROUP:                          /* Come from (*atomic: */
4532       *parsed_pattern++ = META_ATOMIC;
4533       nest_depth++;
4534       ptr++;
4535       break;
4536 
4537 
4538       /* ---- Lookahead assertions ---- */
4539 
4540       case CHAR_EQUALS_SIGN:
4541       POSITIVE_LOOK_AHEAD:                   /* Come from (*pla: */
4542       *parsed_pattern++ = META_LOOKAHEAD;
4543       ptr++;
4544       goto POST_ASSERTION;
4545 
4546       case CHAR_ASTERISK:
4547       POSITIVE_NONATOMIC_LOOK_AHEAD:         /* Come from (?* */
4548       *parsed_pattern++ = META_LOOKAHEAD_NA;
4549       ptr++;
4550       goto POST_ASSERTION;
4551 
4552       case CHAR_EXCLAMATION_MARK:
4553       NEGATIVE_LOOK_AHEAD:                   /* Come from (*nla: */
4554       *parsed_pattern++ = META_LOOKAHEADNOT;
4555       ptr++;
4556       goto POST_ASSERTION;
4557 
4558 
4559       /* ---- Lookbehind assertions ---- */
4560 
4561       /* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?<
4562       is the start of the name of a capturing group. */
4563 
4564       case CHAR_LESS_THAN_SIGN:
4565       if (ptrend - ptr <= 1 ||
4566          (ptr[1] != CHAR_EQUALS_SIGN &&
4567           ptr[1] != CHAR_EXCLAMATION_MARK &&
4568           ptr[1] != CHAR_ASTERISK))
4569         {
4570         terminator = CHAR_GREATER_THAN_SIGN;
4571         goto DEFINE_NAME;
4572         }
4573       *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
4574         META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)?
4575         META_LOOKBEHINDNOT : META_LOOKBEHIND_NA;
4576 
4577       POST_LOOKBEHIND:           /* Come from (*plb: (*naplb: and (*nlb: */
4578       *has_lookbehind = TRUE;
4579       offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4580       PUTOFFSET(offset, parsed_pattern);
4581       ptr += 2;
4582       /* Fall through */
4583 
4584       /* If the previous item was a condition starting (?(? an assertion,
4585       optionally preceded by a callout, is expected. This is checked later on,
4586       during actual compilation. However we need to identify this kind of
4587       assertion in this pass because it must not be qualified. The value of
4588       expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4589       for a callout - still leaving a positive value that identifies the
4590       assertion. Multiple callouts or any other items will make it zero or
4591       less, which doesn't matter because they will cause an error later. */
4592 
4593       POST_ASSERTION:
4594       nest_depth++;
4595       if (prev_expect_cond_assert > 0)
4596         {
4597         if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4598         else if (++top_nest >= end_nests)
4599           {
4600           errorcode = ERR84;
4601           goto FAILED;
4602           }
4603         top_nest->nest_depth = nest_depth;
4604         top_nest->flags = NSF_CONDASSERT;
4605         top_nest->options = options & PARSE_TRACKED_OPTIONS;
4606         }
4607       break;
4608 
4609 
4610       /* ---- Define a named group ---- */
4611 
4612       /* A named group may be defined as (?'name') or (?<name>). In the latter
4613       case we jump to DEFINE_NAME from the disambiguation of (?< above with the
4614       terminator set to '>'. */
4615 
4616       case CHAR_APOSTROPHE:
4617       terminator = CHAR_APOSTROPHE;    /* Terminator */
4618 
4619       DEFINE_NAME:
4620       if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4621           &errorcode, cb)) goto FAILED;
4622 
4623       /* We have a name for this capturing group. It is also assigned a number,
4624       which is its primary means of identification. */
4625 
4626       if (cb->bracount >= MAX_GROUP_NUMBER)
4627         {
4628         errorcode = ERR97;
4629         goto FAILED;
4630         }
4631       cb->bracount++;
4632       *parsed_pattern++ = META_CAPTURE | cb->bracount;
4633       nest_depth++;
4634 
4635       /* Check not too many names */
4636 
4637       if (cb->names_found >= MAX_NAME_COUNT)
4638         {
4639         errorcode = ERR49;
4640         goto FAILED;
4641         }
4642 
4643       /* Adjust the entry size to accommodate the longest name found. */
4644 
4645       if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
4646         cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
4647 
4648       /* Scan the list to check for duplicates. For duplicate names, if the
4649       number is the same, break the loop, which causes the name to be
4650       discarded; otherwise, if DUPNAMES is not set, give an error.
4651       If it is set, allow the name with a different number, but continue
4652       scanning in case this is a duplicate with the same number. For
4653       non-duplicate names, give an error if the number is duplicated. */
4654 
4655       isdupname = FALSE;
4656       ng = cb->named_groups;
4657       for (i = 0; i < cb->names_found; i++, ng++)
4658         {
4659         if (namelen == ng->length &&
4660             PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0)
4661           {
4662           if (ng->number == cb->bracount) break;
4663           if ((options & PCRE2_DUPNAMES) == 0)
4664             {
4665             errorcode = ERR43;
4666             goto FAILED;
4667             }
4668           isdupname = ng->isdup = TRUE;     /* Mark as a duplicate */
4669           cb->dupnames = TRUE;              /* Duplicate names exist */
4670           }
4671         else if (ng->number == cb->bracount)
4672           {
4673           errorcode = ERR65;
4674           goto FAILED;
4675           }
4676         }
4677 
4678       if (i < cb->names_found) break;   /* Ignore duplicate with same number */
4679 
4680       /* Increase the list size if necessary */
4681 
4682       if (cb->names_found >= cb->named_group_list_size)
4683         {
4684         uint32_t newsize = cb->named_group_list_size * 2;
4685         named_group *newspace =
4686           cb->cx->memctl.malloc(newsize * sizeof(named_group),
4687           cb->cx->memctl.memory_data);
4688         if (newspace == NULL)
4689           {
4690           errorcode = ERR21;
4691           goto FAILED;
4692           }
4693 
4694         memcpy(newspace, cb->named_groups,
4695           cb->named_group_list_size * sizeof(named_group));
4696         if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
4697           cb->cx->memctl.free((void *)cb->named_groups,
4698           cb->cx->memctl.memory_data);
4699         cb->named_groups = newspace;
4700         cb->named_group_list_size = newsize;
4701         }
4702 
4703       /* Add this name to the list */
4704 
4705       cb->named_groups[cb->names_found].name = name;
4706       cb->named_groups[cb->names_found].length = (uint16_t)namelen;
4707       cb->named_groups[cb->names_found].number = cb->bracount;
4708       cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname;
4709       cb->names_found++;
4710       break;
4711       }        /* End of (? switch */
4712     break;     /* End of ( handling */
4713 
4714 
4715     /* ---- Branch terminators ---- */
4716 
4717     /* Alternation: reset the capture count if we are in a (?| group. */
4718 
4719     case CHAR_VERTICAL_LINE:
4720     if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
4721         (top_nest->flags & NSF_RESET) != 0)
4722       {
4723       if (cb->bracount > top_nest->max_group)
4724         top_nest->max_group = (uint16_t)cb->bracount;
4725       cb->bracount = top_nest->reset_group;
4726       }
4727     *parsed_pattern++ = META_ALT;
4728     break;
4729 
4730     /* End of group; reset the capture count to the maximum if we are in a (?|
4731     group and/or reset the options that are tracked during parsing. Disallow
4732     quantifier for a condition that is an assertion. */
4733 
4734     case CHAR_RIGHT_PARENTHESIS:
4735     okquantifier = TRUE;
4736     if (top_nest != NULL && top_nest->nest_depth == nest_depth)
4737       {
4738       options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options;
4739       if ((top_nest->flags & NSF_RESET) != 0 &&
4740           top_nest->max_group > cb->bracount)
4741         cb->bracount = top_nest->max_group;
4742       if ((top_nest->flags & NSF_CONDASSERT) != 0)
4743         okquantifier = FALSE;
4744 
4745       if ((top_nest->flags & NSF_ATOMICSR) != 0)
4746         {
4747         *parsed_pattern++ = META_KET;
4748         }
4749 
4750       if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
4751         else top_nest--;
4752       }
4753     if (nest_depth == 0)    /* Unmatched closing parenthesis */
4754       {
4755       errorcode = ERR22;
4756       goto FAILED_BACK;
4757       }
4758     nest_depth--;
4759     *parsed_pattern++ = META_KET;
4760     break;
4761     }  /* End of switch on pattern character */
4762   }    /* End of main character scan loop */
4763 
4764 /* End of pattern reached. Check for missing ) at the end of a verb name. */
4765 
4766 if (inverbname && ptr >= ptrend)
4767   {
4768   errorcode = ERR60;
4769   goto FAILED;
4770   }
4771 
4772 /* Manage callout for the final item */
4773 
4774 PARSED_END:
4775 parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
4776   parsed_pattern, cb);
4777 
4778 /* Insert trailing items for word and line matching (features provided for the
4779 benefit of pcre2grep). */
4780 
4781 if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
4782   {
4783   *parsed_pattern++ = META_KET;
4784   *parsed_pattern++ = META_DOLLAR;
4785   }
4786 else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
4787   {
4788   *parsed_pattern++ = META_KET;
4789   *parsed_pattern++ = META_ESCAPE + ESC_b;
4790   }
4791 
4792 /* Terminate the parsed pattern, then return success if all groups are closed.
4793 Otherwise we have unclosed parentheses. */
4794 
4795 if (parsed_pattern >= parsed_pattern_end)
4796   {
4797   errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
4798   goto FAILED;
4799   }
4800 
4801 *parsed_pattern = META_END;
4802 if (nest_depth == 0) return 0;
4803 
4804 UNCLOSED_PARENTHESIS:
4805 errorcode = ERR14;
4806 
4807 /* Come here for all failures. */
4808 
4809 FAILED:
4810 cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4811 return errorcode;
4812 
4813 /* Some errors need to indicate the previous character. */
4814 
4815 FAILED_BACK:
4816 ptr--;
4817 goto FAILED;
4818 
4819 /* This failure happens several times. */
4820 
4821 BAD_VERSION_CONDITION:
4822 errorcode = ERR79;
4823 goto FAILED;
4824 }
4825 
4826 
4827 
4828 /*************************************************
4829 *       Find first significant opcode            *
4830 *************************************************/
4831 
4832 /* This is called by several functions that scan a compiled expression looking
4833 for a fixed first character, or an anchoring opcode etc. It skips over things
4834 that do not influence this. For some calls, it makes sense to skip negative
4835 forward and all backward assertions, and also the \b assertion; for others it
4836 does not.
4837 
4838 Arguments:
4839   code         pointer to the start of the group
4840   skipassert   TRUE if certain assertions are to be skipped
4841 
4842 Returns:       pointer to the first significant opcode
4843 */
4844 
4845 static const PCRE2_UCHAR*
first_significant_code(PCRE2_SPTR code,BOOL skipassert)4846 first_significant_code(PCRE2_SPTR code, BOOL skipassert)
4847 {
4848 for (;;)
4849   {
4850   switch ((int)*code)
4851     {
4852     case OP_ASSERT_NOT:
4853     case OP_ASSERTBACK:
4854     case OP_ASSERTBACK_NOT:
4855     case OP_ASSERTBACK_NA:
4856     if (!skipassert) return code;
4857     do code += GET(code, 1); while (*code == OP_ALT);
4858     code += PRIV(OP_lengths)[*code];
4859     break;
4860 
4861     case OP_WORD_BOUNDARY:
4862     case OP_NOT_WORD_BOUNDARY:
4863     if (!skipassert) return code;
4864     /* Fall through */
4865 
4866     case OP_CALLOUT:
4867     case OP_CREF:
4868     case OP_DNCREF:
4869     case OP_RREF:
4870     case OP_DNRREF:
4871     case OP_FALSE:
4872     case OP_TRUE:
4873     code += PRIV(OP_lengths)[*code];
4874     break;
4875 
4876     case OP_CALLOUT_STR:
4877     code += GET(code, 1 + 2*LINK_SIZE);
4878     break;
4879 
4880     case OP_SKIPZERO:
4881     code += 2 + GET(code, 2) + LINK_SIZE;
4882     break;
4883 
4884     case OP_COND:
4885     case OP_SCOND:
4886     if (code[1+LINK_SIZE] != OP_FALSE ||   /* Not DEFINE */
4887         code[GET(code, 1)] != OP_KET)      /* More than one branch */
4888       return code;
4889     code += GET(code, 1) + 1 + LINK_SIZE;
4890     break;
4891 
4892     case OP_MARK:
4893     case OP_COMMIT_ARG:
4894     case OP_PRUNE_ARG:
4895     case OP_SKIP_ARG:
4896     case OP_THEN_ARG:
4897     code += code[1] + PRIV(OP_lengths)[*code];
4898     break;
4899 
4900     default:
4901     return code;
4902     }
4903   }
4904 /* Control never reaches here */
4905 }
4906 
4907 
4908 
4909 #ifdef SUPPORT_UNICODE
4910 /*************************************************
4911 *           Get othercase range                  *
4912 *************************************************/
4913 
4914 /* This function is passed the start and end of a class range in UCP mode. It
4915 searches up the characters, looking for ranges of characters in the "other"
4916 case. Each call returns the next one, updating the start address. A character
4917 with multiple other cases is returned on its own with a special return value.
4918 
4919 Arguments:
4920   cptr        points to starting character value; updated
4921   d           end value
4922   ocptr       where to put start of othercase range
4923   odptr       where to put end of othercase range
4924 
4925 Yield:        -1 when no more
4926                0 when a range is returned
4927               >0 the CASESET offset for char with multiple other cases
4928                 in this case, ocptr contains the original
4929 */
4930 
static int
get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr,
  uint32_t *odptr)
{
uint32_t ch = *cptr;        /* Current position in the input range */
uint32_t first_other = 0;   /* Other case of the character that starts a run */
uint32_t expect;            /* Other case the next character must have */
unsigned int set_offset;    /* Non-zero CASESET value, if any */

/* Phase 1: move up the input range until a character with a case set or with
a single other case different from itself is found. A character with a case
set is reported on its own: its value goes to *ocptr, *cptr is moved just past
it, and the set offset is the return value. */

while (ch <= d)
  {
  set_offset = UCD_CASESET(ch);
  if (set_offset != 0)
    {
    *ocptr = ch;
    *cptr = ch + 1;
    return (int)set_offset;
    }

  first_other = UCD_OTHERCASE(ch);
  if (first_other != ch) break;   /* Start of an othercase run */
  ch++;
  }

/* Running off the end of the input range means there is nothing more to
return. */

if (ch > d) return -1;

/* Phase 2: a character with exactly one other case has been found. Extend the
run for as long as the following input characters have no case set and their
other cases are consecutive, stopping at the end of the input range. */

*ocptr = first_other;
expect = first_other + 1;

ch++;
while (ch <= d && UCD_CASESET(ch) == 0 && UCD_OTHERCASE(ch) == expect)
  {
  expect++;
  ch++;
  }

*odptr = expect - 1;   /* Last other-case value that was matched */
*cptr = ch;            /* Where the next call should resume */
return 0;
}
4971 #endif  /* SUPPORT_UNICODE */
4972 
4973 
4974 
4975 /*************************************************
4976 * Add a character or range to a class (internal) *
4977 *************************************************/
4978 
4979 /* This function packages up the logic of adding a character or range of
4980 characters to a class. The character values in the arguments will be within the
4981 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
4982 called only from within the "add to class" group of functions, some of which
4983 are recursive and mutually recursive. The external entry point is
4984 add_to_class().
4985 
4986 Arguments:
4987   classbits     the bit map for characters < 256
4988   uchardptr     points to the pointer for extra data
4989   options       the options word
4990   cb            compile data
4991   start         start of range character
4992   end           end of range character
4993 
4994 Returns:        the number of < 256 characters added
4995                 the pointer to extra data is updated
4996 */
4997 
4998 static unsigned int
add_to_class_internal(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,uint32_t start,uint32_t end)4999 add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5000   uint32_t options, compile_block *cb, uint32_t start, uint32_t end)
5001 {
5002 uint32_t c;
5003 uint32_t classbits_end = (end <= 0xff ? end : 0xff);
5004 unsigned int n8 = 0;
5005 
5006 /* If caseless matching is required, scan the range and process alternate
5007 cases. In Unicode, there are 8-bit characters that have alternate cases that
5008 are greater than 255 and vice-versa. Sometimes we can just extend the original
5009 range. */
5010 
5011 if ((options & PCRE2_CASELESS) != 0)
5012   {
5013 #ifdef SUPPORT_UNICODE
5014   if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
5015     {
5016     int rc;
5017     uint32_t oc, od;
5018 
5019     options &= ~PCRE2_CASELESS;   /* Remove for recursive calls */
5020     c = start;
5021 
5022     while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
5023       {
5024       /* Handle a single character that has more than one other case. */
5025 
5026       if (rc > 0) n8 += add_list_to_class_internal(classbits, uchardptr, options, cb,
5027         PRIV(ucd_caseless_sets) + rc, oc);
5028 
5029       /* Do nothing if the other case range is within the original range. */
5030 
5031       else if (oc >= cb->class_range_start && od <= cb->class_range_end) continue;
5032 
5033       /* Extend the original range if there is overlap, noting that if oc < c, we
5034       can't have od > end because a subrange is always shorter than the basic
5035       range. Otherwise, use a recursive call to add the additional range. */
5036 
5037       else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
5038       else if (od > end && oc <= end + 1)
5039         {
5040         end = od;       /* Extend upwards */
5041         if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
5042         }
5043       else n8 += add_to_class_internal(classbits, uchardptr, options, cb, oc, od);
5044       }
5045     }
5046   else
5047 #endif  /* SUPPORT_UNICODE */
5048 
5049   /* Not UTF mode */
5050 
5051   for (c = start; c <= classbits_end; c++)
5052     {
5053     SETBIT(classbits, cb->fcc[c]);
5054     n8++;
5055     }
5056   }
5057 
5058 /* Now handle the originally supplied range. Adjust the final value according
5059 to the bit length - this means that the same lists of (e.g.) horizontal spaces
5060 can be used in all cases. */
5061 
5062 if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR)
5063   end = MAX_NON_UTF_CHAR;
5064 
5065 if (start > cb->class_range_start && end < cb->class_range_end) return n8;
5066 
5067 /* Use the bitmap for characters < 256. Otherwise use extra data.*/
5068 
5069 for (c = start; c <= classbits_end; c++)
5070   {
5071   /* Regardless of start, c will always be <= 255. */
5072   SETBIT(classbits, c);
5073   n8++;
5074   }
5075 
5076 #ifdef SUPPORT_WIDE_CHARS
5077 if (start <= 0xff) start = 0xff + 1;
5078 
5079 if (end >= start)
5080   {
5081   PCRE2_UCHAR *uchardata = *uchardptr;
5082 
5083 #ifdef SUPPORT_UNICODE
5084   if ((options & PCRE2_UTF) != 0)
5085     {
5086     if (start < end)
5087       {
5088       *uchardata++ = XCL_RANGE;
5089       uchardata += PRIV(ord2utf)(start, uchardata);
5090       uchardata += PRIV(ord2utf)(end, uchardata);
5091       }
5092     else if (start == end)
5093       {
5094       *uchardata++ = XCL_SINGLE;
5095       uchardata += PRIV(ord2utf)(start, uchardata);
5096       }
5097     }
5098   else
5099 #endif  /* SUPPORT_UNICODE */
5100 
5101   /* Without UTF support, character values are constrained by the bit length,
5102   and can only be > 256 for 16-bit and 32-bit libraries. */
5103 
5104 #if PCRE2_CODE_UNIT_WIDTH == 8
5105     {}
5106 #else
5107   if (start < end)
5108     {
5109     *uchardata++ = XCL_RANGE;
5110     *uchardata++ = start;
5111     *uchardata++ = end;
5112     }
5113   else if (start == end)
5114     {
5115     *uchardata++ = XCL_SINGLE;
5116     *uchardata++ = start;
5117     }
5118 #endif  /* PCRE2_CODE_UNIT_WIDTH == 8 */
5119   *uchardptr = uchardata;   /* Updata extra data pointer */
5120   }
5121 #else  /* SUPPORT_WIDE_CHARS */
5122   (void)uchardptr;          /* Avoid compiler warning */
5123 #endif /* SUPPORT_WIDE_CHARS */
5124 
5125 return n8;    /* Number of 8-bit characters */
5126 }
5127 
5128 
5129 
5130 #ifdef SUPPORT_UNICODE
5131 /*************************************************
5132 * Add a list of characters to a class (internal) *
5133 *************************************************/
5134 
5135 /* This function is used for adding a list of case-equivalent characters to a
5136 class when in UTF mode. This function is called only from within
5137 add_to_class_internal(), with which it is mutually recursive.
5138 
5139 Arguments:
5140   classbits     the bit map for characters < 256
5141   uchardptr     points to the pointer for extra data
5142   options       the options word
5143   cb            contains pointers to tables etc.
5144   p             points to row of 32-bit values, terminated by NOTACHAR
5145   except        character to omit; this is used when adding lists of
5146                   case-equivalent characters to avoid including the one we
5147                   already know about
5148 
5149 Returns:        the number of < 256 characters added
5150                 the pointer to extra data is updated
5151 */
5152 
5153 static unsigned int
add_list_to_class_internal(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,const uint32_t * p,unsigned int except)5154 add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5155   uint32_t options, compile_block *cb, const uint32_t *p, unsigned int except)
5156 {
5157 unsigned int n8 = 0;
5158 while (p[0] < NOTACHAR)
5159   {
5160   unsigned int n = 0;
5161   if (p[0] != except)
5162     {
5163     while(p[n+1] == p[0] + n + 1) n++;
5164     n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]);
5165     }
5166   p += n + 1;
5167   }
5168 return n8;
5169 }
5170 #endif
5171 
5172 
5173 
5174 /*************************************************
5175 *   External entry point for add range to class  *
5176 *************************************************/
5177 
5178 /* This function sets the overall range so that the internal functions can try
5179 to avoid duplication when handling case-independence.
5180 
5181 Arguments:
5182   classbits     the bit map for characters < 256
5183   uchardptr     points to the pointer for extra data
5184   options       the options word
5185   cb            compile data
5186   start         start of range character
5187   end           end of range character
5188 
5189 Returns:        the number of < 256 characters added
5190                 the pointer to extra data is updated
5191 */
5192 
5193 static unsigned int
add_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,uint32_t start,uint32_t end)5194 add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5195   compile_block *cb, uint32_t start, uint32_t end)
5196 {
5197 cb->class_range_start = start;
5198 cb->class_range_end = end;
5199 return add_to_class_internal(classbits, uchardptr, options, cb, start, end);
5200 }
5201 
5202 
5203 /*************************************************
5204 *   External entry point for add list to class   *
5205 *************************************************/
5206 
5207 /* This function is used for adding a list of horizontal or vertical whitespace
5208 characters to a class. The list must be in order so that ranges of characters
5209 can be detected and handled appropriately. This function sets the overall range
5210 so that the internal functions can try to avoid duplication when handling
5211 case-independence.
5212 
5213 Arguments:
5214   classbits     the bit map for characters < 256
5215   uchardptr     points to the pointer for extra data
5216   options       the options word
5217   cb            contains pointers to tables etc.
5218   p             points to row of 32-bit values, terminated by NOTACHAR
5219   except        character to omit; this is used when adding lists of
5220                   case-equivalent characters to avoid including the one we
5221                   already know about
5222 
5223 Returns:        the number of < 256 characters added
5224                 the pointer to extra data is updated
5225 */
5226 
5227 static unsigned int
add_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,const uint32_t * p,unsigned int except)5228 add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5229   compile_block *cb, const uint32_t *p, unsigned int except)
5230 {
5231 unsigned int n8 = 0;
5232 while (p[0] < NOTACHAR)
5233   {
5234   unsigned int n = 0;
5235   if (p[0] != except)
5236     {
5237     while(p[n+1] == p[0] + n + 1) n++;
5238     cb->class_range_start = p[0];
5239     cb->class_range_end = p[n];
5240     n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]);
5241     }
5242   p += n + 1;
5243   }
5244 return n8;
5245 }
5246 
5247 
5248 
5249 /*************************************************
5250 *    Add characters not in a list to a class     *
5251 *************************************************/
5252 
5253 /* This function is used for adding the complement of a list of horizontal or
5254 vertical whitespace to a class. The list must be in order.
5255 
5256 Arguments:
5257   classbits     the bit map for characters < 256
5258   uchardptr     points to the pointer for extra data
5259   options       the options word
5260   cb            contains pointers to tables etc.
5261   p             points to row of 32-bit values, terminated by NOTACHAR
5262 
5263 Returns:        the number of < 256 characters added
5264                 the pointer to extra data is updated
5265 */
5266 
5267 static unsigned int
add_not_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,const uint32_t * p)5268 add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5269   uint32_t options, compile_block *cb, const uint32_t *p)
5270 {
5271 BOOL utf = (options & PCRE2_UTF) != 0;
5272 unsigned int n8 = 0;
5273 if (p[0] > 0)
5274   n8 += add_to_class(classbits, uchardptr, options, cb, 0, p[0] - 1);
5275 while (p[0] < NOTACHAR)
5276   {
5277   while (p[1] == p[0] + 1) p++;
5278   n8 += add_to_class(classbits, uchardptr, options, cb, p[0] + 1,
5279     (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
5280   p++;
5281   }
5282 return n8;
5283 }
5284 
5285 
5286 
5287 /*************************************************
5288 *    Find details of duplicate group names       *
5289 *************************************************/
5290 
5291 /* This is called from compile_branch() when it needs to know the index and
5292 count of duplicates in the names table when processing named backreferences,
5293 either directly, or as conditions.
5294 
5295 Arguments:
5296   name          points to the name
5297   length        the length of the name
5298   indexptr      where to put the index
5299   countptr      where to put the count of duplicates
5300   errorcodeptr  where to put an error code
5301   cb            the compile block
5302 
5303 Returns:        TRUE if OK, FALSE if not, error code set
5304 */
5305 
5306 static BOOL
find_dupname_details(PCRE2_SPTR name,uint32_t length,int * indexptr,int * countptr,int * errorcodeptr,compile_block * cb)5307 find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr,
5308   int *countptr, int *errorcodeptr, compile_block *cb)
5309 {
5310 uint32_t i, groupnumber;
5311 int count;
5312 PCRE2_UCHAR *slot = cb->name_table;
5313 
5314 /* Find the first entry in the table */
5315 
5316 for (i = 0; i < cb->names_found; i++)
5317   {
5318   if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) == 0 &&
5319       slot[IMM2_SIZE+length] == 0) break;
5320   slot += cb->name_entry_size;
5321   }
5322 
5323 /* This should not occur, because this function is called only when we know we
5324 have duplicate names. Give an internal error. */
5325 
5326 if (i >= cb->names_found)
5327   {
5328   *errorcodeptr = ERR53;
5329   cb->erroroffset = name - cb->start_pattern;
5330   return FALSE;
5331   }
5332 
5333 /* Record the index and then see how many duplicates there are, updating the
5334 backref map and maximum back reference as we do. */
5335 
5336 *indexptr = i;
5337 count = 0;
5338 
5339 for (;;)
5340   {
5341   count++;
5342   groupnumber = GET2(slot,0);
5343   cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
5344   if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
5345   if (++i >= cb->names_found) break;
5346   slot += cb->name_entry_size;
5347   if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) != 0 ||
5348     (slot+IMM2_SIZE)[length] != 0) break;
5349   }
5350 
5351 *countptr = count;
5352 return TRUE;
5353 }
5354 
5355 
5356 
5357 /*************************************************
5358 *           Compile one branch                   *
5359 *************************************************/
5360 
/* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If
5362 the options are changed during the branch, the pointer is used to change the
5363 external options bits. This function is used during the pre-compile phase when
5364 we are trying to find out the amount of memory needed, as well as during the
5365 real compile phase. The value of lengthptr distinguishes the two phases.
5366 
5367 Arguments:
5368   optionsptr        pointer to the option bits
5369   codeptr           points to the pointer to the current code point
5370   pptrptr           points to the current parsed pattern pointer
5371   errorcodeptr      points to error code variable
5372   firstcuptr        place to put the first required code unit
5373   firstcuflagsptr   place to put the first code unit flags
5374   reqcuptr          place to put the last required code unit
5375   reqcuflagsptr     place to put the last required code unit flags
5376   bcptr             points to current branch chain
5377   cb                contains pointers to tables etc.
5378   lengthptr         NULL during the real compile phase
5379                     points to length accumulator during pre-compile phase
5380 
5381 Returns:            0 There's been an error, *errorcodeptr is non-zero
5382                    +1 Success, this branch must match at least one character
5383                    -1 Success, this branch may match an empty string
5384 */
5385 
5386 static int
compile_branch(uint32_t * optionsptr,PCRE2_UCHAR ** codeptr,uint32_t ** pptrptr,int * errorcodeptr,uint32_t * firstcuptr,uint32_t * firstcuflagsptr,uint32_t * reqcuptr,uint32_t * reqcuflagsptr,branch_chain * bcptr,compile_block * cb,PCRE2_SIZE * lengthptr)5387 compile_branch(uint32_t *optionsptr, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
5388   int *errorcodeptr, uint32_t *firstcuptr, uint32_t *firstcuflagsptr,
5389   uint32_t *reqcuptr, uint32_t *reqcuflagsptr, branch_chain *bcptr,
5390   compile_block *cb, PCRE2_SIZE *lengthptr)
5391 {
5392 int bravalue = 0;
5393 int okreturn = -1;
5394 int group_return = 0;
5395 uint32_t repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
5396 uint32_t greedy_default, greedy_non_default;
5397 uint32_t repeat_type, op_type;
5398 uint32_t options = *optionsptr;               /* May change dynamically */
5399 uint32_t firstcu, reqcu;
5400 uint32_t zeroreqcu, zerofirstcu;
5401 uint32_t escape;
5402 uint32_t *pptr = *pptrptr;
5403 uint32_t meta, meta_arg;
5404 uint32_t firstcuflags, reqcuflags;
5405 uint32_t zeroreqcuflags, zerofirstcuflags;
5406 uint32_t req_caseopt, reqvary, tempreqvary;
5407 PCRE2_SIZE offset = 0;
5408 PCRE2_SIZE length_prevgroup = 0;
5409 PCRE2_UCHAR *code = *codeptr;
5410 PCRE2_UCHAR *last_code = code;
5411 PCRE2_UCHAR *orig_code = code;
5412 PCRE2_UCHAR *tempcode;
5413 PCRE2_UCHAR *previous = NULL;
5414 PCRE2_UCHAR op_previous;
5415 BOOL groupsetfirstcu = FALSE;
5416 BOOL had_accept = FALSE;
5417 BOOL matched_char = FALSE;
5418 BOOL previous_matched_char = FALSE;
5419 BOOL reset_caseful = FALSE;
5420 const uint8_t *cbits = cb->cbits;
5421 uint8_t classbits[32];
5422 
5423 /* We can fish out the UTF setting once and for all into a BOOL, but we must
5424 not do this for other options (e.g. PCRE2_EXTENDED) because they may change
5425 dynamically as we process the pattern. */
5426 
5427 #ifdef SUPPORT_UNICODE
5428 BOOL utf = (options & PCRE2_UTF) != 0;
5429 BOOL ucp = (options & PCRE2_UCP) != 0;
5430 #else  /* No Unicode support */
5431 BOOL utf = FALSE;
5432 #endif
5433 
5434 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
5435 class_uchardata always so that it can be passed to add_to_class() always,
5436 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
5437 alternative calls for the different cases. */
5438 
5439 PCRE2_UCHAR *class_uchardata;
5440 #ifdef SUPPORT_WIDE_CHARS
5441 BOOL xclass;
5442 PCRE2_UCHAR *class_uchardata_base;
5443 #endif
5444 
5445 /* Set up the default and non-default settings for greediness */
5446 
5447 greedy_default = ((options & PCRE2_UNGREEDY) != 0);
5448 greedy_non_default = greedy_default ^ 1;
5449 
5450 /* Initialize no first unit, no required unit. REQ_UNSET means "no char
5451 matching encountered yet". It gets changed to REQ_NONE if we hit something that
5452 matches a non-fixed first unit; reqcu just remains unset if we never find one.
5453 
5454 When we hit a repeat whose minimum is zero, we may have to adjust these values
5455 to take the zero repeat into account. This is implemented by setting them to
5456 zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
5457 item types that can be repeated set these backoff variables appropriately. */
5458 
5459 firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
5460 firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
5461 
5462 /* The variable req_caseopt contains either the REQ_CASELESS bit or zero,
5463 according to the current setting of the caseless flag. The REQ_CASELESS value
5464 leaves the lower 28 bit empty. It is added into the firstcu or reqcu variables
5465 to record the case status of the value. This is used only for ASCII characters.
5466 */
5467 
5468 req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
5469 
5470 /* Switch on next META item until the end of the branch */
5471 
5472 for (;; pptr++)
5473   {
5474 #ifdef SUPPORT_WIDE_CHARS
5475   BOOL xclass_has_prop;
5476 #endif
5477   BOOL negate_class;
5478   BOOL should_flip_negation;
5479   BOOL match_all_or_no_wide_chars;
5480   BOOL possessive_quantifier;
5481   BOOL note_group_empty;
5482   int class_has_8bitchar;
5483   uint32_t mclength;
5484   uint32_t skipunits;
5485   uint32_t subreqcu, subfirstcu;
5486   uint32_t groupnumber;
5487   uint32_t verbarglen, verbculen;
5488   uint32_t subreqcuflags, subfirstcuflags;
5489   open_capitem *oc;
5490   PCRE2_UCHAR mcbuffer[8];
5491 
5492   /* Get next META item in the pattern and its potential argument. */
5493 
5494   meta = META_CODE(*pptr);
5495   meta_arg = META_DATA(*pptr);
5496 
5497   /* If we are in the pre-compile phase, accumulate the length used for the
5498   previous cycle of this loop, unless the next item is a quantifier. */
5499 
5500   if (lengthptr != NULL)
5501     {
5502     if (code > cb->start_workspace + cb->workspace_size -
5503         WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
5504       {
5505       *errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)?
5506         ERR52 : ERR86;
5507       return 0;
5508       }
5509 
5510     /* There is at least one situation where code goes backwards: this is the
5511     case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier
5512     is processed, the whole class is eliminated. However, it is created first,
5513     so we have to allow memory for it. Therefore, don't ever reduce the length
5514     at this point. */
5515 
5516     if (code < last_code) code = last_code;
5517 
5518     /* If the next thing is not a quantifier, we add the length of the previous
5519     item into the total, and reset the code pointer to the start of the
5520     workspace. Otherwise leave the previous item available to be quantified. */
5521 
5522     if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5523       {
5524       if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))
5525         {
5526         *errorcodeptr = ERR20;   /* Integer overflow */
5527         return 0;
5528         }
5529       *lengthptr += (PCRE2_SIZE)(code - orig_code);
5530       if (*lengthptr > MAX_PATTERN_SIZE)
5531         {
5532         *errorcodeptr = ERR20;   /* Pattern is too large */
5533         return 0;
5534         }
5535       code = orig_code;
5536       }
5537 
5538     /* Remember where this code item starts so we can catch the "backwards"
5539     case above next time round. */
5540 
5541     last_code = code;
5542     }
5543 
5544   /* Process the next parsed pattern item. If it is not a quantifier, remember
5545   where it starts so that it can be quantified when a quantifier follows.
5546   Checking for the legality of quantifiers happens in parse_regex(), except for
5547   a quantifier after an assertion that is a condition. */
5548 
5549   if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5550     {
5551     previous = code;
5552     if (matched_char && !had_accept) okreturn = 1;
5553     }
5554 
5555   previous_matched_char = matched_char;
5556   matched_char = FALSE;
5557   note_group_empty = FALSE;
5558   skipunits = 0;         /* Default value for most subgroups */
5559 
5560   switch(meta)
5561     {
5562     /* ===================================================================*/
5563     /* The branch terminates at pattern end or | or ) */
5564 
5565     case META_END:
5566     case META_ALT:
5567     case META_KET:
5568     *firstcuptr = firstcu;
5569     *firstcuflagsptr = firstcuflags;
5570     *reqcuptr = reqcu;
5571     *reqcuflagsptr = reqcuflags;
5572     *codeptr = code;
5573     *pptrptr = pptr;
5574     return okreturn;
5575 
5576 
5577     /* ===================================================================*/
5578     /* Handle single-character metacharacters. In multiline mode, ^ disables
5579     the setting of any following char as a first character. */
5580 
5581     case META_CIRCUMFLEX:
5582     if ((options & PCRE2_MULTILINE) != 0)
5583       {
5584       if (firstcuflags == REQ_UNSET)
5585         zerofirstcuflags = firstcuflags = REQ_NONE;
5586       *code++ = OP_CIRCM;
5587       }
5588     else *code++ = OP_CIRC;
5589     break;
5590 
5591     case META_DOLLAR:
5592     *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
5593     break;
5594 
5595     /* There can never be a first char if '.' is first, whatever happens about
5596     repeats. The value of reqcu doesn't change either. */
5597 
5598     case META_DOT:
5599     matched_char = TRUE;
5600     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5601     zerofirstcu = firstcu;
5602     zerofirstcuflags = firstcuflags;
5603     zeroreqcu = reqcu;
5604     zeroreqcuflags = reqcuflags;
5605     *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
5606     break;
5607 
5608 
5609     /* ===================================================================*/
5610     /* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.
5611     Otherwise, an initial ']' is taken as a data character. When empty classes
5612     are allowed, [] must always fail, so generate OP_FAIL, whereas [^] must
5613     match any character, so generate OP_ALLANY. */
5614 
5615     case META_CLASS_EMPTY:
5616     case META_CLASS_EMPTY_NOT:
5617     matched_char = TRUE;
5618     *code++ = (meta == META_CLASS_EMPTY_NOT)? OP_ALLANY : OP_FAIL;
5619     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5620     zerofirstcu = firstcu;
5621     zerofirstcuflags = firstcuflags;
5622     break;
5623 
5624 
5625     /* ===================================================================*/
5626     /* Non-empty character class. If the included characters are all < 256, we
5627     build a 32-byte bitmap of the permitted characters, except in the special
5628     case where there is only one such character. For negated classes, we build
5629     the map as usual, then invert it at the end. However, we use a different
5630     opcode so that data characters > 255 can be handled correctly.
5631 
5632     If the class contains characters outside the 0-255 range, a different
5633     opcode is compiled. It may optionally have a bit map for characters < 256,
5634     but those above are are explicitly listed afterwards. A flag code unit
5635     tells whether the bitmap is present, and whether this is a negated class or
5636     not. */
5637 
5638     case META_CLASS_NOT:
5639     case META_CLASS:
5640     matched_char = TRUE;
5641     negate_class = meta == META_CLASS_NOT;
5642 
5643     /* We can optimize the case of a single character in a class by generating
5644     OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's
5645     negative. In the negative case there can be no first char if this item is
5646     first, whatever repeat count may follow. In the case of reqcu, save the
5647     previous value for reinstating. */
5648 
5649     /* NOTE: at present this optimization is not effective if the only
5650     character in a class in 32-bit, non-UCP mode has its top bit set. */
5651 
5652     if (pptr[1] < META_END && pptr[2] == META_CLASS_END)
5653       {
5654 #ifdef SUPPORT_UNICODE
5655       uint32_t d;
5656 #endif
5657       uint32_t c = pptr[1];
5658 
5659       pptr += 2;                 /* Move on to class end */
5660       if (meta == META_CLASS)    /* A positive one-char class can be */
5661         {                        /* handled as a normal literal character. */
5662         meta = c;                /* Set up the character */
5663         goto NORMAL_CHAR_SET;
5664         }
5665 
5666       /* Handle a negative one-character class */
5667 
5668       zeroreqcu = reqcu;
5669       zeroreqcuflags = reqcuflags;
5670       if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5671       zerofirstcu = firstcu;
5672       zerofirstcuflags = firstcuflags;
5673 
5674       /* For caseless UTF or UCP mode, check whether this character has more
5675       than one other case. If so, generate a special OP_NOTPROP item instead of
5676       OP_NOTI. */
5677 
5678 #ifdef SUPPORT_UNICODE
5679       if ((utf||ucp) && (options & PCRE2_CASELESS) != 0 &&
5680           (d = UCD_CASESET(c)) != 0)
5681         {
5682         *code++ = OP_NOTPROP;
5683         *code++ = PT_CLIST;
5684         *code++ = d;
5685         break;   /* We are finished with this class */
5686         }
5687 #endif
5688       /* Char has only one other case, or UCP not available */
5689 
5690       *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
5691       code += PUTCHAR(c, code);
5692       break;   /* We are finished with this class */
5693       }        /* End of 1-char optimization */
5694 
5695     /* Handle character classes that contain more than just one literal
5696     character. If there are exactly two characters in a positive class, see if
5697     they are case partners. This can be optimized to generate a caseless single
5698     character match (which also sets first/required code units if relevant). */
5699 
5700     if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END &&
5701         pptr[3] == META_CLASS_END)
5702       {
5703       uint32_t c = pptr[1];
5704 
5705 #ifdef SUPPORT_UNICODE
5706       if (UCD_CASESET(c) == 0)
5707 #endif
5708         {
5709         uint32_t d;
5710 
5711 #ifdef SUPPORT_UNICODE
5712         if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
5713 #endif
5714           {
5715 #if PCRE2_CODE_UNIT_WIDTH != 8
5716           if (c > 255) d = c; else
5717 #endif
5718           d = TABLE_GET(c, cb->fcc, c);
5719           }
5720 
5721         if (c != d && pptr[2] == d)
5722           {
5723           pptr += 3;                 /* Move on to class end */
5724           meta = c;
5725           if ((options & PCRE2_CASELESS) == 0)
5726             {
5727             reset_caseful = TRUE;
5728             options |= PCRE2_CASELESS;
5729             req_caseopt = REQ_CASELESS;
5730             }
5731           goto CLASS_CASELESS_CHAR;
5732           }
5733         }
5734       }
5735 
5736     /* If a non-extended class contains a negative special such as \S, we need
5737     to flip the negation flag at the end, so that support for characters > 255
5738     works correctly (they are all included in the class). An extended class may
5739     need to insert specific matching or non-matching code for wide characters.
5740     */
5741 
5742     should_flip_negation = match_all_or_no_wide_chars = FALSE;
5743 
5744     /* Extended class (xclass) will be used when characters > 255
5745     might match. */
5746 
5747 #ifdef SUPPORT_WIDE_CHARS
5748     xclass = FALSE;
5749     class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
5750     class_uchardata_base = class_uchardata;   /* Save the start */
5751 #endif
5752 
5753     /* For optimization purposes, we track some properties of the class:
5754     class_has_8bitchar will be non-zero if the class contains at least one
5755     character with a code point less than 256; xclass_has_prop will be TRUE if
5756     Unicode property checks are present in the class. */
5757 
5758     class_has_8bitchar = 0;
5759 #ifdef SUPPORT_WIDE_CHARS
5760     xclass_has_prop = FALSE;
5761 #endif
5762 
5763     /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
5764     in a temporary bit of memory, in case the class contains fewer than two
5765     8-bit characters because in that case the compiled code doesn't use the bit
5766     map. */
5767 
5768     memset(classbits, 0, 32 * sizeof(uint8_t));
5769 
5770     /* Process items until META_CLASS_END is reached. */
5771 
5772     while ((meta = *(++pptr)) != META_CLASS_END)
5773       {
5774       /* Handle POSIX classes such as [:alpha:] etc. */
5775 
5776       if (meta == META_POSIX || meta == META_POSIX_NEG)
5777         {
5778         BOOL local_negate = (meta == META_POSIX_NEG);
5779         int posix_class = *(++pptr);
5780         int taboffset, tabopt;
5781         uint8_t pbits[32];
5782 
5783         should_flip_negation = local_negate;  /* Note negative special */
5784 
5785         /* If matching is caseless, upper and lower are converted to alpha.
5786         This relies on the fact that the class table starts with alpha,
5787         lower, upper as the first 3 entries. */
5788 
5789         if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)
5790           posix_class = 0;
5791 
5792         /* When PCRE2_UCP is set, some of the POSIX classes are converted to
5793         different escape sequences that use Unicode properties \p or \P.
5794         Others that are not available via \p or \P have to generate
5795         XCL_PROP/XCL_NOTPROP directly, which is done here. */
5796 
5797 #ifdef SUPPORT_UNICODE
5798         if ((options & PCRE2_UCP) != 0) switch(posix_class)
5799           {
5800           case PC_GRAPH:
5801           case PC_PRINT:
5802           case PC_PUNCT:
5803           *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
5804           *class_uchardata++ = (PCRE2_UCHAR)
5805             ((posix_class == PC_GRAPH)? PT_PXGRAPH :
5806              (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT);
5807           *class_uchardata++ = 0;
5808           xclass_has_prop = TRUE;
5809           goto CONTINUE_CLASS;
5810 
5811           /* For the other POSIX classes (ascii, xdigit) we are going to
5812           fall through to the non-UCP case and build a bit map for
5813           characters with code points less than 256. However, if we are in
5814           a negated POSIX class, characters with code points greater than
5815           255 must either all match or all not match, depending on whether
5816           the whole class is not or is negated. For example, for
5817           [[:^ascii:]... they must all match, whereas for [^[:^xdigit:]...
5818           they must not.
5819 
5820           In the special case where there are no xclass items, this is
5821           automatically handled by the use of OP_CLASS or OP_NCLASS, but an
5822           explicit range is needed for OP_XCLASS. Setting a flag here
5823           causes the range to be generated later when it is known that
5824           OP_XCLASS is required. In the 8-bit library this is relevant only in
5825           utf mode, since no wide characters can exist otherwise. */
5826 
5827           default:
5828 #if PCRE2_CODE_UNIT_WIDTH == 8
5829           if (utf)
5830 #endif
5831           match_all_or_no_wide_chars |= local_negate;
5832           break;
5833           }
5834 #endif  /* SUPPORT_UNICODE */
5835 
5836         /* In the non-UCP case, or when UCP makes no difference, we build the
5837         bit map for the POSIX class in a chunk of local store because we may
5838         be adding and subtracting from it, and we don't want to subtract bits
5839         that may be in the main map already. At the end we or the result into
5840         the bit map that is being built. */
5841 
5842         posix_class *= 3;
5843 
5844         /* Copy in the first table (always present) */
5845 
5846         memcpy(pbits, cbits + posix_class_maps[posix_class],
5847           32 * sizeof(uint8_t));
5848 
5849         /* If there is a second table, add or remove it as required. */
5850 
5851         taboffset = posix_class_maps[posix_class + 1];
5852         tabopt = posix_class_maps[posix_class + 2];
5853 
5854         if (taboffset >= 0)
5855           {
5856           if (tabopt >= 0)
5857             for (int i = 0; i < 32; i++) pbits[i] |= cbits[(int)i + taboffset];
5858           else
5859             for (int i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset];
5860           }
5861 
5862         /* Now see if we need to remove any special characters. An option
5863         value of 1 removes vertical space and 2 removes underscore. */
5864 
5865         if (tabopt < 0) tabopt = -tabopt;
5866         if (tabopt == 1) pbits[1] &= ~0x3c;
5867           else if (tabopt == 2) pbits[11] &= 0x7f;
5868 
5869         /* Add the POSIX table or its complement into the main table that is
5870         being built and we are done. */
5871 
5872         if (local_negate)
5873           for (int i = 0; i < 32; i++) classbits[i] |= (uint8_t)(~pbits[i]);
5874         else
5875           for (int i = 0; i < 32; i++) classbits[i] |= pbits[i];
5876 
5877         /* Every class contains at least one < 256 character. */
5878 
5879         class_has_8bitchar = 1;
5880         goto CONTINUE_CLASS;    /* End of POSIX handling */
5881         }
5882 
5883       /* Other than POSIX classes, the only items we should encounter are
5884       \d-type escapes and literal characters (possibly as ranges). */
5885 
5886       if (meta == META_BIGVALUE)
5887         {
5888         meta = *(++pptr);
5889         goto CLASS_LITERAL;
5890         }
5891 
5892       /* Any other non-literal must be an escape */
5893 
5894       if (meta >= META_END)
5895         {
5896         if (META_CODE(meta) != META_ESCAPE)
5897           {
5898 #ifdef DEBUG_SHOW_PARSED
5899           fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x "
5900                           "in character class\n", meta);
5901 #endif
5902           *errorcodeptr = ERR89;  /* Internal error - unrecognized. */
5903           return 0;
5904           }
5905         escape = META_DATA(meta);
5906 
5907         /* Every class contains at least one < 256 character. */
5908 
5909         class_has_8bitchar++;
5910 
5911         switch(escape)
5912           {
5913           case ESC_d:
5914           for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
5915           break;
5916 
5917           case ESC_D:
5918           should_flip_negation = TRUE;
5919           for (int i = 0; i < 32; i++)
5920             classbits[i] |= (uint8_t)(~cbits[i+cbit_digit]);
5921           break;
5922 
5923           case ESC_w:
5924           for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
5925           break;
5926 
5927           case ESC_W:
5928           should_flip_negation = TRUE;
5929           for (int i = 0; i < 32; i++)
5930             classbits[i] |= (uint8_t)(~cbits[i+cbit_word]);
5931           break;
5932 
5933           /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5934           5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5935           previously set by something earlier in the character class.
5936           Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5937           we could just adjust the appropriate bit. From PCRE 8.34 we no
5938           longer treat \s and \S specially. */
5939 
5940           case ESC_s:
5941           for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
5942           break;
5943 
5944           case ESC_S:
5945           should_flip_negation = TRUE;
5946           for (int i = 0; i < 32; i++)
5947             classbits[i] |= (uint8_t)(~cbits[i+cbit_space]);
5948           break;
5949 
5950           /* When adding the horizontal or vertical space lists to a class, or
5951           their complements, disable PCRE2_CASELESS, because it just wastes
5952           time, and in the "not-x" UTF cases can create unwanted duplicates in
5953           the XCLASS list (provoked by characters that have more than one other
5954           case and by both cases being in the same "not-x" sublist). */
5955 
5956           case ESC_h:
5957           (void)add_list_to_class(classbits, &class_uchardata,
5958             options & ~PCRE2_CASELESS, cb, PRIV(hspace_list), NOTACHAR);
5959           break;
5960 
5961           case ESC_H:
5962           (void)add_not_list_to_class(classbits, &class_uchardata,
5963             options & ~PCRE2_CASELESS, cb, PRIV(hspace_list));
5964           break;
5965 
5966           case ESC_v:
5967           (void)add_list_to_class(classbits, &class_uchardata,
5968             options & ~PCRE2_CASELESS, cb, PRIV(vspace_list), NOTACHAR);
5969           break;
5970 
5971           case ESC_V:
5972           (void)add_not_list_to_class(classbits, &class_uchardata,
5973             options & ~PCRE2_CASELESS, cb, PRIV(vspace_list));
5974           break;
5975 
5976           /* If Unicode is not supported, \P and \p are not allowed and are
5977           faulted at parse time, so will never appear here. */
5978 
5979 #ifdef SUPPORT_UNICODE
5980           case ESC_p:
5981           case ESC_P:
5982             {
5983             uint32_t ptype = *(++pptr) >> 16;
5984             uint32_t pdata = *pptr & 0xffff;
5985             *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP;
5986             *class_uchardata++ = ptype;
5987             *class_uchardata++ = pdata;
5988             xclass_has_prop = TRUE;
5989             class_has_8bitchar--;                /* Undo! */
5990             }
5991           break;
5992 #endif
5993           }
5994 
5995         goto CONTINUE_CLASS;
5996         }  /* End handling \d-type escapes */
5997 
5998       /* A literal character may be followed by a range meta. At parse time
5999       there are checks for out-of-order characters, for ranges where the two
6000       characters are equal, and for hyphens that cannot indicate a range. At
6001       this point, therefore, no checking is needed. */
6002 
6003       else
6004         {
6005         uint32_t c, d;
6006 
6007         CLASS_LITERAL:
6008         c = d = meta;
6009 
6010         /* Remember if \r or \n were explicitly used */
6011 
6012         if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
6013 
6014         /* Process a character range */
6015 
6016         if (pptr[1] == META_RANGE_LITERAL || pptr[1] == META_RANGE_ESCAPED)
6017           {
6018 #ifdef EBCDIC
6019           BOOL range_is_literal = (pptr[1] == META_RANGE_LITERAL);
6020 #endif
6021           pptr += 2;
6022           d = *pptr;
6023           if (d == META_BIGVALUE) d = *(++pptr);
6024 
6025           /* Remember an explicit \r or \n, and add the range to the class. */
6026 
6027           if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
6028 
6029           /* In an EBCDIC environment, Perl treats alphabetic ranges specially
6030           because there are holes in the encoding, and simply using the range
6031           A-Z (for example) would include the characters in the holes. This
6032           applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
6033 
6034 #ifdef EBCDIC
6035           if (range_is_literal &&
6036                (cb->ctypes[c] & ctype_letter) != 0 &&
6037                (cb->ctypes[d] & ctype_letter) != 0 &&
6038                (c <= CHAR_z) == (d <= CHAR_z))
6039             {
6040             uint32_t uc = (d <= CHAR_z)? 0 : 64;
6041             uint32_t C = c - uc;
6042             uint32_t D = d - uc;
6043 
6044             if (C <= CHAR_i)
6045               {
6046               class_has_8bitchar +=
6047                 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
6048                   ((D < CHAR_i)? D : CHAR_i) + uc);
6049               C = CHAR_j;
6050               }
6051 
6052             if (C <= D && C <= CHAR_r)
6053               {
6054               class_has_8bitchar +=
6055                 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
6056                   ((D < CHAR_r)? D : CHAR_r) + uc);
6057               C = CHAR_s;
6058               }
6059 
6060             if (C <= D)
6061               {
6062               class_has_8bitchar +=
6063                 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
6064                   D + uc);
6065               }
6066             }
6067           else
6068 #endif
6069           /* Not an EBCDIC special range */
6070 
6071           class_has_8bitchar +=
6072             add_to_class(classbits, &class_uchardata, options, cb, c, d);
6073           goto CONTINUE_CLASS;   /* Go get the next char in the class */
6074           }  /* End of range handling */
6075 
6076 
6077         /* Handle a single character. */
6078 
6079         class_has_8bitchar +=
6080           add_to_class(classbits, &class_uchardata, options, cb, meta, meta);
6081         }
6082 
6083       /* Continue to the next item in the class. */
6084 
6085       CONTINUE_CLASS:
6086 
6087 #ifdef SUPPORT_WIDE_CHARS
6088       /* If any wide characters or Unicode properties have been encountered,
6089       set xclass = TRUE. Then, in the pre-compile phase, accumulate the length
6090       of the extra data and reset the pointer. This is so that very large
6091       classes that contain a zillion wide characters or Unicode property tests
6092       do not overwrite the workspace (which is on the stack). */
6093 
6094       if (class_uchardata > class_uchardata_base)
6095         {
6096         xclass = TRUE;
6097         if (lengthptr != NULL)
6098           {
6099           *lengthptr += class_uchardata - class_uchardata_base;
6100           class_uchardata = class_uchardata_base;
6101           }
6102         }
6103 #endif
6104 
6105       continue;  /* Needed to avoid error when not supporting wide chars */
6106       }   /* End of main class-processing loop */
6107 
6108     /* If this class is the first thing in the branch, there can be no first
6109     char setting, whatever the repeat count. Any reqcu setting must remain
6110     unchanged after any kind of repeat. */
6111 
6112     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6113     zerofirstcu = firstcu;
6114     zerofirstcuflags = firstcuflags;
6115     zeroreqcu = reqcu;
6116     zeroreqcuflags = reqcuflags;
6117 
6118     /* If there are characters with values > 255, or Unicode property settings
6119     (\p or \P), we have to compile an extended class, with its own opcode,
6120     unless there were no property settings and there was a negated special such
6121     as \S in the class, and PCRE2_UCP is not set, because in that case all
6122     characters > 255 are in or not in the class, so any that were explicitly
6123     given as well can be ignored.
6124 
6125     In the UCP case, if certain negated POSIX classes ([:^ascii:] or
6126     [:^xdigit:]) were present in a class, we either have to match or not match
6127     all wide characters (depending on whether the whole class is or is not
6128     negated). This requirement is indicated by match_all_or_no_wide_chars being
6129     true. We do this by including an explicit range, which works in both cases.
6130     This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there
6131     cannot be any wide characters in 8-bit non-UTF mode.
6132 
6133     When there *are* properties in a positive UTF-8 or any 16-bit or 32-bit
6134     class where \S etc is present without PCRE2_UCP, causing an extended class
6135     to be compiled, we make sure that all characters > 255 are included by
6136     forcing match_all_or_no_wide_chars to be true.
6137 
6138     If, when generating an xclass, there are no characters < 256, we can omit
6139     the bitmap in the actual compiled code. */
6140 
6141 #ifdef SUPPORT_WIDE_CHARS  /* Defined for 16/32 bits, or 8-bit with Unicode */
6142     if (xclass && (
6143 #ifdef SUPPORT_UNICODE
6144         (options & PCRE2_UCP) != 0 ||
6145 #endif
6146         xclass_has_prop || !should_flip_negation))
6147       {
6148       if (match_all_or_no_wide_chars || (
6149 #if PCRE2_CODE_UNIT_WIDTH == 8
6150            utf &&
6151 #endif
6152            should_flip_negation && !negate_class && (options & PCRE2_UCP) == 0))
6153         {
6154         *class_uchardata++ = XCL_RANGE;
6155         if (utf)   /* Will always be utf in the 8-bit library */
6156           {
6157           class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
6158           class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata);
6159           }
6160         else       /* Can only happen for the 16-bit & 32-bit libraries */
6161           {
6162 #if PCRE2_CODE_UNIT_WIDTH == 16
6163           *class_uchardata++ = 0x100;
6164           *class_uchardata++ = 0xffffu;
6165 #elif PCRE2_CODE_UNIT_WIDTH == 32
6166           *class_uchardata++ = 0x100;
6167           *class_uchardata++ = 0xffffffffu;
6168 #endif
6169           }
6170         }
6171       *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
6172       *code++ = OP_XCLASS;
6173       code += LINK_SIZE;
6174       *code = negate_class? XCL_NOT:0;
6175       if (xclass_has_prop) *code |= XCL_HASPROP;
6176 
6177       /* If the map is required, move up the extra data to make room for it;
6178       otherwise just move the code pointer to the end of the extra data. */
6179 
6180       if (class_has_8bitchar > 0)
6181         {
6182         *code++ |= XCL_MAP;
6183         (void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,
6184           CU2BYTES(class_uchardata - code));
6185         if (negate_class && !xclass_has_prop)
6186           {
6187           /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
6188           for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
6189           }
6190         memcpy(code, classbits, 32);
6191         code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
6192         }
6193       else code = class_uchardata;
6194 
6195       /* Now fill in the complete length of the item */
6196 
6197       PUT(previous, 1, (int)(code - previous));
6198       break;   /* End of class handling */
6199       }
6200 #endif  /* SUPPORT_WIDE_CHARS */
6201 
6202     /* If there are no characters > 255, or they are all to be included or
6203     excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
6204     whole class was negated and whether there were negative specials such as \S
6205     (non-UCP) in the class. Then copy the 32-byte map into the code vector,
6206     negating it if necessary. */
6207 
6208     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
6209     if (lengthptr == NULL)    /* Save time in the pre-compile phase */
6210       {
6211       if (negate_class)
6212         {
6213        /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
6214        for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
6215        }
6216       memcpy(code, classbits, 32);
6217       }
6218     code += 32 / sizeof(PCRE2_UCHAR);
6219     break;  /* End of class processing */
6220 
6221 
6222     /* ===================================================================*/
6223     /* Deal with (*VERB)s. */
6224 
6225     /* Check for open captures before ACCEPT and close those that are within
6226     the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an
6227     assertion. In the first pass, just accumulate the length required;
6228     otherwise hitting (*ACCEPT) inside many nested parentheses can cause
6229     workspace overflow. Do not set firstcu after *ACCEPT. */
6230 
6231     case META_ACCEPT:
6232     cb->had_accept = had_accept = TRUE;
6233     for (oc = cb->open_caps;
6234          oc != NULL && oc->assert_depth >= cb->assert_depth;
6235          oc = oc->next)
6236       {
6237       if (lengthptr != NULL)
6238         {
6239         *lengthptr += CU2BYTES(1) + IMM2_SIZE;
6240         }
6241       else
6242         {
6243         *code++ = OP_CLOSE;
6244         PUT2INC(code, 0, oc->number);
6245         }
6246       }
6247     *code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6248     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6249     break;
6250 
6251     case META_PRUNE:
6252     case META_SKIP:
6253     cb->had_pruneorskip = TRUE;
6254     /* Fall through */
6255     case META_COMMIT:
6256     case META_FAIL:
6257     *code++ = verbops[(meta - META_MARK) >> 16];
6258     break;
6259 
6260     case META_THEN:
6261     cb->external_flags |= PCRE2_HASTHEN;
6262     *code++ = OP_THEN;
6263     break;
6264 
6265     /* Handle verbs with arguments. Arguments can be very long, especially in
6266     16- and 32-bit modes, and can overflow the workspace in the first pass.
6267     However, the argument length is constrained to be small enough to fit in
6268     one code unit. This check happens in parse_regex(). In the first pass,
6269     instead of putting the argument into memory, we just update the length
6270     counter and set up an empty argument. */
6271 
6272     case META_THEN_ARG:
6273     cb->external_flags |= PCRE2_HASTHEN;
6274     goto VERB_ARG;
6275 
6276     case META_PRUNE_ARG:
6277     case META_SKIP_ARG:
6278     cb->had_pruneorskip = TRUE;
6279     /* Fall through */
6280     case META_MARK:
6281     case META_COMMIT_ARG:
6282     VERB_ARG:
6283     *code++ = verbops[(meta - META_MARK) >> 16];
6284     /* The length is in characters. */
6285     verbarglen = *(++pptr);
6286     verbculen = 0;
6287     tempcode = code++;
6288     for (int i = 0; i < (int)verbarglen; i++)
6289       {
6290       meta = *(++pptr);
6291 #ifdef SUPPORT_UNICODE
6292       if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
6293 #endif
6294         {
6295         mclength = 1;
6296         mcbuffer[0] = meta;
6297         }
6298       if (lengthptr != NULL) *lengthptr += mclength; else
6299         {
6300         memcpy(code, mcbuffer, CU2BYTES(mclength));
6301         code += mclength;
6302         verbculen += mclength;
6303         }
6304       }
6305 
6306     *tempcode = verbculen;   /* Fill in the code unit length */
6307     *code++ = 0;             /* Terminating zero */
6308     break;
6309 
6310 
6311     /* ===================================================================*/
6312     /* Handle options change. The new setting must be passed back for use in
6313     subsequent branches. Reset the greedy defaults and the case value for
6314     firstcu and reqcu. */
6315 
6316     case META_OPTIONS:
6317     *optionsptr = options = *(++pptr);
6318     greedy_default = ((options & PCRE2_UNGREEDY) != 0);
6319     greedy_non_default = greedy_default ^ 1;
6320     req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6321     break;
6322 
6323 
6324     /* ===================================================================*/
6325     /* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous
6326     because it could be a numerical check on recursion, or a name check on a
6327     group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that
6328     we can handle it either way. We first try for a name; if not found, process
6329     the number. */
6330 
6331     case META_COND_RNUMBER:   /* (?(Rdigits) */
6332     case META_COND_NAME:      /* (?(name) or (?'name') or ?(<name>) */
6333     case META_COND_RNAME:     /* (?(R&name) - test for recursion */
6334     bravalue = OP_COND;
6335       {
6336       int count, index;
6337       unsigned int i;
6338       PCRE2_SPTR name;
6339       named_group *ng = cb->named_groups;
6340       uint32_t length = *(++pptr);
6341 
6342       GETPLUSOFFSET(offset, pptr);
6343       name = cb->start_pattern + offset;
6344 
6345       /* In the first pass, the names generated in the pre-pass are available,
6346       but the main name table has not yet been created. Scan the list of names
6347       generated in the pre-pass in order to get a number and whether or not
6348       this name is duplicated. If it is not duplicated, we can handle it as a
6349       numerical group. */
6350 
6351       for (i = 0; i < cb->names_found; i++, ng++)
6352         {
6353         if (length == ng->length &&
6354             PRIV(strncmp)(name, ng->name, length) == 0)
6355           {
6356           if (!ng->isdup)
6357             {
6358             code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6359             PUT2(code, 2+LINK_SIZE, ng->number);
6360             if (ng->number > cb->top_backref) cb->top_backref = ng->number;
6361             skipunits = 1+IMM2_SIZE;
6362             goto GROUP_PROCESS_NOTE_EMPTY;
6363             }
6364           break;  /* Found a duplicated name */
6365           }
6366         }
6367 
6368       /* If the name was not found we have a bad reference, unless we are
6369       dealing with R<digits>, which is treated as a recursion test by number.
6370       */
6371 
6372       if (i >= cb->names_found)
6373         {
6374         groupnumber = 0;
6375         if (meta == META_COND_RNUMBER)
6376           {
6377           for (i = 1; i < length; i++)
6378             {
6379             groupnumber = groupnumber * 10 + name[i] - CHAR_0;
6380             if (groupnumber > MAX_GROUP_NUMBER)
6381               {
6382               *errorcodeptr = ERR61;
6383               cb->erroroffset = offset + i;
6384               return 0;
6385               }
6386             }
6387           }
6388 
6389         if (meta != META_COND_RNUMBER || groupnumber > cb->bracount)
6390           {
6391           *errorcodeptr = ERR15;
6392           cb->erroroffset = offset;
6393           return 0;
6394           }
6395 
6396         /* (?(Rdigits) is treated as a recursion test by number. A value of
6397         zero (which is the result of both (?(R) and (?(R0)) means "any", and is
6398         translated into RREF_ANY (which is 0xffff). */
6399 
6400         if (groupnumber == 0) groupnumber = RREF_ANY;
6401         code[1+LINK_SIZE] = OP_RREF;
6402         PUT2(code, 2+LINK_SIZE, groupnumber);
6403         skipunits = 1+IMM2_SIZE;
6404         goto GROUP_PROCESS_NOTE_EMPTY;
6405         }
6406 
6407       /* A duplicated name was found. Note that if an R<digits> name is found
6408       (META_COND_RNUMBER), it is a reference test, not a recursion test. */
6409 
6410       code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6411 
6412       /* We have a duplicated name. In the compile pass we have to search the
6413       main table in order to get the index and count values. */
6414 
6415       count = 0;  /* Values for first pass (avoids compiler warning) */
6416       index = 0;
6417       if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6418             &count, errorcodeptr, cb)) return 0;
6419 
6420       /* Add one to the opcode to change CREF/RREF into DNCREF/DNRREF and
6421       insert appropriate data values. */
6422 
6423       code[1+LINK_SIZE]++;
6424       skipunits = 1+2*IMM2_SIZE;
6425       PUT2(code, 2+LINK_SIZE, index);
6426       PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6427       }
6428     goto GROUP_PROCESS_NOTE_EMPTY;
6429 
6430     /* The DEFINE condition is always false. Its internal groups may never
6431     be called, so matched_char must remain false, hence the jump to
6432     GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
6433 
6434     case META_COND_DEFINE:
6435     bravalue = OP_COND;
6436     GETPLUSOFFSET(offset, pptr);
6437     code[1+LINK_SIZE] = OP_DEFINE;
6438     skipunits = 1;
6439     goto GROUP_PROCESS;
6440 
6441     /* Conditional test of a group's being set. */
6442 
6443     case META_COND_NUMBER:
6444     bravalue = OP_COND;
6445     GETPLUSOFFSET(offset, pptr);
6446     groupnumber = *(++pptr);
6447     if (groupnumber > cb->bracount)
6448       {
6449       *errorcodeptr = ERR15;
6450       cb->erroroffset = offset;
6451       return 0;
6452       }
6453     if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
6454     offset -= 2;   /* Point at initial ( for too many branches error */
6455     code[1+LINK_SIZE] = OP_CREF;
6456     skipunits = 1+IMM2_SIZE;
6457     PUT2(code, 2+LINK_SIZE, groupnumber);
6458     goto GROUP_PROCESS_NOTE_EMPTY;
6459 
6460     /* Test for the PCRE2 version. */
6461 
6462     case META_COND_VERSION:
6463     bravalue = OP_COND;
6464     if (pptr[1] > 0)
6465       code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) ||
6466         (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))?
6467           OP_TRUE : OP_FALSE;
6468     else
6469       code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])?
6470         OP_TRUE : OP_FALSE;
6471     skipunits = 1;
6472     pptr += 3;
6473     goto GROUP_PROCESS_NOTE_EMPTY;
6474 
6475     /* The condition is an assertion, possibly preceded by a callout. */
6476 
6477     case META_COND_ASSERT:
6478     bravalue = OP_COND;
6479     goto GROUP_PROCESS_NOTE_EMPTY;
6480 
6481 
6482     /* ===================================================================*/
6483     /* Handle all kinds of nested bracketed groups. The non-capturing,
6484     non-conditional cases are here; others come to GROUP_PROCESS via goto. */
6485 
6486     case META_LOOKAHEAD:
6487     bravalue = OP_ASSERT;
6488     cb->assert_depth += 1;
6489     goto GROUP_PROCESS;
6490 
6491     case META_LOOKAHEAD_NA:
6492     bravalue = OP_ASSERT_NA;
6493     cb->assert_depth += 1;
6494     goto GROUP_PROCESS;
6495 
6496     /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6497     thing to do, but Perl allows all assertions to be quantified, and when
6498     they contain capturing parentheses there may be a potential use for
6499     this feature. Not that that applies to a quantified (?!) but we allow
6500     it for uniformity. */
6501 
6502     case META_LOOKAHEADNOT:
6503     if (pptr[1] == META_KET &&
6504          (pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY))
6505       {
6506       *code++ = OP_FAIL;
6507       pptr++;
6508       }
6509     else
6510       {
6511       bravalue = OP_ASSERT_NOT;
6512       cb->assert_depth += 1;
6513       goto GROUP_PROCESS;
6514       }
6515     break;
6516 
6517     case META_LOOKBEHIND:
6518     bravalue = OP_ASSERTBACK;
6519     cb->assert_depth += 1;
6520     goto GROUP_PROCESS;
6521 
6522     case META_LOOKBEHINDNOT:
6523     bravalue = OP_ASSERTBACK_NOT;
6524     cb->assert_depth += 1;
6525     goto GROUP_PROCESS;
6526 
6527     case META_LOOKBEHIND_NA:
6528     bravalue = OP_ASSERTBACK_NA;
6529     cb->assert_depth += 1;
6530     goto GROUP_PROCESS;
6531 
6532     case META_ATOMIC:
6533     bravalue = OP_ONCE;
6534     goto GROUP_PROCESS_NOTE_EMPTY;
6535 
6536     case META_SCRIPT_RUN:
6537     bravalue = OP_SCRIPT_RUN;
6538     goto GROUP_PROCESS_NOTE_EMPTY;
6539 
6540     case META_NOCAPTURE:
6541     bravalue = OP_BRA;
6542     /* Fall through */
6543 
6544     /* Process nested bracketed regex. The nesting depth is maintained for the
6545     benefit of the stackguard function. The test for too deep nesting is now
6546     done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
6547     others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
6548     note of whether or not they may match an empty string. */
6549 
6550     GROUP_PROCESS_NOTE_EMPTY:
6551     note_group_empty = TRUE;
6552 
6553     GROUP_PROCESS:
6554     cb->parens_depth += 1;
6555     *code = bravalue;
6556     pptr++;
6557     tempcode = code;
6558     tempreqvary = cb->req_varyopt;        /* Save value before group */
6559     length_prevgroup = 0;                 /* Initialize for pre-compile phase */
6560 
6561     if ((group_return =
6562          compile_regex(
6563          options,                         /* The option state */
6564          &tempcode,                       /* Where to put code (updated) */
6565          &pptr,                           /* Input pointer (updated) */
6566          errorcodeptr,                    /* Where to put an error message */
6567          skipunits,                       /* Skip over bracket number */
6568          &subfirstcu,                     /* For possible first char */
6569          &subfirstcuflags,
6570          &subreqcu,                       /* For possible last char */
6571          &subreqcuflags,
6572          bcptr,                           /* Current branch chain */
6573          cb,                              /* Compile data block */
6574          (lengthptr == NULL)? NULL :      /* Actual compile phase */
6575            &length_prevgroup              /* Pre-compile phase */
6576          )) == 0)
6577       return 0;  /* Error */
6578 
6579     cb->parens_depth -= 1;
6580 
6581     /* If that was a non-conditional significant group (not an assertion, not a
6582     DEFINE) that matches at least one character, then the current item matches
6583     a character. Conditionals are handled below. */
6584 
6585     if (note_group_empty && bravalue != OP_COND && group_return > 0)
6586       matched_char = TRUE;
6587 
6588     /* If we've just compiled an assertion, pop the assert depth. */
6589 
6590     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NA)
6591       cb->assert_depth -= 1;
6592 
6593     /* At the end of compiling, code is still pointing to the start of the
6594     group, while tempcode has been updated to point past the end of the group.
6595     The parsed pattern pointer (pptr) is on the closing META_KET.
6596 
6597     If this is a conditional bracket, check that there are no more than
6598     two branches in the group, or just one if it's a DEFINE group. We do this
6599     in the real compile phase, not in the pre-pass, where the whole group may
6600     not be available. */
6601 
6602     if (bravalue == OP_COND && lengthptr == NULL)
6603       {
6604       PCRE2_UCHAR *tc = code;
6605       int condcount = 0;
6606 
6607       do {
6608          condcount++;
6609          tc += GET(tc,1);
6610          }
6611       while (*tc != OP_KET);
6612 
6613       /* A DEFINE group is never obeyed inline (the "condition" is always
6614       false). It must have only one branch. Having checked this, change the
6615       opcode to OP_FALSE. */
6616 
6617       if (code[LINK_SIZE+1] == OP_DEFINE)
6618         {
6619         if (condcount > 1)
6620           {
6621           cb->erroroffset = offset;
6622           *errorcodeptr = ERR54;
6623           return 0;
6624           }
6625         code[LINK_SIZE+1] = OP_FALSE;
6626         bravalue = OP_DEFINE;   /* A flag to suppress char handling below */
6627         }
6628 
6629       /* A "normal" conditional group. If there is just one branch, we must not
6630       make use of its firstcu or reqcu, because this is equivalent to an
6631       empty second branch. Also, it may match an empty string. If there are two
6632       branches, this item must match a character if the group must. */
6633 
6634       else
6635         {
6636         if (condcount > 2)
6637           {
6638           cb->erroroffset = offset;
6639           *errorcodeptr = ERR27;
6640           return 0;
6641           }
6642         if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
6643           else if (group_return > 0) matched_char = TRUE;
6644         }
6645       }
6646 
6647     /* In the pre-compile phase, update the length by the length of the group,
6648     less the brackets at either end. Then reduce the compiled code to just a
6649     set of non-capturing brackets so that it doesn't use much memory if it is
6650     duplicated by a quantifier.*/
6651 
6652     if (lengthptr != NULL)
6653       {
6654       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
6655         {
6656         *errorcodeptr = ERR20;
6657         return 0;
6658         }
6659       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
6660       code++;   /* This already contains bravalue */
6661       PUTINC(code, 0, 1 + LINK_SIZE);
6662       *code++ = OP_KET;
6663       PUTINC(code, 0, 1 + LINK_SIZE);
6664       break;    /* No need to waste time with special character handling */
6665       }
6666 
6667     /* Otherwise update the main code pointer to the end of the group. */
6668 
6669     code = tempcode;
6670 
6671     /* For a DEFINE group, required and first character settings are not
6672     relevant. */
6673 
6674     if (bravalue == OP_DEFINE) break;
6675 
6676     /* Handle updating of the required and first code units for other types of
6677     group. Update for normal brackets of all kinds, and conditions with two
6678     branches (see code above). If the bracket is followed by a quantifier with
6679     zero repeat, we have to back off. Hence the definition of zeroreqcu and
6680     zerofirstcu outside the main loop so that they can be accessed for the back
6681     off. */
6682 
6683     zeroreqcu = reqcu;
6684     zeroreqcuflags = reqcuflags;
6685     zerofirstcu = firstcu;
6686     zerofirstcuflags = firstcuflags;
6687     groupsetfirstcu = FALSE;
6688 
6689     if (bravalue >= OP_ONCE)  /* Not an assertion */
6690       {
6691       /* If we have not yet set a firstcu in this branch, take it from the
6692       subpattern, remembering that it was set here so that a repeat of more
6693       than one can replicate it as reqcu if necessary. If the subpattern has
6694       no firstcu, set "none" for the whole branch. In both cases, a zero
6695       repeat forces firstcu to "none". */
6696 
6697       if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
6698         {
6699         if (subfirstcuflags < REQ_NONE)
6700           {
6701           firstcu = subfirstcu;
6702           firstcuflags = subfirstcuflags;
6703           groupsetfirstcu = TRUE;
6704           }
6705         else firstcuflags = REQ_NONE;
6706         zerofirstcuflags = REQ_NONE;
6707         }
6708 
6709       /* If firstcu was previously set, convert the subpattern's firstcu
6710       into reqcu if there wasn't one, using the vary flag that was in
6711       existence beforehand. */
6712 
6713       else if (subfirstcuflags < REQ_NONE && subreqcuflags >= REQ_NONE)
6714         {
6715         subreqcu = subfirstcu;
6716         subreqcuflags = subfirstcuflags | tempreqvary;
6717         }
6718 
6719       /* If the subpattern set a required code unit (or set a first code unit
6720       that isn't really the first code unit - see above), set it. */
6721 
6722       if (subreqcuflags < REQ_NONE)
6723         {
6724         reqcu = subreqcu;
6725         reqcuflags = subreqcuflags;
6726         }
6727       }
6728 
6729     /* For a forward assertion, we take the reqcu, if set, provided that the
6730     group has also set a firstcu. This can be helpful if the pattern that
6731     follows the assertion doesn't set a different char. For example, it's
6732     useful for /(?=abcde).+/. We can't set firstcu for an assertion, however
6733     because it leads to incorrect effect for patterns such as /(?=a)a.+/ when
6734     the "real" "a" would then become a reqcu instead of a firstcu. This is
6735     overcome by a scan at the end if there's no firstcu, looking for an
6736     asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
6737     we must only take the reqcu when the group also set a firstcu. Otherwise,
6738     in that example, 'X' ends up set for both. */
6739 
6740     else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) &&
6741              subreqcuflags < REQ_NONE && subfirstcuflags < REQ_NONE)
6742       {
6743       reqcu = subreqcu;
6744       reqcuflags = subreqcuflags;
6745       }
6746 
6747     break;  /* End of nested group handling */
6748 
6749 
6750     /* ===================================================================*/
6751     /* Handle named backreferences and recursions. */
6752 
6753     case META_BACKREF_BYNAME:
6754     case META_RECURSE_BYNAME:
6755       {
6756       int count, index;
6757       PCRE2_SPTR name;
6758       BOOL is_dupname = FALSE;
6759       named_group *ng = cb->named_groups;
6760       uint32_t length = *(++pptr);
6761 
6762       GETPLUSOFFSET(offset, pptr);
6763       name = cb->start_pattern + offset;
6764 
6765       /* In the first pass, the names generated in the pre-pass are available,
6766       but the main name table has not yet been created. Scan the list of names
6767       generated in the pre-pass in order to get a number and whether or not
6768       this name is duplicated. */
6769 
6770       groupnumber = 0;
6771       for (unsigned int i = 0; i < cb->names_found; i++, ng++)
6772         {
6773         if (length == ng->length &&
6774             PRIV(strncmp)(name, ng->name, length) == 0)
6775           {
6776           is_dupname = ng->isdup;
6777           groupnumber = ng->number;
6778 
6779           /* For a recursion, that's all that is needed. We can now go to
6780           the code that handles numerical recursion, applying it to the first
6781           group with the given name. */
6782 
6783           if (meta == META_RECURSE_BYNAME)
6784             {
6785             meta_arg = groupnumber;
6786             goto HANDLE_NUMERICAL_RECURSION;
6787             }
6788 
6789           /* For a back reference, update the back reference map and the
6790           maximum back reference. */
6791 
6792           cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
6793           if (groupnumber > cb->top_backref)
6794             cb->top_backref = groupnumber;
6795           }
6796         }
6797 
6798       /* If the name was not found we have a bad reference. */
6799 
6800       if (groupnumber == 0)
6801         {
6802         *errorcodeptr = ERR15;
6803         cb->erroroffset = offset;
6804         return 0;
6805         }
6806 
6807       /* If a back reference name is not duplicated, we can handle it as
6808       a numerical reference. */
6809 
6810       if (!is_dupname)
6811         {
6812         meta_arg = groupnumber;
6813         goto HANDLE_SINGLE_REFERENCE;
6814         }
6815 
6816       /* If a back reference name is duplicated, we generate a different
6817       opcode to a numerical back reference. In the second pass we must
6818       search for the index and count in the final name table. */
6819 
6820       count = 0;  /* Values for first pass (avoids compiler warning) */
6821       index = 0;
6822       if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6823             &count, errorcodeptr, cb)) return 0;
6824 
6825       if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6826       *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
6827       PUT2INC(code, 0, index);
6828       PUT2INC(code, 0, count);
6829       }
6830     break;
6831 
6832 
6833     /* ===================================================================*/
6834     /* Handle a numerical callout. */
6835 
6836     case META_CALLOUT_NUMBER:
6837     code[0] = OP_CALLOUT;
6838     PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
6839     PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
6840     code[1 + 2*LINK_SIZE] = pptr[3];
6841     pptr += 3;
6842     code += PRIV(OP_lengths)[OP_CALLOUT];
6843     break;
6844 
6845 
6846     /* ===================================================================*/
6847     /* Handle a callout with a string argument. In the pre-pass we just compute
6848     the length without generating anything. The length in pptr[3] includes both
6849     delimiters; in the actual compile only the first one is copied, but a
6850     terminating zero is added. Any doubled delimiters within the string make
6851     this an overestimate, but it is not worth bothering about. */
6852 
6853     case META_CALLOUT_STRING:
6854     if (lengthptr != NULL)
6855       {
6856       *lengthptr += pptr[3] + (1 + 4*LINK_SIZE);
6857       pptr += 3;
6858       SKIPOFFSET(pptr);
6859       }
6860 
6861     /* In the real compile we can copy the string. The starting delimiter is
6862      included so that the client can discover it if they want. We also pass the
6863      start offset to help a script language give better error messages. */
6864 
6865     else
6866       {
6867       PCRE2_SPTR pp;
6868       uint32_t delimiter;
6869       uint32_t length = pptr[3];
6870       PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
6871 
6872       code[0] = OP_CALLOUT_STR;
6873       PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
6874       PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
6875 
6876       pptr += 3;
6877       GETPLUSOFFSET(offset, pptr);         /* Offset to string in pattern */
6878       pp = cb->start_pattern + offset;
6879       delimiter = *callout_string++ = *pp++;
6880       if (delimiter == CHAR_LEFT_CURLY_BRACKET)
6881         delimiter = CHAR_RIGHT_CURLY_BRACKET;
6882       PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1));  /* One after delimiter */
6883 
6884       /* The syntax of the pattern was checked in the parsing scan. The length
6885       includes both delimiters, but we have passed the opening one just above,
6886       so we reduce length before testing it. The test is for > 1 because we do
6887       not want to copy the final delimiter. This also ensures that pp[1] is
6888       accessible. */
6889 
6890       while (--length > 1)
6891         {
6892         if (*pp == delimiter && pp[1] == delimiter)
6893           {
6894           *callout_string++ = delimiter;
6895           pp += 2;
6896           length--;
6897           }
6898         else *callout_string++ = *pp++;
6899         }
6900       *callout_string++ = CHAR_NUL;
6901 
6902       /* Set the length of the entire item, the advance to its end. */
6903 
6904       PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code));
6905       code = callout_string;
6906       }
6907     break;
6908 
6909 
6910     /* ===================================================================*/
6911     /* Handle repetition. The different types are all sorted out in the parsing
6912     pass. */
6913 
6914     case META_MINMAX_PLUS:
6915     case META_MINMAX_QUERY:
6916     case META_MINMAX:
6917     repeat_min = *(++pptr);
6918     repeat_max = *(++pptr);
6919     goto REPEAT;
6920 
6921     case META_ASTERISK:
6922     case META_ASTERISK_PLUS:
6923     case META_ASTERISK_QUERY:
6924     repeat_min = 0;
6925     repeat_max = REPEAT_UNLIMITED;
6926     goto REPEAT;
6927 
6928     case META_PLUS:
6929     case META_PLUS_PLUS:
6930     case META_PLUS_QUERY:
6931     repeat_min = 1;
6932     repeat_max = REPEAT_UNLIMITED;
6933     goto REPEAT;
6934 
6935     case META_QUERY:
6936     case META_QUERY_PLUS:
6937     case META_QUERY_QUERY:
6938     repeat_min = 0;
6939     repeat_max = 1;
6940 
6941     REPEAT:
6942     if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
6943 
6944     /* Remember whether this is a variable length repeat, and default to
6945     single-char opcodes. */
6946 
6947     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
6948     op_type = 0;
6949 
6950     /* Adjust first and required code units for a zero repeat. */
6951 
6952     if (repeat_min == 0)
6953       {
6954       firstcu = zerofirstcu;
6955       firstcuflags = zerofirstcuflags;
6956       reqcu = zeroreqcu;
6957       reqcuflags = zeroreqcuflags;
6958       }
6959 
6960     /* Note the greediness and possessiveness. */
6961 
6962     switch (meta)
6963       {
6964       case META_MINMAX_PLUS:
6965       case META_ASTERISK_PLUS:
6966       case META_PLUS_PLUS:
6967       case META_QUERY_PLUS:
6968       repeat_type = 0;                  /* Force greedy */
6969       possessive_quantifier = TRUE;
6970       break;
6971 
6972       case META_MINMAX_QUERY:
6973       case META_ASTERISK_QUERY:
6974       case META_PLUS_QUERY:
6975       case META_QUERY_QUERY:
6976       repeat_type = greedy_non_default;
6977       possessive_quantifier = FALSE;
6978       break;
6979 
6980       default:
6981       repeat_type = greedy_default;
6982       possessive_quantifier = FALSE;
6983       break;
6984       }
6985 
6986     /* Save start of previous item, in case we have to move it up in order to
6987     insert something before it, and remember what it was. */
6988 
6989     tempcode = previous;
6990     op_previous = *previous;
6991 
6992     /* Now handle repetition for the different types of item. If the repeat
6993     minimum and the repeat maximum are both 1, we can ignore the quantifier for
6994     non-parenthesized items, as they have only one alternative. For anything in
6995     parentheses, we must not ignore if {1} is possessive. */
6996 
6997     switch (op_previous)
6998       {
6999       /* If previous was a character or negated character match, abolish the
7000       item and generate a repeat item instead. If a char item has a minimum of
7001       more than one, ensure that it is set in reqcu - it might not be if a
7002       sequence such as x{3} is the first thing in a branch because the x will
7003       have gone into firstcu instead.  */
7004 
7005       case OP_CHAR:
7006       case OP_CHARI:
7007       case OP_NOT:
7008       case OP_NOTI:
7009       if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7010       op_type = chartypeoffset[op_previous - OP_CHAR];
7011 
7012       /* Deal with UTF characters that take up more than one code unit. */
7013 
7014 #ifdef MAYBE_UTF_MULTI
7015       if (utf && NOT_FIRSTCU(code[-1]))
7016         {
7017         PCRE2_UCHAR *lastchar = code - 1;
7018         BACKCHAR(lastchar);
7019         mclength = (uint32_t)(code - lastchar);   /* Length of UTF character */
7020         memcpy(mcbuffer, lastchar, CU2BYTES(mclength));  /* Save the char */
7021         }
7022       else
7023 #endif  /* MAYBE_UTF_MULTI */
7024 
7025       /* Handle the case of a single code unit - either with no UTF support, or
7026       with UTF disabled, or for a single-code-unit UTF character. In the latter
7027       case, for a repeated positive match, get the caseless flag for the
7028       required code unit from the previous character, because a class like [Aa]
7029       sets a caseless A but by now the req_caseopt flag has been reset. */
7030 
7031         {
7032         mcbuffer[0] = code[-1];
7033         mclength = 1;
7034         if (op_previous <= OP_CHARI && repeat_min > 1)
7035           {
7036           reqcu = mcbuffer[0];
7037           reqcuflags = cb->req_varyopt;
7038           if (op_previous == OP_CHARI) reqcuflags |= REQ_CASELESS;
7039           }
7040         }
7041       goto OUTPUT_SINGLE_REPEAT;  /* Code shared with single character types */
7042 
7043       /* If previous was a character class or a back reference, we put the
7044       repeat stuff after it, but just skip the item if the repeat was {0,0}. */
7045 
7046 #ifdef SUPPORT_WIDE_CHARS
7047       case OP_XCLASS:
7048 #endif
7049       case OP_CLASS:
7050       case OP_NCLASS:
7051       case OP_REF:
7052       case OP_REFI:
7053       case OP_DNREF:
7054       case OP_DNREFI:
7055 
7056       if (repeat_max == 0)
7057         {
7058         code = previous;
7059         goto END_REPEAT;
7060         }
7061       if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7062 
7063       if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)
7064         *code++ = OP_CRSTAR + repeat_type;
7065       else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED)
7066         *code++ = OP_CRPLUS + repeat_type;
7067       else if (repeat_min == 0 && repeat_max == 1)
7068         *code++ = OP_CRQUERY + repeat_type;
7069       else
7070         {
7071         *code++ = OP_CRRANGE + repeat_type;
7072         PUT2INC(code, 0, repeat_min);
7073         if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0;  /* 2-byte encoding for max */
7074         PUT2INC(code, 0, repeat_max);
7075         }
7076       break;
7077 
7078       /* If previous is OP_FAIL, it was generated by an empty class []
7079       (PCRE2_ALLOW_EMPTY_CLASS is set). The other ways in which OP_FAIL can be
7080       generated, that is by (*FAIL) or (?!), disallow a quantifier at parse
7081       time. We can just ignore this repeat. */
7082 
7083       case OP_FAIL:
7084       goto END_REPEAT;
7085 
7086       /* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets
7087       because pcre2_match() could not handle backtracking into recursively
7088       called groups. Now that this backtracking is available, we no longer need
7089       to do this. However, we still need to replicate recursions as we do for
7090       groups so as to have independent backtracking points. We can replicate
7091       for the minimum number of repeats directly. For optional repeats we now
7092       wrap the recursion in OP_BRA brackets and make use of the bracket
7093       repetition. */
7094 
7095       case OP_RECURSE:
7096       if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7097         goto END_REPEAT;
7098 
7099       /* Generate unwrapped repeats for a non-zero minimum, except when the
7100       minimum is 1 and the maximum unlimited, because that can be handled with
7101       OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the
7102       minimum, we just need to generate the appropriate additional copies.
7103       Otherwise we need to generate one more, to simulate the situation when
7104       the minimum is zero. */
7105 
7106       if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED))
7107         {
7108         int replicate = repeat_min;
7109         if (repeat_min == repeat_max) replicate--;
7110 
7111         /* In the pre-compile phase, we don't actually do the replication. We
7112         just adjust the length as if we had. Do some paranoid checks for
7113         potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
7114         integer type when available, otherwise double. */
7115 
7116         if (lengthptr != NULL)
7117           {
7118           PCRE2_SIZE delta = replicate*(1 + LINK_SIZE);
7119           if ((INT64_OR_DOUBLE)replicate*
7120                 (INT64_OR_DOUBLE)(1 + LINK_SIZE) >
7121                   (INT64_OR_DOUBLE)INT_MAX ||
7122               OFLOW_MAX - *lengthptr < delta)
7123             {
7124             *errorcodeptr = ERR20;
7125             return 0;
7126             }
7127           *lengthptr += delta;
7128           }
7129 
7130         else for (int i = 0; i < replicate; i++)
7131           {
7132           memcpy(code, previous, CU2BYTES(1 + LINK_SIZE));
7133           previous = code;
7134           code += 1 + LINK_SIZE;
7135           }
7136 
7137         /* If the number of repeats is fixed, we are done. Otherwise, adjust
7138         the counts and fall through. */
7139 
7140         if (repeat_min == repeat_max) break;
7141         if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7142         repeat_min = 0;
7143         }
7144 
7145       /* Wrap the recursion call in OP_BRA brackets. */
7146 
7147       (void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE));
7148       op_previous = *previous = OP_BRA;
7149       PUT(previous, 1, 2 + 2*LINK_SIZE);
7150       previous[2 + 2*LINK_SIZE] = OP_KET;
7151       PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
7152       code += 2 + 2 * LINK_SIZE;
7153       length_prevgroup = 3 + 3*LINK_SIZE;
7154       group_return = -1;  /* Set "may match empty string" */
7155 
7156       /* Now treat as a repeated OP_BRA. */
7157       /* Fall through */
7158 
7159       /* If previous was a bracket group, we may have to replicate it in
7160       certain cases. Note that at this point we can encounter only the "basic"
7161       bracket opcodes such as BRA and CBRA, as this is the place where they get
7162       converted into the more special varieties such as BRAPOS and SBRA.
7163       Originally, PCRE did not allow repetition of assertions, but now it does,
7164       for Perl compatibility. */
7165 
7166       case OP_ASSERT:
7167       case OP_ASSERT_NOT:
7168       case OP_ASSERT_NA:
7169       case OP_ASSERTBACK:
7170       case OP_ASSERTBACK_NOT:
7171       case OP_ASSERTBACK_NA:
7172       case OP_ONCE:
7173       case OP_SCRIPT_RUN:
7174       case OP_BRA:
7175       case OP_CBRA:
7176       case OP_COND:
7177         {
7178         int len = (int)(code - previous);
7179         PCRE2_UCHAR *bralink = NULL;
7180         PCRE2_UCHAR *brazeroptr = NULL;
7181 
7182         if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7183           goto END_REPEAT;
7184 
7185         /* Repeating a DEFINE group (or any group where the condition is always
7186         FALSE and there is only one branch) is pointless, but Perl allows the
7187         syntax, so we just ignore the repeat. */
7188 
7189         if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
7190             previous[GET(previous, 1)] != OP_ALT)
7191           goto END_REPEAT;
7192 
7193         /* Perl allows all assertions to be quantified, and when they contain
7194         capturing parentheses and/or are optional there are potential uses for
7195         this feature. PCRE2 used to force the maximum quantifier to 1 on the
7196         invalid grounds that further repetition was never useful. This was
7197         always a bit pointless, since an assertion could be wrapped with a
7198         repeated group to achieve the effect. General repetition is now
7199         permitted, but if the maximum is unlimited it is set to one more than
7200         the minimum. */
7201 
7202         if (op_previous < OP_ONCE)    /* Assertion */
7203           {
7204           if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1;
7205           }
7206 
7207         /* The case of a zero minimum is special because of the need to stick
7208         OP_BRAZERO in front of it, and because the group appears once in the
7209         data, whereas in other cases it appears the minimum number of times. For
7210         this reason, it is simplest to treat this case separately, as otherwise
7211         the code gets far too messy. There are several special subcases when the
7212         minimum is zero. */
7213 
7214         if (repeat_min == 0)
7215           {
7216           /* If the maximum is also zero, we used to just omit the group from
7217           the output altogether, like this:
7218 
7219           ** if (repeat_max == 0)
7220           **   {
7221           **   code = previous;
7222           **   goto END_REPEAT;
7223           **   }
7224 
7225           However, that fails when a group or a subgroup within it is
7226           referenced as a subroutine from elsewhere in the pattern, so now we
7227           stick in OP_SKIPZERO in front of it so that it is skipped on
7228           execution. As we don't have a list of which groups are referenced, we
7229           cannot do this selectively.
7230 
7231           If the maximum is 1 or unlimited, we just have to stick in the
7232           BRAZERO and do no more at this point. */
7233 
7234           if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED)
7235             {
7236             (void)memmove(previous + 1, previous, CU2BYTES(len));
7237             code++;
7238             if (repeat_max == 0)
7239               {
7240               *previous++ = OP_SKIPZERO;
7241               goto END_REPEAT;
7242               }
7243             brazeroptr = previous;    /* Save for possessive optimizing */
7244             *previous++ = OP_BRAZERO + repeat_type;
7245             }
7246 
7247           /* If the maximum is greater than 1 and limited, we have to replicate
7248           in a nested fashion, sticking OP_BRAZERO before each set of brackets.
7249           The first one has to be handled carefully because it's the original
7250           copy, which has to be moved up. The remainder can be handled by code
7251           that is common with the non-zero minimum case below. We have to
7252           adjust the value of repeat_max, since one less copy is required. */
7253 
7254           else
7255             {
7256             int linkoffset;
7257             (void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
7258             code += 2 + LINK_SIZE;
7259             *previous++ = OP_BRAZERO + repeat_type;
7260             *previous++ = OP_BRA;
7261 
7262             /* We chain together the bracket link offset fields that have to be
7263             filled in later when the ends of the brackets are reached. */
7264 
7265             linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink);
7266             bralink = previous;
7267             PUTINC(previous, 0, linkoffset);
7268             }
7269 
7270           if (repeat_max != REPEAT_UNLIMITED) repeat_max--;
7271           }
7272 
7273         /* If the minimum is greater than zero, replicate the group as many
7274         times as necessary, and adjust the maximum to the number of subsequent
7275         copies that we need. */
7276 
7277         else
7278           {
7279           if (repeat_min > 1)
7280             {
7281             /* In the pre-compile phase, we don't actually do the replication.
7282             We just adjust the length as if we had. Do some paranoid checks for
7283             potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
7284             integer type when available, otherwise double. */
7285 
7286             if (lengthptr != NULL)
7287               {
7288               PCRE2_SIZE delta = (repeat_min - 1)*length_prevgroup;
7289               if ((INT64_OR_DOUBLE)(repeat_min - 1)*
7290                     (INT64_OR_DOUBLE)length_prevgroup >
7291                       (INT64_OR_DOUBLE)INT_MAX ||
7292                   OFLOW_MAX - *lengthptr < delta)
7293                 {
7294                 *errorcodeptr = ERR20;
7295                 return 0;
7296                 }
7297               *lengthptr += delta;
7298               }
7299 
7300             /* This is compiling for real. If there is a set first code unit
7301             for the group, and we have not yet set a "required code unit", set
7302             it. */
7303 
7304             else
7305               {
7306               if (groupsetfirstcu && reqcuflags >= REQ_NONE)
7307                 {
7308                 reqcu = firstcu;
7309                 reqcuflags = firstcuflags;
7310                 }
7311               for (uint32_t i = 1; i < repeat_min; i++)
7312                 {
7313                 memcpy(code, previous, CU2BYTES(len));
7314                 code += len;
7315                 }
7316               }
7317             }
7318 
7319           if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7320           }
7321 
7322         /* This code is common to both the zero and non-zero minimum cases. If
7323         the maximum is limited, it replicates the group in a nested fashion,
7324         remembering the bracket starts on a stack. In the case of a zero
7325         minimum, the first one was set up above. In all cases the repeat_max
7326         now specifies the number of additional copies needed. Again, we must
7327         remember to replicate entries on the forward reference list. */
7328 
7329         if (repeat_max != REPEAT_UNLIMITED)
7330           {
7331           /* In the pre-compile phase, we don't actually do the replication. We
7332           just adjust the length as if we had. For each repetition we must add
7333           1 to the length for BRAZERO and for all but the last repetition we
7334           must add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
7335           paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type
7336           is a 64-bit integer type when available, otherwise double. */
7337 
7338           if (lengthptr != NULL && repeat_max > 0)
7339             {
7340             PCRE2_SIZE delta = repeat_max*(length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
7341                         2 - 2*LINK_SIZE;   /* Last one doesn't nest */
7342             if ((INT64_OR_DOUBLE)repeat_max *
7343                   (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
7344                     > (INT64_OR_DOUBLE)INT_MAX ||
7345                 OFLOW_MAX - *lengthptr < delta)
7346               {
7347               *errorcodeptr = ERR20;
7348               return 0;
7349               }
7350             *lengthptr += delta;
7351             }
7352 
7353           /* This is compiling for real */
7354 
7355           else for (uint32_t i = repeat_max; i >= 1; i--)
7356             {
7357             *code++ = OP_BRAZERO + repeat_type;
7358 
7359             /* All but the final copy start a new nesting, maintaining the
7360             chain of brackets outstanding. */
7361 
7362             if (i != 1)
7363               {
7364               int linkoffset;
7365               *code++ = OP_BRA;
7366               linkoffset = (bralink == NULL)? 0 : (int)(code - bralink);
7367               bralink = code;
7368               PUTINC(code, 0, linkoffset);
7369               }
7370 
7371             memcpy(code, previous, CU2BYTES(len));
7372             code += len;
7373             }
7374 
7375           /* Now chain through the pending brackets, and fill in their length
7376           fields (which are holding the chain links pro tem). */
7377 
7378           while (bralink != NULL)
7379             {
7380             int oldlinkoffset;
7381             int linkoffset = (int)(code - bralink + 1);
7382             PCRE2_UCHAR *bra = code - linkoffset;
7383             oldlinkoffset = GET(bra, 1);
7384             bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
7385             *code++ = OP_KET;
7386             PUTINC(code, 0, linkoffset);
7387             PUT(bra, 1, linkoffset);
7388             }
7389           }
7390 
7391         /* If the maximum is unlimited, set a repeater in the final copy. For
7392         SCRIPT_RUN and ONCE brackets, that's all we need to do. However,
7393         possessively repeated ONCE brackets can be converted into non-capturing
7394         brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this
7395         saves having to deal with possessive ONCEs specially.
7396 
7397         Otherwise, when we are doing the actual compile phase, check to see
7398         whether this group is one that could match an empty string. If so,
7399         convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
7400         that runtime checking can be done. [This check is also applied to ONCE
7401         and SCRIPT_RUN groups at runtime, but in a different way.]
7402 
7403         Then, if the quantifier was possessive and the bracket is not a
7404         conditional, we convert the BRA code to the POS form, and the KET code
7405         to KETRPOS. (It turns out to be convenient at runtime to detect this
7406         kind of subpattern at both the start and at the end.) The use of
7407         special opcodes makes it possible to reduce greatly the stack usage in
7408         pcre2_match(). If the group is preceded by OP_BRAZERO, convert this to
7409         OP_BRAPOSZERO.
7410 
7411         Then, if the minimum number of matches is 1 or 0, cancel the possessive
7412         flag so that the default action below, of wrapping everything inside
7413         atomic brackets, does not happen. When the minimum is greater than 1,
7414         there will be earlier copies of the group, and so we still have to wrap
7415         the whole thing. */
7416 
7417         else
7418           {
7419           PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
7420           PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
7421 
7422           /* Convert possessive ONCE brackets to non-capturing */
7423 
7424           if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
7425 
7426           /* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need
7427           to do is to set the KET. */
7428 
7429           if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN)
7430             *ketcode = OP_KETRMAX + repeat_type;
7431 
7432           /* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs
7433           (which have been converted to non-capturing above). */
7434 
7435           else
7436             {
7437             /* In the compile phase, adjust the opcode if the group can match
7438             an empty string. For a conditional group with only one branch, the
7439             value of group_return will not show "could be empty", so we must
7440             check that separately. */
7441 
7442             if (lengthptr == NULL)
7443               {
7444               if (group_return < 0) *bracode += OP_SBRA - OP_BRA;
7445               if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
7446                 *bracode = OP_SCOND;
7447               }
7448 
7449             /* Handle possessive quantifiers. */
7450 
7451             if (possessive_quantifier)
7452               {
7453               /* For COND brackets, we wrap the whole thing in a possessively
7454               repeated non-capturing bracket, because we have not invented POS
7455               versions of the COND opcodes. */
7456 
7457               if (*bracode == OP_COND || *bracode == OP_SCOND)
7458                 {
7459                 int nlen = (int)(code - bracode);
7460                 (void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
7461                 code += 1 + LINK_SIZE;
7462                 nlen += 1 + LINK_SIZE;
7463                 *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
7464                 *code++ = OP_KETRPOS;
7465                 PUTINC(code, 0, nlen);
7466                 PUT(bracode, 1, nlen);
7467                 }
7468 
7469               /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
7470 
7471               else
7472                 {
7473                 *bracode += 1;              /* Switch to xxxPOS opcodes */
7474                 *ketcode = OP_KETRPOS;
7475                 }
7476 
7477               /* If the minimum is zero, mark it as possessive, then unset the
7478               possessive flag when the minimum is 0 or 1. */
7479 
7480               if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
7481               if (repeat_min < 2) possessive_quantifier = FALSE;
7482               }
7483 
7484             /* Non-possessive quantifier */
7485 
7486             else *ketcode = OP_KETRMAX + repeat_type;
7487             }
7488           }
7489         }
7490       break;
7491 
7492       /* If previous was a character type match (\d or similar), abolish it and
7493       create a suitable repeat item. The code is shared with single-character
7494       repeats by setting op_type to add a suitable offset into repeat_type.
7495       Note that the Unicode property types will be present only when
7496       SUPPORT_UNICODE is defined, but we don't wrap the little bits of code
7497       here because it just makes it horribly messy. */
7498 
7499       default:
7500       if (op_previous >= OP_EODN)   /* Not a character type - internal error */
7501         {
7502         *errorcodeptr = ERR10;
7503         return 0;
7504         }
7505       else
7506         {
7507         int prop_type, prop_value;
7508         PCRE2_UCHAR *oldcode;
7509 
7510         if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7511 
7512         op_type = OP_TYPESTAR - OP_STAR;      /* Use type opcodes */
7513         mclength = 0;                         /* Not a character */
7514 
7515         if (op_previous == OP_PROP || op_previous == OP_NOTPROP)
7516           {
7517           prop_type = previous[1];
7518           prop_value = previous[2];
7519           }
7520         else
7521           {
7522           /* Come here from just above with a character in mcbuffer/mclength. */
7523           OUTPUT_SINGLE_REPEAT:
7524           prop_type = prop_value = -1;
7525           }
7526 
7527         /* At this point, if prop_type == prop_value == -1 we either have a
7528         character in mcbuffer when mclength is greater than zero, or we have
7529         mclength zero, in which case there is a non-property character type in
7530         op_previous. If prop_type/value are not negative, we have a property
7531         character type in op_previous. */
7532 
7533         oldcode = code;                   /* Save where we were */
7534         code = previous;                  /* Usually overwrite previous item */
7535 
7536         /* If the maximum is zero then the minimum must also be zero; Perl allows
7537         this case, so we do too - by simply omitting the item altogether. */
7538 
7539         if (repeat_max == 0) goto END_REPEAT;
7540 
7541         /* Combine the op_type with the repeat_type */
7542 
7543         repeat_type += op_type;
7544 
7545         /* A minimum of zero is handled either as the special case * or ?, or as
7546         an UPTO, with the maximum given. */
7547 
7548         if (repeat_min == 0)
7549           {
7550           if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;
7551             else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
7552           else
7553             {
7554             *code++ = OP_UPTO + repeat_type;
7555             PUT2INC(code, 0, repeat_max);
7556             }
7557           }
7558 
7559         /* A repeat minimum of 1 is optimized into some special cases. If the
7560         maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
7561         left in place and, if the maximum is greater than 1, we use OP_UPTO with
7562         one less than the maximum. */
7563 
7564         else if (repeat_min == 1)
7565           {
7566           if (repeat_max == REPEAT_UNLIMITED)
7567             *code++ = OP_PLUS + repeat_type;
7568           else
7569             {
7570             code = oldcode;  /* Leave previous item in place */
7571             if (repeat_max == 1) goto END_REPEAT;
7572             *code++ = OP_UPTO + repeat_type;
7573             PUT2INC(code, 0, repeat_max - 1);
7574             }
7575           }
7576 
7577         /* The case {n,n} is just an EXACT, while the general case {n,m} is
7578         handled as an EXACT followed by an UPTO or STAR or QUERY. */
7579 
7580         else
7581           {
7582           *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
7583           PUT2INC(code, 0, repeat_min);
7584 
7585           /* Unless repeat_max equals repeat_min, fill in the data for EXACT,
7586           and then generate the second opcode. For a repeated Unicode property
7587           match, there are two extra values that define the required property,
7588           and mclength is set zero to indicate this. */
7589 
7590           if (repeat_max != repeat_min)
7591             {
7592             if (mclength > 0)
7593               {
7594               memcpy(code, mcbuffer, CU2BYTES(mclength));
7595               code += mclength;
7596               }
7597             else
7598               {
7599               *code++ = op_previous;
7600               if (prop_type >= 0)
7601                 {
7602                 *code++ = prop_type;
7603                 *code++ = prop_value;
7604                 }
7605               }
7606 
7607             /* Now set up the following opcode */
7608 
7609             if (repeat_max == REPEAT_UNLIMITED)
7610               *code++ = OP_STAR + repeat_type;
7611             else
7612               {
7613               repeat_max -= repeat_min;
7614               if (repeat_max == 1)
7615                 {
7616                 *code++ = OP_QUERY + repeat_type;
7617                 }
7618               else
7619                 {
7620                 *code++ = OP_UPTO + repeat_type;
7621                 PUT2INC(code, 0, repeat_max);
7622                 }
7623               }
7624             }
7625           }
7626 
7627         /* Fill in the character or character type for the final opcode. */
7628 
7629         if (mclength > 0)
7630           {
7631           memcpy(code, mcbuffer, CU2BYTES(mclength));
7632           code += mclength;
7633           }
7634         else
7635           {
7636           *code++ = op_previous;
7637           if (prop_type >= 0)
7638             {
7639             *code++ = prop_type;
7640             *code++ = prop_value;
7641             }
7642           }
7643         }
7644       break;
7645       }  /* End of switch on different op_previous values */
7646 
7647 
7648     /* If the character following a repeat is '+', possessive_quantifier is
7649     TRUE. For some opcodes, there are special alternative opcodes for this
7650     case. For anything else, we wrap the entire repeated item inside OP_ONCE
7651     brackets. Logically, the '+' notation is just syntactic sugar, taken from
7652     Sun's Java package, but the special opcodes can optimize it.
7653 
7654     Some (but not all) possessively repeated subpatterns have already been
7655     completely handled in the code just above. For them, possessive_quantifier
7656     is always FALSE at this stage. Note that the repeated item starts at
7657     tempcode, not at previous, which might be the first part of a string whose
7658     (former) last char we repeated. */
7659 
7660     if (possessive_quantifier)
7661       {
7662       int len;
7663 
7664       /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
7665       However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
7666       {5,}, or {5,10}). We skip over an EXACT item; if the length of what
7667       remains is greater than zero, there's a further opcode that can be
7668       handled. If not, do nothing, leaving the EXACT alone. */
7669 
7670       switch(*tempcode)
7671         {
7672         case OP_TYPEEXACT:
7673         tempcode += PRIV(OP_lengths)[*tempcode] +
7674           ((tempcode[1 + IMM2_SIZE] == OP_PROP
7675           || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
7676         break;
7677 
7678         /* CHAR opcodes are used for exacts whose count is 1. */
7679 
7680         case OP_CHAR:
7681         case OP_CHARI:
7682         case OP_NOT:
7683         case OP_NOTI:
7684         case OP_EXACT:
7685         case OP_EXACTI:
7686         case OP_NOTEXACT:
7687         case OP_NOTEXACTI:
7688         tempcode += PRIV(OP_lengths)[*tempcode];
7689 #ifdef SUPPORT_UNICODE
7690         if (utf && HAS_EXTRALEN(tempcode[-1]))
7691           tempcode += GET_EXTRALEN(tempcode[-1]);
7692 #endif
7693         break;
7694 
7695         /* For the class opcodes, the repeat operator appears at the end;
7696         adjust tempcode to point to it. */
7697 
7698         case OP_CLASS:
7699         case OP_NCLASS:
7700         tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
7701         break;
7702 
7703 #ifdef SUPPORT_WIDE_CHARS
7704         case OP_XCLASS:
7705         tempcode += GET(tempcode, 1);
7706         break;
7707 #endif
7708         }
7709 
7710       /* If tempcode is equal to code (which points to the end of the repeated
7711       item), it means we have skipped an EXACT item but there is no following
7712       QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
7713       all other cases, tempcode will be pointing to the repeat opcode, and will
7714       be less than code, so the value of len will be greater than 0. */
7715 
7716       len = (int)(code - tempcode);
7717       if (len > 0)
7718         {
7719         unsigned int repcode = *tempcode;
7720 
7721         /* There is a table for possessifying opcodes, all of which are less
7722         than OP_CALLOUT. A zero entry means there is no possessified version.
7723         */
7724 
7725         if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
7726           *tempcode = opcode_possessify[repcode];
7727 
7728         /* For opcode without a special possessified version, wrap the item in
7729         ONCE brackets. */
7730 
7731         else
7732           {
7733           (void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
7734           code += 1 + LINK_SIZE;
7735           len += 1 + LINK_SIZE;
7736           tempcode[0] = OP_ONCE;
7737           *code++ = OP_KET;
7738           PUTINC(code, 0, len);
7739           PUT(tempcode, 1, len);
7740           }
7741         }
7742       }
7743 
7744     /* We set the "follows varying string" flag for subsequently encountered
7745     reqcus if it isn't already set and we have just passed a varying length
7746     item. */
7747 
7748     END_REPEAT:
7749     cb->req_varyopt |= reqvary;
7750     break;
7751 
7752 
7753     /* ===================================================================*/
7754     /* Handle a 32-bit data character with a value greater than META_END. */
7755 
7756     case META_BIGVALUE:
7757     pptr++;
7758     goto NORMAL_CHAR;
7759 
7760 
7761     /* ===============================================================*/
7762     /* Handle a back reference by number, which is the meta argument. The
7763     pattern offsets for back references to group numbers less than 10 are held
7764     in a special vector, to avoid using more than two parsed pattern elements
7765     in 64-bit environments. We only need the offset to the first occurrence,
7766     because if that doesn't fail, subsequent ones will also be OK. */
7767 
7768     case META_BACKREF:
7769     if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg];
7770       else GETPLUSOFFSET(offset, pptr);
7771 
7772     if (meta_arg > cb->bracount)
7773       {
7774       cb->erroroffset = offset;
7775       *errorcodeptr = ERR15;  /* Non-existent subpattern */
7776       return 0;
7777       }
7778 
7779     /* Come here from named backref handling when the reference is to a
7780     single group (that is, not to a duplicated name). The back reference
7781     data will have already been updated. We must disable firstcu if not
7782     set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'
7783     later. */
7784 
7785     HANDLE_SINGLE_REFERENCE:
7786     if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
7787     *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
7788     PUT2INC(code, 0, meta_arg);
7789 
7790     /* Update the map of back references, and keep the highest one. We
7791     could do this in parse_regex() for numerical back references, but not
7792     for named back references, because we don't know the numbers to which
7793     named back references refer. So we do it all in this function. */
7794 
7795     cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;
7796     if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
7797     break;
7798 
7799 
7800     /* ===============================================================*/
7801     /* Handle recursion by inserting the number of the called group (which is
7802     the meta argument) after OP_RECURSE. At the end of compiling the pattern is
7803     scanned and these numbers are replaced by offsets within the pattern. It is
7804     done like this to avoid problems with forward references and adjusting
7805     offsets when groups are duplicated and moved (as discovered in previous
7806     implementations). Note that a recursion does not have a set first
7807     character. */
7808 
7809     case META_RECURSE:
7810     GETPLUSOFFSET(offset, pptr);
7811     if (meta_arg > cb->bracount)
7812       {
7813       cb->erroroffset = offset;
7814       *errorcodeptr = ERR15;  /* Non-existent subpattern */
7815       return 0;
7816       }
7817     HANDLE_NUMERICAL_RECURSION:
7818     *code = OP_RECURSE;
7819     PUT(code, 1, meta_arg);
7820     code += 1 + LINK_SIZE;
7821     groupsetfirstcu = FALSE;
7822     cb->had_recurse = TRUE;
7823     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7824     zerofirstcu = firstcu;
7825     zerofirstcuflags = firstcuflags;
7826     break;
7827 
7828 
7829     /* ===============================================================*/
7830     /* Handle capturing parentheses; the number is the meta argument. */
7831 
7832     case META_CAPTURE:
7833     bravalue = OP_CBRA;
7834     skipunits = IMM2_SIZE;
7835     PUT2(code, 1+LINK_SIZE, meta_arg);
7836     cb->lastcapture = meta_arg;
7837     goto GROUP_PROCESS_NOTE_EMPTY;
7838 
7839 
7840     /* ===============================================================*/
7841     /* Handle escape sequence items. For ones like \d, the ESC_values are
7842     arranged to be the same as the corresponding OP_values in the default case
7843     when PCRE2_UCP is not set (which is the only case in which they will appear
7844     here).
7845 
7846     Note: \Q and \E are never seen here, as they were dealt with in
7847     parse_regex(). Neither are numerical back references or recursions, which
7848     were turned into META_BACKREF or META_RECURSE items, respectively. \k and
7849     \g, when followed by names, are turned into META_BACKREF_BYNAME or
7850     META_RECURSE_BYNAME. */
7851 
7852     case META_ESCAPE:
7853 
7854     /* We can test for escape sequences that consume a character because their
7855     values lie between ESC_b and ESC_Z; this may have to change if any new ones
7856     are ever created. For these sequences, we disable the setting of a first
7857     character if it hasn't already been set. */
7858 
7859     if (meta_arg > ESC_b && meta_arg < ESC_Z)
7860       {
7861       matched_char = TRUE;
7862       if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7863       }
7864 
7865     /* Set values to reset to if this is followed by a zero repeat. */
7866 
7867     zerofirstcu = firstcu;
7868     zerofirstcuflags = firstcuflags;
7869     zeroreqcu = reqcu;
7870     zeroreqcuflags = reqcuflags;
7871 
7872     /* If Unicode is not supported, \P and \p are not allowed and are
7873     faulted at parse time, so will never appear here. */
7874 
7875 #ifdef SUPPORT_UNICODE
7876     if (meta_arg == ESC_P || meta_arg == ESC_p)
7877       {
7878       uint32_t ptype = *(++pptr) >> 16;
7879       uint32_t pdata = *pptr & 0xffff;
7880 
7881       /* The special case of \p{Any} is compiled to OP_ALLANY so as to benefit
7882       from the auto-anchoring code. */
7883 
7884       if (meta_arg == ESC_p && ptype == PT_ANY)
7885         {
7886         *code++ = OP_ALLANY;
7887         }
7888       else
7889         {
7890         *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
7891         *code++ = ptype;
7892         *code++ = pdata;
7893         }
7894       break;  /* End META_ESCAPE */
7895       }
7896 #endif
7897 
7898     /* \K is forbidden in lookarounds since 10.38 because that's what Perl has
7899     done. However, there's an option, in case anyone was relying on it. */
7900 
7901     if (cb->assert_depth > 0 && meta_arg == ESC_K &&
7902         (cb->cx->extra_options & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) == 0)
7903       {
7904       *errorcodeptr = ERR99;
7905       return 0;
7906       }
7907 
7908     /* For the rest (including \X when Unicode is supported - if not it's
7909     faulted at parse time), the OP value is the escape value when PCRE2_UCP is
7910     not set; if it is set, these escapes do not show up here because they are
7911     converted into Unicode property tests in parse_regex(). Note that \b and \B
7912     do a one-character lookbehind, and \A also behaves as if it does. */
7913 
7914     if (meta_arg == ESC_C) cb->external_flags |= PCRE2_HASBKC; /* Record */
7915     if ((meta_arg == ESC_b || meta_arg == ESC_B || meta_arg == ESC_A) &&
7916          cb->max_lookbehind == 0)
7917       cb->max_lookbehind = 1;
7918 
7919     /* In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY
7920     instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds. */
7921 
7922 #if PCRE2_CODE_UNIT_WIDTH == 32
7923     *code++ = (meta_arg == ESC_C)? OP_ALLANY : meta_arg;
7924 #else
7925     *code++ = (!utf && meta_arg == ESC_C)? OP_ALLANY : meta_arg;
7926 #endif
7927     break;  /* End META_ESCAPE */
7928 
7929 
7930     /* ===================================================================*/
7931     /* Handle an unrecognized meta value. A parsed pattern value less than
7932     META_END is a literal. Otherwise we have a problem. */
7933 
7934     default:
7935     if (meta >= META_END)
7936       {
7937 #ifdef DEBUG_SHOW_PARSED
7938       fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x\n", *pptr);
7939 #endif
7940       *errorcodeptr = ERR89;  /* Internal error - unrecognized. */
7941       return 0;
7942       }
7943 
7944     /* Handle a literal character. We come here by goto in the case of a
7945     32-bit, non-UTF character whose value is greater than META_END. */
7946 
7947     NORMAL_CHAR:
7948     meta = *pptr;     /* Get the full 32 bits */
7949     NORMAL_CHAR_SET:  /* Character is already in meta */
7950     matched_char = TRUE;
7951 
7952     /* For caseless UTF or UCP mode, check whether this character has more than
7953     one other case. If so, generate a special OP_PROP item instead of OP_CHARI.
7954     */
7955 
7956 #ifdef SUPPORT_UNICODE
7957     if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
7958       {
7959       uint32_t caseset = UCD_CASESET(meta);
7960       if (caseset != 0)
7961         {
7962         *code++ = OP_PROP;
7963         *code++ = PT_CLIST;
7964         *code++ = caseset;
7965         if (firstcuflags == REQ_UNSET)
7966           firstcuflags = zerofirstcuflags = REQ_NONE;
7967         break;  /* End handling this meta item */
7968         }
7969       }
7970 #endif
7971 
7972     /* Caseful matches, or caseless and not one of the multicase characters. We
7973     come here by goto in the case of a positive class that contains only
7974     case-partners of a character with just two cases; matched_char has already
7975     been set TRUE and options fudged if necessary. */
7976 
7977     CLASS_CASELESS_CHAR:
7978 
7979     /* Get the character's code units into mcbuffer, with the length in
7980     mclength. When not in UTF mode, the length is always 1. */
7981 
7982 #ifdef SUPPORT_UNICODE
7983     if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
7984 #endif
7985       {
7986       mclength = 1;
7987       mcbuffer[0] = meta;
7988       }
7989 
7990     /* Generate the appropriate code */
7991 
7992     *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
7993     memcpy(code, mcbuffer, CU2BYTES(mclength));
7994     code += mclength;
7995 
7996     /* Remember if \r or \n were seen */
7997 
7998     if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
7999       cb->external_flags |= PCRE2_HASCRORLF;
8000 
8001     /* Set the first and required code units appropriately. If no previous
8002     first code unit, set it from this character, but revert to none on a zero
8003     repeat. Otherwise, leave the firstcu value alone, and don't change it on
8004     a zero repeat. */
8005 
8006     if (firstcuflags == REQ_UNSET)
8007       {
8008       zerofirstcuflags = REQ_NONE;
8009       zeroreqcu = reqcu;
8010       zeroreqcuflags = reqcuflags;
8011 
8012       /* If the character is more than one code unit long, we can set a single
8013       firstcu only if it is not to be matched caselessly. Multiple possible
8014       starting code units may be picked up later in the studying code. */
8015 
8016       if (mclength == 1 || req_caseopt == 0)
8017         {
8018         firstcu = mcbuffer[0];
8019         firstcuflags = req_caseopt;
8020         if (mclength != 1)
8021           {
8022           reqcu = code[-1];
8023           reqcuflags = cb->req_varyopt;
8024           }
8025         }
8026       else firstcuflags = reqcuflags = REQ_NONE;
8027       }
8028 
8029     /* firstcu was previously set; we can set reqcu only if the length is
8030     1 or the matching is caseful. */
8031 
8032     else
8033       {
8034       zerofirstcu = firstcu;
8035       zerofirstcuflags = firstcuflags;
8036       zeroreqcu = reqcu;
8037       zeroreqcuflags = reqcuflags;
8038       if (mclength == 1 || req_caseopt == 0)
8039         {
8040         reqcu = code[-1];
8041         reqcuflags = req_caseopt | cb->req_varyopt;
8042         }
8043       }
8044 
8045     /* If caselessness was temporarily instated, reset it. */
8046 
8047     if (reset_caseful)
8048       {
8049       options &= ~PCRE2_CASELESS;
8050       req_caseopt = 0;
8051       reset_caseful = FALSE;
8052       }
8053 
8054     break;    /* End literal character handling */
8055     }         /* End of big switch */
8056   }           /* End of big loop */
8057 
8058 /* Control never reaches here. */
8059 }
8060 
8061 
8062 
8063 /*************************************************
8064 *   Compile regex: a sequence of alternatives    *
8065 *************************************************/
8066 
8067 /* On entry, pptr is pointing past the bracket meta, but on return it points to
8068 the closing bracket or META_END. The code variable is pointing at the code unit
8069 into which the BRA operator has been stored. This function is used during the
8070 pre-compile phase when we are trying to find out the amount of memory needed,
8071 as well as during the real compile phase. The value of lengthptr distinguishes
8072 the two phases.
8073 
8074 Arguments:
8075   options           option bits, including any changes for this subpattern
8076   codeptr           -> the address of the current code pointer
8077   pptrptr           -> the address of the current parsed pattern pointer
8078   errorcodeptr      -> pointer to error code variable
8079   skipunits         skip this many code units at start (for brackets and OP_COND)
8080   firstcuptr        place to put the first required code unit
8081   firstcuflagsptr   place to put the first code unit flags
8082   reqcuptr          place to put the last required code unit
8083   reqcuflagsptr     place to put the last required code unit flags
8084   bcptr             pointer to the chain of currently open branches
8085   cb                points to the data block with tables pointers etc.
8086   lengthptr         NULL during the real compile phase
8087                     points to length accumulator during pre-compile phase
8088 
8089 Returns:            0 There has been an error
8090                    +1 Success, this group must match at least one character
8091                    -1 Success, this group may match an empty string
8092 */
8093 
8094 static int
compile_regex(uint32_t options,PCRE2_UCHAR ** codeptr,uint32_t ** pptrptr,int * errorcodeptr,uint32_t skipunits,uint32_t * firstcuptr,uint32_t * firstcuflagsptr,uint32_t * reqcuptr,uint32_t * reqcuflagsptr,branch_chain * bcptr,compile_block * cb,PCRE2_SIZE * lengthptr)8095 compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
8096   int *errorcodeptr, uint32_t skipunits, uint32_t *firstcuptr,
8097   uint32_t *firstcuflagsptr, uint32_t *reqcuptr, uint32_t *reqcuflagsptr,
8098   branch_chain *bcptr, compile_block *cb, PCRE2_SIZE *lengthptr)
8099 {
8100 PCRE2_UCHAR *code = *codeptr;
8101 PCRE2_UCHAR *last_branch = code;
8102 PCRE2_UCHAR *start_bracket = code;
8103 BOOL lookbehind;
8104 open_capitem capitem;
8105 int capnumber = 0;
8106 int okreturn = 1;
8107 uint32_t *pptr = *pptrptr;
8108 uint32_t firstcu, reqcu;
8109 uint32_t lookbehindlength;
8110 uint32_t firstcuflags, reqcuflags;
8111 uint32_t branchfirstcu, branchreqcu;
8112 uint32_t branchfirstcuflags, branchreqcuflags;
8113 PCRE2_SIZE length;
8114 branch_chain bc;
8115 
8116 /* If set, call the external function that checks for stack availability. */
8117 
8118 if (cb->cx->stack_guard != NULL &&
8119     cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
8120   {
8121   *errorcodeptr= ERR33;
8122   return 0;
8123   }
8124 
8125 /* Miscellaneous initialization */
8126 
8127 bc.outer = bcptr;
8128 bc.current_branch = code;
8129 
8130 firstcu = reqcu = 0;
8131 firstcuflags = reqcuflags = REQ_UNSET;
8132 
8133 /* Accumulate the length for use in the pre-compile phase. Start with the
8134 length of the BRA and KET and any extra code units that are required at the
8135 beginning. We accumulate in a local variable to save frequent testing of
8136 lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
8137 start and end of each alternative, because compiled items are discarded during
8138 the pre-compile phase so that the workspace is not exceeded. */
8139 
8140 length = 2 + 2*LINK_SIZE + skipunits;
8141 
8142 /* Remember if this is a lookbehind assertion, and if it is, save its length
8143 and skip over the pattern offset. */
8144 
8145 lookbehind = *code == OP_ASSERTBACK ||
8146              *code == OP_ASSERTBACK_NOT ||
8147              *code == OP_ASSERTBACK_NA;
8148 
8149 if (lookbehind)
8150   {
8151   lookbehindlength = META_DATA(pptr[-1]);
8152   pptr += SIZEOFFSET;
8153   }
8154 else lookbehindlength = 0;
8155 
8156 /* If this is a capturing subpattern, add to the chain of open capturing items
8157 so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
8158 need be tested here; changing this opcode to one of its variants, e.g.
8159 OP_SCBRAPOS, happens later, after the group has been compiled. */
8160 
8161 if (*code == OP_CBRA)
8162   {
8163   capnumber = GET2(code, 1 + LINK_SIZE);
8164   capitem.number = capnumber;
8165   capitem.next = cb->open_caps;
8166   capitem.assert_depth = cb->assert_depth;
8167   cb->open_caps = &capitem;
8168   }
8169 
8170 /* Offset is set zero to mark that this bracket is still open */
8171 
8172 PUT(code, 1, 0);
8173 code += 1 + LINK_SIZE + skipunits;
8174 
8175 /* Loop for each alternative branch */
8176 
8177 for (;;)
8178   {
8179   int branch_return;
8180 
8181   /* Insert OP_REVERSE if this is a lookbehind assertion. */
8182 
8183   if (lookbehind && lookbehindlength > 0)
8184     {
8185     *code++ = OP_REVERSE;
8186     PUTINC(code, 0, lookbehindlength);
8187     length += 1 + LINK_SIZE;
8188     }
8189 
8190   /* Now compile the branch; in the pre-compile phase its length gets added
8191   into the length. */
8192 
8193   if ((branch_return =
8194         compile_branch(&options, &code, &pptr, errorcodeptr, &branchfirstcu,
8195           &branchfirstcuflags, &branchreqcu, &branchreqcuflags, &bc,
8196           cb, (lengthptr == NULL)? NULL : &length)) == 0)
8197     return 0;
8198 
8199   /* If a branch can match an empty string, so can the whole group. */
8200 
8201   if (branch_return < 0) okreturn = -1;
8202 
8203   /* In the real compile phase, there is some post-processing to be done. */
8204 
8205   if (lengthptr == NULL)
8206     {
8207     /* If this is the first branch, the firstcu and reqcu values for the
8208     branch become the values for the regex. */
8209 
8210     if (*last_branch != OP_ALT)
8211       {
8212       firstcu = branchfirstcu;
8213       firstcuflags = branchfirstcuflags;
8214       reqcu = branchreqcu;
8215       reqcuflags = branchreqcuflags;
8216       }
8217 
8218     /* If this is not the first branch, the first char and reqcu have to
8219     match the values from all the previous branches, except that if the
8220     previous value for reqcu didn't have REQ_VARY set, it can still match,
8221     and we set REQ_VARY for the group from this branch's value. */
8222 
8223     else
8224       {
8225       /* If we previously had a firstcu, but it doesn't match the new branch,
8226       we have to abandon the firstcu for the regex, but if there was
8227       previously no reqcu, it takes on the value of the old firstcu. */
8228 
8229       if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
8230         {
8231         if (firstcuflags < REQ_NONE)
8232           {
8233           if (reqcuflags >= REQ_NONE)
8234             {
8235             reqcu = firstcu;
8236             reqcuflags = firstcuflags;
8237             }
8238           }
8239         firstcuflags = REQ_NONE;
8240         }
8241 
8242       /* If we (now or from before) have no firstcu, a firstcu from the
8243       branch becomes a reqcu if there isn't a branch reqcu. */
8244 
8245       if (firstcuflags >= REQ_NONE && branchfirstcuflags < REQ_NONE &&
8246           branchreqcuflags >= REQ_NONE)
8247         {
8248         branchreqcu = branchfirstcu;
8249         branchreqcuflags = branchfirstcuflags;
8250         }
8251 
8252       /* Now ensure that the reqcus match */
8253 
8254       if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
8255           reqcu != branchreqcu)
8256         reqcuflags = REQ_NONE;
8257       else
8258         {
8259         reqcu = branchreqcu;
8260         reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY if present */
8261         }
8262       }
8263     }
8264 
8265   /* Handle reaching the end of the expression, either ')' or end of pattern.
8266   In the real compile phase, go back through the alternative branches and
8267   reverse the chain of offsets, with the field in the BRA item now becoming an
8268   offset to the first alternative. If there are no alternatives, it points to
8269   the end of the group. The length in the terminating ket is always the length
8270   of the whole bracketed item. Return leaving the pointer at the terminating
8271   char. */
8272 
8273   if (META_CODE(*pptr) != META_ALT)
8274     {
8275     if (lengthptr == NULL)
8276       {
8277       PCRE2_SIZE branch_length = code - last_branch;
8278       do
8279         {
8280         PCRE2_SIZE prev_length = GET(last_branch, 1);
8281         PUT(last_branch, 1, branch_length);
8282         branch_length = prev_length;
8283         last_branch -= branch_length;
8284         }
8285       while (branch_length > 0);
8286       }
8287 
8288     /* Fill in the ket */
8289 
8290     *code = OP_KET;
8291     PUT(code, 1, (int)(code - start_bracket));
8292     code += 1 + LINK_SIZE;
8293 
8294     /* If it was a capturing subpattern, remove the block from the chain. */
8295 
8296     if (capnumber > 0) cb->open_caps = cb->open_caps->next;
8297 
8298     /* Set values to pass back */
8299 
8300     *codeptr = code;
8301     *pptrptr = pptr;
8302     *firstcuptr = firstcu;
8303     *firstcuflagsptr = firstcuflags;
8304     *reqcuptr = reqcu;
8305     *reqcuflagsptr = reqcuflags;
8306     if (lengthptr != NULL)
8307       {
8308       if (OFLOW_MAX - *lengthptr < length)
8309         {
8310         *errorcodeptr = ERR20;
8311         return 0;
8312         }
8313       *lengthptr += length;
8314       }
8315     return okreturn;
8316     }
8317 
8318   /* Another branch follows. In the pre-compile phase, we can move the code
8319   pointer back to where it was for the start of the first branch. (That is,
8320   pretend that each branch is the only one.)
8321 
8322   In the real compile phase, insert an ALT node. Its length field points back
8323   to the previous branch while the bracket remains open. At the end the chain
8324   is reversed. It's done like this so that the start of the bracket has a
8325   zero offset until it is closed, making it possible to detect recursion. */
8326 
8327   if (lengthptr != NULL)
8328     {
8329     code = *codeptr + 1 + LINK_SIZE + skipunits;
8330     length += 1 + LINK_SIZE;
8331     }
8332   else
8333     {
8334     *code = OP_ALT;
8335     PUT(code, 1, (int)(code - last_branch));
8336     bc.current_branch = last_branch = code;
8337     code += 1 + LINK_SIZE;
8338     }
8339 
8340   /* Set the lookbehind length (if not in a lookbehind the value will be zero)
8341   and then advance past the vertical bar. */
8342 
8343   lookbehindlength = META_DATA(*pptr);
8344   pptr++;
8345   }
8346 /* Control never reaches here */
8347 }
8348 
8349 
8350 
8351 /*************************************************
8352 *          Check for anchored pattern            *
8353 *************************************************/
8354 
8355 /* Try to find out if this is an anchored regular expression. Consider each
8356 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8357 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8358 it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8359 be found, because ^ generates OP_CIRCM in that mode.
8360 
8361 We can also consider a regex to be anchored if OP_SOM starts all its branches.
8362 This is the code for \G, which means "match at start of match position, taking
8363 into account the match offset".
8364 
8365 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8366 because that will try the rest of the pattern at all possible matching points,
8367 so there is no point trying again.... er ....
8368 
8369 .... except when the .* appears inside capturing parentheses, and there is a
8370 subsequent back reference to those parentheses. We haven't enough information
8371 to catch that case precisely.
8372 
8373 At first, the best we could do was to detect when .* was in capturing brackets
8374 and the highest back reference was greater than or equal to that level.
8375 However, by keeping a bitmap of the first 31 back references, we can catch some
8376 of the more common cases more precisely.
8377 
8378 ... A second exception is when the .* appears inside an atomic group, because
8379 this prevents the number of characters it matches from being adjusted.
8380 
8381 Arguments:
8382   code           points to start of the compiled pattern
8383   bracket_map    a bitmap of which brackets we are inside while testing; this
8384                    handles up to substring 31; after that we just have to take
8385                    the less precise approach
8386   cb             points to the compile data block
8387   atomcount      atomic group level
8388   inassert       TRUE if in an assertion
8389 
8390 Returns:     TRUE or FALSE
8391 */
8392 
8393 static BOOL
is_anchored(PCRE2_SPTR code,uint32_t bracket_map,compile_block * cb,int atomcount,BOOL inassert)8394 is_anchored(PCRE2_SPTR code, uint32_t bracket_map, compile_block *cb,
8395   int atomcount, BOOL inassert)
8396 {
8397 do {
8398    PCRE2_SPTR scode = first_significant_code(
8399      code + PRIV(OP_lengths)[*code], FALSE);
8400    int op = *scode;
8401 
8402    /* Non-capturing brackets */
8403 
8404    if (op == OP_BRA  || op == OP_BRAPOS ||
8405        op == OP_SBRA || op == OP_SBRAPOS)
8406      {
8407      if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8408        return FALSE;
8409      }
8410 
8411    /* Capturing brackets */
8412 
8413    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8414             op == OP_SCBRA || op == OP_SCBRAPOS)
8415      {
8416      int n = GET2(scode, 1+LINK_SIZE);
8417      uint32_t new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8418      if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE;
8419      }
8420 
8421    /* Positive forward assertion */
8422 
8423    else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8424      {
8425      if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8426      }
8427 
8428    /* Condition. If there is no second branch, it can't be anchored. */
8429 
8430    else if (op == OP_COND || op == OP_SCOND)
8431      {
8432      if (scode[GET(scode,1)] != OP_ALT) return FALSE;
8433      if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8434        return FALSE;
8435      }
8436 
8437    /* Atomic groups */
8438 
8439    else if (op == OP_ONCE)
8440      {
8441      if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert))
8442        return FALSE;
8443      }
8444 
8445    /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8446    it isn't in brackets that are or may be referenced or inside an atomic
8447    group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
8448    because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
8449    with the subject "aab", which matches "b", i.e. not at the start of a line.
8450    There is also an option that disables auto-anchoring. */
8451 
8452    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8453              op == OP_TYPEPOSSTAR))
8454      {
8455      if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
8456          atomcount > 0 || cb->had_pruneorskip || inassert ||
8457          (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8458        return FALSE;
8459      }
8460 
8461    /* Check for explicit anchoring */
8462 
8463    else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8464 
8465    code += GET(code, 1);
8466    }
8467 while (*code == OP_ALT);   /* Loop for each alternative */
8468 return TRUE;
8469 }
8470 
8471 
8472 
8473 /*************************************************
8474 *         Check for starting with ^ or .*        *
8475 *************************************************/
8476 
8477 /* This is called to find out if every branch starts with ^ or .* so that
8478 "first char" processing can be done to speed things up in multiline
8479 matching and for non-DOTALL patterns that start with .* (which must start at
8480 the beginning or after \n). As in the case of is_anchored() (see above), we
8481 have to take account of back references to capturing brackets that contain .*
8482 because in that case we can't make the assumption. Also, the appearance of .*
8483 inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
8484 or *SKIP does not count, because once again the assumption no longer holds.
8485 
8486 Arguments:
8487   code           points to start of the compiled pattern or a group
8488   bracket_map    a bitmap of which brackets we are inside while testing; this
8489                    handles up to substring 31; after that we just have to take
8490                    the less precise approach
8491   cb             points to the compile data
8492   atomcount      atomic group level
8493   inassert       TRUE if in an assertion
8494 
8495 Returns:         TRUE or FALSE
8496 */
8497 
8498 static BOOL
is_startline(PCRE2_SPTR code,unsigned int bracket_map,compile_block * cb,int atomcount,BOOL inassert)8499 is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8500   int atomcount, BOOL inassert)
8501 {
8502 do {
8503    PCRE2_SPTR scode = first_significant_code(
8504      code + PRIV(OP_lengths)[*code], FALSE);
8505    int op = *scode;
8506 
8507    /* If we are at the start of a conditional assertion group, *both* the
8508    conditional assertion *and* what follows the condition must satisfy the test
8509    for start of line. Other kinds of condition fail. Note that there may be an
8510    auto-callout at the start of a condition. */
8511 
8512    if (op == OP_COND)
8513      {
8514      scode += 1 + LINK_SIZE;
8515 
8516      if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8517        else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
8518 
8519      switch (*scode)
8520        {
8521        case OP_CREF:
8522        case OP_DNCREF:
8523        case OP_RREF:
8524        case OP_DNRREF:
8525        case OP_FAIL:
8526        case OP_FALSE:
8527        case OP_TRUE:
8528        return FALSE;
8529 
8530        default:     /* Assertion */
8531        if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8532        do scode += GET(scode, 1); while (*scode == OP_ALT);
8533        scode += 1 + LINK_SIZE;
8534        break;
8535        }
8536      scode = first_significant_code(scode, FALSE);
8537      op = *scode;
8538      }
8539 
8540    /* Non-capturing brackets */
8541 
8542    if (op == OP_BRA  || op == OP_BRAPOS ||
8543        op == OP_SBRA || op == OP_SBRAPOS)
8544      {
8545      if (!is_startline(scode, bracket_map, cb, atomcount, inassert))
8546        return FALSE;
8547      }
8548 
8549    /* Capturing brackets */
8550 
8551    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8552             op == OP_SCBRA || op == OP_SCBRAPOS)
8553      {
8554      int n = GET2(scode, 1+LINK_SIZE);
8555      int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8556      if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
8557      }
8558 
8559    /* Positive forward assertions */
8560 
8561    else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8562      {
8563      if (!is_startline(scode, bracket_map, cb, atomcount, TRUE))
8564        return FALSE;
8565      }
8566 
8567    /* Atomic brackets */
8568 
8569    else if (op == OP_ONCE)
8570      {
8571      if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert))
8572        return FALSE;
8573      }
8574 
8575    /* .* means "start at start or after \n" if it isn't in atomic brackets or
8576    brackets that may be referenced or an assertion, and as long as the pattern
8577    does not contain *PRUNE or *SKIP, because these break the feature. Consider,
8578    for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
8579    i.e. not at the start of a line. There is also an option that disables this
8580    optimization. */
8581 
8582    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8583      {
8584      if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
8585          atomcount > 0 || cb->had_pruneorskip || inassert ||
8586          (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8587        return FALSE;
8588      }
8589 
8590    /* Check for explicit circumflex; anything else gives a FALSE result. Note
8591    in particular that this includes atomic brackets OP_ONCE because the number
8592    of characters matched by .* cannot be adjusted inside them. */
8593 
8594    else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8595 
8596    /* Move on to the next alternative */
8597 
8598    code += GET(code, 1);
8599    }
8600 while (*code == OP_ALT);  /* Loop for each alternative */
8601 return TRUE;
8602 }
8603 
8604 
8605 
8606 /*************************************************
8607 *   Scan compiled regex for recursion reference  *
8608 *************************************************/
8609 
8610 /* This function scans through a compiled pattern until it finds an instance of
8611 OP_RECURSE.
8612 
8613 Arguments:
8614   code        points to start of expression
8615   utf         TRUE in UTF mode
8616 
8617 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
8618 */
8619 
8620 static PCRE2_SPTR
find_recurse(PCRE2_SPTR code,BOOL utf)8621 find_recurse(PCRE2_SPTR code, BOOL utf)
8622 {
8623 for (;;)
8624   {
8625   PCRE2_UCHAR c = *code;
8626   if (c == OP_END) return NULL;
8627   if (c == OP_RECURSE) return code;
8628 
8629   /* XCLASS is used for classes that cannot be represented just by a bit map.
8630   This includes negated single high-valued characters. CALLOUT_STR is used for
8631   callouts with string arguments. In both cases the length in the table is
8632   zero; the actual length is stored in the compiled code. */
8633 
8634   if (c == OP_XCLASS) code += GET(code, 1);
8635     else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
8636 
8637   /* Otherwise, we can get the item's length from the table, except that for
8638   repeated character types, we have to test for \p and \P, which have an extra
8639   two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,
8640   we must add in its length. */
8641 
8642   else
8643     {
8644     switch(c)
8645       {
8646       case OP_TYPESTAR:
8647       case OP_TYPEMINSTAR:
8648       case OP_TYPEPLUS:
8649       case OP_TYPEMINPLUS:
8650       case OP_TYPEQUERY:
8651       case OP_TYPEMINQUERY:
8652       case OP_TYPEPOSSTAR:
8653       case OP_TYPEPOSPLUS:
8654       case OP_TYPEPOSQUERY:
8655       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
8656       break;
8657 
8658       case OP_TYPEPOSUPTO:
8659       case OP_TYPEUPTO:
8660       case OP_TYPEMINUPTO:
8661       case OP_TYPEEXACT:
8662       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
8663         code += 2;
8664       break;
8665 
8666       case OP_MARK:
8667       case OP_COMMIT_ARG:
8668       case OP_PRUNE_ARG:
8669       case OP_SKIP_ARG:
8670       case OP_THEN_ARG:
8671       code += code[1];
8672       break;
8673       }
8674 
8675     /* Add in the fixed length from the table */
8676 
8677     code += PRIV(OP_lengths)[c];
8678 
8679     /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
8680     be followed by a multi-unit character. The length in the table is a
8681     minimum, so we have to arrange to skip the extra units. */
8682 
8683 #ifdef MAYBE_UTF_MULTI
8684     if (utf) switch(c)
8685       {
8686       case OP_CHAR:
8687       case OP_CHARI:
8688       case OP_NOT:
8689       case OP_NOTI:
8690       case OP_EXACT:
8691       case OP_EXACTI:
8692       case OP_NOTEXACT:
8693       case OP_NOTEXACTI:
8694       case OP_UPTO:
8695       case OP_UPTOI:
8696       case OP_NOTUPTO:
8697       case OP_NOTUPTOI:
8698       case OP_MINUPTO:
8699       case OP_MINUPTOI:
8700       case OP_NOTMINUPTO:
8701       case OP_NOTMINUPTOI:
8702       case OP_POSUPTO:
8703       case OP_POSUPTOI:
8704       case OP_NOTPOSUPTO:
8705       case OP_NOTPOSUPTOI:
8706       case OP_STAR:
8707       case OP_STARI:
8708       case OP_NOTSTAR:
8709       case OP_NOTSTARI:
8710       case OP_MINSTAR:
8711       case OP_MINSTARI:
8712       case OP_NOTMINSTAR:
8713       case OP_NOTMINSTARI:
8714       case OP_POSSTAR:
8715       case OP_POSSTARI:
8716       case OP_NOTPOSSTAR:
8717       case OP_NOTPOSSTARI:
8718       case OP_PLUS:
8719       case OP_PLUSI:
8720       case OP_NOTPLUS:
8721       case OP_NOTPLUSI:
8722       case OP_MINPLUS:
8723       case OP_MINPLUSI:
8724       case OP_NOTMINPLUS:
8725       case OP_NOTMINPLUSI:
8726       case OP_POSPLUS:
8727       case OP_POSPLUSI:
8728       case OP_NOTPOSPLUS:
8729       case OP_NOTPOSPLUSI:
8730       case OP_QUERY:
8731       case OP_QUERYI:
8732       case OP_NOTQUERY:
8733       case OP_NOTQUERYI:
8734       case OP_MINQUERY:
8735       case OP_MINQUERYI:
8736       case OP_NOTMINQUERY:
8737       case OP_NOTMINQUERYI:
8738       case OP_POSQUERY:
8739       case OP_POSQUERYI:
8740       case OP_NOTPOSQUERY:
8741       case OP_NOTPOSQUERYI:
8742       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
8743       break;
8744       }
8745 #else
8746     (void)(utf);  /* Keep compiler happy by referencing function argument */
8747 #endif  /* MAYBE_UTF_MULTI */
8748     }
8749   }
8750 }
8751 
8752 
8753 
8754 /*************************************************
8755 *    Check for asserted fixed first code unit    *
8756 *************************************************/
8757 
8758 /* During compilation, the "first code unit" settings from forward assertions
8759 are discarded, because they can cause conflicts with actual literals that
8760 follow. However, if we end up without a first code unit setting for an
8761 unanchored pattern, it is worth scanning the regex to see if there is an
8762 initial asserted first code unit. If all branches start with the same asserted
8763 code unit, or with a non-conditional bracket all of whose alternatives start
8764 with the same asserted code unit (recurse ad lib), then we return that code
8765 unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
8766 REQ_NONE in the flags.
8767 
8768 Arguments:
8769   code       points to start of compiled pattern
8770   flags      points to the first code unit flags
8771   inassert   non-zero if in an assertion
8772 
8773 Returns:     the fixed first code unit, or 0 with REQ_NONE in flags
8774 */
8775 
8776 static uint32_t
find_firstassertedcu(PCRE2_SPTR code,uint32_t * flags,uint32_t inassert)8777 find_firstassertedcu(PCRE2_SPTR code, uint32_t *flags, uint32_t inassert)
8778 {
8779 uint32_t c = 0;
8780 uint32_t cflags = REQ_NONE;
8781 
8782 *flags = REQ_NONE;
8783 do {
8784    uint32_t d;
8785    uint32_t dflags;
8786    int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
8787              *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
8788    PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
8789    PCRE2_UCHAR op = *scode;
8790 
8791    switch(op)
8792      {
8793      default:
8794      return 0;
8795 
8796      case OP_BRA:
8797      case OP_BRAPOS:
8798      case OP_CBRA:
8799      case OP_SCBRA:
8800      case OP_CBRAPOS:
8801      case OP_SCBRAPOS:
8802      case OP_ASSERT:
8803      case OP_ASSERT_NA:
8804      case OP_ONCE:
8805      case OP_SCRIPT_RUN:
8806      d = find_firstassertedcu(scode, &dflags, inassert +
8807        ((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0));
8808      if (dflags >= REQ_NONE) return 0;
8809      if (cflags >= REQ_NONE) { c = d; cflags = dflags; }
8810        else if (c != d || cflags != dflags) return 0;
8811      break;
8812 
8813      case OP_EXACT:
8814      scode += IMM2_SIZE;
8815      /* Fall through */
8816 
8817      case OP_CHAR:
8818      case OP_PLUS:
8819      case OP_MINPLUS:
8820      case OP_POSPLUS:
8821      if (inassert == 0) return 0;
8822      if (cflags >= REQ_NONE) { c = scode[1]; cflags = 0; }
8823        else if (c != scode[1]) return 0;
8824      break;
8825 
8826      case OP_EXACTI:
8827      scode += IMM2_SIZE;
8828      /* Fall through */
8829 
8830      case OP_CHARI:
8831      case OP_PLUSI:
8832      case OP_MINPLUSI:
8833      case OP_POSPLUSI:
8834      if (inassert == 0) return 0;
8835 
8836      /* If the character is more than one code unit long, we cannot set its
8837      first code unit when matching caselessly. Later scanning may pick up
8838      multiple code units. */
8839 
8840 #ifdef SUPPORT_UNICODE
8841 #if PCRE2_CODE_UNIT_WIDTH == 8
8842      if (scode[1] >= 0x80) return 0;
8843 #elif PCRE2_CODE_UNIT_WIDTH == 16
8844      if (scode[1] >= 0xd800 && scode[1] <= 0xdfff) return 0;
8845 #endif
8846 #endif
8847 
8848      if (cflags >= REQ_NONE) { c = scode[1]; cflags = REQ_CASELESS; }
8849        else if (c != scode[1]) return 0;
8850      break;
8851      }
8852 
8853    code += GET(code, 1);
8854    }
8855 while (*code == OP_ALT);
8856 
8857 *flags = cflags;
8858 return c;
8859 }
8860 
8861 
8862 
8863 /*************************************************
8864 *     Add an entry to the name/number table      *
8865 *************************************************/
8866 
8867 /* This function is called between compiling passes to add an entry to the
8868 name/number table, maintaining alphabetical order. Checking for permitted
8869 and forbidden duplicates has already been done.
8870 
8871 Arguments:
8872   cb           the compile data block
8873   name         the name to add
8874   length       the length of the name
8875   groupno      the group number
8876   tablecount   the count of names in the table so far
8877 
8878 Returns:       nothing
8879 */
8880 
8881 static void
add_name_to_table(compile_block * cb,PCRE2_SPTR name,int length,unsigned int groupno,uint32_t tablecount)8882 add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length,
8883   unsigned int groupno, uint32_t tablecount)
8884 {
8885 uint32_t i;
8886 PCRE2_UCHAR *slot = cb->name_table;
8887 
8888 for (i = 0; i < tablecount; i++)
8889   {
8890   int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length));
8891   if (crc == 0 && slot[IMM2_SIZE+length] != 0)
8892     crc = -1; /* Current name is a substring */
8893 
8894   /* Make space in the table and break the loop for an earlier name. For a
8895   duplicate or later name, carry on. We do this for duplicates so that in the
8896   simple case (when ?(| is not used) they are in order of their numbers. In all
8897   cases they are in the order in which they appear in the pattern. */
8898 
8899   if (crc < 0)
8900     {
8901     (void)memmove(slot + cb->name_entry_size, slot,
8902       CU2BYTES((tablecount - i) * cb->name_entry_size));
8903     break;
8904     }
8905 
8906   /* Continue the loop for a later or duplicate name */
8907 
8908   slot += cb->name_entry_size;
8909   }
8910 
8911 PUT2(slot, 0, groupno);
8912 memcpy(slot + IMM2_SIZE, name, CU2BYTES(length));
8913 
8914 /* Add a terminating zero and fill the rest of the slot with zeroes so that
8915 the memory is all initialized. Otherwise valgrind moans about uninitialized
8916 memory when saving serialized compiled patterns. */
8917 
8918 memset(slot + IMM2_SIZE + length, 0,
8919   CU2BYTES(cb->name_entry_size - length - IMM2_SIZE));
8920 }
8921 
8922 
8923 
8924 /*************************************************
8925 *             Skip in parsed pattern             *
8926 *************************************************/
8927 
8928 /* This function is called to skip parts of the parsed pattern when finding the
8929 length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
8930 the end of the branch, it is called to skip over an internal lookaround or
8931 (DEFINE) group, and it is also called to skip to the end of a class, during
8932 which it will never encounter nested groups (but there's no need to have
8933 special code for that).
8934 
8935 When called to find the end of a branch or group, pptr must point to the first
8936 meta code inside the branch, not the branch-starting code. In other cases it
8937 can point to the item that causes the function to be called.
8938 
8939 Arguments:
8940   pptr       current pointer to skip from
8941   skiptype   PSKIP_CLASS when skipping to end of class
8942              PSKIP_ALT when META_ALT ends the skip
8943              PSKIP_KET when only META_KET ends the skip
8944 
8945 Returns:     new value of pptr
8946              NULL if META_END is reached - should never occur
8947                or for an unknown meta value - likewise
8948 */
8949 
8950 static uint32_t *
parsed_skip(uint32_t * pptr,uint32_t skiptype)8951 parsed_skip(uint32_t *pptr, uint32_t skiptype)
8952 {
8953 uint32_t nestlevel = 0;
8954 
8955 for (;; pptr++)
8956   {
8957   uint32_t meta = META_CODE(*pptr);
8958 
8959   switch(meta)
8960     {
8961     default:  /* Just skip over most items */
8962     if (meta < META_END) continue;  /* Literal */
8963     break;
8964 
8965     /* This should never occur. */
8966 
8967     case META_END:
8968     return NULL;
8969 
8970     /* The data for these items is variable in length. */
8971 
8972     case META_BACKREF:  /* Offset is present only if group >= 10 */
8973     if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;
8974     break;
8975 
8976     case META_ESCAPE:   /* A few escapes are followed by data items. */
8977     switch (META_DATA(*pptr))
8978       {
8979       case ESC_P:
8980       case ESC_p:
8981       pptr += 1;
8982       break;
8983 
8984       case ESC_g:
8985       case ESC_k:
8986       pptr += 1 + SIZEOFFSET;
8987       break;
8988       }
8989     break;
8990 
8991     case META_MARK:     /* Add the length of the name. */
8992     case META_COMMIT_ARG:
8993     case META_PRUNE_ARG:
8994     case META_SKIP_ARG:
8995     case META_THEN_ARG:
8996     pptr += pptr[1];
8997     break;
8998 
8999     /* These are the "active" items in this loop. */
9000 
9001     case META_CLASS_END:
9002     if (skiptype == PSKIP_CLASS) return pptr;
9003     break;
9004 
9005     case META_ATOMIC:
9006     case META_CAPTURE:
9007     case META_COND_ASSERT:
9008     case META_COND_DEFINE:
9009     case META_COND_NAME:
9010     case META_COND_NUMBER:
9011     case META_COND_RNAME:
9012     case META_COND_RNUMBER:
9013     case META_COND_VERSION:
9014     case META_LOOKAHEAD:
9015     case META_LOOKAHEADNOT:
9016     case META_LOOKAHEAD_NA:
9017     case META_LOOKBEHIND:
9018     case META_LOOKBEHINDNOT:
9019     case META_LOOKBEHIND_NA:
9020     case META_NOCAPTURE:
9021     case META_SCRIPT_RUN:
9022     nestlevel++;
9023     break;
9024 
9025     case META_ALT:
9026     if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;
9027     break;
9028 
9029     case META_KET:
9030     if (nestlevel == 0) return pptr;
9031     nestlevel--;
9032     break;
9033     }
9034 
9035   /* The extra data item length for each meta is in a table. */
9036 
9037   meta = (meta >> 16) & 0x7fff;
9038   if (meta >= sizeof(meta_extra_lengths)) return NULL;
9039   pptr += meta_extra_lengths[meta];
9040   }
9041 /* Control never reaches here */
9042 return pptr;
9043 }
9044 
9045 
9046 
9047 /*************************************************
9048 *       Find length of a parsed group            *
9049 *************************************************/
9050 
9051 /* This is called for nested groups within a branch of a lookbehind whose
9052 length is being computed. If all the branches in the nested group have the same
9053 length, that is OK. On entry, the pointer must be at the first element after
9054 the group initializing code. On exit it points to META_KET. Caching is used to
9055 improve processing speed when the same capturing group occurs many times.
9056 
9057 Arguments:
9058   pptrptr     pointer to pointer in the parsed pattern
9059   isinline    FALSE if a reference or recursion; TRUE for inline group
9060   errcodeptr  pointer to the errorcode
9061   lcptr       pointer to the loop counter
9062   group       number of captured group or -1 for a non-capturing group
9063   recurses    chain of recurse_check to catch mutual recursion
9064   cb          pointer to the compile data
9065 
9066 Returns:      the group length or a negative number
9067 */
9068 
9069 static int
get_grouplength(uint32_t ** pptrptr,BOOL isinline,int * errcodeptr,int * lcptr,int group,parsed_recurse_check * recurses,compile_block * cb)9070 get_grouplength(uint32_t **pptrptr, BOOL isinline, int *errcodeptr, int *lcptr,
9071    int group, parsed_recurse_check *recurses, compile_block *cb)
9072 {
9073 int branchlength;
9074 int grouplength = -1;
9075 
9076 /* The cache can be used only if there is no possibility of there being two
9077 groups with the same number. We do not need to set the end pointer for a group
9078 that is being processed as a back reference or recursion, but we must do so for
9079 an inline group. */
9080 
9081 if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
9082   {
9083   uint32_t groupinfo = cb->groupinfo[group];
9084   if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1;
9085   if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
9086     {
9087     if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
9088     return groupinfo & GI_FIXED_LENGTH_MASK;
9089     }
9090   }
9091 
9092 /* Scan the group. In this case we find the end pointer of necessity. */
9093 
9094 for(;;)
9095   {
9096   branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
9097   if (branchlength < 0) goto ISNOTFIXED;
9098   if (grouplength == -1) grouplength = branchlength;
9099     else if (grouplength != branchlength) goto ISNOTFIXED;
9100   if (**pptrptr == META_KET) break;
9101   *pptrptr += 1;   /* Skip META_ALT */
9102   }
9103 
9104 if (group > 0)
9105   cb->groupinfo[group] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
9106 return grouplength;
9107 
9108 ISNOTFIXED:
9109 if (group > 0) cb->groupinfo[group] |= GI_NOT_FIXED_LENGTH;
9110 return -1;
9111 }
9112 
9113 
9114 
9115 /*************************************************
9116 *        Find length of a parsed branch          *
9117 *************************************************/
9118 
9119 /* Return a fixed length for a branch in a lookbehind, giving an error if the
9120 length is not fixed. On entry, *pptrptr points to the first element inside the
9121 branch. On exit it is set to point to the ALT or KET.
9122 
9123 Arguments:
9124   pptrptr     pointer to pointer in the parsed pattern
9125   errcodeptr  pointer to error code
9126   lcptr       pointer to loop counter
9127   recurses    chain of recurse_check to catch mutual recursion
9128   cb          pointer to compile block
9129 
9130 Returns:      the length, or a negative value on error
9131 */
9132 
9133 static int
get_branchlength(uint32_t ** pptrptr,int * errcodeptr,int * lcptr,parsed_recurse_check * recurses,compile_block * cb)9134 get_branchlength(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
9135   parsed_recurse_check *recurses, compile_block *cb)
9136 {
9137 int branchlength = 0;
9138 int grouplength;
9139 uint32_t lastitemlength = 0;
9140 uint32_t *pptr = *pptrptr;
9141 PCRE2_SIZE offset;
9142 parsed_recurse_check this_recurse;
9143 
9144 /* A large and/or complex regex can take too long to process. This can happen
9145 more often when (?| groups are present in the pattern because their length
9146 cannot be cached. */
9147 
9148 if ((*lcptr)++ > 2000)
9149   {
9150   *errcodeptr = ERR35;  /* Lookbehind is too complicated */
9151   return -1;
9152   }
9153 
9154 /* Scan the branch, accumulating the length. */
9155 
9156 for (;; pptr++)
9157   {
9158   parsed_recurse_check *r;
9159   uint32_t *gptr, *gptrend;
9160   uint32_t escape;
9161   uint32_t group = 0;
9162   uint32_t itemlength = 0;
9163 
9164   if (*pptr < META_END)
9165     {
9166     itemlength = 1;
9167     }
9168 
9169   else switch (META_CODE(*pptr))
9170     {
9171     case META_KET:
9172     case META_ALT:
9173     goto EXIT;
9174 
9175     /* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the
9176     actual termination. */
9177 
9178     case META_ACCEPT:
9179     case META_FAIL:
9180     pptr = parsed_skip(pptr, PSKIP_ALT);
9181     if (pptr == NULL) goto PARSED_SKIP_FAILED;
9182     goto EXIT;
9183 
9184     case META_MARK:
9185     case META_COMMIT_ARG:
9186     case META_PRUNE_ARG:
9187     case META_SKIP_ARG:
9188     case META_THEN_ARG:
9189     pptr += pptr[1] + 1;
9190     break;
9191 
9192     case META_CIRCUMFLEX:
9193     case META_COMMIT:
9194     case META_DOLLAR:
9195     case META_PRUNE:
9196     case META_SKIP:
9197     case META_THEN:
9198     break;
9199 
9200     case META_OPTIONS:
9201     pptr += 1;
9202     break;
9203 
9204     case META_BIGVALUE:
9205     itemlength = 1;
9206     pptr += 1;
9207     break;
9208 
9209     case META_CLASS:
9210     case META_CLASS_NOT:
9211     itemlength = 1;
9212     pptr = parsed_skip(pptr, PSKIP_CLASS);
9213     if (pptr == NULL) goto PARSED_SKIP_FAILED;
9214     break;
9215 
9216     case META_CLASS_EMPTY_NOT:
9217     case META_DOT:
9218     itemlength = 1;
9219     break;
9220 
9221     case META_CALLOUT_NUMBER:
9222     pptr += 3;
9223     break;
9224 
9225     case META_CALLOUT_STRING:
9226     pptr += 3 + SIZEOFFSET;
9227     break;
9228 
9229     /* Only some escapes consume a character. Of those, \R and \X are never
9230     allowed because they might match more than character. \C is allowed only in
9231     32-bit and non-UTF 8/16-bit modes. */
9232 
9233     case META_ESCAPE:
9234     escape = META_DATA(*pptr);
9235     if (escape == ESC_R || escape == ESC_X) return -1;
9236     if (escape > ESC_b && escape < ESC_Z)
9237       {
9238 #if PCRE2_CODE_UNIT_WIDTH != 32
9239       if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C)
9240         {
9241         *errcodeptr = ERR36;
9242         return -1;
9243         }
9244 #endif
9245       itemlength = 1;
9246       if (escape == ESC_p || escape == ESC_P) pptr++;  /* Skip prop data */
9247       }
9248     break;
9249 
9250     /* Lookaheads do not contribute to the length of this branch, but they may
9251     contain lookbehinds within them whose lengths need to be set. */
9252 
9253     case META_LOOKAHEAD:
9254     case META_LOOKAHEADNOT:
9255     case META_LOOKAHEAD_NA:
9256     *errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb, lcptr);
9257     if (*errcodeptr != 0) return -1;
9258 
9259     /* Ignore any qualifiers that follow a lookahead assertion. */
9260 
9261     switch (pptr[1])
9262       {
9263       case META_ASTERISK:
9264       case META_ASTERISK_PLUS:
9265       case META_ASTERISK_QUERY:
9266       case META_PLUS:
9267       case META_PLUS_PLUS:
9268       case META_PLUS_QUERY:
9269       case META_QUERY:
9270       case META_QUERY_PLUS:
9271       case META_QUERY_QUERY:
9272       pptr++;
9273       break;
9274 
9275       case META_MINMAX:
9276       case META_MINMAX_PLUS:
9277       case META_MINMAX_QUERY:
9278       pptr += 3;
9279       break;
9280 
9281       default:
9282       break;
9283       }
9284     break;
9285 
9286     /* A nested lookbehind does not contribute any length to this lookbehind,
9287     but must itself be checked and have its lengths set. */
9288 
9289     case META_LOOKBEHIND:
9290     case META_LOOKBEHINDNOT:
9291     case META_LOOKBEHIND_NA:
9292     if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))
9293       return -1;
9294     break;
9295 
9296     /* Back references and recursions are handled by very similar code. At this
9297     stage, the names generated in the parsing pass are available, but the main
9298     name table has not yet been created. So for the named varieties, scan the
9299     list of names in order to get the number of the first one in the pattern,
9300     and whether or not this name is duplicated. */
9301 
9302     case META_BACKREF_BYNAME:
9303     if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)
9304       goto ISNOTFIXED;
9305     /* Fall through */
9306 
9307     case META_RECURSE_BYNAME:
9308       {
9309       int i;
9310       PCRE2_SPTR name;
9311       BOOL is_dupname = FALSE;
9312       named_group *ng = cb->named_groups;
9313       uint32_t meta_code = META_CODE(*pptr);
9314       uint32_t length = *(++pptr);
9315 
9316       GETPLUSOFFSET(offset, pptr);
9317       name = cb->start_pattern + offset;
9318       for (i = 0; i < cb->names_found; i++, ng++)
9319         {
9320         if (length == ng->length && PRIV(strncmp)(name, ng->name, length) == 0)
9321           {
9322           group = ng->number;
9323           is_dupname = ng->isdup;
9324           break;
9325           }
9326         }
9327 
9328       if (group == 0)
9329         {
9330         *errcodeptr = ERR15;  /* Non-existent subpattern */
9331         cb->erroroffset = offset;
9332         return -1;
9333         }
9334 
9335       /* A numerical back reference can be fixed length if duplicate capturing
9336       groups are not being used. A non-duplicate named back reference can also
9337       be handled. */
9338 
9339       if (meta_code == META_RECURSE_BYNAME ||
9340           (!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0))
9341         goto RECURSE_OR_BACKREF_LENGTH;  /* Handle as a numbered version. */
9342       }
9343     goto ISNOTFIXED;                     /* Duplicate name or number */
9344 
9345     /* The offset values for back references < 10 are in a separate vector
9346     because otherwise they would use more than two parsed pattern elements on
9347     64-bit systems. */
9348 
9349     case META_BACKREF:
9350     if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 ||
9351         (cb->external_flags & PCRE2_DUPCAPUSED) != 0)
9352       goto ISNOTFIXED;
9353     group = META_DATA(*pptr);
9354     if (group < 10)
9355       {
9356       offset = cb->small_ref_offset[group];
9357       goto RECURSE_OR_BACKREF_LENGTH;
9358       }
9359 
9360     /* Fall through */
9361     /* For groups >= 10 - picking up group twice does no harm. */
9362 
9363     /* A true recursion implies not fixed length, but a subroutine call may
9364     be OK. Back reference "recursions" are also failed. */
9365 
9366     case META_RECURSE:
9367     group = META_DATA(*pptr);
9368     GETPLUSOFFSET(offset, pptr);
9369 
9370     RECURSE_OR_BACKREF_LENGTH:
9371     if (group > cb->bracount)
9372       {
9373       cb->erroroffset = offset;
9374       *errcodeptr = ERR15;  /* Non-existent subpattern */
9375       return -1;
9376       }
9377     if (group == 0) goto ISNOTFIXED;  /* Local recursion */
9378     for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)
9379       {
9380       if (META_CODE(*gptr) == META_BIGVALUE) gptr++;
9381         else if (*gptr == (META_CAPTURE | group)) break;
9382       }
9383 
9384     /* We must start the search for the end of the group at the first meta code
9385     inside the group. Otherwise it will be treated as an enclosed group. */
9386 
9387     gptrend = parsed_skip(gptr + 1, PSKIP_KET);
9388     if (gptrend == NULL) goto PARSED_SKIP_FAILED;
9389     if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED;  /* Local recursion */
9390     for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;
9391     if (r != NULL) goto ISNOTFIXED;   /* Mutual recursion */
9392     this_recurse.prev = recurses;
9393     this_recurse.groupptr = gptr;
9394 
9395     /* We do not need to know the position of the end of the group, that is,
9396     gptr is not used after the call to get_grouplength(). Setting the second
9397     argument FALSE stops it scanning for the end when the length can be found
9398     in the cache. */
9399 
9400     gptr++;
9401     grouplength = get_grouplength(&gptr, FALSE, errcodeptr, lcptr, group,
9402       &this_recurse, cb);
9403     if (grouplength < 0)
9404       {
9405       if (*errcodeptr == 0) goto ISNOTFIXED;
9406       return -1;  /* Error already set */
9407       }
9408     itemlength = grouplength;
9409     break;
9410 
9411     /* A (DEFINE) group is never obeyed inline and so it does not contribute to
9412     the length of this branch. Skip from the following item to the next
9413     unpaired ket. */
9414 
9415     case META_COND_DEFINE:
9416     pptr = parsed_skip(pptr + 1, PSKIP_KET);
9417     break;
9418 
9419     /* Check other nested groups - advance past the initial data for each type
9420     and then seek a fixed length with get_grouplength(). */
9421 
9422     case META_COND_NAME:
9423     case META_COND_NUMBER:
9424     case META_COND_RNAME:
9425     case META_COND_RNUMBER:
9426     pptr += 2 + SIZEOFFSET;
9427     goto CHECK_GROUP;
9428 
9429     case META_COND_ASSERT:
9430     pptr += 1;
9431     goto CHECK_GROUP;
9432 
9433     case META_COND_VERSION:
9434     pptr += 4;
9435     goto CHECK_GROUP;
9436 
9437     case META_CAPTURE:
9438     group = META_DATA(*pptr);
9439     /* Fall through */
9440 
9441     case META_ATOMIC:
9442     case META_NOCAPTURE:
9443     case META_SCRIPT_RUN:
9444     pptr++;
9445     CHECK_GROUP:
9446     grouplength = get_grouplength(&pptr, TRUE, errcodeptr, lcptr, group,
9447       recurses, cb);
9448     if (grouplength < 0) return -1;
9449     itemlength = grouplength;
9450     break;
9451 
9452     /* Exact repetition is OK; variable repetition is not. A repetition of zero
9453     must subtract the length that has already been added. */
9454 
9455     case META_MINMAX:
9456     case META_MINMAX_PLUS:
9457     case META_MINMAX_QUERY:
9458     if (pptr[1] == pptr[2])
9459       {
9460       switch(pptr[1])
9461         {
9462         case 0:
9463         branchlength -= lastitemlength;
9464         break;
9465 
9466         case 1:
9467         itemlength = 0;
9468         break;
9469 
9470         default:  /* Check for integer overflow */
9471         if (lastitemlength != 0 &&  /* Should not occur, but just in case */
9472             INT_MAX/lastitemlength < pptr[1] - 1)
9473           {
9474           *errcodeptr = ERR87;  /* Integer overflow; lookbehind too big */
9475           return -1;
9476           }
9477         itemlength = (pptr[1] - 1) * lastitemlength;
9478         break;
9479         }
9480       pptr += 2;
9481       break;
9482       }
9483     /* Fall through */
9484 
9485     /* Any other item means this branch does not have a fixed length. */
9486 
9487     default:
9488     ISNOTFIXED:
9489     *errcodeptr = ERR25;   /* Not fixed length */
9490     return -1;
9491     }
9492 
9493   /* Add the item length to the branchlength, checking for integer overflow and
9494   for the branch length exceeding the limit. */
9495 
9496   if (INT_MAX - branchlength < (int)itemlength ||
9497       (branchlength += itemlength) > LOOKBEHIND_MAX)
9498     {
9499     *errcodeptr = ERR87;
9500     return -1;
9501     }
9502 
9503   /* Save this item length for use if the next item is a quantifier. */
9504 
9505   lastitemlength = itemlength;
9506   }
9507 
9508 EXIT:
9509 *pptrptr = pptr;
9510 return branchlength;
9511 
9512 PARSED_SKIP_FAILED:
9513 *errcodeptr = ERR90;
9514 return -1;
9515 }
9516 
9517 
9518 
9519 /*************************************************
9520 *        Set lengths in a lookbehind             *
9521 *************************************************/
9522 
9523 /* This function is called for each lookbehind, to set the lengths in its
9524 branches. An error occurs if any branch does not have a fixed length that is
9525 less than the maximum (65535). On exit, the pointer must be left on the final
9526 ket.
9527 
9528 The function also maintains the max_lookbehind value. Any lookbehind branch
9529 that contains a nested lookbehind may actually look further back than the
9530 length of the branch. The additional amount is passed back from
9531 get_branchlength() as an "extra" value.
9532 
9533 Arguments:
9534   pptrptr     pointer to pointer in the parsed pattern
9535   errcodeptr  pointer to error code
9536   lcptr       pointer to loop counter
9537   recurses    chain of recurse_check to catch mutual recursion
9538   cb          pointer to compile block
9539 
9540 Returns:      TRUE if all is well
9541               FALSE otherwise, with error code and offset set
9542 */
9543 
9544 static BOOL
set_lookbehind_lengths(uint32_t ** pptrptr,int * errcodeptr,int * lcptr,parsed_recurse_check * recurses,compile_block * cb)9545 set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
9546   parsed_recurse_check *recurses, compile_block *cb)
9547 {
9548 PCRE2_SIZE offset;
9549 int branchlength;
9550 uint32_t *bptr = *pptrptr;
9551 
9552 READPLUSOFFSET(offset, bptr);  /* Offset for error messages */
9553 *pptrptr += SIZEOFFSET;
9554 
9555 do
9556   {
9557   *pptrptr += 1;
9558   branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
9559   if (branchlength < 0)
9560     {
9561     /* The errorcode and offset may already be set from a nested lookbehind. */
9562     if (*errcodeptr == 0) *errcodeptr = ERR25;
9563     if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
9564     return FALSE;
9565     }
9566   if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;
9567   *bptr |= branchlength;  /* branchlength never more than 65535 */
9568   bptr = *pptrptr;
9569   }
9570 while (*bptr == META_ALT);
9571 
9572 return TRUE;
9573 }
9574 
9575 
9576 
9577 /*************************************************
9578 *         Check parsed pattern lookbehinds       *
9579 *************************************************/
9580 
9581 /* This function is called at the end of parsing a pattern if any lookbehinds
9582 were encountered. It scans the parsed pattern for them, calling
9583 set_lookbehind_lengths() for each one. At the start, the errorcode is zero and
9584 the error offset is marked unset. The enables the functions above not to
9585 override settings from deeper nestings.
9586 
9587 This function is called recursively from get_branchlength() for lookaheads in
9588 order to process any lookbehinds that they may contain. It stops when it hits a
9589 non-nested closing parenthesis in this case, returning a pointer to it.
9590 
9591 Arguments
9592   pptr      points to where to start (start of pattern or start of lookahead)
9593   retptr    if not NULL, return the ket pointer here
9594   recurses  chain of recurse_check to catch mutual recursion
9595   cb        points to the compile block
9596   lcptr     points to loop counter
9597 
9598 Returns:    0 on success, or an errorcode (cb->erroroffset will be set)
9599 */
9600 
9601 static int
check_lookbehinds(uint32_t * pptr,uint32_t ** retptr,parsed_recurse_check * recurses,compile_block * cb,int * lcptr)9602 check_lookbehinds(uint32_t *pptr, uint32_t **retptr,
9603   parsed_recurse_check *recurses, compile_block *cb, int *lcptr)
9604 {
9605 int errorcode = 0;
9606 int nestlevel = 0;
9607 
9608 cb->erroroffset = PCRE2_UNSET;
9609 
9610 for (; *pptr != META_END; pptr++)
9611   {
9612   if (*pptr < META_END) continue;  /* Literal */
9613 
9614   switch (META_CODE(*pptr))
9615     {
9616     default:
9617     return ERR70;  /* Unrecognized meta code */
9618 
9619     case META_ESCAPE:
9620     if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
9621       pptr += 1;
9622     break;
9623 
9624     case META_KET:
9625     if (--nestlevel < 0)
9626       {
9627       if (retptr != NULL) *retptr = pptr;
9628       return 0;
9629       }
9630     break;
9631 
9632     case META_ATOMIC:
9633     case META_CAPTURE:
9634     case META_COND_ASSERT:
9635     case META_LOOKAHEAD:
9636     case META_LOOKAHEADNOT:
9637     case META_LOOKAHEAD_NA:
9638     case META_NOCAPTURE:
9639     case META_SCRIPT_RUN:
9640     nestlevel++;
9641     break;
9642 
9643     case META_ACCEPT:
9644     case META_ALT:
9645     case META_ASTERISK:
9646     case META_ASTERISK_PLUS:
9647     case META_ASTERISK_QUERY:
9648     case META_BACKREF:
9649     case META_CIRCUMFLEX:
9650     case META_CLASS:
9651     case META_CLASS_EMPTY:
9652     case META_CLASS_EMPTY_NOT:
9653     case META_CLASS_END:
9654     case META_CLASS_NOT:
9655     case META_COMMIT:
9656     case META_DOLLAR:
9657     case META_DOT:
9658     case META_FAIL:
9659     case META_PLUS:
9660     case META_PLUS_PLUS:
9661     case META_PLUS_QUERY:
9662     case META_PRUNE:
9663     case META_QUERY:
9664     case META_QUERY_PLUS:
9665     case META_QUERY_QUERY:
9666     case META_RANGE_ESCAPED:
9667     case META_RANGE_LITERAL:
9668     case META_SKIP:
9669     case META_THEN:
9670     break;
9671 
9672     case META_RECURSE:
9673     pptr += SIZEOFFSET;
9674     break;
9675 
9676     case META_BACKREF_BYNAME:
9677     case META_RECURSE_BYNAME:
9678     pptr += 1 + SIZEOFFSET;
9679     break;
9680 
9681     case META_COND_DEFINE:
9682     pptr += SIZEOFFSET;
9683     nestlevel++;
9684     break;
9685 
9686     case META_COND_NAME:
9687     case META_COND_NUMBER:
9688     case META_COND_RNAME:
9689     case META_COND_RNUMBER:
9690     pptr += 1 + SIZEOFFSET;
9691     nestlevel++;
9692     break;
9693 
9694     case META_COND_VERSION:
9695     pptr += 3;
9696     nestlevel++;
9697     break;
9698 
9699     case META_CALLOUT_STRING:
9700     pptr += 3 + SIZEOFFSET;
9701     break;
9702 
9703     case META_BIGVALUE:
9704     case META_OPTIONS:
9705     case META_POSIX:
9706     case META_POSIX_NEG:
9707     pptr += 1;
9708     break;
9709 
9710     case META_MINMAX:
9711     case META_MINMAX_QUERY:
9712     case META_MINMAX_PLUS:
9713     pptr += 2;
9714     break;
9715 
9716     case META_CALLOUT_NUMBER:
9717     pptr += 3;
9718     break;
9719 
9720     case META_MARK:
9721     case META_COMMIT_ARG:
9722     case META_PRUNE_ARG:
9723     case META_SKIP_ARG:
9724     case META_THEN_ARG:
9725     pptr += 1 + pptr[1];
9726     break;
9727 
9728     case META_LOOKBEHIND:
9729     case META_LOOKBEHINDNOT:
9730     case META_LOOKBEHIND_NA:
9731     if (!set_lookbehind_lengths(&pptr, &errorcode, lcptr, recurses, cb))
9732       return errorcode;
9733     break;
9734     }
9735   }
9736 
9737 return 0;
9738 }
9739 
9740 
9741 
9742 /*************************************************
9743 *     External function to compile a pattern     *
9744 *************************************************/
9745 
9746 /* This function reads a regular expression in the form of a string and returns
9747 a pointer to a block of store holding a compiled version of the expression.
9748 
9749 Arguments:
9750   pattern       the regular expression
9751   patlen        the length of the pattern, or PCRE2_ZERO_TERMINATED
9752   options       option bits
9753   errorptr      pointer to errorcode
9754   erroroffset   pointer to error offset
9755   ccontext      points to a compile context or is NULL
9756 
9757 Returns:        pointer to compiled data block, or NULL on error,
9758                 with errorcode and erroroffset set
9759 */
9760 
9761 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_compile(PCRE2_SPTR pattern,PCRE2_SIZE patlen,uint32_t options,int * errorptr,PCRE2_SIZE * erroroffset,pcre2_compile_context * ccontext)9762 pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
9763    int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
9764 {
9765 BOOL utf;                             /* Set TRUE for UTF mode */
9766 BOOL ucp;                             /* Set TRUE for UCP mode */
9767 BOOL has_lookbehind = FALSE;          /* Set TRUE if a lookbehind is found */
9768 BOOL zero_terminated;                 /* Set TRUE for zero-terminated pattern */
9769 pcre2_real_code *re = NULL;           /* What we will return */
9770 compile_block cb;                     /* "Static" compile-time data */
9771 const uint8_t *tables;                /* Char tables base pointer */
9772 
9773 PCRE2_UCHAR *code;                    /* Current pointer in compiled code */
9774 PCRE2_SPTR codestart;                 /* Start of compiled code */
9775 PCRE2_SPTR ptr;                       /* Current pointer in pattern */
9776 uint32_t *pptr;                       /* Current pointer in parsed pattern */
9777 
9778 PCRE2_SIZE length = 1;                /* Allow for final END opcode */
9779 PCRE2_SIZE usedlength;                /* Actual length used */
9780 PCRE2_SIZE re_blocksize;              /* Size of memory block */
9781 PCRE2_SIZE big32count = 0;            /* 32-bit literals >= 0x80000000 */
9782 PCRE2_SIZE parsed_size_needed;        /* Needed for parsed pattern */
9783 
9784 uint32_t firstcuflags, reqcuflags;    /* Type of first/req code unit */
9785 uint32_t firstcu, reqcu;              /* Value of first/req code unit */
9786 uint32_t setflags = 0;                /* NL and BSR set flags */
9787 
9788 uint32_t skipatstart;                 /* When checking (*UTF) etc */
9789 uint32_t limit_heap  = UINT32_MAX;
9790 uint32_t limit_match = UINT32_MAX;    /* Unset match limits */
9791 uint32_t limit_depth = UINT32_MAX;
9792 
9793 int newline = 0;                      /* Unset; can be set by the pattern */
9794 int bsr = 0;                          /* Unset; can be set by the pattern */
9795 int errorcode = 0;                    /* Initialize to avoid compiler warn */
9796 int regexrc;                          /* Return from compile */
9797 
9798 uint32_t i;                           /* Local loop counter */
9799 
9800 /* Comments at the head of this file explain about these variables. */
9801 
9802 uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];
9803 uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];
9804 named_group named_groups[NAMED_GROUP_LIST_SIZE];
9805 
9806 /* The workspace is used in different ways in the different compiling phases.
9807 It needs to be 16-bit aligned for the preliminary parsing scan. */
9808 
9809 uint32_t c16workspace[C16_WORK_SIZE];
9810 PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace;
9811 
9812 
9813 /* -------------- Check arguments and set up the pattern ----------------- */
9814 
9815 /* There must be error code and offset pointers. */
9816 
9817 if (errorptr == NULL || erroroffset == NULL) return NULL;
9818 *errorptr = ERR0;
9819 *erroroffset = 0;
9820 
9821 /* There must be a pattern! */
9822 
9823 if (pattern == NULL)
9824   {
9825   *errorptr = ERR16;
9826   return NULL;
9827   }
9828 
9829 /* A NULL compile context means "use a default context" */
9830 
9831 if (ccontext == NULL)
9832   ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
9833 
9834 /* PCRE2_MATCH_INVALID_UTF implies UTF */
9835 
9836 if ((options & PCRE2_MATCH_INVALID_UTF) != 0) options |= PCRE2_UTF;
9837 
9838 /* Check that all undefined public option bits are zero. */
9839 
9840 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 ||
9841     (ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)
9842   {
9843   *errorptr = ERR17;
9844   return NULL;
9845   }
9846 
9847 if ((options & PCRE2_LITERAL) != 0 &&
9848     ((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 ||
9849      (ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))
9850   {
9851   *errorptr = ERR92;
9852   return NULL;
9853   }
9854 
9855 /* A zero-terminated pattern is indicated by the special length value
9856 PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */
9857 
9858 if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))
9859   patlen = PRIV(strlen)(pattern);
9860 
9861 if (patlen > ccontext->max_pattern_length)
9862   {
9863   *errorptr = ERR88;
9864   return NULL;
9865   }
9866 
9867 /* From here on, all returns from this function should end up going via the
9868 EXIT label. */
9869 
9870 
9871 /* ------------ Initialize the "static" compile data -------------- */
9872 
9873 tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
9874 
9875 cb.lcc = tables + lcc_offset;          /* Individual */
9876 cb.fcc = tables + fcc_offset;          /*   character */
9877 cb.cbits = tables + cbits_offset;      /*      tables */
9878 cb.ctypes = tables + ctypes_offset;
9879 
9880 cb.assert_depth = 0;
9881 cb.bracount = 0;
9882 cb.cx = ccontext;
9883 cb.dupnames = FALSE;
9884 cb.end_pattern = pattern + patlen;
9885 cb.erroroffset = 0;
9886 cb.external_flags = 0;
9887 cb.external_options = options;
9888 cb.groupinfo = stack_groupinfo;
9889 cb.had_recurse = FALSE;
9890 cb.lastcapture = 0;
9891 cb.max_lookbehind = 0;
9892 cb.name_entry_size = 0;
9893 cb.name_table = NULL;
9894 cb.named_groups = named_groups;
9895 cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
9896 cb.names_found = 0;
9897 cb.open_caps = NULL;
9898 cb.parens_depth = 0;
9899 cb.parsed_pattern = stack_parsed_pattern;
9900 cb.req_varyopt = 0;
9901 cb.start_code = cworkspace;
9902 cb.start_pattern = pattern;
9903 cb.start_workspace = cworkspace;
9904 cb.workspace_size = COMPILE_WORK_SIZE;
9905 
9906 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
9907 references to help in deciding whether (.*) can be treated as anchored or not.
9908 */
9909 
9910 cb.top_backref = 0;
9911 cb.backref_map = 0;
9912 
9913 /* Escape sequences \1 to \9 are always back references, but as they are only
9914 two characters long, only two elements can be used in the parsed_pattern
9915 vector. The first contains the reference, and we'd like to use the second to
9916 record the offset in the pattern, so that forward references to non-existent
9917 groups can be diagnosed later with an offset. However, on 64-bit systems,
9918 PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first
9919 occurrence of \1 to \9, indexed by the second parsed_pattern value. All other
9920 references have enough space for the offset to be put into the parsed pattern.
9921 */
9922 
9923 for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET;
9924 
9925 
9926 /* --------------- Start looking at the pattern --------------- */
9927 
9928 /* Unless PCRE2_LITERAL is set, check for global one-time option settings at
9929 the start of the pattern, and remember the offset to the actual regex. With
9930 valgrind support, make the terminator of a zero-terminated pattern
9931 inaccessible. This catches bugs that would otherwise only show up for
9932 non-zero-terminated patterns. */
9933 
9934 #ifdef SUPPORT_VALGRIND
9935 if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1));
9936 #endif
9937 
9938 ptr = pattern;
9939 skipatstart = 0;
9940 
9941 if ((options & PCRE2_LITERAL) == 0)
9942   {
9943   while (patlen - skipatstart >= 2 &&
9944          ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
9945          ptr[skipatstart+1] == CHAR_ASTERISK)
9946     {
9947     for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
9948       {
9949       uint32_t c, pp;
9950       pso *p = pso_list + i;
9951 
9952       if (patlen - skipatstart - 2 >= p->length &&
9953           PRIV(strncmp_c8)(ptr + skipatstart + 2, (char *)(p->name),
9954             p->length) == 0)
9955         {
9956         skipatstart += p->length + 2;
9957         switch(p->type)
9958           {
9959           case PSO_OPT:
9960           cb.external_options |= p->value;
9961           break;
9962 
9963           case PSO_FLG:
9964           setflags |= p->value;
9965           break;
9966 
9967           case PSO_NL:
9968           newline = p->value;
9969           setflags |= PCRE2_NL_SET;
9970           break;
9971 
9972           case PSO_BSR:
9973           bsr = p->value;
9974           setflags |= PCRE2_BSR_SET;
9975           break;
9976 
9977           case PSO_LIMM:
9978           case PSO_LIMD:
9979           case PSO_LIMH:
9980           c = 0;
9981           pp = skipatstart;
9982           if (!IS_DIGIT(ptr[pp]))
9983             {
9984             errorcode = ERR60;
9985             ptr += pp;
9986             goto HAD_EARLY_ERROR;
9987             }
9988           while (IS_DIGIT(ptr[pp]))
9989             {
9990             if (c > UINT32_MAX / 10 - 1) break;   /* Integer overflow */
9991             c = c*10 + (ptr[pp++] - CHAR_0);
9992             }
9993           if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS)
9994             {
9995             errorcode = ERR60;
9996             ptr += pp;
9997             goto HAD_EARLY_ERROR;
9998             }
9999           if (p->type == PSO_LIMH) limit_heap = c;
10000             else if (p->type == PSO_LIMM) limit_match = c;
10001             else limit_depth = c;
10002           skipatstart += pp - skipatstart;
10003           break;
10004           }
10005         break;   /* Out of the table scan loop */
10006         }
10007       }
10008     if (i >= sizeof(pso_list)/sizeof(pso)) break;   /* Out of pso loop */
10009     }
10010   }
10011 
10012 /* End of pattern-start options; advance to start of real regex. */
10013 
10014 ptr += skipatstart;
10015 
10016 /* Can't support UTF or UCP if PCRE2 was built without Unicode support. */
10017 
10018 #ifndef SUPPORT_UNICODE
10019 if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
10020   {
10021   errorcode = ERR32;
10022   goto HAD_EARLY_ERROR;
10023   }
10024 #endif
10025 
10026 /* Check UTF. We have the original options in 'options', with that value as
10027 modified by (*UTF) etc in cb->external_options. The extra option
10028 PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
10029 surrogate code points cannot be represented in UTF-16. */
10030 
10031 utf = (cb.external_options & PCRE2_UTF) != 0;
10032 if (utf)
10033   {
10034   if ((options & PCRE2_NEVER_UTF) != 0)
10035     {
10036     errorcode = ERR74;
10037     goto HAD_EARLY_ERROR;
10038     }
10039   if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
10040        (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
10041     goto HAD_ERROR;  /* Offset was set by valid_utf() */
10042 
10043 #if PCRE2_CODE_UNIT_WIDTH == 16
10044   if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
10045     {
10046     errorcode = ERR91;
10047     goto HAD_EARLY_ERROR;
10048     }
10049 #endif
10050   }
10051 
10052 /* Check UCP lockout. */
10053 
10054 ucp = (cb.external_options & PCRE2_UCP) != 0;
10055 if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
10056   {
10057   errorcode = ERR75;
10058   goto HAD_EARLY_ERROR;
10059   }
10060 
10061 /* Process the BSR setting. */
10062 
10063 if (bsr == 0) bsr = ccontext->bsr_convention;
10064 
10065 /* Process the newline setting. */
10066 
10067 if (newline == 0) newline = ccontext->newline_convention;
10068 cb.nltype = NLTYPE_FIXED;
10069 switch(newline)
10070   {
10071   case PCRE2_NEWLINE_CR:
10072   cb.nllen = 1;
10073   cb.nl[0] = CHAR_CR;
10074   break;
10075 
10076   case PCRE2_NEWLINE_LF:
10077   cb.nllen = 1;
10078   cb.nl[0] = CHAR_NL;
10079   break;
10080 
10081   case PCRE2_NEWLINE_NUL:
10082   cb.nllen = 1;
10083   cb.nl[0] = CHAR_NUL;
10084   break;
10085 
10086   case PCRE2_NEWLINE_CRLF:
10087   cb.nllen = 2;
10088   cb.nl[0] = CHAR_CR;
10089   cb.nl[1] = CHAR_NL;
10090   break;
10091 
10092   case PCRE2_NEWLINE_ANY:
10093   cb.nltype = NLTYPE_ANY;
10094   break;
10095 
10096   case PCRE2_NEWLINE_ANYCRLF:
10097   cb.nltype = NLTYPE_ANYCRLF;
10098   break;
10099 
10100   default:
10101   errorcode = ERR56;
10102   goto HAD_EARLY_ERROR;
10103   }
10104 
10105 /* Pre-scan the pattern to do two things: (1) Discover the named groups and
10106 their numerical equivalents, so that this information is always available for
10107 the remaining processing. (2) At the same time, parse the pattern and put a
10108 processed version into the parsed_pattern vector. This has escapes interpreted
10109 and comments removed (amongst other things).
10110 
10111 In all but one case, when PCRE2_AUTO_CALLOUT is not set, the number of unsigned
10112 32-bit ints in the parsed pattern is bounded by the length of the pattern plus
10113 one (for the terminator) plus four if PCRE2_EXTRA_WORD or PCRE2_EXTRA_LINE is
10114 set. The exceptional case is when running in 32-bit, non-UTF mode, when literal
10115 characters greater than META_END (0x80000000) have to be coded as two units. In
10116 this case, therefore, we scan the pattern to check for such values. */
10117 
10118 #if PCRE2_CODE_UNIT_WIDTH == 32
10119 if (!utf)
10120   {
10121   PCRE2_SPTR p;
10122   for (p = ptr; p < cb.end_pattern; p++) if (*p >= META_END) big32count++;
10123   }
10124 #endif
10125 
10126 /* Ensure that the parsed pattern buffer is big enough. When PCRE2_AUTO_CALLOUT
10127 is set we have to assume a numerical callout (4 elements) for each character
10128 plus one at the end. This is overkill, but memory is plentiful these days. For
10129 many smaller patterns the vector on the stack (which was set up above) can be
10130 used. */
10131 
10132 parsed_size_needed = patlen - skipatstart + big32count;
10133 
10134 if ((ccontext->extra_options &
10135      (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0)
10136   parsed_size_needed += 4;
10137 
10138 if ((options & PCRE2_AUTO_CALLOUT) != 0)
10139   parsed_size_needed = (parsed_size_needed + 1) * 5;
10140 
10141 if (parsed_size_needed >= PARSED_PATTERN_DEFAULT_SIZE)
10142   {
10143   uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(
10144     (parsed_size_needed + 1) * sizeof(uint32_t), ccontext->memctl.memory_data);
10145   if (heap_parsed_pattern == NULL)
10146     {
10147     *errorptr = ERR21;
10148     goto EXIT;
10149     }
10150   cb.parsed_pattern = heap_parsed_pattern;
10151   }
10152 cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed + 1;
10153 
10154 /* Do the parsing scan. */
10155 
10156 errorcode = parse_regex(ptr, cb.external_options, &has_lookbehind, &cb);
10157 if (errorcode != 0) goto HAD_CB_ERROR;
10158 
10159 /* Workspace is needed to remember information about numbered groups: whether a
10160 group can match an empty string and what its fixed length is. This is done to
10161 avoid the possibility of recursive references causing very long compile times
10162 when checking these features. Unnumbered groups do not have this exposure since
10163 they cannot be referenced. We use an indexed vector for this purpose. If there
10164 are sufficiently few groups, the default vector on the stack, as set up above,
10165 can be used. Otherwise we have to get/free a special vector. The vector must be
10166 initialized to zero. */
10167 
10168 if (cb.bracount >= GROUPINFO_DEFAULT_SIZE)
10169   {
10170   cb.groupinfo = ccontext->memctl.malloc(
10171     (cb.bracount + 1)*sizeof(uint32_t), ccontext->memctl.memory_data);
10172   if (cb.groupinfo == NULL)
10173     {
10174     errorcode = ERR21;
10175     cb.erroroffset = 0;
10176     goto HAD_CB_ERROR;
10177     }
10178   }
10179 memset(cb.groupinfo, 0, (cb.bracount + 1) * sizeof(uint32_t));
10180 
10181 /* If there were any lookbehinds, scan the parsed pattern to figure out their
10182 lengths. */
10183 
10184 if (has_lookbehind)
10185   {
10186   int loopcount = 0;
10187   errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb, &loopcount);
10188   if (errorcode != 0) goto HAD_CB_ERROR;
10189   }
10190 
10191 /* For debugging, there is a function that shows the parsed data vector. */
10192 
10193 #ifdef DEBUG_SHOW_PARSED
10194 fprintf(stderr, "+++ Pre-scan complete:\n");
10195 show_parsed(&cb);
10196 #endif
10197 
10198 /* For debugging capturing information this code can be enabled. */
10199 
10200 #ifdef DEBUG_SHOW_CAPTURES
10201   {
10202   named_group *ng = cb.named_groups;
10203   fprintf(stderr, "+++Captures: %d\n", cb.bracount);
10204   for (i = 0; i < cb.names_found; i++, ng++)
10205     {
10206     fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
10207     }
10208   }
10209 #endif
10210 
10211 /* Pretend to compile the pattern while actually just accumulating the amount
10212 of memory required in the 'length' variable. This behaviour is triggered by
10213 passing a non-NULL final argument to compile_regex(). We pass a block of
10214 workspace (cworkspace) for it to compile parts of the pattern into; the
10215 compiled code is discarded when it is no longer needed, so hopefully this
10216 workspace will never overflow, though there is a test for its doing so.
10217 
10218 On error, errorcode will be set non-zero, so we don't need to look at the
10219 result of the function. The initial options have been put into the cb block,
10220 but we still have to pass a separate options variable (the first argument)
10221 because the options may change as the pattern is processed. */
10222 
10223 cb.erroroffset = patlen;   /* For any subsequent errors that do not set it */
10224 pptr = cb.parsed_pattern;
10225 code = cworkspace;
10226 *code = OP_BRA;
10227 
10228 (void)compile_regex(cb.external_options, &code, &pptr, &errorcode, 0, &firstcu,
10229    &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, &length);
10230 
10231 if (errorcode != 0) goto HAD_CB_ERROR;  /* Offset is in cb.erroroffset */
10232 
10233 /* This should be caught in compile_regex(), but just in case... */
10234 
10235 if (length > MAX_PATTERN_SIZE)
10236   {
10237   errorcode = ERR20;
10238   goto HAD_CB_ERROR;
10239   }
10240 
10241 /* Compute the size of, and then get and initialize, the data block for storing
10242 the compiled pattern and names table. Integer overflow should no longer be
10243 possible because nowadays we limit the maximum value of cb.names_found and
10244 cb.name_entry_size. */
10245 
10246 re_blocksize = sizeof(pcre2_real_code) +
10247   CU2BYTES(length +
10248   (PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
10249 re = (pcre2_real_code *)
10250   ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
10251 if (re == NULL)
10252   {
10253   errorcode = ERR21;
10254   goto HAD_CB_ERROR;
10255   }
10256 
10257 /* The compiler may put padding at the end of the pcre2_real_code structure in
10258 order to round it up to a multiple of 4 or 8 bytes. This means that when a
10259 compiled pattern is copied (for example, when serialized) undefined bytes are
10260 read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
10261 write to the last 8 bytes of the structure before setting the fields. */
10262 
10263 memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
10264 re->memctl = ccontext->memctl;
10265 re->tables = tables;
10266 re->executable_jit = NULL;
10267 memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
10268 re->blocksize = re_blocksize;
10269 re->magic_number = MAGIC_NUMBER;
10270 re->compile_options = options;
10271 re->overall_options = cb.external_options;
10272 re->extra_options = ccontext->extra_options;
10273 re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
10274 re->limit_heap = limit_heap;
10275 re->limit_match = limit_match;
10276 re->limit_depth = limit_depth;
10277 re->first_codeunit = 0;
10278 re->last_codeunit = 0;
10279 re->bsr_convention = bsr;
10280 re->newline_convention = newline;
10281 re->max_lookbehind = 0;
10282 re->minlength = 0;
10283 re->top_bracket = 0;
10284 re->top_backref = 0;
10285 re->name_entry_size = cb.name_entry_size;
10286 re->name_count = cb.names_found;
10287 
10288 /* The basic block is immediately followed by the name table, and the compiled
10289 code follows after that. */
10290 
10291 codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) +
10292   re->name_entry_size * re->name_count;
10293 
10294 /* Update the compile data block for the actual compile. The starting points of
10295 the name/number translation table and of the code are passed around in the
10296 compile data block. The start/end pattern and initial options are already set
10297 from the pre-compile phase, as is the name_entry_size field. */
10298 
10299 cb.parens_depth = 0;
10300 cb.assert_depth = 0;
10301 cb.lastcapture = 0;
10302 cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
10303 cb.start_code = codestart;
10304 cb.req_varyopt = 0;
10305 cb.had_accept = FALSE;
10306 cb.had_pruneorskip = FALSE;
10307 cb.open_caps = NULL;
10308 
10309 /* If any named groups were found, create the name/number table from the list
10310 created in the pre-pass. */
10311 
10312 if (cb.names_found > 0)
10313   {
10314   named_group *ng = cb.named_groups;
10315   for (i = 0; i < cb.names_found; i++, ng++)
10316     add_name_to_table(&cb, ng->name, ng->length, ng->number, i);
10317   }
10318 
10319 /* Set up a starting, non-extracting bracket, then compile the expression. On
10320 error, errorcode will be set non-zero, so we don't need to look at the result
10321 of the function here. */
10322 
10323 pptr = cb.parsed_pattern;
10324 code = (PCRE2_UCHAR *)codestart;
10325 *code = OP_BRA;
10326 regexrc = compile_regex(re->overall_options, &code, &pptr, &errorcode, 0,
10327   &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, NULL);
10328 if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
10329 re->top_bracket = cb.bracount;
10330 re->top_backref = cb.top_backref;
10331 re->max_lookbehind = cb.max_lookbehind;
10332 
10333 if (cb.had_accept)
10334   {
10335   reqcu = 0;                     /* Must disable after (*ACCEPT) */
10336   reqcuflags = REQ_NONE;
10337   re->flags |= PCRE2_HASACCEPT;  /* Disables minimum length */
10338   }
10339 
10340 /* Fill in the final opcode and check for disastrous overflow. If no overflow,
10341 but the estimated length exceeds the really used length, adjust the value of
10342 re->blocksize, and if valgrind support is configured, mark the extra allocated
10343 memory as unaddressable, so that any out-of-bound reads can be detected. */
10344 
10345 *code++ = OP_END;
10346 usedlength = code - codestart;
10347 if (usedlength > length) errorcode = ERR23; else
10348   {
10349   re->blocksize -= CU2BYTES(length - usedlength);
10350 #ifdef SUPPORT_VALGRIND
10351   VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
10352 #endif
10353   }
10354 
10355 /* Scan the pattern for recursion/subroutine calls and convert the group
10356 numbers into offsets. Maintain a small cache so that repeated groups containing
10357 recursions are efficiently handled. */
10358 
10359 #define RSCAN_CACHE_SIZE 8
10360 
10361 if (errorcode == 0 && cb.had_recurse)
10362   {
10363   PCRE2_UCHAR *rcode;
10364   PCRE2_SPTR rgroup;
10365   unsigned int ccount = 0;
10366   int start = RSCAN_CACHE_SIZE;
10367   recurse_cache rc[RSCAN_CACHE_SIZE];
10368 
10369   for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf);
10370        rcode != NULL;
10371        rcode = (PCRE2_UCHAR *)find_recurse(rcode + 1 + LINK_SIZE, utf))
10372     {
10373     int p, groupnumber;
10374 
10375     groupnumber = (int)GET(rcode, 1);
10376     if (groupnumber == 0) rgroup = codestart; else
10377       {
10378       PCRE2_SPTR search_from = codestart;
10379       rgroup = NULL;
10380       for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
10381         {
10382         if (groupnumber == rc[p].groupnumber)
10383           {
10384           rgroup = rc[p].group;
10385           break;
10386           }
10387 
10388         /* Group n+1 must always start to the right of group n, so we can save
10389         search time below when the new group number is greater than any of the
10390         previously found groups. */
10391 
10392         if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;
10393         }
10394 
10395       if (rgroup == NULL)
10396         {
10397         rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);
10398         if (rgroup == NULL)
10399           {
10400           errorcode = ERR53;
10401           break;
10402           }
10403         if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
10404         rc[start].groupnumber = groupnumber;
10405         rc[start].group = rgroup;
10406         if (ccount < RSCAN_CACHE_SIZE) ccount++;
10407         }
10408       }
10409 
10410     PUT(rcode, 1, rgroup - codestart);
10411     }
10412   }
10413 
10414 /* In rare debugging situations we sometimes need to look at the compiled code
10415 at this stage. */
10416 
10417 #ifdef DEBUG_CALL_PRINTINT
10418 pcre2_printint(re, stderr, TRUE);
10419 fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
10420 #endif
10421 
10422 /* Unless disabled, check whether any single character iterators can be
10423 auto-possessified. The function overwrites the appropriate opcode values, so
10424 the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
10425 used in this code because at least one compiler gives a warning about loss of
10426 "const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the
10427 function call. */
10428 
10429 if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
10430   {
10431   PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
10432   if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80;
10433   }
10434 
10435 /* Failed to compile, or error while post-processing. */
10436 
10437 if (errorcode != 0) goto HAD_CB_ERROR;
10438 
10439 /* Successful compile. If the anchored option was not passed, set it if
10440 we can determine that the pattern is anchored by virtue of ^ characters or \A
10441 or anything else, such as starting with non-atomic .* when DOTALL is set and
10442 there are no occurrences of *PRUNE or *SKIP (though there is an option to
10443 disable this case). */
10444 
10445 if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10446      is_anchored(codestart, 0, &cb, 0, FALSE))
10447   re->overall_options |= PCRE2_ANCHORED;
10448 
10449 /* Set up the first code unit or startline flag, the required code unit, and
10450 then study the pattern. This code need not be obeyed if PCRE2_NO_START_OPTIMIZE
10451 is set, as the data it would create will not be used. Note that a first code
10452 unit (but not the startline flag) is useful for anchored patterns because it
10453 can still give a quick "no match" and also avoid searching for a last code
10454 unit. */
10455 
10456 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
10457   {
10458   int minminlength = 0;  /* For minimal minlength from first/required CU */
10459 
10460   /* If we do not have a first code unit, see if there is one that is asserted
10461   (these are not saved during the compile because they can cause conflicts with
10462   actual literals that follow). */
10463 
10464   if (firstcuflags >= REQ_NONE)
10465     firstcu = find_firstassertedcu(codestart, &firstcuflags, 0);
10466 
10467   /* Save the data for a first code unit. The existence of one means the
10468   minimum length must be at least 1. */
10469 
10470   if (firstcuflags < REQ_NONE)
10471     {
10472     re->first_codeunit = firstcu;
10473     re->flags |= PCRE2_FIRSTSET;
10474     minminlength++;
10475 
10476     /* Handle caseless first code units. */
10477 
10478     if ((firstcuflags & REQ_CASELESS) != 0)
10479       {
10480       if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
10481         {
10482         if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
10483         }
10484 
10485       /* The first code unit is > 128 in UTF or UCP mode, or > 255 otherwise.
10486       In 8-bit UTF mode, codepoints in the range 128-255 are introductory code
10487       points and cannot have another case, but if UCP is set they may do. */
10488 
10489 #ifdef SUPPORT_UNICODE
10490 #if PCRE2_CODE_UNIT_WIDTH == 8
10491       else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
10492         re->flags |= PCRE2_FIRSTCASELESS;
10493 #else
10494       else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
10495                UCD_OTHERCASE(firstcu) != firstcu)
10496         re->flags |= PCRE2_FIRSTCASELESS;
10497 #endif
10498 #endif  /* SUPPORT_UNICODE */
10499       }
10500     }
10501 
10502   /* When there is no first code unit, for non-anchored patterns, see if we can
10503   set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
10504   branches start with ^ and also when all branches start with non-atomic .* for
10505   non-DOTALL matches when *PRUNE and SKIP are not present. (There is an option
10506   that disables this case.) */
10507 
10508   else if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10509            is_startline(codestart, 0, &cb, 0, FALSE))
10510     re->flags |= PCRE2_STARTLINE;
10511 
10512   /* Handle the "required code unit", if one is set. In the UTF case we can
10513   increment the minimum minimum length only if we are sure this really is a
10514   different character and not a non-starting code unit of the first character,
10515   because the minimum length count is in characters, not code units. */
10516 
10517   if (reqcuflags < REQ_NONE)
10518     {
10519 #if PCRE2_CODE_UNIT_WIDTH == 16
10520     if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
10521         firstcuflags >= REQ_NONE ||                 /* First not set */
10522         (firstcu & 0xf800) != 0xd800 ||             /* First not surrogate */
10523         (reqcu & 0xfc00) != 0xdc00)                 /* Req not low surrogate */
10524 #elif PCRE2_CODE_UNIT_WIDTH == 8
10525     if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
10526         firstcuflags >= REQ_NONE ||                 /* First not set */
10527         (firstcu & 0x80) == 0 ||                    /* First is ASCII */
10528         (reqcu & 0x80) == 0)                        /* Req is ASCII */
10529 #endif
10530       {
10531       minminlength++;
10532       }
10533 
10534     /* In the case of an anchored pattern, set up the value only if it follows
10535     a variable length item in the pattern. */
10536 
10537     if ((re->overall_options & PCRE2_ANCHORED) == 0 ||
10538         (reqcuflags & REQ_VARY) != 0)
10539       {
10540       re->last_codeunit = reqcu;
10541       re->flags |= PCRE2_LASTSET;
10542 
10543       /* Handle caseless required code units as for first code units (above). */
10544 
10545       if ((reqcuflags & REQ_CASELESS) != 0)
10546         {
10547         if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
10548           {
10549           if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
10550           }
10551 #ifdef SUPPORT_UNICODE
10552 #if PCRE2_CODE_UNIT_WIDTH == 8
10553       else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
10554         re->flags |= PCRE2_LASTCASELESS;
10555 #else
10556       else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
10557                UCD_OTHERCASE(reqcu) != reqcu)
10558         re->flags |= PCRE2_LASTCASELESS;
10559 #endif
10560 #endif  /* SUPPORT_UNICODE */
10561         }
10562       }
10563     }
10564 
10565   /* Study the compiled pattern to set up information such as a bitmap of
10566   starting code units and a minimum matching length. */
10567 
10568   if (PRIV(study)(re) != 0)
10569     {
10570     errorcode = ERR31;
10571     goto HAD_CB_ERROR;
10572     }
10573 
10574   /* If study() set a bitmap of starting code units, it implies a minimum
10575   length of at least one. */
10576 
10577   if ((re->flags & PCRE2_FIRSTMAPSET) != 0 && minminlength == 0)
10578     minminlength = 1;
10579 
10580   /* If the minimum length set (or not set) by study() is less than the minimum
10581   implied by required code units, override it. */
10582 
10583   if (re->minlength < minminlength) re->minlength = minminlength;
10584   }   /* End of start-of-match optimizations. */
10585 
10586 /* Control ends up here in all cases. When running under valgrind, make a
10587 pattern's terminating zero defined again. If memory was obtained for the parsed
10588 version of the pattern, free it before returning. Also free the list of named
10589 groups if a larger one had to be obtained, and likewise the group information
10590 vector. */
10591 
10592 EXIT:
10593 #ifdef SUPPORT_VALGRIND
10594 if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1));
10595 #endif
10596 if (cb.parsed_pattern != stack_parsed_pattern)
10597   ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);
10598 if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
10599   ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
10600 if (cb.groupinfo != stack_groupinfo)
10601   ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
10602 return re;    /* Will be NULL after an error */
10603 
10604 /* Errors discovered in parse_regex() set the offset value in the compile
10605 block. Errors discovered before it is called must compute it from the ptr
10606 value. After parse_regex() is called, the offset in the compile block is set to
10607 the end of the pattern, but certain errors in compile_regex() may reset it if
10608 an offset is available in the parsed pattern. */
10609 
10610 HAD_CB_ERROR:
10611 ptr = pattern + cb.erroroffset;
10612 
10613 HAD_EARLY_ERROR:
10614 *erroroffset = ptr - pattern;
10615 
10616 HAD_ERROR:
10617 *errorptr = errorcode;
10618 pcre2_code_free(re);
10619 re = NULL;
10620 goto EXIT;
10621 }
10622 
10623 /* End of pcre2_compile.c */
10624