xref: /PHP-8.4/ext/pcre/pcre2lib/pcre2_compile.c (revision d1f14a46)
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10           New API code Copyright (c) 2016-2024 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45 
46 #define NLBLOCK cb             /* Block containing newline information */
47 #define PSSTART start_pattern  /* Field containing processed string start */
48 #define PSEND   end_pattern    /* Field containing processed string end */
49 
50 #include "pcre2_internal.h"
51 
52 /* In rare error cases debugging might require calling pcre2_printint(). The
   block below is normally compiled out by its #if 0 guard. When enabled it
   defines PRINTABLE(c) (64 <= c < 255 for EBCDIC builds, 32 <= c < 127
   otherwise), textually includes pcre2_printint.c into this file, and defines
   DEBUG_CALL_PRINTINT. */
53 
54 #if 0
55 #ifdef EBCDIC
56 #define PRINTABLE(c) ((c) >= 64 && (c) < 255)
57 #else
58 #define PRINTABLE(c) ((c) >= 32 && (c) < 127)
59 #endif
60 #include "pcre2_printint.c"
61 #define DEBUG_CALL_PRINTINT
62 #endif
63 
64 /* Other debugging code can be enabled by these defines. */
65 
66 /* #define DEBUG_SHOW_CAPTURES */
67 /* #define DEBUG_SHOW_PARSED */
68 
69 /* There are a few things that vary with different code unit sizes. Handle them
70 by defining macros in order to minimize #if usage.
   STRING_UTFn_RIGHTPAR expands to two comma-separated items: the string macro
   for the current code unit width and a number (5 for the 8-bit name, 6 for the
   others; presumably the length of that string - confirm at the point of use).
   XDIGIT(c) looks c up in xdigitab; for 16-bit and 32-bit code units it first
   tests MAX_255(c) and yields 0xff, the table's non-hex-digit value, when that
   test fails. */
71 
72 #if PCRE2_CODE_UNIT_WIDTH == 8
73 #define STRING_UTFn_RIGHTPAR     STRING_UTF8_RIGHTPAR, 5
74 #define XDIGIT(c)                xdigitab[c]
75 
76 #else  /* Either 16-bit or 32-bit */
77 #define XDIGIT(c)                (MAX_255(c)? xdigitab[c] : 0xff)
78 
79 #if PCRE2_CODE_UNIT_WIDTH == 16
80 #define STRING_UTFn_RIGHTPAR     STRING_UTF16_RIGHTPAR, 6
81 
82 #else  /* 32-bit */
83 #define STRING_UTFn_RIGHTPAR     STRING_UTF32_RIGHTPAR, 6
84 #endif  /* PCRE2_CODE_UNIT_WIDTH == 16 */
85 #endif  /* PCRE2_CODE_UNIT_WIDTH == 8 */
86 
87 /* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which
88 consists of uint32_t elements. Assume that if uint32_t can't hold it, two of
89 them will be able to (i.e. assume a 64-bit world). SIZEOFFSET is the number of
   elements one value occupies (1 or 2); in the two-element form the high 32
   bits are stored first. In each macro s is the PCRE2_SIZE variable or value
   and p is a pointer into the parsed pattern:

     PUTOFFSET(s,p)       store s at p; leave p just past the stored value
     GETOFFSET(s,p)       load s from p; leave p just past the loaded value
     GETPLUSOFFSET(s,p)   load s from the element(s) after p[0]; advance p by
                          SIZEOFFSET, leaving it on the last element read
     READPLUSOFFSET(s,p)  load s from the element(s) after p[0]; p unchanged
     SKIPOFFSET(p)        advance p by SIZEOFFSET without reading

   NOTE(review): the arguments are not parenthesized, and the expansions are
   either a bare expression or a brace block rather than do { } while (0), so
   each use should be a complete statement with simple variable arguments. */
90 
91 #if PCRE2_SIZE_MAX <= UINT32_MAX
92 #define PUTOFFSET(s,p) *p++ = s
93 #define GETOFFSET(s,p) s = *p++
94 #define GETPLUSOFFSET(s,p) s = *(++p)
95 #define READPLUSOFFSET(s,p) s = p[1]
96 #define SKIPOFFSET(p) p++
97 #define SIZEOFFSET 1
98 #else
99 #define PUTOFFSET(s,p) \
100   { *p++ = (uint32_t)(s >> 32); *p++ = (uint32_t)(s & 0xffffffff); }
101 #define GETOFFSET(s,p) \
102   { s = ((PCRE2_SIZE)p[0] << 32) | (PCRE2_SIZE)p[1]; p += 2; }
103 #define GETPLUSOFFSET(s,p) \
104   { s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; p += 2; }
105 #define READPLUSOFFSET(s,p) \
106   { s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; }
107 #define SKIPOFFSET(p) p += 2
108 #define SIZEOFFSET 2
109 #endif
110 
111 /* Macros for manipulating elements of the parsed pattern vector. META_CODE(x)
    keeps the top 16 bits of x (the code that identifies the item) and
    META_DATA(x) keeps the low 16 bits (the item's additional data).
    META_DIFF(x,y) is x minus y shifted right by 16 bits, which for two META_
    values is the distance between their codes. NOTE(review): the macro
    arguments are not parenthesized, so pass only simple operands. */
112 
113 #define META_CODE(x)   (x & 0xffff0000u)
114 #define META_DATA(x)   (x & 0x0000ffffu)
115 #define META_DIFF(x,y) ((x-y)>>16)
116 
117 /* Forward declarations of static functions, to allow mutual recursion. The
    declaration of add_list_to_class_internal() is present only when
    SUPPORT_UNICODE is defined. */
118 
119 #ifdef SUPPORT_UNICODE
120 static unsigned int
121   add_list_to_class_internal(uint8_t *, PCRE2_UCHAR **, uint32_t, uint32_t,
122     compile_block *, const uint32_t *, unsigned int);
123 #endif
124 
125 static int
126   compile_regex(uint32_t, uint32_t, PCRE2_UCHAR **, uint32_t **, int *,
127     uint32_t, uint32_t *, uint32_t *, uint32_t *, uint32_t *, branch_chain *,
128     open_capitem *, compile_block *, PCRE2_SIZE *);
129 
130 static int
131   get_branchlength(uint32_t **, int *, int *, int *, parsed_recurse_check *,
132     compile_block *);
133 
134 static BOOL
135   set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,
136     compile_block *);
137 
138 static int
139   check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *,
140     compile_block *, int *);
141 
142 
143 /*************************************************
144 *      Code parameters and static tables         *
145 *************************************************/
146 
147 #define MAX_GROUP_NUMBER   65535u                 /* Highest group number */
148 #define MAX_REPEAT_COUNT   65535u                 /* Highest explicit repeat count */
149 #define REPEAT_UNLIMITED   (MAX_REPEAT_COUNT+1)   /* One above the highest count; presumably marks an unbounded repeat - confirm */
150 
151 /* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in
152 different ways in the different pattern scans. The parsing and group-
153 identifying pre-scan uses it to handle nesting, and needs it to be 16-bit
154 aligned for this. Having defined the size in code units, we set up
155 C16_WORK_SIZE as the number of elements in the 16-bit vector.
156 
157 During the first compiling phase, when determining how much memory is required,
158 the regex is partly compiled into this space, but the compiled parts are
159 discarded as soon as they can be, so that hopefully there will never be an
160 overrun. The code does, however, check for an overrun, which can occur for
161 pathological patterns. The size of the workspace depends on LINK_SIZE because
162 the length of compiled items varies with this.
163 
164 In the real compile phase, this workspace is not currently used. */
165 
166 #define COMPILE_WORK_SIZE (3000*LINK_SIZE)   /* Size in code units */
167 
168 #define C16_WORK_SIZE \
169   ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t))
170 
171 /* A uint32_t vector is used for caching information about the size of
172 capturing groups, to improve performance. A default is created on the stack of
173 this size. */
174 
175 #define GROUPINFO_DEFAULT_SIZE 256
176 
177 /* The overrun tests check for a slightly smaller size so that they detect the
178 overrun before it actually does run off the end of the data block.
    NOTE(review): the unit of this margin is not stated here; presumably code
    units, like COMPILE_WORK_SIZE - confirm at the point of use. */
179 
180 #define WORK_SIZE_SAFETY_MARGIN (100)
181 
182 /* This value determines the size of the initial vector that is used for
183 remembering named groups during the pre-compile. It is allocated on the stack,
184 but if it is too small, it is expanded, in a similar way to the workspace. The
185 value is the number of slots in the list. */
186 
187 #define NAMED_GROUP_LIST_SIZE  20
188 
189 /* The pre-compiling pass over the pattern creates a parsed pattern in a vector
190 of uint32_t. For short patterns this lives on the stack, with this size. Heap
191 memory is used for longer patterns. */
192 
193 #define PARSED_PATTERN_DEFAULT_SIZE 1024
194 
195 /* Maximum length value to check against when making sure that the variable
196 that holds the compiled pattern length does not overflow. We make it a bit less
197 than INT_MAX to allow for adding in group terminating code units, so that we
198 don't have to check them every time. */
199 
200 #define OFLOW_MAX (INT_MAX - 20)
201 
202 /* Code values for parsed patterns, which are stored in a vector of 32-bit
203 unsigned ints. Values less than META_END are literal data values. The coding
204 for identifying the item is in the top 16-bits, leaving 16 bits for the
205 additional data that some of them need. The META_CODE, META_DATA, and META_DIFF
206 macros are used to manipulate parsed pattern elements. The codes from META_END
    to META_MINMAX_QUERY are consecutive in the top 16 bits, running from 0x8000
    to 0x803f.
207 
208 NOTE: When these definitions are changed, the table of extra lengths for each
209 code (meta_extra_lengths, just below) must be updated to remain in step. */
210 
211 #define META_END              0x80000000u  /* End of pattern */
212 
213 #define META_ALT              0x80010000u  /* alternation */
214 #define META_ATOMIC           0x80020000u  /* atomic group */
215 #define META_BACKREF          0x80030000u  /* Back ref */
216 #define META_BACKREF_BYNAME   0x80040000u  /* \k'name' */
217 #define META_BIGVALUE         0x80050000u  /* Next is a literal > META_END */
218 #define META_CALLOUT_NUMBER   0x80060000u  /* (?C with numerical argument */
219 #define META_CALLOUT_STRING   0x80070000u  /* (?C with string argument */
220 #define META_CAPTURE          0x80080000u  /* Capturing parenthesis */
221 #define META_CIRCUMFLEX       0x80090000u  /* ^ metacharacter */
222 #define META_CLASS            0x800a0000u  /* start non-empty class */
223 #define META_CLASS_EMPTY      0x800b0000u  /* empty class */
224 #define META_CLASS_EMPTY_NOT  0x800c0000u  /* negative empty class */
225 #define META_CLASS_END        0x800d0000u  /* end of non-empty class */
226 #define META_CLASS_NOT        0x800e0000u  /* start non-empty negative class */
227 #define META_COND_ASSERT      0x800f0000u  /* (?(?assertion)... */
228 #define META_COND_DEFINE      0x80100000u  /* (?(DEFINE)... */
229 #define META_COND_NAME        0x80110000u  /* (?(<name>)... */
230 #define META_COND_NUMBER      0x80120000u  /* (?(digits)... */
231 #define META_COND_RNAME       0x80130000u  /* (?(R&name)... */
232 #define META_COND_RNUMBER     0x80140000u  /* (?(Rdigits)... */
233 #define META_COND_VERSION     0x80150000u  /* (?(VERSION<op>x.y)... */
234 #define META_DOLLAR           0x80160000u  /* $ metacharacter */
235 #define META_DOT              0x80170000u  /* . metacharacter */
236 #define META_ESCAPE           0x80180000u  /* \d and friends */
237 #define META_KET              0x80190000u  /* closing parenthesis */
238 #define META_NOCAPTURE        0x801a0000u  /* no capture parens */
239 #define META_OPTIONS          0x801b0000u  /* (?i) and friends */
240 #define META_POSIX            0x801c0000u  /* POSIX class item */
241 #define META_POSIX_NEG        0x801d0000u  /* negative POSIX class item */
242 #define META_RANGE_ESCAPED    0x801e0000u  /* range with at least one escape */
243 #define META_RANGE_LITERAL    0x801f0000u  /* range defined literally */
244 #define META_RECURSE          0x80200000u  /* Recursion */
245 #define META_RECURSE_BYNAME   0x80210000u  /* (?&name) */
246 #define META_SCRIPT_RUN       0x80220000u  /* (*script_run:...) */
247 
248 /* These must be kept together to make it easy to check that an assertion
249 is present where expected in a conditional group. */
250 
251 #define META_LOOKAHEAD        0x80230000u  /* (?= */
252 #define META_LOOKAHEADNOT     0x80240000u  /* (?! */
253 #define META_LOOKBEHIND       0x80250000u  /* (?<= */
254 #define META_LOOKBEHINDNOT    0x80260000u  /* (?<! */
255 
256 /* These cannot be conditions */
257 
258 #define META_LOOKAHEAD_NA     0x80270000u  /* (*napla: */
259 #define META_LOOKBEHIND_NA    0x80280000u  /* (*naplb: */
260 
261 /* These must be kept in this order, with consecutive values, and the _ARG
262 versions of COMMIT, PRUNE, SKIP, and THEN immediately after their non-argument
263 versions. */
264 
265 #define META_MARK             0x80290000u  /* (*MARK) */
266 #define META_ACCEPT           0x802a0000u  /* (*ACCEPT) */
267 #define META_FAIL             0x802b0000u  /* (*FAIL) */
268 #define META_COMMIT           0x802c0000u  /* These               */
269 #define META_COMMIT_ARG       0x802d0000u  /*   pairs             */
270 #define META_PRUNE            0x802e0000u  /*     must            */
271 #define META_PRUNE_ARG        0x802f0000u  /*       be            */
272 #define META_SKIP             0x80300000u  /*         kept        */
273 #define META_SKIP_ARG         0x80310000u  /*           in        */
274 #define META_THEN             0x80320000u  /*             this    */
275 #define META_THEN_ARG         0x80330000u  /*               order */
276 
277 /* These must be kept in groups of adjacent 3 values, and all together. Within
    each group the order is: plain quantifier, then the form with a plus
    suffix, then the form with a question mark suffix. */
278 
279 #define META_ASTERISK         0x80340000u  /* *  */
280 #define META_ASTERISK_PLUS    0x80350000u  /* *+ */
281 #define META_ASTERISK_QUERY   0x80360000u  /* *? */
282 #define META_PLUS             0x80370000u  /* +  */
283 #define META_PLUS_PLUS        0x80380000u  /* ++ */
284 #define META_PLUS_QUERY       0x80390000u  /* +? */
285 #define META_QUERY            0x803a0000u  /* ?  */
286 #define META_QUERY_PLUS       0x803b0000u  /* ?+ */
287 #define META_QUERY_QUERY      0x803c0000u  /* ?? */
288 #define META_MINMAX           0x803d0000u  /* {n,m}  repeat */
289 #define META_MINMAX_PLUS      0x803e0000u  /* {n,m}+ repeat */
290 #define META_MINMAX_QUERY     0x803f0000u  /* {n,m}? repeat */
291 
292 #define META_FIRST_QUANTIFIER META_ASTERISK
293 #define META_LAST_QUANTIFIER  META_MINMAX_QUERY
294 
295 /* This is a special "meta code" that is used only to distinguish (*asr: from
296 (*sr: in the table of alphabetic assertions. It is never stored in the parsed
297 pattern because (*asr: is turned into (*sr:(*atomic: at that stage. There is
298 therefore no need for it to have a length entry, so use a high value. */
299 
300 #define META_ATOMIC_SCRIPT_RUN 0x8fff0000u
301 
302 /* Table of extra lengths for each of the meta codes. Must be kept in step with
303 the definitions above. For some items these values are a basic length to which
304 a variable amount has to be added. There is one entry per code from META_END
    to META_MINMAX_QUERY (64 in all), in the same order as the definitions, so
    an entry's position equals its code's distance from META_END.
    NOTE(review): the table is not declared const although nothing visible in
    this part of the file writes to it; confirm before changing. */
305 
306 static unsigned char meta_extra_lengths[] = {
307   0,             /* META_END */
308   0,             /* META_ALT */
309   0,             /* META_ATOMIC */
310   0,             /* META_BACKREF - more if group is >= 10 */
311   1+SIZEOFFSET,  /* META_BACKREF_BYNAME */
312   1,             /* META_BIGVALUE */
313   3,             /* META_CALLOUT_NUMBER */
314   3+SIZEOFFSET,  /* META_CALLOUT_STRING */
315   0,             /* META_CAPTURE */
316   0,             /* META_CIRCUMFLEX */
317   0,             /* META_CLASS */
318   0,             /* META_CLASS_EMPTY */
319   0,             /* META_CLASS_EMPTY_NOT */
320   0,             /* META_CLASS_END */
321   0,             /* META_CLASS_NOT */
322   0,             /* META_COND_ASSERT */
323   SIZEOFFSET,    /* META_COND_DEFINE */
324   1+SIZEOFFSET,  /* META_COND_NAME */
325   1+SIZEOFFSET,  /* META_COND_NUMBER */
326   1+SIZEOFFSET,  /* META_COND_RNAME */
327   1+SIZEOFFSET,  /* META_COND_RNUMBER */
328   3,             /* META_COND_VERSION */
329   0,             /* META_DOLLAR */
330   0,             /* META_DOT */
331   0,             /* META_ESCAPE - more for ESC_P, ESC_p, ESC_g, ESC_k */
332   0,             /* META_KET */
333   0,             /* META_NOCAPTURE */
334   1,             /* META_OPTIONS */
335   1,             /* META_POSIX */
336   1,             /* META_POSIX_NEG */
337   0,             /* META_RANGE_ESCAPED */
338   0,             /* META_RANGE_LITERAL */
339   SIZEOFFSET,    /* META_RECURSE */
340   1+SIZEOFFSET,  /* META_RECURSE_BYNAME */
341   0,             /* META_SCRIPT_RUN */
342   0,             /* META_LOOKAHEAD */
343   0,             /* META_LOOKAHEADNOT */
344   SIZEOFFSET,    /* META_LOOKBEHIND */
345   SIZEOFFSET,    /* META_LOOKBEHINDNOT */
346   0,             /* META_LOOKAHEAD_NA */
347   SIZEOFFSET,    /* META_LOOKBEHIND_NA */
348   1,             /* META_MARK - plus the string length */
349   0,             /* META_ACCEPT */
350   0,             /* META_FAIL */
351   0,             /* META_COMMIT */
352   1,             /* META_COMMIT_ARG - plus the string length */
353   0,             /* META_PRUNE */
354   1,             /* META_PRUNE_ARG - plus the string length */
355   0,             /* META_SKIP */
356   1,             /* META_SKIP_ARG - plus the string length */
357   0,             /* META_THEN */
358   1,             /* META_THEN_ARG - plus the string length */
359   0,             /* META_ASTERISK */
360   0,             /* META_ASTERISK_PLUS */
361   0,             /* META_ASTERISK_QUERY */
362   0,             /* META_PLUS */
363   0,             /* META_PLUS_PLUS */
364   0,             /* META_PLUS_QUERY */
365   0,             /* META_QUERY */
366   0,             /* META_QUERY_PLUS */
367   0,             /* META_QUERY_QUERY */
368   2,             /* META_MINMAX */
369   2,             /* META_MINMAX_PLUS */
370   2              /* META_MINMAX_QUERY */
371 };
372 
373 /* Types for skipping parts of a parsed pattern. The enumerators take the
    values 0, 1 and 2 in the order listed. NOTE(review): going by the names,
    they presumably select skipping to the next alternation, to the end of a
    class, or to the closing parenthesis - confirm against the skipping code. */
374 
375 enum { PSKIP_ALT, PSKIP_CLASS, PSKIP_KET };
376 
377 /* Macro for setting individual bits in class bitmaps. It took some
378 experimenting to figure out how to stop gcc 5.3.0 from warning with
379 -Wconversion. This version gets a warning:
380 
381   #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1u << ((b)&7))
382 
383 Let's hope the apparently less efficient version isn't actually so bad if the
384 compiler is clever with identical subexpressions.
    Here a is the bitmap (an array of bytes) and b is the bit number: bit
    (b & 7) of byte a[b / 8] is set. Both a and b are expanded more than once,
    so neither argument may have side effects. */
385 
386 #define SETBIT(a,b) a[(b)/8] = (uint8_t)(a[(b)/8] | (1u << ((b)&7)))
387 
388 /* Values and flags for the unsigned xxcuflags variables that accompany xxcu
389 variables, which are concerned with first and required code units. A value
390 greater than or equal to REQ_NONE means "no code unit set"; otherwise the
391 matching xxcu variable is set, and the low valued bits are relevant. */
392 
393 #define REQ_UNSET     0xffffffffu  /* Not yet found anything */
394 #define REQ_NONE      0xfffffffeu  /* Found something that is not a fixed character */
395 #define REQ_CASELESS  0x00000001u  /* Code unit in xxcu is caseless */
396 #define REQ_VARY      0x00000002u  /* Code unit is followed by non-literal */
397 
398 /* These flags are used in the groupinfo vector. The two high bits are status
    flags and GI_FIXED_LENGTH_MASK selects the low 16 bits. NOTE(review): going
    by the names, the low 16 bits presumably hold a cached group length when
    GI_SET_FIXED_LENGTH is set - confirm at the point of use. */
399 
400 #define GI_SET_FIXED_LENGTH    0x80000000u
401 #define GI_NOT_FIXED_LENGTH    0x40000000u
402 #define GI_FIXED_LENGTH_MASK   0x0000ffffu
403 
404 /* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
405 and is fast (a good compiler can turn it into a subtraction and unsigned
406 comparison). The argument is evaluated twice, so it must not have side
    effects. */
407 
408 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
409 
410 /* Table to identify hex digits. The tables in chartables are dependent on the
411 locale, and may mark arbitrary characters as digits. We want to recognize only
412 0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
413 costs 256 bytes, but it is a lot faster than doing character value tests (at
414 least in some simple cases I timed), and in some applications one wants PCRE2
415 to compile efficiently as well as match efficiently. The value in the table is
416 the binary hex digit value, or 0xff for non-hex digits. */
417 
418 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
419 UTF-8 mode. */
420 
421 #ifndef EBCDIC
422 static const uint8_t xdigitab[] =
423   {
424   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7 */
425   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15 */
426   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 */
427   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31 */
428   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - '  */
429   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ( - /  */
430   0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  */
431   0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /*  8 - ?  */
432   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  @ - G  */
433   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H - O  */
434   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  P - W  */
435   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  X - _  */
436   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  ` - g  */
437   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h - o  */
438   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  p - w  */
439   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  x -127 */
440   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */
441   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */
442   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */
443   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */
444   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */
445   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */
446   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */
447   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
448   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */
449   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 200-207 */
450   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */
451   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */
452   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */
453   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */
454   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */
455   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */
456 
457 #else
458 
459 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode.
    In the row comments, the last column (present on alternate rows) is the
    hexadecimal index of the first entry in that row. */
460 
461 static const uint8_t xdigitab[] =
462   {
463   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7  0 */
464   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15    */
465   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 10 */
466   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31    */
467   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  32- 39 20 */
468   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  40- 47    */
469   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  48- 55 30 */
470   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  56- 63    */
471   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - 71 40 */
472   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  72- |     */
473   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  & - 87 50 */
474   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  88- 95    */
475   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  - -103 60 */
476   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ?     */
477   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */
478   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- "     */
479   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g  80 */
480   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h -143    */
481   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p  90 */
482   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  q -159    */
483   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x  A0 */
484   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  y -175    */
485   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ^ -183 B0 */
486   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191    */
487   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  { - G  C0 */
488   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H -207    */
489   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  } - P  D0 */
490   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Q -223    */
491   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  \ - X  E0 */
492   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Y -239    */
493   0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  F0 */
494   0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/*  8 -255    */
495 #endif  /* EBCDIC */
496 
497 
498 /* Table for handling alphanumeric escaped characters. Positive returns are
499 simple data values; negative values are for special things like \d and so on.
500 Zero means further processing is needed (for things like \x), or the escape is
501 invalid. */
502 
503 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
504 in UTF-8 mode. It runs from '0' to 'z'. ESCAPES_FIRST and ESCAPES_LAST name
    those two end characters, and UPPER_CASE(c) subtracts 32 from c (its
    argument is not parenthesized, so pass a simple operand). The comment at
    the end of each table row names the characters that the row's two entries
    stand for. */
505 
506 #ifndef EBCDIC
507 #define ESCAPES_FIRST       CHAR_0
508 #define ESCAPES_LAST        CHAR_z
509 #define UPPER_CASE(c)       (c-32)
510 
511 static const short int escapes[] = {
512      0,                       0,                          /* 0 1 */
513      0,                       0,                          /* 2 3 */
514      0,                       0,                          /* 4 5 */
515      0,                       0,                          /* 6 7 */
516      0,                       0,                          /* 8 9 */
517      CHAR_COLON,              CHAR_SEMICOLON,             /* : ; */
518      CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,           /* < = */
519      CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,         /* > ? */
520      CHAR_COMMERCIAL_AT,      -ESC_A,                     /* @ A */
521      -ESC_B,                  -ESC_C,                     /* B C */
522      -ESC_D,                  -ESC_E,                     /* D E */
523      0,                       -ESC_G,                     /* F G */
524      -ESC_H,                  0,                          /* H I */
525      0,                       -ESC_K,                     /* J K */
526      0,                       0,                          /* L M */
527      -ESC_N,                  0,                          /* N O */
528      -ESC_P,                  -ESC_Q,                     /* P Q */
529      -ESC_R,                  -ESC_S,                     /* R S */
530      0,                       0,                          /* T U */
531      -ESC_V,                  -ESC_W,                     /* V W */
532      -ESC_X,                  0,                          /* X Y */
533      -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,   /* Z [ */
534      CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,  /* \ ] */
535      CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,            /* ^ _ */
536      CHAR_GRAVE_ACCENT,       CHAR_BEL,                   /* ` a */
537      -ESC_b,                  0,                          /* b c */
538      -ESC_d,                  CHAR_ESC,                   /* d e */
539      CHAR_FF,                 0,                          /* f g */
540      -ESC_h,                  0,                          /* h i */
541      0,                       -ESC_k,                     /* j k */
542      0,                       0,                          /* l m */
543      CHAR_LF,                 0,                          /* n o */
544      -ESC_p,                  0,                          /* p q */
545      CHAR_CR,                 -ESC_s,                     /* r s */
546      CHAR_HT,                 0,                          /* t u */
547      -ESC_v,                  -ESC_w,                     /* v w */
548      0,                       0,                          /* x y */
549      -ESC_z                                               /* z */
550 };
551 
552 #else
553 
554 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
555 It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code
556 is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a
557 because it is defined as 'a', which of course picks up the ASCII value. */
558 
559 #if 'a' == 0x81                    /* Check for a real EBCDIC environment */
560 #define ESCAPES_FIRST       CHAR_a
561 #define ESCAPES_LAST        CHAR_9
562 #define UPPER_CASE(c)       (c+64)
563 #else                              /* Testing in an ASCII environment */
564 #define ESCAPES_FIRST  ((unsigned char)'\x81')   /* EBCDIC 'a' */
565 #define ESCAPES_LAST   ((unsigned char)'\xf9')   /* EBCDIC '9' */
566 #define UPPER_CASE(c)  (c-32)
567 #endif
568 
    /* Each row comment gives the hexadecimal code of the row's first column.
    The first row has only seven entries because the table starts at 0x81, and
    the last row stops at 0xf9. */
569 static const short int escapes[] = {
570 /*  80 */         CHAR_BEL, -ESC_b,       0, -ESC_d, CHAR_ESC, CHAR_FF,      0,
571 /*  88 */ -ESC_h,        0,      0,     '{',      0,        0,       0,      0,
572 /*  90 */      0,        0, -ESC_k,       0,      0,  CHAR_LF,       0, -ESC_p,
573 /*  98 */      0,  CHAR_CR,      0,     '}',      0,        0,       0,      0,
574 /*  A0 */      0,      '~', -ESC_s, CHAR_HT,      0,   -ESC_v,  -ESC_w,      0,
575 /*  A8 */      0,   -ESC_z,      0,       0,      0,      '[',       0,      0,
576 /*  B0 */      0,        0,      0,       0,      0,        0,       0,      0,
577 /*  B8 */      0,        0,      0,       0,      0,      ']',     '=',    '-',
578 /*  C0 */    '{',   -ESC_A, -ESC_B,  -ESC_C, -ESC_D,   -ESC_E,       0, -ESC_G,
579 /*  C8 */ -ESC_H,        0,      0,       0,      0,        0,       0,      0,
580 /*  D0 */    '}',        0, -ESC_K,       0,      0,   -ESC_N,       0, -ESC_P,
581 /*  D8 */ -ESC_Q,   -ESC_R,      0,       0,      0,        0,       0,      0,
582 /*  E0 */   '\\',        0, -ESC_S,       0,      0,   -ESC_V,  -ESC_W, -ESC_X,
583 /*  E8 */      0,   -ESC_Z,      0,       0,      0,        0,       0,      0,
584 /*  F0 */      0,        0,      0,       0,      0,        0,       0,      0,
585 /*  F8 */      0,        0
586 };
587 
588 /* We also need a table of characters that may follow \c in an EBCDIC
589 environment for characters 0-31. The string holds 32 characters.
    NOTE(review): presumably a character's position in the string is the
    control code it selects, and the array could be const - confirm at the
    point of use. */
590 
591 static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
592 
593 #endif   /* EBCDIC */
594 
595 
596 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
597 searched linearly. Put all the names into a single string, in order to reduce
598 the number of relocations when a shared library is dynamically linked. The
599 string is built from string macros so that it works in UTF-8 mode on EBCDIC
600 platforms. */
601 
602 typedef struct verbitem {
603   unsigned int len;          /* Length of verb name */
604   uint32_t meta;             /* Base META_ code */
605   int has_arg;               /* > 0 => argument required; < 0 => optional,
                                convert to pre-MARK; 0 => optional, bump the
                                META code if one is found */
606 } verbitem;
607 
    /* The names appear in the same order as the entries of verbs[] below, whose
    len fields give the corresponding name lengths. */
608 static const char verbnames[] =
609   "\0"                       /* Empty name is a shorthand for MARK */
610   STRING_MARK0
611   STRING_ACCEPT0
612   STRING_F0
613   STRING_FAIL0
614   STRING_COMMIT0
615   STRING_PRUNE0
616   STRING_SKIP0
617   STRING_THEN;
618 
619 static const verbitem verbs[] = {
620   { 0, META_MARK,   +1 },  /* > 0 => must have an argument */
621   { 4, META_MARK,   +1 },
622   { 6, META_ACCEPT, -1 },  /* < 0 => Optional argument, convert to pre-MARK */
623   { 1, META_FAIL,   -1 },
624   { 4, META_FAIL,   -1 },
625   { 6, META_COMMIT,  0 },  /* 0 => same rule as the next entry */
626   { 5, META_PRUNE,   0 },  /* Optional argument; bump META code if found */
627   { 4, META_SKIP,    0 },
628   { 4, META_THEN,    0 }
629 };
630 
631 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
632 
633 /* Verb opcodes, indexed by their META code offset from META_MARK. The eleven
    entries follow the META_MARK to META_THEN_ARG definitions in order. */
634 
635 static const uint32_t verbops[] = {
636   OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,
637   OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };
638 
639 /* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table.
    The names in alasnames are in the same order as the entries of alasmeta[],
    whose len fields give the corresponding name lengths. */
640 
641 typedef struct alasitem {
642   unsigned int len;          /* Length of name */
643   uint32_t meta;             /* Base META_ code */
644 } alasitem;
645 
646 static const char alasnames[] =
647   STRING_pla0
648   STRING_plb0
649   STRING_napla0
650   STRING_naplb0
651   STRING_nla0
652   STRING_nlb0
653   STRING_positive_lookahead0
654   STRING_positive_lookbehind0
655   STRING_non_atomic_positive_lookahead0
656   STRING_non_atomic_positive_lookbehind0
657   STRING_negative_lookahead0
658   STRING_negative_lookbehind0
659   STRING_atomic0
660   STRING_sr0
661   STRING_asr0
662   STRING_script_run0
663   STRING_atomic_script_run;
664 
665 static const alasitem alasmeta[] = {
666   {  3, META_LOOKAHEAD         },  /* pla */
667   {  3, META_LOOKBEHIND        },  /* plb */
668   {  5, META_LOOKAHEAD_NA      },  /* napla */
669   {  5, META_LOOKBEHIND_NA     },  /* naplb */
670   {  3, META_LOOKAHEADNOT      },  /* nla */
671   {  3, META_LOOKBEHINDNOT     },  /* nlb */
672   { 18, META_LOOKAHEAD         },  /* positive_lookahead */
673   { 19, META_LOOKBEHIND        },  /* positive_lookbehind */
674   { 29, META_LOOKAHEAD_NA      },  /* non_atomic_positive_lookahead */
675   { 30, META_LOOKBEHIND_NA     },  /* non_atomic_positive_lookbehind */
676   { 18, META_LOOKAHEADNOT      },  /* negative_lookahead */
677   { 19, META_LOOKBEHINDNOT     },  /* negative_lookbehind */
678   {  6, META_ATOMIC            },  /* atomic */
679   {  2, META_SCRIPT_RUN        }, /* sr = script run */
680   {  3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */
681   { 10, META_SCRIPT_RUN        }, /* script run */
682   { 17, META_ATOMIC_SCRIPT_RUN }  /* atomic script run */
683 };
684 
685 static const int alascount = sizeof(alasmeta)/sizeof(alasitem);
686 
687 /* Offsets from OP_STAR for case-independent and negative repeat opcodes. The
    four entries are, in order, the offsets of OP_STAR itself (zero), OP_STARI,
    OP_NOTSTAR and OP_NOTSTARI. NOTE(review): the table is not declared const
    although nothing visible in this part of the file writes to it; confirm
    before changing. */
688 
689 static uint32_t chartypeoffset[] = {
690   OP_STAR - OP_STAR,    OP_STARI - OP_STAR,
691   OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };
692 
693 /* Tables of names of POSIX character classes and their lengths. The names are
694 now all in a single string, to reduce the number of relocations when a shared
695 library is dynamically loaded. The list of lengths is terminated by a zero
696 length entry. The first three must be alpha, lower, upper, as this is assumed
697 for handling case independence. The indices for several classes are needed, so
698 identify them. */
699 
700 static const char posix_names[] =
701   STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
702   STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
703   STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
704   STRING_word0  STRING_xdigit;
705 
/* Lengths of the POSIX class names, in the order in which the names are
listed. A zero entry terminates the list. */

static const uint8_t posix_name_lengths[] = {
  5, 5, 5, 5,     /* alpha, lower, upper, alnum */
  5, 5, 5, 5,     /* ascii, blank, cntrl, digit */
  5, 5, 5, 5,     /* graph, print, punct, space */
  4, 6,           /* word, xdigit */
  0               /* terminator */
};

/* Positions in the list of those classes whose index is needed elsewhere. */

#define PC_DIGIT    7
#define PC_GRAPH    8
#define PC_PRINT    9
#define PC_PUNCT   10
#define PC_XDIGIT  13
714 
715 /* Table of class bit maps for each POSIX class. Each class is formed from a
716 base map, with an optional addition or removal of another map. Then, for some
717 classes, there is some additional tweaking: for [:blank:] the vertical space
718 characters are removed, and for [:alpha:] and [:alnum:] the underscore
719 character is removed. The triples in the table consist of the base map offset,
720 second map offset or -1 if no second map, and a non-negative value for map
721 addition or a negative value for map subtraction (if there are two maps). The
722 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
723 remove vertical space characters, 2 => remove underscore. */
724 
725 static const int posix_class_maps[] = {
726   cbit_word,   cbit_digit, -2,            /* alpha */
727   cbit_lower,  -1,          0,            /* lower */
728   cbit_upper,  -1,          0,            /* upper */
729   cbit_word,   -1,          2,            /* alnum - word without underscore */
730   cbit_print,  cbit_cntrl,  0,            /* ascii */
731   cbit_space,  -1,          1,            /* blank - a GNU extension */
732   cbit_cntrl,  -1,          0,            /* cntrl */
733   cbit_digit,  -1,          0,            /* digit */
734   cbit_graph,  -1,          0,            /* graph */
735   cbit_print,  -1,          0,            /* print */
736   cbit_punct,  -1,          0,            /* punct */
737   cbit_space,  -1,          0,            /* space */
738   cbit_word,   -1,          0,            /* word - a Perl extension */
739   cbit_xdigit, -1,          0             /* xdigit */
740 };
741 
#ifdef SUPPORT_UNICODE

/* The POSIX class Unicode property substitutes that are used in UCP mode must
be in the order of the POSIX class names, defined above. Each class has a pair
of values. The table is only a lookup table, so it is const. */

static const int posix_substitutes[] = {
  PT_GC, ucp_L,     /* alpha */
  PT_PC, ucp_Ll,    /* lower */
  PT_PC, ucp_Lu,    /* upper */
  PT_ALNUM, 0,      /* alnum */
  -1, 0,            /* ascii, treat as non-UCP */
  -1, 1,            /* blank, treat as \h */
  PT_PC, ucp_Cc,    /* cntrl */
  PT_PC, ucp_Nd,    /* digit */
  PT_PXGRAPH, 0,    /* graph */
  PT_PXPRINT, 0,    /* print */
  PT_PXPUNCT, 0,    /* punct */
  PT_PXSPACE, 0,    /* space */   /* Xps is POSIX space, but from 8.34 */
  PT_WORD, 0,       /* word  */   /* Perl and POSIX space are the same */
  PT_PXXDIGIT, 0    /* xdigit */  /* Perl has additional hex digits */
};

/* Number of value pairs in the table. The divisor is taken from the table's
own element type so that the count stays right if that type ever changes. */

#define POSIX_SUBSIZE \
  (sizeof(posix_substitutes) / (2*sizeof(posix_substitutes[0])))
#endif  /* SUPPORT_UNICODE */
765 
/* Masks for checking option settings. Only the options in the first mask are
allowed when PCRE2_LITERAL is set; the second mask is the complete set. */

#define PUBLIC_LITERAL_COMPILE_OPTIONS \
  (PCRE2_ANCHORED          | \
   PCRE2_AUTO_CALLOUT      | \
   PCRE2_CASELESS          | \
   PCRE2_ENDANCHORED       | \
   PCRE2_FIRSTLINE         | \
   PCRE2_LITERAL           | \
   PCRE2_MATCH_INVALID_UTF | \
   PCRE2_NO_START_OPTIMIZE | \
   PCRE2_NO_UTF_CHECK      | \
   PCRE2_USE_OFFSET_LIMIT  | \
   PCRE2_UTF)

#define PUBLIC_COMPILE_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_OPTIONS | \
   PCRE2_ALLOW_EMPTY_CLASS        | \
   PCRE2_ALT_BSUX                 | \
   PCRE2_ALT_CIRCUMFLEX           | \
   PCRE2_ALT_VERBNAMES            | \
   PCRE2_DOLLAR_ENDONLY           | \
   PCRE2_DOTALL                   | \
   PCRE2_DUPNAMES                 | \
   PCRE2_EXTENDED                 | \
   PCRE2_EXTENDED_MORE            | \
   PCRE2_MATCH_UNSET_BACKREF      | \
   PCRE2_MULTILINE                | \
   PCRE2_NEVER_BACKSLASH_C        | \
   PCRE2_NEVER_UCP                | \
   PCRE2_NEVER_UTF                | \
   PCRE2_NO_AUTO_CAPTURE          | \
   PCRE2_NO_AUTO_POSSESS          | \
   PCRE2_NO_DOTSTAR_ANCHOR        | \
   PCRE2_UCP                      | \
   PCRE2_UNGREEDY)
782 
/* The corresponding masks for the extra options: first the subset, then the
complete set that includes it. */

#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
   (PCRE2_EXTRA_MATCH_LINE | \
    PCRE2_EXTRA_MATCH_WORD | \
    PCRE2_EXTRA_CASELESS_RESTRICT)

#define PUBLIC_COMPILE_EXTRA_OPTIONS \
   (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS | \
    PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES  | \
    PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL    | \
    PCRE2_EXTRA_ESCAPED_CR_IS_LF         | \
    PCRE2_EXTRA_ALT_BSUX                 | \
    PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK     | \
    PCRE2_EXTRA_ASCII_BSD                | \
    PCRE2_EXTRA_ASCII_BSS                | \
    PCRE2_EXTRA_ASCII_BSW                | \
    PCRE2_EXTRA_ASCII_POSIX              | \
    PCRE2_EXTRA_ASCII_DIGIT)
793 
794 /* Compile time error code numbers. They are given names so that they can more
795 easily be tracked. When a new number is added, the tables called eint1 and
796 eint2 in pcre2posix.c may need to be updated, and a new error text must be
797 added to compile_error_texts in pcre2_error.c. Also, the error codes in
798 pcre2.h.in must be updated - their values are exactly 100 greater than these
799 values. */
800 
801 enum { ERR0 = COMPILE_ERROR_BASE,
802        ERR1,  ERR2,  ERR3,  ERR4,  ERR5,  ERR6,  ERR7,  ERR8,  ERR9,  ERR10,
803        ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
804        ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30,
805        ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
806        ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
807        ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
808        ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
809        ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
810        ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90,
811        ERR91, ERR92, ERR93, ERR94, ERR95, ERR96, ERR97, ERR98, ERR99, ERR100,
812        ERR101 };
813 
/* Kinds of start-of-pattern item. These classify the entries in the table of
start-of-pattern options such as (*UTF) and settings such as
(*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward compatibility,
(*UTFn) is supported in the relevant libraries, but (*UTF) is generic and
always supported. */

enum {
  PSO_OPT,     /* Value is an option bit */
  PSO_FLG,     /* Value is a flag bit */
  PSO_NL,      /* Value is a newline type */
  PSO_BSR,     /* Value is a \R type */
  PSO_LIMH,    /* Read integer value for heap limit */
  PSO_LIMM,    /* Read integer value for match limit */
  PSO_LIMD     /* Read integer value for depth limit */
};
827 
/* Description of one start-of-pattern item. */

typedef struct pso {
  const uint8_t *name;   /* Text of the item */
  uint16_t length;       /* Length of the text */
  uint16_t type;         /* One of the PSO_ kinds */
  uint32_t value;        /* Value for the item; its meaning depends on type */
} pso;
834 
835 /* NB: STRING_UTFn_RIGHTPAR contains the length as well */
836 
837 static const pso pso_list[] = {
838   { (uint8_t *)STRING_UTFn_RIGHTPAR,                  PSO_OPT, PCRE2_UTF },
839   { (uint8_t *)STRING_UTF_RIGHTPAR,                4, PSO_OPT, PCRE2_UTF },
840   { (uint8_t *)STRING_UCP_RIGHTPAR,                4, PSO_OPT, PCRE2_UCP },
841   { (uint8_t *)STRING_NOTEMPTY_RIGHTPAR,           9, PSO_FLG, PCRE2_NOTEMPTY_SET },
842   { (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR,  17, PSO_FLG, PCRE2_NE_ATST_SET },
843   { (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR,   16, PSO_OPT, PCRE2_NO_AUTO_POSSESS },
844   { (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR },
845   { (uint8_t *)STRING_NO_JIT_RIGHTPAR,             7, PSO_FLG, PCRE2_NOJIT },
846   { (uint8_t *)STRING_NO_START_OPT_RIGHTPAR,      13, PSO_OPT, PCRE2_NO_START_OPTIMIZE },
847   { (uint8_t *)STRING_LIMIT_HEAP_EQ,              11, PSO_LIMH, 0 },
848   { (uint8_t *)STRING_LIMIT_MATCH_EQ,             12, PSO_LIMM, 0 },
849   { (uint8_t *)STRING_LIMIT_DEPTH_EQ,             12, PSO_LIMD, 0 },
850   { (uint8_t *)STRING_LIMIT_RECURSION_EQ,         16, PSO_LIMD, 0 },
851   { (uint8_t *)STRING_CR_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_CR },
852   { (uint8_t *)STRING_LF_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_LF },
853   { (uint8_t *)STRING_CRLF_RIGHTPAR,               5, PSO_NL,  PCRE2_NEWLINE_CRLF },
854   { (uint8_t *)STRING_ANY_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_ANY },
855   { (uint8_t *)STRING_NUL_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_NUL },
856   { (uint8_t *)STRING_ANYCRLF_RIGHTPAR,            8, PSO_NL,  PCRE2_NEWLINE_ANYCRLF },
857   { (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_ANYCRLF },
858   { (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_UNICODE }
859 };
860 
861 /* This table is used when converting repeating opcodes into possessified
862 versions as a result of an explicit possessive quantifier such as ++. A zero
863 value means there is no possessified version - in those cases the item in
864 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
865 because all relevant opcodes are less than that. */
866 
867 static const uint8_t opcode_possessify[] = {
868   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
869   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
870 
871   0,                       /* NOTI */
872   OP_POSSTAR, 0,           /* STAR, MINSTAR */
873   OP_POSPLUS, 0,           /* PLUS, MINPLUS */
874   OP_POSQUERY, 0,          /* QUERY, MINQUERY */
875   OP_POSUPTO, 0,           /* UPTO, MINUPTO */
876   0,                       /* EXACT */
877   0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
878 
879   OP_POSSTARI, 0,          /* STARI, MINSTARI */
880   OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
881   OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
882   OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
883   0,                       /* EXACTI */
884   0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
885 
886   OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
887   OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
888   OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
889   OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
890   0,                       /* NOTEXACT */
891   0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
892 
893   OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
894   OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
895   OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
896   OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
897   0,                       /* NOTEXACTI */
898   0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
899 
900   OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
901   OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
902   OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
903   OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
904   0,                       /* TYPEEXACT */
905   0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
906 
907   OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
908   OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
909   OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
910   OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
911   0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
912 
913   0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
914   0, 0,                    /* REF, REFI */
915   0, 0,                    /* DNREF, DNREFI */
916   0, 0                     /* RECURSE, CALLOUT */
917 };
918 
919 
#ifdef DEBUG_SHOW_PARSED
/*************************************************
*     Show the parsed pattern for debugging      *
*************************************************/

/* For debugging the pre-scan, this code, which outputs the parsed data vector,
can be enabled. It walks the vector from its start, writing one line per item
to stderr, and stops at META_END or at the first META code that it does not
recognize.

The conversion specifications are chosen to match the argument types: %u for
uint32_t values, %zu for PCRE2_SIZE offsets (an unsigned size type), and an
explicit cast to int for anything printed with %c.

Argument:
  cb          the compile block whose parsed_pattern vector is to be shown

Returns:      nothing
*/

static void show_parsed(compile_block *cb)
{
uint32_t *pptr = cb->parsed_pattern;

for (;;)
  {
  int max, min;
  PCRE2_SIZE offset;
  uint32_t i;
  uint32_t length;
  uint32_t meta_arg = META_DATA(*pptr);

  fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr);

  /* Anything below META_END is a literal; show it if it is printable. */

  if (*pptr < META_END)
    {
    if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", (int)*pptr);
    pptr++;
    }

  else switch (META_CODE(*pptr++))
    {
    default:
    fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n");
    return;

    case META_END:
    fprintf(stderr, "META_END\n");
    return;

    case META_CAPTURE:
    fprintf(stderr, "META_CAPTURE %u", meta_arg);
    break;

    case META_RECURSE:
    GETOFFSET(offset, pptr);
    fprintf(stderr, "META_RECURSE %u %zu", meta_arg, offset);
    break;

    /* For group numbers below 10 the offset is taken from the compile block;
    otherwise it follows in the parsed vector. */

    case META_BACKREF:
    if (meta_arg < 10)
      offset = cb->small_ref_offset[meta_arg];
    else
      GETOFFSET(offset, pptr);
    fprintf(stderr, "META_BACKREF %u %zu", meta_arg, offset);
    break;

    case META_ESCAPE:
    if (meta_arg == ESC_P || meta_arg == ESC_p)
      {
      uint32_t ptype = *pptr >> 16;
      uint32_t pvalue = *pptr++ & 0xffff;
      fprintf(stderr, "META \\%c %u %u", (meta_arg == ESC_P)? 'P':'p',
        ptype, pvalue);
      }
    else
      {
      uint32_t cc;
      /* There's just one escape we might have here that isn't negated in the
      escapes table. */
      if (meta_arg == ESC_g) cc = CHAR_g;
      else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++)
        {
        if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break;
        }
      if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;
      fprintf(stderr, "META \\%c", (int)cc);
      }
    break;

    case META_MINMAX:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}", min, max);
    else
      fprintf(stderr, "META {%d,}", min);
    break;

    case META_MINMAX_QUERY:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}?", min, max);
    else
      fprintf(stderr, "META {%d,}?", min);
    break;

    case META_MINMAX_PLUS:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}+", min, max);
    else
      fprintf(stderr, "META {%d,}+", min);
    break;

    case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break;
    case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break;
    case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break;
    case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break;
    case META_DOT: fprintf(stderr, "META_DOT"); break;
    case META_ASTERISK: fprintf(stderr, "META *"); break;
    case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break;
    case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break;
    case META_PLUS: fprintf(stderr, "META +"); break;
    case META_PLUS_QUERY: fprintf(stderr, "META +?"); break;
    case META_PLUS_PLUS: fprintf(stderr, "META ++"); break;
    case META_QUERY: fprintf(stderr, "META ?"); break;
    case META_QUERY_QUERY: fprintf(stderr, "META ??"); break;
    case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break;

    case META_ATOMIC: fprintf(stderr, "META (?>"); break;
    case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
    case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
    case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
    case META_LOOKAHEAD_NA: fprintf(stderr, "META (*napla:"); break;
    case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;
    case META_KET: fprintf(stderr, "META )"); break;
    case META_ALT: fprintf(stderr, "META | %u", meta_arg); break;

    case META_CLASS: fprintf(stderr, "META ["); break;
    case META_CLASS_NOT: fprintf(stderr, "META [^"); break;
    case META_CLASS_END: fprintf(stderr, "META ]"); break;
    case META_CLASS_EMPTY: fprintf(stderr, "META []"); break;
    case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break;

    case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;
    case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;

    case META_POSIX: fprintf(stderr, "META_POSIX %u", *pptr++); break;
    case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %u", *pptr++); break;

    case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;
    case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;
    case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break;
    case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break;
    case META_SKIP: fprintf(stderr, "META (*SKIP)"); break;
    case META_THEN: fprintf(stderr, "META (*THEN)"); break;

    case META_OPTIONS:
    fprintf(stderr, "META_OPTIONS 0x%08x 0x%08x", pptr[0], pptr[1]);
    pptr += 2;
    break;

    case META_LOOKBEHIND:
    fprintf(stderr, "META (?<= %u %u", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_LOOKBEHIND_NA:
    fprintf(stderr, "META (*naplb: %u %u", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_LOOKBEHINDNOT:
    fprintf(stderr, "META (?<! %u %u", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_CALLOUT_NUMBER:
    fprintf(stderr, "META (?C%u) next=%u/%u", pptr[2], pptr[0],
       pptr[1]);
    pptr += 3;
    break;

    case META_CALLOUT_STRING:
      {
      uint32_t patoffset = *pptr++;    /* Offset of next pattern item */
      uint32_t patlength = *pptr++;    /* Length of next pattern item */
      fprintf(stderr, "META (?Cstring) length=%u offset=", *pptr++);
      GETOFFSET(offset, pptr);
      fprintf(stderr, "%zu next=%u/%u", offset, patoffset, patlength);
      }
    break;

    case META_RECURSE_BYNAME:
    fprintf(stderr, "META (?(&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_BACKREF_BYNAME:
    fprintf(stderr, "META_BACKREF_BYNAME length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* The number is stored after the offset, so it is fetched by index before
    the offset is read, and stepped over afterwards. */

    case META_COND_NUMBER:
    fprintf(stderr, "META_COND_NUMBER %u offset=", pptr[SIZEOFFSET]);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    pptr++;
    break;

    case META_COND_DEFINE:
    fprintf(stderr, "META (?(DEFINE) offset=");
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_COND_VERSION:
    fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
    fprintf(stderr, "%u.", *pptr++);
    fprintf(stderr, "%u)", *pptr++);
    break;

    case META_COND_NAME:
    fprintf(stderr, "META (?(<name>) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_COND_RNAME:
    fprintf(stderr, "META (?(R&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* This is kept as a name, because it might be. */

    case META_COND_RNUMBER:
    fprintf(stderr, "META (?(Rnumber) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* The verbs that take an argument share the code that shows it. */

    case META_MARK:
    fprintf(stderr, "META (*MARK:");
    goto SHOWARG;

    case META_COMMIT_ARG:
    fprintf(stderr, "META (*COMMIT:");
    goto SHOWARG;

    case META_PRUNE_ARG:
    fprintf(stderr, "META (*PRUNE:");
    goto SHOWARG;

    case META_SKIP_ARG:
    fprintf(stderr, "META (*SKIP:");
    goto SHOWARG;

    case META_THEN_ARG:
    fprintf(stderr, "META (*THEN:");
    SHOWARG:
    length = *pptr++;
    for (i = 0; i < length; i++)
      {
      uint32_t cc = *pptr++;
      if (cc > 32 && cc < 128) fprintf(stderr, "%c", (int)cc);
        else fprintf(stderr, "\\x{%x}", cc);
      }
    fprintf(stderr, ") length=%u", length);
    break;
    }
  fprintf(stderr, "\n");
  }
}
#endif  /* DEBUG_SHOW_PARSED */
1189 
1190 
1191 
1192 /*************************************************
1193 *               Copy compiled code               *
1194 *************************************************/
1195 
1196 /* Compiled JIT code cannot be copied, so the new compiled block has no
1197 associated JIT data. */
1198 
1199 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy(const pcre2_code * code)1200 pcre2_code_copy(const pcre2_code *code)
1201 {
1202 PCRE2_SIZE* ref_count;
1203 pcre2_code *newcode;
1204 
1205 if (code == NULL) return NULL;
1206 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1207 if (newcode == NULL) return NULL;
1208 memcpy(newcode, code, code->blocksize);
1209 newcode->executable_jit = NULL;
1210 
1211 /* If the code is one that has been deserialized, increment the reference count
1212 in the decoded tables. */
1213 
1214 if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1215   {
1216   ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1217   (*ref_count)++;
1218   }
1219 
1220 return newcode;
1221 }
1222 
1223 
1224 
1225 /*************************************************
1226 *     Copy compiled code and character tables    *
1227 *************************************************/
1228 
1229 /* Compiled JIT code cannot be copied, so the new compiled block has no
1230 associated JIT data. This version of code_copy also makes a separate copy of
1231 the character tables. */
1232 
1233 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy_with_tables(const pcre2_code * code)1234 pcre2_code_copy_with_tables(const pcre2_code *code)
1235 {
1236 PCRE2_SIZE* ref_count;
1237 pcre2_code *newcode;
1238 uint8_t *newtables;
1239 
1240 if (code == NULL) return NULL;
1241 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1242 if (newcode == NULL) return NULL;
1243 memcpy(newcode, code, code->blocksize);
1244 newcode->executable_jit = NULL;
1245 
1246 newtables = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE),
1247   code->memctl.memory_data);
1248 if (newtables == NULL)
1249   {
1250   code->memctl.free((void *)newcode, code->memctl.memory_data);
1251   return NULL;
1252   }
1253 memcpy(newtables, code->tables, TABLES_LENGTH);
1254 ref_count = (PCRE2_SIZE *)(newtables + TABLES_LENGTH);
1255 *ref_count = 1;
1256 
1257 newcode->tables = newtables;
1258 newcode->flags |= PCRE2_DEREF_TABLES;
1259 return newcode;
1260 }
1261 
1262 
1263 
1264 /*************************************************
1265 *               Free compiled code               *
1266 *************************************************/
1267 
1268 PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_code_free(pcre2_code * code)1269 pcre2_code_free(pcre2_code *code)
1270 {
1271 PCRE2_SIZE* ref_count;
1272 
1273 if (code != NULL)
1274   {
1275 #ifdef SUPPORT_JIT
1276   if (code->executable_jit != NULL)
1277     PRIV(jit_free)(code->executable_jit, &code->memctl);
1278 #endif
1279 
1280   if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1281     {
1282     /* Decoded tables belong to the codes after deserialization, and they must
1283     be freed when there are no more references to them. The *ref_count should
1284     always be > 0. */
1285 
1286     ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1287     if (*ref_count > 0)
1288       {
1289       (*ref_count)--;
1290       if (*ref_count == 0)
1291         code->memctl.free((void *)code->tables, code->memctl.memory_data);
1292       }
1293     }
1294 
1295   code->memctl.free(code, code->memctl.memory_data);
1296   }
1297 }
1298 
1299 
1300 
1301 /*************************************************
1302 *         Read a number, possibly signed         *
1303 *************************************************/
1304 
1305 /* This function is used to read numbers in the pattern. The initial pointer
1306 must be at the sign or first digit of the number. When relative values
1307 (introduced by + or -) are allowed, they are relative group numbers, and the
1308 result must be greater than zero.
1309 
1310 Arguments:
1311   ptrptr      points to the character pointer variable
1312   ptrend      points to the end of the input string
1313   allow_sign  if < 0, sign not allowed; if >= 0, sign is relative to this
1314   max_value   the largest number allowed
1315   max_error   the error to give for an over-large number
1316   intptr      where to put the result
1317   errcodeptr  where to put an error code
1318 
1319 Returns:      TRUE  - a number was read
1320               FALSE - errorcode == 0 => no number was found
1321                       errorcode != 0 => an error occurred
1322 */
1323 
1324 static BOOL
read_number(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,int32_t allow_sign,uint32_t max_value,uint32_t max_error,int * intptr,int * errorcodeptr)1325 read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,
1326   uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr)
1327 {
1328 int sign = 0;
1329 uint32_t n = 0;
1330 PCRE2_SPTR ptr = *ptrptr;
1331 BOOL yield = FALSE;
1332 
1333 *errorcodeptr = 0;
1334 
1335 if (allow_sign >= 0 && ptr < ptrend)
1336   {
1337   if (*ptr == CHAR_PLUS)
1338     {
1339     sign = +1;
1340     max_value -= allow_sign;
1341     ptr++;
1342     }
1343   else if (*ptr == CHAR_MINUS)
1344     {
1345     sign = -1;
1346     ptr++;
1347     }
1348   }
1349 
1350 if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE;
1351 while (ptr < ptrend && IS_DIGIT(*ptr))
1352   {
1353   n = n * 10 + *ptr++ - CHAR_0;
1354   if (n > max_value)
1355     {
1356     *errorcodeptr = max_error;
1357     goto EXIT;
1358     }
1359   }
1360 
1361 if (allow_sign >= 0 && sign != 0)
1362   {
1363   if (n == 0)
1364     {
1365     *errorcodeptr = ERR26;  /* +0 and -0 are not allowed */
1366     goto EXIT;
1367     }
1368 
1369   if (sign > 0) n += allow_sign;
1370   else if ((int)n > allow_sign)
1371     {
1372     *errorcodeptr = ERR15;  /* Non-existent subpattern */
1373     goto EXIT;
1374     }
1375   else n = allow_sign + 1 - n;
1376   }
1377 
1378 yield = TRUE;
1379 
1380 EXIT:
1381 *intptr = n;
1382 *ptrptr = ptr;
1383 return yield;
1384 }
1385 
1386 
1387 
1388 /*************************************************
1389 *         Read repeat counts                     *
1390 *************************************************/
1391 
1392 /* Read an item of the form {n,m} and return the values when non-NULL pointers
1393 are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a
1394 larger value is used for "unlimited". We have to use signed arguments for
1395 read_number() because it is capable of returning a signed value. As of Perl
1396 5.34.0 either n or m may be absent, but not both. Perl also allows spaces and
1397 tabs after { and before } and between the numbers and the comma, so we do too.
1398 
1399 Arguments:
1400   ptrptr         points to pointer to character after '{'
1401   ptrend         pointer to end of input
1402   minp           if not NULL, pointer to int for min
1403   maxp           if not NULL, pointer to int for max
1404   errorcodeptr   points to error code variable
1405 
1406 Returns:         FALSE if not a repeat quantifier, errorcode set zero
1407                  FALSE on error, with errorcode set non-zero
1408                  TRUE on success, with pointer updated to point after '}'
1409 */
1410 
1411 static BOOL
read_repeat_counts(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,uint32_t * minp,uint32_t * maxp,int * errorcodeptr)1412 read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp,
1413   uint32_t *maxp, int *errorcodeptr)
1414 {
1415 PCRE2_SPTR p = *ptrptr;
1416 PCRE2_SPTR pp;
1417 BOOL yield = FALSE;
1418 BOOL had_minimum = FALSE;
1419 int32_t min = 0;
1420 int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */
1421 
1422 *errorcodeptr = 0;
1423 while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1424 
1425 /* Check the syntax before interpreting. Otherwise, a non-quantifier sequence
1426 such as "X{123456ABC" would incorrectly give a "number too big in quantifier"
1427 error. */
1428 
1429 pp = p;
1430 if (pp < ptrend && IS_DIGIT(*pp))
1431   {
1432   had_minimum = TRUE;
1433   while (++pp < ptrend && IS_DIGIT(*pp)) {}
1434   }
1435 
1436 while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1437 if (pp >= ptrend) return FALSE;
1438 
1439 if (*pp == CHAR_RIGHT_CURLY_BRACKET)
1440   {
1441   if (!had_minimum) return FALSE;
1442   }
1443 else
1444   {
1445   if (*pp++ != CHAR_COMMA) return FALSE;
1446   while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1447   if (pp >= ptrend) return FALSE;
1448   if (IS_DIGIT(*pp))
1449     {
1450     while (++pp < ptrend && IS_DIGIT(*pp)) {}
1451     }
1452   else if (!had_minimum) return FALSE;
1453   while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1454   if (pp >= ptrend || *pp != CHAR_RIGHT_CURLY_BRACKET) return FALSE;
1455   }
1456 
1457 /* Now process the quantifier for real. We know it must be {n} or (n,} or {,m}
1458 or {n,m}. The only error that read_number() can return is for a number that is
1459 too big. If *errorcodeptr is returned as zero it means no number was found. */
1460 
1461 /* Deal with {,m} or n too big. If we successfully read m there is no need to
1462 check m >= n because n defaults to zero. */
1463 
1464 if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))
1465   {
1466   if (*errorcodeptr != 0) goto EXIT;    /* n too big */
1467   p++;  /* Skip comma and subsequent spaces */
1468   while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1469   if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1470     {
1471     if (*errorcodeptr != 0) goto EXIT;  /* m too big */
1472     }
1473   }
1474 
1475 /* Have read one number. Deal with {n} or {n,} or {n,m} */
1476 
1477 else
1478   {
1479   while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1480   if (*p == CHAR_RIGHT_CURLY_BRACKET)
1481     {
1482     max = min;
1483     }
1484   else   /* Handle {n,} or {n,m} */
1485     {
1486     p++;    /* Skip comma and subsequent spaces */
1487     while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1488     if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1489       {
1490       if (*errorcodeptr != 0) goto EXIT;   /* m too big */
1491       }
1492 
1493     if (max < min)
1494       {
1495       *errorcodeptr = ERR4;
1496       goto EXIT;
1497       }
1498     }
1499   }
1500 
1501 /* Valid quantifier exists */
1502 
1503 while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1504 p++;
1505 yield = TRUE;
1506 if (minp != NULL) *minp = (uint32_t)min;
1507 if (maxp != NULL) *maxp = (uint32_t)max;
1508 
1509 /* Update the pattern pointer */
1510 
1511 EXIT:
1512 *ptrptr = p;
1513 return yield;
1514 }
1515 
1516 
1517 
1518 /*************************************************
1519 *            Handle escapes                      *
1520 *************************************************/
1521
1522 /* This function is called when a \ has been encountered. It either returns a
1523 positive value for a simple escape such as \d, or 0 for a data character, which
1524 is placed in chptr. A backreference to group n is returned as negative n. On
1525 entry, ptr is pointing at the character after \. On exit, it points after the
1526 final code unit of the escape sequence.
1527
1528 This function is also called from pcre2_substitute() to handle escape sequences
1529 in replacement strings. In this case, the cb argument is NULL, and in the case
1530 of escapes that have further processing, only sequences that define a data
1531 character are recognised. The isclass argument is not relevant; the options
1532 argument is the final value of the compiled pattern's options.
1533
1534 Arguments:
1535   ptrptr         points to the input position pointer
1536   ptrend         points to the end of the input
1537   chptr          points to a returned data character
1538   errorcodeptr   points to the errorcode variable (containing zero)
1539   options        the current options bits
1540   xoptions       the current extra options bits
1541   isclass        TRUE if inside a character class
1542   cb             compile data block or NULL when called from pcre2_substitute()
1543
1544 Returns:         zero => a data character
1545                  positive => a special escape sequence
1546                  negative => a numerical back reference
1547                  on error, errorcodeptr is set non-zero
1548 */

/* Note on early exits: the ERR1 return (backslash at the end of the input) and
the ERR3 return taken when the cb == NULL filter rejects an escape leave both
*ptrptr and *chptr untouched. The ERR3 return in the switch default updates
*ptrptr (to the offending character) but not *chptr. Every other path,
including those that have set an error code, falls out to the end of the
function, which stores the final position in *ptrptr and the current value of c
in *chptr. */

1549
1550 int
PRIV(check_escape)1551 PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
1552   int *errorcodeptr, uint32_t options, uint32_t xoptions, BOOL isclass,
1553   compile_block *cb)
1554 {
1555 BOOL utf = (options & PCRE2_UTF) != 0;
1556 BOOL alt_bsux =
1557   ((options & PCRE2_ALT_BSUX) | (xoptions & PCRE2_EXTRA_ALT_BSUX)) != 0;
1558 PCRE2_SPTR ptr = *ptrptr;
1559 uint32_t c, cc;
1560 int escape = 0;
1561 int i;
1562
1563 /* If backslash is at the end of the string, it's an error. */
1564
1565 if (ptr >= ptrend)
1566   {
1567   *errorcodeptr = ERR1;
1568   return 0;
1569   }
1570
1571 GETCHARINCTEST(c, ptr);         /* Get character value, increment pointer */
1572 *errorcodeptr = 0;              /* Be optimistic */
1573
1574 /* Non-alphanumerics are literals, so we just leave the value in c. An initial
1575 value test saves a memory lookup for code points outside the alphanumeric
1576 range. */
1577
1578 if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {}  /* Definitely literal */
1579
1580 /* Otherwise, do a table lookup. Non-zero values need little processing here. A
1581 positive value is a literal value for something like \n. A negative value is
1582 the negation of one of the ESC_ macros that is passed back for handling by the
1583 calling function. Some extra checking is needed for \N because only \N{U+dddd}
1584 is supported. If the value is zero, further processing is handled below. */
1585
1586 else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
1587   {
1588   if (i > 0)
1589     {
1590     c = (uint32_t)i;
1591     if (c == CHAR_CR && (xoptions & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
1592       c = CHAR_LF;
1593     }
1594   else  /* Negative table entry */
1595     {
1596     escape = -i;                    /* Else return a special escape */
1597     if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))
1598       cb->external_flags |= PCRE2_HASBKPORX;   /* Note \P, \p, or \X */
1599
1600     /* Perl supports \N{name} for character names and \N{U+dddd} for numerical
1601     Unicode code points, as well as plain \N for "not newline". PCRE does not
1602     support \N{name}. However, it does support quantification such as \N{2,3},
1603     so if \N{ is not followed by U+dddd we check for a quantifier. */
1604
1605     if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1606       {
1607       PCRE2_SPTR p = ptr + 1;
1608
1609       /* Perl ignores spaces and tabs after { */
1610
1611       while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1612
1613       /* \N{U+ can be handled by the \x{ code. However, this construction is
1614       not valid in EBCDIC environments because it specifies a Unicode
1615       character, not a codepoint in the local code. For example \N{U+0041}
1616       must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode
1617       casing semantics for the entire pattern, so allow it only in UTF (i.e.
1618       Unicode) mode. */
1619
1620       if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)
1621         {
1622 #ifdef EBCDIC
1623         *errorcodeptr = ERR93;
1624 #else
1625         if (utf)
1626           {
1627           ptr = p + 2;
1628           escape = 0;   /* Not a fancy escape after all */
1629           goto COME_FROM_NU;
1630           }
1631         else *errorcodeptr = ERR93;
1632 #endif
1633         }
1634
1635       /* Give an error if what follows is not a quantifier, but don't override
1636       an error set by the quantifier reader (e.g. number overflow). */
1637
1638       else
1639         {
1640         if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&
1641              *errorcodeptr == 0)
1642           *errorcodeptr = ERR37;
1643         }
1644       }
1645     }
1646   }
1647
1648 /* Escapes that need further processing, including those that are unknown, have
1649 a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
1650 \o, and \x are recognized (\u and \U can never appear as they are used for case
1651 forcing). */
1652
1653 else
1654   {
1655   int s;
1656   PCRE2_SPTR oldptr;
1657   BOOL overflow;
1658
1659   /* Filter calls from pcre2_substitute(). */
1660
1661   if (cb == NULL)
1662     {
1663     if (c != CHAR_c && c != CHAR_o && c != CHAR_x)
1664       {
1665       *errorcodeptr = ERR3;
1666       return 0;
1667       }
1668     alt_bsux = FALSE;   /* Do not modify \x handling */
1669     }
1670
1671   switch (c)
1672     {
1673     /* A number of Perl escapes are not handled by PCRE. We give an explicit
1674     error. */
1675
1676     case CHAR_F:
1677     case CHAR_l:
1678     case CHAR_L:
1679     *errorcodeptr = ERR37;
1680     break;
1681
1682     /* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX
1683     is set. Otherwise, \u must be followed by exactly four hex digits or, if
1684     PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.
1685     Otherwise it is a lowercase u letter. This gives some compatibility with
1686     ECMAScript (aka JavaScript). Unlike other braced items, white space is NOT
1687     allowed. When \u{ is not followed by hex digits, a special return is given
1688     because otherwise \u{ 12} (for example) would be treated as u{12}. */
1689
1690     case CHAR_u:
1691     if (!alt_bsux) *errorcodeptr = ERR37; else
1692       {
1693       uint32_t xc;
1694
1695       if (ptr >= ptrend) break;
1696       if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
1697           (xoptions & PCRE2_EXTRA_ALT_BSUX) != 0)
1698         {
1699         PCRE2_SPTR hptr = ptr + 1;
1700
1701         cc = 0;
1702         while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
1703           {
1704           if ((cc & 0xf0000000) != 0)  /* Test for 32-bit overflow */
1705             {
1706             *errorcodeptr = ERR77;
1707             ptr = hptr;   /* Show where */
1708             break;        /* *hptr != } will cause another break below */
1709             }
1710           cc = (cc << 4) | xc;
1711           hptr++;
1712           }
1713
1714         if (hptr == ptr + 1 ||   /* No hex digits */
1715             hptr >= ptrend ||    /* Hit end of input */
1716             *hptr != CHAR_RIGHT_CURLY_BRACKET)  /* No } terminator */
1717           {
1718           escape = ESC_ub;    /* Special return */
1719           ptr++;              /* Skip { */
1720           break;              /* Hex escape not recognized */
1721           }
1722
1723         c = cc;          /* Accept the code point */
1724         ptr = hptr + 1;
1725         }
1726
1727       else  /* Must be exactly 4 hex digits */
1728         {
1729         if (ptrend - ptr < 4) break;               /* Less than 4 chars */
1730         if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
1731         if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
1732         cc = (cc << 4) | xc;
1733         if ((xc = XDIGIT(ptr[2])) == 0xff) break;  /* Not a hex digit */
1734         cc = (cc << 4) | xc;
1735         if ((xc = XDIGIT(ptr[3])) == 0xff) break;  /* Not a hex digit */
1736         c = (cc << 4) | xc;
1737         ptr += 4;
1738         }
1739
1740       if (utf)
1741         {
1742         if (c > 0x10ffffU) *errorcodeptr = ERR77;
1743         else
1744           if (c >= 0xd800 && c <= 0xdfff &&
1745               (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1746                 *errorcodeptr = ERR73;
1747         }
1748       else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
1749       }
1750     break;
1751
1752     /* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set,
1753     in which case it is an upper case letter. */
1754
1755     case CHAR_U:
1756     if (!alt_bsux) *errorcodeptr = ERR37;
1757     break;
1758
1759     /* In a character class, \g is just a literal "g". Outside a character
1760     class, \g must be followed by one of a number of specific things:
1761
1762     (1) A number, either plain or braced. If positive, it is an absolute
1763     backreference. If negative, it is a relative backreference. This is a Perl
1764     5.10 feature.
1765
1766     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1767     is part of Perl's movement towards a unified syntax for back references. As
1768     this is synonymous with \k{name}, we fudge it up by pretending it really
1769     was \k{name}.
1770
1771     (3) For Oniguruma compatibility we also support \g followed by a name or a
1772     number either in angle brackets or in single quotes. However, these are
1773     (possibly recursive) subroutine calls, _not_ backreferences. We return
1774     the ESC_g code.
1775
1776     Summary: Return a negative number for a numerical back reference, ESC_k for
1777     a named back reference, and ESC_g for a named or numbered subroutine call.
1778     */

    /* cb cannot be NULL in this case (or in the digit cases below): when cb is
    NULL the filter before the switch returns for everything except \c, \o and
    \x, so cb->bracount may be read without a further check. */

1779
1780     case CHAR_g:
1781     if (isclass) break;
1782
1783     if (ptr >= ptrend)
1784       {
1785       *errorcodeptr = ERR57;
1786       break;
1787       }
1788
1789     if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE)
1790       {
1791       escape = ESC_g;
1792       break;
1793       }
1794
1795     /* If there is a brace delimiter, try to read a numerical reference. If
1796     there isn't one, assume we have a name and treat it as \k. */
1797
1798     if (*ptr == CHAR_LEFT_CURLY_BRACKET)
1799       {
1800       PCRE2_SPTR p = ptr + 1;
1801
1802       while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1803       if (!read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1804           errorcodeptr))
1805         {
1806         if (*errorcodeptr == 0) escape = ESC_k;  /* No number found */
1807         break;
1808         }
1809       while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1810
1811       if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1812         {
1813         *errorcodeptr = ERR57;
1814         break;
1815         }
1816       ptr = p + 1;
1817       }
1818
1819     /* Read an undelimited number */
1820
1821     else
1822       {
1823       if (!read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1824           errorcodeptr))
1825         {
1826         if (*errorcodeptr == 0) *errorcodeptr = ERR57;  /* No number found */
1827         break;
1828         }
1829       }
1830
1831     if (s <= 0)
1832       {
1833       *errorcodeptr = ERR15;
1834       break;
1835       }
1836
1837     escape = -s;
1838     break;
1839
1840     /* The handling of escape sequences consisting of a string of digits
1841     starting with one that is not zero is not straightforward. Perl has changed
1842     over the years. Nowadays \g{} for backreferences and \o{} for octal are
1843     recommended to avoid the ambiguities in the old syntax.
1844
1845     Outside a character class, the digits are read as a decimal number. If the
1846     number is less than 10, or if there are that many previous extracting left
1847     brackets, it is a back reference. Otherwise, up to three octal digits are
1848     read to form an escaped character code. Thus \123 is likely to be octal 123
1849     (cf \0123, which is octal 012 followed by the literal 3).
1850
1851     Inside a character class, \ followed by a digit is always either a literal
1852     8 or 9 or an octal number. */
1853
1854     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1855     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1856
1857     if (!isclass)
1858       {
1859       oldptr = ptr;
1860       ptr--;   /* Back to the digit */
1861
1862       /* As we know we are at a digit, the only possible error from
1863       read_number() is a number that is too large to be a group number. In this
1864       case we fall through to handle this as not a group reference. If we have
1865       read a small enough number, check for a back reference.
1866
1867       \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
1868       are octal escapes if there are not that many previous captures. */
1869
1870       if (read_number(&ptr, ptrend, -1, INT_MAX/10 - 1, 0, &s, errorcodeptr) &&
1871           (s < 10 || oldptr[-1] >= CHAR_8 || s <= (int)cb->bracount))
1872         {
1873         if (s > (int)MAX_GROUP_NUMBER) *errorcodeptr = ERR61;
1874           else escape = -s;     /* Indicates a back reference */
1875         break;
1876         }
1877
1878       ptr = oldptr;      /* Put the pointer back and fall through */
1879       }
1880
1881     /* Handle a digit following \ when the number is not a back reference, or
1882     we are within a character class. If the first digit is 8 or 9, Perl used to
1883     generate a binary zero and then treat the digit as a following literal. At
1884     least by Perl 5.18 this changed so as not to insert the binary zero. */
1885
1886     if (c >= CHAR_8) break;
1887
1888     /* Fall through */
1889
1890     /* \0 always starts an octal number, but we may drop through to here with a
1891     larger first octal digit. The original code used just to take the least
1892     significant 8 bits of octal numbers (I think this is what early Perls used
1893     to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1894     but no more than 3 octal digits. */
1895
1896     case CHAR_0:
1897     c -= CHAR_0;
    /* i is zero on arrival here: this branch of the outer if/else chain is
    entered only when the escapes[] lookup that assigned i yielded zero, and
    nothing on the way to this point modifies it. The loop therefore consumes
    at most two further octal digits. */
1898     while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1899         c = c * 8 + *ptr++ - CHAR_0;
1900 #if PCRE2_CODE_UNIT_WIDTH == 8
1901     if (!utf && c > 0xff) *errorcodeptr = ERR51;
1902 #endif
1903     break;
1904
1905     /* \o is a relatively new Perl feature, supporting a more general way of
1906     specifying character codes in octal. The only supported form is \o{ddd},
1907     with optional spaces or tabs after { and before }. */
1908
1909     case CHAR_o:
1910     if (ptr >= ptrend || *ptr++ != CHAR_LEFT_CURLY_BRACKET)
1911       {
1912       ptr--;
1913       *errorcodeptr = ERR55;
1914       break;
1915       }
1916
1917     while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1918     if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1919       {
1920       *errorcodeptr = ERR78;
1921       break;
1922       }
1923
1924     c = 0;
1925     overflow = FALSE;
1926     while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1927       {
1928       cc = *ptr++;
1929       if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1930 #if PCRE2_CODE_UNIT_WIDTH == 32
1931       if (c >= 0x20000000l) { overflow = TRUE; break; }
1932 #endif
1933       c = (c << 3) + (cc - CHAR_0);
1934 #if PCRE2_CODE_UNIT_WIDTH == 8
1935       if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1936 #elif PCRE2_CODE_UNIT_WIDTH == 16
1937       if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1938 #elif PCRE2_CODE_UNIT_WIDTH == 32
1939       if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1940 #endif
1941       }
1942
1943     while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1944
1945     if (overflow)
1946       {
1947       while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1948       *errorcodeptr = ERR34;
1949       }
1950     else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1951       {
1952       if (utf && c >= 0xd800 && c <= 0xdfff &&
1953           (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1954         {
1955         ptr--;
1956         *errorcodeptr = ERR73;
1957         }
1958       }
1959     else
1960       {
1961       ptr--;
1962       *errorcodeptr = ERR64;
1963       }
1964     break;
1965
1966     /* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed
1967     by two hexadecimal digits. Otherwise it is a lowercase x letter. */
1968
1969     case CHAR_x:
1970     if (alt_bsux)
1971       {
1972       uint32_t xc;
1973       if (ptrend - ptr < 2) break;               /* Less than 2 characters */
1974       if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
1975       if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
1976       c = (cc << 4) | xc;
1977       ptr += 2;
1978       }
1979
1980     /* Handle \x in Perl's style. \x{ddd} is a character code which can be
1981     greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
1982     digits. If not, { used to be treated as a data character. However, Perl
1983     seems to read hex digits up to the first non-such, and ignore the rest, so
1984     that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1985     now gives an error. */
1986
1987     else
1988       {
1989       if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1990         {
1991         ptr++;
1992         while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1993
        /* The \N{U+ code in the table-lookup branch near the top of this
        function jumps to the label below with ptr pointing just after "U+"
        and escape reset to zero. Both c and overflow are (re)initialized after
        the label, so the jump does not rely on their earlier values. */

1994 #ifndef EBCDIC
1995         COME_FROM_NU:
1996 #endif
1997         if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1998           {
1999           *errorcodeptr = ERR78;
2000           break;
2001           }
2002         c = 0;
2003         overflow = FALSE;
2004
2005         while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff)
2006           {
2007           ptr++;
2008           if (c == 0 && cc == 0) continue;   /* Leading zeroes */
2009 #if PCRE2_CODE_UNIT_WIDTH == 32
2010           if (c >= 0x10000000l) { overflow = TRUE; break; }
2011 #endif
2012           c = (c << 4) | cc;
2013           if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
2014             {
2015             overflow = TRUE;
2016             break;
2017             }
2018           }
2019
2020         /* Perl ignores spaces and tabs before } */
2021
2022         while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2023
2024         /* On overflow, skip remaining hex digits */
2025
2026         if (overflow)
2027           {
2028           while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
2029           *errorcodeptr = ERR34;
2030           }
2031         else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
2032           {
2033           if (utf && c >= 0xd800 && c <= 0xdfff &&
2034               (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
2035             {
2036             ptr--;
2037             *errorcodeptr = ERR73;
2038             }
2039           }
2040
2041         /* If the sequence of hex digits (followed by optional space) does not
2042         end with '}', give an error. We used just to recognize this construct
2043         and fall through to the normal \x handling, but nowadays Perl gives an
2044         error, which seems much more sensible, so we do too. */
2045
2046         else
2047           {
2048           ptr--;
2049           *errorcodeptr = ERR67;
2050           }
2051         }   /* End of \x{} processing */
2052
2053       /* Read up to two hex digits after \x */
2054
2055       else
2056         {
2057         c = 0;
2058         if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break;  /* Not a hex digit */
2059         ptr++;
2060         c = cc;
2061         if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break;  /* Not a hex digit */
2062         ptr++;
2063         c = (c << 4) | cc;
2064         }     /* End of \xdd handling */
2065       }       /* End of Perl-style \x handling */
2066     break;
2067
2068     /* The handling of \c is different in ASCII and EBCDIC environments. In an
2069     ASCII (or Unicode) environment, an error is given if the character
2070     following \c is not a printable ASCII character. Otherwise, the following
2071     character is upper-cased if it is a letter, and after that the 0x40 bit is
2072     flipped. The result is the value of the escape.
2073
2074     In an EBCDIC environment the handling of \c is compatible with the
2075     specification in the perlebcdic document. The following character must be
2076     a letter or one of small number of special characters. These provide a
2077     means of defining the character values 0-31.
2078
2079     For testing the EBCDIC handling of \c in an ASCII environment, recognize
2080     the EBCDIC value of 'c' explicitly. */
2081
2082 #if defined EBCDIC && 'a' != 0x81
2083     case 0x83:
2084 #else
2085     case CHAR_c:
2086 #endif
2087     if (ptr >= ptrend)
2088       {
2089       *errorcodeptr = ERR2;
2090       break;
2091       }
2092     c = *ptr;
2093     if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
2094
2095     /* Handle \c in an ASCII/Unicode environment. */
2096
2097 #ifndef EBCDIC    /* ASCII/UTF-8 coding */
2098     if (c < 32 || c > 126)  /* Excludes all non-printable ASCII */
2099       {
2100       *errorcodeptr = ERR68;
2101       break;
2102       }
2103     c ^= 0x40;
2104
2105     /* Handle \c in an EBCDIC environment. The special case \c? is converted to
2106     255 (0xff) or 95 (0x5f) if other characters suggest we are using the
2107     POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.)
2108     The other valid sequences correspond to a list of specific characters. */
2109
2110 #else
2111     if (c == CHAR_QUESTION_MARK)
2112       c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
2113     else
2114       {
2115       for (i = 0; i < 32; i++)
2116         {
2117         if (c == ebcdic_escape_c[i]) break;
2118         }
2119       if (i < 32) c = i; else *errorcodeptr = ERR68;
2120       }
2121 #endif  /* EBCDIC */
2122
2123     ptr++;
2124     break;
2125
2126     /* Any other alphanumeric following \ is an error. Perl gives an error only
2127     if in warning mode, but PCRE doesn't have a warning mode. */
2128
2129     default:
2130     *errorcodeptr = ERR3;
2131     *ptrptr = ptr - 1;     /* Point to the character at fault */
2132     return 0;
2133     }
2134   }
2135
2136 /* Set the pointer to the next character before returning. */
2137
2138 *ptrptr = ptr;
2139 *chptr = c;
2140 return escape;
2141 }
2142 
2143 
2144 
#ifdef SUPPORT_UNICODE
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE2 is compiled with support for UTF and Unicode properties. It reads the
property specification (either a single ASCII letter or a name in braces),
normalizes it into a local buffer, and looks it up in the property table by
binary chop.

Inside braces, underscores, hyphens and white space are skipped, an initial ^
requests negation, and the name is lower-cased. Only code units that fit in
one byte are passed to the <ctype.h> functions; larger code units (possible in
the 16-bit and 32-bit libraries) are copied unchanged, so they simply fail the
table lookup instead of invoking undefined behaviour in tolower().

Arguments:
  ptrptr         the pattern position pointer; on entry it points after the
                   P or p; on exit it points after the last code unit that
                   was consumed (also when FALSE is returned)
  negptr         a boolean that is set TRUE for negation else FALSE
  ptypeptr       an unsigned int that is set to the type value
  pdataptr       an unsigned int that is set to the detailed property value;
                   note that it is written as soon as a name matches, even if
                   the match is then rejected for an sc:/scx: specification
  errorcodeptr   the error code variable; set to ERR46 for a malformed item
                   or ERR47 for an unrecognized property
  cb             the compile data

Returns:         TRUE if the type value was found, or FALSE for an invalid type
*/

static BOOL
get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr,
  uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)
{
PCRE2_UCHAR c;
PCRE2_SIZE i, bot, top;
PCRE2_SPTR ptr = *ptrptr;
PCRE2_UCHAR name[50];
PCRE2_UCHAR *vptr = NULL;
uint16_t ptscript = PT_NOTSCRIPT;

if (ptr >= cb->end_pattern) goto ERROR_RETURN;
c = *ptr++;
*negptr = FALSE;

/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
negation. */

if (c == CHAR_LEFT_CURLY_BRACKET)
  {
  if (ptr >= cb->end_pattern) goto ERROR_RETURN;

  if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
    {
    *negptr = TRUE;
    ptr++;
    }

  /* Collect the name, leaving room for the terminating zero. If the buffer
  fills up before } is seen, the loop ends with c holding an ordinary
  character and the check after the loop reports the item as malformed. */

  for (i = 0; i < sizeof(name) / sizeof(name[0]) - 1; i++)
    {
    if (ptr >= cb->end_pattern) goto ERROR_RETURN;
    c = *ptr++;

    /* Skip ignorable characters. In the wider libraries isspace() must only
    be given values that fit in an unsigned char. */

#if PCRE2_CODE_UNIT_WIDTH != 8
    while (c == '_' || c == '-' || (c <= 0xff && isspace(c)))
#else
    while (c == '_' || c == '-' || isspace(c))
#endif
      {
      if (ptr >= cb->end_pattern) goto ERROR_RETURN;
      c = *ptr++;
      }
    if (c == CHAR_NUL) goto ERROR_RETURN;
    if (c == CHAR_RIGHT_CURLY_BRACKET) break;

    /* The same restriction applies to tolower(): a code unit above 255 is not
    a valid argument, and cannot be part of a known property name anyway, so
    store it as it is and let the lookup fail. */

    name[i] = MAX_255(c)? tolower(c) : c;

    /* Remember the first name/value separator. */

    if ((c == ':' || c == '=') && vptr == NULL) vptr = name + i;
    }

  if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
  name[i] = 0;
  }

/* If { doesn't follow \p or \P there is just one following character, which
must be an ASCII letter. */

else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0)
  {
  name[0] = tolower(c);
  name[1] = 0;
  }
else goto ERROR_RETURN;

*ptrptr = ptr;

/* If the property contains ':' or '=' we have class name and value separately
specified. The following are supported:

  . Bidi_Class (synonym bc), for which the property names are "bidi<name>".
  . Script (synonym sc) for which the property name is the script name
  . Script_Extensions (synonym scx), ditto

As this is a small number, we currently just check the names directly. If this
grows, a sorted table and a switch will be neater.

For both the script properties, set a PT_xxx value so that (1) they can be
distinguished and (2) invalid script names that happen to be the name of
another property can be diagnosed. */

if (vptr != NULL)
  {
  int offset = 0;
  PCRE2_UCHAR sname[8];

  *vptr = 0;   /* Terminate property name */
  if (PRIV(strcmp_c8)(name, STRING_bidiclass) == 0 ||
      PRIV(strcmp_c8)(name, STRING_bc) == 0)
    {
    offset = 4;
    sname[0] = CHAR_b;
    sname[1] = CHAR_i;  /* There is no strcpy_c8 function */
    sname[2] = CHAR_d;
    sname[3] = CHAR_i;
    }

  else if (PRIV(strcmp_c8)(name, STRING_script) == 0 ||
           PRIV(strcmp_c8)(name, STRING_sc) == 0)
    ptscript = PT_SC;

  else if (PRIV(strcmp_c8)(name, STRING_scriptextensions) == 0 ||
           PRIV(strcmp_c8)(name, STRING_scx) == 0)
    ptscript = PT_SCX;

  else
    {
    *errorcodeptr = ERR47;
    return FALSE;
    }

  /* Adjust the string in name[] as needed: move the value (including its
  terminating zero) down to the start of the buffer, or to just after the
  "bidi" prefix, and then put the prefix in place. The regions can overlap,
  hence memmove(). */

  memmove(name + offset, vptr + 1, (name + i - vptr)*sizeof(PCRE2_UCHAR));
  if (offset != 0) memmove(name, sname, offset*sizeof(PCRE2_UCHAR));
  }

/* Search for a recognized property using binary chop. */

bot = 0;
top = PRIV(utt_size);

while (bot < top)
  {
  int r;
  i = (bot + top) >> 1;
  r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);

  /* When a matching property is found, some extra checking is needed when the
  \p{xx:yy} syntax is used and xx is either sc or scx. */

  if (r == 0)
    {
    *pdataptr = PRIV(utt)[i].value;
    if (vptr == NULL || ptscript == PT_NOTSCRIPT)
      {
      *ptypeptr = PRIV(utt)[i].type;
      return TRUE;
      }

    switch (PRIV(utt)[i].type)
      {
      case PT_SC:
      *ptypeptr = PT_SC;
      return TRUE;

      case PT_SCX:
      *ptypeptr = ptscript;
      return TRUE;

      default:   /* Not a script name; rejected below */
      break;
      }

    break;  /* Non-script found */
    }

  if (r > 0) bot = i + 1; else top = i;
  }

*errorcodeptr = ERR47;   /* Unrecognized property */
return FALSE;

ERROR_RETURN:            /* Malformed \P or \p */
*errorcodeptr = ERR46;
*ptrptr = ptr;
return FALSE;
}
#endif
2328 
2329 
2330 
2331 /*************************************************
2332 *           Check for POSIX class syntax         *
2333 *************************************************/
2334 
2335 /* This function is called when the sequence "[:" or "[." or "[=" is
2336 encountered in a character class. It checks whether this is followed by a
2337 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2338 reach an unescaped ']' without the special preceding character, return FALSE.
2339 
2340 Originally, this function only recognized a sequence of letters between the
2341 terminators, but it seems that Perl recognizes any sequence of characters,
2342 though of course unknown POSIX names are subsequently rejected. Perl gives an
2343 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2344 didn't consider this to be a POSIX class. Likewise for [:1234:].
2345 
2346 The problem in trying to be exactly like Perl is in the handling of escapes. We
2347 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2348 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2349 below handles the special cases \\ and \], but does not try to do any other
2350 escape processing. This makes it different from Perl for cases such as
2351 [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
2352 not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
2353 when Perl does, I think.
2354 
2355 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2356 It seems that the appearance of a nested POSIX class supersedes an apparent
2357 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2358 a digit. This is handled by returning FALSE if the start of a new group with
2359 the same terminator is encountered, since the next closing sequence must close
2360 the nested group, not the outer one.
2361 
2362 In Perl, unescaped square brackets may also appear as part of class names. For
2363 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2364 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2365 seem right at all. PCRE does not allow closing square brackets in POSIX class
2366 names.
2367 
2368 Arguments:
2369   ptr      pointer to the character after the initial [ (colon, dot, equals)
2370   ptrend   pointer to the end of the pattern
2371   endptr   where to return a pointer to the terminating ':', '.', or '='
2372 
2373 Returns:   TRUE or FALSE
2374 */
2375 
2376 static BOOL
check_posix_syntax(PCRE2_SPTR ptr,PCRE2_SPTR ptrend,PCRE2_SPTR * endptr)2377 check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)
2378 {
2379 PCRE2_UCHAR terminator;  /* Don't combine these lines; the Solaris cc */
2380 terminator = *ptr++;     /* compiler warns about "non-constant" initializer. */
2381 
2382 for (; ptrend - ptr >= 2; ptr++)
2383   {
2384   if (*ptr == CHAR_BACKSLASH &&
2385       (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
2386     ptr++;
2387 
2388   else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
2389             *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2390 
2391   else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2392     {
2393     *endptr = ptr;
2394     return TRUE;
2395     }
2396   }
2397 
2398 return FALSE;
2399 }
2400 
2401 
2402 
2403 /*************************************************
2404 *          Check POSIX class name                *
2405 *************************************************/
2406 
2407 /* This function is called to check the name given in a POSIX-style class entry
2408 such as [:alnum:].
2409 
2410 Arguments:
2411   ptr        points to the first letter
2412   len        the length of the name
2413 
2414 Returns:     a value representing the name, or -1 if unknown
2415 */
2416 
2417 static int
check_posix_name(PCRE2_SPTR ptr,int len)2418 check_posix_name(PCRE2_SPTR ptr, int len)
2419 {
2420 const char *pn = posix_names;
2421 int yield = 0;
2422 while (posix_name_lengths[yield] != 0)
2423   {
2424   if (len == posix_name_lengths[yield] &&
2425     PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
2426   pn += posix_name_lengths[yield] + 1;
2427   yield++;
2428   }
2429 return -1;
2430 }
2431 
2432 
2433 
2434 /*************************************************
2435 *       Read a subpattern or VERB name           *
2436 *************************************************/
2437 
2438 /* This function is called from parse_regex() below whenever it needs to read
2439 the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial
2440 pointer must be to the preceding character. If that character is '*' we are
2441 reading a verb or alpha assertion name. The pointer is updated to point after
2442 the name, for a VERB or alpha assertion name, or after the name's terminator
2443 for a subpattern name. Returning both the offset and the name pointer is
2444 redundant information, but some callers use one and some the other, so it is
2445 simplest just to return both. When the name is in braces, spaces and tabs are
2446 allowed (and ignored) at either end.
2447 
2448 Arguments:
2449   ptrptr      points to the character pointer variable
2450   ptrend      points to the end of the input string
2451   utf         true if the input is UTF-encoded
2452   terminator  the terminator of a subpattern name must be this
2453   offsetptr   where to put the offset from the start of the pattern
2454   nameptr     where to put a pointer to the name in the input
2455   namelenptr  where to put the length of the name
2456   errcodeptr  where to put an error code
2457   cb          pointer to the compile data block
2458 
2459 Returns:    TRUE if a name was read
2460             FALSE otherwise, with error code set
2461 */
2462 
2463 static BOOL
read_name(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,BOOL utf,uint32_t terminator,PCRE2_SIZE * offsetptr,PCRE2_SPTR * nameptr,uint32_t * namelenptr,int * errorcodeptr,compile_block * cb)2464 read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator,
2465   PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,
2466   int *errorcodeptr, compile_block *cb)
2467 {
2468 PCRE2_SPTR ptr = *ptrptr;
2469 BOOL is_group = (*ptr++ != CHAR_ASTERISK);
2470 BOOL is_braced = terminator == CHAR_RIGHT_CURLY_BRACKET;
2471 
2472 if (is_braced)
2473   while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2474 
2475 if (ptr >= ptrend)                 /* No characters in name */
2476   {
2477   *errorcodeptr = is_group? ERR62: /* Subpattern name expected */
2478                             ERR60; /* Verb not recognized or malformed */
2479   goto FAILED;
2480   }
2481 
2482 *nameptr = ptr;
2483 *offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
2484 
2485 /* In UTF mode, a group name may contain letters and decimal digits as defined
2486 by Unicode properties, and underscores, but must not start with a digit. */
2487 
2488 #ifdef SUPPORT_UNICODE
2489 if (utf && is_group)
2490   {
2491   uint32_t c, type;
2492 
2493   GETCHAR(c, ptr);
2494   type = UCD_CHARTYPE(c);
2495 
2496   if (type == ucp_Nd)
2497     {
2498     *errorcodeptr = ERR44;
2499     goto FAILED;
2500     }
2501 
2502   for(;;)
2503     {
2504     if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
2505         c != CHAR_UNDERSCORE) break;
2506     ptr++;
2507     FORWARDCHARTEST(ptr, ptrend);
2508     if (ptr >= ptrend) break;
2509     GETCHAR(c, ptr);
2510     type = UCD_CHARTYPE(c);
2511     }
2512   }
2513 else
2514 #else
2515 (void)utf;  /* Avoid compiler warning */
2516 #endif      /* SUPPORT_UNICODE */
2517 
2518 /* Handle non-group names and group names in non-UTF modes. A group name must
2519 not start with a digit. If either of the others start with a digit it just
2520 won't be recognized. */
2521 
2522   {
2523   if (is_group && IS_DIGIT(*ptr))
2524     {
2525     *errorcodeptr = ERR44;
2526     goto FAILED;
2527     }
2528 
2529   while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0)
2530     {
2531     ptr++;
2532     }
2533   }
2534 
2535 /* Check name length */
2536 
2537 if (ptr > *nameptr + MAX_NAME_SIZE)
2538   {
2539   *errorcodeptr = ERR48;
2540   goto FAILED;
2541   }
2542 *namelenptr = (uint32_t)(ptr - *nameptr);
2543 
2544 /* Subpattern names must not be empty, and their terminator is checked here.
2545 (What follows a verb or alpha assertion name is checked separately.) */
2546 
2547 if (is_group)
2548   {
2549   if (ptr == *nameptr)
2550     {
2551     *errorcodeptr = ERR62;   /* Subpattern name expected */
2552     goto FAILED;
2553     }
2554   if (is_braced)
2555     while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2556   if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator)
2557     {
2558     *errorcodeptr = ERR42;
2559     goto FAILED;
2560     }
2561   ptr++;
2562   }
2563 
2564 *ptrptr = ptr;
2565 return TRUE;
2566 
2567 FAILED:
2568 *ptrptr = ptr;
2569 return FALSE;
2570 }
2571 
2572 
2573 
2574 /*************************************************
2575 *          Manage callouts at start of cycle     *
2576 *************************************************/
2577 
2578 /* At the start of a new item in parse_regex() we are able to record the
2579 details of the previous item in a prior callout, and also to set up an
2580 automatic callout if enabled. Avoid having two adjacent automatic callouts,
2581 which would otherwise happen for items such as \Q that contribute nothing to
2582 the parsed pattern.
2583 
2584 Arguments:
2585   ptr              current pattern pointer
2586   pcalloutptr      points to a pointer to previous callout, or NULL
2587   auto_callout     TRUE if auto_callouts are enabled
2588   parsed_pattern   the parsed pattern pointer
2589   cb               compile block
2590 
2591 Returns: possibly updated parsed_pattern pointer.
2592 */
2593 
2594 static uint32_t *
manage_callouts(PCRE2_SPTR ptr,uint32_t ** pcalloutptr,BOOL auto_callout,uint32_t * parsed_pattern,compile_block * cb)2595 manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,
2596   uint32_t *parsed_pattern, compile_block *cb)
2597 {
2598 uint32_t *previous_callout = *pcalloutptr;
2599 
2600 if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr -
2601   cb->start_pattern - (PCRE2_SIZE)previous_callout[1]);
2602 
2603 if (!auto_callout) previous_callout = NULL; else
2604   {
2605   if (previous_callout == NULL ||
2606       previous_callout != parsed_pattern - 4 ||
2607       previous_callout[3] != 255)
2608     {
2609     previous_callout = parsed_pattern;  /* Set up new automatic callout */
2610     parsed_pattern += 4;
2611     previous_callout[0] = META_CALLOUT_NUMBER;
2612     previous_callout[2] = 0;
2613     previous_callout[3] = 255;
2614     }
2615   previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
2616   }
2617 
2618 *pcalloutptr = previous_callout;
2619 return parsed_pattern;
2620 }
2621 
2622 
2623 
2624 /*************************************************
2625 *          Handle \d, \D, \s, \S, \w, \W         *
2626 *************************************************/
2627 
2628 /* This function is called from parse_regex() below, both for freestanding
2629 escapes, and those within classes, to handle those escapes that may change when
2630 Unicode property support is requested. Note that PCRE2_UCP will never be set
2631 without Unicode support because that is checked when pcre2_compile() is called.
2632 
2633 Arguments:
2634   escape          the ESC_... value
2635   parsed_pattern  where to add the code
2636   options         options bits
2637   xoptions        extra options bits
2638 
2639 Returns:          updated value of parsed_pattern
2640 */
2641 static uint32_t *
handle_escdsw(int escape,uint32_t * parsed_pattern,uint32_t options,uint32_t xoptions)2642 handle_escdsw(int escape, uint32_t *parsed_pattern, uint32_t options,
2643   uint32_t xoptions)
2644 {
2645 uint32_t ascii_option = 0;
2646 uint32_t prop = ESC_p;
2647 
2648 switch(escape)
2649   {
2650   case ESC_D:
2651   prop = ESC_P;
2652   /* Fall through */
2653   case ESC_d:
2654   ascii_option = PCRE2_EXTRA_ASCII_BSD;
2655   break;
2656 
2657   case ESC_S:
2658   prop = ESC_P;
2659   /* Fall through */
2660   case ESC_s:
2661   ascii_option = PCRE2_EXTRA_ASCII_BSS;
2662   break;
2663 
2664   case ESC_W:
2665   prop = ESC_P;
2666   /* Fall through */
2667   case ESC_w:
2668   ascii_option = PCRE2_EXTRA_ASCII_BSW;
2669   break;
2670   }
2671 
2672 if ((options & PCRE2_UCP) == 0 || (xoptions & ascii_option) != 0)
2673   {
2674   *parsed_pattern++ = META_ESCAPE + escape;
2675   }
2676 else
2677   {
2678   *parsed_pattern++ = META_ESCAPE + prop;
2679   switch(escape)
2680     {
2681     case ESC_d:
2682     case ESC_D:
2683     *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
2684     break;
2685 
2686     case ESC_s:
2687     case ESC_S:
2688     *parsed_pattern++ = PT_SPACE << 16;
2689     break;
2690 
2691     case ESC_w:
2692     case ESC_W:
2693     *parsed_pattern++ = PT_WORD << 16;
2694     break;
2695     }
2696   }
2697 
2698 return parsed_pattern;
2699 }
2700 
2701 
2702 
2703 /*************************************************
2704 *      Parse regex and identify named groups     *
2705 *************************************************/
2706 
2707 /* This function is called first of all. It scans the pattern and does two
2708 things: (1) It identifies capturing groups and makes a table of named capturing
2709 groups so that information about them is fully available to both the compiling
2710 scans. (2) It writes a parsed version of the pattern with comments omitted and
2711 escapes processed into the parsed_pattern vector.
2712 
2713 Arguments:
2714   ptr             points to the start of the pattern
2715   options         compiling dynamic options (may change during the scan)
2716   has_lookbehind  points to a boolean, set TRUE if a lookbehind is found
2717   cb              pointer to the compile data block
2718 
2719 Returns:   zero on success or a non-zero error code, with the
2720              error offset placed in the cb field
2721 */
2722 
/* The structure used for dealing with nested groups, followed by the flag
bits that go with it. */

struct nest_save {
  uint16_t  nest_depth;
  uint16_t  reset_group;
  uint16_t  max_group;
  uint16_t  flags;
  uint32_t  options;
  uint32_t  xoptions;
};

typedef struct nest_save nest_save;

#define NSF_RESET          0x0001u
#define NSF_CONDASSERT     0x0002u
#define NSF_ATOMICSR       0x0004u

/* Options that are changeable within the pattern must be tracked during
parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing,
but all must be tracked so that META_OPTIONS items set the correct values for
the main compiling phase. */

#define PARSE_TRACKED_OPTIONS ( \
  PCRE2_CASELESS | \
  PCRE2_DOTALL | \
  PCRE2_DUPNAMES | \
  PCRE2_EXTENDED | \
  PCRE2_EXTENDED_MORE | \
  PCRE2_MULTILINE | \
  PCRE2_NO_AUTO_CAPTURE | \
  PCRE2_UNGREEDY)

#define PARSE_TRACKED_EXTRA_OPTIONS ( \
  PCRE2_EXTRA_CASELESS_RESTRICT | \
  PCRE2_EXTRA_ASCII_BSD | \
  PCRE2_EXTRA_ASCII_BSS | \
  PCRE2_EXTRA_ASCII_BSW | \
  PCRE2_EXTRA_ASCII_DIGIT | \
  PCRE2_EXTRA_ASCII_POSIX)

/* States used for analyzing ranges in character classes. The two OK values
must be last. */

enum {
  RANGE_NO,
  RANGE_STARTED,
  RANGE_OK_ESCAPED,
  RANGE_OK_LITERAL
};
2755 
/* Only in 32-bit mode can there be literals > META_END. A macro encapsulates
the storing of literal values in the main parsed pattern, where they can always
be quantified.

Both variants expand to a single brace-enclosed block. That way the store and
the setting of okquantifier stay together even when the macro is the body of
an unbraced "if", and the 8/16-bit form behaves the same as the 32-bit form.
The arguments are parenthesized in the expansion; in the 32-bit form "c" is
evaluated more than once, so it must not have side effects. The macro relies
on a variable called okquantifier being in scope where it is used. */

#if PCRE2_CODE_UNIT_WIDTH == 32
#define PARSED_LITERAL(c, p) \
  { \
  if ((c) >= META_END) *(p)++ = META_BIGVALUE; \
  *(p)++ = (c); \
  okquantifier = TRUE; \
  }
#else
#define PARSED_LITERAL(c, p) \
  { \
  *(p)++ = (c); \
  okquantifier = TRUE; \
  }
#endif
2770 
2771 /* Here's the actual function. */
2772 
parse_regex(PCRE2_SPTR ptr,uint32_t options,BOOL * has_lookbehind,compile_block * cb)2773 static int parse_regex(PCRE2_SPTR ptr, uint32_t options, BOOL *has_lookbehind,
2774   compile_block *cb)
2775 {
2776 uint32_t c;
2777 uint32_t delimiter;
2778 uint32_t namelen;
2779 uint32_t class_range_state;
2780 uint32_t *verblengthptr = NULL;     /* Value avoids compiler warning */
2781 uint32_t *verbstartptr = NULL;
2782 uint32_t *previous_callout = NULL;
2783 uint32_t *parsed_pattern = cb->parsed_pattern;
2784 uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
2785 uint32_t *this_parsed_item = NULL;
2786 uint32_t *prev_parsed_item = NULL;
2787 uint32_t meta_quantifier = 0;
2788 uint32_t add_after_mark = 0;
2789 uint32_t xoptions = cb->cx->extra_options;
2790 uint16_t nest_depth = 0;
2791 int after_manual_callout = 0;
2792 int expect_cond_assert = 0;
2793 int errorcode = 0;
2794 int escape;
2795 int i;
2796 BOOL inescq = FALSE;
2797 BOOL inverbname = FALSE;
2798 BOOL utf = (options & PCRE2_UTF) != 0;
2799 BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;
2800 BOOL isdupname;
2801 BOOL negate_class;
2802 BOOL okquantifier = FALSE;
2803 PCRE2_SPTR thisptr;
2804 PCRE2_SPTR name;
2805 PCRE2_SPTR ptrend = cb->end_pattern;
2806 PCRE2_SPTR verbnamestart = NULL;    /* Value avoids compiler warning */
2807 named_group *ng;
2808 nest_save *top_nest, *end_nests;
2809 
2810 /* Insert leading items for word and line matching (features provided for the
2811 benefit of pcre2grep). */
2812 
2813 if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
2814   {
2815   *parsed_pattern++ = META_CIRCUMFLEX;
2816   *parsed_pattern++ = META_NOCAPTURE;
2817   }
2818 else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
2819   {
2820   *parsed_pattern++ = META_ESCAPE + ESC_b;
2821   *parsed_pattern++ = META_NOCAPTURE;
2822   }
2823 
2824 /* If the pattern is actually a literal string, process it separately to avoid
2825 cluttering up the main loop. */
2826 
2827 if ((options & PCRE2_LITERAL) != 0)
2828   {
2829   while (ptr < ptrend)
2830     {
2831     if (parsed_pattern >= parsed_pattern_end)
2832       {
2833       errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
2834       goto FAILED;
2835       }
2836     thisptr = ptr;
2837     GETCHARINCTEST(c, ptr);
2838     if (auto_callout)
2839       parsed_pattern = manage_callouts(thisptr, &previous_callout,
2840         auto_callout, parsed_pattern, cb);
2841     PARSED_LITERAL(c, parsed_pattern);
2842     }
2843   goto PARSED_END;
2844   }
2845 
2846 /* Process a real regex which may contain meta-characters. */
2847 
2848 top_nest = NULL;
2849 end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
2850 
2851 /* The size of the nest_save structure might not be a factor of the size of the
2852 workspace. Therefore we must round down end_nests so as to correctly avoid
2853 creating a nest_save that spans the end of the workspace. */
2854 
2855 end_nests = (nest_save *)((char *)end_nests -
2856   ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
2857 
2858 /* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */
2859 
2860 if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;
2861 
2862 /* Now scan the pattern */
2863 
2864 while (ptr < ptrend)
2865   {
2866   int prev_expect_cond_assert;
2867   uint32_t min_repeat = 0, max_repeat = 0;
2868   uint32_t set, unset, *optset;
2869   uint32_t xset, xunset, *xoptset;
2870   uint32_t terminator;
2871   uint32_t prev_meta_quantifier;
2872   BOOL prev_okquantifier;
2873   PCRE2_SPTR tempptr;
2874   PCRE2_SIZE offset;
2875 
2876   if (parsed_pattern >= parsed_pattern_end)
2877     {
2878     errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
2879     goto FAILED;
2880     }
2881 
2882   if (nest_depth > cb->cx->parens_nest_limit)
2883     {
2884     errorcode = ERR19;
2885     goto FAILED;        /* Parentheses too deeply nested */
2886     }
2887 
2888   /* If the last time round this loop something was added, parsed_pattern will
2889   no longer be equal to this_parsed_item. Remember where the previous item
2890   started and reset for the next item. Note that sometimes round the loop,
2891   nothing gets added (e.g. for ignored white space). */
2892 
2893   if (this_parsed_item != parsed_pattern)
2894     {
2895     prev_parsed_item = this_parsed_item;
2896     this_parsed_item = parsed_pattern;
2897     }
2898 
2899   /* Get next input character, save its position for callout handling. */
2900 
2901   thisptr = ptr;
2902   GETCHARINCTEST(c, ptr);
2903 
2904   /* Copy quoted literals until \E, allowing for the possibility of automatic
2905   callouts, except when processing a (*VERB) "name".  */
2906 
2907   if (inescq)
2908     {
2909     if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
2910       {
2911       inescq = FALSE;
2912       ptr++;   /* Skip E */
2913       }
2914     else
2915       {
2916       if (expect_cond_assert > 0)   /* A literal is not allowed if we are */
2917         {                           /* expecting a conditional assertion, */
2918         ptr--;                      /* but an empty \Q\E sequence is OK.  */
2919         errorcode = ERR28;
2920         goto FAILED;
2921         }
2922       if (inverbname)
2923         {                          /* Don't use PARSED_LITERAL() because it */
2924 #if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
2925         if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2926 #endif
2927         *parsed_pattern++ = c;
2928         }
2929       else
2930         {
2931         if (after_manual_callout-- <= 0)
2932           parsed_pattern = manage_callouts(thisptr, &previous_callout,
2933             auto_callout, parsed_pattern, cb);
2934         PARSED_LITERAL(c, parsed_pattern);
2935         }
2936       meta_quantifier = 0;
2937       }
2938     continue;  /* Next character */
2939     }
2940 
2941   /* If we are processing the "name" part of a (*VERB:NAME) item, all
2942   characters up to the closing parenthesis are literals except when
2943   PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q
2944   and \E and escaped characters are allowed (no character types such as \d). If
2945   PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do
2946   this by not entering the special (*VERB:NAME) processing - they are then
2947   picked up below. Note that c is a character, not a code unit, so we must not
2948   use MAX_255 to test its size because MAX_255 tests code units and is assumed
2949   TRUE in 8-bit mode. */
2950 
2951   if (inverbname &&
2952        (
2953         /* EITHER: not both options set */
2954         ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) !=
2955                     (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) ||
2956 #ifdef SUPPORT_UNICODE
2957         /* OR: character > 255 AND not Unicode Pattern White Space */
2958         (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) ||
2959 #endif
2960         /* OR: not a # comment or isspace() white space */
2961         (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0
2962 #ifdef SUPPORT_UNICODE
2963         /* and not CHAR_NEL when Unicode is supported */
2964           && c != CHAR_NEL
2965 #endif
2966        )))
2967     {
2968     PCRE2_SIZE verbnamelength;
2969 
2970     switch(c)
2971       {
2972       default:                     /* Don't use PARSED_LITERAL() because it */
2973 #if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
2974       if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2975 #endif
2976       *parsed_pattern++ = c;
2977       break;
2978 
2979       case CHAR_RIGHT_PARENTHESIS:
2980       inverbname = FALSE;
2981       /* This is the length in characters */
2982       verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);
2983       /* But the limit on the length is in code units */
2984       if (ptr - verbnamestart - 1 > (int)MAX_MARK)
2985         {
2986         ptr--;
2987         errorcode = ERR76;
2988         goto FAILED;
2989         }
2990       *verblengthptr = (uint32_t)verbnamelength;
2991 
2992       /* If this name was on a verb such as (*ACCEPT) which does not continue,
2993       a (*MARK) was generated for the name. We now add the original verb as the
2994       next item. */
2995 
2996       if (add_after_mark != 0)
2997         {
2998         *parsed_pattern++ = add_after_mark;
2999         add_after_mark = 0;
3000         }
3001       break;
3002 
3003       case CHAR_BACKSLASH:
3004       if ((options & PCRE2_ALT_VERBNAMES) != 0)
3005         {
3006         escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3007           xoptions, FALSE, cb);
3008         if (errorcode != 0) goto FAILED;
3009         }
3010       else escape = 0;   /* Treat all as literal */
3011 
3012       switch(escape)
3013         {
3014         case 0:                    /* Don't use PARSED_LITERAL() because it */
3015 #if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
3016         if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3017 #endif
3018         *parsed_pattern++ = c;
3019         break;
3020 
3021         case ESC_ub:
3022         *parsed_pattern++ = CHAR_u;
3023         PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3024         break;
3025 
3026         case ESC_Q:
3027         inescq = TRUE;
3028         break;
3029 
3030         case ESC_E:           /* Ignore */
3031         break;
3032 
3033         default:
3034         errorcode = ERR40;    /* Invalid in verb name */
3035         goto FAILED;
3036         }
3037       }
3038     continue;   /* Next character in pattern */
3039     }
3040 
3041   /* Not a verb name character. At this point we must process everything that
3042   must not change the quantification state. This is mainly comments, but we
3043   handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as
3044   A+, as in Perl. An isolated \E is ignored. */
3045 
3046   if (c == CHAR_BACKSLASH && ptr < ptrend)
3047     {
3048     if (*ptr == CHAR_Q || *ptr == CHAR_E)
3049       {
3050       inescq = *ptr == CHAR_Q;
3051       ptr++;
3052       continue;
3053       }
3054     }
3055 
3056   /* Skip over whitespace and # comments in extended mode. Note that c is a
3057   character, not a code unit, so we must not use MAX_255 to test its size
3058   because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The
3059   whitespace characters are those designated as "Pattern White Space" by
3060   Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is
3061   U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a
3062   subset of space characters that match \h and \v. */
3063 
3064   if ((options & PCRE2_EXTENDED) != 0)
3065     {
3066     if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;
3067 #ifdef SUPPORT_UNICODE
3068     if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue;
3069 #endif
3070     if (c == CHAR_NUMBER_SIGN)
3071       {
3072       while (ptr < ptrend)
3073         {
3074         if (IS_NEWLINE(ptr))      /* For non-fixed-length newline cases, */
3075           {                       /* IS_NEWLINE sets cb->nllen. */
3076           ptr += cb->nllen;
3077           break;
3078           }
3079         ptr++;
3080 #ifdef SUPPORT_UNICODE
3081         if (utf) FORWARDCHARTEST(ptr, ptrend);
3082 #endif
3083         }
3084       continue;  /* Next character in pattern */
3085       }
3086     }
3087 
3088   /* Skip over bracketed comments */
3089 
3090   if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 &&
3091       ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
3092     {
3093     while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);
3094     if (ptr >= ptrend)
3095       {
3096       errorcode = ERR18;  /* A special error for missing ) in a comment */
3097       goto FAILED;        /* to make it easier to debug. */
3098       }
3099     ptr++;
3100     continue;  /* Next character in pattern */
3101     }
3102 
3103   /* If the next item is not a quantifier, fill in length of any previous
3104   callout and create an auto callout if required. */
3105 
3106   if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&
3107        (c != CHAR_LEFT_CURLY_BRACKET ||
3108          (tempptr = ptr,
3109          !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))
3110     {
3111     if (after_manual_callout-- <= 0)
3112       {
3113       parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,
3114         parsed_pattern, cb);
3115       this_parsed_item = parsed_pattern;  /* New start for current item */
3116       }
3117     }
3118 
3119   /* If expect_cond_assert is 2, we have just passed (?( and are expecting an
3120   assertion, possibly preceded by a callout. If the value is 1, we have just
3121   had the callout and expect an assertion. There must be at least 3 more
3122   characters in all cases. When expect_cond_assert is 2, we know that the
3123   current character is an opening parenthesis, as otherwise we wouldn't be
3124   here. However, when it is 1, we need to check, and it's easiest just to check
3125   always. Note that expect_cond_assert may be negative, since all callouts just
3126   decrement it. */
3127 
3128   if (expect_cond_assert > 0)
3129     {
3130     BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
3131               (ptr[0] == CHAR_QUESTION_MARK || ptr[0] == CHAR_ASTERISK);
3132     if (ok)
3133       {
3134       if (ptr[0] == CHAR_ASTERISK)  /* New alpha assertion format, possibly */
3135         {
3136         ok = MAX_255(ptr[1]) && (cb->ctypes[ptr[1]] & ctype_lcletter) != 0;
3137         }
3138       else switch(ptr[1])  /* Traditional symbolic format */
3139         {
3140         case CHAR_C:
3141         ok = expect_cond_assert == 2;
3142         break;
3143 
3144         case CHAR_EQUALS_SIGN:
3145         case CHAR_EXCLAMATION_MARK:
3146         break;
3147 
3148         case CHAR_LESS_THAN_SIGN:
3149         ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;
3150         break;
3151 
3152         default:
3153         ok = FALSE;
3154         }
3155       }
3156 
3157     if (!ok)
3158       {
3159       ptr--;   /* Adjust error offset */
3160       errorcode = ERR28;
3161       goto FAILED;
3162       }
3163     }
3164 
3165   /* Remember whether we are expecting a conditional assertion, and set the
3166   default for this item. */
3167 
3168   prev_expect_cond_assert = expect_cond_assert;
3169   expect_cond_assert = 0;
3170 
3171   /* Remember quantification status for the previous significant item, then set
3172   default for this item. */
3173 
3174   prev_okquantifier = okquantifier;
3175   prev_meta_quantifier = meta_quantifier;
3176   okquantifier = FALSE;
3177   meta_quantifier = 0;
3178 
3179   /* If the previous significant item was a quantifier, adjust the parsed code
3180   if there is a following modifier. The base meta value is always followed by
3181   the PLUS and QUERY values, in that order. We do this here rather than after
3182   reading a quantifier so that intervening comments and /x whitespace can be
3183   ignored without having to replicate code. */
3184 
3185   if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS))
3186     {
3187     parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] =
3188       prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?
3189         0x00020000u : 0x00010000u);
3190     continue;  /* Next character in pattern */
3191     }
3192 
3193   /* Process the next item in the main part of a pattern. */
3194 
3195   switch(c)
3196     {
3197     default:              /* Non-special character */
3198     PARSED_LITERAL(c, parsed_pattern);
3199     break;
3200 
3201 
3202     /* ---- Escape sequence ---- */
3203 
3204     case CHAR_BACKSLASH:
3205     tempptr = ptr;
3206     escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3207       xoptions, FALSE, cb);
3208     if (errorcode != 0)
3209       {
3210       ESCAPE_FAILED:
3211       if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3212         goto FAILED;
3213       ptr = tempptr;
3214       if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3215         {
3216         GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
3217         }
3218       escape = 0;                 /* Treat as literal character */
3219       }
3220 
3221     /* The escape was a data escape or literal character. */
3222 
3223     if (escape == 0)
3224       {
3225       PARSED_LITERAL(c, parsed_pattern);
3226       }
3227 
3228     /* The escape was a back (or forward) reference. We keep the offset in
3229     order to give a more useful diagnostic for a bad forward reference. For
3230     references to groups numbered less than 10 we can't use more than two items
3231     in parsed_pattern because they may be just two characters in the input (and
3232     in a 64-bit world an offset may need two elements). So for them, the offset
3233     of the first occurrence is held in a special vector. */
3234 
3235     else if (escape < 0)
3236       {
3237       offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 1);
3238       escape = -escape;
3239       *parsed_pattern++ = META_BACKREF | (uint32_t)escape;
3240       if (escape < 10)
3241         {
3242         if (cb->small_ref_offset[escape] == PCRE2_UNSET)
3243           cb->small_ref_offset[escape] = offset;
3244         }
3245       else
3246         {
3247         PUTOFFSET(offset, parsed_pattern);
3248         }
3249       okquantifier = TRUE;
3250       }
3251 
3252     /* The escape was a character class such as \d etc. or other special
3253     escape indicator such as \A or \X. Most of them generate just a single
3254     parsed item, but \P and \p are followed by a 16-bit type and a 16-bit
3255     value. They are supported only when Unicode is available. The type and
3256     value are packed into a single 32-bit value so that the whole sequence
3257     uses only two elements in the parsed_vector. This is because the same
3258     coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is
3259     set.
3260 
3261     There are also some cases where the escape sequence is followed by a name:
3262     \k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>
3263     and \g'name' are subroutine calls by name; \g{name} is a synonym for
3264     \k{name}. Note that \g<number> and \g'number' are handled by check_escape()
3265     and returned as a negative value (handled above). A name is coded as an
3266     offset into the pattern and a length. */
3267 
3268     else switch (escape)
3269       {
3270       case ESC_C:
3271 #ifdef NEVER_BACKSLASH_C
3272       errorcode = ERR85;
3273       goto ESCAPE_FAILED;
3274 #else
3275       if ((options & PCRE2_NEVER_BACKSLASH_C) != 0)
3276         {
3277         errorcode = ERR83;
3278         goto ESCAPE_FAILED;
3279         }
3280 #endif
3281       okquantifier = TRUE;
3282       *parsed_pattern++ = META_ESCAPE + escape;
3283       break;
3284 
3285       /* This is a special return that happens only in EXTRA_ALT_BSUX mode,
3286       when \u{ is not followed by hex digits and }. It requests two literal
3287       characters, u and { and we need this, as otherwise \u{ 12} (for example)
3288       would be treated as u{12} now that spaces are allowed in quantifiers. */
3289 
3290       case ESC_ub:
3291       *parsed_pattern++ = CHAR_u;
3292       PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3293       break;
3294 
3295       case ESC_X:
3296 #ifndef SUPPORT_UNICODE
3297       errorcode = ERR45;   /* Supported only with Unicode support */
3298       goto ESCAPE_FAILED;
3299 #endif
3300       case ESC_H:
3301       case ESC_h:
3302       case ESC_N:
3303       case ESC_R:
3304       case ESC_V:
3305       case ESC_v:
3306       okquantifier = TRUE;
3307       *parsed_pattern++ = META_ESCAPE + escape;
3308       break;
3309 
3310       default:  /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */
3311       *parsed_pattern++ = META_ESCAPE + escape;
3312       break;
3313 
3314       /* Escapes that may change in UCP mode. */
3315 
3316       case ESC_d:
3317       case ESC_D:
3318       case ESC_s:
3319       case ESC_S:
3320       case ESC_w:
3321       case ESC_W:
3322       okquantifier = TRUE;
3323       parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
3324         xoptions);
3325       break;
3326 
3327       /* Unicode property matching */
3328 
3329       case ESC_P:
3330       case ESC_p:
3331 #ifdef SUPPORT_UNICODE
3332         {
3333         BOOL negated;
3334         uint16_t ptype = 0, pdata = 0;
3335         if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3336           goto ESCAPE_FAILED;
3337         if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3338         *parsed_pattern++ = META_ESCAPE + escape;
3339         *parsed_pattern++ = (ptype << 16) | pdata;
3340         okquantifier = TRUE;
3341         }
3342 #else
3343       errorcode = ERR45;
3344       goto ESCAPE_FAILED;
3345 #endif
3346       break;  /* End \P and \p */
3347 
3348       /* When \g is used with quotes or angle brackets as delimiters, it is a
3349       numerical or named subroutine call, and control comes here. When used
3350       with brace delimiters it is a numerical back reference and does not come
3351       here because check_escape() returns it directly as a reference. \k is
3352       always a named back reference. */
3353 
3354       case ESC_g:
3355       case ESC_k:
3356       if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET &&
3357           *ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE))
3358         {
3359         errorcode = (escape == ESC_g)? ERR57 : ERR69;
3360         goto ESCAPE_FAILED;
3361         }
3362       terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
3363         CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
3364         CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
3365 
3366       /* For a non-braced \g, check for a numerical recursion. */
3367 
3368       if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)
3369         {
3370         PCRE2_SPTR p = ptr + 1;
3371 
3372         if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
3373             &errorcode))
3374           {
3375           if (p >= ptrend || *p != terminator)
3376             {
3377             errorcode = ERR57;
3378             goto ESCAPE_FAILED;
3379             }
3380           ptr = p;
3381           goto SET_RECURSION;
3382           }
3383         if (errorcode != 0) goto ESCAPE_FAILED;
3384         }
3385 
3386       /* Not a numerical recursion. Perl allows spaces and tabs after { and
3387       before } but not for other delimiters. */
3388 
3389       if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
3390           &errorcode, cb)) goto ESCAPE_FAILED;
3391 
3392       /* \k and \g when used with braces are back references, whereas \g used
3393       with quotes or angle brackets is a recursion */
3394 
3395       *parsed_pattern++ =
3396         (escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)?
3397           META_BACKREF_BYNAME : META_RECURSE_BYNAME;
3398       *parsed_pattern++ = namelen;
3399 
3400       PUTOFFSET(offset, parsed_pattern);
3401       okquantifier = TRUE;
3402       break;  /* End special escape processing */
3403       }
3404     break;    /* End escape sequence processing */
3405 
3406 
3407     /* ---- Single-character special items ---- */
3408 
3409     case CHAR_CIRCUMFLEX_ACCENT:
3410     *parsed_pattern++ = META_CIRCUMFLEX;
3411     break;
3412 
3413     case CHAR_DOLLAR_SIGN:
3414     *parsed_pattern++ = META_DOLLAR;
3415     break;
3416 
3417     case CHAR_DOT:
3418     *parsed_pattern++ = META_DOT;
3419     okquantifier = TRUE;
3420     break;
3421 
3422 
3423     /* ---- Single-character quantifiers ---- */
3424 
3425     case CHAR_ASTERISK:
3426     meta_quantifier = META_ASTERISK;
3427     goto CHECK_QUANTIFIER;
3428 
3429     case CHAR_PLUS:
3430     meta_quantifier = META_PLUS;
3431     goto CHECK_QUANTIFIER;
3432 
3433     case CHAR_QUESTION_MARK:
3434     meta_quantifier = META_QUERY;
3435     goto CHECK_QUANTIFIER;
3436 
3437 
3438     /* ---- Potential {n,m} quantifier ---- */
3439 
3440     case CHAR_LEFT_CURLY_BRACKET:
3441     if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,
3442         &errorcode))
3443       {
3444       if (errorcode != 0) goto FAILED;     /* Error in quantifier. */
3445       PARSED_LITERAL(c, parsed_pattern);   /* Not a quantifier */
3446       break;                               /* No more quantifier processing */
3447       }
3448     meta_quantifier = META_MINMAX;
3449     /* Fall through */
3450 
3451 
3452     /* ---- Quantifier post-processing ---- */
3453 
3454     /* Check that a quantifier is allowed after the previous item. This
3455     guarantees that there is a previous item. */
3456 
3457     CHECK_QUANTIFIER:
3458     if (!prev_okquantifier)
3459       {
3460       errorcode = ERR9;
3461       goto FAILED_BACK;
3462       }
3463 
3464     /* Most (*VERB)s are not allowed to be quantified, but an ungreedy
3465     quantifier can be useful for (*ACCEPT) - meaning "succeed on backtrack", a
3466     sort of negated (*COMMIT). We therefore allow (*ACCEPT) to be quantified by
3467     wrapping it in non-capturing brackets, but we have to allow for a preceding
3468     (*MARK) for when (*ACCEPT) has an argument. */
3469 
3470     if (*prev_parsed_item == META_ACCEPT)
3471       {
3472       uint32_t *p;
3473       for (p = parsed_pattern - 1; p >= verbstartptr; p--) p[1] = p[0];
3474       *verbstartptr = META_NOCAPTURE;
3475       parsed_pattern[1] = META_KET;
3476       parsed_pattern += 2;
3477       }
3478 
3479     /* Now we can put the quantifier into the parsed pattern vector. At this
3480     stage, we have only the basic quantifier. The check for a following + or ?
3481     modifier happens at the top of the loop, after any intervening comments
3482     have been removed. */
3483 
3484     *parsed_pattern++ = meta_quantifier;
3485     if (c == CHAR_LEFT_CURLY_BRACKET)
3486       {
3487       *parsed_pattern++ = min_repeat;
3488       *parsed_pattern++ = max_repeat;
3489       }
3490     break;
3491 
3492 
3493     /* ---- Character class ---- */
3494 
3495     case CHAR_LEFT_SQUARE_BRACKET:
3496     okquantifier = TRUE;
3497 
3498     /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
3499     used for "start of word" and "end of word". As these are otherwise illegal
3500     sequences, we don't break anything by recognizing them. They are replaced
3501     by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
3502     erroneous and are handled by the normal code below. */
3503 
3504     if (ptrend - ptr >= 6 &&
3505          (PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 ||
3506           PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0))
3507       {
3508       *parsed_pattern++ = META_ESCAPE + ESC_b;
3509 
3510       if (ptr[2] == CHAR_LESS_THAN_SIGN)
3511         {
3512         *parsed_pattern++ = META_LOOKAHEAD;
3513         }
3514       else
3515         {
3516         *parsed_pattern++ = META_LOOKBEHIND;
3517         *has_lookbehind = TRUE;
3518 
3519         /* The offset is used only for the "non-fixed length" error; this won't
3520         occur here, so just store zero. */
3521 
3522         PUTOFFSET((PCRE2_SIZE)0, parsed_pattern);
3523         }
3524 
3525       if ((options & PCRE2_UCP) == 0)
3526         *parsed_pattern++ = META_ESCAPE + ESC_w;
3527       else
3528         {
3529         *parsed_pattern++ = META_ESCAPE + ESC_p;
3530         *parsed_pattern++ = PT_WORD << 16;
3531         }
3532       *parsed_pattern++ = META_KET;
3533       ptr += 6;
3534       break;
3535       }
3536 
3537     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3538     they are encountered at the top level, so we'll do that too. */
3539 
3540     if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3541          *ptr == CHAR_EQUALS_SIGN) &&
3542         check_posix_syntax(ptr, ptrend, &tempptr))
3543       {
3544       errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;
3545       goto FAILED;
3546       }
3547 
3548     /* Process a regular character class. If the first character is '^', set
3549     the negation flag. If the first few characters (either before or after ^)
3550     are \Q\E or \E or space or tab in extended-more mode, we skip them too.
3551     This makes for compatibility with Perl. */
3552 
3553     negate_class = FALSE;
3554     while (ptr < ptrend)
3555       {
3556       GETCHARINCTEST(c, ptr);
3557       if (c == CHAR_BACKSLASH)
3558         {
3559         if (ptr < ptrend && *ptr == CHAR_E) ptr++;
3560         else if (ptrend - ptr >= 3 &&
3561              PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3562           ptr += 3;
3563         else
3564           break;
3565         }
3566       else if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3567                (c == CHAR_SPACE || c == CHAR_HT))  /* Note: just these two */
3568         continue;
3569       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3570         negate_class = TRUE;
3571       else break;
3572       }
3573 
3574     /* Now the real contents of the class; c has the first "real" character.
3575     Empty classes are permitted only if the option is set. */
3576 
3577     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3578         (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)
3579       {
3580       *parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;
3581       break;  /* End of class processing */
3582       }
3583 
3584     /* Process a non-empty class. */
3585 
3586     *parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;
3587     class_range_state = RANGE_NO;
3588 
3589     /* In an EBCDIC environment, Perl treats alphabetic ranges specially
3590     because there are holes in the encoding, and simply using the range A-Z
3591     (for example) would include the characters in the holes. This applies only
3592     to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]
3593     in this respect. In order to accommodate this, we keep track of whether
3594     character values are literal or not, and a state variable for handling
3595     ranges. */
3596 
3597     /* Loop for the contents of the class */
3598 
3599     for (;;)
3600       {
3601       BOOL char_is_literal = TRUE;
3602 
3603       /* Inside \Q...\E everything is literal except \E */
3604 
3605       if (inescq)
3606         {
3607         if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3608           {
3609           inescq = FALSE;                   /* Reset literal state */
3610           ptr++;                            /* Skip the 'E' */
3611           goto CLASS_CONTINUE;
3612           }
3613         goto CLASS_LITERAL;
3614         }
3615 
3616       /* Skip over space and tab (only) in extended-more mode. */
3617 
3618       if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3619           (c == CHAR_SPACE || c == CHAR_HT))
3620         goto CLASS_CONTINUE;
3621 
3622       /* Handle POSIX class names. Perl allows a negation extension of the
3623       form [:^name:]. A square bracket that doesn't match the syntax is
3624       treated as a literal. We also recognize the POSIX constructions
3625       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3626       5.6 and 5.8 do. */
3627 
3628       if (c == CHAR_LEFT_SQUARE_BRACKET &&
3629           ptrend - ptr >= 3 &&
3630           (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3631            *ptr == CHAR_EQUALS_SIGN) &&
3632           check_posix_syntax(ptr, ptrend, &tempptr))
3633         {
3634         BOOL posix_negate = FALSE;
3635         int posix_class;
3636 
3637         /* Perl treats a hyphen before a POSIX class as a literal, not the
3638         start of a range. However, it gives a warning in its warning mode. PCRE
3639         does not have a warning mode, so we give an error, because this is
3640         likely an error on the user's part. */
3641 
3642         if (class_range_state == RANGE_STARTED)
3643           {
3644           errorcode = ERR50;
3645           goto FAILED;
3646           }
3647 
3648         if (*ptr != CHAR_COLON)
3649           {
3650           errorcode = ERR13;
3651           goto FAILED_BACK;
3652           }
3653 
3654         if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
3655           {
3656           posix_negate = TRUE;
3657           ptr++;
3658           }
3659 
3660         posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3661         if (posix_class < 0)
3662           {
3663           errorcode = ERR30;
3664           goto FAILED;
3665           }
3666         ptr = tempptr + 2;
3667 
3668         /* Perl treats a hyphen after a POSIX class as a literal, not the
3669         start of a range. However, it gives a warning in its warning mode
3670         unless the hyphen is the last character in the class. PCRE does not
3671         have a warning mode, so we give an error, because this is likely an
3672         error on the user's part. */
3673 
3674         if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3675             ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3676           {
3677           errorcode = ERR50;
3678           goto FAILED;
3679           }
3680 
3681         /* Set "a hyphen is not the start of a range" for the -] case, and also
3682         in case the POSIX class is followed by \E or \Q\E (possibly repeated -
3683         fuzzers do that kind of thing) and *then* a hyphen. This causes that
3684         hyphen to be treated as a literal. I don't think it's worth setting up
3685         special apparatus to do otherwise. */
3686 
3687         class_range_state = RANGE_NO;
3688 
3689         /* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some
3690         of the POSIX classes are converted to use Unicode properties \p or \P
3691         or, in one case, \h or \H. The substitutes table has two values per
3692         class, containing the type and value of a \p or \P item. The special
3693         cases are specified with a negative type: a non-zero value causes \h or
3694         \H to be used, and a zero value falls through to behave like a non-UCP
3695         POSIX class. There are now also some extra options that force ASCII for
3696         some classes. */
3697 
3698 #ifdef SUPPORT_UNICODE
3699         if ((options & PCRE2_UCP) != 0 &&
3700             (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0 &&
3701             !((xoptions & PCRE2_EXTRA_ASCII_DIGIT) != 0 &&
3702               (posix_class == PC_DIGIT || posix_class == PC_XDIGIT)))
3703           {
3704           int ptype = posix_substitutes[2*posix_class];
3705           int pvalue = posix_substitutes[2*posix_class + 1];
3706 
3707           if (ptype >= 0)
3708             {
3709             *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);
3710             *parsed_pattern++ = (ptype << 16) | pvalue;
3711             goto CLASS_CONTINUE;
3712             }
3713 
3714           if (pvalue != 0)
3715             {
3716             *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h);
3717             goto CLASS_CONTINUE;
3718             }
3719 
3720           /* Fall through */
3721           }
3722 #endif  /* SUPPORT_UNICODE */
3723 
3724         /* Non-UCP POSIX class */
3725 
3726         *parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;
3727         *parsed_pattern++ = posix_class;
3728         }
3729 
3730       /* Handle potential start of range */
3731 
3732       else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)
3733         {
3734         *parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?
3735           META_RANGE_LITERAL : META_RANGE_ESCAPED;
3736         class_range_state = RANGE_STARTED;
3737         }
3738 
3739       /* Handle a literal character */
3740 
3741       else if (c != CHAR_BACKSLASH)
3742         {
3743         CLASS_LITERAL:
3744         if (class_range_state == RANGE_STARTED)
3745           {
3746           if (c == parsed_pattern[-2])       /* Optimize one-char range */
3747             parsed_pattern--;
3748           else if (parsed_pattern[-2] > c)   /* Check range is in order */
3749             {
3750             errorcode = ERR8;
3751             goto FAILED_BACK;
3752             }
3753           else
3754             {
3755             if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL)
3756               parsed_pattern[-1] = META_RANGE_ESCAPED;
3757             PARSED_LITERAL(c, parsed_pattern);
3758             }
3759           class_range_state = RANGE_NO;
3760           }
3761         else  /* Potential start of range */
3762           {
3763           class_range_state = char_is_literal?
3764             RANGE_OK_LITERAL : RANGE_OK_ESCAPED;
3765           PARSED_LITERAL(c, parsed_pattern);
3766           }
3767         }
3768 
3769       /* Handle escapes in a class */
3770 
3771       else
3772         {
3773         tempptr = ptr;
3774         escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3775           xoptions, TRUE, cb);
3776 
3777         if (errorcode != 0)
3778           {
3779           if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3780             goto FAILED;
3781           ptr = tempptr;
3782           if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3783             {
3784             GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
3785             }
3786           escape = 0;                 /* Treat as literal character */
3787           }
3788 
3789         switch(escape)
3790           {
3791           case 0:  /* Escaped character code point is in c */
3792           char_is_literal = FALSE;
3793           goto CLASS_LITERAL;      /* (a few lines above) */
3794 
3795           case ESC_b:
3796           c = CHAR_BS;    /* \b is backspace in a class */
3797           char_is_literal = FALSE;
3798           goto CLASS_LITERAL;
3799 
3800           case ESC_Q:
3801           inescq = TRUE;  /* Enter literal mode */
3802           goto CLASS_CONTINUE;
3803 
3804           case ESC_E:     /* Ignore orphan \E */
3805           goto CLASS_CONTINUE;
3806 
3807           case ESC_B:     /* Always an error in a class */
3808           case ESC_R:
3809           case ESC_X:
3810           errorcode = ERR7;
3811           ptr--;
3812           goto FAILED;
3813           }
3814 
3815         /* The second part of a range can be a single-character escape
3816         sequence (detected above), but not any of the other escapes. Perl
3817         treats a hyphen as a literal in such circumstances. However, in Perl's
3818         warning mode, a warning is given, so PCRE now faults it, as it is
3819         almost certainly a mistake on the user's part. */
3820 
3821         if (class_range_state == RANGE_STARTED)
3822           {
3823           errorcode = ERR50;
3824           goto FAILED;  /* Not CLASS_ESCAPE_FAILED; always an error */
3825           }
3826 
3827         /* Of the remaining escapes, only those that define characters are
3828         allowed in a class. None may start a range. */
3829 
3830         class_range_state = RANGE_NO;
3831         switch(escape)
3832           {
3833           case ESC_N:
3834           errorcode = ERR71;
3835           goto FAILED;
3836 
3837           case ESC_H:
3838           case ESC_h:
3839           case ESC_V:
3840           case ESC_v:
3841           *parsed_pattern++ = META_ESCAPE + escape;
3842           break;
3843 
3844           /* These escapes may be converted to Unicode property tests when
3845           PCRE2_UCP is set. */
3846 
3847           case ESC_d:
3848           case ESC_D:
3849           case ESC_s:
3850           case ESC_S:
3851           case ESC_w:
3852           case ESC_W:
3853           parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
3854             xoptions);
3855           break;
3856 
3857           /* Explicit Unicode property matching */
3858 
3859           case ESC_P:
3860           case ESC_p:
3861 #ifdef SUPPORT_UNICODE
3862             {
3863             BOOL negated;
3864             uint16_t ptype = 0, pdata = 0;
3865             if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3866               goto FAILED;
3867             if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3868             *parsed_pattern++ = META_ESCAPE + escape;
3869             *parsed_pattern++ = (ptype << 16) | pdata;
3870             }
3871 #else
3872           errorcode = ERR45;
3873           goto FAILED;
3874 #endif
3875           break;  /* End \P and \p */
3876 
3877           default:    /* All others are not allowed in a class */
3878           errorcode = ERR7;
3879           ptr--;
3880           goto FAILED;
3881           }
3882 
3883         /* Perl gives a warning unless a following hyphen is the last character
3884         in the class. PCRE throws an error. */
3885 
3886         if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3887             ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3888           {
3889           errorcode = ERR50;
3890           goto FAILED;
3891           }
3892         }
3893 
3894       /* Proceed to next thing in the class. */
3895 
3896       CLASS_CONTINUE:
3897       if (ptr >= ptrend)
3898         {
3899         errorcode = ERR6;  /* Missing terminating ']' */
3900         goto FAILED;
3901         }
3902       GETCHARINCTEST(c, ptr);
3903       if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
3904       }     /* End of class-processing loop */
3905 
3906     /* -] at the end of a class is a literal '-' */
3907 
3908     if (class_range_state == RANGE_STARTED)
3909       {
3910       parsed_pattern[-1] = CHAR_MINUS;
3911       class_range_state = RANGE_NO;
3912       }
3913 
3914     *parsed_pattern++ = META_CLASS_END;
3915     break;  /* End of character class */
3916 
3917 
3918     /* ---- Opening parenthesis ---- */
3919 
3920     case CHAR_LEFT_PARENTHESIS:
3921     if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3922 
3923     /* If ( is not followed by ? it is either a capture or a special verb or an
3924     alpha assertion or a positive non-atomic lookahead. */
3925 
3926     if (*ptr != CHAR_QUESTION_MARK)
3927       {
3928       const char *vn;
3929 
3930       /* Handle capturing brackets (or non-capturing if auto-capture is turned
3931       off). */
3932 
3933       if (*ptr != CHAR_ASTERISK)
3934         {
3935         nest_depth++;
3936         if ((options & PCRE2_NO_AUTO_CAPTURE) == 0)
3937           {
3938           if (cb->bracount >= MAX_GROUP_NUMBER)
3939             {
3940             errorcode = ERR97;
3941             goto FAILED;
3942             }
3943           cb->bracount++;
3944           *parsed_pattern++ = META_CAPTURE | cb->bracount;
3945           }
3946         else *parsed_pattern++ = META_NOCAPTURE;
3947         }
3948 
3949       /* Do nothing for (* followed by end of pattern or ) so it gives a "bad
3950       quantifier" error rather than "(*MARK) must have an argument". */
3951 
3952       else if (ptrend - ptr <= 1 || (c = ptr[1]) == CHAR_RIGHT_PARENTHESIS)
3953         break;
3954 
3955       /* Handle "alpha assertions" such as (*pla:...). Most of these are
3956       synonyms for the historical symbolic assertions, but the script run and
3957       non-atomic lookaround ones are new. They are distinguished by starting
3958       with a lower case letter. Checking both ends of the alphabet makes this
3959       work in all character codes. */
3960 
3961       else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
3962         {
3963         uint32_t meta;
3964 
3965         vn = alasnames;
3966         if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
3967           &errorcode, cb)) goto FAILED;
3968         if (ptr >= ptrend || *ptr != CHAR_COLON)
3969           {
3970           errorcode = ERR95;  /* Malformed */
3971           goto FAILED;
3972           }
3973 
3974         /* Scan the table of alpha assertion names */
3975 
3976         for (i = 0; i < alascount; i++)
3977           {
3978           if (namelen == alasmeta[i].len &&
3979               PRIV(strncmp_c8)(name, vn, namelen) == 0)
3980             break;
3981           vn += alasmeta[i].len + 1;
3982           }
3983 
3984         if (i >= alascount)
3985           {
3986           errorcode = ERR95;  /* Alpha assertion not recognized */
3987           goto FAILED;
3988           }
3989 
3990         /* Check for expecting an assertion condition. If so, only atomic
3991         lookaround assertions are valid. */
3992 
3993         meta = alasmeta[i].meta;
3994         if (prev_expect_cond_assert > 0 &&
3995             (meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT))
3996           {
3997           errorcode = (meta == META_LOOKAHEAD_NA || meta == META_LOOKBEHIND_NA)?
3998             ERR98 : ERR28;  /* (Atomic) assertion expected */
3999           goto FAILED;
4000           }
4001 
4002         /* The lookaround alphabetic synonyms can mostly be handled by jumping
4003         to the code that handles the traditional symbolic forms. */
4004 
4005         switch(meta)
4006           {
4007           default:
4008           errorcode = ERR89;  /* Unknown code; should never occur because */
4009           goto FAILED;        /* the meta values come from a table above. */
4010 
4011           case META_ATOMIC:
4012           goto ATOMIC_GROUP;
4013 
4014           case META_LOOKAHEAD:
4015           goto POSITIVE_LOOK_AHEAD;
4016 
4017           case META_LOOKAHEAD_NA:
4018           goto POSITIVE_NONATOMIC_LOOK_AHEAD;
4019 
4020           case META_LOOKAHEADNOT:
4021           goto NEGATIVE_LOOK_AHEAD;
4022 
4023           case META_LOOKBEHIND:
4024           case META_LOOKBEHINDNOT:
4025           case META_LOOKBEHIND_NA:
4026           *parsed_pattern++ = meta;
4027           ptr--;
4028           goto POST_LOOKBEHIND;
4029 
4030           /* The script run facilities are handled here. Unicode support is
4031           required (give an error if not, as this is a security issue). Always
4032           record a META_SCRIPT_RUN item. Then, for the atomic version, insert
4033           META_ATOMIC and remember that we need two META_KETs at the end. */
4034 
4035           case META_SCRIPT_RUN:
4036           case META_ATOMIC_SCRIPT_RUN:
4037 #ifdef SUPPORT_UNICODE
4038           *parsed_pattern++ = META_SCRIPT_RUN;
4039           nest_depth++;
4040           ptr++;
4041           if (meta == META_ATOMIC_SCRIPT_RUN)
4042             {
4043             *parsed_pattern++ = META_ATOMIC;
4044             if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4045             else if (++top_nest >= end_nests)
4046               {
4047               errorcode = ERR84;
4048               goto FAILED;
4049               }
4050             top_nest->nest_depth = nest_depth;
4051             top_nest->flags = NSF_ATOMICSR;
4052             top_nest->options = options & PARSE_TRACKED_OPTIONS;
4053             top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4054             }
4055           break;
4056 #else  /* SUPPORT_UNICODE */
4057           errorcode = ERR96;
4058           goto FAILED;
4059 #endif
4060           }
4061         }
4062 
4063 
4064       /* ---- Handle (*VERB) and (*VERB:NAME) ---- */
4065 
4066       else
4067         {
4068         vn = verbnames;
4069         if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
4070           &errorcode, cb)) goto FAILED;
4071         if (ptr >= ptrend || (*ptr != CHAR_COLON &&
4072                               *ptr != CHAR_RIGHT_PARENTHESIS))
4073           {
4074           errorcode = ERR60;  /* Malformed */
4075           goto FAILED;
4076           }
4077 
4078         /* Scan the table of verb names */
4079 
4080         for (i = 0; i < verbcount; i++)
4081           {
4082           if (namelen == verbs[i].len &&
4083               PRIV(strncmp_c8)(name, vn, namelen) == 0)
4084             break;
4085           vn += verbs[i].len + 1;
4086           }
4087 
4088         if (i >= verbcount)
4089           {
4090           errorcode = ERR60;  /* Verb not recognized */
4091           goto FAILED;
4092           }
4093 
4094         /* An empty argument is treated as no argument. */
4095 
4096         if (*ptr == CHAR_COLON && ptr + 1 < ptrend &&
4097              ptr[1] == CHAR_RIGHT_PARENTHESIS)
4098           ptr++;    /* Advance to the closing parens */
4099 
4100         /* Check for mandatory non-empty argument; this is (*MARK) */
4101 
4102         if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON)
4103           {
4104           errorcode = ERR66;
4105           goto FAILED;
4106           }
4107 
4108         /* Remember where this verb, possibly with a preceding (*MARK), starts,
4109         for handling quantified (*ACCEPT). */
4110 
4111         verbstartptr = parsed_pattern;
4112         okquantifier = (verbs[i].meta == META_ACCEPT);
4113 
4114         /* It appears that Perl allows any characters whatsoever, other than a
4115         closing parenthesis, to appear in arguments ("names"), so we no longer
4116         insist on letters, digits, and underscores. Perl does not, however, do
4117         any interpretation within arguments, and has no means of including a
4118         closing parenthesis. PCRE supports escape processing but only when it
4119         is requested by an option. We set inverbname TRUE here, and let the
4120         main loop take care of this so that escape and \x processing is done by
4121         the main code above. */
4122 
4123         if (*ptr++ == CHAR_COLON)   /* Skip past : or ) */
4124           {
4125           /* Some optional arguments can be treated as a preceding (*MARK) */
4126 
4127           if (verbs[i].has_arg < 0)
4128             {
4129             add_after_mark = verbs[i].meta;
4130             *parsed_pattern++ = META_MARK;
4131             }
4132 
4133           /* The remaining verbs with arguments (except *MARK) need a different
4134           opcode. */
4135 
4136           else
4137             {
4138             *parsed_pattern++ = verbs[i].meta +
4139               ((verbs[i].meta != META_MARK)? 0x00010000u:0);
4140             }
4141 
4142           /* Set up for reading the name in the main loop. */
4143 
4144           verblengthptr = parsed_pattern++;
4145           verbnamestart = ptr;
4146           inverbname = TRUE;
4147           }
4148         else  /* No verb "name" argument */
4149           {
4150           *parsed_pattern++ = verbs[i].meta;
4151           }
4152         }     /* End of (*VERB) handling */
4153       break;  /* Done with this parenthesis */
4154       }       /* End of groups that don't start with (? */
4155 
4156 
4157     /* ---- Items starting (? ---- */
4158 
4159     /* The type of item is determined by what follows (?. Handle (?| and option
4160     changes under "default" because both need a new block on the nest stack.
4161     Comments starting with (?# are handled above. Note that there is some
4162     ambiguity about the sequence (?- because if a digit follows it's a relative
4163     recursion or subroutine call whereas otherwise it's an option unsetting. */
4164 
4165     if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4166 
4167     switch(*ptr)
4168       {
4169       default:
4170       if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1]))
4171         goto RECURSION_BYNUMBER;  /* The + case is handled by CHAR_PLUS */
4172 
4173       /* We now have either (?| or a (possibly empty) option setting,
4174       optionally followed by a non-capturing group. */
4175 
4176       nest_depth++;
4177       if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4178       else if (++top_nest >= end_nests)
4179         {
4180         errorcode = ERR84;
4181         goto FAILED;
4182         }
4183       top_nest->nest_depth = nest_depth;
4184       top_nest->flags = 0;
4185       top_nest->options = options & PARSE_TRACKED_OPTIONS;
4186       top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4187 
4188       /* Start of non-capturing group that resets the capture count for each
4189       branch. */
4190 
4191       if (*ptr == CHAR_VERTICAL_LINE)
4192         {
4193         top_nest->reset_group = (uint16_t)cb->bracount;
4194         top_nest->max_group = (uint16_t)cb->bracount;
4195         top_nest->flags |= NSF_RESET;
4196         cb->external_flags |= PCRE2_DUPCAPUSED;
4197         *parsed_pattern++ = META_NOCAPTURE;
4198         ptr++;
4199         }
4200 
4201       /* Scan for options aimnrsxJU to be set or unset. */
4202 
4203       else
4204         {
4205         BOOL hyphenok = TRUE;
4206         uint32_t oldoptions = options;
4207         uint32_t oldxoptions = xoptions;
4208 
4209         top_nest->reset_group = 0;
4210         top_nest->max_group = 0;
4211         set = unset = 0;
4212         optset = &set;
4213         xset = xunset = 0;
4214         xoptset = &xset;
4215 
4216         /* ^ at the start unsets irmnsx and disables the subsequent use of - */
4217 
4218         if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)
4219           {
4220           options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE|
4221                        PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE);
4222           xoptions &= ~(PCRE2_EXTRA_CASELESS_RESTRICT);
4223           hyphenok = FALSE;
4224           ptr++;
4225           }
4226 
4227         while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&
4228                                *ptr != CHAR_COLON)
4229           {
4230           switch (*ptr++)
4231             {
4232             case CHAR_MINUS:
4233             if (!hyphenok)
4234               {
4235               errorcode = ERR94;
4236               ptr--;  /* Correct the offset */
4237               goto FAILED;
4238               }
4239             optset = &unset;
4240             xoptset = &xunset;
4241             hyphenok = FALSE;
4242             break;
4243 
4244             /* There are some two-character sequences that start with 'a'. */
4245 
4246             case CHAR_a:
4247             if (ptr < ptrend)
4248               {
4249               if (*ptr == CHAR_D)
4250                 {
4251                 *xoptset |= PCRE2_EXTRA_ASCII_BSD;
4252                 ptr++;
4253                 break;
4254                 }
4255               if (*ptr == CHAR_P)
4256                 {
4257                 *xoptset |= (PCRE2_EXTRA_ASCII_POSIX|PCRE2_EXTRA_ASCII_DIGIT);
4258                 ptr++;
4259                 break;
4260                 }
4261               if (*ptr == CHAR_S)
4262                 {
4263                 *xoptset |= PCRE2_EXTRA_ASCII_BSS;
4264                 ptr++;
4265                 break;
4266                 }
4267               if (*ptr == CHAR_T)
4268                 {
4269                 *xoptset |= PCRE2_EXTRA_ASCII_DIGIT;
4270                 ptr++;
4271                 break;
4272                 }
4273               if (*ptr == CHAR_W)
4274                 {
4275                 *xoptset |= PCRE2_EXTRA_ASCII_BSW;
4276                 ptr++;
4277                 break;
4278                 }
4279               }
4280             *xoptset |= PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|
4281                         PCRE2_EXTRA_ASCII_BSW|
4282                         PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_ASCII_POSIX;
4283             break;
4284 
4285             case CHAR_J:  /* Record that it changed in the external options */
4286             *optset |= PCRE2_DUPNAMES;
4287             cb->external_flags |= PCRE2_JCHANGED;
4288             break;
4289 
4290             case CHAR_i: *optset |= PCRE2_CASELESS; break;
4291             case CHAR_m: *optset |= PCRE2_MULTILINE; break;
4292             case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break;
4293             case CHAR_r: *xoptset|= PCRE2_EXTRA_CASELESS_RESTRICT; break;
4294             case CHAR_s: *optset |= PCRE2_DOTALL; break;
4295             case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
4296 
4297             /* If x appears twice it sets the extended extended option. */
4298 
4299             case CHAR_x:
4300             *optset |= PCRE2_EXTENDED;
4301             if (ptr < ptrend && *ptr == CHAR_x)
4302               {
4303               *optset |= PCRE2_EXTENDED_MORE;
4304               ptr++;
4305               }
4306             break;
4307 
4308             default:
4309             errorcode = ERR11;
4310             ptr--;    /* Correct the offset */
4311             goto FAILED;
4312             }
4313           }
4314 
4315         /* If we are setting extended without extended-more, ensure that any
4316         existing extended-more gets unset. Also, unsetting extended must also
4317         unset extended-more. */
4318 
4319         if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED ||
4320             (unset & PCRE2_EXTENDED) != 0)
4321           unset |= PCRE2_EXTENDED_MORE;
4322 
4323         options = (options | set) & (~unset);
4324         xoptions = (xoptions | xset) & (~xunset);
4325 
4326         /* If the options ended with ')' this is not the start of a nested
4327         group with option changes, so the options change at this level.
4328         In this case, if the previous level set up a nest block, discard the
4329         one we have just created. Otherwise adjust it for the previous level.
4330         If the options ended with ':' we are starting a non-capturing group,
4331         possibly with an options setting. */
4332 
4333         if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4334         if (*ptr++ == CHAR_RIGHT_PARENTHESIS)
4335           {
4336           nest_depth--;  /* This is not a nested group after all. */
4337           if (top_nest > (nest_save *)(cb->start_workspace) &&
4338               (top_nest-1)->nest_depth == nest_depth) top_nest--;
4339           else top_nest->nest_depth = nest_depth;
4340           }
4341         else *parsed_pattern++ = META_NOCAPTURE;
4342 
4343         /* If nothing changed, no need to record. */
4344 
4345         if (options != oldoptions || xoptions != oldxoptions)
4346           {
4347           *parsed_pattern++ = META_OPTIONS;
4348           *parsed_pattern++ = options;
4349           *parsed_pattern++ = xoptions;
4350           }
4351         }     /* End options processing */
4352       break;  /* End default case after (? */
4353 
4354 
4355       /* ---- Python syntax support ---- */
4356 
4357       case CHAR_P:
4358       if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4359 
4360       /* (?P<name> is the same as (?<name>, which defines a named group. */
4361 
4362       if (*ptr == CHAR_LESS_THAN_SIGN)
4363         {
4364         terminator = CHAR_GREATER_THAN_SIGN;
4365         goto DEFINE_NAME;
4366         }
4367 
4368       /* (?P>name) is the same as (?&name), which is a recursion or subroutine
4369       call. */
4370 
4371       if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME;
4372 
4373       /* (?P=name) is the same as \k<name>, a back reference by name. Anything
4374       else after (?P is an error. */
4375 
4376       if (*ptr != CHAR_EQUALS_SIGN)
4377         {
4378         errorcode = ERR41;
4379         goto FAILED;
4380         }
4381       if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4382           &namelen, &errorcode, cb)) goto FAILED;
4383       *parsed_pattern++ = META_BACKREF_BYNAME;
4384       *parsed_pattern++ = namelen;
4385       PUTOFFSET(offset, parsed_pattern);
4386       okquantifier = TRUE;
4387       break;   /* End of (?P processing */
4388 
4389 
4390       /* ---- Recursion/subroutine calls by number ---- */
4391 
4392       case CHAR_R:
4393       i = 0;         /* (?R) == (?R0) */
4394       ptr++;
4395       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4396         {
4397         errorcode = ERR58;
4398         goto FAILED;
4399         }
4400       goto SET_RECURSION;
4401 
4402       /* An item starting (?- followed by a digit comes here via the "default"
4403       case because (?- followed by a non-digit is an options setting. */
4404 
4405       case CHAR_PLUS:
4406       if (ptrend - ptr < 2 || !IS_DIGIT(ptr[1]))
4407         {
4408         errorcode = ERR29;   /* Missing number */
4409         goto FAILED;
4410         }
4411       /* Fall through */
4412 
4413       case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
4414       case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
4415       RECURSION_BYNUMBER:
4416       if (!read_number(&ptr, ptrend,
4417           (IS_DIGIT(*ptr))? -1:(int)(cb->bracount), /* + and - are relative */
4418           MAX_GROUP_NUMBER, ERR61,
4419           &i, &errorcode)) goto FAILED;
4420       if (i < 0)  /* NB (?0) is permitted */
4421         {
4422         errorcode = ERR15;   /* Unknown group */
4423         goto FAILED_BACK;
4424         }
4425       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4426         goto UNCLOSED_PARENTHESIS;
4427 
4428       SET_RECURSION:
4429       *parsed_pattern++ = META_RECURSE | (uint32_t)i;
4430       offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4431       ptr++;
4432       PUTOFFSET(offset, parsed_pattern);
4433       okquantifier = TRUE;
4434       break;  /* End of recursive call by number handling */
4435 
4436 
4437       /* ---- Recursion/subroutine calls by name ---- */
4438 
4439       case CHAR_AMPERSAND:
4440       RECURSE_BY_NAME:
4441       if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4442           &namelen, &errorcode, cb)) goto FAILED;
4443       *parsed_pattern++ = META_RECURSE_BYNAME;
4444       *parsed_pattern++ = namelen;
4445       PUTOFFSET(offset, parsed_pattern);
4446       okquantifier = TRUE;
4447       break;
4448 
4449       /* ---- Callout with numerical or string argument ---- */
4450 
4451       case CHAR_C:
4452       if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4453 
4454       /* If the previous item was a condition starting (?(? an assertion,
4455       optionally preceded by a callout, is expected. This is checked later on,
4456       during actual compilation. However we need to identify this kind of
4457       assertion in this pass because it must not be qualified. The value of
4458       expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4459       for a callout - still leaving a positive value that identifies the
4460       assertion. Multiple callouts or any other items will make it zero or
4461       less, which doesn't matter because they will cause an error later. */
4462 
4463       expect_cond_assert = prev_expect_cond_assert - 1;
4464 
4465       /* If previous_callout is not NULL, it means this follows a previous
4466       callout. If it was a manual callout, do nothing; this means its "length
4467       of next pattern item" field will remain zero. If it was an automatic
4468       callout, abolish it. */
4469 
4470       if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 &&
4471           previous_callout == parsed_pattern - 4 &&
4472           parsed_pattern[-1] == 255)
4473         parsed_pattern = previous_callout;
4474 
4475       /* Save for updating next pattern item length, and skip one item before
4476       completing. */
4477 
4478       previous_callout = parsed_pattern;
4479       after_manual_callout = 1;
4480 
4481       /* Handle a string argument; specific delimiter is required. */
4482 
4483       if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
4484         {
4485         PCRE2_SIZE calloutlength;
4486         PCRE2_SPTR startptr = ptr;
4487 
4488         delimiter = 0;
4489         for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
4490           {
4491           if (*ptr == PRIV(callout_start_delims)[i])
4492             {
4493             delimiter = PRIV(callout_end_delims)[i];
4494             break;
4495             }
4496           }
4497         if (delimiter == 0)
4498           {
4499           errorcode = ERR82;
4500           goto FAILED;
4501           }
4502 
4503         *parsed_pattern = META_CALLOUT_STRING;
4504         parsed_pattern += 3;   /* Skip pattern info */
4505 
4506         for (;;)
4507           {
4508           if (++ptr >= ptrend)
4509             {
4510             errorcode = ERR81;
4511             ptr = startptr;   /* To give a more useful message */
4512             goto FAILED;
4513             }
4514           if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter))
4515             break;
4516           }
4517 
4518         calloutlength = (PCRE2_SIZE)(ptr - startptr);
4519         if (calloutlength > UINT32_MAX)
4520           {
4521           errorcode = ERR72;
4522           goto FAILED;
4523           }
4524         *parsed_pattern++ = (uint32_t)calloutlength;
4525         offset = (PCRE2_SIZE)(startptr - cb->start_pattern);
4526         PUTOFFSET(offset, parsed_pattern);
4527         }
4528 
4529       /* Handle a callout with an optional numerical argument, which must be
4530       less than or equal to 255. A missing argument gives 0. */
4531 
4532       else
4533         {
4534         int n = 0;
4535         *parsed_pattern = META_CALLOUT_NUMBER;     /* Numerical callout */
4536         parsed_pattern += 3;                       /* Skip pattern info */
4537         while (ptr < ptrend && IS_DIGIT(*ptr))
4538           {
4539           n = n * 10 + *ptr++ - CHAR_0;
4540           if (n > 255)
4541             {
4542             errorcode = ERR38;
4543             goto FAILED;
4544             }
4545           }
4546         *parsed_pattern++ = n;
4547         }
4548 
4549       /* Both formats must have a closing parenthesis */
4550 
4551       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4552         {
4553         errorcode = ERR39;
4554         goto FAILED;
4555         }
4556       ptr++;
4557 
4558       /* Remember the offset to the next item in the pattern, and set a default
4559       length. This should get updated after the next item is read. */
4560 
4561       previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
4562       previous_callout[2] = 0;
4563       break;                  /* End callout */
4564 
4565 
4566       /* ---- Conditional group ---- */
4567 
4568       /* A condition can be an assertion, a number (referring to a numbered
4569       group's having been set), a name (referring to a named group), or 'R',
4570       referring to overall recursion. R<digits> and R&name are also permitted
4571       for recursion state tests. Numbers may be preceded by + or - to specify a
4572       relative group number.
4573 
4574       There are several syntaxes for testing a named group: (?(name)) is used
4575       by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4576 
4577       There are two unfortunate ambiguities. 'R' can be the recursive thing or
4578       the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be
4579       the Perl DEFINE feature or the Python named test. We look for a name
4580       first; if not found, we try the other case.
4581 
4582       For compatibility with auto-callouts, we allow a callout to be specified
4583       before a condition that is an assertion. */
4584 
4585       case CHAR_LEFT_PARENTHESIS:
4586       if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4587       nest_depth++;
4588 
4589       /* If the next character is ? or * there must be an assertion next
4590       (optionally preceded by a callout). We do not check this here, but
4591       instead we set expect_cond_assert to 2. If this is still greater than
4592       zero (callouts decrement it) when the next assertion is read, it will be
4593       marked as a condition that must not be repeated. A value greater than
4594       zero also causes checking that an assertion (possibly with callout)
4595       follows. */
4596 
4597       if (*ptr == CHAR_QUESTION_MARK || *ptr == CHAR_ASTERISK)
4598         {
4599         *parsed_pattern++ = META_COND_ASSERT;
4600         ptr--;   /* Pull pointer back to the opening parenthesis. */
4601         expect_cond_assert = 2;
4602         break;  /* End of conditional */
4603         }
4604 
4605       /* Handle (?([+-]number)... */
4606 
4607       if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
4608           &errorcode))
4609         {
4610         if (i <= 0)
4611           {
4612           errorcode = ERR15;
4613           goto FAILED;
4614           }
4615         *parsed_pattern++ = META_COND_NUMBER;
4616         offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4617         PUTOFFSET(offset, parsed_pattern);
4618         *parsed_pattern++ = i;
4619         }
4620       else if (errorcode != 0) goto FAILED;   /* Number too big */
4621 
4622       /* No number found. Handle the special case (?(VERSION[>]=n.m)... */
4623 
4624       else if (ptrend - ptr >= 10 &&
4625                PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
4626                ptr[7] != CHAR_RIGHT_PARENTHESIS)
4627         {
4628         uint32_t ge = 0;
4629         int major = 0;
4630         int minor = 0;
4631 
4632         ptr += 7;
4633         if (*ptr == CHAR_GREATER_THAN_SIGN)
4634           {
4635           ge = 1;
4636           ptr++;
4637           }
4638 
4639         /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
4640         references its argument twice. */
4641 
4642         if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
4643           goto BAD_VERSION_CONDITION;
4644 
4645         if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode))
4646           goto FAILED;
4647 
4648         if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4649         if (*ptr == CHAR_DOT)
4650           {
4651           if (++ptr >= ptrend || !IS_DIGIT(*ptr)) goto BAD_VERSION_CONDITION;
4652           minor = (*ptr++ - CHAR_0) * 10;
4653           if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4654           if (IS_DIGIT(*ptr)) minor += *ptr++ - CHAR_0;
4655           if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4656             goto BAD_VERSION_CONDITION;
4657           }
4658 
4659         *parsed_pattern++ = META_COND_VERSION;
4660         *parsed_pattern++ = ge;
4661         *parsed_pattern++ = major;
4662         *parsed_pattern++ = minor;
4663         }
4664 
4665       /* All the remaining cases now require us to read a name. We cannot at
4666       this stage distinguish ambiguous cases such as (?(R12) which might be a
4667       recursion test by number or a name, because the named groups have not yet
4668       all been identified. Those cases are treated as names, but given a
4669       different META code. */
4670 
4671       else
4672         {
4673         BOOL was_r_ampersand = FALSE;
4674 
4675         if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND)
4676           {
4677           terminator = CHAR_RIGHT_PARENTHESIS;
4678           was_r_ampersand = TRUE;
4679           ptr++;
4680           }
4681         else if (*ptr == CHAR_LESS_THAN_SIGN)
4682           terminator = CHAR_GREATER_THAN_SIGN;
4683         else if (*ptr == CHAR_APOSTROPHE)
4684           terminator = CHAR_APOSTROPHE;
4685         else
4686           {
4687           terminator = CHAR_RIGHT_PARENTHESIS;
4688           ptr--;   /* Point to char before name */
4689           }
4690         if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4691             &errorcode, cb)) goto FAILED;
4692 
4693         /* Handle (?(R&name) */
4694 
4695         if (was_r_ampersand)
4696           {
4697           *parsed_pattern = META_COND_RNAME;
4698           ptr--;   /* Back to closing parens */
4699           }
4700 
4701         /* Handle (?(name). If the name is "DEFINE" we identify it with a
4702         special code. Likewise if the name consists of R followed only by
4703         digits. Otherwise, handle it like a quoted name. */
4704 
4705         else if (terminator == CHAR_RIGHT_PARENTHESIS)
4706           {
4707           if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
4708             *parsed_pattern = META_COND_DEFINE;
4709           else
4710             {
4711             for (i = 1; i < (int)namelen; i++)
4712               if (!IS_DIGIT(name[i])) break;
4713             *parsed_pattern = (*name == CHAR_R && i >= (int)namelen)?
4714               META_COND_RNUMBER : META_COND_NAME;
4715             }
4716           ptr--;   /* Back to closing parens */
4717           }
4718 
4719         /* Handle (?('name') or (?(<name>) */
4720 
4721         else *parsed_pattern = META_COND_NAME;
4722 
4723         /* All these cases except DEFINE end with the name length and offset;
4724         DEFINE just has an offset (for the "too many branches" error). */
4725 
4726         if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen;
4727         PUTOFFSET(offset, parsed_pattern);
4728         }  /* End cases that read a name */
4729 
4730       /* Check the closing parenthesis of the condition */
4731 
4732       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4733         {
4734         errorcode = ERR24;
4735         goto FAILED;
4736         }
4737       ptr++;
4738       break;  /* End of condition processing */
4739 
4740 
4741       /* ---- Atomic group ---- */
4742 
4743       case CHAR_GREATER_THAN_SIGN:
4744       ATOMIC_GROUP:                          /* Come from (*atomic: */
4745       *parsed_pattern++ = META_ATOMIC;
4746       nest_depth++;
4747       ptr++;
4748       break;
4749 
4750 
4751       /* ---- Lookahead assertions ---- */
4752 
4753       case CHAR_EQUALS_SIGN:
4754       POSITIVE_LOOK_AHEAD:                   /* Come from (*pla: */
4755       *parsed_pattern++ = META_LOOKAHEAD;
4756       ptr++;
4757       goto POST_ASSERTION;
4758 
4759       case CHAR_ASTERISK:
4760       POSITIVE_NONATOMIC_LOOK_AHEAD:         /* Come from (?* */
4761       *parsed_pattern++ = META_LOOKAHEAD_NA;
4762       ptr++;
4763       goto POST_ASSERTION;
4764 
4765       case CHAR_EXCLAMATION_MARK:
4766       NEGATIVE_LOOK_AHEAD:                   /* Come from (*nla: */
4767       *parsed_pattern++ = META_LOOKAHEADNOT;
4768       ptr++;
4769       goto POST_ASSERTION;
4770 
4771 
4772       /* ---- Lookbehind assertions ---- */
4773 
4774       /* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?<
4775       is the start of the name of a capturing group. */
4776 
4777       case CHAR_LESS_THAN_SIGN:
4778       if (ptrend - ptr <= 1 ||
4779          (ptr[1] != CHAR_EQUALS_SIGN &&
4780           ptr[1] != CHAR_EXCLAMATION_MARK &&
4781           ptr[1] != CHAR_ASTERISK))
4782         {
4783         terminator = CHAR_GREATER_THAN_SIGN;
4784         goto DEFINE_NAME;
4785         }
4786       *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
4787         META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)?
4788         META_LOOKBEHINDNOT : META_LOOKBEHIND_NA;
4789 
4790       POST_LOOKBEHIND:           /* Come from (*plb: (*naplb: and (*nlb: */
4791       *has_lookbehind = TRUE;
4792       offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4793       PUTOFFSET(offset, parsed_pattern);
4794       ptr += 2;
4795       /* Fall through */
4796 
4797       /* If the previous item was a condition starting (?(? an assertion,
4798       optionally preceded by a callout, is expected. This is checked later on,
4799       during actual compilation. However we need to identify this kind of
4800       assertion in this pass because it must not be qualified. The value of
4801       expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4802       for a callout - still leaving a positive value that identifies the
4803       assertion. Multiple callouts or any other items will make it zero or
4804       less, which doesn't matter because they will cause an error later. */
4805 
4806       POST_ASSERTION:
4807       nest_depth++;
4808       if (prev_expect_cond_assert > 0)
4809         {
4810         if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4811         else if (++top_nest >= end_nests)
4812           {
4813           errorcode = ERR84;
4814           goto FAILED;
4815           }
4816         top_nest->nest_depth = nest_depth;
4817         top_nest->flags = NSF_CONDASSERT;
4818         top_nest->options = options & PARSE_TRACKED_OPTIONS;
4819         top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4820         }
4821       break;
4822 
4823 
4824       /* ---- Define a named group ---- */
4825 
4826       /* A named group may be defined as (?'name') or (?<name>). In the latter
4827       case we jump to DEFINE_NAME from the disambiguation of (?< above with the
4828       terminator set to '>'. */
4829 
4830       case CHAR_APOSTROPHE:
4831       terminator = CHAR_APOSTROPHE;    /* Terminator */
4832 
4833       DEFINE_NAME:
4834       if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4835           &errorcode, cb)) goto FAILED;
4836 
4837       /* We have a name for this capturing group. It is also assigned a number,
4838       which is its primary means of identification. */
4839 
4840       if (cb->bracount >= MAX_GROUP_NUMBER)
4841         {
4842         errorcode = ERR97;
4843         goto FAILED;
4844         }
4845       cb->bracount++;
4846       *parsed_pattern++ = META_CAPTURE | cb->bracount;
4847       nest_depth++;
4848 
4849       /* Check not too many names */
4850 
4851       if (cb->names_found >= MAX_NAME_COUNT)
4852         {
4853         errorcode = ERR49;
4854         goto FAILED;
4855         }
4856 
4857       /* Adjust the entry size to accommodate the longest name found. */
4858 
4859       if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
4860         cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
4861 
4862       /* Scan the list to check for duplicates. For duplicate names, if the
4863       number is the same, break the loop, which causes the name to be
4864       discarded; otherwise, if DUPNAMES is not set, give an error.
4865       If it is set, allow the name with a different number, but continue
4866       scanning in case this is a duplicate with the same number. For
4867       non-duplicate names, give an error if the number is duplicated. */
4868 
4869       isdupname = FALSE;
4870       ng = cb->named_groups;
4871       for (i = 0; i < cb->names_found; i++, ng++)
4872         {
4873         if (namelen == ng->length &&
4874             PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0)
4875           {
4876           if (ng->number == cb->bracount) break;
4877           if ((options & PCRE2_DUPNAMES) == 0)
4878             {
4879             errorcode = ERR43;
4880             goto FAILED;
4881             }
4882           isdupname = ng->isdup = TRUE;     /* Mark as a duplicate */
4883           cb->dupnames = TRUE;              /* Duplicate names exist */
4884           }
4885         else if (ng->number == cb->bracount)
4886           {
4887           errorcode = ERR65;
4888           goto FAILED;
4889           }
4890         }
4891 
4892       if (i < cb->names_found) break;   /* Ignore duplicate with same number */
4893 
4894       /* Increase the list size if necessary */
4895 
4896       if (cb->names_found >= cb->named_group_list_size)
4897         {
4898         uint32_t newsize = cb->named_group_list_size * 2;
4899         named_group *newspace =
4900           cb->cx->memctl.malloc(newsize * sizeof(named_group),
4901           cb->cx->memctl.memory_data);
4902         if (newspace == NULL)
4903           {
4904           errorcode = ERR21;
4905           goto FAILED;
4906           }
4907 
4908         memcpy(newspace, cb->named_groups,
4909           cb->named_group_list_size * sizeof(named_group));
4910         if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
4911           cb->cx->memctl.free((void *)cb->named_groups,
4912           cb->cx->memctl.memory_data);
4913         cb->named_groups = newspace;
4914         cb->named_group_list_size = newsize;
4915         }
4916 
4917       /* Add this name to the list */
4918 
4919       cb->named_groups[cb->names_found].name = name;
4920       cb->named_groups[cb->names_found].length = (uint16_t)namelen;
4921       cb->named_groups[cb->names_found].number = cb->bracount;
4922       cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname;
4923       cb->names_found++;
4924       break;
4925       }        /* End of (? switch */
4926     break;     /* End of ( handling */
4927 
4928 
4929     /* ---- Branch terminators ---- */
4930 
4931     /* Alternation: reset the capture count if we are in a (?| group. */
4932 
4933     case CHAR_VERTICAL_LINE:
4934     if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
4935         (top_nest->flags & NSF_RESET) != 0)
4936       {
4937       if (cb->bracount > top_nest->max_group)
4938         top_nest->max_group = (uint16_t)cb->bracount;
4939       cb->bracount = top_nest->reset_group;
4940       }
4941     *parsed_pattern++ = META_ALT;
4942     break;
4943 
4944     /* End of group; reset the capture count to the maximum if we are in a (?|
4945     group and/or reset the options that are tracked during parsing. Disallow
4946     quantifier for a condition that is an assertion. */
4947 
4948     case CHAR_RIGHT_PARENTHESIS:
4949     okquantifier = TRUE;
4950     if (top_nest != NULL && top_nest->nest_depth == nest_depth)
4951       {
4952       options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options;
4953       xoptions = (xoptions & ~PARSE_TRACKED_EXTRA_OPTIONS) | top_nest->xoptions;
4954       if ((top_nest->flags & NSF_RESET) != 0 &&
4955           top_nest->max_group > cb->bracount)
4956         cb->bracount = top_nest->max_group;
4957       if ((top_nest->flags & NSF_CONDASSERT) != 0)
4958         okquantifier = FALSE;
4959 
4960       if ((top_nest->flags & NSF_ATOMICSR) != 0)
4961         {
4962         *parsed_pattern++ = META_KET;
4963         }
4964 
4965       if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
4966         else top_nest--;
4967       }
4968     if (nest_depth == 0)    /* Unmatched closing parenthesis */
4969       {
4970       errorcode = ERR22;
4971       goto FAILED_BACK;
4972       }
4973     nest_depth--;
4974     *parsed_pattern++ = META_KET;
4975     break;
4976     }  /* End of switch on pattern character */
4977   }    /* End of main character scan loop */
4978 
4979 /* End of pattern reached. Check for missing ) at the end of a verb name. */
4980 
4981 if (inverbname && ptr >= ptrend)
4982   {
4983   errorcode = ERR60;
4984   goto FAILED;
4985   }
4986 
4987 /* Manage callout for the final item */
4988 
4989 PARSED_END:
4990 parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
4991   parsed_pattern, cb);
4992 
4993 /* Insert trailing items for word and line matching (features provided for the
4994 benefit of pcre2grep). */
4995 
4996 if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
4997   {
4998   *parsed_pattern++ = META_KET;
4999   *parsed_pattern++ = META_DOLLAR;
5000   }
5001 else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
5002   {
5003   *parsed_pattern++ = META_KET;
5004   *parsed_pattern++ = META_ESCAPE + ESC_b;
5005   }
5006 
5007 /* Terminate the parsed pattern, then return success if all groups are closed.
5008 Otherwise we have unclosed parentheses. */
5009 
5010 if (parsed_pattern >= parsed_pattern_end)
5011   {
5012   errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
5013   goto FAILED;
5014   }
5015 
5016 *parsed_pattern = META_END;
5017 if (nest_depth == 0) return 0;
5018 
5019 UNCLOSED_PARENTHESIS:
5020 errorcode = ERR14;
5021 
5022 /* Come here for all failures. */
5023 
5024 FAILED:
5025 cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);
5026 return errorcode;
5027 
5028 /* Some errors need to indicate the previous character. */
5029 
5030 FAILED_BACK:
5031 ptr--;
5032 goto FAILED;
5033 
5034 /* This failure happens several times. */
5035 
5036 BAD_VERSION_CONDITION:
5037 errorcode = ERR79;
5038 goto FAILED;
5039 }
5040 
5041 
5042 
5043 /*************************************************
5044 *       Find first significant opcode            *
5045 *************************************************/
5046 
5047 /* This is called by several functions that scan a compiled expression looking
5048 for a fixed first character, or an anchoring opcode etc. It skips over things
5049 that do not influence this. For some calls, it makes sense to skip negative
5050 forward and all backward assertions, and also the \b assertion; for others it
5051 does not.
5052 
5053 Arguments:
5054   code         pointer to the start of the group
5055   skipassert   TRUE if certain assertions are to be skipped
5056 
5057 Returns:       pointer to the first significant opcode
5058 */
5059 
5060 static const PCRE2_UCHAR*
first_significant_code(PCRE2_SPTR code,BOOL skipassert)5061 first_significant_code(PCRE2_SPTR code, BOOL skipassert)
5062 {
5063 for (;;)
5064   {
5065   switch ((int)*code)
5066     {
5067     case OP_ASSERT_NOT:
5068     case OP_ASSERTBACK:
5069     case OP_ASSERTBACK_NOT:
5070     case OP_ASSERTBACK_NA:
5071     if (!skipassert) return code;
5072     do code += GET(code, 1); while (*code == OP_ALT);
5073     code += PRIV(OP_lengths)[*code];
5074     break;
5075 
5076     case OP_WORD_BOUNDARY:
5077     case OP_NOT_WORD_BOUNDARY:
5078     case OP_UCP_WORD_BOUNDARY:
5079     case OP_NOT_UCP_WORD_BOUNDARY:
5080     if (!skipassert) return code;
5081     /* Fall through */
5082 
5083     case OP_CALLOUT:
5084     case OP_CREF:
5085     case OP_DNCREF:
5086     case OP_RREF:
5087     case OP_DNRREF:
5088     case OP_FALSE:
5089     case OP_TRUE:
5090     code += PRIV(OP_lengths)[*code];
5091     break;
5092 
5093     case OP_CALLOUT_STR:
5094     code += GET(code, 1 + 2*LINK_SIZE);
5095     break;
5096 
5097     case OP_SKIPZERO:
5098     code += 2 + GET(code, 2) + LINK_SIZE;
5099     break;
5100 
5101     case OP_COND:
5102     case OP_SCOND:
5103     if (code[1+LINK_SIZE] != OP_FALSE ||   /* Not DEFINE */
5104         code[GET(code, 1)] != OP_KET)      /* More than one branch */
5105       return code;
5106     code += GET(code, 1) + 1 + LINK_SIZE;
5107     break;
5108 
5109     case OP_MARK:
5110     case OP_COMMIT_ARG:
5111     case OP_PRUNE_ARG:
5112     case OP_SKIP_ARG:
5113     case OP_THEN_ARG:
5114     code += code[1] + PRIV(OP_lengths)[*code];
5115     break;
5116 
5117     default:
5118     return code;
5119     }
5120   }
5121 /* Control never reaches here */
5122 }
5123 
5124 
5125 
5126 #ifdef SUPPORT_UNICODE
5127 /*************************************************
5128 *           Get othercase range                  *
5129 *************************************************/
5130 
5131 /* This function is passed the start and end of a class range in UCP mode. For
5132 single characters the range may be just one character long. The function
5133 searches up the characters, looking for ranges of characters in the "other"
5134 case. Each call returns the next one, updating the start address. A character
5135 with multiple other cases is returned on its own with a special return value.
5136 
5137 Arguments:
5138   cptr        points to starting character value; updated
5139   d           end value
5140   ocptr       where to put start of othercase range
5141   odptr       where to put end of othercase range
5142   restricted  TRUE if caseless restriction applies
5143 
5144 Yield:        -1 when no more
5145                0 when a range is returned
5146               >0 the CASESET offset for char with multiple other cases;
5147                  for this return, *ocptr contains the original
5148 */
5149 
5150 static int
get_othercase_range(uint32_t * cptr,uint32_t d,uint32_t * ocptr,uint32_t * odptr,BOOL restricted)5151 get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr,
5152   uint32_t *odptr, BOOL restricted)
5153 {
5154 uint32_t c, othercase, next;
5155 unsigned int co;
5156 
5157 /* Find the first character that has an other case. If it has multiple other
5158 cases, return its case offset value. When CASELESS_RESTRICT is set, ignore the
5159 multi-case entries that begin with ASCII values. In 32-bit mode, a value
5160 greater than the Unicode maximum ends the range. */
5161 
5162 for (c = *cptr; c <= d; c++)
5163   {
5164 #if PCRE2_CODE_UNIT_WIDTH == 32
5165   if (c > MAX_UTF_CODE_POINT) return -1;
5166 #endif
5167   if ((co = UCD_CASESET(c)) != 0 &&
5168       (!restricted || PRIV(ucd_caseless_sets)[co] > 127))
5169     {
5170     *ocptr = c++;   /* Character that has the set */
5171     *cptr = c;      /* Rest of input range */
5172     return (int)co;
5173     }
5174 
5175    /* This is not a valid multiple-case character. Check that the single other
5176    case is different to the original. We don't need to check "restricted" here
5177    because the non-ASCII characters with multiple cases that include an ASCII
5178    character don't have a different "othercase". */
5179 
5180   if ((othercase = UCD_OTHERCASE(c)) != c) break;
5181   }
5182 
5183 if (c > d) return -1;  /* Reached end of range */
5184 
5185 /* Found a character that has a single other case. Search for the end of the
5186 range, which is either the end of the input range, or a character that has zero
5187 or more than one other cases. */
5188 
5189 *ocptr = othercase;
5190 next = othercase + 1;
5191 
5192 for (++c; c <= d; c++)
5193   {
5194   if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
5195   next++;
5196   }
5197 
5198 *odptr = next - 1;     /* End of othercase range */
5199 *cptr = c;             /* Rest of input range */
5200 return 0;
5201 }
5202 #endif  /* SUPPORT_UNICODE */
5203 
5204 
5205 
5206 /*************************************************
5207 * Add a character or range to a class (internal) *
5208 *************************************************/
5209 
5210 /* This function packages up the logic of adding a character or range of
5211 characters to a class. The character values in the arguments will be within the
5212 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
5213 called only from within the "add to class" group of functions, some of which
5214 are recursive and mutually recursive. The external entry point is
5215 add_to_class().
5216 
5217 Arguments:
5218   classbits     the bit map for characters < 256
5219   uchardptr     points to the pointer for extra data
5220   options       the options bits
5221   xoptions      the extra options bits
5222   cb            compile data
5223   start         start of range character
5224   end           end of range character
5225 
5226 Returns:        the number of < 256 characters added
5227                 the pointer to extra data is updated
5228 */
5229 
5230 static unsigned int
add_to_class_internal(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,uint32_t xoptions,compile_block * cb,uint32_t start,uint32_t end)5231 add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5232   uint32_t options, uint32_t xoptions, compile_block *cb, uint32_t start,
5233   uint32_t end)
5234 {
5235 uint32_t c;
5236 uint32_t classbits_end = (end <= 0xff ? end : 0xff);
5237 unsigned int n8 = 0;
5238 
5239 /* If caseless matching is required, scan the range and process alternate
5240 cases. In Unicode, there are 8-bit characters that have alternate cases that
5241 are greater than 255 and vice-versa (though these may be ignored if caseless
5242 restriction is in force). Sometimes we can just extend the original range. */
5243 
5244 if ((options & PCRE2_CASELESS) != 0)
5245   {
5246 #ifdef SUPPORT_UNICODE
5247   if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
5248     {
5249     int rc;
5250     uint32_t oc, od;
5251 
5252     options &= ~PCRE2_CASELESS;   /* Remove for recursive calls */
5253     c = start;
5254 
5255     while ((rc = get_othercase_range(&c, end, &oc, &od,
5256              (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)) >= 0)
5257       {
5258       /* Handle a single character that has more than one other case. */
5259 
5260       if (rc > 0) n8 += add_list_to_class_internal(classbits, uchardptr,
5261         options, xoptions, cb, PRIV(ucd_caseless_sets) + rc, oc);
5262 
5263       /* Do nothing if the other case range is within the original range. */
5264 
5265       else if (oc >= cb->class_range_start && od <= cb->class_range_end)
5266         continue;
5267 
5268       /* Extend the original range if there is overlap, noting that if oc < c,
5269       we can't have od > end because a subrange is always shorter than the
5270       basic range. Otherwise, use a recursive call to add the additional range.
5271       */
5272 
5273       else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
5274       else if (od > end && oc <= end + 1)
5275         {
5276         end = od;       /* Extend upwards */
5277         if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
5278         }
5279       else n8 += add_to_class_internal(classbits, uchardptr, options, xoptions,
5280         cb, oc, od);
5281       }
5282     }
5283   else
5284 #else
5285   (void)xoptions;   /* Avoid compiler warning */
5286 #endif  /* SUPPORT_UNICODE */
5287 
5288   /* Not UTF mode */
5289 
5290   for (c = start; c <= classbits_end; c++)
5291     {
5292     SETBIT(classbits, cb->fcc[c]);
5293     n8++;
5294     }
5295   }
5296 
5297 /* Now handle the originally supplied range. Adjust the final value according
5298 to the bit length - this means that the same lists of (e.g.) horizontal spaces
5299 can be used in all cases. */
5300 
5301 if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR)
5302   end = MAX_NON_UTF_CHAR;
5303 
5304 if (start > cb->class_range_start && end < cb->class_range_end) return n8;
5305 
5306 /* Use the bitmap for characters < 256. Otherwise use extra data.*/
5307 
5308 for (c = start; c <= classbits_end; c++)
5309   {
5310   /* Regardless of start, c will always be <= 255. */
5311   SETBIT(classbits, c);
5312   n8++;
5313   }
5314 
5315 #ifdef SUPPORT_WIDE_CHARS
5316 if (start <= 0xff) start = 0xff + 1;
5317 
5318 if (end >= start)
5319   {
5320   PCRE2_UCHAR *uchardata = *uchardptr;
5321 
5322 #ifdef SUPPORT_UNICODE
5323   if ((options & PCRE2_UTF) != 0)
5324     {
5325     if (start < end)
5326       {
5327       *uchardata++ = XCL_RANGE;
5328       uchardata += PRIV(ord2utf)(start, uchardata);
5329       uchardata += PRIV(ord2utf)(end, uchardata);
5330       }
5331     else if (start == end)
5332       {
5333       *uchardata++ = XCL_SINGLE;
5334       uchardata += PRIV(ord2utf)(start, uchardata);
5335       }
5336     }
5337   else
5338 #endif  /* SUPPORT_UNICODE */
5339 
5340   /* Without UTF support, character values are constrained by the bit length,
5341   and can only be > 256 for 16-bit and 32-bit libraries. */
5342 
5343 #if PCRE2_CODE_UNIT_WIDTH == 8
5344     {}
5345 #else
5346   if (start < end)
5347     {
5348     *uchardata++ = XCL_RANGE;
5349     *uchardata++ = start;
5350     *uchardata++ = end;
5351     }
5352   else if (start == end)
5353     {
5354     *uchardata++ = XCL_SINGLE;
5355     *uchardata++ = start;
5356     }
5357 #endif  /* PCRE2_CODE_UNIT_WIDTH == 8 */
5358   *uchardptr = uchardata;   /* Updata extra data pointer */
5359   }
5360 #else  /* SUPPORT_WIDE_CHARS */
5361   (void)uchardptr;          /* Avoid compiler warning */
5362 #endif /* SUPPORT_WIDE_CHARS */
5363 
5364 return n8;    /* Number of 8-bit characters */
5365 }
5366 
5367 
5368 
5369 #ifdef SUPPORT_UNICODE
5370 /*************************************************
5371 * Add a list of characters to a class (internal) *
5372 *************************************************/
5373 
5374 /* This function is used for adding a list of case-equivalent characters to a
5375 class when in UTF mode. This function is called only from within
5376 add_to_class_internal(), with which it is mutually recursive.
5377 
5378 Arguments:
5379   classbits     the bit map for characters < 256
5380   uchardptr     points to the pointer for extra data
5381   options       the options bits
5382   xoptions      the extra options bits
5383   cb            contains pointers to tables etc.
5384   p             points to row of 32-bit values, terminated by NOTACHAR
5385   except        character to omit; this is used when adding lists of
5386                   case-equivalent characters to avoid including the one we
5387                   already know about
5388 
5389 Returns:        the number of < 256 characters added
5390                 the pointer to extra data is updated
5391 */
5392 
5393 static unsigned int
add_list_to_class_internal(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,uint32_t xoptions,compile_block * cb,const uint32_t * p,unsigned int except)5394 add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5395   uint32_t options, uint32_t xoptions, compile_block *cb, const uint32_t *p,
5396   unsigned int except)
5397 {
5398 unsigned int n8 = 0;
5399 while (p[0] < NOTACHAR)
5400   {
5401   unsigned int n = 0;
5402   if (p[0] != except)
5403     {
5404     while(p[n+1] == p[0] + n + 1) n++;
5405     n8 += add_to_class_internal(classbits, uchardptr, options, xoptions, cb,
5406       p[0], p[n]);
5407     }
5408   p += n + 1;
5409   }
5410 return n8;
5411 }
5412 #endif
5413 
5414 
5415 
5416 /*************************************************
5417 *   External entry point for add range to class  *
5418 *************************************************/
5419 
5420 /* This function sets the overall range so that the internal functions can try
5421 to avoid duplication when handling case-independence.
5422 
5423 Arguments:
5424   classbits     the bit map for characters < 256
5425   uchardptr     points to the pointer for extra data
5426   options       the options bits
5427   xoptions      the extra options bits
5428   cb            compile data
5429   start         start of range character
5430   end           end of range character
5431 
5432 Returns:        the number of < 256 characters added
5433                 the pointer to extra data is updated
5434 */
5435 
5436 static unsigned int
add_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,uint32_t xoptions,compile_block * cb,uint32_t start,uint32_t end)5437 add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5438   uint32_t xoptions, compile_block *cb, uint32_t start, uint32_t end)
5439 {
5440 cb->class_range_start = start;
5441 cb->class_range_end = end;
5442 return add_to_class_internal(classbits, uchardptr, options, xoptions, cb,
5443   start, end);
5444 }
5445 
5446 
5447 /*************************************************
5448 *   External entry point for add list to class   *
5449 *************************************************/
5450 
5451 /* This function is used for adding a list of horizontal or vertical whitespace
5452 characters to a class. The list must be in order so that ranges of characters
5453 can be detected and handled appropriately. This function sets the overall range
5454 so that the internal functions can try to avoid duplication when handling
5455 case-independence.
5456 
5457 Arguments:
5458   classbits     the bit map for characters < 256
5459   uchardptr     points to the pointer for extra data
5460   options       the options bits
5461   xoptions      the extra options bits
5462   cb            contains pointers to tables etc.
5463   p             points to row of 32-bit values, terminated by NOTACHAR
5464   except        character to omit; this is used when adding lists of
5465                   case-equivalent characters to avoid including the one we
5466                   already know about
5467 
5468 Returns:        the number of < 256 characters added
5469                 the pointer to extra data is updated
5470 */
5471 
5472 static unsigned int
add_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,uint32_t xoptions,compile_block * cb,const uint32_t * p,unsigned int except)5473 add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5474   uint32_t xoptions, compile_block *cb, const uint32_t *p, unsigned int except)
5475 {
5476 unsigned int n8 = 0;
5477 while (p[0] < NOTACHAR)
5478   {
5479   unsigned int n = 0;
5480   if (p[0] != except)
5481     {
5482     while(p[n+1] == p[0] + n + 1) n++;
5483     cb->class_range_start = p[0];
5484     cb->class_range_end = p[n];
5485     n8 += add_to_class_internal(classbits, uchardptr, options, xoptions, cb,
5486       p[0], p[n]);
5487     }
5488   p += n + 1;
5489   }
5490 return n8;
5491 }
5492 
5493 
5494 
5495 /*************************************************
5496 *    Add characters not in a list to a class     *
5497 *************************************************/
5498 
5499 /* This function is used for adding the complement of a list of horizontal or
5500 vertical whitespace to a class. The list must be in order.
5501 
5502 Arguments:
5503   classbits     the bit map for characters < 256
5504   uchardptr     points to the pointer for extra data
5505   options       the options bits
5506   xoptions      the extra options bits
5507   cb            contains pointers to tables etc.
5508   p             points to row of 32-bit values, terminated by NOTACHAR
5509 
5510 Returns:        the number of < 256 characters added
5511                 the pointer to extra data is updated
5512 */
5513 
5514 static unsigned int
add_not_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,uint32_t xoptions,compile_block * cb,const uint32_t * p)5515 add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5516   uint32_t options, uint32_t xoptions, compile_block *cb, const uint32_t *p)
5517 {
5518 BOOL utf = (options & PCRE2_UTF) != 0;
5519 unsigned int n8 = 0;
5520 if (p[0] > 0)
5521   n8 += add_to_class(classbits, uchardptr, options, xoptions, cb, 0, p[0] - 1);
5522 while (p[0] < NOTACHAR)
5523   {
5524   while (p[1] == p[0] + 1) p++;
5525   n8 += add_to_class(classbits, uchardptr, options, xoptions, cb, p[0] + 1,
5526     (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
5527   p++;
5528   }
5529 return n8;
5530 }
5531 
5532 
5533 
5534 /*************************************************
5535 *    Find details of duplicate group names       *
5536 *************************************************/
5537 
5538 /* This is called from compile_branch() when it needs to know the index and
5539 count of duplicates in the names table when processing named backreferences,
5540 either directly, or as conditions.
5541 
5542 Arguments:
5543   name          points to the name
5544   length        the length of the name
5545   indexptr      where to put the index
5546   countptr      where to put the count of duplicates
5547   errorcodeptr  where to put an error code
5548   cb            the compile block
5549 
5550 Returns:        TRUE if OK, FALSE if not, error code set
5551 */
5552 
5553 static BOOL
find_dupname_details(PCRE2_SPTR name,uint32_t length,int * indexptr,int * countptr,int * errorcodeptr,compile_block * cb)5554 find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr,
5555   int *countptr, int *errorcodeptr, compile_block *cb)
5556 {
5557 uint32_t i, groupnumber;
5558 int count;
5559 PCRE2_UCHAR *slot = cb->name_table;
5560 
5561 /* Find the first entry in the table */
5562 
5563 for (i = 0; i < cb->names_found; i++)
5564   {
5565   if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) == 0 &&
5566       slot[IMM2_SIZE+length] == 0) break;
5567   slot += cb->name_entry_size;
5568   }
5569 
5570 /* This should not occur, because this function is called only when we know we
5571 have duplicate names. Give an internal error. */
5572 
5573 if (i >= cb->names_found)
5574   {
5575   *errorcodeptr = ERR53;
5576   cb->erroroffset = name - cb->start_pattern;
5577   return FALSE;
5578   }
5579 
5580 /* Record the index and then see how many duplicates there are, updating the
5581 backref map and maximum back reference as we do. */
5582 
5583 *indexptr = i;
5584 count = 0;
5585 
5586 for (;;)
5587   {
5588   count++;
5589   groupnumber = GET2(slot,0);
5590   cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
5591   if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
5592   if (++i >= cb->names_found) break;
5593   slot += cb->name_entry_size;
5594   if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) != 0 ||
5595     (slot+IMM2_SIZE)[length] != 0) break;
5596   }
5597 
5598 *countptr = count;
5599 return TRUE;
5600 }
5601 
5602 
5603 
5604 /*************************************************
5605 *           Compile one branch                   *
5606 *************************************************/
5607 
5608 /* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If
5609 the options are changed during the branch, the pointer is used to change the
5610 external options bits. This function is used during the pre-compile phase when
5611 we are trying to find out the amount of memory needed, as well as during the
5612 real compile phase. The value of lengthptr distinguishes the two phases.
5613 
5614 Arguments:
5615   optionsptr        pointer to the option bits
5616   xoptionsptr       pointer to the extra option bits
5617   codeptr           points to the pointer to the current code point
5618   pptrptr           points to the current parsed pattern pointer
5619   errorcodeptr      points to error code variable
5620   firstcuptr        place to put the first required code unit
5621   firstcuflagsptr   place to put the first code unit flags
5622   reqcuptr          place to put the last required code unit
5623   reqcuflagsptr     place to put the last required code unit flags
5624   bcptr             points to current branch chain
5625   open_caps         points to current capitem
5626   cb                contains pointers to tables etc.
5627   lengthptr         NULL during the real compile phase
5628                     points to length accumulator during pre-compile phase
5629 
5630 Returns:            0 There's been an error, *errorcodeptr is non-zero
5631                    +1 Success, this branch must match at least one character
5632                    -1 Success, this branch may match an empty string
5633 */
5634 
5635 static int
compile_branch(uint32_t * optionsptr,uint32_t * xoptionsptr,PCRE2_UCHAR ** codeptr,uint32_t ** pptrptr,int * errorcodeptr,uint32_t * firstcuptr,uint32_t * firstcuflagsptr,uint32_t * reqcuptr,uint32_t * reqcuflagsptr,branch_chain * bcptr,open_capitem * open_caps,compile_block * cb,PCRE2_SIZE * lengthptr)5636 compile_branch(uint32_t *optionsptr, uint32_t *xoptionsptr,
5637   PCRE2_UCHAR **codeptr, uint32_t **pptrptr, int *errorcodeptr,
5638   uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
5639   uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
5640   compile_block *cb, PCRE2_SIZE *lengthptr)
5641 {
5642 int bravalue = 0;
5643 int okreturn = -1;
5644 int group_return = 0;
5645 uint32_t repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
5646 uint32_t greedy_default, greedy_non_default;
5647 uint32_t repeat_type, op_type;
5648 uint32_t options = *optionsptr;               /* May change dynamically */
5649 uint32_t xoptions = *xoptionsptr;             /* May change dynamically */
5650 uint32_t firstcu, reqcu;
5651 uint32_t zeroreqcu, zerofirstcu;
5652 uint32_t escape;
5653 uint32_t *pptr = *pptrptr;
5654 uint32_t meta, meta_arg;
5655 uint32_t firstcuflags, reqcuflags;
5656 uint32_t zeroreqcuflags, zerofirstcuflags;
5657 uint32_t req_caseopt, reqvary, tempreqvary;
5658 PCRE2_SIZE offset = 0;
5659 PCRE2_SIZE length_prevgroup = 0;
5660 PCRE2_UCHAR *code = *codeptr;
5661 PCRE2_UCHAR *last_code = code;
5662 PCRE2_UCHAR *orig_code = code;
5663 PCRE2_UCHAR *tempcode;
5664 PCRE2_UCHAR *previous = NULL;
5665 PCRE2_UCHAR op_previous;
5666 BOOL groupsetfirstcu = FALSE;
5667 BOOL had_accept = FALSE;
5668 BOOL matched_char = FALSE;
5669 BOOL previous_matched_char = FALSE;
5670 BOOL reset_caseful = FALSE;
5671 const uint8_t *cbits = cb->cbits;
5672 uint8_t classbits[32];
5673 
5674 /* We can fish out the UTF setting once and for all into a BOOL, but we must
5675 not do this for other options (e.g. PCRE2_EXTENDED) that may change dynamically
5676 as we process the pattern. */
5677 
5678 #ifdef SUPPORT_UNICODE
5679 BOOL utf = (options & PCRE2_UTF) != 0;
5680 BOOL ucp = (options & PCRE2_UCP) != 0;
5681 #else  /* No Unicode support */
5682 BOOL utf = FALSE;
5683 #endif
5684 
5685 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
5686 class_uchardata always so that it can be passed to add_to_class() always,
5687 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
5688 alternative calls for the different cases. */
5689 
5690 PCRE2_UCHAR *class_uchardata;
5691 #ifdef SUPPORT_WIDE_CHARS
5692 BOOL xclass;
5693 PCRE2_UCHAR *class_uchardata_base;
5694 #endif
5695 
5696 /* Set up the default and non-default settings for greediness */
5697 
5698 greedy_default = ((options & PCRE2_UNGREEDY) != 0);
5699 greedy_non_default = greedy_default ^ 1;
5700 
5701 /* Initialize no first unit, no required unit. REQ_UNSET means "no char
5702 matching encountered yet". It gets changed to REQ_NONE if we hit something that
5703 matches a non-fixed first unit; reqcu just remains unset if we never find one.
5704 
5705 When we hit a repeat whose minimum is zero, we may have to adjust these values
5706 to take the zero repeat into account. This is implemented by setting them to
5707 zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
5708 item types that can be repeated set these backoff variables appropriately. */
5709 
5710 firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
5711 firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
5712 
5713 /* The variable req_caseopt contains either the REQ_CASELESS bit or zero,
5714 according to the current setting of the caseless flag. The REQ_CASELESS value
5715 leaves the lower 28 bit empty. It is added into the firstcu or reqcu variables
5716 to record the case status of the value. This is used only for ASCII characters.
5717 */
5718 
5719 req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
5720 
5721 /* Switch on next META item until the end of the branch */
5722 
5723 for (;; pptr++)
5724   {
5725 #ifdef SUPPORT_WIDE_CHARS
5726   BOOL xclass_has_prop;
5727 #endif
5728   BOOL negate_class;
5729   BOOL should_flip_negation;
5730   BOOL match_all_or_no_wide_chars;
5731   BOOL possessive_quantifier;
5732   BOOL note_group_empty;
5733   int class_has_8bitchar;
5734   uint32_t mclength;
5735   uint32_t skipunits;
5736   uint32_t subreqcu, subfirstcu;
5737   uint32_t groupnumber;
5738   uint32_t verbarglen, verbculen;
5739   uint32_t subreqcuflags, subfirstcuflags;
5740   open_capitem *oc;
5741   PCRE2_UCHAR mcbuffer[8];
5742 
5743   /* Get next META item in the pattern and its potential argument. */
5744 
5745   meta = META_CODE(*pptr);
5746   meta_arg = META_DATA(*pptr);
5747 
5748   /* If we are in the pre-compile phase, accumulate the length used for the
5749   previous cycle of this loop, unless the next item is a quantifier. */
5750 
5751   if (lengthptr != NULL)
5752     {
5753     if (code > cb->start_workspace + cb->workspace_size -
5754         WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
5755       {
5756       *errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)?
5757         ERR52 : ERR86;
5758       return 0;
5759       }
5760 
5761     /* There is at least one situation where code goes backwards: this is the
5762     case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier
5763     is processed, the whole class is eliminated. However, it is created first,
5764     so we have to allow memory for it. Therefore, don't ever reduce the length
5765     at this point. */
5766 
5767     if (code < last_code) code = last_code;
5768 
5769     /* If the next thing is not a quantifier, we add the length of the previous
5770     item into the total, and reset the code pointer to the start of the
5771     workspace. Otherwise leave the previous item available to be quantified. */
5772 
5773     if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5774       {
5775       if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))
5776         {
5777         *errorcodeptr = ERR20;   /* Integer overflow */
5778         return 0;
5779         }
5780       *lengthptr += (PCRE2_SIZE)(code - orig_code);
5781       if (*lengthptr > MAX_PATTERN_SIZE)
5782         {
5783         *errorcodeptr = ERR20;   /* Pattern is too large */
5784         return 0;
5785         }
5786       code = orig_code;
5787       }
5788 
5789     /* Remember where this code item starts so we can catch the "backwards"
5790     case above next time round. */
5791 
5792     last_code = code;
5793     }
5794 
5795   /* Process the next parsed pattern item. If it is not a quantifier, remember
5796   where it starts so that it can be quantified when a quantifier follows.
5797   Checking for the legality of quantifiers happens in parse_regex(), except for
5798   a quantifier after an assertion that is a condition. */
5799 
5800   if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5801     {
5802     previous = code;
5803     if (matched_char && !had_accept) okreturn = 1;
5804     }
5805 
5806   previous_matched_char = matched_char;
5807   matched_char = FALSE;
5808   note_group_empty = FALSE;
5809   skipunits = 0;         /* Default value for most subgroups */
5810 
5811   switch(meta)
5812     {
5813     /* ===================================================================*/
5814     /* The branch terminates at pattern end or | or ) */
5815 
5816     case META_END:
5817     case META_ALT:
5818     case META_KET:
5819     *firstcuptr = firstcu;
5820     *firstcuflagsptr = firstcuflags;
5821     *reqcuptr = reqcu;
5822     *reqcuflagsptr = reqcuflags;
5823     *codeptr = code;
5824     *pptrptr = pptr;
5825     return okreturn;
5826 
5827 
5828     /* ===================================================================*/
5829     /* Handle single-character metacharacters. In multiline mode, ^ disables
5830     the setting of any following char as a first character. */
5831 
5832     case META_CIRCUMFLEX:
5833     if ((options & PCRE2_MULTILINE) != 0)
5834       {
5835       if (firstcuflags == REQ_UNSET)
5836         zerofirstcuflags = firstcuflags = REQ_NONE;
5837       *code++ = OP_CIRCM;
5838       }
5839     else *code++ = OP_CIRC;
5840     break;
5841 
5842     case META_DOLLAR:
5843     *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
5844     break;
5845 
5846     /* There can never be a first char if '.' is first, whatever happens about
5847     repeats. The value of reqcu doesn't change either. */
5848 
5849     case META_DOT:
5850     matched_char = TRUE;
5851     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5852     zerofirstcu = firstcu;
5853     zerofirstcuflags = firstcuflags;
5854     zeroreqcu = reqcu;
5855     zeroreqcuflags = reqcuflags;
5856     *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
5857     break;
5858 
5859 
5860     /* ===================================================================*/
5861     /* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.
5862     Otherwise, an initial ']' is taken as a data character. When empty classes
5863     are allowed, [] must always fail, so generate OP_FAIL, whereas [^] must
5864     match any character, so generate OP_ALLANY. */
5865 
5866     case META_CLASS_EMPTY:
5867     case META_CLASS_EMPTY_NOT:
5868     matched_char = TRUE;
5869     *code++ = (meta == META_CLASS_EMPTY_NOT)? OP_ALLANY : OP_FAIL;
5870     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5871     zerofirstcu = firstcu;
5872     zerofirstcuflags = firstcuflags;
5873     break;
5874 
5875 
5876     /* ===================================================================*/
5877     /* Non-empty character class. If the included characters are all < 256, we
5878     build a 32-byte bitmap of the permitted characters, except in the special
5879     case where there is only one such character. For negated classes, we build
5880     the map as usual, then invert it at the end. However, we use a different
5881     opcode so that data characters > 255 can be handled correctly.
5882 
5883     If the class contains characters outside the 0-255 range, a different
5884     opcode is compiled. It may optionally have a bit map for characters < 256,
5885     but those above are explicitly listed afterwards. A flag code unit tells
5886     whether the bitmap is present, and whether this is a negated class or
5887     not. */
5888 
5889     case META_CLASS_NOT:
5890     case META_CLASS:
5891     matched_char = TRUE;
5892     negate_class = meta == META_CLASS_NOT;
5893 
5894     /* We can optimize the case of a single character in a class by generating
5895     OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's
5896     negative. In the negative case there can be no first char if this item is
5897     first, whatever repeat count may follow. In the case of reqcu, save the
5898     previous value for reinstating. */
5899 
5900     /* NOTE: at present this optimization is not effective if the only
5901     character in a class in 32-bit, non-UCP mode has its top bit set. */
5902 
5903     if (pptr[1] < META_END && pptr[2] == META_CLASS_END)
5904       {
5905 #ifdef SUPPORT_UNICODE
5906       uint32_t d;
5907 #endif
5908       uint32_t c = pptr[1];
5909 
5910       pptr += 2;                 /* Move on to class end */
5911       if (meta == META_CLASS)    /* A positive one-char class can be */
5912         {                        /* handled as a normal literal character. */
5913         meta = c;                /* Set up the character */
5914         goto NORMAL_CHAR_SET;
5915         }
5916 
5917       /* Handle a negative one-character class */
5918 
5919       zeroreqcu = reqcu;
5920       zeroreqcuflags = reqcuflags;
5921       if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5922       zerofirstcu = firstcu;
5923       zerofirstcuflags = firstcuflags;
5924 
5925       /* For caseless UTF or UCP mode, check whether this character has more
5926       than one other case. If so, generate a special OP_NOTPROP item instead of
5927       OP_NOTI. When restricted by PCRE2_EXTRA_CASELESS_RESTRICT, ignore any
5928       caseless set that starts with an ASCII character. */
5929 
5930 #ifdef SUPPORT_UNICODE
5931       if ((utf||ucp) && (options & PCRE2_CASELESS) != 0 &&
5932           (d = UCD_CASESET(c)) != 0 &&
5933           ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) == 0 ||
5934           PRIV(ucd_caseless_sets)[d] > 127))
5935         {
5936         *code++ = OP_NOTPROP;
5937         *code++ = PT_CLIST;
5938         *code++ = d;
5939         break;   /* We are finished with this class */
5940         }
5941 #endif
5942       /* Char has only one other (usable) case, or UCP not available */
5943 
5944       *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
5945       code += PUTCHAR(c, code);
5946       break;   /* We are finished with this class */
5947       }        /* End of 1-char optimization */
5948 
5949     /* Handle character classes that contain more than just one literal
5950     character. If there are exactly two characters in a positive class, see if
5951     they are case partners. This can be optimized to generate a caseless single
5952     character match (which also sets first/required code units if relevant).
5953     When casing restrictions apply, ignore a caseless set if both characters
5954     are ASCII. */
5955 
5956     if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END &&
5957         pptr[3] == META_CLASS_END)
5958       {
5959       uint32_t c = pptr[1];
5960 
5961 #ifdef SUPPORT_UNICODE
5962       if (UCD_CASESET(c) == 0 ||
5963          ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
5964          c < 128 && pptr[2] < 128))
5965 #endif
5966         {
5967         uint32_t d;
5968 
5969 #ifdef SUPPORT_UNICODE
5970         if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
5971 #endif
5972           {
5973 #if PCRE2_CODE_UNIT_WIDTH != 8
5974           if (c > 255) d = c; else
5975 #endif
5976           d = TABLE_GET(c, cb->fcc, c);
5977           }
5978 
5979         if (c != d && pptr[2] == d)
5980           {
5981           pptr += 3;                 /* Move on to class end */
5982           meta = c;
5983           if ((options & PCRE2_CASELESS) == 0)
5984             {
5985             reset_caseful = TRUE;
5986             options |= PCRE2_CASELESS;
5987             req_caseopt = REQ_CASELESS;
5988             }
5989           goto CLASS_CASELESS_CHAR;
5990           }
5991         }
5992       }
5993 
5994     /* If a non-extended class contains a negative special such as \S, we need
5995     to flip the negation flag at the end, so that support for characters > 255
5996     works correctly (they are all included in the class). An extended class may
5997     need to insert specific matching or non-matching code for wide characters.
5998     */
5999 
6000     should_flip_negation = match_all_or_no_wide_chars = FALSE;
6001 
6002     /* Extended class (xclass) will be used when characters > 255
6003     might match. */
6004 
6005 #ifdef SUPPORT_WIDE_CHARS
6006     xclass = FALSE;
6007     class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
6008     class_uchardata_base = class_uchardata;   /* Save the start */
6009 #endif
6010 
6011     /* For optimization purposes, we track some properties of the class:
6012     class_has_8bitchar will be non-zero if the class contains at least one
6013     character with a code point less than 256; xclass_has_prop will be TRUE if
6014     Unicode property checks are present in the class. */
6015 
6016     class_has_8bitchar = 0;
6017 #ifdef SUPPORT_WIDE_CHARS
6018     xclass_has_prop = FALSE;
6019 #endif
6020 
6021     /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
6022     in a temporary bit of memory, in case the class contains fewer than two
6023     8-bit characters because in that case the compiled code doesn't use the bit
6024     map. */
6025 
6026     memset(classbits, 0, 32 * sizeof(uint8_t));
6027 
6028     /* Process items until META_CLASS_END is reached. */
6029 
6030     while ((meta = *(++pptr)) != META_CLASS_END)
6031       {
6032       /* Handle POSIX classes such as [:alpha:] etc. */
6033 
6034       if (meta == META_POSIX || meta == META_POSIX_NEG)
6035         {
6036         BOOL local_negate = (meta == META_POSIX_NEG);
6037         int posix_class = *(++pptr);
6038         int taboffset, tabopt;
6039         uint8_t pbits[32];
6040 
6041         should_flip_negation = local_negate;  /* Note negative special */
6042 
6043         /* If matching is caseless, upper and lower are converted to alpha.
6044         This relies on the fact that the class table starts with alpha,
6045         lower, upper as the first 3 entries. */
6046 
6047         if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)
6048           posix_class = 0;
6049 
6050         /* When PCRE2_UCP is set, some of the POSIX classes are converted to
6051         different escape sequences that use Unicode properties \p or \P.
6052         Others that are not available via \p or \P have to generate
6053         XCL_PROP/XCL_NOTPROP directly, which is done here. */
6054 
6055 #ifdef SUPPORT_UNICODE
6056         if ((options & PCRE2_UCP) != 0 &&
6057             (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0)
6058           {
6059           switch(posix_class)
6060             {
6061             case PC_GRAPH:
6062             case PC_PRINT:
6063             case PC_PUNCT:
6064             *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
6065             *class_uchardata++ = (PCRE2_UCHAR)
6066               ((posix_class == PC_GRAPH)? PT_PXGRAPH :
6067                (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT);
6068             *class_uchardata++ = 0;
6069             xclass_has_prop = TRUE;
6070             goto CONTINUE_CLASS;
6071 
6072             /* For the other POSIX classes (ex: ascii) we are going to
6073             fall through to the non-UCP case and build a bit map for
6074             characters with code points less than 256. However, if we are in
6075             a negated POSIX class, characters with code points greater than
6076             255 must either all match or all not match, depending on whether
6077             the whole class is not or is negated. For example, for
6078             [[:^ascii:]... they must all match, whereas for [^[:^ascii:]...
6079             they must not.
6080 
6081             In the special case where there are no xclass items, this is
6082             automatically handled by the use of OP_CLASS or OP_NCLASS, but an
6083             explicit range is needed for OP_XCLASS. Setting a flag here
6084             causes the range to be generated later when it is known that
6085             OP_XCLASS is required. In the 8-bit library this is relevant only in
6086             utf mode, since no wide characters can exist otherwise. */
6087 
6088             default:
6089 #if PCRE2_CODE_UNIT_WIDTH == 8
6090             if (utf)
6091 #endif
6092             match_all_or_no_wide_chars |= local_negate;
6093             break;
6094             }
6095           }
6096 #endif  /* SUPPORT_UNICODE */
6097 
6098         /* In the non-UCP case, or when UCP makes no difference, we build the
6099         bit map for the POSIX class in a chunk of local store because we may
6100         be adding and subtracting from it, and we don't want to subtract bits
6101         that may be in the main map already. At the end we or the result into
6102         the bit map that is being built. */
6103 
6104         posix_class *= 3;
6105 
6106         /* Copy in the first table (always present) */
6107 
6108         memcpy(pbits, cbits + posix_class_maps[posix_class],
6109           32 * sizeof(uint8_t));
6110 
6111         /* If there is a second table, add or remove it as required. */
6112 
6113         taboffset = posix_class_maps[posix_class + 1];
6114         tabopt = posix_class_maps[posix_class + 2];
6115 
6116         if (taboffset >= 0)
6117           {
6118           if (tabopt >= 0)
6119             for (int i = 0; i < 32; i++) pbits[i] |= cbits[(int)i + taboffset];
6120           else
6121             for (int i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset];
6122           }
6123 
6124         /* Now see if we need to remove any special characters. An option
6125         value of 1 removes vertical space and 2 removes underscore. */
6126 
6127         if (tabopt < 0) tabopt = -tabopt;
6128         if (tabopt == 1) pbits[1] &= ~0x3c;
6129           else if (tabopt == 2) pbits[11] &= 0x7f;
6130 
6131         /* Add the POSIX table or its complement into the main table that is
6132         being built and we are done. */
6133 
6134         if (local_negate)
6135           for (int i = 0; i < 32; i++) classbits[i] |= (uint8_t)(~pbits[i]);
6136         else
6137           for (int i = 0; i < 32; i++) classbits[i] |= pbits[i];
6138 
6139         /* Every class contains at least one < 256 character. */
6140 
6141         class_has_8bitchar = 1;
6142         goto CONTINUE_CLASS;    /* End of POSIX handling */
6143         }
6144 
6145       /* Other than POSIX classes, the only items we should encounter are
6146       \d-type escapes and literal characters (possibly as ranges). */
6147 
6148       if (meta == META_BIGVALUE)
6149         {
6150         meta = *(++pptr);
6151         goto CLASS_LITERAL;
6152         }
6153 
6154       /* Any other non-literal must be an escape */
6155 
6156       if (meta >= META_END)
6157         {
6158         if (META_CODE(meta) != META_ESCAPE)
6159           {
6160 #ifdef DEBUG_SHOW_PARSED
6161           fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x "
6162                           "in character class\n", meta);
6163 #endif
6164           *errorcodeptr = ERR89;  /* Internal error - unrecognized. */
6165           return 0;
6166           }
6167         escape = META_DATA(meta);
6168 
6169         /* Every class contains at least one < 256 character. */
6170 
6171         class_has_8bitchar++;
6172 
6173         switch(escape)
6174           {
6175           case ESC_d:
6176           for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
6177           break;
6178 
6179           case ESC_D:
6180           should_flip_negation = TRUE;
6181           for (int i = 0; i < 32; i++)
6182             classbits[i] |= (uint8_t)(~cbits[i+cbit_digit]);
6183           break;
6184 
6185           case ESC_w:
6186           for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
6187           break;
6188 
6189           case ESC_W:
6190           should_flip_negation = TRUE;
6191           for (int i = 0; i < 32; i++)
6192             classbits[i] |= (uint8_t)(~cbits[i+cbit_word]);
6193           break;
6194 
6195           /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
6196           5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
6197           previously set by something earlier in the character class.
6198           Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
6199           we could just adjust the appropriate bit. From PCRE 8.34 we no
6200           longer treat \s and \S specially. */
6201 
6202           case ESC_s:
6203           for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
6204           break;
6205 
6206           case ESC_S:
6207           should_flip_negation = TRUE;
6208           for (int i = 0; i < 32; i++)
6209             classbits[i] |= (uint8_t)(~cbits[i+cbit_space]);
6210           break;
6211 
6212           /* When adding the horizontal or vertical space lists to a class, or
6213           their complements, disable PCRE2_CASELESS, because it just wastes
6214           time, and in the "not-x" UTF cases can create unwanted duplicates in
6215           the XCLASS list (provoked by characters that have more than one other
6216           case and by both cases being in the same "not-x" sublist). */
6217 
6218           case ESC_h:
6219           (void)add_list_to_class(classbits, &class_uchardata,
6220             options & ~PCRE2_CASELESS, xoptions, cb, PRIV(hspace_list),
6221               NOTACHAR);
6222           break;
6223 
6224           case ESC_H:
6225           (void)add_not_list_to_class(classbits, &class_uchardata,
6226             options & ~PCRE2_CASELESS, xoptions, cb, PRIV(hspace_list));
6227           break;
6228 
6229           case ESC_v:
6230           (void)add_list_to_class(classbits, &class_uchardata,
6231             options & ~PCRE2_CASELESS, xoptions, cb, PRIV(vspace_list),
6232               NOTACHAR);
6233           break;
6234 
6235           case ESC_V:
6236           (void)add_not_list_to_class(classbits, &class_uchardata,
6237             options & ~PCRE2_CASELESS, xoptions, cb, PRIV(vspace_list));
6238           break;
6239 
6240           /* If Unicode is not supported, \P and \p are not allowed and are
6241           faulted at parse time, so will never appear here. */
6242 
6243 #ifdef SUPPORT_UNICODE
6244           case ESC_p:
6245           case ESC_P:
6246             {
6247             uint32_t ptype = *(++pptr) >> 16;
6248             uint32_t pdata = *pptr & 0xffff;
6249             *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP;
6250             *class_uchardata++ = ptype;
6251             *class_uchardata++ = pdata;
6252             xclass_has_prop = TRUE;
6253             class_has_8bitchar--;                /* Undo! */
6254             }
6255           break;
6256 #endif
6257           }
6258 
6259         goto CONTINUE_CLASS;
6260         }  /* End handling \d-type escapes */
6261 
6262       /* A literal character may be followed by a range meta. At parse time
6263       there are checks for out-of-order characters, for ranges where the two
6264       characters are equal, and for hyphens that cannot indicate a range. At
6265       this point, therefore, no checking is needed. */
6266 
6267       else
6268         {
6269         uint32_t c, d;
6270 
6271         CLASS_LITERAL:
6272         c = d = meta;
6273 
6274         /* Remember if \r or \n were explicitly used */
6275 
6276         if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
6277 
6278         /* Process a character range */
6279 
6280         if (pptr[1] == META_RANGE_LITERAL || pptr[1] == META_RANGE_ESCAPED)
6281           {
6282 #ifdef EBCDIC
6283           BOOL range_is_literal = (pptr[1] == META_RANGE_LITERAL);
6284 #endif
6285           pptr += 2;
6286           d = *pptr;
6287           if (d == META_BIGVALUE) d = *(++pptr);
6288 
6289           /* Remember an explicit \r or \n, and add the range to the class. */
6290 
6291           if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
6292 
6293           /* In an EBCDIC environment, Perl treats alphabetic ranges specially
6294           because there are holes in the encoding, and simply using the range
6295           A-Z (for example) would include the characters in the holes. This
6296           applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
6297 
6298 #ifdef EBCDIC
6299           if (range_is_literal &&
6300                (cb->ctypes[c] & ctype_letter) != 0 &&
6301                (cb->ctypes[d] & ctype_letter) != 0 &&
6302                (c <= CHAR_z) == (d <= CHAR_z))
6303             {
6304             uint32_t uc = (d <= CHAR_z)? 0 : 64;
6305             uint32_t C = c - uc;
6306             uint32_t D = d - uc;
6307 
6308             if (C <= CHAR_i)
6309               {
6310               class_has_8bitchar +=
6311                 add_to_class(classbits, &class_uchardata, options, xoptions,
6312                   cb, C + uc, ((D < CHAR_i)? D : CHAR_i) + uc);
6313               C = CHAR_j;
6314               }
6315 
6316             if (C <= D && C <= CHAR_r)
6317               {
6318               class_has_8bitchar +=
6319                 add_to_class(classbits, &class_uchardata, options, xoptions,
6320                   cb, C + uc, ((D < CHAR_r)? D : CHAR_r) + uc);
6321               C = CHAR_s;
6322               }
6323 
6324             if (C <= D)
6325               {
6326               class_has_8bitchar +=
6327                 add_to_class(classbits, &class_uchardata, options, xoptions,
6328                   cb, C + uc, D + uc);
6329               }
6330             }
6331           else
6332 #endif
6333           /* Not an EBCDIC special range */
6334 
6335           class_has_8bitchar += add_to_class(classbits, &class_uchardata,
6336             options, xoptions, cb, c, d);
6337           goto CONTINUE_CLASS;   /* Go get the next char in the class */
6338           }  /* End of range handling */
6339 
6340 
6341         /* Handle a single character. */
6342 
6343         class_has_8bitchar +=
6344           add_to_class(classbits, &class_uchardata, options, xoptions, cb,
6345             meta, meta);
6346         }
6347 
6348       /* Continue to the next item in the class. */
6349 
6350       CONTINUE_CLASS:
6351 
6352 #ifdef SUPPORT_WIDE_CHARS
6353       /* If any wide characters or Unicode properties have been encountered,
6354       set xclass = TRUE. Then, in the pre-compile phase, accumulate the length
6355       of the extra data and reset the pointer. This is so that very large
6356       classes that contain a zillion wide characters or Unicode property tests
6357       do not overwrite the workspace (which is on the stack). */
6358 
6359       if (class_uchardata > class_uchardata_base)
6360         {
6361         xclass = TRUE;
6362         if (lengthptr != NULL)
6363           {
6364           *lengthptr += class_uchardata - class_uchardata_base;
6365           class_uchardata = class_uchardata_base;
6366           }
6367         }
6368 #endif
6369 
6370       continue;  /* Needed to avoid error when not supporting wide chars */
6371       }   /* End of main class-processing loop */
6372 
6373     /* If this class is the first thing in the branch, there can be no first
6374     char setting, whatever the repeat count. Any reqcu setting must remain
6375     unchanged after any kind of repeat. */
6376 
6377     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6378     zerofirstcu = firstcu;
6379     zerofirstcuflags = firstcuflags;
6380     zeroreqcu = reqcu;
6381     zeroreqcuflags = reqcuflags;
6382 
6383     /* If there are characters with values > 255, or Unicode property settings
6384     (\p or \P), we have to compile an extended class, with its own opcode,
6385     unless there were no property settings and there was a negated special such
6386     as \S in the class, and PCRE2_UCP is not set, because in that case all
6387     characters > 255 are in or not in the class, so any that were explicitly
6388     given as well can be ignored.
6389 
6390     In the UCP case, if certain negated POSIX classes (ex: [:^ascii:]) were
6391     were present in a class, we either have to match or not match all wide
6392     characters (depending on whether the whole class is or is not negated).
6393     This requirement is indicated by match_all_or_no_wide_chars being true.
6394     We do this by including an explicit range, which works in both cases.
6395     This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there
6396     cannot be any wide characters in 8-bit non-UTF mode.
6397 
6398     When there *are* properties in a positive UTF-8 or any 16-bit or 32-bit
6399     class where \S etc is present without PCRE2_UCP, causing an extended class
6400     to be compiled, we make sure that all characters > 255 are included by
6401     forcing match_all_or_no_wide_chars to be true.
6402 
6403     If, when generating an xclass, there are no characters < 256, we can omit
6404     the bitmap in the actual compiled code. */
6405 
6406 #ifdef SUPPORT_WIDE_CHARS  /* Defined for 16/32 bits, or 8-bit with Unicode */
6407     if (xclass && (
6408 #ifdef SUPPORT_UNICODE
6409         (options & PCRE2_UCP) != 0 ||
6410 #endif
6411         xclass_has_prop || !should_flip_negation))
6412       {
6413       if (match_all_or_no_wide_chars || (
6414 #if PCRE2_CODE_UNIT_WIDTH == 8
6415            utf &&
6416 #endif
6417            should_flip_negation && !negate_class && (options & PCRE2_UCP) == 0))
6418         {
6419         *class_uchardata++ = XCL_RANGE;
6420         if (utf)   /* Will always be utf in the 8-bit library */
6421           {
6422           class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
6423           class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata);
6424           }
6425         else       /* Can only happen for the 16-bit & 32-bit libraries */
6426           {
6427 #if PCRE2_CODE_UNIT_WIDTH == 16
6428           *class_uchardata++ = 0x100;
6429           *class_uchardata++ = 0xffffu;
6430 #elif PCRE2_CODE_UNIT_WIDTH == 32
6431           *class_uchardata++ = 0x100;
6432           *class_uchardata++ = 0xffffffffu;
6433 #endif
6434           }
6435         }
6436       *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
6437       *code++ = OP_XCLASS;
6438       code += LINK_SIZE;
6439       *code = negate_class? XCL_NOT:0;
6440       if (xclass_has_prop) *code |= XCL_HASPROP;
6441 
6442       /* If the map is required, move up the extra data to make room for it;
6443       otherwise just move the code pointer to the end of the extra data. */
6444 
6445       if (class_has_8bitchar > 0)
6446         {
6447         *code++ |= XCL_MAP;
6448         (void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,
6449           CU2BYTES(class_uchardata - code));
6450         if (negate_class && !xclass_has_prop)
6451           {
6452           /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
6453           for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
6454           }
6455         memcpy(code, classbits, 32);
6456         code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
6457         }
6458       else code = class_uchardata;
6459 
6460       /* Now fill in the complete length of the item */
6461 
6462       PUT(previous, 1, (int)(code - previous));
6463       break;   /* End of class handling */
6464       }
6465 #endif  /* SUPPORT_WIDE_CHARS */
6466 
6467     /* If there are no characters > 255, or they are all to be included or
6468     excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
6469     whole class was negated and whether there were negative specials such as \S
6470     (non-UCP) in the class. Then copy the 32-byte map into the code vector,
6471     negating it if necessary. */
6472 
6473     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
6474     if (lengthptr == NULL)    /* Save time in the pre-compile phase */
6475       {
6476       if (negate_class)
6477         {
6478        /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
6479        for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
6480        }
6481       memcpy(code, classbits, 32);
6482       }
6483     code += 32 / sizeof(PCRE2_UCHAR);
6484     break;  /* End of class processing */
6485 
6486 
6487     /* ===================================================================*/
6488     /* Deal with (*VERB)s. */
6489 
6490     /* Check for open captures before ACCEPT and close those that are within
6491     the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an
6492     assertion. In the first pass, just accumulate the length required;
6493     otherwise hitting (*ACCEPT) inside many nested parentheses can cause
6494     workspace overflow. Do not set firstcu after *ACCEPT. */
6495 
6496     case META_ACCEPT:
6497     cb->had_accept = had_accept = TRUE;
6498     for (oc = open_caps;
6499          oc != NULL && oc->assert_depth >= cb->assert_depth;
6500          oc = oc->next)
6501       {
6502       if (lengthptr != NULL)
6503         {
6504         *lengthptr += CU2BYTES(1) + IMM2_SIZE;
6505         }
6506       else
6507         {
6508         *code++ = OP_CLOSE;
6509         PUT2INC(code, 0, oc->number);
6510         }
6511       }
6512     *code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6513     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6514     break;
6515 
6516     case META_PRUNE:
6517     case META_SKIP:
6518     cb->had_pruneorskip = TRUE;
6519     /* Fall through */
6520     case META_COMMIT:
6521     case META_FAIL:
6522     *code++ = verbops[(meta - META_MARK) >> 16];
6523     break;
6524 
6525     case META_THEN:
6526     cb->external_flags |= PCRE2_HASTHEN;
6527     *code++ = OP_THEN;
6528     break;
6529 
6530     /* Handle verbs with arguments. Arguments can be very long, especially in
6531     16- and 32-bit modes, and can overflow the workspace in the first pass.
6532     However, the argument length is constrained to be small enough to fit in
6533     one code unit. This check happens in parse_regex(). In the first pass,
6534     instead of putting the argument into memory, we just update the length
6535     counter and set up an empty argument. */
6536 
6537     case META_THEN_ARG:
6538     cb->external_flags |= PCRE2_HASTHEN;
6539     goto VERB_ARG;
6540 
6541     case META_PRUNE_ARG:
6542     case META_SKIP_ARG:
6543     cb->had_pruneorskip = TRUE;
6544     /* Fall through */
6545     case META_MARK:
6546     case META_COMMIT_ARG:
6547     VERB_ARG:
6548     *code++ = verbops[(meta - META_MARK) >> 16];
6549     /* The length is in characters. */
6550     verbarglen = *(++pptr);
6551     verbculen = 0;
6552     tempcode = code++;
6553     for (int i = 0; i < (int)verbarglen; i++)
6554       {
6555       meta = *(++pptr);
6556 #ifdef SUPPORT_UNICODE
6557       if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
6558 #endif
6559         {
6560         mclength = 1;
6561         mcbuffer[0] = meta;
6562         }
6563       if (lengthptr != NULL) *lengthptr += mclength; else
6564         {
6565         memcpy(code, mcbuffer, CU2BYTES(mclength));
6566         code += mclength;
6567         verbculen += mclength;
6568         }
6569       }
6570 
6571     *tempcode = verbculen;   /* Fill in the code unit length */
6572     *code++ = 0;             /* Terminating zero */
6573     break;
6574 
6575 
6576     /* ===================================================================*/
6577     /* Handle options change. The new setting must be passed back for use in
6578     subsequent branches. Reset the greedy defaults and the case value for
6579     firstcu and reqcu. */
6580 
6581     case META_OPTIONS:
6582     *optionsptr = options = *(++pptr);
6583     *xoptionsptr = xoptions = *(++pptr);
6584     greedy_default = ((options & PCRE2_UNGREEDY) != 0);
6585     greedy_non_default = greedy_default ^ 1;
6586     req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6587     break;
6588 
6589 
6590     /* ===================================================================*/
6591     /* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous
6592     because it could be a numerical check on recursion, or a name check on a
6593     group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that
6594     we can handle it either way. We first try for a name; if not found, process
6595     the number. */
6596 
6597     case META_COND_RNUMBER:   /* (?(Rdigits) */
6598     case META_COND_NAME:      /* (?(name) or (?'name') or ?(<name>) */
6599     case META_COND_RNAME:     /* (?(R&name) - test for recursion */
6600     bravalue = OP_COND;
6601       {
6602       int count, index;
6603       unsigned int i;
6604       PCRE2_SPTR name;
6605       named_group *ng = cb->named_groups;
6606       uint32_t length = *(++pptr);
6607 
6608       GETPLUSOFFSET(offset, pptr);
6609       name = cb->start_pattern + offset;
6610 
6611       /* In the first pass, the names generated in the pre-pass are available,
6612       but the main name table has not yet been created. Scan the list of names
6613       generated in the pre-pass in order to get a number and whether or not
6614       this name is duplicated. If it is not duplicated, we can handle it as a
6615       numerical group. */
6616 
6617       for (i = 0; i < cb->names_found; i++, ng++)
6618         {
6619         if (length == ng->length &&
6620             PRIV(strncmp)(name, ng->name, length) == 0)
6621           {
6622           if (!ng->isdup)
6623             {
6624             code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6625             PUT2(code, 2+LINK_SIZE, ng->number);
6626             if (ng->number > cb->top_backref) cb->top_backref = ng->number;
6627             skipunits = 1+IMM2_SIZE;
6628             goto GROUP_PROCESS_NOTE_EMPTY;
6629             }
6630           break;  /* Found a duplicated name */
6631           }
6632         }
6633 
6634       /* If the name was not found we have a bad reference, unless we are
6635       dealing with R<digits>, which is treated as a recursion test by number.
6636       */
6637 
6638       if (i >= cb->names_found)
6639         {
6640         groupnumber = 0;
6641         if (meta == META_COND_RNUMBER)
6642           {
6643           for (i = 1; i < length; i++)
6644             {
6645             groupnumber = groupnumber * 10 + name[i] - CHAR_0;
6646             if (groupnumber > MAX_GROUP_NUMBER)
6647               {
6648               *errorcodeptr = ERR61;
6649               cb->erroroffset = offset + i;
6650               return 0;
6651               }
6652             }
6653           }
6654 
6655         if (meta != META_COND_RNUMBER || groupnumber > cb->bracount)
6656           {
6657           *errorcodeptr = ERR15;
6658           cb->erroroffset = offset;
6659           return 0;
6660           }
6661 
6662         /* (?(Rdigits) is treated as a recursion test by number. A value of
6663         zero (which is the result of both (?(R) and (?(R0)) means "any", and is
6664         translated into RREF_ANY (which is 0xffff). */
6665 
6666         if (groupnumber == 0) groupnumber = RREF_ANY;
6667         code[1+LINK_SIZE] = OP_RREF;
6668         PUT2(code, 2+LINK_SIZE, groupnumber);
6669         skipunits = 1+IMM2_SIZE;
6670         goto GROUP_PROCESS_NOTE_EMPTY;
6671         }
6672 
6673       /* A duplicated name was found. Note that if an R<digits> name is found
6674       (META_COND_RNUMBER), it is a reference test, not a recursion test. */
6675 
6676       code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6677 
6678       /* We have a duplicated name. In the compile pass we have to search the
6679       main table in order to get the index and count values. */
6680 
6681       count = 0;  /* Values for first pass (avoids compiler warning) */
6682       index = 0;
6683       if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6684             &count, errorcodeptr, cb)) return 0;
6685 
6686       /* Add one to the opcode to change CREF/RREF into DNCREF/DNRREF and
6687       insert appropriate data values. */
6688 
6689       code[1+LINK_SIZE]++;
6690       skipunits = 1+2*IMM2_SIZE;
6691       PUT2(code, 2+LINK_SIZE, index);
6692       PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6693       }
6694     goto GROUP_PROCESS_NOTE_EMPTY;
6695 
6696     /* The DEFINE condition is always false. Its internal groups may never
6697     be called, so matched_char must remain false, hence the jump to
6698     GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
6699 
6700     case META_COND_DEFINE:
6701     bravalue = OP_COND;
6702     GETPLUSOFFSET(offset, pptr);
6703     code[1+LINK_SIZE] = OP_DEFINE;
6704     skipunits = 1;
6705     goto GROUP_PROCESS;
6706 
6707     /* Conditional test of a group's being set. */
6708 
6709     case META_COND_NUMBER:
6710     bravalue = OP_COND;
6711     GETPLUSOFFSET(offset, pptr);
6712     groupnumber = *(++pptr);
6713     if (groupnumber > cb->bracount)
6714       {
6715       *errorcodeptr = ERR15;
6716       cb->erroroffset = offset;
6717       return 0;
6718       }
6719     if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
6720     offset -= 2;   /* Point at initial ( for too many branches error */
6721     code[1+LINK_SIZE] = OP_CREF;
6722     skipunits = 1+IMM2_SIZE;
6723     PUT2(code, 2+LINK_SIZE, groupnumber);
6724     goto GROUP_PROCESS_NOTE_EMPTY;
6725 
6726     /* Test for the PCRE2 version. */
6727 
6728     case META_COND_VERSION:
6729     bravalue = OP_COND;
6730     if (pptr[1] > 0)
6731       code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) ||
6732         (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))?
6733           OP_TRUE : OP_FALSE;
6734     else
6735       code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])?
6736         OP_TRUE : OP_FALSE;
6737     skipunits = 1;
6738     pptr += 3;
6739     goto GROUP_PROCESS_NOTE_EMPTY;
6740 
6741     /* The condition is an assertion, possibly preceded by a callout. */
6742 
6743     case META_COND_ASSERT:
6744     bravalue = OP_COND;
6745     goto GROUP_PROCESS_NOTE_EMPTY;
6746 
6747 
6748     /* ===================================================================*/
6749     /* Handle all kinds of nested bracketed groups. The non-capturing,
6750     non-conditional cases are here; others come to GROUP_PROCESS via goto. */
6751 
6752     case META_LOOKAHEAD:
6753     bravalue = OP_ASSERT;
6754     cb->assert_depth += 1;
6755     goto GROUP_PROCESS;
6756 
6757     case META_LOOKAHEAD_NA:
6758     bravalue = OP_ASSERT_NA;
6759     cb->assert_depth += 1;
6760     goto GROUP_PROCESS;
6761 
6762     /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6763     thing to do, but Perl allows all assertions to be quantified, and when
6764     they contain capturing parentheses there may be a potential use for
6765     this feature. Not that that applies to a quantified (?!) but we allow
6766     it for uniformity. */
6767 
6768     case META_LOOKAHEADNOT:
6769     if (pptr[1] == META_KET &&
6770          (pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY))
6771       {
6772       *code++ = OP_FAIL;
6773       pptr++;
6774       }
6775     else
6776       {
6777       bravalue = OP_ASSERT_NOT;
6778       cb->assert_depth += 1;
6779       goto GROUP_PROCESS;
6780       }
6781     break;
6782 
6783     case META_LOOKBEHIND:
6784     bravalue = OP_ASSERTBACK;
6785     cb->assert_depth += 1;
6786     goto GROUP_PROCESS;
6787 
6788     case META_LOOKBEHINDNOT:
6789     bravalue = OP_ASSERTBACK_NOT;
6790     cb->assert_depth += 1;
6791     goto GROUP_PROCESS;
6792 
6793     case META_LOOKBEHIND_NA:
6794     bravalue = OP_ASSERTBACK_NA;
6795     cb->assert_depth += 1;
6796     goto GROUP_PROCESS;
6797 
6798     case META_ATOMIC:
6799     bravalue = OP_ONCE;
6800     goto GROUP_PROCESS_NOTE_EMPTY;
6801 
6802     case META_SCRIPT_RUN:
6803     bravalue = OP_SCRIPT_RUN;
6804     goto GROUP_PROCESS_NOTE_EMPTY;
6805 
6806     case META_NOCAPTURE:
6807     bravalue = OP_BRA;
6808     /* Fall through */
6809 
6810     /* Process nested bracketed regex. The nesting depth is maintained for the
6811     benefit of the stackguard function. The test for too deep nesting is now
6812     done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
6813     others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
6814     note of whether or not they may match an empty string. */
6815 
6816     GROUP_PROCESS_NOTE_EMPTY:
6817     note_group_empty = TRUE;
6818 
6819     GROUP_PROCESS:
6820     cb->parens_depth += 1;
6821     *code = bravalue;
6822     pptr++;
6823     tempcode = code;
6824     tempreqvary = cb->req_varyopt;        /* Save value before group */
6825     length_prevgroup = 0;                 /* Initialize for pre-compile phase */
6826 
6827     if ((group_return =
6828          compile_regex(
6829          options,                         /* The options state */
6830          xoptions,                        /* The extra options state */
6831          &tempcode,                       /* Where to put code (updated) */
6832          &pptr,                           /* Input pointer (updated) */
6833          errorcodeptr,                    /* Where to put an error message */
6834          skipunits,                       /* Skip over bracket number */
6835          &subfirstcu,                     /* For possible first char */
6836          &subfirstcuflags,
6837          &subreqcu,                       /* For possible last char */
6838          &subreqcuflags,
6839          bcptr,                           /* Current branch chain */
6840          open_caps,                       /* Pointer to capture stack */
6841          cb,                              /* Compile data block */
6842          (lengthptr == NULL)? NULL :      /* Actual compile phase */
6843            &length_prevgroup              /* Pre-compile phase */
6844          )) == 0)
6845       return 0;  /* Error */
6846 
6847     cb->parens_depth -= 1;
6848 
6849     /* If that was a non-conditional significant group (not an assertion, not a
6850     DEFINE) that matches at least one character, then the current item matches
6851     a character. Conditionals are handled below. */
6852 
6853     if (note_group_empty && bravalue != OP_COND && group_return > 0)
6854       matched_char = TRUE;
6855 
6856     /* If we've just compiled an assertion, pop the assert depth. */
6857 
6858     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NA)
6859       cb->assert_depth -= 1;
6860 
6861     /* At the end of compiling, code is still pointing to the start of the
6862     group, while tempcode has been updated to point past the end of the group.
6863     The parsed pattern pointer (pptr) is on the closing META_KET.
6864 
6865     If this is a conditional bracket, check that there are no more than
6866     two branches in the group, or just one if it's a DEFINE group. We do this
6867     in the real compile phase, not in the pre-pass, where the whole group may
6868     not be available. */
6869 
6870     if (bravalue == OP_COND && lengthptr == NULL)
6871       {
6872       PCRE2_UCHAR *tc = code;
6873       int condcount = 0;
6874 
6875       do {
6876          condcount++;
6877          tc += GET(tc,1);
6878          }
6879       while (*tc != OP_KET);
6880 
6881       /* A DEFINE group is never obeyed inline (the "condition" is always
6882       false). It must have only one branch. Having checked this, change the
6883       opcode to OP_FALSE. */
6884 
6885       if (code[LINK_SIZE+1] == OP_DEFINE)
6886         {
6887         if (condcount > 1)
6888           {
6889           cb->erroroffset = offset;
6890           *errorcodeptr = ERR54;
6891           return 0;
6892           }
6893         code[LINK_SIZE+1] = OP_FALSE;
6894         bravalue = OP_DEFINE;   /* A flag to suppress char handling below */
6895         }
6896 
6897       /* A "normal" conditional group. If there is just one branch, we must not
6898       make use of its firstcu or reqcu, because this is equivalent to an
6899       empty second branch. Also, it may match an empty string. If there are two
6900       branches, this item must match a character if the group must. */
6901 
6902       else
6903         {
6904         if (condcount > 2)
6905           {
6906           cb->erroroffset = offset;
6907           *errorcodeptr = ERR27;
6908           return 0;
6909           }
6910         if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
6911           else if (group_return > 0) matched_char = TRUE;
6912         }
6913       }
6914 
6915     /* In the pre-compile phase, update the length by the length of the group,
6916     less the brackets at either end. Then reduce the compiled code to just a
6917     set of non-capturing brackets so that it doesn't use much memory if it is
6918     duplicated by a quantifier. */
6919 
6920     if (lengthptr != NULL)
6921       {
6922       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
6923         {
6924         *errorcodeptr = ERR20;
6925         return 0;
6926         }
6927       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
6928       code++;   /* This already contains bravalue */
6929       PUTINC(code, 0, 1 + LINK_SIZE);
6930       *code++ = OP_KET;
6931       PUTINC(code, 0, 1 + LINK_SIZE);
6932       break;    /* No need to waste time with special character handling */
6933       }
6934 
6935     /* Otherwise update the main code pointer to the end of the group. */
6936 
6937     code = tempcode;
6938 
6939     /* For a DEFINE group, required and first character settings are not
6940     relevant. */
6941 
6942     if (bravalue == OP_DEFINE) break;
6943 
6944     /* Handle updating of the required and first code units for other types of
6945     group. Update for normal brackets of all kinds, and conditions with two
6946     branches (see code above). If the bracket is followed by a quantifier with
6947     zero repeat, we have to back off. Hence the definition of zeroreqcu and
6948     zerofirstcu outside the main loop so that they can be accessed for the back
6949     off. */
6950 
6951     zeroreqcu = reqcu;
6952     zeroreqcuflags = reqcuflags;
6953     zerofirstcu = firstcu;
6954     zerofirstcuflags = firstcuflags;
6955     groupsetfirstcu = FALSE;
6956 
6957     if (bravalue >= OP_ONCE)  /* Not an assertion */
6958       {
6959       /* If we have not yet set a firstcu in this branch, take it from the
6960       subpattern, remembering that it was set here so that a repeat of more
6961       than one can replicate it as reqcu if necessary. If the subpattern has
6962       no firstcu, set "none" for the whole branch. In both cases, a zero
6963       repeat forces firstcu to "none". */
6964 
6965       if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
6966         {
6967         if (subfirstcuflags < REQ_NONE)
6968           {
6969           firstcu = subfirstcu;
6970           firstcuflags = subfirstcuflags;
6971           groupsetfirstcu = TRUE;
6972           }
6973         else firstcuflags = REQ_NONE;
6974         zerofirstcuflags = REQ_NONE;
6975         }
6976 
6977       /* If firstcu was previously set, convert the subpattern's firstcu
6978       into reqcu if there wasn't one, using the vary flag that was in
6979       existence beforehand. */
6980 
6981       else if (subfirstcuflags < REQ_NONE && subreqcuflags >= REQ_NONE)
6982         {
6983         subreqcu = subfirstcu;
6984         subreqcuflags = subfirstcuflags | tempreqvary;
6985         }
6986 
6987       /* If the subpattern set a required code unit (or set a first code unit
6988       that isn't really the first code unit - see above), set it. */
6989 
6990       if (subreqcuflags < REQ_NONE)
6991         {
6992         reqcu = subreqcu;
6993         reqcuflags = subreqcuflags;
6994         }
6995       }
6996 
6997     /* For a forward assertion, we take the reqcu, if set, provided that the
6998     group has also set a firstcu. This can be helpful if the pattern that
6999     follows the assertion doesn't set a different char. For example, it's
7000     useful for /(?=abcde).+/. We can't set firstcu for an assertion, however
7001     because it leads to incorrect effect for patterns such as /(?=a)a.+/ when
7002     the "real" "a" would then become a reqcu instead of a firstcu. This is
7003     overcome by a scan at the end if there's no firstcu, looking for an
7004     asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
7005     we must only take the reqcu when the group also set a firstcu. Otherwise,
7006     in that example, 'X' ends up set for both. */
7007 
7008     else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) &&
7009              subreqcuflags < REQ_NONE && subfirstcuflags < REQ_NONE)
7010       {
7011       reqcu = subreqcu;
7012       reqcuflags = subreqcuflags;
7013       }
7014 
7015     break;  /* End of nested group handling */
7016 
7017 
7018     /* ===================================================================*/
7019     /* Handle named backreferences and recursions. */
7020 
7021     case META_BACKREF_BYNAME:
7022     case META_RECURSE_BYNAME:
7023       {
7024       int count, index;
7025       PCRE2_SPTR name;
7026       BOOL is_dupname = FALSE;
7027       named_group *ng = cb->named_groups;
7028       uint32_t length = *(++pptr);
7029 
7030       GETPLUSOFFSET(offset, pptr);
7031       name = cb->start_pattern + offset;
7032 
7033       /* In the first pass, the names generated in the pre-pass are available,
7034       but the main name table has not yet been created. Scan the list of names
7035       generated in the pre-pass in order to get a number and whether or not
7036       this name is duplicated. */
7037 
7038       groupnumber = 0;
7039       for (unsigned int i = 0; i < cb->names_found; i++, ng++)
7040         {
7041         if (length == ng->length &&
7042             PRIV(strncmp)(name, ng->name, length) == 0)
7043           {
7044           is_dupname = ng->isdup;
7045           groupnumber = ng->number;
7046 
7047           /* For a recursion, that's all that is needed. We can now go to
7048           the code that handles numerical recursion, applying it to the first
7049           group with the given name. */
7050 
7051           if (meta == META_RECURSE_BYNAME)
7052             {
7053             meta_arg = groupnumber;
7054             goto HANDLE_NUMERICAL_RECURSION;
7055             }
7056 
7057           /* For a back reference, update the back reference map and the
7058           maximum back reference. */
7059 
7060           cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
7061           if (groupnumber > cb->top_backref)
7062             cb->top_backref = groupnumber;
7063           }
7064         }
7065 
7066       /* If the name was not found we have a bad reference. */
7067 
7068       if (groupnumber == 0)
7069         {
7070         *errorcodeptr = ERR15;
7071         cb->erroroffset = offset;
7072         return 0;
7073         }
7074 
7075       /* If a back reference name is not duplicated, we can handle it as
7076       a numerical reference. */
7077 
7078       if (!is_dupname)
7079         {
7080         meta_arg = groupnumber;
7081         goto HANDLE_SINGLE_REFERENCE;
7082         }
7083 
7084       /* If a back reference name is duplicated, we generate a different
7085       opcode to a numerical back reference. In the second pass we must
7086       search for the index and count in the final name table. */
7087 
7088       count = 0;  /* Values for first pass (avoids compiler warning) */
7089       index = 0;
7090       if (lengthptr == NULL && !find_dupname_details(name, length, &index,
7091             &count, errorcodeptr, cb)) return 0;
7092 
7093       if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7094       *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
7095       PUT2INC(code, 0, index);
7096       PUT2INC(code, 0, count);
7097       }
7098     break;
7099 
7100 
7101     /* ===================================================================*/
7102     /* Handle a numerical callout. */
7103 
7104     case META_CALLOUT_NUMBER:
7105     code[0] = OP_CALLOUT;
7106     PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
7107     PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
7108     code[1 + 2*LINK_SIZE] = pptr[3];
7109     pptr += 3;
7110     code += PRIV(OP_lengths)[OP_CALLOUT];
7111     break;
7112 
7113 
7114     /* ===================================================================*/
7115     /* Handle a callout with a string argument. In the pre-pass we just compute
7116     the length without generating anything. The length in pptr[3] includes both
7117     delimiters; in the actual compile only the first one is copied, but a
7118     terminating zero is added. Any doubled delimiters within the string make
7119     this an overestimate, but it is not worth bothering about. */
7120 
7121     case META_CALLOUT_STRING:
7122     if (lengthptr != NULL)
7123       {
7124       *lengthptr += pptr[3] + (1 + 4*LINK_SIZE);
7125       pptr += 3;
7126       SKIPOFFSET(pptr);
7127       }
7128 
7129     /* In the real compile we can copy the string. The starting delimiter is
7130      included so that the client can discover it if they want. We also pass the
7131      start offset to help a script language give better error messages. */
7132 
7133     else
7134       {
7135       PCRE2_SPTR pp;
7136       uint32_t delimiter;
7137       uint32_t length = pptr[3];
7138       PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
7139 
7140       code[0] = OP_CALLOUT_STR;
7141       PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
7142       PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
7143 
7144       pptr += 3;
7145       GETPLUSOFFSET(offset, pptr);         /* Offset to string in pattern */
7146       pp = cb->start_pattern + offset;
7147       delimiter = *callout_string++ = *pp++;
7148       if (delimiter == CHAR_LEFT_CURLY_BRACKET)
7149         delimiter = CHAR_RIGHT_CURLY_BRACKET;
7150       PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1));  /* One after delimiter */
7151 
7152       /* The syntax of the pattern was checked in the parsing scan. The length
7153       includes both delimiters, but we have passed the opening one just above,
7154       so we reduce length before testing it. The test is for > 1 because we do
7155       not want to copy the final delimiter. This also ensures that pp[1] is
7156       accessible. */
7157 
7158       while (--length > 1)
7159         {
7160         if (*pp == delimiter && pp[1] == delimiter)
7161           {
7162           *callout_string++ = delimiter;
7163           pp += 2;
7164           length--;
7165           }
7166         else *callout_string++ = *pp++;
7167         }
7168       *callout_string++ = CHAR_NUL;
7169 
7170       /* Set the length of the entire item, then advance to its end. */
7171 
7172       PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code));
7173       code = callout_string;
7174       }
7175     break;
7176 
7177 
7178     /* ===================================================================*/
7179     /* Handle repetition. The different types are all sorted out in the parsing
7180     pass. */
7181 
7182     case META_MINMAX_PLUS:
7183     case META_MINMAX_QUERY:
7184     case META_MINMAX:
7185     repeat_min = *(++pptr);
7186     repeat_max = *(++pptr);
7187     goto REPEAT;
7188 
7189     case META_ASTERISK:
7190     case META_ASTERISK_PLUS:
7191     case META_ASTERISK_QUERY:
7192     repeat_min = 0;
7193     repeat_max = REPEAT_UNLIMITED;
7194     goto REPEAT;
7195 
7196     case META_PLUS:
7197     case META_PLUS_PLUS:
7198     case META_PLUS_QUERY:
7199     repeat_min = 1;
7200     repeat_max = REPEAT_UNLIMITED;
7201     goto REPEAT;
7202 
7203     case META_QUERY:
7204     case META_QUERY_PLUS:
7205     case META_QUERY_QUERY:
7206     repeat_min = 0;
7207     repeat_max = 1;
7208 
7209     REPEAT:
7210     if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
7211 
7212     /* Remember whether this is a variable length repeat, and default to
7213     single-char opcodes. */
7214 
7215     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
7216     op_type = 0;
7217 
7218     /* Adjust first and required code units for a zero repeat. */
7219 
7220     if (repeat_min == 0)
7221       {
7222       firstcu = zerofirstcu;
7223       firstcuflags = zerofirstcuflags;
7224       reqcu = zeroreqcu;
7225       reqcuflags = zeroreqcuflags;
7226       }
7227 
7228     /* Note the greediness and possessiveness. */
7229 
7230     switch (meta)
7231       {
7232       case META_MINMAX_PLUS:
7233       case META_ASTERISK_PLUS:
7234       case META_PLUS_PLUS:
7235       case META_QUERY_PLUS:
7236       repeat_type = 0;                  /* Force greedy */
7237       possessive_quantifier = TRUE;
7238       break;
7239 
7240       case META_MINMAX_QUERY:
7241       case META_ASTERISK_QUERY:
7242       case META_PLUS_QUERY:
7243       case META_QUERY_QUERY:
7244       repeat_type = greedy_non_default;
7245       possessive_quantifier = FALSE;
7246       break;
7247 
7248       default:
7249       repeat_type = greedy_default;
7250       possessive_quantifier = FALSE;
7251       break;
7252       }
7253 
7254     /* Save start of previous item, in case we have to move it up in order to
7255     insert something before it, and remember what it was. */
7256 
7257     tempcode = previous;
7258     op_previous = *previous;
7259 
7260     /* Now handle repetition for the different types of item. If the repeat
7261     minimum and the repeat maximum are both 1, we can ignore the quantifier for
7262     non-parenthesized items, as they have only one alternative. For anything in
7263     parentheses, we must not ignore if {1} is possessive. */
7264 
7265     switch (op_previous)
7266       {
7267       /* If previous was a character or negated character match, abolish the
7268       item and generate a repeat item instead. If a char item has a minimum of
7269       more than one, ensure that it is set in reqcu - it might not be if a
7270       sequence such as x{3} is the first thing in a branch because the x will
7271       have gone into firstcu instead.  */
7272 
7273       case OP_CHAR:
7274       case OP_CHARI:
7275       case OP_NOT:
7276       case OP_NOTI:
7277       if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7278       op_type = chartypeoffset[op_previous - OP_CHAR];
7279 
7280       /* Deal with UTF characters that take up more than one code unit. */
7281 
7282 #ifdef MAYBE_UTF_MULTI
7283       if (utf && NOT_FIRSTCU(code[-1]))
7284         {
7285         PCRE2_UCHAR *lastchar = code - 1;
7286         BACKCHAR(lastchar);
7287         mclength = (uint32_t)(code - lastchar);   /* Length of UTF character */
7288         memcpy(mcbuffer, lastchar, CU2BYTES(mclength));  /* Save the char */
7289         }
7290       else
7291 #endif  /* MAYBE_UTF_MULTI */
7292 
7293       /* Handle the case of a single code unit - either with no UTF support, or
7294       with UTF disabled, or for a single-code-unit UTF character. In the latter
7295       case, for a repeated positive match, get the caseless flag for the
7296       required code unit from the previous character, because a class like [Aa]
7297       sets a caseless A but by now the req_caseopt flag has been reset. */
7298 
7299         {
7300         mcbuffer[0] = code[-1];
7301         mclength = 1;
7302         if (op_previous <= OP_CHARI && repeat_min > 1)
7303           {
7304           reqcu = mcbuffer[0];
7305           reqcuflags = cb->req_varyopt;
7306           if (op_previous == OP_CHARI) reqcuflags |= REQ_CASELESS;
7307           }
7308         }
7309       goto OUTPUT_SINGLE_REPEAT;  /* Code shared with single character types */
7310 
7311       /* If previous was a character class or a back reference, we put the
7312       repeat stuff after it, but just skip the item if the repeat was {0,0}. */
7313 
7314 #ifdef SUPPORT_WIDE_CHARS
7315       case OP_XCLASS:
7316 #endif
7317       case OP_CLASS:
7318       case OP_NCLASS:
7319       case OP_REF:
7320       case OP_REFI:
7321       case OP_DNREF:
7322       case OP_DNREFI:
7323 
7324       if (repeat_max == 0)
7325         {
7326         code = previous;
7327         goto END_REPEAT;
7328         }
7329       if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7330 
7331       if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)
7332         *code++ = OP_CRSTAR + repeat_type;
7333       else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED)
7334         *code++ = OP_CRPLUS + repeat_type;
7335       else if (repeat_min == 0 && repeat_max == 1)
7336         *code++ = OP_CRQUERY + repeat_type;
7337       else
7338         {
7339         *code++ = OP_CRRANGE + repeat_type;
7340         PUT2INC(code, 0, repeat_min);
7341         if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0;  /* 2-byte encoding for max */
7342         PUT2INC(code, 0, repeat_max);
7343         }
7344       break;
7345 
7346       /* If previous is OP_FAIL, it was generated by an empty class []
7347       (PCRE2_ALLOW_EMPTY_CLASS is set). The other ways in which OP_FAIL can be
7348       generated, that is by (*FAIL) or (?!), disallow a quantifier at parse
7349       time. We can just ignore this repeat. */
7350 
7351       case OP_FAIL:
7352       goto END_REPEAT;
7353 
7354       /* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets
7355       because pcre2_match() could not handle backtracking into recursively
7356       called groups. Now that this backtracking is available, we no longer need
7357       to do this. However, we still need to replicate recursions as we do for
7358       groups so as to have independent backtracking points. We can replicate
7359       for the minimum number of repeats directly. For optional repeats we now
7360       wrap the recursion in OP_BRA brackets and make use of the bracket
7361       repetition. */
7362 
7363       case OP_RECURSE:
7364       if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7365         goto END_REPEAT;
7366 
7367       /* Generate unwrapped repeats for a non-zero minimum, except when the
7368       minimum is 1 and the maximum unlimited, because that can be handled with
7369       OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the
7370       minimum, we just need to generate the appropriate additional copies.
7371       Otherwise we need to generate one more, to simulate the situation when
7372       the minimum is zero. */
7373 
7374       if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED))
7375         {
7376         int replicate = repeat_min;
7377         if (repeat_min == repeat_max) replicate--;
7378 
7379         /* In the pre-compile phase, we don't actually do the replication. We
7380         just adjust the length as if we had. Do some paranoid checks for
7381         potential integer overflow. */
7382 
7383         if (lengthptr != NULL)
7384           {
7385           PCRE2_SIZE delta;
7386           if (PRIV(ckd_smul)(&delta, replicate, 1 + LINK_SIZE) ||
7387               OFLOW_MAX - *lengthptr < delta)
7388             {
7389             *errorcodeptr = ERR20;
7390             return 0;
7391             }
7392           *lengthptr += delta;
7393           }
7394 
7395         else for (int i = 0; i < replicate; i++)
7396           {
7397           memcpy(code, previous, CU2BYTES(1 + LINK_SIZE));
7398           previous = code;
7399           code += 1 + LINK_SIZE;
7400           }
7401 
7402         /* If the number of repeats is fixed, we are done. Otherwise, adjust
7403         the counts and fall through. */
7404 
7405         if (repeat_min == repeat_max) break;
7406         if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7407         repeat_min = 0;
7408         }
7409 
7410       /* Wrap the recursion call in OP_BRA brackets. */
7411 
7412       (void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE));
7413       op_previous = *previous = OP_BRA;
7414       PUT(previous, 1, 2 + 2*LINK_SIZE);
7415       previous[2 + 2*LINK_SIZE] = OP_KET;
7416       PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
7417       code += 2 + 2 * LINK_SIZE;
7418       length_prevgroup = 3 + 3*LINK_SIZE;
7419       group_return = -1;  /* Set "may match empty string" */
7420 
7421       /* Now treat as a repeated OP_BRA. */
7422       /* Fall through */
7423 
7424       /* If previous was a bracket group, we may have to replicate it in
7425       certain cases. Note that at this point we can encounter only the "basic"
7426       bracket opcodes such as BRA and CBRA, as this is the place where they get
7427       converted into the more special varieties such as BRAPOS and SBRA.
7428       Originally, PCRE did not allow repetition of assertions, but now it does,
7429       for Perl compatibility. */
7430 
7431       case OP_ASSERT:
7432       case OP_ASSERT_NOT:
7433       case OP_ASSERT_NA:
7434       case OP_ASSERTBACK:
7435       case OP_ASSERTBACK_NOT:
7436       case OP_ASSERTBACK_NA:
7437       case OP_ONCE:
7438       case OP_SCRIPT_RUN:
7439       case OP_BRA:
7440       case OP_CBRA:
7441       case OP_COND:
7442         {
7443         int len = (int)(code - previous);
7444         PCRE2_UCHAR *bralink = NULL;
7445         PCRE2_UCHAR *brazeroptr = NULL;
7446 
7447         if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7448           goto END_REPEAT;
7449 
7450         /* Repeating a DEFINE group (or any group where the condition is always
7451         FALSE and there is only one branch) is pointless, but Perl allows the
7452         syntax, so we just ignore the repeat. */
7453 
7454         if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
7455             previous[GET(previous, 1)] != OP_ALT)
7456           goto END_REPEAT;
7457 
7458         /* Perl allows all assertions to be quantified, and when they contain
7459         capturing parentheses and/or are optional there are potential uses for
7460         this feature. PCRE2 used to force the maximum quantifier to 1 on the
7461         invalid grounds that further repetition was never useful. This was
7462         always a bit pointless, since an assertion could be wrapped with a
7463         repeated group to achieve the effect. General repetition is now
7464         permitted, but if the maximum is unlimited it is set to one more than
7465         the minimum. */
7466 
7467         if (op_previous < OP_ONCE)    /* Assertion */
7468           {
7469           if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1;
7470           }
7471 
7472         /* The case of a zero minimum is special because of the need to stick
7473         OP_BRAZERO in front of it, and because the group appears once in the
7474         data, whereas in other cases it appears the minimum number of times. For
7475         this reason, it is simplest to treat this case separately, as otherwise
7476         the code gets far too messy. There are several special subcases when the
7477         minimum is zero. */
7478 
7479         if (repeat_min == 0)
7480           {
7481           /* If the maximum is also zero, we used to just omit the group from
7482           the output altogether, like this:
7483 
7484           ** if (repeat_max == 0)
7485           **   {
7486           **   code = previous;
7487           **   goto END_REPEAT;
7488           **   }
7489 
7490           However, that fails when a group or a subgroup within it is
7491           referenced as a subroutine from elsewhere in the pattern, so now we
7492           stick in OP_SKIPZERO in front of it so that it is skipped on
7493           execution. As we don't have a list of which groups are referenced, we
7494           cannot do this selectively.
7495 
7496           If the maximum is 1 or unlimited, we just have to stick in the
7497           BRAZERO and do no more at this point. */
7498 
7499           if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED)
7500             {
7501             (void)memmove(previous + 1, previous, CU2BYTES(len));
7502             code++;
7503             if (repeat_max == 0)
7504               {
7505               *previous++ = OP_SKIPZERO;
7506               goto END_REPEAT;
7507               }
7508             brazeroptr = previous;    /* Save for possessive optimizing */
7509             *previous++ = OP_BRAZERO + repeat_type;
7510             }
7511 
7512           /* If the maximum is greater than 1 and limited, we have to replicate
7513           in a nested fashion, sticking OP_BRAZERO before each set of brackets.
7514           The first one has to be handled carefully because it's the original
7515           copy, which has to be moved up. The remainder can be handled by code
7516           that is common with the non-zero minimum case below. We have to
7517           adjust the value of repeat_max, since one less copy is required. */
7518 
7519           else
7520             {
7521             int linkoffset;
7522             (void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
7523             code += 2 + LINK_SIZE;
7524             *previous++ = OP_BRAZERO + repeat_type;
7525             *previous++ = OP_BRA;
7526 
7527             /* We chain together the bracket link offset fields that have to be
7528             filled in later when the ends of the brackets are reached. */
7529 
7530             linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink);
7531             bralink = previous;
7532             PUTINC(previous, 0, linkoffset);
7533             }
7534 
7535           if (repeat_max != REPEAT_UNLIMITED) repeat_max--;
7536           }
7537 
7538         /* If the minimum is greater than zero, replicate the group as many
7539         times as necessary, and adjust the maximum to the number of subsequent
7540         copies that we need. */
7541 
7542         else
7543           {
7544           if (repeat_min > 1)
7545             {
7546             /* In the pre-compile phase, we don't actually do the replication.
7547             We just adjust the length as if we had. Do some paranoid checks for
7548             potential integer overflow. */
7549 
7550             if (lengthptr != NULL)
7551               {
7552               PCRE2_SIZE delta;
7553               if (PRIV(ckd_smul)(&delta, repeat_min - 1,
7554                                  (int)length_prevgroup) ||
7555                   OFLOW_MAX - *lengthptr < delta)
7556                 {
7557                 *errorcodeptr = ERR20;
7558                 return 0;
7559                 }
7560               *lengthptr += delta;
7561               }
7562 
7563             /* This is compiling for real. If there is a set first code unit
7564             for the group, and we have not yet set a "required code unit", set
7565             it. */
7566 
7567             else
7568               {
7569               if (groupsetfirstcu && reqcuflags >= REQ_NONE)
7570                 {
7571                 reqcu = firstcu;
7572                 reqcuflags = firstcuflags;
7573                 }
7574               for (uint32_t i = 1; i < repeat_min; i++)
7575                 {
7576                 memcpy(code, previous, CU2BYTES(len));
7577                 code += len;
7578                 }
7579               }
7580             }
7581 
7582           if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7583           }
7584 
7585         /* This code is common to both the zero and non-zero minimum cases. If
7586         the maximum is limited, it replicates the group in a nested fashion,
7587         remembering the bracket starts on a stack. In the case of a zero
7588         minimum, the first one was set up above. In all cases the repeat_max
7589         now specifies the number of additional copies needed. Again, we must
7590         remember to replicate entries on the forward reference list. */
7591 
7592         if (repeat_max != REPEAT_UNLIMITED)
7593           {
7594           /* In the pre-compile phase, we don't actually do the replication. We
7595           just adjust the length as if we had. For each repetition we must add
7596           1 to the length for BRAZERO and for all but the last repetition we
7597           must add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
7598           paranoid checks to avoid integer overflow. */
7599 
7600           if (lengthptr != NULL && repeat_max > 0)
7601             {
7602             PCRE2_SIZE delta;
7603             if (PRIV(ckd_smul)(&delta, repeat_max,
7604                                (int)length_prevgroup + 1 + 2 + 2*LINK_SIZE) ||
7605                 OFLOW_MAX + (2 + 2*LINK_SIZE) - *lengthptr < delta)
7606               {
7607               *errorcodeptr = ERR20;
7608               return 0;
7609               }
7610             delta -= (2 + 2*LINK_SIZE);   /* Last one doesn't nest */
7611             *lengthptr += delta;
7612             }
7613 
7614           /* This is compiling for real */
7615 
7616           else for (uint32_t i = repeat_max; i >= 1; i--)
7617             {
7618             *code++ = OP_BRAZERO + repeat_type;
7619 
7620             /* All but the final copy start a new nesting, maintaining the
7621             chain of brackets outstanding. */
7622 
7623             if (i != 1)
7624               {
7625               int linkoffset;
7626               *code++ = OP_BRA;
7627               linkoffset = (bralink == NULL)? 0 : (int)(code - bralink);
7628               bralink = code;
7629               PUTINC(code, 0, linkoffset);
7630               }
7631 
7632             memcpy(code, previous, CU2BYTES(len));
7633             code += len;
7634             }
7635 
7636           /* Now chain through the pending brackets, and fill in their length
7637           fields (which are holding the chain links pro tem). */
7638 
7639           while (bralink != NULL)
7640             {
7641             int oldlinkoffset;
7642             int linkoffset = (int)(code - bralink + 1);
7643             PCRE2_UCHAR *bra = code - linkoffset;
7644             oldlinkoffset = GET(bra, 1);
7645             bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
7646             *code++ = OP_KET;
7647             PUTINC(code, 0, linkoffset);
7648             PUT(bra, 1, linkoffset);
7649             }
7650           }
7651 
7652         /* If the maximum is unlimited, set a repeater in the final copy. For
7653         SCRIPT_RUN and ONCE brackets, that's all we need to do. However,
7654         possessively repeated ONCE brackets can be converted into non-capturing
7655         brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this
7656         saves having to deal with possessive ONCEs specially.
7657 
7658         Otherwise, when we are doing the actual compile phase, check to see
7659         whether this group is one that could match an empty string. If so,
7660         convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
7661         that runtime checking can be done. [This check is also applied to ONCE
7662         and SCRIPT_RUN groups at runtime, but in a different way.]
7663 
7664         Then, if the quantifier was possessive and the bracket is not a
7665         conditional, we convert the BRA code to the POS form, and the KET code
7666         to KETRPOS. (It turns out to be convenient at runtime to detect this
7667         kind of subpattern at both the start and at the end.) The use of
7668         special opcodes makes it possible to reduce greatly the stack usage in
7669         pcre2_match(). If the group is preceded by OP_BRAZERO, convert this to
7670         OP_BRAPOSZERO.
7671 
7672         Then, if the minimum number of matches is 1 or 0, cancel the possessive
7673         flag so that the default action below, of wrapping everything inside
7674         atomic brackets, does not happen. When the minimum is greater than 1,
7675         there will be earlier copies of the group, and so we still have to wrap
7676         the whole thing. */
7677 
7678         else
7679           {
7680           PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
7681           PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
7682 
7683           /* Convert possessive ONCE brackets to non-capturing */
7684 
7685           if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
7686 
7687           /* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need
7688           to do is to set the KET. */
7689 
7690           if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN)
7691             *ketcode = OP_KETRMAX + repeat_type;
7692 
7693           /* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs
7694           (which have been converted to non-capturing above). */
7695 
7696           else
7697             {
7698             /* In the compile phase, adjust the opcode if the group can match
7699             an empty string. For a conditional group with only one branch, the
7700             value of group_return will not show "could be empty", so we must
7701             check that separately. */
7702 
7703             if (lengthptr == NULL)
7704               {
7705               if (group_return < 0) *bracode += OP_SBRA - OP_BRA;
7706               if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
7707                 *bracode = OP_SCOND;
7708               }
7709 
7710             /* Handle possessive quantifiers. */
7711 
7712             if (possessive_quantifier)
7713               {
7714               /* For COND brackets, we wrap the whole thing in a possessively
7715               repeated non-capturing bracket, because we have not invented POS
7716               versions of the COND opcodes. */
7717 
7718               if (*bracode == OP_COND || *bracode == OP_SCOND)
7719                 {
7720                 int nlen = (int)(code - bracode);
7721                 (void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
7722                 code += 1 + LINK_SIZE;
7723                 nlen += 1 + LINK_SIZE;
7724                 *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
7725                 *code++ = OP_KETRPOS;
7726                 PUTINC(code, 0, nlen);
7727                 PUT(bracode, 1, nlen);
7728                 }
7729 
7730               /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
7731 
7732               else
7733                 {
7734                 *bracode += 1;              /* Switch to xxxPOS opcodes */
7735                 *ketcode = OP_KETRPOS;
7736                 }
7737 
7738               /* If the minimum is zero, mark it as possessive, then unset the
7739               possessive flag when the minimum is 0 or 1. */
7740 
7741               if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
7742               if (repeat_min < 2) possessive_quantifier = FALSE;
7743               }
7744 
7745             /* Non-possessive quantifier */
7746 
7747             else *ketcode = OP_KETRMAX + repeat_type;
7748             }
7749           }
7750         }
7751       break;
7752 
7753       /* If previous was a character type match (\d or similar), abolish it and
7754       create a suitable repeat item. The code is shared with single-character
7755       repeats by setting op_type to add a suitable offset into repeat_type.
7756       Note that the Unicode property types will be present only when
7757       SUPPORT_UNICODE is defined, but we don't wrap the little bits of code
7758       here because it just makes it horribly messy. */
7759 
7760       default:
7761       if (op_previous >= OP_EODN)   /* Not a character type - internal error */
7762         {
7763         *errorcodeptr = ERR10;
7764         return 0;
7765         }
7766       else
7767         {
7768         int prop_type, prop_value;
7769         PCRE2_UCHAR *oldcode;
7770 
7771         if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7772 
7773         op_type = OP_TYPESTAR - OP_STAR;      /* Use type opcodes */
7774         mclength = 0;                         /* Not a character */
7775 
7776         if (op_previous == OP_PROP || op_previous == OP_NOTPROP)
7777           {
7778           prop_type = previous[1];
7779           prop_value = previous[2];
7780           }
7781         else
7782           {
7783           /* Come here from just above with a character in mcbuffer/mclength. */
7784           OUTPUT_SINGLE_REPEAT:
7785           prop_type = prop_value = -1;
7786           }
7787 
7788         /* At this point, if prop_type == prop_value == -1 we either have a
7789         character in mcbuffer when mclength is greater than zero, or we have
7790         mclength zero, in which case there is a non-property character type in
7791         op_previous. If prop_type/value are not negative, we have a property
7792         character type in op_previous. */
7793 
7794         oldcode = code;                   /* Save where we were */
7795         code = previous;                  /* Usually overwrite previous item */
7796 
7797         /* If the maximum is zero then the minimum must also be zero; Perl allows
7798         this case, so we do too - by simply omitting the item altogether. */
7799 
7800         if (repeat_max == 0) goto END_REPEAT;
7801 
7802         /* Combine the op_type with the repeat_type */
7803 
7804         repeat_type += op_type;
7805 
7806         /* A minimum of zero is handled either as the special case * or ?, or as
7807         an UPTO, with the maximum given. */
7808 
7809         if (repeat_min == 0)
7810           {
7811           if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;
7812             else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
7813           else
7814             {
7815             *code++ = OP_UPTO + repeat_type;
7816             PUT2INC(code, 0, repeat_max);
7817             }
7818           }
7819 
7820         /* A repeat minimum of 1 is optimized into some special cases. If the
7821         maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
7822         left in place and, if the maximum is greater than 1, we use OP_UPTO with
7823         one less than the maximum. */
7824 
7825         else if (repeat_min == 1)
7826           {
7827           if (repeat_max == REPEAT_UNLIMITED)
7828             *code++ = OP_PLUS + repeat_type;
7829           else
7830             {
7831             code = oldcode;  /* Leave previous item in place */
7832             if (repeat_max == 1) goto END_REPEAT;
7833             *code++ = OP_UPTO + repeat_type;
7834             PUT2INC(code, 0, repeat_max - 1);
7835             }
7836           }
7837 
7838         /* The case {n,n} is just an EXACT, while the general case {n,m} is
7839         handled as an EXACT followed by an UPTO or STAR or QUERY. */
7840 
7841         else
7842           {
7843           *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
7844           PUT2INC(code, 0, repeat_min);
7845 
7846           /* Unless repeat_max equals repeat_min, fill in the data for EXACT,
7847           and then generate the second opcode. For a repeated Unicode property
7848           match, there are two extra values that define the required property,
7849           and mclength is set zero to indicate this. */
7850 
7851           if (repeat_max != repeat_min)
7852             {
7853             if (mclength > 0)
7854               {
7855               memcpy(code, mcbuffer, CU2BYTES(mclength));
7856               code += mclength;
7857               }
7858             else
7859               {
7860               *code++ = op_previous;
7861               if (prop_type >= 0)
7862                 {
7863                 *code++ = prop_type;
7864                 *code++ = prop_value;
7865                 }
7866               }
7867 
7868             /* Now set up the following opcode */
7869 
7870             if (repeat_max == REPEAT_UNLIMITED)
7871               *code++ = OP_STAR + repeat_type;
7872             else
7873               {
7874               repeat_max -= repeat_min;
7875               if (repeat_max == 1)
7876                 {
7877                 *code++ = OP_QUERY + repeat_type;
7878                 }
7879               else
7880                 {
7881                 *code++ = OP_UPTO + repeat_type;
7882                 PUT2INC(code, 0, repeat_max);
7883                 }
7884               }
7885             }
7886           }
7887 
7888         /* Fill in the character or character type for the final opcode. */
7889 
7890         if (mclength > 0)
7891           {
7892           memcpy(code, mcbuffer, CU2BYTES(mclength));
7893           code += mclength;
7894           }
7895         else
7896           {
7897           *code++ = op_previous;
7898           if (prop_type >= 0)
7899             {
7900             *code++ = prop_type;
7901             *code++ = prop_value;
7902             }
7903           }
7904         }
7905       break;
7906       }  /* End of switch on different op_previous values */
7907 
7908 
7909     /* If the character following a repeat is '+', possessive_quantifier is
7910     TRUE. For some opcodes, there are special alternative opcodes for this
7911     case. For anything else, we wrap the entire repeated item inside OP_ONCE
7912     brackets. Logically, the '+' notation is just syntactic sugar, taken from
7913     Sun's Java package, but the special opcodes can optimize it.
7914 
7915     Some (but not all) possessively repeated subpatterns have already been
7916     completely handled in the code just above. For them, possessive_quantifier
7917     is always FALSE at this stage. Note that the repeated item starts at
7918     tempcode, not at previous, which might be the first part of a string whose
7919     (former) last char we repeated. */
7920 
7921     if (possessive_quantifier)
7922       {
7923       int len;
7924 
7925       /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
7926       However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
7927       {5,}, or {5,10}). We skip over an EXACT item; if the length of what
7928       remains is greater than zero, there's a further opcode that can be
7929       handled. If not, do nothing, leaving the EXACT alone. */
7930 
7931       switch(*tempcode)
7932         {
7933         case OP_TYPEEXACT:
7934         tempcode += PRIV(OP_lengths)[*tempcode] +
7935           ((tempcode[1 + IMM2_SIZE] == OP_PROP
7936           || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
7937         break;
7938 
7939         /* CHAR opcodes are used for exacts whose count is 1. */
7940 
7941         case OP_CHAR:
7942         case OP_CHARI:
7943         case OP_NOT:
7944         case OP_NOTI:
7945         case OP_EXACT:
7946         case OP_EXACTI:
7947         case OP_NOTEXACT:
7948         case OP_NOTEXACTI:
7949         tempcode += PRIV(OP_lengths)[*tempcode];
7950 #ifdef SUPPORT_UNICODE
7951         if (utf && HAS_EXTRALEN(tempcode[-1]))
7952           tempcode += GET_EXTRALEN(tempcode[-1]);
7953 #endif
7954         break;
7955 
7956         /* For the class opcodes, the repeat operator appears at the end;
7957         adjust tempcode to point to it. */
7958 
7959         case OP_CLASS:
7960         case OP_NCLASS:
7961         tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
7962         break;
7963 
7964 #ifdef SUPPORT_WIDE_CHARS
7965         case OP_XCLASS:
7966         tempcode += GET(tempcode, 1);
7967         break;
7968 #endif
7969         }
7970 
7971       /* If tempcode is equal to code (which points to the end of the repeated
7972       item), it means we have skipped an EXACT item but there is no following
7973       QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
7974       all other cases, tempcode will be pointing to the repeat opcode, and will
7975       be less than code, so the value of len will be greater than 0. */
7976 
7977       len = (int)(code - tempcode);
7978       if (len > 0)
7979         {
7980         unsigned int repcode = *tempcode;
7981 
7982         /* There is a table for possessifying opcodes, all of which are less
7983         than OP_CALLOUT. A zero entry means there is no possessified version.
7984         */
7985 
7986         if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
7987           *tempcode = opcode_possessify[repcode];
7988 
7989         /* For opcodes without a special possessified version, wrap the item in
7990         ONCE brackets. */
7991 
7992         else
7993           {
7994           (void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
7995           code += 1 + LINK_SIZE;
7996           len += 1 + LINK_SIZE;
7997           tempcode[0] = OP_ONCE;
7998           *code++ = OP_KET;
7999           PUTINC(code, 0, len);
8000           PUT(tempcode, 1, len);
8001           }
8002         }
8003       }
8004 
8005     /* We set the "follows varying string" flag for subsequently encountered
8006     reqcus if it isn't already set and we have just passed a varying length
8007     item. */
8008 
8009     END_REPEAT:
8010     cb->req_varyopt |= reqvary;
8011     break;
8012 
8013 
8014     /* ===================================================================*/
8015     /* Handle a 32-bit data character with a value greater than META_END. */
8016 
8017     case META_BIGVALUE:
8018     pptr++;
8019     goto NORMAL_CHAR;
8020 
8021 
8022     /* ===============================================================*/
8023     /* Handle a back reference by number, which is the meta argument. The
8024     pattern offsets for back references to group numbers less than 10 are held
8025     in a special vector, to avoid using more than two parsed pattern elements
8026     in 64-bit environments. We only need the offset to the first occurrence,
8027     because if that doesn't fail, subsequent ones will also be OK. */
8028 
8029     case META_BACKREF:
8030     if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg];
8031       else GETPLUSOFFSET(offset, pptr);
8032 
8033     if (meta_arg > cb->bracount)
8034       {
8035       cb->erroroffset = offset;
8036       *errorcodeptr = ERR15;  /* Non-existent subpattern */
8037       return 0;
8038       }
8039 
8040     /* Come here from named backref handling when the reference is to a
8041     single group (that is, not to a duplicated name). The back reference
8042     data will have already been updated. We must disable firstcu if not
8043     set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'
8044     later. */
8045 
8046     HANDLE_SINGLE_REFERENCE:
8047     if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
8048     *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
8049     PUT2INC(code, 0, meta_arg);
8050 
8051     /* Update the map of back references, and keep the highest one. We
8052     could do this in parse_regex() for numerical back references, but not
8053     for named back references, because we don't know the numbers to which
8054     named back references refer. So we do it all in this function. */
8055 
8056     cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;
8057     if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
8058     break;
8059 
8060 
8061     /* ===============================================================*/
8062     /* Handle recursion by inserting the number of the called group (which is
8063     the meta argument) after OP_RECURSE. At the end of compiling the pattern is
8064     scanned and these numbers are replaced by offsets within the pattern. It is
8065     done like this to avoid problems with forward references and adjusting
8066     offsets when groups are duplicated and moved (as discovered in previous
8067     implementations). Note that a recursion does not have a set first
8068     character. */
8069 
8070     case META_RECURSE:
8071     GETPLUSOFFSET(offset, pptr);
8072     if (meta_arg > cb->bracount)
8073       {
8074       cb->erroroffset = offset;
8075       *errorcodeptr = ERR15;  /* Non-existent subpattern */
8076       return 0;
8077       }
8078     HANDLE_NUMERICAL_RECURSION:
8079     *code = OP_RECURSE;
8080     PUT(code, 1, meta_arg);
8081     code += 1 + LINK_SIZE;
8082     groupsetfirstcu = FALSE;
8083     cb->had_recurse = TRUE;
8084     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
8085     zerofirstcu = firstcu;
8086     zerofirstcuflags = firstcuflags;
8087     break;
8088 
8089 
8090     /* ===============================================================*/
8091     /* Handle capturing parentheses; the number is the meta argument. */
8092 
8093     case META_CAPTURE:
8094     bravalue = OP_CBRA;
8095     skipunits = IMM2_SIZE;
8096     PUT2(code, 1+LINK_SIZE, meta_arg);
8097     cb->lastcapture = meta_arg;
8098     goto GROUP_PROCESS_NOTE_EMPTY;
8099 
8100 
8101     /* ===============================================================*/
8102     /* Handle escape sequence items. For ones like \d, the ESC_values are
8103     arranged to be the same as the corresponding OP_values in the default case
8104     when PCRE2_UCP is not set (which is the only case in which they will appear
8105     here).
8106 
8107     Note: \Q and \E are never seen here, as they were dealt with in
8108     parse_pattern(). Neither are numerical back references or recursions, which
8109     were turned into META_BACKREF or META_RECURSE items, respectively. \k and
8110     \g, when followed by names, are turned into META_BACKREF_BYNAME or
8111     META_RECURSE_BYNAME. */
8112 
8113     case META_ESCAPE:
8114 
8115     /* We can test for escape sequences that consume a character because their
8116     values lie between ESC_b and ESC_Z; this may have to change if any new ones
8117     are ever created. For these sequences, we disable the setting of a first
8118     character if it hasn't already been set. */
8119 
8120     if (meta_arg > ESC_b && meta_arg < ESC_Z)
8121       {
8122       matched_char = TRUE;
8123       if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
8124       }
8125 
8126     /* Set values to reset to if this is followed by a zero repeat. */
8127 
8128     zerofirstcu = firstcu;
8129     zerofirstcuflags = firstcuflags;
8130     zeroreqcu = reqcu;
8131     zeroreqcuflags = reqcuflags;
8132 
8133     /* If Unicode is not supported, \P and \p are not allowed and are
8134     faulted at parse time, so will never appear here. */
8135 
8136 #ifdef SUPPORT_UNICODE
8137     if (meta_arg == ESC_P || meta_arg == ESC_p)
8138       {
8139       uint32_t ptype = *(++pptr) >> 16;
8140       uint32_t pdata = *pptr & 0xffff;
8141 
8142       /* The special case of \p{Any} is compiled to OP_ALLANY so as to benefit
8143       from the auto-anchoring code. */
8144 
8145       if (meta_arg == ESC_p && ptype == PT_ANY)
8146         {
8147         *code++ = OP_ALLANY;
8148         }
8149       else
8150         {
8151         *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
8152         *code++ = ptype;
8153         *code++ = pdata;
8154         }
8155       break;  /* End META_ESCAPE */
8156       }
8157 #endif
8158 
8159     /* \K is forbidden in lookarounds since 10.38 because that's what Perl has
8160     done. However, there's an option, in case anyone was relying on it. */
8161 
8162     if (cb->assert_depth > 0 && meta_arg == ESC_K &&
8163         (xoptions & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) == 0)
8164       {
8165       *errorcodeptr = ERR99;
8166       return 0;
8167       }
8168 
8169     /* For the rest (including \X when Unicode is supported - if not it's
8170     faulted at parse time), the OP value is the escape value when PCRE2_UCP is
8171     not set; if it is set, most of them do not show up here because they are
8172     converted into Unicode property tests in parse_regex().
8173 
8174     In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY
8175     instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds.
8176     There are special UCP codes for \B and \b which are used in UCP mode unless
8177     "word" matching is being forced to ASCII.
8178 
8179     Note that \b and \B do a one-character lookbehind, and \A also behaves as
8180     if it does. */
8181 
8182     switch(meta_arg)
8183       {
8184       case ESC_C:
8185       cb->external_flags |= PCRE2_HASBKC;  /* Record */
8186 #if PCRE2_CODE_UNIT_WIDTH == 32
8187       meta_arg = OP_ALLANY;
8188 #else
8189       if (!utf) meta_arg = OP_ALLANY;
8190 #endif
8191       break;
8192 
8193       case ESC_B:
8194       case ESC_b:
8195       if ((options & PCRE2_UCP) != 0 && (xoptions & PCRE2_EXTRA_ASCII_BSW) == 0)
8196         meta_arg = (meta_arg == ESC_B)? OP_NOT_UCP_WORD_BOUNDARY :
8197           OP_UCP_WORD_BOUNDARY;
8198       /* Fall through */
8199 
8200       case ESC_A:
8201       if (cb->max_lookbehind == 0) cb->max_lookbehind = 1;
8202       break;
8203       }
8204 
8205     *code++ = meta_arg;
8206     break;  /* End META_ESCAPE */
8207 
8208 
8209     /* ===================================================================*/
8210     /* Handle an unrecognized meta value. A parsed pattern value less than
8211     META_END is a literal. Otherwise we have a problem. */
8212 
8213     default:
8214     if (meta >= META_END)
8215       {
8216 #ifdef DEBUG_SHOW_PARSED
8217       fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x\n", *pptr);
8218 #endif
8219       *errorcodeptr = ERR89;  /* Internal error - unrecognized. */
8220       return 0;
8221       }
8222 
8223     /* Handle a literal character. We come here by goto in the case of a
8224     32-bit, non-UTF character whose value is greater than META_END. */
8225 
8226     NORMAL_CHAR:
8227     meta = *pptr;     /* Get the full 32 bits */
8228     NORMAL_CHAR_SET:  /* Character is already in meta */
8229     matched_char = TRUE;
8230 
8231     /* For caseless UTF or UCP mode, check whether this character has more than
8232     one other case. If so, generate a special OP_PROP item instead of OP_CHARI.
8233     When casing restrictions apply, ignore caseless sets that start with an
8234     ASCII character. */
8235 
8236 #ifdef SUPPORT_UNICODE
8237     if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
8238       {
8239       uint32_t caseset = UCD_CASESET(meta);
8240       if (caseset != 0 &&
8241            ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) == 0 ||
8242            PRIV(ucd_caseless_sets)[caseset] > 127))
8243         {
8244         *code++ = OP_PROP;
8245         *code++ = PT_CLIST;
8246         *code++ = caseset;
8247         if (firstcuflags == REQ_UNSET)
8248           firstcuflags = zerofirstcuflags = REQ_NONE;
8249         break;  /* End handling this meta item */
8250         }
8251       }
8252 #endif
8253 
8254     /* Caseful matches, or caseless and not one of the multicase characters. We
8255     come here by goto in the case of a positive class that contains only
8256     case-partners of a character with just two cases; matched_char has already
8257     been set TRUE and options fudged if necessary. */
8258 
8259     CLASS_CASELESS_CHAR:
8260 
8261     /* Get the character's code units into mcbuffer, with the length in
8262     mclength. When not in UTF mode, the length is always 1. */
8263 
8264 #ifdef SUPPORT_UNICODE
8265     if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
8266 #endif
8267       {
8268       mclength = 1;
8269       mcbuffer[0] = meta;
8270       }
8271 
8272     /* Generate the appropriate code */
8273 
8274     *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
8275     memcpy(code, mcbuffer, CU2BYTES(mclength));
8276     code += mclength;
8277 
8278     /* Remember if \r or \n were seen */
8279 
8280     if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
8281       cb->external_flags |= PCRE2_HASCRORLF;
8282 
8283     /* Set the first and required code units appropriately. If no previous
8284     first code unit, set it from this character, but revert to none on a zero
8285     repeat. Otherwise, leave the firstcu value alone, and don't change it on
8286     a zero repeat. */
8287 
8288     if (firstcuflags == REQ_UNSET)
8289       {
8290       zerofirstcuflags = REQ_NONE;
8291       zeroreqcu = reqcu;
8292       zeroreqcuflags = reqcuflags;
8293 
8294       /* If the character is more than one code unit long, we can set a single
8295       firstcu only if it is not to be matched caselessly. Multiple possible
8296       starting code units may be picked up later in the studying code. */
8297 
8298       if (mclength == 1 || req_caseopt == 0)
8299         {
8300         firstcu = mcbuffer[0];
8301         firstcuflags = req_caseopt;
8302         if (mclength != 1)
8303           {
8304           reqcu = code[-1];
8305           reqcuflags = cb->req_varyopt;
8306           }
8307         }
8308       else firstcuflags = reqcuflags = REQ_NONE;
8309       }
8310 
8311     /* firstcu was previously set; we can set reqcu only if the length is
8312     1 or the matching is caseful. */
8313 
8314     else
8315       {
8316       zerofirstcu = firstcu;
8317       zerofirstcuflags = firstcuflags;
8318       zeroreqcu = reqcu;
8319       zeroreqcuflags = reqcuflags;
8320       if (mclength == 1 || req_caseopt == 0)
8321         {
8322         reqcu = code[-1];
8323         reqcuflags = req_caseopt | cb->req_varyopt;
8324         }
8325       }
8326 
8327     /* If caselessness was temporarily instated, reset it. */
8328 
8329     if (reset_caseful)
8330       {
8331       options &= ~PCRE2_CASELESS;
8332       req_caseopt = 0;
8333       reset_caseful = FALSE;
8334       }
8335 
8336     break;    /* End literal character handling */
8337     }         /* End of big switch */
8338   }           /* End of big loop */
8339 
8340 /* Control never reaches here. */
8341 }
8342 
8343 
8344 
8345 /*************************************************
8346 *   Compile regex: a sequence of alternatives    *
8347 *************************************************/
8348 
8349 /* On entry, pptr is pointing past the bracket meta, but on return it points to
8350 the closing bracket or META_END. The code variable is pointing at the code unit
8351 into which the BRA operator has been stored. This function is used during the
8352 pre-compile phase when we are trying to find out the amount of memory needed,
8353 as well as during the real compile phase. The value of lengthptr distinguishes
8354 the two phases.
8355 
8356 Arguments:
8357   options           option bits, including any changes for this subpattern
8358   xoptions          extra option bits, ditto
8359   codeptr           -> the address of the current code pointer
8360   pptrptr           -> the address of the current parsed pattern pointer
8361   errorcodeptr      -> pointer to error code variable
8362   skipunits         skip this many code units at start (for brackets and OP_COND)
8363   firstcuptr        place to put the first required code unit
8364   firstcuflagsptr   place to put the first code unit flags
8365   reqcuptr          place to put the last required code unit
8366   reqcuflagsptr     place to put the last required code unit flags
8367   bcptr             pointer to the chain of currently open branches
8368   cb                points to the data block with tables pointers etc.
8369   lengthptr         NULL during the real compile phase
8370                     points to length accumulator during pre-compile phase
8371 
8372 Returns:            0 There has been an error
8373                    +1 Success, this group must match at least one character
8374                    -1 Success, this group may match an empty string
8375 */
8376 
8377 static int
compile_regex(uint32_t options,uint32_t xoptions,PCRE2_UCHAR ** codeptr,uint32_t ** pptrptr,int * errorcodeptr,uint32_t skipunits,uint32_t * firstcuptr,uint32_t * firstcuflagsptr,uint32_t * reqcuptr,uint32_t * reqcuflagsptr,branch_chain * bcptr,open_capitem * open_caps,compile_block * cb,PCRE2_SIZE * lengthptr)8378 compile_regex(uint32_t options, uint32_t xoptions, PCRE2_UCHAR **codeptr,
8379   uint32_t **pptrptr, int *errorcodeptr, uint32_t skipunits,
8380   uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
8381   uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
8382   compile_block *cb, PCRE2_SIZE *lengthptr)
8383 {
8384 PCRE2_UCHAR *code = *codeptr;
8385 PCRE2_UCHAR *last_branch = code;
8386 PCRE2_UCHAR *start_bracket = code;
8387 BOOL lookbehind;
8388 open_capitem capitem;
8389 int capnumber = 0;
8390 int okreturn = 1;
8391 uint32_t *pptr = *pptrptr;
8392 uint32_t firstcu, reqcu;
8393 uint32_t lookbehindlength;
8394 uint32_t lookbehindminlength;
8395 uint32_t firstcuflags, reqcuflags;
8396 uint32_t branchfirstcu, branchreqcu;
8397 uint32_t branchfirstcuflags, branchreqcuflags;
8398 PCRE2_SIZE length;
8399 branch_chain bc;
8400 
8401 /* If set, call the external function that checks for stack availability. */
8402 
8403 if (cb->cx->stack_guard != NULL &&
8404     cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
8405   {
8406   *errorcodeptr= ERR33;
8407   return 0;
8408   }
8409 
8410 /* Miscellaneous initialization */
8411 
8412 bc.outer = bcptr;
8413 bc.current_branch = code;
8414 
8415 firstcu = reqcu = 0;
8416 firstcuflags = reqcuflags = REQ_UNSET;
8417 
8418 /* Accumulate the length for use in the pre-compile phase. Start with the
8419 length of the BRA and KET and any extra code units that are required at the
8420 beginning. We accumulate in a local variable to save frequent testing of
8421 lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
8422 start and end of each alternative, because compiled items are discarded during
8423 the pre-compile phase so that the workspace is not exceeded. */
8424 
8425 length = 2 + 2*LINK_SIZE + skipunits;
8426 
8427 /* Remember if this is a lookbehind assertion, and if it is, save its length
8428 and skip over the pattern offset. */
8429 
8430 lookbehind = *code == OP_ASSERTBACK ||
8431              *code == OP_ASSERTBACK_NOT ||
8432              *code == OP_ASSERTBACK_NA;
8433 
8434 if (lookbehind)
8435   {
8436   lookbehindlength = META_DATA(pptr[-1]);
8437   lookbehindminlength = *pptr;
8438   pptr += SIZEOFFSET;
8439   }
8440 else lookbehindlength = lookbehindminlength = 0;
8441 
8442 /* If this is a capturing subpattern, add to the chain of open capturing items
8443 so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
8444 need be tested here; changing this opcode to one of its variants, e.g.
8445 OP_SCBRAPOS, happens later, after the group has been compiled. */
8446 
8447 if (*code == OP_CBRA)
8448   {
8449   capnumber = GET2(code, 1 + LINK_SIZE);
8450   capitem.number = capnumber;
8451   capitem.next = open_caps;
8452   capitem.assert_depth = cb->assert_depth;
8453   open_caps = &capitem;
8454   }
8455 
8456 /* Offset is set zero to mark that this bracket is still open */
8457 
8458 PUT(code, 1, 0);
8459 code += 1 + LINK_SIZE + skipunits;
8460 
8461 /* Loop for each alternative branch */
8462 
8463 for (;;)
8464   {
8465   int branch_return;
8466 
8467   /* Insert OP_REVERSE or OP_VREVERSE if this is a lookbehind assertion. There
8468   is only a single mimimum length for the whole assertion. When the mimimum
8469   length is LOOKBEHIND_MAX it means that all branches are of fixed length,
8470   though not necessarily the same length. In this case, the original OP_REVERSE
8471   can be used. It can also be used if a branch in a variable length lookbehind
8472   has the same maximum and minimum. Otherwise, use OP_VREVERSE, which has both
8473   maximum and minimum values. */
8474 
8475   if (lookbehind && lookbehindlength > 0)
8476     {
8477     if (lookbehindminlength == LOOKBEHIND_MAX ||
8478         lookbehindminlength == lookbehindlength)
8479       {
8480       *code++ = OP_REVERSE;
8481       PUT2INC(code, 0, lookbehindlength);
8482       length += 1 + IMM2_SIZE;
8483       }
8484     else
8485       {
8486       *code++ = OP_VREVERSE;
8487       PUT2INC(code, 0, lookbehindminlength);
8488       PUT2INC(code, 0, lookbehindlength);
8489       length += 1 + 2*IMM2_SIZE;
8490       }
8491     }
8492 
8493   /* Now compile the branch; in the pre-compile phase its length gets added
8494   into the length. */
8495 
8496   if ((branch_return =
8497         compile_branch(&options, &xoptions, &code, &pptr, errorcodeptr,
8498           &branchfirstcu, &branchfirstcuflags, &branchreqcu, &branchreqcuflags,
8499           &bc, open_caps, cb, (lengthptr == NULL)? NULL : &length)) == 0)
8500     return 0;
8501 
8502   /* If a branch can match an empty string, so can the whole group. */
8503 
8504   if (branch_return < 0) okreturn = -1;
8505 
8506   /* In the real compile phase, there is some post-processing to be done. */
8507 
8508   if (lengthptr == NULL)
8509     {
8510     /* If this is the first branch, the firstcu and reqcu values for the
8511     branch become the values for the regex. */
8512 
8513     if (*last_branch != OP_ALT)
8514       {
8515       firstcu = branchfirstcu;
8516       firstcuflags = branchfirstcuflags;
8517       reqcu = branchreqcu;
8518       reqcuflags = branchreqcuflags;
8519       }
8520 
8521     /* If this is not the first branch, the first char and reqcu have to
8522     match the values from all the previous branches, except that if the
8523     previous value for reqcu didn't have REQ_VARY set, it can still match,
8524     and we set REQ_VARY for the group from this branch's value. */
8525 
8526     else
8527       {
8528       /* If we previously had a firstcu, but it doesn't match the new branch,
8529       we have to abandon the firstcu for the regex, but if there was
8530       previously no reqcu, it takes on the value of the old firstcu. */
8531 
8532       if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
8533         {
8534         if (firstcuflags < REQ_NONE)
8535           {
8536           if (reqcuflags >= REQ_NONE)
8537             {
8538             reqcu = firstcu;
8539             reqcuflags = firstcuflags;
8540             }
8541           }
8542         firstcuflags = REQ_NONE;
8543         }
8544 
8545       /* If we (now or from before) have no firstcu, a firstcu from the
8546       branch becomes a reqcu if there isn't a branch reqcu. */
8547 
8548       if (firstcuflags >= REQ_NONE && branchfirstcuflags < REQ_NONE &&
8549           branchreqcuflags >= REQ_NONE)
8550         {
8551         branchreqcu = branchfirstcu;
8552         branchreqcuflags = branchfirstcuflags;
8553         }
8554 
8555       /* Now ensure that the reqcus match */
8556 
8557       if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
8558           reqcu != branchreqcu)
8559         reqcuflags = REQ_NONE;
8560       else
8561         {
8562         reqcu = branchreqcu;
8563         reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY if present */
8564         }
8565       }
8566     }
8567 
8568   /* Handle reaching the end of the expression, either ')' or end of pattern.
8569   In the real compile phase, go back through the alternative branches and
8570   reverse the chain of offsets, with the field in the BRA item now becoming an
8571   offset to the first alternative. If there are no alternatives, it points to
8572   the end of the group. The length in the terminating ket is always the length
8573   of the whole bracketed item. Return leaving the pointer at the terminating
8574   char. */
8575 
8576   if (META_CODE(*pptr) != META_ALT)
8577     {
8578     if (lengthptr == NULL)
8579       {
8580       PCRE2_SIZE branch_length = code - last_branch;
8581       do
8582         {
8583         PCRE2_SIZE prev_length = GET(last_branch, 1);
8584         PUT(last_branch, 1, branch_length);
8585         branch_length = prev_length;
8586         last_branch -= branch_length;
8587         }
8588       while (branch_length > 0);
8589       }
8590 
8591     /* Fill in the ket */
8592 
8593     *code = OP_KET;
8594     PUT(code, 1, (int)(code - start_bracket));
8595     code += 1 + LINK_SIZE;
8596 
8597     /* Set values to pass back */
8598 
8599     *codeptr = code;
8600     *pptrptr = pptr;
8601     *firstcuptr = firstcu;
8602     *firstcuflagsptr = firstcuflags;
8603     *reqcuptr = reqcu;
8604     *reqcuflagsptr = reqcuflags;
8605     if (lengthptr != NULL)
8606       {
8607       if (OFLOW_MAX - *lengthptr < length)
8608         {
8609         *errorcodeptr = ERR20;
8610         return 0;
8611         }
8612       *lengthptr += length;
8613       }
8614     return okreturn;
8615     }
8616 
8617   /* Another branch follows. In the pre-compile phase, we can move the code
8618   pointer back to where it was for the start of the first branch. (That is,
8619   pretend that each branch is the only one.)
8620 
8621   In the real compile phase, insert an ALT node. Its length field points back
8622   to the previous branch while the bracket remains open. At the end the chain
8623   is reversed. It's done like this so that the start of the bracket has a
8624   zero offset until it is closed, making it possible to detect recursion. */
8625 
8626   if (lengthptr != NULL)
8627     {
8628     code = *codeptr + 1 + LINK_SIZE + skipunits;
8629     length += 1 + LINK_SIZE;
8630     }
8631   else
8632     {
8633     *code = OP_ALT;
8634     PUT(code, 1, (int)(code - last_branch));
8635     bc.current_branch = last_branch = code;
8636     code += 1 + LINK_SIZE;
8637     }
8638 
8639   /* Set the maximum lookbehind length for the next branch (if not in a
8640   lookbehind the value will be zero) and then advance past the vertical bar. */
8641 
8642   lookbehindlength = META_DATA(*pptr);
8643   pptr++;
8644   }
8645 /* Control never reaches here */
8646 }
8647 
8648 
8649 
8650 /*************************************************
8651 *          Check for anchored pattern            *
8652 *************************************************/
8653 
8654 /* Try to find out if this is an anchored regular expression. Consider each
8655 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8656 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8657 it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8658 be found, because ^ generates OP_CIRCM in that mode.
8659 
8660 We can also consider a regex to be anchored if OP_SOM starts all its branches.
8661 This is the code for \G, which means "match at start of match position, taking
8662 into account the match offset".
8663 
8664 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8665 because that will try the rest of the pattern at all possible matching points,
8666 so there is no point trying again.... er ....
8667 
8668 .... except when the .* appears inside capturing parentheses, and there is a
8669 subsequent back reference to those parentheses. We haven't enough information
8670 to catch that case precisely.
8671 
8672 At first, the best we could do was to detect when .* was in capturing brackets
8673 and the highest back reference was greater than or equal to that level.
8674 However, by keeping a bitmap of the first 31 back references, we can catch some
8675 of the more common cases more precisely.
8676 
8677 ... A second exception is when the .* appears inside an atomic group, because
8678 this prevents the number of characters it matches from being adjusted.
8679 
8680 Arguments:
8681   code           points to start of the compiled pattern
8682   bracket_map    a bitmap of which brackets we are inside while testing; this
8683                    handles up to substring 31; after that we just have to take
8684                    the less precise approach
8685   cb             points to the compile data block
8686   atomcount      atomic group level
8687   inassert       TRUE if in an assertion
8688 
8689 Returns:     TRUE or FALSE
8690 */
8691 
8692 static BOOL
is_anchored(PCRE2_SPTR code,uint32_t bracket_map,compile_block * cb,int atomcount,BOOL inassert)8693 is_anchored(PCRE2_SPTR code, uint32_t bracket_map, compile_block *cb,
8694   int atomcount, BOOL inassert)
8695 {
8696 do {
8697    PCRE2_SPTR scode = first_significant_code(
8698      code + PRIV(OP_lengths)[*code], FALSE);
8699    int op = *scode;
8700 
8701    /* Non-capturing brackets */
8702 
8703    if (op == OP_BRA  || op == OP_BRAPOS ||
8704        op == OP_SBRA || op == OP_SBRAPOS)
8705      {
8706      if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8707        return FALSE;
8708      }
8709 
8710    /* Capturing brackets */
8711 
8712    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8713             op == OP_SCBRA || op == OP_SCBRAPOS)
8714      {
8715      int n = GET2(scode, 1+LINK_SIZE);
8716      uint32_t new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8717      if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE;
8718      }
8719 
8720    /* Positive forward assertion */
8721 
8722    else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8723      {
8724      if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8725      }
8726 
8727    /* Condition. If there is no second branch, it can't be anchored. */
8728 
8729    else if (op == OP_COND || op == OP_SCOND)
8730      {
8731      if (scode[GET(scode,1)] != OP_ALT) return FALSE;
8732      if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8733        return FALSE;
8734      }
8735 
8736    /* Atomic groups */
8737 
8738    else if (op == OP_ONCE)
8739      {
8740      if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert))
8741        return FALSE;
8742      }
8743 
8744    /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8745    it isn't in brackets that are or may be referenced or inside an atomic
8746    group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
8747    because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
8748    with the subject "aab", which matches "b", i.e. not at the start of a line.
8749    There is also an option that disables auto-anchoring. */
8750 
8751    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8752              op == OP_TYPEPOSSTAR))
8753      {
8754      if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
8755          atomcount > 0 || cb->had_pruneorskip || inassert ||
8756          (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8757        return FALSE;
8758      }
8759 
8760    /* Check for explicit anchoring */
8761 
8762    else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8763 
8764    code += GET(code, 1);
8765    }
8766 while (*code == OP_ALT);   /* Loop for each alternative */
8767 return TRUE;
8768 }
8769 
8770 
8771 
8772 /*************************************************
8773 *         Check for starting with ^ or .*        *
8774 *************************************************/
8775 
8776 /* This is called to find out if every branch starts with ^ or .* so that
8777 "first char" processing can be done to speed things up in multiline
8778 matching and for non-DOTALL patterns that start with .* (which must start at
8779 the beginning or after \n). As in the case of is_anchored() (see above), we
8780 have to take account of back references to capturing brackets that contain .*
8781 because in that case we can't make the assumption. Also, the appearance of .*
8782 inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
8783 or *SKIP does not count, because once again the assumption no longer holds.
8784 
8785 Arguments:
8786   code           points to start of the compiled pattern or a group
8787   bracket_map    a bitmap of which brackets we are inside while testing; this
8788                    handles up to substring 31; after that we just have to take
8789                    the less precise approach
8790   cb             points to the compile data
8791   atomcount      atomic group level
8792   inassert       TRUE if in an assertion
8793 
8794 Returns:         TRUE or FALSE
8795 */
8796 
8797 static BOOL
is_startline(PCRE2_SPTR code,unsigned int bracket_map,compile_block * cb,int atomcount,BOOL inassert)8798 is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8799   int atomcount, BOOL inassert)
8800 {
8801 do {
8802    PCRE2_SPTR scode = first_significant_code(
8803      code + PRIV(OP_lengths)[*code], FALSE);
8804    int op = *scode;
8805 
8806    /* If we are at the start of a conditional assertion group, *both* the
8807    conditional assertion *and* what follows the condition must satisfy the test
8808    for start of line. Other kinds of condition fail. Note that there may be an
8809    auto-callout at the start of a condition. */
8810 
8811    if (op == OP_COND)
8812      {
8813      scode += 1 + LINK_SIZE;
8814 
8815      if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8816        else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
8817 
8818      switch (*scode)
8819        {
8820        case OP_CREF:
8821        case OP_DNCREF:
8822        case OP_RREF:
8823        case OP_DNRREF:
8824        case OP_FAIL:
8825        case OP_FALSE:
8826        case OP_TRUE:
8827        return FALSE;
8828 
8829        default:     /* Assertion */
8830        if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8831        do scode += GET(scode, 1); while (*scode == OP_ALT);
8832        scode += 1 + LINK_SIZE;
8833        break;
8834        }
8835      scode = first_significant_code(scode, FALSE);
8836      op = *scode;
8837      }
8838 
8839    /* Non-capturing brackets */
8840 
8841    if (op == OP_BRA  || op == OP_BRAPOS ||
8842        op == OP_SBRA || op == OP_SBRAPOS)
8843      {
8844      if (!is_startline(scode, bracket_map, cb, atomcount, inassert))
8845        return FALSE;
8846      }
8847 
8848    /* Capturing brackets */
8849 
8850    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8851             op == OP_SCBRA || op == OP_SCBRAPOS)
8852      {
8853      int n = GET2(scode, 1+LINK_SIZE);
8854      unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8855      if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
8856      }
8857 
8858    /* Positive forward assertions */
8859 
8860    else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8861      {
8862      if (!is_startline(scode, bracket_map, cb, atomcount, TRUE))
8863        return FALSE;
8864      }
8865 
8866    /* Atomic brackets */
8867 
8868    else if (op == OP_ONCE)
8869      {
8870      if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert))
8871        return FALSE;
8872      }
8873 
8874    /* .* means "start at start or after \n" if it isn't in atomic brackets or
8875    brackets that may be referenced or an assertion, and as long as the pattern
8876    does not contain *PRUNE or *SKIP, because these break the feature. Consider,
8877    for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
8878    i.e. not at the start of a line. There is also an option that disables this
8879    optimization. */
8880 
8881    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8882      {
8883      if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
8884          atomcount > 0 || cb->had_pruneorskip || inassert ||
8885          (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8886        return FALSE;
8887      }
8888 
8889    /* Check for explicit circumflex; anything else gives a FALSE result. Note
8890    in particular that this includes atomic brackets OP_ONCE because the number
8891    of characters matched by .* cannot be adjusted inside them. */
8892 
8893    else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8894 
8895    /* Move on to the next alternative */
8896 
8897    code += GET(code, 1);
8898    }
8899 while (*code == OP_ALT);  /* Loop for each alternative */
8900 return TRUE;
8901 }
8902 
8903 
8904 
8905 /*************************************************
8906 *   Scan compiled regex for recursion reference  *
8907 *************************************************/
8908 
8909 /* This function scans through a compiled pattern until it finds an instance of
8910 OP_RECURSE.
8911 
8912 Arguments:
8913   code        points to start of expression
8914   utf         TRUE in UTF mode
8915 
8916 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
8917 */
8918 
8919 static PCRE2_SPTR
find_recurse(PCRE2_SPTR code,BOOL utf)8920 find_recurse(PCRE2_SPTR code, BOOL utf)
8921 {
8922 for (;;)
8923   {
8924   PCRE2_UCHAR c = *code;
8925   if (c == OP_END) return NULL;
8926   if (c == OP_RECURSE) return code;
8927 
8928   /* XCLASS is used for classes that cannot be represented just by a bit map.
8929   This includes negated single high-valued characters. CALLOUT_STR is used for
8930   callouts with string arguments. In both cases the length in the table is
8931   zero; the actual length is stored in the compiled code. */
8932 
8933   if (c == OP_XCLASS) code += GET(code, 1);
8934     else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
8935 
8936   /* Otherwise, we can get the item's length from the table, except that for
8937   repeated character types, we have to test for \p and \P, which have an extra
8938   two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,
8939   we must add in its length. */
8940 
8941   else
8942     {
8943     switch(c)
8944       {
8945       case OP_TYPESTAR:
8946       case OP_TYPEMINSTAR:
8947       case OP_TYPEPLUS:
8948       case OP_TYPEMINPLUS:
8949       case OP_TYPEQUERY:
8950       case OP_TYPEMINQUERY:
8951       case OP_TYPEPOSSTAR:
8952       case OP_TYPEPOSPLUS:
8953       case OP_TYPEPOSQUERY:
8954       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
8955       break;
8956 
8957       case OP_TYPEPOSUPTO:
8958       case OP_TYPEUPTO:
8959       case OP_TYPEMINUPTO:
8960       case OP_TYPEEXACT:
8961       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
8962         code += 2;
8963       break;
8964 
8965       case OP_MARK:
8966       case OP_COMMIT_ARG:
8967       case OP_PRUNE_ARG:
8968       case OP_SKIP_ARG:
8969       case OP_THEN_ARG:
8970       code += code[1];
8971       break;
8972       }
8973 
8974     /* Add in the fixed length from the table */
8975 
8976     code += PRIV(OP_lengths)[c];
8977 
8978     /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
8979     be followed by a multi-unit character. The length in the table is a
8980     minimum, so we have to arrange to skip the extra units. */
8981 
8982 #ifdef MAYBE_UTF_MULTI
8983     if (utf) switch(c)
8984       {
8985       case OP_CHAR:
8986       case OP_CHARI:
8987       case OP_NOT:
8988       case OP_NOTI:
8989       case OP_EXACT:
8990       case OP_EXACTI:
8991       case OP_NOTEXACT:
8992       case OP_NOTEXACTI:
8993       case OP_UPTO:
8994       case OP_UPTOI:
8995       case OP_NOTUPTO:
8996       case OP_NOTUPTOI:
8997       case OP_MINUPTO:
8998       case OP_MINUPTOI:
8999       case OP_NOTMINUPTO:
9000       case OP_NOTMINUPTOI:
9001       case OP_POSUPTO:
9002       case OP_POSUPTOI:
9003       case OP_NOTPOSUPTO:
9004       case OP_NOTPOSUPTOI:
9005       case OP_STAR:
9006       case OP_STARI:
9007       case OP_NOTSTAR:
9008       case OP_NOTSTARI:
9009       case OP_MINSTAR:
9010       case OP_MINSTARI:
9011       case OP_NOTMINSTAR:
9012       case OP_NOTMINSTARI:
9013       case OP_POSSTAR:
9014       case OP_POSSTARI:
9015       case OP_NOTPOSSTAR:
9016       case OP_NOTPOSSTARI:
9017       case OP_PLUS:
9018       case OP_PLUSI:
9019       case OP_NOTPLUS:
9020       case OP_NOTPLUSI:
9021       case OP_MINPLUS:
9022       case OP_MINPLUSI:
9023       case OP_NOTMINPLUS:
9024       case OP_NOTMINPLUSI:
9025       case OP_POSPLUS:
9026       case OP_POSPLUSI:
9027       case OP_NOTPOSPLUS:
9028       case OP_NOTPOSPLUSI:
9029       case OP_QUERY:
9030       case OP_QUERYI:
9031       case OP_NOTQUERY:
9032       case OP_NOTQUERYI:
9033       case OP_MINQUERY:
9034       case OP_MINQUERYI:
9035       case OP_NOTMINQUERY:
9036       case OP_NOTMINQUERYI:
9037       case OP_POSQUERY:
9038       case OP_POSQUERYI:
9039       case OP_NOTPOSQUERY:
9040       case OP_NOTPOSQUERYI:
9041       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
9042       break;
9043       }
9044 #else
9045     (void)(utf);  /* Keep compiler happy by referencing function argument */
9046 #endif  /* MAYBE_UTF_MULTI */
9047     }
9048   }
9049 }
9050 
9051 
9052 
9053 /*************************************************
9054 *    Check for asserted fixed first code unit    *
9055 *************************************************/
9056 
9057 /* During compilation, the "first code unit" settings from forward assertions
9058 are discarded, because they can cause conflicts with actual literals that
9059 follow. However, if we end up without a first code unit setting for an
9060 unanchored pattern, it is worth scanning the regex to see if there is an
9061 initial asserted first code unit. If all branches start with the same asserted
9062 code unit, or with a non-conditional bracket all of whose alternatives start
9063 with the same asserted code unit (recurse ad lib), then we return that code
9064 unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
9065 REQ_NONE in the flags.
9066 
9067 Arguments:
9068   code       points to start of compiled pattern
9069   flags      points to the first code unit flags
9070   inassert   non-zero if in an assertion
9071 
9072 Returns:     the fixed first code unit, or 0 with REQ_NONE in flags
9073 */
9074 
9075 static uint32_t
find_firstassertedcu(PCRE2_SPTR code,uint32_t * flags,uint32_t inassert)9076 find_firstassertedcu(PCRE2_SPTR code, uint32_t *flags, uint32_t inassert)
9077 {
9078 uint32_t c = 0;
9079 uint32_t cflags = REQ_NONE;
9080 
9081 *flags = REQ_NONE;
9082 do {
9083    uint32_t d;
9084    uint32_t dflags;
9085    int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
9086              *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
9087    PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
9088    PCRE2_UCHAR op = *scode;
9089 
9090    switch(op)
9091      {
9092      default:
9093      return 0;
9094 
9095      case OP_BRA:
9096      case OP_BRAPOS:
9097      case OP_CBRA:
9098      case OP_SCBRA:
9099      case OP_CBRAPOS:
9100      case OP_SCBRAPOS:
9101      case OP_ASSERT:
9102      case OP_ASSERT_NA:
9103      case OP_ONCE:
9104      case OP_SCRIPT_RUN:
9105      d = find_firstassertedcu(scode, &dflags, inassert +
9106        ((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0));
9107      if (dflags >= REQ_NONE) return 0;
9108      if (cflags >= REQ_NONE) { c = d; cflags = dflags; }
9109        else if (c != d || cflags != dflags) return 0;
9110      break;
9111 
9112      case OP_EXACT:
9113      scode += IMM2_SIZE;
9114      /* Fall through */
9115 
9116      case OP_CHAR:
9117      case OP_PLUS:
9118      case OP_MINPLUS:
9119      case OP_POSPLUS:
9120      if (inassert == 0) return 0;
9121      if (cflags >= REQ_NONE) { c = scode[1]; cflags = 0; }
9122        else if (c != scode[1]) return 0;
9123      break;
9124 
9125      case OP_EXACTI:
9126      scode += IMM2_SIZE;
9127      /* Fall through */
9128 
9129      case OP_CHARI:
9130      case OP_PLUSI:
9131      case OP_MINPLUSI:
9132      case OP_POSPLUSI:
9133      if (inassert == 0) return 0;
9134 
9135      /* If the character is more than one code unit long, we cannot set its
9136      first code unit when matching caselessly. Later scanning may pick up
9137      multiple code units. */
9138 
9139 #ifdef SUPPORT_UNICODE
9140 #if PCRE2_CODE_UNIT_WIDTH == 8
9141      if (scode[1] >= 0x80) return 0;
9142 #elif PCRE2_CODE_UNIT_WIDTH == 16
9143      if (scode[1] >= 0xd800 && scode[1] <= 0xdfff) return 0;
9144 #endif
9145 #endif
9146 
9147      if (cflags >= REQ_NONE) { c = scode[1]; cflags = REQ_CASELESS; }
9148        else if (c != scode[1]) return 0;
9149      break;
9150      }
9151 
9152    code += GET(code, 1);
9153    }
9154 while (*code == OP_ALT);
9155 
9156 *flags = cflags;
9157 return c;
9158 }
9159 
9160 
9161 
9162 /*************************************************
9163 *     Add an entry to the name/number table      *
9164 *************************************************/
9165 
9166 /* This function is called between compiling passes to add an entry to the
9167 name/number table, maintaining alphabetical order. Checking for permitted
9168 and forbidden duplicates has already been done.
9169 
9170 Arguments:
9171   cb           the compile data block
9172   name         the name to add
9173   length       the length of the name
9174   groupno      the group number
9175   tablecount   the count of names in the table so far
9176 
9177 Returns:       nothing
9178 */
9179 
9180 static void
add_name_to_table(compile_block * cb,PCRE2_SPTR name,int length,unsigned int groupno,uint32_t tablecount)9181 add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length,
9182   unsigned int groupno, uint32_t tablecount)
9183 {
9184 uint32_t i;
9185 PCRE2_UCHAR *slot = cb->name_table;
9186 
9187 for (i = 0; i < tablecount; i++)
9188   {
9189   int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length));
9190   if (crc == 0 && slot[IMM2_SIZE+length] != 0)
9191     crc = -1; /* Current name is a substring */
9192 
9193   /* Make space in the table and break the loop for an earlier name. For a
9194   duplicate or later name, carry on. We do this for duplicates so that in the
9195   simple case (when ?(| is not used) they are in order of their numbers. In all
9196   cases they are in the order in which they appear in the pattern. */
9197 
9198   if (crc < 0)
9199     {
9200     (void)memmove(slot + cb->name_entry_size, slot,
9201       CU2BYTES((tablecount - i) * cb->name_entry_size));
9202     break;
9203     }
9204 
9205   /* Continue the loop for a later or duplicate name */
9206 
9207   slot += cb->name_entry_size;
9208   }
9209 
9210 PUT2(slot, 0, groupno);
9211 memcpy(slot + IMM2_SIZE, name, CU2BYTES(length));
9212 
9213 /* Add a terminating zero and fill the rest of the slot with zeroes so that
9214 the memory is all initialized. Otherwise valgrind moans about uninitialized
9215 memory when saving serialized compiled patterns. */
9216 
9217 memset(slot + IMM2_SIZE + length, 0,
9218   CU2BYTES(cb->name_entry_size - length - IMM2_SIZE));
9219 }
9220 
9221 
9222 
9223 /*************************************************
9224 *             Skip in parsed pattern             *
9225 *************************************************/
9226 
9227 /* This function is called to skip parts of the parsed pattern when finding the
9228 length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
9229 the end of the branch, it is called to skip over an internal lookaround or
9230 (DEFINE) group, and it is also called to skip to the end of a class, during
9231 which it will never encounter nested groups (but there's no need to have
9232 special code for that).
9233 
9234 When called to find the end of a branch or group, pptr must point to the first
9235 meta code inside the branch, not the branch-starting code. In other cases it
9236 can point to the item that causes the function to be called.
9237 
9238 Arguments:
9239   pptr       current pointer to skip from
9240   skiptype   PSKIP_CLASS when skipping to end of class
9241              PSKIP_ALT when META_ALT ends the skip
9242              PSKIP_KET when only META_KET ends the skip
9243 
9244 Returns:     new value of pptr
9245              NULL if META_END is reached - should never occur
9246                or for an unknown meta value - likewise
9247 */
9248 
9249 static uint32_t *
parsed_skip(uint32_t * pptr,uint32_t skiptype)9250 parsed_skip(uint32_t *pptr, uint32_t skiptype)
9251 {
9252 uint32_t nestlevel = 0;
9253 
9254 for (;; pptr++)
9255   {
9256   uint32_t meta = META_CODE(*pptr);
9257 
9258   switch(meta)
9259     {
9260     default:  /* Just skip over most items */
9261     if (meta < META_END) continue;  /* Literal */
9262     break;
9263 
9264     /* This should never occur. */
9265 
9266     case META_END:
9267     return NULL;
9268 
9269     /* The data for these items is variable in length. */
9270 
9271     case META_BACKREF:  /* Offset is present only if group >= 10 */
9272     if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;
9273     break;
9274 
9275     case META_ESCAPE:   /* A few escapes are followed by data items. */
9276     switch (META_DATA(*pptr))
9277       {
9278       case ESC_P:
9279       case ESC_p:
9280       pptr += 1;
9281       break;
9282 
9283       case ESC_g:
9284       case ESC_k:
9285       pptr += 1 + SIZEOFFSET;
9286       break;
9287       }
9288     break;
9289 
9290     case META_MARK:     /* Add the length of the name. */
9291     case META_COMMIT_ARG:
9292     case META_PRUNE_ARG:
9293     case META_SKIP_ARG:
9294     case META_THEN_ARG:
9295     pptr += pptr[1];
9296     break;
9297 
9298     /* These are the "active" items in this loop. */
9299 
9300     case META_CLASS_END:
9301     if (skiptype == PSKIP_CLASS) return pptr;
9302     break;
9303 
9304     case META_ATOMIC:
9305     case META_CAPTURE:
9306     case META_COND_ASSERT:
9307     case META_COND_DEFINE:
9308     case META_COND_NAME:
9309     case META_COND_NUMBER:
9310     case META_COND_RNAME:
9311     case META_COND_RNUMBER:
9312     case META_COND_VERSION:
9313     case META_LOOKAHEAD:
9314     case META_LOOKAHEADNOT:
9315     case META_LOOKAHEAD_NA:
9316     case META_LOOKBEHIND:
9317     case META_LOOKBEHINDNOT:
9318     case META_LOOKBEHIND_NA:
9319     case META_NOCAPTURE:
9320     case META_SCRIPT_RUN:
9321     nestlevel++;
9322     break;
9323 
9324     case META_ALT:
9325     if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;
9326     break;
9327 
9328     case META_KET:
9329     if (nestlevel == 0) return pptr;
9330     nestlevel--;
9331     break;
9332     }
9333 
9334   /* The extra data item length for each meta is in a table. */
9335 
9336   meta = (meta >> 16) & 0x7fff;
9337   if (meta >= sizeof(meta_extra_lengths)) return NULL;
9338   pptr += meta_extra_lengths[meta];
9339   }
9340 /* Control never reaches here */
9341 return pptr;
9342 }
9343 
9344 
9345 
9346 /*************************************************
9347 *       Find length of a parsed group            *
9348 *************************************************/
9349 
9350 /* This is called for nested groups within a branch of a lookbehind whose
9351 length is being computed. On entry, the pointer must be at the first element
9352 after the group initializing code. On exit it points to OP_KET. Caching is used
9353 to improve processing speed when the same capturing group occurs many times.
9354 
9355 Arguments:
9356   pptrptr     pointer to pointer in the parsed pattern
9357   minptr      where to return the minimum length
9358   isinline    FALSE if a reference or recursion; TRUE for inline group
9359   errcodeptr  pointer to the errorcode
9360   lcptr       pointer to the loop counter
9361   group       number of captured group or -1 for a non-capturing group
9362   recurses    chain of recurse_check to catch mutual recursion
9363   cb          pointer to the compile data
9364 
9365 Returns:      the maximum group length or a negative number
9366 */
9367 
9368 static int
get_grouplength(uint32_t ** pptrptr,int * minptr,BOOL isinline,int * errcodeptr,int * lcptr,int group,parsed_recurse_check * recurses,compile_block * cb)9369 get_grouplength(uint32_t **pptrptr, int *minptr, BOOL isinline, int *errcodeptr,
9370   int *lcptr, int group, parsed_recurse_check *recurses, compile_block *cb)
9371 {
9372 uint32_t *gi = cb->groupinfo + 2 * group;
9373 int branchlength, branchminlength;
9374 int grouplength = -1;
9375 int groupminlength = INT_MAX;
9376 
9377 /* The cache can be used only if there is no possibility of there being two
9378 groups with the same number. We do not need to set the end pointer for a group
9379 that is being processed as a back reference or recursion, but we must do so for
9380 an inline group. */
9381 
9382 if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
9383   {
9384   uint32_t groupinfo = gi[0];
9385   if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1;
9386   if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
9387     {
9388     if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
9389     *minptr = gi[1];
9390     return groupinfo & GI_FIXED_LENGTH_MASK;
9391     }
9392   }
9393 
9394 /* Scan the group. In this case we find the end pointer of necessity. */
9395 
9396 for(;;)
9397   {
9398   branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
9399     recurses, cb);
9400   if (branchlength < 0) goto ISNOTFIXED;
9401   if (branchlength > grouplength) grouplength = branchlength;
9402   if (branchminlength < groupminlength) groupminlength = branchminlength;
9403   if (**pptrptr == META_KET) break;
9404   *pptrptr += 1;   /* Skip META_ALT */
9405   }
9406 
9407 if (group > 0)
9408   {
9409   gi[0] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
9410   gi[1] = groupminlength;
9411   }
9412 
9413 *minptr = groupminlength;
9414 return grouplength;
9415 
9416 ISNOTFIXED:
9417 if (group > 0) gi[0] |= GI_NOT_FIXED_LENGTH;
9418 return -1;
9419 }
9420 
9421 
9422 
9423 /*************************************************
9424 *        Find length of a parsed branch          *
9425 *************************************************/
9426 
9427 /* Return fixed maximum and minimum lengths for a branch in a lookbehind,
9428 giving an error if the length is not limited. On entry, *pptrptr points to the
9429 first element inside the branch. On exit it is set to point to the ALT or KET.
9430 
9431 Arguments:
9432   pptrptr     pointer to pointer in the parsed pattern
9433   minptr      where to return the minimum length
9434   errcodeptr  pointer to error code
9435   lcptr       pointer to loop counter
9436   recurses    chain of recurse_check to catch mutual recursion
9437   cb          pointer to compile block
9438 
9439 Returns:      the maximum length, or a negative value on error
9440 */
9441 
9442 static int
get_branchlength(uint32_t ** pptrptr,int * minptr,int * errcodeptr,int * lcptr,parsed_recurse_check * recurses,compile_block * cb)9443 get_branchlength(uint32_t **pptrptr, int *minptr, int *errcodeptr, int *lcptr,
9444   parsed_recurse_check *recurses, compile_block *cb)
9445 {
9446 int branchlength = 0;
9447 int branchminlength = 0;
9448 int grouplength, groupminlength;
9449 uint32_t lastitemlength = 0;
9450 uint32_t lastitemminlength = 0;
9451 uint32_t *pptr = *pptrptr;
9452 PCRE2_SIZE offset;
9453 parsed_recurse_check this_recurse;
9454 
9455 /* A large and/or complex regex can take too long to process. This can happen
9456 more often when (?| groups are present in the pattern because their length
9457 cannot be cached. */
9458 
9459 if ((*lcptr)++ > 2000)
9460   {
9461   *errcodeptr = ERR35;  /* Lookbehind is too complicated */
9462   return -1;
9463   }
9464 
9465 /* Scan the branch, accumulating the length. */
9466 
9467 for (;; pptr++)
9468   {
9469   parsed_recurse_check *r;
9470   uint32_t *gptr, *gptrend;
9471   uint32_t escape;
9472   uint32_t group = 0;
9473   uint32_t itemlength = 0;
9474   uint32_t itemminlength = 0;
9475   uint32_t min, max;
9476 
9477   if (*pptr < META_END)
9478     {
9479     itemlength = itemminlength = 1;
9480     }
9481 
9482   else switch (META_CODE(*pptr))
9483     {
9484     case META_KET:
9485     case META_ALT:
9486     goto EXIT;
9487 
9488     /* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the
9489     actual termination. */
9490 
9491     case META_ACCEPT:
9492     case META_FAIL:
9493     pptr = parsed_skip(pptr, PSKIP_ALT);
9494     if (pptr == NULL) goto PARSED_SKIP_FAILED;
9495     goto EXIT;
9496 
9497     case META_MARK:
9498     case META_COMMIT_ARG:
9499     case META_PRUNE_ARG:
9500     case META_SKIP_ARG:
9501     case META_THEN_ARG:
9502     pptr += pptr[1] + 1;
9503     break;
9504 
9505     case META_CIRCUMFLEX:
9506     case META_COMMIT:
9507     case META_DOLLAR:
9508     case META_PRUNE:
9509     case META_SKIP:
9510     case META_THEN:
9511     break;
9512 
9513     case META_OPTIONS:
9514     pptr += 2;
9515     break;
9516 
9517     case META_BIGVALUE:
9518     itemlength = itemminlength = 1;
9519     pptr += 1;
9520     break;
9521 
9522     case META_CLASS:
9523     case META_CLASS_NOT:
9524     itemlength = itemminlength = 1;
9525     pptr = parsed_skip(pptr, PSKIP_CLASS);
9526     if (pptr == NULL) goto PARSED_SKIP_FAILED;
9527     break;
9528 
9529     case META_CLASS_EMPTY_NOT:
9530     case META_DOT:
9531     itemlength = itemminlength = 1;
9532     break;
9533 
9534     case META_CALLOUT_NUMBER:
9535     pptr += 3;
9536     break;
9537 
9538     case META_CALLOUT_STRING:
9539     pptr += 3 + SIZEOFFSET;
9540     break;
9541 
9542     /* Only some escapes consume a character. Of those, \R can match one or two
9543     characters, but \X is never allowed because it matches an unknown number of
9544     characters. \C is allowed only in 32-bit and non-UTF 8/16-bit modes. */
9545 
9546     case META_ESCAPE:
9547     escape = META_DATA(*pptr);
9548     if (escape == ESC_X) return -1;
9549     if (escape == ESC_R)
9550       {
9551       itemminlength = 1;
9552       itemlength = 2;
9553       }
9554     else if (escape > ESC_b && escape < ESC_Z)
9555       {
9556 #if PCRE2_CODE_UNIT_WIDTH != 32
9557       if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C)
9558         {
9559         *errcodeptr = ERR36;
9560         return -1;
9561         }
9562 #endif
9563       itemlength = itemminlength = 1;
9564       if (escape == ESC_p || escape == ESC_P) pptr++;  /* Skip prop data */
9565       }
9566     break;
9567 
9568     /* Lookaheads do not contribute to the length of this branch, but they may
9569     contain lookbehinds within them whose lengths need to be set. */
9570 
9571     case META_LOOKAHEAD:
9572     case META_LOOKAHEADNOT:
9573     case META_LOOKAHEAD_NA:
9574     *errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb, lcptr);
9575     if (*errcodeptr != 0) return -1;
9576 
9577     /* Ignore any qualifiers that follow a lookahead assertion. */
9578 
9579     switch (pptr[1])
9580       {
9581       case META_ASTERISK:
9582       case META_ASTERISK_PLUS:
9583       case META_ASTERISK_QUERY:
9584       case META_PLUS:
9585       case META_PLUS_PLUS:
9586       case META_PLUS_QUERY:
9587       case META_QUERY:
9588       case META_QUERY_PLUS:
9589       case META_QUERY_QUERY:
9590       pptr++;
9591       break;
9592 
9593       case META_MINMAX:
9594       case META_MINMAX_PLUS:
9595       case META_MINMAX_QUERY:
9596       pptr += 3;
9597       break;
9598 
9599       default:
9600       break;
9601       }
9602     break;
9603 
9604     /* A nested lookbehind does not contribute any length to this lookbehind,
9605     but must itself be checked and have its lengths set. */
9606 
9607     case META_LOOKBEHIND:
9608     case META_LOOKBEHINDNOT:
9609     case META_LOOKBEHIND_NA:
9610     if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))
9611       return -1;
9612     break;
9613 
9614     /* Back references and recursions are handled by very similar code. At this
9615     stage, the names generated in the parsing pass are available, but the main
9616     name table has not yet been created. So for the named varieties, scan the
9617     list of names in order to get the number of the first one in the pattern,
9618     and whether or not this name is duplicated. */
9619 
9620     case META_BACKREF_BYNAME:
9621     if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)
9622       goto ISNOTFIXED;
9623     /* Fall through */
9624 
9625     case META_RECURSE_BYNAME:
9626       {
9627       int i;
9628       PCRE2_SPTR name;
9629       BOOL is_dupname = FALSE;
9630       named_group *ng = cb->named_groups;
9631       uint32_t meta_code = META_CODE(*pptr);
9632       uint32_t length = *(++pptr);
9633 
9634       GETPLUSOFFSET(offset, pptr);
9635       name = cb->start_pattern + offset;
9636       for (i = 0; i < cb->names_found; i++, ng++)
9637         {
9638         if (length == ng->length && PRIV(strncmp)(name, ng->name, length) == 0)
9639           {
9640           group = ng->number;
9641           is_dupname = ng->isdup;
9642           break;
9643           }
9644         }
9645 
9646       if (group == 0)
9647         {
9648         *errcodeptr = ERR15;  /* Non-existent subpattern */
9649         cb->erroroffset = offset;
9650         return -1;
9651         }
9652 
9653       /* A numerical back reference can be fixed length if duplicate capturing
9654       groups are not being used. A non-duplicate named back reference can also
9655       be handled. */
9656 
9657       if (meta_code == META_RECURSE_BYNAME ||
9658           (!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0))
9659         goto RECURSE_OR_BACKREF_LENGTH;  /* Handle as a numbered version. */
9660       }
9661     goto ISNOTFIXED;                     /* Duplicate name or number */
9662 
9663     /* The offset values for back references < 10 are in a separate vector
9664     because otherwise they would use more than two parsed pattern elements on
9665     64-bit systems. */
9666 
9667     case META_BACKREF:
9668     if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 ||
9669         (cb->external_flags & PCRE2_DUPCAPUSED) != 0)
9670       goto ISNOTFIXED;
9671     group = META_DATA(*pptr);
9672     if (group < 10)
9673       {
9674       offset = cb->small_ref_offset[group];
9675       goto RECURSE_OR_BACKREF_LENGTH;
9676       }
9677 
9678     /* Fall through */
9679     /* For groups >= 10 - picking up group twice does no harm. */
9680 
9681     /* A true recursion implies not fixed length, but a subroutine call may
9682     be OK. Back reference "recursions" are also failed. */
9683 
9684     case META_RECURSE:
9685     group = META_DATA(*pptr);
9686     GETPLUSOFFSET(offset, pptr);
9687 
9688     RECURSE_OR_BACKREF_LENGTH:
9689     if (group > cb->bracount)
9690       {
9691       cb->erroroffset = offset;
9692       *errcodeptr = ERR15;  /* Non-existent subpattern */
9693       return -1;
9694       }
9695     if (group == 0) goto ISNOTFIXED;  /* Local recursion */
9696     for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)
9697       {
9698       if (META_CODE(*gptr) == META_BIGVALUE) gptr++;
9699         else if (*gptr == (META_CAPTURE | group)) break;
9700       }
9701 
9702     /* We must start the search for the end of the group at the first meta code
9703     inside the group. Otherwise it will be treated as an enclosed group. */
9704 
9705     gptrend = parsed_skip(gptr + 1, PSKIP_KET);
9706     if (gptrend == NULL) goto PARSED_SKIP_FAILED;
9707     if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED;  /* Local recursion */
9708     for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;
9709     if (r != NULL) goto ISNOTFIXED;   /* Mutual recursion */
9710     this_recurse.prev = recurses;
9711     this_recurse.groupptr = gptr;
9712 
9713     /* We do not need to know the position of the end of the group, that is,
9714     gptr is not used after the call to get_grouplength(). Setting the second
9715     argument FALSE stops it scanning for the end when the length can be found
9716     in the cache. */
9717 
9718     gptr++;
9719     grouplength = get_grouplength(&gptr, &groupminlength, FALSE, errcodeptr,
9720       lcptr, group, &this_recurse, cb);
9721     if (grouplength < 0)
9722       {
9723       if (*errcodeptr == 0) goto ISNOTFIXED;
9724       return -1;  /* Error already set */
9725       }
9726     itemlength = grouplength;
9727     itemminlength = groupminlength;
9728     break;
9729 
9730     /* A (DEFINE) group is never obeyed inline and so it does not contribute to
9731     the length of this branch. Skip from the following item to the next
9732     unpaired ket. */
9733 
9734     case META_COND_DEFINE:
9735     pptr = parsed_skip(pptr + 1, PSKIP_KET);
9736     break;
9737 
9738     /* Check other nested groups - advance past the initial data for each type
9739     and then seek a fixed length with get_grouplength(). */
9740 
9741     case META_COND_NAME:
9742     case META_COND_NUMBER:
9743     case META_COND_RNAME:
9744     case META_COND_RNUMBER:
9745     pptr += 2 + SIZEOFFSET;
9746     goto CHECK_GROUP;
9747 
9748     case META_COND_ASSERT:
9749     pptr += 1;
9750     goto CHECK_GROUP;
9751 
9752     case META_COND_VERSION:
9753     pptr += 4;
9754     goto CHECK_GROUP;
9755 
9756     case META_CAPTURE:
9757     group = META_DATA(*pptr);
9758     /* Fall through */
9759 
9760     case META_ATOMIC:
9761     case META_NOCAPTURE:
9762     case META_SCRIPT_RUN:
9763     pptr++;
9764     CHECK_GROUP:
9765     grouplength = get_grouplength(&pptr, &groupminlength, TRUE, errcodeptr,
9766       lcptr, group, recurses, cb);
9767     if (grouplength < 0) return -1;
9768     itemlength = grouplength;
9769     itemminlength = groupminlength;
9770     break;
9771 
9772     case META_QUERY:
9773     case META_QUERY_PLUS:
9774     case META_QUERY_QUERY:
9775     min = 0;
9776     max = 1;
9777     goto REPETITION;
9778 
9779     /* Exact repetition is OK; variable repetition is not. A repetition of zero
9780     must subtract the length that has already been added. */
9781 
9782     case META_MINMAX:
9783     case META_MINMAX_PLUS:
9784     case META_MINMAX_QUERY:
9785     min = pptr[1];
9786     max = pptr[2];
9787     pptr += 2;
9788 
9789     REPETITION:
9790     if (max != REPEAT_UNLIMITED)
9791       {
9792       if (lastitemlength != 0 &&  /* Should not occur, but just in case */
9793           max != 0 &&
9794           (INT_MAX - branchlength)/lastitemlength < max - 1)
9795         {
9796         *errcodeptr = ERR87;  /* Integer overflow; lookbehind too big */
9797         return -1;
9798         }
9799       if (min == 0) branchminlength -= lastitemminlength;
9800         else itemminlength = (min - 1) * lastitemminlength;
9801       if (max == 0) branchlength -= lastitemlength;
9802         else itemlength = (max - 1) * lastitemlength;
9803       break;
9804       }
9805     /* Fall through */
9806 
9807     /* Any other item means this branch does not have a fixed length. */
9808 
9809     default:
9810     ISNOTFIXED:
9811     *errcodeptr = ERR25;   /* Not fixed length */
9812     return -1;
9813     }
9814 
9815   /* Add the item length to the branchlength, checking for integer overflow and
9816   for the branch length exceeding the overall limit. Later, if there is at
9817   least one variable-length branch in the group, there is a test for the
9818   (smaller) variable-length branch length limit. */
9819 
9820   if (INT_MAX - branchlength < (int)itemlength ||
9821       (branchlength += itemlength) > LOOKBEHIND_MAX)
9822     {
9823     *errcodeptr = ERR87;
9824     return -1;
9825     }
9826 
9827   branchminlength += itemminlength;
9828 
9829   /* Save this item length for use if the next item is a quantifier. */
9830 
9831   lastitemlength = itemlength;
9832   lastitemminlength = itemminlength;
9833   }
9834 
9835 EXIT:
9836 *pptrptr = pptr;
9837 *minptr = branchminlength;
9838 return branchlength;
9839 
9840 PARSED_SKIP_FAILED:
9841 *errcodeptr = ERR90;
9842 return -1;
9843 }
9844 
9845 
9846 
9847 /*************************************************
9848 *        Set lengths in a lookbehind             *
9849 *************************************************/
9850 
9851 /* This function is called for each lookbehind, to set the lengths in its
9852 branches. An error occurs if any branch does not have a limited maximum length
9853 that is less than the limit (65535). On exit, the pointer must be left on the
9854 final ket.
9855 
9856 The function also maintains the max_lookbehind value. Any lookbehind branch
9857 that contains a nested lookbehind may actually look further back than the
9858 length of the branch. The additional amount is passed back from
9859 get_branchlength() as an "extra" value.
9860 
9861 Arguments:
9862   pptrptr     pointer to pointer in the parsed pattern
9863   errcodeptr  pointer to error code
9864   lcptr       pointer to loop counter
9865   recurses    chain of recurse_check to catch mutual recursion
9866   cb          pointer to compile block
9867 
9868 Returns:      TRUE if all is well
9869               FALSE otherwise, with error code and offset set
9870 */
9871 
9872 static BOOL
set_lookbehind_lengths(uint32_t ** pptrptr,int * errcodeptr,int * lcptr,parsed_recurse_check * recurses,compile_block * cb)9873 set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
9874   parsed_recurse_check *recurses, compile_block *cb)
9875 {
9876 PCRE2_SIZE offset;
9877 uint32_t *bptr = *pptrptr;
9878 uint32_t *gbptr = bptr;
9879 int maxlength = 0;
9880 int minlength = INT_MAX;
9881 BOOL variable = FALSE;
9882 
9883 READPLUSOFFSET(offset, bptr);  /* Offset for error messages */
9884 *pptrptr += SIZEOFFSET;
9885 
9886 /* Each branch can have a different maximum length, but we can keep only a
9887 single minimum for the whole group, because there's nowhere to save individual
9888 values in the META_ALT item. */
9889 
9890 do
9891   {
9892   int branchlength, branchminlength;
9893 
9894   *pptrptr += 1;
9895   branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
9896     recurses, cb);
9897 
9898   if (branchlength < 0)
9899     {
9900     /* The errorcode and offset may already be set from a nested lookbehind. */
9901     if (*errcodeptr == 0) *errcodeptr = ERR25;
9902     if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
9903     return FALSE;
9904     }
9905 
9906   if (branchlength != branchminlength) variable = TRUE;
9907   if (branchminlength < minlength) minlength = branchminlength;
9908   if (branchlength > maxlength) maxlength = branchlength;
9909   if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;
9910   *bptr |= branchlength;  /* branchlength never more than 65535 */
9911   bptr = *pptrptr;
9912   }
9913 while (META_CODE(*bptr) == META_ALT);
9914 
9915 /* If any branch is of variable length, the whole lookbehind is of variable
9916 length. If the maximum length of any branch exceeds the maximum for variable
9917 lookbehinds, give an error. Otherwise, the minimum length is set in the word
9918 that follows the original group META value. For a fixed-length lookbehind, this
9919 is set to LOOKBEHIND_MAX, to indicate that each branch is of a fixed (but
9920 possibly different) length. */
9921 
9922 if (variable)
9923   {
9924   gbptr[1] = minlength;
9925   if ((uint32_t)maxlength > cb->max_varlookbehind)
9926     {
9927     *errcodeptr = ERR100;
9928     cb->erroroffset = offset;
9929     return FALSE;
9930     }
9931   }
9932 else gbptr[1] = LOOKBEHIND_MAX;
9933 
9934 
9935 gbptr[1] = variable? minlength : LOOKBEHIND_MAX;
9936 return TRUE;
9937 }
9938 
9939 
9940 
9941 /*************************************************
9942 *         Check parsed pattern lookbehinds       *
9943 *************************************************/
9944 
9945 /* This function is called at the end of parsing a pattern if any lookbehinds
9946 were encountered. It scans the parsed pattern for them, calling
9947 set_lookbehind_lengths() for each one. At the start, the errorcode is zero and
9948 the error offset is marked unset. The enables the functions above not to
9949 override settings from deeper nestings.
9950 
9951 This function is called recursively from get_branchlength() for lookaheads in
9952 order to process any lookbehinds that they may contain. It stops when it hits a
9953 non-nested closing parenthesis in this case, returning a pointer to it.
9954 
9955 Arguments
9956   pptr      points to where to start (start of pattern or start of lookahead)
9957   retptr    if not NULL, return the ket pointer here
9958   recurses  chain of recurse_check to catch mutual recursion
9959   cb        points to the compile block
9960   lcptr     points to loop counter
9961 
9962 Returns:    0 on success, or an errorcode (cb->erroroffset will be set)
9963 */
9964 
9965 static int
check_lookbehinds(uint32_t * pptr,uint32_t ** retptr,parsed_recurse_check * recurses,compile_block * cb,int * lcptr)9966 check_lookbehinds(uint32_t *pptr, uint32_t **retptr,
9967   parsed_recurse_check *recurses, compile_block *cb, int *lcptr)
9968 {
9969 int errorcode = 0;
9970 int nestlevel = 0;
9971 
9972 cb->erroroffset = PCRE2_UNSET;
9973 
9974 for (; *pptr != META_END; pptr++)
9975   {
9976   if (*pptr < META_END) continue;  /* Literal */
9977 
9978   switch (META_CODE(*pptr))
9979     {
9980     default:
9981     return ERR70;  /* Unrecognized meta code */
9982 
9983     case META_ESCAPE:
9984     if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
9985       pptr += 1;
9986     break;
9987 
9988     case META_KET:
9989     if (--nestlevel < 0)
9990       {
9991       if (retptr != NULL) *retptr = pptr;
9992       return 0;
9993       }
9994     break;
9995 
9996     case META_ATOMIC:
9997     case META_CAPTURE:
9998     case META_COND_ASSERT:
9999     case META_LOOKAHEAD:
10000     case META_LOOKAHEADNOT:
10001     case META_LOOKAHEAD_NA:
10002     case META_NOCAPTURE:
10003     case META_SCRIPT_RUN:
10004     nestlevel++;
10005     break;
10006 
10007     case META_ACCEPT:
10008     case META_ALT:
10009     case META_ASTERISK:
10010     case META_ASTERISK_PLUS:
10011     case META_ASTERISK_QUERY:
10012     case META_BACKREF:
10013     case META_CIRCUMFLEX:
10014     case META_CLASS:
10015     case META_CLASS_EMPTY:
10016     case META_CLASS_EMPTY_NOT:
10017     case META_CLASS_END:
10018     case META_CLASS_NOT:
10019     case META_COMMIT:
10020     case META_DOLLAR:
10021     case META_DOT:
10022     case META_FAIL:
10023     case META_PLUS:
10024     case META_PLUS_PLUS:
10025     case META_PLUS_QUERY:
10026     case META_PRUNE:
10027     case META_QUERY:
10028     case META_QUERY_PLUS:
10029     case META_QUERY_QUERY:
10030     case META_RANGE_ESCAPED:
10031     case META_RANGE_LITERAL:
10032     case META_SKIP:
10033     case META_THEN:
10034     break;
10035 
10036     case META_RECURSE:
10037     pptr += SIZEOFFSET;
10038     break;
10039 
10040     case META_BACKREF_BYNAME:
10041     case META_RECURSE_BYNAME:
10042     pptr += 1 + SIZEOFFSET;
10043     break;
10044 
10045     case META_COND_DEFINE:
10046     pptr += SIZEOFFSET;
10047     nestlevel++;
10048     break;
10049 
10050     case META_COND_NAME:
10051     case META_COND_NUMBER:
10052     case META_COND_RNAME:
10053     case META_COND_RNUMBER:
10054     pptr += 1 + SIZEOFFSET;
10055     nestlevel++;
10056     break;
10057 
10058     case META_COND_VERSION:
10059     pptr += 3;
10060     nestlevel++;
10061     break;
10062 
10063     case META_CALLOUT_STRING:
10064     pptr += 3 + SIZEOFFSET;
10065     break;
10066 
10067     case META_BIGVALUE:
10068     case META_POSIX:
10069     case META_POSIX_NEG:
10070     pptr += 1;
10071     break;
10072 
10073     case META_MINMAX:
10074     case META_MINMAX_QUERY:
10075     case META_MINMAX_PLUS:
10076     case META_OPTIONS:
10077     pptr += 2;
10078     break;
10079 
10080     case META_CALLOUT_NUMBER:
10081     pptr += 3;
10082     break;
10083 
10084     case META_MARK:
10085     case META_COMMIT_ARG:
10086     case META_PRUNE_ARG:
10087     case META_SKIP_ARG:
10088     case META_THEN_ARG:
10089     pptr += 1 + pptr[1];
10090     break;
10091 
10092     case META_LOOKBEHIND:
10093     case META_LOOKBEHINDNOT:
10094     case META_LOOKBEHIND_NA:
10095     if (!set_lookbehind_lengths(&pptr, &errorcode, lcptr, recurses, cb))
10096       return errorcode;
10097     break;
10098     }
10099   }
10100 
10101 return 0;
10102 }
10103 
10104 
10105 
10106 /*************************************************
10107 *     External function to compile a pattern     *
10108 *************************************************/
10109 
10110 /* This function reads a regular expression in the form of a string and returns
10111 a pointer to a block of store holding a compiled version of the expression.
10112 
10113 Arguments:
10114   pattern       the regular expression
10115   patlen        the length of the pattern, or PCRE2_ZERO_TERMINATED
10116   options       option bits
10117   errorptr      pointer to errorcode
10118   erroroffset   pointer to error offset
10119   ccontext      points to a compile context or is NULL
10120 
10121 Returns:        pointer to compiled data block, or NULL on error,
10122                 with errorcode and erroroffset set
10123 */
10124 
10125 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_compile(PCRE2_SPTR pattern,PCRE2_SIZE patlen,uint32_t options,int * errorptr,PCRE2_SIZE * erroroffset,pcre2_compile_context * ccontext)10126 pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
10127    int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
10128 {
10129 BOOL utf;                             /* Set TRUE for UTF mode */
10130 BOOL ucp;                             /* Set TRUE for UCP mode */
10131 BOOL has_lookbehind = FALSE;          /* Set TRUE if a lookbehind is found */
10132 BOOL zero_terminated;                 /* Set TRUE for zero-terminated pattern */
10133 pcre2_real_code *re = NULL;           /* What we will return */
10134 compile_block cb;                     /* "Static" compile-time data */
10135 const uint8_t *tables;                /* Char tables base pointer */
10136 
10137 PCRE2_UCHAR *code;                    /* Current pointer in compiled code */
10138 PCRE2_SPTR codestart;                 /* Start of compiled code */
10139 PCRE2_SPTR ptr;                       /* Current pointer in pattern */
10140 uint32_t *pptr;                       /* Current pointer in parsed pattern */
10141 
10142 PCRE2_SIZE length = 1;                /* Allow for final END opcode */
10143 PCRE2_SIZE usedlength;                /* Actual length used */
10144 PCRE2_SIZE re_blocksize;              /* Size of memory block */
10145 PCRE2_SIZE big32count = 0;            /* 32-bit literals >= 0x80000000 */
10146 PCRE2_SIZE parsed_size_needed;        /* Needed for parsed pattern */
10147 
10148 uint32_t firstcuflags, reqcuflags;    /* Type of first/req code unit */
10149 uint32_t firstcu, reqcu;              /* Value of first/req code unit */
10150 uint32_t setflags = 0;                /* NL and BSR set flags */
10151 
10152 uint32_t skipatstart;                 /* When checking (*UTF) etc */
10153 uint32_t limit_heap  = UINT32_MAX;
10154 uint32_t limit_match = UINT32_MAX;    /* Unset match limits */
10155 uint32_t limit_depth = UINT32_MAX;
10156 
10157 int newline = 0;                      /* Unset; can be set by the pattern */
10158 int bsr = 0;                          /* Unset; can be set by the pattern */
10159 int errorcode = 0;                    /* Initialize to avoid compiler warn */
10160 int regexrc;                          /* Return from compile */
10161 
10162 uint32_t i;                           /* Local loop counter */
10163 
10164 /* Comments at the head of this file explain about these variables. */
10165 
10166 uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];
10167 uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];
10168 named_group named_groups[NAMED_GROUP_LIST_SIZE];
10169 
10170 /* The workspace is used in different ways in the different compiling phases.
10171 It needs to be 16-bit aligned for the preliminary parsing scan. */
10172 
10173 uint32_t c16workspace[C16_WORK_SIZE];
10174 PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace;
10175 
10176 
10177 /* -------------- Check arguments and set up the pattern ----------------- */
10178 
10179 /* There must be error code and offset pointers. */
10180 
10181 if (errorptr == NULL || erroroffset == NULL) return NULL;
10182 *errorptr = ERR0;
10183 *erroroffset = 0;
10184 
10185 /* There must be a pattern, but NULL is allowed with zero length. */
10186 
10187 if (pattern == NULL)
10188   {
10189   if (patlen == 0) pattern = (PCRE2_SPTR)""; else
10190     {
10191     *errorptr = ERR16;
10192     return NULL;
10193     }
10194   }
10195 
10196 /* A NULL compile context means "use a default context" */
10197 
10198 if (ccontext == NULL)
10199   ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
10200 
10201 /* PCRE2_MATCH_INVALID_UTF implies UTF */
10202 
10203 if ((options & PCRE2_MATCH_INVALID_UTF) != 0) options |= PCRE2_UTF;
10204 
10205 /* Check that all undefined public option bits are zero. */
10206 
10207 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 ||
10208     (ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)
10209   {
10210   *errorptr = ERR17;
10211   return NULL;
10212   }
10213 
10214 if ((options & PCRE2_LITERAL) != 0 &&
10215     ((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 ||
10216      (ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))
10217   {
10218   *errorptr = ERR92;
10219   return NULL;
10220   }
10221 
10222 /* A zero-terminated pattern is indicated by the special length value
10223 PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */
10224 
10225 if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))
10226   patlen = PRIV(strlen)(pattern);
10227 
10228 if (patlen > ccontext->max_pattern_length)
10229   {
10230   *errorptr = ERR88;
10231   return NULL;
10232   }
10233 
10234 /* From here on, all returns from this function should end up going via the
10235 EXIT label. */
10236 
10237 
10238 /* ------------ Initialize the "static" compile data -------------- */
10239 
10240 tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
10241 
10242 cb.lcc = tables + lcc_offset;          /* Individual */
10243 cb.fcc = tables + fcc_offset;          /*   character */
10244 cb.cbits = tables + cbits_offset;      /*      tables */
10245 cb.ctypes = tables + ctypes_offset;
10246 
10247 cb.assert_depth = 0;
10248 cb.bracount = 0;
10249 cb.cx = ccontext;
10250 cb.dupnames = FALSE;
10251 cb.end_pattern = pattern + patlen;
10252 cb.erroroffset = 0;
10253 cb.external_flags = 0;
10254 cb.external_options = options;
10255 cb.groupinfo = stack_groupinfo;
10256 cb.had_recurse = FALSE;
10257 cb.lastcapture = 0;
10258 cb.max_lookbehind = 0;                               /* Max encountered */
10259 cb.max_varlookbehind = ccontext->max_varlookbehind;  /* Limit */
10260 cb.name_entry_size = 0;
10261 cb.name_table = NULL;
10262 cb.named_groups = named_groups;
10263 cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
10264 cb.names_found = 0;
10265 cb.parens_depth = 0;
10266 cb.parsed_pattern = stack_parsed_pattern;
10267 cb.req_varyopt = 0;
10268 cb.start_code = cworkspace;
10269 cb.start_pattern = pattern;
10270 cb.start_workspace = cworkspace;
10271 cb.workspace_size = COMPILE_WORK_SIZE;
10272 
10273 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
10274 references to help in deciding whether (.*) can be treated as anchored or not.
10275 */
10276 
10277 cb.top_backref = 0;
10278 cb.backref_map = 0;
10279 
10280 /* Escape sequences \1 to \9 are always back references, but as they are only
10281 two characters long, only two elements can be used in the parsed_pattern
10282 vector. The first contains the reference, and we'd like to use the second to
10283 record the offset in the pattern, so that forward references to non-existent
10284 groups can be diagnosed later with an offset. However, on 64-bit systems,
10285 PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first
10286 occurrence of \1 to \9, indexed by the second parsed_pattern value. All other
10287 references have enough space for the offset to be put into the parsed pattern.
10288 */
10289 
10290 for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET;
10291 
10292 
10293 /* --------------- Start looking at the pattern --------------- */
10294 
10295 /* Unless PCRE2_LITERAL is set, check for global one-time option settings at
10296 the start of the pattern, and remember the offset to the actual regex. With
10297 valgrind support, make the terminator of a zero-terminated pattern
10298 inaccessible. This catches bugs that would otherwise only show up for
10299 non-zero-terminated patterns. */
10300 
10301 #ifdef SUPPORT_VALGRIND
10302 if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1));
10303 #endif
10304 
10305 ptr = pattern;
10306 skipatstart = 0;
10307 
10308 if ((options & PCRE2_LITERAL) == 0)
10309   {
10310   while (patlen - skipatstart >= 2 &&
10311          ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
10312          ptr[skipatstart+1] == CHAR_ASTERISK)
10313     {
10314     for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
10315       {
10316       uint32_t c, pp;
10317       const pso *p = pso_list + i;
10318 
10319       if (patlen - skipatstart - 2 >= p->length &&
10320           PRIV(strncmp_c8)(ptr + skipatstart + 2, (char *)(p->name),
10321             p->length) == 0)
10322         {
10323         skipatstart += p->length + 2;
10324         switch(p->type)
10325           {
10326           case PSO_OPT:
10327           cb.external_options |= p->value;
10328           break;
10329 
10330           case PSO_FLG:
10331           setflags |= p->value;
10332           break;
10333 
10334           case PSO_NL:
10335           newline = p->value;
10336           setflags |= PCRE2_NL_SET;
10337           break;
10338 
10339           case PSO_BSR:
10340           bsr = p->value;
10341           setflags |= PCRE2_BSR_SET;
10342           break;
10343 
10344           case PSO_LIMM:
10345           case PSO_LIMD:
10346           case PSO_LIMH:
10347           c = 0;
10348           pp = skipatstart;
10349           if (!IS_DIGIT(ptr[pp]))
10350             {
10351             errorcode = ERR60;
10352             ptr += pp;
10353             goto HAD_EARLY_ERROR;
10354             }
10355           while (IS_DIGIT(ptr[pp]))
10356             {
10357             if (c > UINT32_MAX / 10 - 1) break;   /* Integer overflow */
10358             c = c*10 + (ptr[pp++] - CHAR_0);
10359             }
10360           if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS)
10361             {
10362             errorcode = ERR60;
10363             ptr += pp;
10364             goto HAD_EARLY_ERROR;
10365             }
10366           if (p->type == PSO_LIMH) limit_heap = c;
10367             else if (p->type == PSO_LIMM) limit_match = c;
10368             else limit_depth = c;
10369           skipatstart += pp - skipatstart;
10370           break;
10371           }
10372         break;   /* Out of the table scan loop */
10373         }
10374       }
10375     if (i >= sizeof(pso_list)/sizeof(pso)) break;   /* Out of pso loop */
10376     }
10377   }
10378 
10379 /* End of pattern-start options; advance to start of real regex. */
10380 
10381 ptr += skipatstart;
10382 
10383 /* Can't support UTF or UCP if PCRE2 was built without Unicode support. */
10384 
10385 #ifndef SUPPORT_UNICODE
10386 if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
10387   {
10388   errorcode = ERR32;
10389   goto HAD_EARLY_ERROR;
10390   }
10391 #endif
10392 
10393 /* Check UTF. We have the original options in 'options', with that value as
10394 modified by (*UTF) etc in cb->external_options. The extra option
10395 PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
10396 surrogate code points cannot be represented in UTF-16. */
10397 
10398 utf = (cb.external_options & PCRE2_UTF) != 0;
10399 if (utf)
10400   {
10401   if ((options & PCRE2_NEVER_UTF) != 0)
10402     {
10403     errorcode = ERR74;
10404     goto HAD_EARLY_ERROR;
10405     }
10406   if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
10407        (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
10408     goto HAD_ERROR;  /* Offset was set by valid_utf() */
10409 
10410 #if PCRE2_CODE_UNIT_WIDTH == 16
10411   if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
10412     {
10413     errorcode = ERR91;
10414     goto HAD_EARLY_ERROR;
10415     }
10416 #endif
10417   }
10418 
10419 /* Check UCP lockout. */
10420 
10421 ucp = (cb.external_options & PCRE2_UCP) != 0;
10422 if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
10423   {
10424   errorcode = ERR75;
10425   goto HAD_EARLY_ERROR;
10426   }
10427 
10428 /* Process the BSR setting. */
10429 
10430 if (bsr == 0) bsr = ccontext->bsr_convention;
10431 
10432 /* Process the newline setting. */
10433 
10434 if (newline == 0) newline = ccontext->newline_convention;
10435 cb.nltype = NLTYPE_FIXED;
10436 switch(newline)
10437   {
10438   case PCRE2_NEWLINE_CR:
10439   cb.nllen = 1;
10440   cb.nl[0] = CHAR_CR;
10441   break;
10442 
10443   case PCRE2_NEWLINE_LF:
10444   cb.nllen = 1;
10445   cb.nl[0] = CHAR_NL;
10446   break;
10447 
10448   case PCRE2_NEWLINE_NUL:
10449   cb.nllen = 1;
10450   cb.nl[0] = CHAR_NUL;
10451   break;
10452 
10453   case PCRE2_NEWLINE_CRLF:
10454   cb.nllen = 2;
10455   cb.nl[0] = CHAR_CR;
10456   cb.nl[1] = CHAR_NL;
10457   break;
10458 
10459   case PCRE2_NEWLINE_ANY:
10460   cb.nltype = NLTYPE_ANY;
10461   break;
10462 
10463   case PCRE2_NEWLINE_ANYCRLF:
10464   cb.nltype = NLTYPE_ANYCRLF;
10465   break;
10466 
10467   default:
10468   errorcode = ERR56;
10469   goto HAD_EARLY_ERROR;
10470   }
10471 
10472 /* Pre-scan the pattern to do two things: (1) Discover the named groups and
10473 their numerical equivalents, so that this information is always available for
10474 the remaining processing. (2) At the same time, parse the pattern and put a
10475 processed version into the parsed_pattern vector. This has escapes interpreted
10476 and comments removed (amongst other things).
10477 
10478 In all but one case, when PCRE2_AUTO_CALLOUT is not set, the number of unsigned
10479 32-bit ints in the parsed pattern is bounded by the length of the pattern plus
10480 one (for the terminator) plus four if PCRE2_EXTRA_WORD or PCRE2_EXTRA_LINE is
10481 set. The exceptional case is when running in 32-bit, non-UTF mode, when literal
10482 characters greater than META_END (0x80000000) have to be coded as two units. In
10483 this case, therefore, we scan the pattern to check for such values. */
10484 
10485 #if PCRE2_CODE_UNIT_WIDTH == 32
10486 if (!utf)
10487   {
10488   PCRE2_SPTR p;
10489   for (p = ptr; p < cb.end_pattern; p++) if (*p >= META_END) big32count++;
10490   }
10491 #endif
10492 
10493 /* Ensure that the parsed pattern buffer is big enough. When PCRE2_AUTO_CALLOUT
10494 is set we have to assume a numerical callout (4 elements) for each character
10495 plus one at the end. This is overkill, but memory is plentiful these days. For
10496 many smaller patterns the vector on the stack (which was set up above) can be
10497 used. */
10498 
10499 parsed_size_needed = patlen - skipatstart + big32count;
10500 
10501 if ((ccontext->extra_options &
10502      (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0)
10503   parsed_size_needed += 4;
10504 
10505 if ((options & PCRE2_AUTO_CALLOUT) != 0)
10506   parsed_size_needed = (parsed_size_needed + 1) * 5;
10507 
10508 if (parsed_size_needed >= PARSED_PATTERN_DEFAULT_SIZE)
10509   {
10510   uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(
10511     (parsed_size_needed + 1) * sizeof(uint32_t), ccontext->memctl.memory_data);
10512   if (heap_parsed_pattern == NULL)
10513     {
10514     *errorptr = ERR21;
10515     goto EXIT;
10516     }
10517   cb.parsed_pattern = heap_parsed_pattern;
10518   }
10519 cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed + 1;
10520 
10521 /* Do the parsing scan. */
10522 
10523 errorcode = parse_regex(ptr, cb.external_options, &has_lookbehind, &cb);
10524 if (errorcode != 0) goto HAD_CB_ERROR;
10525 
10526 /* If there are any lookbehinds, scan the parsed pattern to figure out their
10527 lengths. Workspace is needed to remember whether numbered groups are or are not
10528 of limited length, and if limited, what the minimum and maximum lengths are.
10529 This caching saves re-computing the length of any group that is referenced more
10530 than once, which is particularly relevant when recursion is involved.
10531 Unnumbered groups do not have this exposure because they cannot be referenced.
10532 If there are sufficiently few groups, the default index vector on the stack, as
10533 set up above, can be used. Otherwise we have to get/free some heap memory. The
10534 vector must be initialized to zero. */
10535 
10536 if (has_lookbehind)
10537   {
10538   int loopcount = 0;
10539   if (cb.bracount >= GROUPINFO_DEFAULT_SIZE/2)
10540     {
10541     cb.groupinfo = ccontext->memctl.malloc(
10542       (2 * (cb.bracount + 1))*sizeof(uint32_t), ccontext->memctl.memory_data);
10543     if (cb.groupinfo == NULL)
10544       {
10545       errorcode = ERR21;
10546       cb.erroroffset = 0;
10547       goto HAD_CB_ERROR;
10548       }
10549     }
10550   memset(cb.groupinfo, 0, (2 * cb.bracount + 1) * sizeof(uint32_t));
10551   errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb, &loopcount);
10552   if (errorcode != 0) goto HAD_CB_ERROR;
10553   }
10554 
10555 /* For debugging, there is a function that shows the parsed pattern vector. */
10556 
10557 #ifdef DEBUG_SHOW_PARSED
10558 fprintf(stderr, "+++ Pre-scan complete:\n");
10559 show_parsed(&cb);
10560 #endif
10561 
10562 /* For debugging capturing information this code can be enabled. */
10563 
10564 #ifdef DEBUG_SHOW_CAPTURES
10565   {
10566   named_group *ng = cb.named_groups;
10567   fprintf(stderr, "+++Captures: %d\n", cb.bracount);
10568   for (i = 0; i < cb.names_found; i++, ng++)
10569     {
10570     fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
10571     }
10572   }
10573 #endif
10574 
10575 /* Pretend to compile the pattern while actually just accumulating the amount
10576 of memory required in the 'length' variable. This behaviour is triggered by
10577 passing a non-NULL final argument to compile_regex(). We pass a block of
10578 workspace (cworkspace) for it to compile parts of the pattern into; the
10579 compiled code is discarded when it is no longer needed, so hopefully this
10580 workspace will never overflow, though there is a test for its doing so.
10581 
10582 On error, errorcode will be set non-zero, so we don't need to look at the
10583 result of the function. The initial options have been put into the cb block,
10584 but we still have to pass a separate options variable (the first argument)
10585 because the options may change as the pattern is processed. */
10586 
10587 cb.erroroffset = patlen;   /* For any subsequent errors that do not set it */
10588 pptr = cb.parsed_pattern;
10589 code = cworkspace;
10590 *code = OP_BRA;
10591 
10592 (void)compile_regex(cb.external_options, ccontext->extra_options, &code, &pptr,
10593    &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, NULL,
10594    &cb, &length);
10595 
10596 if (errorcode != 0) goto HAD_CB_ERROR;  /* Offset is in cb.erroroffset */
10597 
10598 /* This should be caught in compile_regex(), but just in case... */
10599 
10600 if (length > MAX_PATTERN_SIZE)
10601   {
10602   errorcode = ERR20;
10603   goto HAD_CB_ERROR;
10604   }
10605 
10606 /* Compute the size of, then, if not too large, get and initialize the data
10607 block for storing the compiled pattern and names table. Integer overflow should
10608 no longer be possible because nowadays we limit the maximum value of
10609 cb.names_found and cb.name_entry_size. */
10610 
10611 re_blocksize = sizeof(pcre2_real_code) +
10612   CU2BYTES(length +
10613   (PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
10614 
10615 if (re_blocksize > ccontext->max_pattern_compiled_length)
10616   {
10617   errorcode = ERR101;
10618   goto HAD_CB_ERROR;
10619   }
10620 
10621 re = (pcre2_real_code *)
10622   ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
10623 if (re == NULL)
10624   {
10625   errorcode = ERR21;
10626   goto HAD_CB_ERROR;
10627   }
10628 
10629 /* The compiler may put padding at the end of the pcre2_real_code structure in
10630 order to round it up to a multiple of 4 or 8 bytes. This means that when a
10631 compiled pattern is copied (for example, when serialized) undefined bytes are
10632 read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
10633 write to the last 8 bytes of the structure before setting the fields. */
10634 
10635 memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
10636 re->memctl = ccontext->memctl;
10637 re->tables = tables;
10638 re->executable_jit = NULL;
10639 memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
10640 re->blocksize = re_blocksize;
10641 re->magic_number = MAGIC_NUMBER;
10642 re->compile_options = options;
10643 re->overall_options = cb.external_options;
10644 re->extra_options = ccontext->extra_options;
10645 re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
10646 re->limit_heap = limit_heap;
10647 re->limit_match = limit_match;
10648 re->limit_depth = limit_depth;
10649 re->first_codeunit = 0;
10650 re->last_codeunit = 0;
10651 re->bsr_convention = bsr;
10652 re->newline_convention = newline;
10653 re->max_lookbehind = 0;
10654 re->minlength = 0;
10655 re->top_bracket = 0;
10656 re->top_backref = 0;
10657 re->name_entry_size = cb.name_entry_size;
10658 re->name_count = cb.names_found;
10659 
10660 /* The basic block is immediately followed by the name table, and the compiled
10661 code follows after that. */
10662 
10663 codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) +
10664   re->name_entry_size * re->name_count;
10665 
10666 /* Update the compile data block for the actual compile. The starting points of
10667 the name/number translation table and of the code are passed around in the
10668 compile data block. The start/end pattern and initial options are already set
10669 from the pre-compile phase, as is the name_entry_size field. */
10670 
10671 cb.parens_depth = 0;
10672 cb.assert_depth = 0;
10673 cb.lastcapture = 0;
10674 cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
10675 cb.start_code = codestart;
10676 cb.req_varyopt = 0;
10677 cb.had_accept = FALSE;
10678 cb.had_pruneorskip = FALSE;
10679 
10680 /* If any named groups were found, create the name/number table from the list
10681 created in the pre-pass. */
10682 
10683 if (cb.names_found > 0)
10684   {
10685   named_group *ng = cb.named_groups;
10686   for (i = 0; i < cb.names_found; i++, ng++)
10687     add_name_to_table(&cb, ng->name, ng->length, ng->number, i);
10688   }
10689 
10690 /* Set up a starting, non-extracting bracket, then compile the expression. On
10691 error, errorcode will be set non-zero, so we don't need to look at the result
10692 of the function here. */
10693 
10694 pptr = cb.parsed_pattern;
10695 code = (PCRE2_UCHAR *)codestart;
10696 *code = OP_BRA;
10697 regexrc = compile_regex(re->overall_options, ccontext->extra_options, &code,
10698   &pptr, &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL,
10699   NULL, &cb, NULL);
10700 if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
10701 re->top_bracket = cb.bracount;
10702 re->top_backref = cb.top_backref;
10703 re->max_lookbehind = cb.max_lookbehind;
10704 
10705 if (cb.had_accept)
10706   {
10707   reqcu = 0;                     /* Must disable after (*ACCEPT) */
10708   reqcuflags = REQ_NONE;
10709   re->flags |= PCRE2_HASACCEPT;  /* Disables minimum length */
10710   }
10711 
10712 /* Fill in the final opcode and check for disastrous overflow. If no overflow,
10713 but the estimated length exceeds the really used length, adjust the value of
10714 re->blocksize, and if valgrind support is configured, mark the extra allocated
10715 memory as unaddressable, so that any out-of-bound reads can be detected. */
10716 
10717 *code++ = OP_END;
10718 usedlength = code - codestart;
10719 if (usedlength > length) errorcode = ERR23; else
10720   {
10721   re->blocksize -= CU2BYTES(length - usedlength);
10722 #ifdef SUPPORT_VALGRIND
10723   VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
10724 #endif
10725   }
10726 
10727 /* Scan the pattern for recursion/subroutine calls and convert the group
10728 numbers into offsets. Maintain a small cache so that repeated groups containing
10729 recursions are efficiently handled. */
10730 
10731 #define RSCAN_CACHE_SIZE 8
10732 
10733 if (errorcode == 0 && cb.had_recurse)
10734   {
10735   PCRE2_UCHAR *rcode;
10736   PCRE2_SPTR rgroup;
10737   unsigned int ccount = 0;
10738   int start = RSCAN_CACHE_SIZE;
10739   recurse_cache rc[RSCAN_CACHE_SIZE];
10740 
10741   for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf);
10742        rcode != NULL;
10743        rcode = (PCRE2_UCHAR *)find_recurse(rcode + 1 + LINK_SIZE, utf))
10744     {
10745     int p, groupnumber;
10746 
10747     groupnumber = (int)GET(rcode, 1);
10748     if (groupnumber == 0) rgroup = codestart; else
10749       {
10750       PCRE2_SPTR search_from = codestart;
10751       rgroup = NULL;
10752       for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
10753         {
10754         if (groupnumber == rc[p].groupnumber)
10755           {
10756           rgroup = rc[p].group;
10757           break;
10758           }
10759 
10760         /* Group n+1 must always start to the right of group n, so we can save
10761         search time below when the new group number is greater than any of the
10762         previously found groups. */
10763 
10764         if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;
10765         }
10766 
10767       if (rgroup == NULL)
10768         {
10769         rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);
10770         if (rgroup == NULL)
10771           {
10772           errorcode = ERR53;
10773           break;
10774           }
10775         if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
10776         rc[start].groupnumber = groupnumber;
10777         rc[start].group = rgroup;
10778         if (ccount < RSCAN_CACHE_SIZE) ccount++;
10779         }
10780       }
10781 
10782     PUT(rcode, 1, rgroup - codestart);
10783     }
10784   }
10785 
10786 /* In rare debugging situations we sometimes need to look at the compiled code
10787 at this stage. */
10788 
10789 #ifdef DEBUG_CALL_PRINTINT
10790 pcre2_printint(re, stderr, TRUE);
10791 fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
10792 #endif
10793 
10794 /* Unless disabled, check whether any single character iterators can be
10795 auto-possessified. The function overwrites the appropriate opcode values, so
10796 the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
10797 used in this code because at least one compiler gives a warning about loss of
10798 "const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the
10799 function call. */
10800 
10801 if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
10802   {
10803   PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
10804   if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80;
10805   }
10806 
10807 /* Failed to compile, or error while post-processing. */
10808 
10809 if (errorcode != 0) goto HAD_CB_ERROR;
10810 
10811 /* Successful compile. If the anchored option was not passed, set it if
10812 we can determine that the pattern is anchored by virtue of ^ characters or \A
10813 or anything else, such as starting with non-atomic .* when DOTALL is set and
10814 there are no occurrences of *PRUNE or *SKIP (though there is an option to
10815 disable this case). */
10816 
10817 if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10818      is_anchored(codestart, 0, &cb, 0, FALSE))
10819   re->overall_options |= PCRE2_ANCHORED;
10820 
10821 /* Set up the first code unit or startline flag, the required code unit, and
10822 then study the pattern. This code need not be obeyed if PCRE2_NO_START_OPTIMIZE
10823 is set, as the data it would create will not be used. Note that a first code
10824 unit (but not the startline flag) is useful for anchored patterns because it
10825 can still give a quick "no match" and also avoid searching for a last code
10826 unit. */
10827 
10828 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
10829   {
10830   int minminlength = 0;  /* For minimal minlength from first/required CU */
10831 
10832   /* If we do not have a first code unit, see if there is one that is asserted
10833   (these are not saved during the compile because they can cause conflicts with
10834   actual literals that follow). */
10835 
10836   if (firstcuflags >= REQ_NONE)
10837     firstcu = find_firstassertedcu(codestart, &firstcuflags, 0);
10838 
10839   /* Save the data for a first code unit. The existence of one means the
10840   minimum length must be at least 1. */
10841 
10842   if (firstcuflags < REQ_NONE)
10843     {
10844     re->first_codeunit = firstcu;
10845     re->flags |= PCRE2_FIRSTSET;
10846     minminlength++;
10847 
10848     /* Handle caseless first code units. */
10849 
10850     if ((firstcuflags & REQ_CASELESS) != 0)
10851       {
10852       if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
10853         {
10854         if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
10855         }
10856 
10857       /* The first code unit is >= 128 in UTF or UCP mode, or >= 255 otherwise.
10858       In 8-bit UTF mode, codepoints in the range 128-255 are introductory code
10859       points and cannot have another case, but if UCP is set they may do. */
10860 
10861 #ifdef SUPPORT_UNICODE
10862 #if PCRE2_CODE_UNIT_WIDTH == 8
10863       else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
10864         re->flags |= PCRE2_FIRSTCASELESS;
10865 #else
10866       else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
10867                UCD_OTHERCASE(firstcu) != firstcu)
10868         re->flags |= PCRE2_FIRSTCASELESS;
10869 #endif
10870 #endif  /* SUPPORT_UNICODE */
10871       }
10872     }
10873 
10874   /* When there is no first code unit, for non-anchored patterns, see if we can
10875   set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
10876   branches start with ^ and also when all branches start with non-atomic .* for
10877   non-DOTALL matches when *PRUNE and *SKIP are not present. (There is an option
10878   that disables this case.) */
10879 
10880   else if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10881            is_startline(codestart, 0, &cb, 0, FALSE))
10882     re->flags |= PCRE2_STARTLINE;
10883 
10884   /* Handle the "required code unit", if one is set. In the UTF case we can
10885   increment the minimum minimum length only if we are sure this really is a
10886   different character and not a non-starting code unit of the first character,
10887   because the minimum length count is in characters, not code units. */
10888 
10889   if (reqcuflags < REQ_NONE)
10890     {
10891 #if PCRE2_CODE_UNIT_WIDTH == 16
10892     if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
10893         firstcuflags >= REQ_NONE ||                 /* First not set */
10894         (firstcu & 0xf800) != 0xd800 ||             /* First not surrogate */
10895         (reqcu & 0xfc00) != 0xdc00)                 /* Req not low surrogate */
10896 #elif PCRE2_CODE_UNIT_WIDTH == 8
10897     if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
10898         firstcuflags >= REQ_NONE ||                 /* First not set */
10899         (firstcu & 0x80) == 0 ||                    /* First is ASCII */
10900         (reqcu & 0x80) == 0)                        /* Req is ASCII */
10901 #endif
10902       {
10903       minminlength++;
10904       }
10905 
10906     /* In the case of an anchored pattern, set up the value only if it follows
10907     a variable length item in the pattern. */
10908 
10909     if ((re->overall_options & PCRE2_ANCHORED) == 0 ||
10910         (reqcuflags & REQ_VARY) != 0)
10911       {
10912       re->last_codeunit = reqcu;
10913       re->flags |= PCRE2_LASTSET;
10914 
10915       /* Handle caseless required code units as for first code units (above). */
10916 
10917       if ((reqcuflags & REQ_CASELESS) != 0)
10918         {
10919         if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
10920           {
10921           if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
10922           }
10923 #ifdef SUPPORT_UNICODE
10924 #if PCRE2_CODE_UNIT_WIDTH == 8
10925       else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
10926         re->flags |= PCRE2_LASTCASELESS;
10927 #else
10928       else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
10929                UCD_OTHERCASE(reqcu) != reqcu)
10930         re->flags |= PCRE2_LASTCASELESS;
10931 #endif
10932 #endif  /* SUPPORT_UNICODE */
10933         }
10934       }
10935     }
10936 
10937   /* Study the compiled pattern to set up information such as a bitmap of
10938   starting code units and a minimum matching length. */
10939 
10940   if (PRIV(study)(re) != 0)
10941     {
10942     errorcode = ERR31;
10943     goto HAD_CB_ERROR;
10944     }
10945 
10946   /* If study() set a bitmap of starting code units, it implies a minimum
10947   length of at least one. */
10948 
10949   if ((re->flags & PCRE2_FIRSTMAPSET) != 0 && minminlength == 0)
10950     minminlength = 1;
10951 
10952   /* If the minimum length set (or not set) by study() is less than the minimum
10953   implied by required code units, override it. */
10954 
10955   if (re->minlength < minminlength) re->minlength = minminlength;
10956   }   /* End of start-of-match optimizations. */
10957 
10958 /* Control ends up here in all cases. When running under valgrind, make a
10959 pattern's terminating zero defined again. If memory was obtained for the parsed
10960 version of the pattern, free it before returning. Also free the list of named
10961 groups if a larger one had to be obtained, and likewise the group information
10962 vector. */
10963 
10964 EXIT:
10965 #ifdef SUPPORT_VALGRIND
10966 if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1));
10967 #endif
10968 if (cb.parsed_pattern != stack_parsed_pattern)
10969   ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);
10970 if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
10971   ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
10972 if (cb.groupinfo != stack_groupinfo)
10973   ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
10974 return re;    /* Will be NULL after an error */
10975 
10976 /* Errors discovered in parse_regex() set the offset value in the compile
10977 block. Errors discovered before it is called must compute it from the ptr
10978 value. After parse_regex() is called, the offset in the compile block is set to
10979 the end of the pattern, but certain errors in compile_regex() may reset it if
10980 an offset is available in the parsed pattern. */
10981 
10982 HAD_CB_ERROR:
10983 ptr = pattern + cb.erroroffset;
10984 
10985 HAD_EARLY_ERROR:
10986 *erroroffset = ptr - pattern;
10987 
10988 HAD_ERROR:
10989 *errorptr = errorcode;
10990 pcre2_code_free(re);
10991 re = NULL;
10992 goto EXIT;
10993 }
10994 
10995 /* These #undefs stop the macros leaking into later source files in a CMake unity build. */
10996 
10997 #undef NLBLOCK /* Block containing newline information */
10998 #undef PSSTART /* Field containing processed string start */
10999 #undef PSEND   /* Field containing processed string end */
11000 
11001 /* End of pcre2_compile.c */
11002