xref: /php-src/ext/pcre/pcre2lib/pcre2_compile.c (revision ae5beff6)
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10           New API code Copyright (c) 2016-2023 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45 
46 #define NLBLOCK cb             /* Name of the block containing newline information (set before pcre2_internal.h is included) */
47 #define PSSTART start_pattern  /* Name of the field containing processed string start */
48 #define PSEND   end_pattern    /* Name of the field containing processed string end */
49 
50 #include "pcre2_internal.h"
51 
52 /* In rare error cases debugging might require calling pcre2_printint(). */
53 
54 #if 0
55 #ifdef EBCDIC
56 #define PRINTABLE(c) ((c) >= 64 && (c) < 255)
57 #else
58 #define PRINTABLE(c) ((c) >= 32 && (c) < 127)
59 #endif
60 #include "pcre2_printint.c"
61 #define DEBUG_CALL_PRINTINT
62 #endif
63 
64 /* Other debugging code can be enabled by these defines. */
65 
66 /* #define DEBUG_SHOW_CAPTURES */
67 /* #define DEBUG_SHOW_PARSED */
68 
69 /* There are a few things that vary with different code unit sizes. Handle them
70 by defining macros in order to minimize #if usage. */
71 
72 #if PCRE2_CODE_UNIT_WIDTH == 8
73 #define STRING_UTFn_RIGHTPAR     STRING_UTF8_RIGHTPAR, 5   /* expands to two items: a string and a count (presumably its length - confirm) */
74 #define XDIGIT(c)                xdigitab[c]   /* direct table lookup, no range check */
75 
76 #else  /* Either 16-bit or 32-bit */
77 #define XDIGIT(c)                (MAX_255(c)? xdigitab[c] : 0xff)   /* when MAX_255(c) is false: 0xff, the non-hex-digit marker */
78 
79 #if PCRE2_CODE_UNIT_WIDTH == 16
80 #define STRING_UTFn_RIGHTPAR     STRING_UTF16_RIGHTPAR, 6
81 
82 #else  /* 32-bit */
83 #define STRING_UTFn_RIGHTPAR     STRING_UTF32_RIGHTPAR, 6
84 #endif  /* PCRE2_CODE_UNIT_WIDTH == 16 */
85 #endif  /* PCRE2_CODE_UNIT_WIDTH == 8 */
86 
87 /* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which
88 consists of uint32_t elements. Assume that if uint32_t can't hold it, two of
89 them will be able to (i.e. assume a 64-bit world). */
90 
91 #if PCRE2_SIZE_MAX <= UINT32_MAX
92 #define PUTOFFSET(s,p) *p++ = s               /* store one element, then advance p */
93 #define GETOFFSET(s,p) s = *p++               /* load from p, then advance p */
94 #define GETPLUSOFFSET(s,p) s = *(++p)         /* advance p first; p ends on the value */
95 #define READPLUSOFFSET(s,p) s = p[1]          /* load the value after p; p unchanged */
96 #define SKIPOFFSET(p) p++                     /* step over one stored value */
97 #define SIZEOFFSET 1                          /* elements per stored value */
98 #else
99 #define PUTOFFSET(s,p) \
100   { *p++ = (uint32_t)(s >> 32); *p++ = (uint32_t)(s & 0xffffffff); }  /* high 32 bits first */
101 #define GETOFFSET(s,p) \
102   { s = ((PCRE2_SIZE)p[0] << 32) | (PCRE2_SIZE)p[1]; p += 2; }  /* p ends past the value */
103 #define GETPLUSOFFSET(s,p) \
104   { s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; p += 2; }  /* p ends on the value's last element */
105 #define READPLUSOFFSET(s,p) \
106   { s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; }  /* p unchanged */
107 #define SKIPOFFSET(p) p += 2   /* step over one stored value (two elements) */
108 #define SIZEOFFSET 2           /* elements per stored value */
109 #endif  /* PCRE2_SIZE_MAX <= UINT32_MAX */
110 
111 /* Macros for manipulating elements of the parsed pattern vector. */
112 
113 #define META_CODE(x)   ((x) & 0xffff0000u)   /* item type: top 16 bits; argument parenthesized so any expression is safe */
114 #define META_DATA(x)   ((x) & 0x0000ffffu)   /* item data: low 16 bits */
115 #define META_DIFF(x,y) (((x)-(y))>>16)       /* number of code steps from y to x (codes differ by 0x10000) */
116 
117 /* Forward declarations (prototypes only) to allow mutual recursion */
118 
119 #ifdef SUPPORT_UNICODE
120 static unsigned int
121   add_list_to_class_internal(uint8_t *, PCRE2_UCHAR **, uint32_t, uint32_t,
122     compile_block *, const uint32_t *, unsigned int);
123 #endif  /* SUPPORT_UNICODE */
124 
125 static int
126   compile_regex(uint32_t, uint32_t, PCRE2_UCHAR **, uint32_t **, int *,
127     uint32_t, uint32_t *, uint32_t *, uint32_t *, uint32_t *, branch_chain *,
128     open_capitem *, compile_block *, PCRE2_SIZE *);
129 
130 static int
131   get_branchlength(uint32_t **, int *, int *, int *, parsed_recurse_check *,
132     compile_block *);
133 
134 static BOOL
135   set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,
136     compile_block *);
137 
138 static int
139   check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *,
140     compile_block *, int *);
141 
142 
143 /*************************************************
144 *      Code parameters and static tables         *
145 *************************************************/
146 
147 #define MAX_GROUP_NUMBER   65535u                /* 0xffff: largest value that fits in 16 bits */
148 #define MAX_REPEAT_COUNT   65535u                /* 0xffff: largest explicit repeat count */
149 #define REPEAT_UNLIMITED   (MAX_REPEAT_COUNT+1)  /* 65536: one above any valid count */
150 
151 /* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in
152 different ways in the different pattern scans. The parsing and group-
153 identifying pre-scan uses it to handle nesting, and needs it to be 16-bit
154 aligned for this. Having defined the size in code units, we set up
155 C16_WORK_SIZE as the number of elements in the 16-bit vector.
156 
157 During the first compiling phase, when determining how much memory is required,
158 the regex is partly compiled into this space, but the compiled parts are
159 discarded as soon as they can be, so that hopefully there will never be an
160 overrun. The code does, however, check for an overrun, which can occur for
161 pathological patterns. The size of the workspace depends on LINK_SIZE because
162 the length of compiled items varies with this.
163 
164 In the real compile phase, this workspace is not currently used. */
165 
166 #define COMPILE_WORK_SIZE (3000*LINK_SIZE)   /* Size in code units; scales with LINK_SIZE */
167 
168 #define C16_WORK_SIZE \
169   ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t))  /* The same space counted in uint16_t elements */
170 
171 /* A uint32_t vector is used for caching information about the size of
172 capturing groups, to improve performance. A default is created on the stack of
173 this size. */
174 
175 #define GROUPINFO_DEFAULT_SIZE 256   /* presumably a count of uint32_t elements - confirm at the point of use */
176 
177 /* The overrun tests check for a slightly smaller size so that they detect the
178 overrun before it actually does run off the end of the data block. */
179 
180 #define WORK_SIZE_SAFETY_MARGIN (100)   /* units are not stated here; presumably code units - confirm */
181 
182 /* This value determines the size of the initial vector that is used for
183 remembering named groups during the pre-compile. It is allocated on the stack,
184 but if it is too small, it is expanded, in a similar way to the workspace. The
185 value is the number of slots in the list. */
186 
187 #define NAMED_GROUP_LIST_SIZE  20   /* Initial number of slots */
188 
189 /* The pre-compiling pass over the pattern creates a parsed pattern in a vector
190 of uint32_t. For short patterns this lives on the stack, with this size. Heap
191 memory is used for longer patterns. */
192 
193 #define PARSED_PATTERN_DEFAULT_SIZE 1024   /* presumably a count of uint32_t elements - confirm */
194 
195 /* Maximum length value to check against when making sure that the variable
196 that holds the compiled pattern length does not overflow. We make it a bit less
197 than INT_MAX to allow for adding in group terminating code units, so that we
198 don't have to check them every time. */
199 
200 #define OFLOW_MAX (INT_MAX - 20)   /* 20 units of headroom below INT_MAX */
201 
202 /* Code values for parsed patterns, which are stored in a vector of 32-bit
203 unsigned ints. Values less than META_END are literal data values. The coding
204 for identifying the item is in the top 16-bits, leaving 16 bits for the
205 additional data that some of them need. The META_CODE, META_DATA, and META_DIFF
206 macros are used to manipulate parsed pattern elements.
207 
208 NOTE: When these definitions are changed, the table of extra lengths for each
209 code (meta_extra_lengths, just below) must be updated to remain in step. */
210 
211 #define META_END              0x80000000u  /* End of pattern; lowest meta value */
212 
213 #define META_ALT              0x80010000u  /* alternation */
214 #define META_ATOMIC           0x80020000u  /* atomic group */
215 #define META_BACKREF          0x80030000u  /* Back ref */
216 #define META_BACKREF_BYNAME   0x80040000u  /* \k'name' */
217 #define META_BIGVALUE         0x80050000u  /* Next is a literal > META_END */
218 #define META_CALLOUT_NUMBER   0x80060000u  /* (?C with numerical argument */
219 #define META_CALLOUT_STRING   0x80070000u  /* (?C with string argument */
220 #define META_CAPTURE          0x80080000u  /* Capturing parenthesis */
221 #define META_CIRCUMFLEX       0x80090000u  /* ^ metacharacter */
222 #define META_CLASS            0x800a0000u  /* start non-empty class */
223 #define META_CLASS_EMPTY      0x800b0000u  /* empty class */
224 #define META_CLASS_EMPTY_NOT  0x800c0000u  /* negative empty class */
225 #define META_CLASS_END        0x800d0000u  /* end of non-empty class */
226 #define META_CLASS_NOT        0x800e0000u  /* start non-empty negative class */
227 #define META_COND_ASSERT      0x800f0000u  /* (?(?assertion)... */
228 #define META_COND_DEFINE      0x80100000u  /* (?(DEFINE)... */
229 #define META_COND_NAME        0x80110000u  /* (?(<name>)... */
230 #define META_COND_NUMBER      0x80120000u  /* (?(digits)... */
231 #define META_COND_RNAME       0x80130000u  /* (?(R&name)... */
232 #define META_COND_RNUMBER     0x80140000u  /* (?(Rdigits)... */
233 #define META_COND_VERSION     0x80150000u  /* (?(VERSION<op>x.y)... */
234 #define META_DOLLAR           0x80160000u  /* $ metacharacter */
235 #define META_DOT              0x80170000u  /* . metacharacter */
236 #define META_ESCAPE           0x80180000u  /* \d and friends */
237 #define META_KET              0x80190000u  /* closing parenthesis */
238 #define META_NOCAPTURE        0x801a0000u  /* no capture parens */
239 #define META_OPTIONS          0x801b0000u  /* (?i) and friends */
240 #define META_POSIX            0x801c0000u  /* POSIX class item */
241 #define META_POSIX_NEG        0x801d0000u  /* negative POSIX class item */
242 #define META_RANGE_ESCAPED    0x801e0000u  /* range with at least one escape */
243 #define META_RANGE_LITERAL    0x801f0000u  /* range defined literally */
244 #define META_RECURSE          0x80200000u  /* Recursion */
245 #define META_RECURSE_BYNAME   0x80210000u  /* (?&name) */
246 #define META_SCRIPT_RUN       0x80220000u  /* (*script_run:...) */
247 
248 /* These must be kept together to make it easy to check that an assertion
249 is present where expected in a conditional group. */
250 
251 #define META_LOOKAHEAD        0x80230000u  /* (?= */
252 #define META_LOOKAHEADNOT     0x80240000u  /* (?! */
253 #define META_LOOKBEHIND       0x80250000u  /* (?<= */
254 #define META_LOOKBEHINDNOT    0x80260000u  /* (?<! */
255 
256 /* These cannot be conditions */
257 
258 #define META_LOOKAHEAD_NA     0x80270000u  /* (*napla: */
259 #define META_LOOKBEHIND_NA    0x80280000u  /* (*naplb: */
260 
261 /* These must be kept in this order, with consecutive values, and the _ARG
262 versions of COMMIT, PRUNE, SKIP, and THEN immediately after their non-argument
263 versions. */
264 
265 #define META_MARK             0x80290000u  /* (*MARK) */
266 #define META_ACCEPT           0x802a0000u  /* (*ACCEPT) */
267 #define META_FAIL             0x802b0000u  /* (*FAIL) */
268 #define META_COMMIT           0x802c0000u  /* These               */
269 #define META_COMMIT_ARG       0x802d0000u  /*   pairs             */
270 #define META_PRUNE            0x802e0000u  /*     must            */
271 #define META_PRUNE_ARG        0x802f0000u  /*       be            */
272 #define META_SKIP             0x80300000u  /*         kept        */
273 #define META_SKIP_ARG         0x80310000u  /*           in        */
274 #define META_THEN             0x80320000u  /*             this    */
275 #define META_THEN_ARG         0x80330000u  /*               order */
276 
277 /* These must be kept in groups of adjacent 3 values, and all together. */
278 
279 #define META_ASTERISK         0x80340000u  /* *  */
280 #define META_ASTERISK_PLUS    0x80350000u  /* *+ */
281 #define META_ASTERISK_QUERY   0x80360000u  /* *? */
282 #define META_PLUS             0x80370000u  /* +  */
283 #define META_PLUS_PLUS        0x80380000u  /* ++ */
284 #define META_PLUS_QUERY       0x80390000u  /* +? */
285 #define META_QUERY            0x803a0000u  /* ?  */
286 #define META_QUERY_PLUS       0x803b0000u  /* ?+ */
287 #define META_QUERY_QUERY      0x803c0000u  /* ?? */
288 #define META_MINMAX           0x803d0000u  /* {n,m}  repeat */
289 #define META_MINMAX_PLUS      0x803e0000u  /* {n,m}+ repeat */
290 #define META_MINMAX_QUERY     0x803f0000u  /* {n,m}? repeat */
291 
292 #define META_FIRST_QUANTIFIER META_ASTERISK      /* lowest quantifier code */
293 #define META_LAST_QUANTIFIER  META_MINMAX_QUERY  /* highest quantifier code */
294 
295 /* This is a special "meta code" that is used only to distinguish (*asr: from
296 (*sr: in the table of alphabetic assertions. It is never stored in the parsed
297 pattern because (*asr: is turned into (*sr:(*atomic: at that stage. There is
298 therefore no need for it to have a length entry, so use a high value. */
299 
300 #define META_ATOMIC_SCRIPT_RUN 0x8fff0000u
301 
302 /* Table of extra lengths for each of the meta codes: one entry per code, in
303 definition order from META_END to META_MINMAX_QUERY. Must be kept in step with
304 the definitions above. Some values are only a base length; a variable amount is added. */
305 
306 static unsigned char meta_extra_lengths[] = {
307   0,             /* META_END */
308   0,             /* META_ALT */
309   0,             /* META_ATOMIC */
310   0,             /* META_BACKREF - more if group is >= 10 */
311   1+SIZEOFFSET,  /* META_BACKREF_BYNAME */
312   1,             /* META_BIGVALUE */
313   3,             /* META_CALLOUT_NUMBER */
314   3+SIZEOFFSET,  /* META_CALLOUT_STRING */
315   0,             /* META_CAPTURE */
316   0,             /* META_CIRCUMFLEX */
317   0,             /* META_CLASS */
318   0,             /* META_CLASS_EMPTY */
319   0,             /* META_CLASS_EMPTY_NOT */
320   0,             /* META_CLASS_END */
321   0,             /* META_CLASS_NOT */
322   0,             /* META_COND_ASSERT */
323   SIZEOFFSET,    /* META_COND_DEFINE */
324   1+SIZEOFFSET,  /* META_COND_NAME */
325   1+SIZEOFFSET,  /* META_COND_NUMBER */
326   1+SIZEOFFSET,  /* META_COND_RNAME */
327   1+SIZEOFFSET,  /* META_COND_RNUMBER */
328   3,             /* META_COND_VERSION */
329   0,             /* META_DOLLAR */
330   0,             /* META_DOT */
331   0,             /* META_ESCAPE - more for ESC_P, ESC_p, ESC_g, ESC_k */
332   0,             /* META_KET */
333   0,             /* META_NOCAPTURE */
334   1,             /* META_OPTIONS */
335   1,             /* META_POSIX */
336   1,             /* META_POSIX_NEG */
337   0,             /* META_RANGE_ESCAPED */
338   0,             /* META_RANGE_LITERAL */
339   SIZEOFFSET,    /* META_RECURSE */
340   1+SIZEOFFSET,  /* META_RECURSE_BYNAME */
341   0,             /* META_SCRIPT_RUN */
342   0,             /* META_LOOKAHEAD */
343   0,             /* META_LOOKAHEADNOT */
344   SIZEOFFSET,    /* META_LOOKBEHIND */
345   SIZEOFFSET,    /* META_LOOKBEHINDNOT */
346   0,             /* META_LOOKAHEAD_NA */
347   SIZEOFFSET,    /* META_LOOKBEHIND_NA */
348   1,             /* META_MARK - plus the string length */
349   0,             /* META_ACCEPT */
350   0,             /* META_FAIL */
351   0,             /* META_COMMIT */
352   1,             /* META_COMMIT_ARG - plus the string length */
353   0,             /* META_PRUNE */
354   1,             /* META_PRUNE_ARG - plus the string length */
355   0,             /* META_SKIP */
356   1,             /* META_SKIP_ARG - plus the string length */
357   0,             /* META_THEN */
358   1,             /* META_THEN_ARG - plus the string length */
359   0,             /* META_ASTERISK */
360   0,             /* META_ASTERISK_PLUS */
361   0,             /* META_ASTERISK_QUERY */
362   0,             /* META_PLUS */
363   0,             /* META_PLUS_PLUS */
364   0,             /* META_PLUS_QUERY */
365   0,             /* META_QUERY */
366   0,             /* META_QUERY_PLUS */
367   0,             /* META_QUERY_QUERY */
368   2,             /* META_MINMAX */
369   2,             /* META_MINMAX_PLUS */
370   2              /* META_MINMAX_QUERY */
371 };
372 
373 /* Types for skipping parts of a parsed pattern. */
374 
375 enum { PSKIP_ALT, PSKIP_CLASS, PSKIP_KET };  /* anonymous enum: values 0, 1, 2 in this order */
376 
377 /* Macro for setting individual bits in class bitmaps. It took some
378 experimenting to figure out how to stop gcc 5.3.0 from warning with
379 -Wconversion. This version gets a warning:
380 
381   #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1u << ((b)&7))
382 
383 Let's hope the apparently less efficient version isn't actually so bad if the
384 compiler is clever with identical subexpressions. */
385 
386 #define SETBIT(a,b) a[(b)/8] = (uint8_t)(a[(b)/8] | (1u << ((b)&7)))  /* sets bit (b)&7 of byte (b)/8; a and b are evaluated more than once */
387 
388 /* Values and flags for the unsigned xxcuflags variables that accompany xxcu
389 variables, which are concerned with first and required code units. A value
390 greater than or equal to REQ_NONE means "no code unit set"; otherwise the
391 matching xxcu variable is set, and the low valued bits are relevant. */
392 
393 #define REQ_UNSET     0xffffffffu  /* Not yet found anything */
394 #define REQ_NONE      0xfffffffeu  /* Found not fixed character; lowest "no code unit set" value */
395 #define REQ_CASELESS  0x00000001u  /* Bit 0: code unit in xxcu is caseless */
396 #define REQ_VARY      0x00000002u  /* Bit 1: code unit is followed by non-literal */
397 
398 /* These flags are used in the groupinfo vector. */
399 
400 #define GI_SET_FIXED_LENGTH    0x80000000u  /* bit 31 */
401 #define GI_NOT_FIXED_LENGTH    0x40000000u  /* bit 30 */
402 #define GI_FIXED_LENGTH_MASK   0x0000ffffu  /* low 16 bits */
403 
404 /* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
405 and is fast (a good compiler can turn it into a subtraction and unsigned
406 comparison). */
407 
408 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)  /* x is evaluated twice: pass no side effects */
409 
410 /* Table to identify hex digits. The tables in chartables are dependent on the
411 locale, and may mark arbitrary characters as digits. We want to recognize only
412 0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
413 costs 256 bytes, but it is a lot faster than doing character value tests (at
414 least in some simple cases I timed), and in some applications one wants PCRE2
415 to compile efficiently as well as match efficiently. The value in the table is
416 the binary hex digit value, or 0xff for non-hex digits. */
417 
418 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
419 UTF-8 mode. */
420 
421 #ifndef EBCDIC
422 static const uint8_t xdigitab[] =    /* 256 entries, indexed by character code */
423   {
424   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7 */
425   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15 */
426   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 */
427   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31 */
428   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - '  */
429   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ( - /  */
430   0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  */
431   0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /*  8 - ?  */
432   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  @ - G  */
433   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H - O  */
434   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  P - W  */
435   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  X - _  */
436   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  ` - g  */
437   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h - o  */
438   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  p - w  */
439   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  x -127 */
440   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */
441   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */
442   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */
443   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */
444   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */
445   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */
446   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */
447   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
448   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */
449   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 200-207 */
450   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */
451   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */
452   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */
453   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */
454   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */
455   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */
456 
457 #else
458 
459 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
460 
461 static const uint8_t xdigitab[] =    /* 256 entries, indexed by EBCDIC character code */
462   {
463   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7  0 */
464   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15    */
465   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 10 */
466   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31    */
467   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  32- 39 20 */
468   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  40- 47    */
469   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  48- 55 30 */
470   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  56- 63    */
471   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - 71 40 */
472   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  72- |     */
473   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  & - 87 50 */
474   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  88- 95    */
475   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  - -103 60 */
476   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ?     */
477   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */
478   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- "     */
479   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g  80 */
480   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h -143    */
481   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p  90 */
482   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  q -159    */
483   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x  A0 */
484   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  y -175    */
485   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ^ -183 B0 */
486   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191    */
487   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  { - G  C0 */
488   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H -207    */
489   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  } - P  D0 */
490   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Q -223    */
491   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  \ - X  E0 */
492   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Y -239    */
493   0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  F0 */
494   0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/*  8 -255    */
495 #endif  /* EBCDIC */
496 
497 
498 /* Table for handling alphanumeric escaped characters. Positive returns are
499 simple data values; negative values are for special things like \d and so on.
500 Zero means further processing is needed (for things like \x), or the escape is
501 invalid. */
502 
503 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
504 in UTF-8 mode. It runs from '0' to 'z'. */
505 
506 #ifndef EBCDIC
507 #define ESCAPES_FIRST       CHAR_0    /* character for escapes[0] */
508 #define ESCAPES_LAST        CHAR_z    /* character for the last escapes[] entry */
509 #define UPPER_CASE(c)       (c-32)    /* ASCII: upper case is 32 below lower case; c is not parenthesized */
510 
511 static const short int escapes[] = {     /* 75 entries, two per row; row comments name the characters covered */
512      0,                       0,                           /* 0 1 */
513      0,                       0,                           /* 2 3 */
514      0,                       0,                           /* 4 5 */
515      0,                       0,                           /* 6 7 */
516      0,                       0,                           /* 8 9 */
517      CHAR_COLON,              CHAR_SEMICOLON,              /* : ; */
518      CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,            /* < = */
519      CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,          /* > ? */
520      CHAR_COMMERCIAL_AT,      -ESC_A,                      /* @ A */
521      -ESC_B,                  -ESC_C,                      /* B C */
522      -ESC_D,                  -ESC_E,                      /* D E */
523      0,                       -ESC_G,                      /* F G */
524      -ESC_H,                  0,                           /* H I */
525      0,                       -ESC_K,                      /* J K */
526      0,                       0,                           /* L M */
527      -ESC_N,                  0,                           /* N O */
528      -ESC_P,                  -ESC_Q,                      /* P Q */
529      -ESC_R,                  -ESC_S,                      /* R S */
530      0,                       0,                           /* T U */
531      -ESC_V,                  -ESC_W,                      /* V W */
532      -ESC_X,                  0,                           /* X Y */
533      -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,    /* Z [ */
534      CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,   /* backslash ] */
535      CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,             /* ^ _ */
536      CHAR_GRAVE_ACCENT,       CHAR_BEL,                    /* ` a */
537      -ESC_b,                  0,                           /* b c */
538      -ESC_d,                  CHAR_ESC,                    /* d e */
539      CHAR_FF,                 0,                           /* f g */
540      -ESC_h,                  0,                           /* h i */
541      0,                       -ESC_k,                      /* j k */
542      0,                       0,                           /* l m */
543      CHAR_LF,                 0,                           /* n o */
544      -ESC_p,                  0,                           /* p q */
545      CHAR_CR,                 -ESC_s,                      /* r s */
546      CHAR_HT,                 0,                           /* t u */
547      -ESC_v,                  -ESC_w,                      /* v w */
548      0,                       0,                           /* x y */
549      -ESC_z                                                /* z */
550 };
551 
552 #else
553 
554 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
555 It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code
556 is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a
557 because it is defined as 'a', which of course picks up the ASCII value. */
558 
559 #if 'a' == 0x81                    /* Check for a real EBCDIC environment */
560 #define ESCAPES_FIRST       CHAR_a
561 #define ESCAPES_LAST        CHAR_9
562 #define UPPER_CASE(c)       (c+64)   /* EBCDIC: upper case is 64 above lower case (0x81 -> 0xC1) */
563 #else                              /* Testing in an ASCII environment */
564 #define ESCAPES_FIRST  ((unsigned char)'\x81')   /* EBCDIC 'a' */
565 #define ESCAPES_LAST   ((unsigned char)'\xf9')   /* EBCDIC '9' */
566 #define UPPER_CASE(c)  (c-32)   /* ASCII offset: here characters still carry their ASCII values */
567 #endif  /* 'a' == 0x81 */
568 
569 static const short int escapes[] = {  /* starts at code 0x81 (ESCAPES_FIRST), so the first row has only 7 entries */
570 /*  80 */         CHAR_BEL, -ESC_b,       0, -ESC_d, CHAR_ESC, CHAR_FF,      0,
571 /*  88 */ -ESC_h,        0,      0,     '{',      0,        0,       0,      0,
572 /*  90 */      0,        0, -ESC_k,       0,      0,  CHAR_LF,       0, -ESC_p,
573 /*  98 */      0,  CHAR_CR,      0,     '}',      0,        0,       0,      0,
574 /*  A0 */      0,      '~', -ESC_s, CHAR_HT,      0,   -ESC_v,  -ESC_w,      0,
575 /*  A8 */      0,   -ESC_z,      0,       0,      0,      '[',       0,      0,
576 /*  B0 */      0,        0,      0,       0,      0,        0,       0,      0,
577 /*  B8 */      0,        0,      0,       0,      0,      ']',     '=',    '-',
578 /*  C0 */    '{',   -ESC_A, -ESC_B,  -ESC_C, -ESC_D,   -ESC_E,       0, -ESC_G,
579 /*  C8 */ -ESC_H,        0,      0,       0,      0,        0,       0,      0,
580 /*  D0 */    '}',        0, -ESC_K,       0,      0,   -ESC_N,       0, -ESC_P,
581 /*  D8 */ -ESC_Q,   -ESC_R,      0,       0,      0,        0,       0,      0,
582 /*  E0 */   '\\',        0, -ESC_S,       0,      0,   -ESC_V,  -ESC_W, -ESC_X,
583 /*  E8 */      0,   -ESC_Z,      0,       0,      0,        0,       0,      0,
584 /*  F0 */      0,        0,      0,       0,      0,        0,       0,      0,
585 /*  F8 */      0,        0   /* ends at code 0xF9 (ESCAPES_LAST) */
586 };
587 
588 /* We also need a table of characters that may follow \c in an EBCDIC
589 environment for characters 0-31. */
590 
591 static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";  /* 32 characters, one for each of the codes 0-31 */
592 
593 #endif   /* EBCDIC */
594 
595 
596 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
597 searched linearly. Put all the names into a single string, in order to reduce
598 the number of relocations when a shared library is dynamically linked. The
599 string is built from string macros so that it works in UTF-8 mode on EBCDIC
600 platforms. */
601 
602 typedef struct verbitem {
603   unsigned int len;          /* Length of verb name */
604   uint32_t meta;             /* Base META_ code */
605   int has_arg;               /* Argument requirement: > 0 required, 0 or < 0 optional (see the notes in verbs[]) */
606 } verbitem;
607 
608 static const char verbnames[] =   /* Names in the same order as the verbs[] entries */
609   "\0"                       /* Empty name is a shorthand for MARK */
610   STRING_MARK0
611   STRING_ACCEPT0
612   STRING_F0
613   STRING_FAIL0
614   STRING_COMMIT0
615   STRING_PRUNE0
616   STRING_SKIP0
617   STRING_THEN;               /* Last name: no "0" suffix (presumably the literal's own terminator ends it) */
618 
619 static const verbitem verbs[] = {
620   { 0, META_MARK,   +1 },  /* (empty name): > 0 => must have an argument */
621   { 4, META_MARK,   +1 },  /* MARK */
622   { 6, META_ACCEPT, -1 },  /* ACCEPT: < 0 => Optional argument, convert to pre-MARK */
623   { 1, META_FAIL,   -1 },  /* F */
624   { 4, META_FAIL,   -1 },  /* FAIL */
625   { 6, META_COMMIT,  0 },  /* COMMIT */
626   { 5, META_PRUNE,   0 },  /* PRUNE: Optional argument; bump META code if found */
627   { 4, META_SKIP,    0 },  /* SKIP */
628   { 4, META_THEN,    0 }   /* THEN */
629 };
630 
631 static const int verbcount = sizeof(verbs)/sizeof(verbitem);  /* Number of entries in verbs[] */
632 
633 /* Verb opcodes, indexed by their META code offset from META_MARK. */
634 
635 static const uint32_t verbops[] = {   /* 11 entries, matching META_MARK through META_THEN_ARG */
636   OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,
637   OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };
638 
639 /* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. */
640 
641 typedef struct alasitem {
642   unsigned int len;          /* Length of name */
643   uint32_t meta;             /* Base META_ code (several names may share one code) */
644 } alasitem;
645 
646 static const char alasnames[] =   /* Names in the same order as the alasmeta[] entries */
647   STRING_pla0
648   STRING_plb0
649   STRING_napla0
650   STRING_naplb0
651   STRING_nla0
652   STRING_nlb0
653   STRING_positive_lookahead0
654   STRING_positive_lookbehind0
655   STRING_non_atomic_positive_lookahead0
656   STRING_non_atomic_positive_lookbehind0
657   STRING_negative_lookahead0
658   STRING_negative_lookbehind0
659   STRING_atomic0
660   STRING_sr0
661   STRING_asr0
662   STRING_script_run0
663   STRING_atomic_script_run;   /* Last name: no "0" suffix */
664 
665 static const alasitem alasmeta[] = {
666   {  3, META_LOOKAHEAD         }, /* pla */
667   {  3, META_LOOKBEHIND        }, /* plb */
668   {  5, META_LOOKAHEAD_NA      }, /* napla */
669   {  5, META_LOOKBEHIND_NA     }, /* naplb */
670   {  3, META_LOOKAHEADNOT      }, /* nla */
671   {  3, META_LOOKBEHINDNOT     }, /* nlb */
672   { 18, META_LOOKAHEAD         }, /* positive_lookahead */
673   { 19, META_LOOKBEHIND        }, /* positive_lookbehind */
674   { 29, META_LOOKAHEAD_NA      }, /* non_atomic_positive_lookahead */
675   { 30, META_LOOKBEHIND_NA     }, /* non_atomic_positive_lookbehind */
676   { 18, META_LOOKAHEADNOT      }, /* negative_lookahead */
677   { 19, META_LOOKBEHINDNOT     }, /* negative_lookbehind */
678   {  6, META_ATOMIC            }, /* atomic */
679   {  2, META_SCRIPT_RUN        }, /* sr = script run */
680   {  3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */
681   { 10, META_SCRIPT_RUN        }, /* script run */
682   { 17, META_ATOMIC_SCRIPT_RUN }  /* atomic script run */
683 };
684 
685 static const int alascount = sizeof(alasmeta)/sizeof(alasitem);  /* Number of entries in alasmeta[] */
686 
687 /* Offsets from OP_STAR for case-independent and negative repeat opcodes. */
688 
689 static uint32_t chartypeoffset[] = {
690   OP_STAR - OP_STAR,    OP_STARI - OP_STAR,       /* [0] = 0 (OP_STAR itself), [1] = offset of OP_STARI */
691   OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };  /* [2] = offset of OP_NOTSTAR, [3] = offset of OP_NOTSTARI */
692 
693 /* Tables of names of POSIX character classes and their lengths. The names are
694 now all in a single string, to reduce the number of relocations when a shared
695 library is dynamically loaded. The list of lengths is terminated by a zero
696 length entry. The first three must be alpha, lower, upper, as this is assumed
697 for handling case independence. The indices for several classes are needed, so
698 identify them. */
699 
/* The class names, in the order that fixes each class's index (alpha is 0,
xdigit is 13). */

static const char posix_names[] =
  STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
  STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
  STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
  STRING_word0  STRING_xdigit;

/* Length of each name above, in the same order, followed by the terminating
zero entry. */

static const uint8_t posix_name_lengths[] = {
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };

/* Indices of individual classes; each value is the zero-based position of the
class in the two tables above. */

#define PC_DIGIT   7
#define PC_GRAPH   8
#define PC_PRINT   9
#define PC_PUNCT  10
#define PC_XDIGIT 13
714 
715 /* Table of class bit maps for each POSIX class. Each class is formed from a
716 base map, with an optional addition or removal of another map. Then, for some
717 classes, there is some additional tweaking: for [:blank:] the vertical space
718 characters are removed, and for [:alpha:] and [:alnum:] the underscore
719 character is removed. The triples in the table consist of the base map offset,
720 second map offset or -1 if no second map, and a non-negative value for map
721 addition or a negative value for map subtraction (if there are two maps). The
722 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
723 remove vertical space characters, 2 => remove underscore. */
724 
/* Three values per class, in POSIX class order: base map, second map (or -1),
and the add/subtract-plus-tweak code described above. */

static const int posix_class_maps[] = {
  cbit_word,   cbit_digit, -2,            /* alpha - word minus digit, without underscore */
  cbit_lower,  -1,          0,            /* lower */
  cbit_upper,  -1,          0,            /* upper */
  cbit_word,   -1,          2,            /* alnum - word without underscore */
  cbit_print,  cbit_cntrl,  0,            /* ascii - print plus cntrl */
  cbit_space,  -1,          1,            /* blank - a GNU extension */
  cbit_cntrl,  -1,          0,            /* cntrl */
  cbit_digit,  -1,          0,            /* digit */
  cbit_graph,  -1,          0,            /* graph */
  cbit_print,  -1,          0,            /* print */
  cbit_punct,  -1,          0,            /* punct */
  cbit_space,  -1,          0,            /* space */
  cbit_word,   -1,          0,            /* word - a Perl extension */
  cbit_xdigit, -1,          0             /* xdigit */
};
741 
742 #ifdef SUPPORT_UNICODE
743 
744 /* The POSIX class Unicode property substitutes that are used in UCP mode must
745 be in the order of the POSIX class names, defined above. */
746 
747 static int posix_substitutes[] = {
748   PT_GC, ucp_L,     /* alpha */
749   PT_PC, ucp_Ll,    /* lower */
750   PT_PC, ucp_Lu,    /* upper */
751   PT_ALNUM, 0,      /* alnum */
752   -1, 0,            /* ascii, treat as non-UCP */
753   -1, 1,            /* blank, treat as \h */
754   PT_PC, ucp_Cc,    /* cntrl */
755   PT_PC, ucp_Nd,    /* digit */
756   PT_PXGRAPH, 0,    /* graph */
757   PT_PXPRINT, 0,    /* print */
758   PT_PXPUNCT, 0,    /* punct */
759   PT_PXSPACE, 0,    /* space */   /* Xps is POSIX space, but from 8.34 */
760   PT_WORD, 0,       /* word  */   /* Perl and POSIX space are the same */
761   PT_PXXDIGIT, 0    /* xdigit */  /* Perl has additional hex digits */
762 };
763 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / (2*sizeof(uint32_t)))
764 #endif  /* SUPPORT_UNICODE */
765 
766 /* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset
767 are allowed. */
768 
/* The subset of option bits that is still allowed when PCRE2_LITERAL is set. */

#define PUBLIC_LITERAL_COMPILE_OPTIONS \
  (PCRE2_ANCHORED|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_ENDANCHORED| \
   PCRE2_FIRSTLINE|PCRE2_LITERAL|PCRE2_MATCH_INVALID_UTF| \
   PCRE2_NO_START_OPTIMIZE|PCRE2_NO_UTF_CHECK|PCRE2_USE_OFFSET_LIMIT|PCRE2_UTF)

/* The full set of option bits: the literal subset plus everything else. */

#define PUBLIC_COMPILE_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_OPTIONS| \
   PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \
   PCRE2_ALT_VERBNAMES|PCRE2_DOLLAR_ENDONLY|PCRE2_DOTALL|PCRE2_DUPNAMES| \
   PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MATCH_UNSET_BACKREF| \
   PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \
   PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \
   PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_UCP|PCRE2_UNGREEDY)

/* The same two-level arrangement for the PCRE2_EXTRA_ option bits: first the
literal subset, then the full set built on top of it. */

#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
   (PCRE2_EXTRA_MATCH_LINE|PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_CASELESS_RESTRICT)

#define PUBLIC_COMPILE_EXTRA_OPTIONS \
   (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \
    PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \
    PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX| \
    PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK|PCRE2_EXTRA_ASCII_BSD| \
    PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW|PCRE2_EXTRA_ASCII_POSIX| \
    PCRE2_EXTRA_ASCII_DIGIT)
793 
794 /* Compile time error code numbers. They are given names so that they can more
795 easily be tracked. When a new number is added, the tables called eint1 and
796 eint2 in pcre2posix.c may need to be updated, and a new error text must be
797 added to compile_error_texts in pcre2_error.c. Also, the error codes in
798 pcre2.h.in must be updated - their values are exactly 100 greater than these
799 values. */
800 
/* The enumerators are consecutive, so ERRn has the value
COMPILE_ERROR_BASE + n. */

enum { ERR0 = COMPILE_ERROR_BASE,
       ERR1,  ERR2,  ERR3,  ERR4,  ERR5,  ERR6,  ERR7,  ERR8,  ERR9,  ERR10,
       ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
       ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30,
       ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
       ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
       ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
       ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
       ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
       ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90,
       ERR91, ERR92, ERR93, ERR94, ERR95, ERR96, ERR97, ERR98, ERR99, ERR100 };
812 
813 /* This is a table of start-of-pattern options such as (*UTF) and settings such
814 as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
815 compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
816 generic and always supported. */
817 
/* The kind of each start-of-pattern item; it determines how the value field
of the table entry is interpreted. */

enum { PSO_OPT,     /* Value is an option bit */
       PSO_FLG,     /* Value is a flag bit */
       PSO_NL,      /* Value is a newline type */
       PSO_BSR,     /* Value is a \R type */
       PSO_LIMH,    /* Read integer value for heap limit */
       PSO_LIMM,    /* Read integer value for match limit */
       PSO_LIMD     /* Read integer value for depth limit */
     };

/* One start-of-pattern item. */

typedef struct pso {
  const uint8_t *name;   /* Text of the item; presumably what follows "(*" */
  uint16_t length;       /* Length of name */
  uint16_t type;         /* One of the PSO_ values above */
  uint32_t value;        /* Meaning depends on type; 0 for the limit items */
} pso;
833 
834 /* NB: STRING_UTFn_RIGHTPAR contains the length as well */
835 
/* The recognized items. The lengths appear to count the closing parenthesis
(or, for the limit settings, the equals sign) as part of the name. Two
spellings, LIMIT_DEPTH and LIMIT_RECURSION, both select PSO_LIMD. */

static const pso pso_list[] = {
  { (uint8_t *)STRING_UTFn_RIGHTPAR,                  PSO_OPT, PCRE2_UTF },
  { (uint8_t *)STRING_UTF_RIGHTPAR,                4, PSO_OPT, PCRE2_UTF },
  { (uint8_t *)STRING_UCP_RIGHTPAR,                4, PSO_OPT, PCRE2_UCP },
  { (uint8_t *)STRING_NOTEMPTY_RIGHTPAR,           9, PSO_FLG, PCRE2_NOTEMPTY_SET },
  { (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR,  17, PSO_FLG, PCRE2_NE_ATST_SET },
  { (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR,   16, PSO_OPT, PCRE2_NO_AUTO_POSSESS },
  { (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR },
  { (uint8_t *)STRING_NO_JIT_RIGHTPAR,             7, PSO_FLG, PCRE2_NOJIT },
  { (uint8_t *)STRING_NO_START_OPT_RIGHTPAR,      13, PSO_OPT, PCRE2_NO_START_OPTIMIZE },
  { (uint8_t *)STRING_LIMIT_HEAP_EQ,              11, PSO_LIMH, 0 },
  { (uint8_t *)STRING_LIMIT_MATCH_EQ,             12, PSO_LIMM, 0 },
  { (uint8_t *)STRING_LIMIT_DEPTH_EQ,             12, PSO_LIMD, 0 },
  { (uint8_t *)STRING_LIMIT_RECURSION_EQ,         16, PSO_LIMD, 0 },
  { (uint8_t *)STRING_CR_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_CR },
  { (uint8_t *)STRING_LF_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_LF },
  { (uint8_t *)STRING_CRLF_RIGHTPAR,               5, PSO_NL,  PCRE2_NEWLINE_CRLF },
  { (uint8_t *)STRING_ANY_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_ANY },
  { (uint8_t *)STRING_NUL_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_NUL },
  { (uint8_t *)STRING_ANYCRLF_RIGHTPAR,            8, PSO_NL,  PCRE2_NEWLINE_ANYCRLF },
  { (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_ANYCRLF },
  { (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_UNICODE }
};
859 
860 /* This table is used when converting repeating opcodes into possessified
861 versions as a result of an explicit possessive quantifier such as ++. A zero
862 value means there is no possessified version - in those cases the item in
863 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
864 because all relevant opcodes are less than that. */
865 
/* NOTE(review): the table is indexed by opcode number, so each row must stay
at the position given by the opcode numbering (defined outside this file
section); the trailing comments name the opcodes each row stands for. */

static const uint8_t opcode_possessify[] = {
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */

  0,                       /* NOTI */
  OP_POSSTAR, 0,           /* STAR, MINSTAR */
  OP_POSPLUS, 0,           /* PLUS, MINPLUS */
  OP_POSQUERY, 0,          /* QUERY, MINQUERY */
  OP_POSUPTO, 0,           /* UPTO, MINUPTO */
  0,                       /* EXACT */
  0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */

  OP_POSSTARI, 0,          /* STARI, MINSTARI */
  OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
  OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
  OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
  0,                       /* EXACTI */
  0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */

  OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
  OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
  OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
  OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
  0,                       /* NOTEXACT */
  0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */

  OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
  OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
  OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
  OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
  0,                       /* NOTEXACTI */
  0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */

  OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
  OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
  OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
  OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
  0,                       /* TYPEEXACT */
  0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */

  OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
  OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
  OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
  OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
  0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */

  0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
  0, 0,                    /* REF, REFI */
  0, 0,                    /* DNREF, DNREFI */
  0, 0                     /* RECURSE, CALLOUT */
};
917 
918 
919 #ifdef DEBUG_SHOW_PARSED
920 /*************************************************
921 *     Show the parsed pattern for debugging      *
922 *************************************************/
923 
924 /* For debugging the pre-scan, this code, which outputs the parsed data vector,
925 can be enabled. */
926 
/* Walk the parsed pattern vector in cb->parsed_pattern from its start, writing
one line per item to stderr. Each line begins with the item's index in the
vector and its raw value in hex. The walk stops at META_END, or at a META code
that has no case below.

Argument:   cb   the compile block whose parsed pattern is to be shown
Returns:    nothing
*/

static void show_parsed(compile_block *cb)
{
uint32_t *pptr = cb->parsed_pattern;

for (;;)
  {
  int max, min;
  PCRE2_SIZE offset;
  uint32_t i;
  uint32_t length;
  uint32_t meta_arg = META_DATA(*pptr);   /* Taken before pptr is advanced */

  fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr);

  /* A value below META_END is not a META item; it is echoed as a character
  only when it lies strictly between 32 and 128. */

  if (*pptr < META_END)
    {
    if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", *pptr);
    pptr++;
    }

  /* From here on pptr already points past the META word itself, so each case
  only has to consume whatever data words follow its item. */

  else switch (META_CODE(*pptr++))
    {
    default:
    fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n");
    return;

    case META_END:
    fprintf(stderr, "META_END\n");
    return;

    case META_CAPTURE:
    fprintf(stderr, "META_CAPTURE %d", meta_arg);
    break;

    case META_RECURSE:
    GETOFFSET(offset, pptr);
    fprintf(stderr, "META_RECURSE %d %zd", meta_arg, offset);
    break;

    /* For group numbers below 10 the offset comes from the compile block
    rather than from the vector. */

    case META_BACKREF:
    if (meta_arg < 10)
      offset = cb->small_ref_offset[meta_arg];
    else
      GETOFFSET(offset, pptr);
    fprintf(stderr, "META_BACKREF %d %zd", meta_arg, offset);
    break;

    /* \P and \p are followed by one data word: type in the top 16 bits, value
    in the bottom 16 bits. */

    case META_ESCAPE:
    if (meta_arg == ESC_P || meta_arg == ESC_p)
      {
      uint32_t ptype = *pptr >> 16;
      uint32_t pvalue = *pptr++ & 0xffff;
      fprintf(stderr, "META \\%c %d %d", (meta_arg == ESC_P)? 'P':'p',
        ptype, pvalue);
      }
    else
      {
      uint32_t cc;
      /* There's just one escape we might have here that isn't negated in the
      escapes table. */
      if (meta_arg == ESC_g) cc = CHAR_g;
      else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++)
        {
        if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break;
        }
      if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;   /* Not in the table */
      fprintf(stderr, "META \\%c", cc);
      }
    break;

    /* The three {min,max} forms are each followed by two data words. */

    case META_MINMAX:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}", min, max);
    else
      fprintf(stderr, "META {%d,}", min);
    break;

    case META_MINMAX_QUERY:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}?", min, max);
    else
      fprintf(stderr, "META {%d,}?", min);
    break;

    case META_MINMAX_PLUS:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}+", min, max);
    else
      fprintf(stderr, "META {%d,}+", min);
    break;

    case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break;
    case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break;
    case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break;
    case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break;
    case META_DOT: fprintf(stderr, "META_DOT"); break;
    case META_ASTERISK: fprintf(stderr, "META *"); break;
    case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break;
    case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break;
    case META_PLUS: fprintf(stderr, "META +"); break;
    case META_PLUS_QUERY: fprintf(stderr, "META +?"); break;
    case META_PLUS_PLUS: fprintf(stderr, "META ++"); break;
    case META_QUERY: fprintf(stderr, "META ?"); break;
    case META_QUERY_QUERY: fprintf(stderr, "META ??"); break;
    case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break;

    case META_ATOMIC: fprintf(stderr, "META (?>"); break;
    case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
    case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
    case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
    case META_LOOKAHEAD_NA: fprintf(stderr, "META (*napla:"); break;
    case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;
    case META_KET: fprintf(stderr, "META )"); break;
    case META_ALT: fprintf(stderr, "META | %d", meta_arg); break;

    case META_CLASS: fprintf(stderr, "META ["); break;
    case META_CLASS_NOT: fprintf(stderr, "META [^"); break;
    case META_CLASS_END: fprintf(stderr, "META ]"); break;
    case META_CLASS_EMPTY: fprintf(stderr, "META []"); break;
    case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break;

    case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;
    case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;

    case META_POSIX: fprintf(stderr, "META_POSIX %d", *pptr++); break;
    case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %d", *pptr++); break;

    case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;
    case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;
    case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break;
    case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break;
    case META_SKIP: fprintf(stderr, "META (*SKIP)"); break;
    case META_THEN: fprintf(stderr, "META (*THEN)"); break;

    case META_OPTIONS:
    fprintf(stderr, "META_OPTIONS 0x%08x 0x%08x", pptr[0], pptr[1]);
    pptr += 2;
    break;

    /* The lookbehind items skip two data words, though only the first one is
    shown. */

    case META_LOOKBEHIND:
    fprintf(stderr, "META (?<= %d %d", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_LOOKBEHIND_NA:
    fprintf(stderr, "META (*naplb: %d %d", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_LOOKBEHINDNOT:
    fprintf(stderr, "META (?<! %d %d", meta_arg, *pptr);
    pptr += 2;
    break;

    /* Three data words follow; the callout number is the third. */

    case META_CALLOUT_NUMBER:
    fprintf(stderr, "META (?C%d) next=%d/%d", pptr[2], pptr[0],
       pptr[1]);
    pptr += 3;
    break;

    case META_CALLOUT_STRING:
      {
      uint32_t patoffset = *pptr++;    /* Offset of next pattern item */
      uint32_t patlength = *pptr++;    /* Length of next pattern item */
      fprintf(stderr, "META (?Cstring) length=%d offset=", *pptr++);
      GETOFFSET(offset, pptr);
      fprintf(stderr, "%zd next=%d/%d", offset, patoffset, patlength);
      }
    break;

    case META_RECURSE_BYNAME:
    fprintf(stderr, "META (?(&name) length=%d offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    break;

    case META_BACKREF_BYNAME:
    fprintf(stderr, "META_BACKREF_BYNAME length=%d offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    break;

    /* Here the number is stored after the offset: it is read at index
    SIZEOFFSET before the offset is fetched, and skipped afterwards.
    (Presumably GETOFFSET advances pptr by SIZEOFFSET words - confirm against
    the macro definition.) */

    case META_COND_NUMBER:
    fprintf(stderr, "META_COND_NUMBER %d offset=", pptr[SIZEOFFSET]);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    pptr++;
    break;

    case META_COND_DEFINE:
    fprintf(stderr, "META (?(DEFINE) offset=");
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    break;

    /* Three data words: a zero/non-zero selector for "=" versus ">=", then
    the two parts of the version number. */

    case META_COND_VERSION:
    fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
    fprintf(stderr, "%d.", *pptr++);
    fprintf(stderr, "%d)", *pptr++);
    break;

    case META_COND_NAME:
    fprintf(stderr, "META (?(<name>) length=%d offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    break;

    case META_COND_RNAME:
    fprintf(stderr, "META (?(R&name) length=%d offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    break;

    /* This is kept as a name, because it might be. */

    case META_COND_RNUMBER:
    fprintf(stderr, "META (?(Rnumber) length=%d offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    break;

    /* The verbs that carry an argument print their own prefix and then share
    the code at SHOWARG, which shows the argument text. */

    case META_MARK:
    fprintf(stderr, "META (*MARK:");
    goto SHOWARG;

    case META_COMMIT_ARG:
    fprintf(stderr, "META (*COMMIT:");
    goto SHOWARG;

    case META_PRUNE_ARG:
    fprintf(stderr, "META (*PRUNE:");
    goto SHOWARG;

    case META_SKIP_ARG:
    fprintf(stderr, "META (*SKIP:");
    goto SHOWARG;

    case META_THEN_ARG:
    fprintf(stderr, "META (*THEN:");
    SHOWARG:
    length = *pptr++;    /* Count of argument words that follow */
    for (i = 0; i < length; i++)
      {
      uint32_t cc = *pptr++;
      if (cc > 32 && cc < 128) fprintf(stderr, "%c", cc);
        else fprintf(stderr, "\\x{%x}", cc);
      }
    fprintf(stderr, ") length=%u", length);
    break;
    }
  fprintf(stderr, "\n");
  }
return;   /* Not reached: the loop above is only left by a return */
}
1187 #endif  /* DEBUG_SHOW_PARSED */
1188 
1189 
1190 
1191 /*************************************************
1192 *               Copy compiled code               *
1193 *************************************************/
1194 
1195 /* Compiled JIT code cannot be copied, so the new compiled block has no
1196 associated JIT data. */
1197 
1198 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy(const pcre2_code * code)1199 pcre2_code_copy(const pcre2_code *code)
1200 {
1201 PCRE2_SIZE* ref_count;
1202 pcre2_code *newcode;
1203 
1204 if (code == NULL) return NULL;
1205 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1206 if (newcode == NULL) return NULL;
1207 memcpy(newcode, code, code->blocksize);
1208 newcode->executable_jit = NULL;
1209 
1210 /* If the code is one that has been deserialized, increment the reference count
1211 in the decoded tables. */
1212 
1213 if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1214   {
1215   ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1216   (*ref_count)++;
1217   }
1218 
1219 return newcode;
1220 }
1221 
1222 
1223 
1224 /*************************************************
1225 *     Copy compiled code and character tables    *
1226 *************************************************/
1227 
1228 /* Compiled JIT code cannot be copied, so the new compiled block has no
1229 associated JIT data. This version of code_copy also makes a separate copy of
1230 the character tables. */
1231 
1232 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy_with_tables(const pcre2_code * code)1233 pcre2_code_copy_with_tables(const pcre2_code *code)
1234 {
1235 PCRE2_SIZE* ref_count;
1236 pcre2_code *newcode;
1237 uint8_t *newtables;
1238 
1239 if (code == NULL) return NULL;
1240 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1241 if (newcode == NULL) return NULL;
1242 memcpy(newcode, code, code->blocksize);
1243 newcode->executable_jit = NULL;
1244 
1245 newtables = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE),
1246   code->memctl.memory_data);
1247 if (newtables == NULL)
1248   {
1249   code->memctl.free((void *)newcode, code->memctl.memory_data);
1250   return NULL;
1251   }
1252 memcpy(newtables, code->tables, TABLES_LENGTH);
1253 ref_count = (PCRE2_SIZE *)(newtables + TABLES_LENGTH);
1254 *ref_count = 1;
1255 
1256 newcode->tables = newtables;
1257 newcode->flags |= PCRE2_DEREF_TABLES;
1258 return newcode;
1259 }
1260 
1261 
1262 
1263 /*************************************************
1264 *               Free compiled code               *
1265 *************************************************/
1266 
1267 PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_code_free(pcre2_code * code)1268 pcre2_code_free(pcre2_code *code)
1269 {
1270 PCRE2_SIZE* ref_count;
1271 
1272 if (code != NULL)
1273   {
1274 #ifdef SUPPORT_JIT
1275   if (code->executable_jit != NULL)
1276     PRIV(jit_free)(code->executable_jit, &code->memctl);
1277 #endif
1278 
1279   if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1280     {
1281     /* Decoded tables belong to the codes after deserialization, and they must
1282     be freed when there are no more references to them. The *ref_count should
1283     always be > 0. */
1284 
1285     ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1286     if (*ref_count > 0)
1287       {
1288       (*ref_count)--;
1289       if (*ref_count == 0)
1290         code->memctl.free((void *)code->tables, code->memctl.memory_data);
1291       }
1292     }
1293 
1294   code->memctl.free(code, code->memctl.memory_data);
1295   }
1296 }
1297 
1298 
1299 
1300 /*************************************************
1301 *         Read a number, possibly signed         *
1302 *************************************************/
1303 
1304 /* This function is used to read numbers in the pattern. The initial pointer
1305 must be at the sign or first digit of the number. When relative values
1306 (introduced by + or -) are allowed, they are relative group numbers, and the
1307 result must be greater than zero.
1308 
1309 Arguments:
1310   ptrptr      points to the character pointer variable
1311   ptrend      points to the end of the input string
1312   allow_sign  if < 0, sign not allowed; if >= 0, sign is relative to this
1313   max_value   the largest number allowed
1314   max_error   the error to give for an over-large number
1315   intptr      where to put the result
1316   errorcodeptr  where to put an error code
1317 
1318 Returns:      TRUE  - a number was read
1319               FALSE - errorcode == 0 => no number was found
1320                       errorcode != 0 => an error occurred
1321 */
1322 
1323 static BOOL
read_number(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,int32_t allow_sign,uint32_t max_value,uint32_t max_error,int * intptr,int * errorcodeptr)1324 read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,
1325   uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr)
1326 {
1327 int sign = 0;
1328 uint32_t n = 0;
1329 PCRE2_SPTR ptr = *ptrptr;
1330 BOOL yield = FALSE;
1331 
1332 *errorcodeptr = 0;
1333 
1334 if (allow_sign >= 0 && ptr < ptrend)
1335   {
1336   if (*ptr == CHAR_PLUS)
1337     {
1338     sign = +1;
1339     max_value -= allow_sign;
1340     ptr++;
1341     }
1342   else if (*ptr == CHAR_MINUS)
1343     {
1344     sign = -1;
1345     ptr++;
1346     }
1347   }
1348 
1349 if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE;
1350 while (ptr < ptrend && IS_DIGIT(*ptr))
1351   {
1352   n = n * 10 + *ptr++ - CHAR_0;
1353   if (n > max_value)
1354     {
1355     *errorcodeptr = max_error;
1356     goto EXIT;
1357     }
1358   }
1359 
1360 if (allow_sign >= 0 && sign != 0)
1361   {
1362   if (n == 0)
1363     {
1364     *errorcodeptr = ERR26;  /* +0 and -0 are not allowed */
1365     goto EXIT;
1366     }
1367 
1368   if (sign > 0) n += allow_sign;
1369   else if ((int)n > allow_sign)
1370     {
1371     *errorcodeptr = ERR15;  /* Non-existent subpattern */
1372     goto EXIT;
1373     }
1374   else n = allow_sign + 1 - n;
1375   }
1376 
1377 yield = TRUE;
1378 
1379 EXIT:
1380 *intptr = n;
1381 *ptrptr = ptr;
1382 return yield;
1383 }
1384 
1385 
1386 
1387 /*************************************************
1388 *         Read repeat counts                     *
1389 *************************************************/
1390 
1391 /* Read an item of the form {n,m} and return the values when non-NULL pointers
1392 are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a
1393 larger value is used for "unlimited". We have to use signed arguments for
1394 read_number() because it is capable of returning a signed value. As of Perl
1395 5.34.0 either n or m may be absent, but not both. Perl also allows spaces and
1396 tabs after { and before } and between the numbers and the comma, so we do too.
1397 
1398 Arguments:
1399   ptrptr         points to pointer to character after '{'
1400   ptrend         pointer to end of input
1401   minp           if not NULL, pointer to int for min
1402   maxp           if not NULL, pointer to int for max
1403   errorcodeptr   points to error code variable
1404 
1405 Returns:         FALSE if not a repeat quantifier, errorcode set zero
1406                  FALSE on error, with errorcode set non-zero
1407                  TRUE on success, with pointer updated to point after '}'
1408 */
1409 
1410 static BOOL
read_repeat_counts(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,uint32_t * minp,uint32_t * maxp,int * errorcodeptr)1411 read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp,
1412   uint32_t *maxp, int *errorcodeptr)
1413 {
1414 PCRE2_SPTR p = *ptrptr;
1415 PCRE2_SPTR pp;
1416 BOOL yield = FALSE;
1417 BOOL had_minimum = FALSE;
1418 int32_t min = 0;
1419 int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */
1420 
1421 *errorcodeptr = 0;
1422 while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1423 
1424 /* Check the syntax before interpreting. Otherwise, a non-quantifier sequence
1425 such as "X{123456ABC" would incorrectly give a "number too big in quantifier"
1426 error. */
1427 
1428 pp = p;
1429 if (pp < ptrend && IS_DIGIT(*pp))
1430   {
1431   had_minimum = TRUE;
1432   while (++pp < ptrend && IS_DIGIT(*pp)) {}
1433   }
1434 
1435 while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1436 if (pp >= ptrend) return FALSE;
1437 
1438 if (*pp == CHAR_RIGHT_CURLY_BRACKET)
1439   {
1440   if (!had_minimum) return FALSE;
1441   }
1442 else
1443   {
1444   if (*pp++ != CHAR_COMMA) return FALSE;
1445   while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1446   if (pp >= ptrend) return FALSE;
1447   if (IS_DIGIT(*pp))
1448     {
1449     while (++pp < ptrend && IS_DIGIT(*pp)) {}
1450     }
1451   else if (!had_minimum) return FALSE;
1452   while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1453   if (pp >= ptrend || *pp != CHAR_RIGHT_CURLY_BRACKET) return FALSE;
1454   }
1455 
1456 /* Now process the quantifier for real. We know it must be {n} or (n,} or {,m}
1457 or {n,m}. The only error that read_number() can return is for a number that is
1458 too big. If *errorcodeptr is returned as zero it means no number was found. */
1459 
1460 /* Deal with {,m} or n too big. If we successfully read m there is no need to
1461 check m >= n because n defaults to zero. */
1462 
1463 if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))
1464   {
1465   if (*errorcodeptr != 0) goto EXIT;    /* n too big */
1466   p++;  /* Skip comma and subsequent spaces */
1467   while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1468   if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1469     {
1470     if (*errorcodeptr != 0) goto EXIT;  /* m too big */
1471     }
1472   }
1473 
1474 /* Have read one number. Deal with {n} or {n,} or {n,m} */
1475 
1476 else
1477   {
1478   while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1479   if (*p == CHAR_RIGHT_CURLY_BRACKET)
1480     {
1481     max = min;
1482     }
1483   else   /* Handle {n,} or {n,m} */
1484     {
1485     p++;    /* Skip comma and subsequent spaces */
1486     while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1487     if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1488       {
1489       if (*errorcodeptr != 0) goto EXIT;   /* m too big */
1490       }
1491 
1492     if (max < min)
1493       {
1494       *errorcodeptr = ERR4;
1495       goto EXIT;
1496       }
1497     }
1498   }
1499 
1500 /* Valid quantifier exists */
1501 
1502 while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1503 p++;
1504 yield = TRUE;
1505 if (minp != NULL) *minp = (uint32_t)min;
1506 if (maxp != NULL) *maxp = (uint32_t)max;
1507 
1508 /* Update the pattern pointer */
1509 
1510 EXIT:
1511 *ptrptr = p;
1512 return yield;
1513 }
1514 
1515 
1516 
1517 /*************************************************
1518 *            Handle escapes                      *
1519 *************************************************/
1520 
1521 /* This function is called when a \ has been encountered. It either returns a
1522 positive value for a simple escape such as \d, or 0 for a data character, which
1523 is placed in chptr. A backreference to group n is returned as negative n. On
1524 entry, ptr is pointing at the character after \. On exit, it points after the
1525 final code unit of the escape sequence.
1526 
1527 This function is also called from pcre2_substitute() to handle escape sequences
1528 in replacement strings. In this case, the cb argument is NULL, and in the case
1529 of escapes that have further processing, only sequences that define a data
1530 character are recognised. The isclass argument is not relevant; the options
1531 argument is the final value of the compiled pattern's options.
1532 
1533 Arguments:
1534   ptrptr         points to the input position pointer
1535   ptrend         points to the end of the input
1536   chptr          points to a returned data character
1537   errorcodeptr   points to the errorcode variable (containing zero)
1538   options        the current options bits
1539   xoptions       the current extra options bits
1540   isclass        TRUE if inside a character class
1541   cb             compile data block or NULL when called from pcre2_substitute()
1542 
1543 Returns:         zero => a data character
1544                  positive => a special escape sequence
1545                  negative => a numerical back reference
1546                  on error, errorcodeptr is set non-zero
1547 */
1548 
1549 int
PRIV(check_escape)1550 PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
1551   int *errorcodeptr, uint32_t options, uint32_t xoptions, BOOL isclass,
1552   compile_block *cb)
1553 {
1554 BOOL utf = (options & PCRE2_UTF) != 0;
1555 BOOL alt_bsux =
1556   ((options & PCRE2_ALT_BSUX) | (xoptions & PCRE2_EXTRA_ALT_BSUX)) != 0;
1557 PCRE2_SPTR ptr = *ptrptr;
1558 uint32_t c, cc;
1559 int escape = 0;
1560 int i;
1561 
1562 /* If backslash is at the end of the string, it's an error. */
1563 
1564 if (ptr >= ptrend)
1565   {
1566   *errorcodeptr = ERR1;
1567   return 0;
1568   }
1569 
1570 GETCHARINCTEST(c, ptr);         /* Get character value, increment pointer */
1571 *errorcodeptr = 0;              /* Be optimistic */
1572 
1573 /* Non-alphanumerics are literals, so we just leave the value in c. An initial
1574 value test saves a memory lookup for code points outside the alphanumeric
1575 range. */
1576 
1577 if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {}  /* Definitely literal */
1578 
1579 /* Otherwise, do a table lookup. Non-zero values need little processing here. A
1580 positive value is a literal value for something like \n. A negative value is
1581 the negation of one of the ESC_ macros that is passed back for handling by the
1582 calling function. Some extra checking is needed for \N because only \N{U+dddd}
1583 is supported. If the value is zero, further processing is handled below. */
1584 
1585 else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
1586   {
1587   if (i > 0)
1588     {
1589     c = (uint32_t)i;
1590     if (c == CHAR_CR && (xoptions & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
1591       c = CHAR_LF;
1592     }
1593   else  /* Negative table entry */
1594     {
1595     escape = -i;                    /* Else return a special escape */
1596     if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))
1597       cb->external_flags |= PCRE2_HASBKPORX;   /* Note \P, \p, or \X */
1598 
1599     /* Perl supports \N{name} for character names and \N{U+dddd} for numerical
1600     Unicode code points, as well as plain \N for "not newline". PCRE does not
1601     support \N{name}. However, it does support quantification such as \N{2,3},
1602     so if \N{ is not followed by U+dddd we check for a quantifier. */
1603 
1604     if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1605       {
1606       PCRE2_SPTR p = ptr + 1;
1607 
1608       /* Perl ignores spaces and tabs after { */
1609 
1610       while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1611 
1612       /* \N{U+ can be handled by the \x{ code. However, this construction is
1613       not valid in EBCDIC environments because it specifies a Unicode
1614       character, not a codepoint in the local code. For example \N{U+0041}
1615       must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode
1616       casing semantics for the entire pattern, so allow it only in UTF (i.e.
1617       Unicode) mode. */
1618 
1619       if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)
1620         {
1621 #ifdef EBCDIC
1622         *errorcodeptr = ERR93;
1623 #else
1624         if (utf)
1625           {
1626           ptr = p + 2;
1627           escape = 0;   /* Not a fancy escape after all */
1628           goto COME_FROM_NU;
1629           }
1630         else *errorcodeptr = ERR93;
1631 #endif
1632         }
1633 
1634       /* Give an error if what follows is not a quantifier, but don't override
1635       an error set by the quantifier reader (e.g. number overflow). */
1636 
1637       else
1638         {
1639         if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&
1640              *errorcodeptr == 0)
1641           *errorcodeptr = ERR37;
1642         }
1643       }
1644     }
1645   }
1646 
1647 /* Escapes that need further processing, including those that are unknown, have
1648 a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
1649 \o, and \x are recognized (\u and \U can never appear as they are used for case
1650 forcing). */
1651 
1652 else
1653   {
1654   int s;
1655   PCRE2_SPTR oldptr;
1656   BOOL overflow;
1657 
1658   /* Filter calls from pcre2_substitute(). */
1659 
1660   if (cb == NULL)
1661     {
1662     if (c != CHAR_c && c != CHAR_o && c != CHAR_x)
1663       {
1664       *errorcodeptr = ERR3;
1665       return 0;
1666       }
1667     alt_bsux = FALSE;   /* Do not modify \x handling */
1668     }
1669 
1670   switch (c)
1671     {
1672     /* A number of Perl escapes are not handled by PCRE. We give an explicit
1673     error. */
1674 
1675     case CHAR_F:
1676     case CHAR_l:
1677     case CHAR_L:
1678     *errorcodeptr = ERR37;
1679     break;
1680 
1681     /* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX
1682     is set. Otherwise, \u must be followed by exactly four hex digits or, if
1683     PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.
1684     Otherwise it is a lowercase u letter. This gives some compatibility with
1685     ECMAScript (aka JavaScript). Unlike other braced items, white space is NOT
1686     allowed. When \u{ is not followed by hex digits, a special return is given
1687     because otherwise \u{ 12} (for example) would be treated as u{12}. */
1688 
1689     case CHAR_u:
1690     if (!alt_bsux) *errorcodeptr = ERR37; else
1691       {
1692       uint32_t xc;
1693 
1694       if (ptr >= ptrend) break;
1695       if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
1696           (xoptions & PCRE2_EXTRA_ALT_BSUX) != 0)
1697         {
1698         PCRE2_SPTR hptr = ptr + 1;
1699 
1700         cc = 0;
1701         while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
1702           {
1703           if ((cc & 0xf0000000) != 0)  /* Test for 32-bit overflow */
1704             {
1705             *errorcodeptr = ERR77;
1706             ptr = hptr;   /* Show where */
1707             break;        /* *hptr != } will cause another break below */
1708             }
1709           cc = (cc << 4) | xc;
1710           hptr++;
1711           }
1712 
1713         if (hptr == ptr + 1 ||   /* No hex digits */
1714             hptr >= ptrend ||    /* Hit end of input */
1715             *hptr != CHAR_RIGHT_CURLY_BRACKET)  /* No } terminator */
1716           {
1717           escape = ESC_ub;    /* Special return */
1718           ptr++;              /* Skip { */
1719           break;              /* Hex escape not recognized */
1720           }
1721 
1722         c = cc;          /* Accept the code point */
1723         ptr = hptr + 1;
1724         }
1725 
1726       else  /* Must be exactly 4 hex digits */
1727         {
1728         if (ptrend - ptr < 4) break;               /* Less than 4 chars */
1729         if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
1730         if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
1731         cc = (cc << 4) | xc;
1732         if ((xc = XDIGIT(ptr[2])) == 0xff) break;  /* Not a hex digit */
1733         cc = (cc << 4) | xc;
1734         if ((xc = XDIGIT(ptr[3])) == 0xff) break;  /* Not a hex digit */
1735         c = (cc << 4) | xc;
1736         ptr += 4;
1737         }
1738 
1739       if (utf)
1740         {
1741         if (c > 0x10ffffU) *errorcodeptr = ERR77;
1742         else
1743           if (c >= 0xd800 && c <= 0xdfff &&
1744               (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1745                 *errorcodeptr = ERR73;
1746         }
1747       else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
1748       }
1749     break;
1750 
1751     /* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set,
1752     in which case it is an upper case letter. */
1753 
1754     case CHAR_U:
1755     if (!alt_bsux) *errorcodeptr = ERR37;
1756     break;
1757 
1758     /* In a character class, \g is just a literal "g". Outside a character
1759     class, \g must be followed by one of a number of specific things:
1760 
1761     (1) A number, either plain or braced. If positive, it is an absolute
1762     backreference. If negative, it is a relative backreference. This is a Perl
1763     5.10 feature.
1764 
1765     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1766     is part of Perl's movement towards a unified syntax for back references. As
1767     this is synonymous with \k{name}, we fudge it up by pretending it really
1768     was \k{name}.
1769 
1770     (3) For Oniguruma compatibility we also support \g followed by a name or a
1771     number either in angle brackets or in single quotes. However, these are
1772     (possibly recursive) subroutine calls, _not_ backreferences. We return
1773     the ESC_g code.
1774 
1775     Summary: Return a negative number for a numerical back reference, ESC_k for
1776     a named back reference, and ESC_g for a named or numbered subroutine call.
1777     */
1778 
1779     case CHAR_g:
1780     if (isclass) break;
1781 
1782     if (ptr >= ptrend)
1783       {
1784       *errorcodeptr = ERR57;
1785       break;
1786       }
1787 
1788     if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE)
1789       {
1790       escape = ESC_g;
1791       break;
1792       }
1793 
1794     /* If there is a brace delimiter, try to read a numerical reference. If
1795     there isn't one, assume we have a name and treat it as \k. */
1796 
1797     if (*ptr == CHAR_LEFT_CURLY_BRACKET)
1798       {
1799       PCRE2_SPTR p = ptr + 1;
1800 
1801       while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1802       if (!read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1803           errorcodeptr))
1804         {
1805         if (*errorcodeptr == 0) escape = ESC_k;  /* No number found */
1806         break;
1807         }
1808       while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1809 
1810       if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1811         {
1812         *errorcodeptr = ERR57;
1813         break;
1814         }
1815       ptr = p + 1;
1816       }
1817 
1818     /* Read an undelimited number */
1819 
1820     else
1821       {
1822       if (!read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1823           errorcodeptr))
1824         {
1825         if (*errorcodeptr == 0) *errorcodeptr = ERR57;  /* No number found */
1826         break;
1827         }
1828       }
1829 
1830     if (s <= 0)
1831       {
1832       *errorcodeptr = ERR15;
1833       break;
1834       }
1835 
1836     escape = -s;
1837     break;
1838 
1839     /* The handling of escape sequences consisting of a string of digits
1840     starting with one that is not zero is not straightforward. Perl has changed
1841     over the years. Nowadays \g{} for backreferences and \o{} for octal are
1842     recommended to avoid the ambiguities in the old syntax.
1843 
1844     Outside a character class, the digits are read as a decimal number. If the
1845     number is less than 10, or if there are that many previous extracting left
1846     brackets, it is a back reference. Otherwise, up to three octal digits are
1847     read to form an escaped character code. Thus \123 is likely to be octal 123
1848     (cf \0123, which is octal 012 followed by the literal 3).
1849 
1850     Inside a character class, \ followed by a digit is always either a literal
1851     8 or 9 or an octal number. */
1852 
1853     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1854     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1855 
1856     if (!isclass)
1857       {
1858       oldptr = ptr;
1859       ptr--;   /* Back to the digit */
1860 
1861       /* As we know we are at a digit, the only possible error from
1862       read_number() is a number that is too large to be a group number. In this
1863       case we fall through handle this as not a group reference. If we have
1864       read a small enough number, check for a back reference.
1865 
1866       \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
1867       are octal escapes if there are not that many previous captures. */
1868 
1869       if (read_number(&ptr, ptrend, -1, INT_MAX/10 - 1, 0, &s, errorcodeptr) &&
1870           (s < 10 || oldptr[-1] >= CHAR_8 || s <= (int)cb->bracount))
1871         {
1872         if (s > (int)MAX_GROUP_NUMBER) *errorcodeptr = ERR61;
1873           else escape = -s;     /* Indicates a back reference */
1874         break;
1875         }
1876 
1877       ptr = oldptr;      /* Put the pointer back and fall through */
1878       }
1879 
1880     /* Handle a digit following \ when the number is not a back reference, or
1881     we are within a character class. If the first digit is 8 or 9, Perl used to
1882     generate a binary zero and then treat the digit as a following literal. At
1883     least by Perl 5.18 this changed so as not to insert the binary zero. */
1884 
1885     if (c >= CHAR_8) break;
1886 
1887     /* Fall through */
1888 
1889     /* \0 always starts an octal number, but we may drop through to here with a
1890     larger first octal digit. The original code used just to take the least
1891     significant 8 bits of octal numbers (I think this is what early Perls used
1892     to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1893     but no more than 3 octal digits. */
1894 
1895     case CHAR_0:
1896     c -= CHAR_0;
1897     while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1898         c = c * 8 + *ptr++ - CHAR_0;
1899 #if PCRE2_CODE_UNIT_WIDTH == 8
1900     if (!utf && c > 0xff) *errorcodeptr = ERR51;
1901 #endif
1902     break;
1903 
1904     /* \o is a relatively new Perl feature, supporting a more general way of
1905     specifying character codes in octal. The only supported form is \o{ddd},
1906     with optional spaces or tabs after { and before }. */
1907 
1908     case CHAR_o:
1909     if (ptr >= ptrend || *ptr++ != CHAR_LEFT_CURLY_BRACKET)
1910       {
1911       ptr--;
1912       *errorcodeptr = ERR55;
1913       break;
1914       }
1915 
1916     while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1917     if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1918       {
1919       *errorcodeptr = ERR78;
1920       break;
1921       }
1922 
1923     c = 0;
1924     overflow = FALSE;
1925     while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1926       {
1927       cc = *ptr++;
1928       if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1929 #if PCRE2_CODE_UNIT_WIDTH == 32
1930       if (c >= 0x20000000l) { overflow = TRUE; break; }
1931 #endif
1932       c = (c << 3) + (cc - CHAR_0);
1933 #if PCRE2_CODE_UNIT_WIDTH == 8
1934       if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1935 #elif PCRE2_CODE_UNIT_WIDTH == 16
1936       if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1937 #elif PCRE2_CODE_UNIT_WIDTH == 32
1938       if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1939 #endif
1940       }
1941 
1942     while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1943 
1944     if (overflow)
1945       {
1946       while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1947       *errorcodeptr = ERR34;
1948       }
1949     else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1950       {
1951       if (utf && c >= 0xd800 && c <= 0xdfff &&
1952           (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1953         {
1954         ptr--;
1955         *errorcodeptr = ERR73;
1956         }
1957       }
1958     else
1959       {
1960       ptr--;
1961       *errorcodeptr = ERR64;
1962       }
1963     break;
1964 
1965     /* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed
1966     by two hexadecimal digits. Otherwise it is a lowercase x letter. */
1967 
1968     case CHAR_x:
1969     if (alt_bsux)
1970       {
1971       uint32_t xc;
1972       if (ptrend - ptr < 2) break;               /* Less than 2 characters */
1973       if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
1974       if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
1975       c = (cc << 4) | xc;
1976       ptr += 2;
1977       }
1978 
1979     /* Handle \x in Perl's style. \x{ddd} is a character code which can be
1980     greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
1981     digits. If not, { used to be treated as a data character. However, Perl
1982     seems to read hex digits up to the first non-such, and ignore the rest, so
1983     that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1984     now gives an error. */
1985 
1986     else
1987       {
1988       if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1989         {
1990         ptr++;
1991         while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1992 
1993 #ifndef EBCDIC
1994         COME_FROM_NU:
1995 #endif
1996         if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1997           {
1998           *errorcodeptr = ERR78;
1999           break;
2000           }
2001         c = 0;
2002         overflow = FALSE;
2003 
2004         while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff)
2005           {
2006           ptr++;
2007           if (c == 0 && cc == 0) continue;   /* Leading zeroes */
2008 #if PCRE2_CODE_UNIT_WIDTH == 32
2009           if (c >= 0x10000000l) { overflow = TRUE; break; }
2010 #endif
2011           c = (c << 4) | cc;
2012           if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
2013             {
2014             overflow = TRUE;
2015             break;
2016             }
2017           }
2018 
2019         /* Perl ignores spaces and tabs before } */
2020 
2021         while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2022 
2023         /* On overflow, skip remaining hex digits */
2024 
2025         if (overflow)
2026           {
2027           while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
2028           *errorcodeptr = ERR34;
2029           }
2030         else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
2031           {
2032           if (utf && c >= 0xd800 && c <= 0xdfff &&
2033               (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
2034             {
2035             ptr--;
2036             *errorcodeptr = ERR73;
2037             }
2038           }
2039 
2040         /* If the sequence of hex digits (followed by optional space) does not
2041         end with '}', give an error. We used just to recognize this construct
2042         and fall through to the normal \x handling, but nowadays Perl gives an
2043         error, which seems much more sensible, so we do too. */
2044 
2045         else
2046           {
2047           ptr--;
2048           *errorcodeptr = ERR67;
2049           }
2050         }   /* End of \x{} processing */
2051 
2052       /* Read a up to two hex digits after \x */
2053 
2054       else
2055         {
2056         c = 0;
2057         if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break;  /* Not a hex digit */
2058         ptr++;
2059         c = cc;
2060         if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break;  /* Not a hex digit */
2061         ptr++;
2062         c = (c << 4) | cc;
2063         }     /* End of \xdd handling */
2064       }       /* End of Perl-style \x handling */
2065     break;
2066 
2067     /* The handling of \c is different in ASCII and EBCDIC environments. In an
2068     ASCII (or Unicode) environment, an error is given if the character
2069     following \c is not a printable ASCII character. Otherwise, the following
2070     character is upper-cased if it is a letter, and after that the 0x40 bit is
2071     flipped. The result is the value of the escape.
2072 
2073     In an EBCDIC environment the handling of \c is compatible with the
2074     specification in the perlebcdic document. The following character must be
2075     a letter or one of small number of special characters. These provide a
2076     means of defining the character values 0-31.
2077 
2078     For testing the EBCDIC handling of \c in an ASCII environment, recognize
2079     the EBCDIC value of 'c' explicitly. */
2080 
2081 #if defined EBCDIC && 'a' != 0x81
2082     case 0x83:
2083 #else
2084     case CHAR_c:
2085 #endif
2086     if (ptr >= ptrend)
2087       {
2088       *errorcodeptr = ERR2;
2089       break;
2090       }
2091     c = *ptr;
2092     if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
2093 
2094     /* Handle \c in an ASCII/Unicode environment. */
2095 
2096 #ifndef EBCDIC    /* ASCII/UTF-8 coding */
2097     if (c < 32 || c > 126)  /* Excludes all non-printable ASCII */
2098       {
2099       *errorcodeptr = ERR68;
2100       break;
2101       }
2102     c ^= 0x40;
2103 
2104     /* Handle \c in an EBCDIC environment. The special case \c? is converted to
2105     255 (0xff) or 95 (0x5f) if other characters suggest we are using the
2106     POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.)
2107     The other valid sequences correspond to a list of specific characters. */
2108 
2109 #else
2110     if (c == CHAR_QUESTION_MARK)
2111       c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
2112     else
2113       {
2114       for (i = 0; i < 32; i++)
2115         {
2116         if (c == ebcdic_escape_c[i]) break;
2117         }
2118       if (i < 32) c = i; else *errorcodeptr = ERR68;
2119       }
2120 #endif  /* EBCDIC */
2121 
2122     ptr++;
2123     break;
2124 
2125     /* Any other alphanumeric following \ is an error. Perl gives an error only
2126     if in warning mode, but PCRE doesn't have a warning mode. */
2127 
2128     default:
2129     *errorcodeptr = ERR3;
2130     *ptrptr = ptr - 1;     /* Point to the character at fault */
2131     return 0;
2132     }
2133   }
2134 
2135 /* Set the pointer to the next character before returning. */
2136 
2137 *ptrptr = ptr;
2138 *chptr = c;
2139 return escape;
2140 }
2141 
2142 
2143 
2144 #ifdef SUPPORT_UNICODE
2145 /*************************************************
2146 *               Handle \P and \p                 *
2147 *************************************************/
2148 
2149 /* This function is called after \P or \p has been encountered, provided that
2150 PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
2151 contents of ptrptr are pointing after the P or p. On exit, it is left pointing
2152 after the final code unit of the escape sequence.
2153 
2154 Arguments:
2155   ptrptr         the pattern position pointer
2156   negptr         a boolean that is set TRUE for negation else FALSE
2157   ptypeptr       an unsigned int that is set to the type value
2158   pdataptr       an unsigned int that is set to the detailed property value
2159   errorcodeptr   the error code variable
2160   cb             the compile data
2161 
2162 Returns:         TRUE if the type value was found, or FALSE for an invalid type
2163 */
2164 
2165 static BOOL
get_ucp(PCRE2_SPTR * ptrptr,BOOL * negptr,uint16_t * ptypeptr,uint16_t * pdataptr,int * errorcodeptr,compile_block * cb)2166 get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr,
2167   uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)
2168 {
2169 PCRE2_UCHAR c;
2170 PCRE2_SIZE i, bot, top;
2171 PCRE2_SPTR ptr = *ptrptr;
2172 PCRE2_UCHAR name[50];
2173 PCRE2_UCHAR *vptr = NULL;
2174 uint16_t ptscript = PT_NOTSCRIPT;
2175 
2176 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2177 c = *ptr++;
2178 *negptr = FALSE;
2179 
2180 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
2181 negation. */
2182 
2183 if (c == CHAR_LEFT_CURLY_BRACKET)
2184   {
2185   if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2186 
2187   if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
2188     {
2189     *negptr = TRUE;
2190     ptr++;
2191     }
2192 
2193   for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
2194     {
2195     if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2196     c = *ptr++;
2197 #if PCRE2_CODE_UNIT_WIDTH != 8
2198     while (c == '_' || c == '-' || (c <= 0xff && isspace(c)))
2199 #else
2200     while (c == '_' || c == '-' || isspace(c))
2201 #endif
2202       {
2203       if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2204       c = *ptr++;
2205       }
2206     if (c == CHAR_NUL) goto ERROR_RETURN;
2207     if (c == CHAR_RIGHT_CURLY_BRACKET) break;
2208     name[i] = tolower(c);
2209     if ((c == ':' || c == '=') && vptr == NULL) vptr = name + i;
2210     }
2211 
2212   if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
2213   name[i] = 0;
2214   }
2215 
2216 /* If { doesn't follow \p or \P there is just one following character, which
2217 must be an ASCII letter. */
2218 
2219 else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0)
2220   {
2221   name[0] = tolower(c);
2222   name[1] = 0;
2223   }
2224 else goto ERROR_RETURN;
2225 
2226 *ptrptr = ptr;
2227 
2228 /* If the property contains ':' or '=' we have class name and value separately
2229 specified. The following are supported:
2230 
2231   . Bidi_Class (synonym bc), for which the property names are "bidi<name>".
2232   . Script (synonym sc) for which the property name is the script name
2233   . Script_Extensions (synonym scx), ditto
2234 
2235 As this is a small number, we currently just check the names directly. If this
2236 grows, a sorted table and a switch will be neater.
2237 
2238 For both the script properties, set a PT_xxx value so that (1) they can be
2239 distinguished and (2) invalid script names that happen to be the name of
2240 another property can be diagnosed. */
2241 
2242 if (vptr != NULL)
2243   {
2244   int offset = 0;
2245   PCRE2_UCHAR sname[8];
2246 
2247   *vptr = 0;   /* Terminate property name */
2248   if (PRIV(strcmp_c8)(name, STRING_bidiclass) == 0 ||
2249       PRIV(strcmp_c8)(name, STRING_bc) == 0)
2250     {
2251     offset = 4;
2252     sname[0] = CHAR_b;
2253     sname[1] = CHAR_i;  /* There is no strcpy_c8 function */
2254     sname[2] = CHAR_d;
2255     sname[3] = CHAR_i;
2256     }
2257 
2258   else if (PRIV(strcmp_c8)(name, STRING_script) == 0 ||
2259            PRIV(strcmp_c8)(name, STRING_sc) == 0)
2260     ptscript = PT_SC;
2261 
2262   else if (PRIV(strcmp_c8)(name, STRING_scriptextensions) == 0 ||
2263            PRIV(strcmp_c8)(name, STRING_scx) == 0)
2264     ptscript = PT_SCX;
2265 
2266   else
2267     {
2268     *errorcodeptr = ERR47;
2269     return FALSE;
2270     }
2271 
2272   /* Adjust the string in name[] as needed */
2273 
2274   memmove(name + offset, vptr + 1, (name + i - vptr)*sizeof(PCRE2_UCHAR));
2275   if (offset != 0) memmove(name, sname, offset*sizeof(PCRE2_UCHAR));
2276   }
2277 
2278 /* Search for a recognized property using binary chop. */
2279 
2280 bot = 0;
2281 top = PRIV(utt_size);
2282 
2283 while (bot < top)
2284   {
2285   int r;
2286   i = (bot + top) >> 1;
2287   r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
2288 
2289   /* When a matching property is found, some extra checking is needed when the
2290   \p{xx:yy} syntax is used and xx is either sc or scx. */
2291 
2292   if (r == 0)
2293     {
2294     *pdataptr = PRIV(utt)[i].value;
2295     if (vptr == NULL || ptscript == PT_NOTSCRIPT)
2296       {
2297       *ptypeptr = PRIV(utt)[i].type;
2298       return TRUE;
2299       }
2300 
2301     switch (PRIV(utt)[i].type)
2302       {
2303       case PT_SC:
2304       *ptypeptr = PT_SC;
2305       return TRUE;
2306 
2307       case PT_SCX:
2308       *ptypeptr = ptscript;
2309       return TRUE;
2310       }
2311 
2312     break;  /* Non-script found */
2313     }
2314 
2315   if (r > 0) bot = i + 1; else top = i;
2316   }
2317 
2318 *errorcodeptr = ERR47;   /* Unrecognized property */
2319 return FALSE;
2320 
2321 ERROR_RETURN:            /* Malformed \P or \p */
2322 *errorcodeptr = ERR46;
2323 *ptrptr = ptr;
2324 return FALSE;
2325 }
2326 #endif
2327 
2328 
2329 
2330 /*************************************************
2331 *           Check for POSIX class syntax         *
2332 *************************************************/
2333 
2334 /* This function is called when the sequence "[:" or "[." or "[=" is
2335 encountered in a character class. It checks whether this is followed by a
2336 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2337 reach an unescaped ']' without the special preceding character, return FALSE.
2338 
2339 Originally, this function only recognized a sequence of letters between the
2340 terminators, but it seems that Perl recognizes any sequence of characters,
2341 though of course unknown POSIX names are subsequently rejected. Perl gives an
2342 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2343 didn't consider this to be a POSIX class. Likewise for [:1234:].
2344 
2345 The problem in trying to be exactly like Perl is in the handling of escapes. We
2346 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2347 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2348 below handles the special cases \\ and \], but does not try to do any other
2349 escape processing. This makes it different from Perl for cases such as
2350 [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
2351 not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
2352 when Perl does, I think.
2353 
2354 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2355 It seems that the appearance of a nested POSIX class supersedes an apparent
2356 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2357 a digit. This is handled by returning FALSE if the start of a new group with
2358 the same terminator is encountered, since the next closing sequence must close
2359 the nested group, not the outer one.
2360 
2361 In Perl, unescaped square brackets may also appear as part of class names. For
2362 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2363 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2364 seem right at all. PCRE does not allow closing square brackets in POSIX class
2365 names.
2366 
2367 Arguments:
2368   ptr      pointer to the character after the initial [ (colon, dot, equals)
2369   ptrend   pointer to the end of the pattern
2370   endptr   where to return a pointer to the terminating ':', '.', or '='
2371 
2372 Returns:   TRUE or FALSE
2373 */
2374 
2375 static BOOL
check_posix_syntax(PCRE2_SPTR ptr,PCRE2_SPTR ptrend,PCRE2_SPTR * endptr)2376 check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)
2377 {
2378 PCRE2_UCHAR terminator;  /* Don't combine these lines; the Solaris cc */
2379 terminator = *ptr++;     /* compiler warns about "non-constant" initializer. */
2380 
2381 for (; ptrend - ptr >= 2; ptr++)
2382   {
2383   if (*ptr == CHAR_BACKSLASH &&
2384       (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
2385     ptr++;
2386 
2387   else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
2388             *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2389 
2390   else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2391     {
2392     *endptr = ptr;
2393     return TRUE;
2394     }
2395   }
2396 
2397 return FALSE;
2398 }
2399 
2400 
2401 
2402 /*************************************************
2403 *          Check POSIX class name                *
2404 *************************************************/
2405 
2406 /* This function is called to check the name given in a POSIX-style class entry
2407 such as [:alnum:].
2408 
2409 Arguments:
2410   ptr        points to the first letter
2411   len        the length of the name
2412 
2413 Returns:     a value representing the name, or -1 if unknown
2414 */
2415 
2416 static int
check_posix_name(PCRE2_SPTR ptr,int len)2417 check_posix_name(PCRE2_SPTR ptr, int len)
2418 {
2419 const char *pn = posix_names;
2420 int yield = 0;
2421 while (posix_name_lengths[yield] != 0)
2422   {
2423   if (len == posix_name_lengths[yield] &&
2424     PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
2425   pn += posix_name_lengths[yield] + 1;
2426   yield++;
2427   }
2428 return -1;
2429 }
2430 
2431 
2432 
2433 /*************************************************
2434 *       Read a subpattern or VERB name           *
2435 *************************************************/
2436 
2437 /* This function is called from parse_regex() below whenever it needs to read
2438 the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial
2439 pointer must be to the preceding character. If that character is '*' we are
2440 reading a verb or alpha assertion name. The pointer is updated to point after
2441 the name, for a VERB or alpha assertion name, or after the name's terminator
2442 for a subpattern name. Returning both the offset and the name pointer is
2443 redundant information, but some callers use one and some the other, so it is
2444 simplest just to return both. When the name is in braces, spaces and tabs are
2445 allowed (and ignored) at either end.
2446 
2447 Arguments:
2448   ptrptr      points to the character pointer variable
2449   ptrend      points to the end of the input string
2450   utf         true if the input is UTF-encoded
2451   terminator  the terminator of a subpattern name must be this
2452   offsetptr   where to put the offset from the start of the pattern
2453   nameptr     where to put a pointer to the name in the input
2454   namelenptr  where to put the length of the name
2455   errorcodeptr where to put an error code
2456   cb          pointer to the compile data block
2457 
2458 Returns:    TRUE if a name was read
2459             FALSE otherwise, with error code set
2460 */
2461 
2462 static BOOL
read_name(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,BOOL utf,uint32_t terminator,PCRE2_SIZE * offsetptr,PCRE2_SPTR * nameptr,uint32_t * namelenptr,int * errorcodeptr,compile_block * cb)2463 read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator,
2464   PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,
2465   int *errorcodeptr, compile_block *cb)
2466 {
2467 PCRE2_SPTR ptr = *ptrptr;
2468 BOOL is_group = (*ptr++ != CHAR_ASTERISK);
2469 BOOL is_braced = terminator == CHAR_RIGHT_CURLY_BRACKET;
2470 
2471 if (is_braced)
2472   while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2473 
2474 if (ptr >= ptrend)                 /* No characters in name */
2475   {
2476   *errorcodeptr = is_group? ERR62: /* Subpattern name expected */
2477                             ERR60; /* Verb not recognized or malformed */
2478   goto FAILED;
2479   }
2480 
2481 *nameptr = ptr;
2482 *offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
2483 
2484 /* In UTF mode, a group name may contain letters and decimal digits as defined
2485 by Unicode properties, and underscores, but must not start with a digit. */
2486 
2487 #ifdef SUPPORT_UNICODE
2488 if (utf && is_group)
2489   {
2490   uint32_t c, type;
2491 
2492   GETCHAR(c, ptr);
2493   type = UCD_CHARTYPE(c);
2494 
2495   if (type == ucp_Nd)
2496     {
2497     *errorcodeptr = ERR44;
2498     goto FAILED;
2499     }
2500 
2501   for(;;)
2502     {
2503     if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
2504         c != CHAR_UNDERSCORE) break;
2505     ptr++;
2506     FORWARDCHARTEST(ptr, ptrend);
2507     if (ptr >= ptrend) break;
2508     GETCHAR(c, ptr);
2509     type = UCD_CHARTYPE(c);
2510     }
2511   }
2512 else
2513 #else
2514 (void)utf;  /* Avoid compiler warning */
2515 #endif      /* SUPPORT_UNICODE */
2516 
2517 /* Handle non-group names and group names in non-UTF modes. A group name must
2518 not start with a digit. If either of the others start with a digit it just
2519 won't be recognized. */
2520 
2521   {
2522   if (is_group && IS_DIGIT(*ptr))
2523     {
2524     *errorcodeptr = ERR44;
2525     goto FAILED;
2526     }
2527 
2528   while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0)
2529     {
2530     ptr++;
2531     }
2532   }
2533 
2534 /* Check name length */
2535 
2536 if (ptr > *nameptr + MAX_NAME_SIZE)
2537   {
2538   *errorcodeptr = ERR48;
2539   goto FAILED;
2540   }
2541 *namelenptr = (uint32_t)(ptr - *nameptr);
2542 
2543 /* Subpattern names must not be empty, and their terminator is checked here.
2544 (What follows a verb or alpha assertion name is checked separately.) */
2545 
2546 if (is_group)
2547   {
2548   if (ptr == *nameptr)
2549     {
2550     *errorcodeptr = ERR62;   /* Subpattern name expected */
2551     goto FAILED;
2552     }
2553   if (is_braced)
2554     while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2555   if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator)
2556     {
2557     *errorcodeptr = ERR42;
2558     goto FAILED;
2559     }
2560   ptr++;
2561   }
2562 
2563 *ptrptr = ptr;
2564 return TRUE;
2565 
2566 FAILED:
2567 *ptrptr = ptr;
2568 return FALSE;
2569 }
2570 
2571 
2572 
2573 /*************************************************
2574 *          Manage callouts at start of cycle     *
2575 *************************************************/
2576 
2577 /* At the start of a new item in parse_regex() we are able to record the
2578 details of the previous item in a prior callout, and also to set up an
2579 automatic callout if enabled. Avoid having two adjacent automatic callouts,
2580 which would otherwise happen for items such as \Q that contribute nothing to
2581 the parsed pattern.
2582 
2583 Arguments:
2584   ptr              current pattern pointer
2585   pcalloutptr      points to a pointer to previous callout, or NULL
2586   auto_callout     TRUE if auto_callouts are enabled
2587   parsed_pattern   the parsed pattern pointer
2588   cb               compile block
2589 
2590 Returns: possibly updated parsed_pattern pointer.
2591 */
2592 
2593 static uint32_t *
manage_callouts(PCRE2_SPTR ptr,uint32_t ** pcalloutptr,BOOL auto_callout,uint32_t * parsed_pattern,compile_block * cb)2594 manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,
2595   uint32_t *parsed_pattern, compile_block *cb)
2596 {
2597 uint32_t *previous_callout = *pcalloutptr;
2598 
2599 if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr -
2600   cb->start_pattern - (PCRE2_SIZE)previous_callout[1]);
2601 
2602 if (!auto_callout) previous_callout = NULL; else
2603   {
2604   if (previous_callout == NULL ||
2605       previous_callout != parsed_pattern - 4 ||
2606       previous_callout[3] != 255)
2607     {
2608     previous_callout = parsed_pattern;  /* Set up new automatic callout */
2609     parsed_pattern += 4;
2610     previous_callout[0] = META_CALLOUT_NUMBER;
2611     previous_callout[2] = 0;
2612     previous_callout[3] = 255;
2613     }
2614   previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
2615   }
2616 
2617 *pcalloutptr = previous_callout;
2618 return parsed_pattern;
2619 }
2620 
2621 
2622 
2623 /*************************************************
2624 *          Handle \d, \D, \s, \S, \w, \W         *
2625 *************************************************/
2626 
2627 /* This function is called from parse_regex() below, both for freestanding
2628 escapes, and those within classes, to handle those escapes that may change when
2629 Unicode property support is requested. Note that PCRE2_UCP will never be set
2630 without Unicode support because that is checked when pcre2_compile() is called.
2631 
2632 Arguments:
2633   escape          the ESC_... value
2634   parsed_pattern  where to add the code
2635   options         options bits
2636   xoptions        extra options bits
2637 
2638 Returns:          updated value of parsed_pattern
2639 */
2640 static uint32_t *
handle_escdsw(int escape,uint32_t * parsed_pattern,uint32_t options,uint32_t xoptions)2641 handle_escdsw(int escape, uint32_t *parsed_pattern, uint32_t options,
2642   uint32_t xoptions)
2643 {
2644 uint32_t ascii_option = 0;
2645 uint32_t prop = ESC_p;
2646 
2647 switch(escape)
2648   {
2649   case ESC_D:
2650   prop = ESC_P;
2651   /* Fall through */
2652   case ESC_d:
2653   ascii_option = PCRE2_EXTRA_ASCII_BSD;
2654   break;
2655 
2656   case ESC_S:
2657   prop = ESC_P;
2658   /* Fall through */
2659   case ESC_s:
2660   ascii_option = PCRE2_EXTRA_ASCII_BSS;
2661   break;
2662 
2663   case ESC_W:
2664   prop = ESC_P;
2665   /* Fall through */
2666   case ESC_w:
2667   ascii_option = PCRE2_EXTRA_ASCII_BSW;
2668   break;
2669   }
2670 
2671 if ((options & PCRE2_UCP) == 0 || (xoptions & ascii_option) != 0)
2672   {
2673   *parsed_pattern++ = META_ESCAPE + escape;
2674   }
2675 else
2676   {
2677   *parsed_pattern++ = META_ESCAPE + prop;
2678   switch(escape)
2679     {
2680     case ESC_d:
2681     case ESC_D:
2682     *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
2683     break;
2684 
2685     case ESC_s:
2686     case ESC_S:
2687     *parsed_pattern++ = PT_SPACE << 16;
2688     break;
2689 
2690     case ESC_w:
2691     case ESC_W:
2692     *parsed_pattern++ = PT_WORD << 16;
2693     break;
2694     }
2695   }
2696 
2697 return parsed_pattern;
2698 }
2699 
2700 
2701 
2702 /*************************************************
2703 *      Parse regex and identify named groups     *
2704 *************************************************/
2705 
2706 /* This function is called first of all. It scans the pattern and does two
2707 things: (1) It identifies capturing groups and makes a table of named capturing
2708 groups so that information about them is fully available to both the compiling
2709 scans. (2) It writes a parsed version of the pattern with comments omitted and
2710 escapes processed into the parsed_pattern vector.
2711 
2712 Arguments:
2713   ptr             points to the start of the pattern
2714   options         compiling dynamic options (may change during the scan)
2715   has_lookbehind  points to a boolean, set TRUE if a lookbehind is found
2716   cb              pointer to the compile data block
2717 
2718 Returns:   zero on success or a non-zero error code, with the
2719              error offset placed in the cb field
2720 */
2721 
2722 /* A structure and some flags for dealing with nested groups. */
2723 
/* One saved record per nested group.
NOTE(review): several field names match parse_regex() locals (nest_depth,
options, xoptions), so a record presumably snapshots parser state for a group -
confirm against the code that fills it in. */

struct nest_save {
  uint16_t  nest_depth;
  uint16_t  reset_group;
  uint16_t  max_group;
  uint16_t  flags;
  uint32_t  options;
  uint32_t  xoptions;
};

typedef struct nest_save nest_save;

/* Single-bit flags used with nested groups (presumably held in the "flags"
field above - confirm). */

#define NSF_RESET          (1u << 0)
#define NSF_CONDASSERT     (1u << 1)
#define NSF_ATOMICSR       (1u << 2)

/* Options that can change inside a pattern have to be followed while parsing.
A few of them (PCRE2_EXTENDED, for example) are acted on entirely by the
parser, but every one is tracked so that META_OPTIONS items carry the right
values into the main compiling phase. */

#define PARSE_TRACKED_OPTIONS ( \
  PCRE2_CASELESS | \
  PCRE2_DOTALL | \
  PCRE2_DUPNAMES | \
  PCRE2_EXTENDED | \
  PCRE2_EXTENDED_MORE | \
  PCRE2_MULTILINE | \
  PCRE2_NO_AUTO_CAPTURE | \
  PCRE2_UNGREEDY)

#define PARSE_TRACKED_EXTRA_OPTIONS ( \
  PCRE2_EXTRA_CASELESS_RESTRICT | \
  PCRE2_EXTRA_ASCII_BSD | \
  PCRE2_EXTRA_ASCII_BSS | \
  PCRE2_EXTRA_ASCII_BSW | \
  PCRE2_EXTRA_ASCII_DIGIT | \
  PCRE2_EXTRA_ASCII_POSIX)

/* States for analyzing ranges in character classes. The order matters: the
two RANGE_OK_ values have to remain the last two entries. */

enum {
  RANGE_NO,
  RANGE_STARTED,
  RANGE_OK_ESCAPED,
  RANGE_OK_LITERAL
};
2754 
/* Only in 32-bit mode can there be literals > META_END. A macro encapsulates
the storing of literal values in the main parsed pattern, where they can always
be quantified.

Arguments:
  c   the literal value to store
  p   the parsed pattern pointer; it is advanced past what is stored

Both variants also set the okquantifier variable of the enclosing function to
TRUE. Both are wrapped in braces so that the whole expansion is one compound
statement: without the braces the non-32-bit variant would expand to two
separate statements, and under an unbraced "if" or loop only the store would be
controlled while okquantifier would be set unconditionally, unlike the 32-bit
variant. */

#if PCRE2_CODE_UNIT_WIDTH == 32
#define PARSED_LITERAL(c, p) \
  { \
  if (c >= META_END) *p++ = META_BIGVALUE; \
  *p++ = c; \
  okquantifier = TRUE; \
  }
#else
#define PARSED_LITERAL(c, p) \
  { \
  *p++ = c; \
  okquantifier = TRUE; \
  }
#endif
2769 
2770 /* Here's the actual function. */
2771 
parse_regex(PCRE2_SPTR ptr,uint32_t options,BOOL * has_lookbehind,compile_block * cb)2772 static int parse_regex(PCRE2_SPTR ptr, uint32_t options, BOOL *has_lookbehind,
2773   compile_block *cb)
2774 {
2775 uint32_t c;
2776 uint32_t delimiter;
2777 uint32_t namelen;
2778 uint32_t class_range_state;
2779 uint32_t *verblengthptr = NULL;     /* Value avoids compiler warning */
2780 uint32_t *verbstartptr = NULL;
2781 uint32_t *previous_callout = NULL;
2782 uint32_t *parsed_pattern = cb->parsed_pattern;
2783 uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
2784 uint32_t *this_parsed_item = NULL;
2785 uint32_t *prev_parsed_item = NULL;
2786 uint32_t meta_quantifier = 0;
2787 uint32_t add_after_mark = 0;
2788 uint32_t xoptions = cb->cx->extra_options;
2789 uint16_t nest_depth = 0;
2790 int after_manual_callout = 0;
2791 int expect_cond_assert = 0;
2792 int errorcode = 0;
2793 int escape;
2794 int i;
2795 BOOL inescq = FALSE;
2796 BOOL inverbname = FALSE;
2797 BOOL utf = (options & PCRE2_UTF) != 0;
2798 BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;
2799 BOOL isdupname;
2800 BOOL negate_class;
2801 BOOL okquantifier = FALSE;
2802 PCRE2_SPTR thisptr;
2803 PCRE2_SPTR name;
2804 PCRE2_SPTR ptrend = cb->end_pattern;
2805 PCRE2_SPTR verbnamestart = NULL;    /* Value avoids compiler warning */
2806 named_group *ng;
2807 nest_save *top_nest, *end_nests;
2808 
2809 /* Insert leading items for word and line matching (features provided for the
2810 benefit of pcre2grep). */
2811 
2812 if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
2813   {
2814   *parsed_pattern++ = META_CIRCUMFLEX;
2815   *parsed_pattern++ = META_NOCAPTURE;
2816   }
2817 else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
2818   {
2819   *parsed_pattern++ = META_ESCAPE + ESC_b;
2820   *parsed_pattern++ = META_NOCAPTURE;
2821   }
2822 
2823 /* If the pattern is actually a literal string, process it separately to avoid
2824 cluttering up the main loop. */
2825 
2826 if ((options & PCRE2_LITERAL) != 0)
2827   {
2828   while (ptr < ptrend)
2829     {
2830     if (parsed_pattern >= parsed_pattern_end)
2831       {
2832       errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
2833       goto FAILED;
2834       }
2835     thisptr = ptr;
2836     GETCHARINCTEST(c, ptr);
2837     if (auto_callout)
2838       parsed_pattern = manage_callouts(thisptr, &previous_callout,
2839         auto_callout, parsed_pattern, cb);
2840     PARSED_LITERAL(c, parsed_pattern);
2841     }
2842   goto PARSED_END;
2843   }
2844 
2845 /* Process a real regex which may contain meta-characters. */
2846 
2847 top_nest = NULL;
2848 end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
2849 
2850 /* The size of the nest_save structure might not be a factor of the size of the
2851 workspace. Therefore we must round down end_nests so as to correctly avoid
2852 creating a nest_save that spans the end of the workspace. */
2853 
2854 end_nests = (nest_save *)((char *)end_nests -
2855   ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
2856 
2857 /* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */
2858 
2859 if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;
2860 
2861 /* Now scan the pattern */
2862 
2863 while (ptr < ptrend)
2864   {
2865   int prev_expect_cond_assert;
2866   uint32_t min_repeat = 0, max_repeat = 0;
2867   uint32_t set, unset, *optset;
2868   uint32_t xset, xunset, *xoptset;
2869   uint32_t terminator;
2870   uint32_t prev_meta_quantifier;
2871   BOOL prev_okquantifier;
2872   PCRE2_SPTR tempptr;
2873   PCRE2_SIZE offset;
2874 
2875   if (parsed_pattern >= parsed_pattern_end)
2876     {
2877     errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
2878     goto FAILED;
2879     }
2880 
2881   if (nest_depth > cb->cx->parens_nest_limit)
2882     {
2883     errorcode = ERR19;
2884     goto FAILED;        /* Parentheses too deeply nested */
2885     }
2886 
2887   /* If the last time round this loop something was added, parsed_pattern will
2888   no longer be equal to this_parsed_item. Remember where the previous item
2889   started and reset for the next item. Note that sometimes round the loop,
2890   nothing gets added (e.g. for ignored white space). */
2891 
2892   if (this_parsed_item != parsed_pattern)
2893     {
2894     prev_parsed_item = this_parsed_item;
2895     this_parsed_item = parsed_pattern;
2896     }
2897 
2898   /* Get next input character, save its position for callout handling. */
2899 
2900   thisptr = ptr;
2901   GETCHARINCTEST(c, ptr);
2902 
2903   /* Copy quoted literals until \E, allowing for the possibility of automatic
2904   callouts, except when processing a (*VERB) "name".  */
2905 
2906   if (inescq)
2907     {
2908     if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
2909       {
2910       inescq = FALSE;
2911       ptr++;   /* Skip E */
2912       }
2913     else
2914       {
2915       if (expect_cond_assert > 0)   /* A literal is not allowed if we are */
2916         {                           /* expecting a conditional assertion, */
2917         ptr--;                      /* but an empty \Q\E sequence is OK.  */
2918         errorcode = ERR28;
2919         goto FAILED;
2920         }
2921       if (inverbname)
2922         {                          /* Don't use PARSED_LITERAL() because it */
2923 #if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
2924         if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2925 #endif
2926         *parsed_pattern++ = c;
2927         }
2928       else
2929         {
2930         if (after_manual_callout-- <= 0)
2931           parsed_pattern = manage_callouts(thisptr, &previous_callout,
2932             auto_callout, parsed_pattern, cb);
2933         PARSED_LITERAL(c, parsed_pattern);
2934         }
2935       meta_quantifier = 0;
2936       }
2937     continue;  /* Next character */
2938     }
2939 
2940   /* If we are processing the "name" part of a (*VERB:NAME) item, all
2941   characters up to the closing parenthesis are literals except when
2942   PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q
2943   and \E and escaped characters are allowed (no character types such as \d). If
2944   PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do
2945   this by not entering the special (*VERB:NAME) processing - they are then
2946   picked up below. Note that c is a character, not a code unit, so we must not
2947   use MAX_255 to test its size because MAX_255 tests code units and is assumed
2948   TRUE in 8-bit mode. */
2949 
2950   if (inverbname &&
2951        (
2952         /* EITHER: not both options set */
2953         ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) !=
2954                     (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) ||
2955 #ifdef SUPPORT_UNICODE
2956         /* OR: character > 255 AND not Unicode Pattern White Space */
2957         (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) ||
2958 #endif
2959         /* OR: not a # comment or isspace() white space */
2960         (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0
2961 #ifdef SUPPORT_UNICODE
2962         /* and not CHAR_NEL when Unicode is supported */
2963           && c != CHAR_NEL
2964 #endif
2965        )))
2966     {
2967     PCRE2_SIZE verbnamelength;
2968 
2969     switch(c)
2970       {
2971       default:                     /* Don't use PARSED_LITERAL() because it */
2972 #if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
2973       if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2974 #endif
2975       *parsed_pattern++ = c;
2976       break;
2977 
2978       case CHAR_RIGHT_PARENTHESIS:
2979       inverbname = FALSE;
2980       /* This is the length in characters */
2981       verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);
2982       /* But the limit on the length is in code units */
2983       if (ptr - verbnamestart - 1 > (int)MAX_MARK)
2984         {
2985         ptr--;
2986         errorcode = ERR76;
2987         goto FAILED;
2988         }
2989       *verblengthptr = (uint32_t)verbnamelength;
2990 
2991       /* If this name was on a verb such as (*ACCEPT) which does not continue,
2992       a (*MARK) was generated for the name. We now add the original verb as the
2993       next item. */
2994 
2995       if (add_after_mark != 0)
2996         {
2997         *parsed_pattern++ = add_after_mark;
2998         add_after_mark = 0;
2999         }
3000       break;
3001 
3002       case CHAR_BACKSLASH:
3003       if ((options & PCRE2_ALT_VERBNAMES) != 0)
3004         {
3005         escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3006           xoptions, FALSE, cb);
3007         if (errorcode != 0) goto FAILED;
3008         }
3009       else escape = 0;   /* Treat all as literal */
3010 
3011       switch(escape)
3012         {
3013         case 0:                    /* Don't use PARSED_LITERAL() because it */
3014 #if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
3015         if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3016 #endif
3017         *parsed_pattern++ = c;
3018         break;
3019 
3020         case ESC_ub:
3021         *parsed_pattern++ = CHAR_u;
3022         PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3023         break;
3024 
3025         case ESC_Q:
3026         inescq = TRUE;
3027         break;
3028 
3029         case ESC_E:           /* Ignore */
3030         break;
3031 
3032         default:
3033         errorcode = ERR40;    /* Invalid in verb name */
3034         goto FAILED;
3035         }
3036       }
3037     continue;   /* Next character in pattern */
3038     }
3039 
3040   /* Not a verb name character. At this point we must process everything that
3041   must not change the quantification state. This is mainly comments, but we
3042   handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as
3043   A+, as in Perl. An isolated \E is ignored. */
3044 
3045   if (c == CHAR_BACKSLASH && ptr < ptrend)
3046     {
3047     if (*ptr == CHAR_Q || *ptr == CHAR_E)
3048       {
3049       inescq = *ptr == CHAR_Q;
3050       ptr++;
3051       continue;
3052       }
3053     }
3054 
3055   /* Skip over whitespace and # comments in extended mode. Note that c is a
3056   character, not a code unit, so we must not use MAX_255 to test its size
3057   because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The
3058   whitespace characters are those designated as "Pattern White Space" by
3059   Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is
3060   U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a
3061   subset of space characters that match \h and \v. */
3062 
3063   if ((options & PCRE2_EXTENDED) != 0)
3064     {
3065     if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;
3066 #ifdef SUPPORT_UNICODE
3067     if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue;
3068 #endif
3069     if (c == CHAR_NUMBER_SIGN)
3070       {
3071       while (ptr < ptrend)
3072         {
3073         if (IS_NEWLINE(ptr))      /* For non-fixed-length newline cases, */
3074           {                       /* IS_NEWLINE sets cb->nllen. */
3075           ptr += cb->nllen;
3076           break;
3077           }
3078         ptr++;
3079 #ifdef SUPPORT_UNICODE
3080         if (utf) FORWARDCHARTEST(ptr, ptrend);
3081 #endif
3082         }
3083       continue;  /* Next character in pattern */
3084       }
3085     }
3086 
3087   /* Skip over bracketed comments */
3088 
3089   if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 &&
3090       ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
3091     {
3092     while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);
3093     if (ptr >= ptrend)
3094       {
3095       errorcode = ERR18;  /* A special error for missing ) in a comment */
3096       goto FAILED;        /* to make it easier to debug. */
3097       }
3098     ptr++;
3099     continue;  /* Next character in pattern */
3100     }
3101 
3102   /* If the next item is not a quantifier, fill in length of any previous
3103   callout and create an auto callout if required. */
3104 
3105   if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&
3106        (c != CHAR_LEFT_CURLY_BRACKET ||
3107          (tempptr = ptr,
3108          !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))
3109     {
3110     if (after_manual_callout-- <= 0)
3111       {
3112       parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,
3113         parsed_pattern, cb);
3114       this_parsed_item = parsed_pattern;  /* New start for current item */
3115       }
3116     }
3117 
3118   /* If expect_cond_assert is 2, we have just passed (?( and are expecting an
3119   assertion, possibly preceded by a callout. If the value is 1, we have just
3120   had the callout and expect an assertion. There must be at least 3 more
3121   characters in all cases. When expect_cond_assert is 2, we know that the
3122   current character is an opening parenthesis, as otherwise we wouldn't be
3123   here. However, when it is 1, we need to check, and it's easiest just to check
3124   always. Note that expect_cond_assert may be negative, since all callouts just
3125   decrement it. */
3126 
3127   if (expect_cond_assert > 0)
3128     {
3129     BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
3130               (ptr[0] == CHAR_QUESTION_MARK || ptr[0] == CHAR_ASTERISK);
3131     if (ok)
3132       {
3133       if (ptr[0] == CHAR_ASTERISK)  /* New alpha assertion format, possibly */
3134         {
3135         ok = MAX_255(ptr[1]) && (cb->ctypes[ptr[1]] & ctype_lcletter) != 0;
3136         }
3137       else switch(ptr[1])  /* Traditional symbolic format */
3138         {
3139         case CHAR_C:
3140         ok = expect_cond_assert == 2;
3141         break;
3142 
3143         case CHAR_EQUALS_SIGN:
3144         case CHAR_EXCLAMATION_MARK:
3145         break;
3146 
3147         case CHAR_LESS_THAN_SIGN:
3148         ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;
3149         break;
3150 
3151         default:
3152         ok = FALSE;
3153         }
3154       }
3155 
3156     if (!ok)
3157       {
3158       ptr--;   /* Adjust error offset */
3159       errorcode = ERR28;
3160       goto FAILED;
3161       }
3162     }
3163 
3164   /* Remember whether we are expecting a conditional assertion, and set the
3165   default for this item. */
3166 
3167   prev_expect_cond_assert = expect_cond_assert;
3168   expect_cond_assert = 0;
3169 
3170   /* Remember quantification status for the previous significant item, then set
3171   default for this item. */
3172 
3173   prev_okquantifier = okquantifier;
3174   prev_meta_quantifier = meta_quantifier;
3175   okquantifier = FALSE;
3176   meta_quantifier = 0;
3177 
3178   /* If the previous significant item was a quantifier, adjust the parsed code
3179   if there is a following modifier. The base meta value is always followed by
3180   the PLUS and QUERY values, in that order. We do this here rather than after
3181   reading a quantifier so that intervening comments and /x whitespace can be
3182   ignored without having to replicate code. */
3183 
3184   if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS))
3185     {
3186     parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] =
3187       prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?
3188         0x00020000u : 0x00010000u);
3189     continue;  /* Next character in pattern */
3190     }
3191 
3192   /* Process the next item in the main part of a pattern. */
3193 
3194   switch(c)
3195     {
3196     default:              /* Non-special character */
3197     PARSED_LITERAL(c, parsed_pattern);
3198     break;
3199 
3200 
3201     /* ---- Escape sequence ---- */
3202 
3203     case CHAR_BACKSLASH:
3204     tempptr = ptr;
3205     escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3206       xoptions, FALSE, cb);
3207     if (errorcode != 0)
3208       {
3209       ESCAPE_FAILED:
3210       if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3211         goto FAILED;
3212       ptr = tempptr;
3213       if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3214         {
3215         GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
3216         }
3217       escape = 0;                 /* Treat as literal character */
3218       }
3219 
3220     /* The escape was a data escape or literal character. */
3221 
3222     if (escape == 0)
3223       {
3224       PARSED_LITERAL(c, parsed_pattern);
3225       }
3226 
3227     /* The escape was a back (or forward) reference. We keep the offset in
3228     order to give a more useful diagnostic for a bad forward reference. For
3229     references to groups numbered less than 10 we can't use more than two items
3230     in parsed_pattern because they may be just two characters in the input (and
3231     in a 64-bit world an offset may need two elements). So for them, the offset
3232     of the first occurrent is held in a special vector. */
3233 
3234     else if (escape < 0)
3235       {
3236       offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 1);
3237       escape = -escape;
3238       *parsed_pattern++ = META_BACKREF | (uint32_t)escape;
3239       if (escape < 10)
3240         {
3241         if (cb->small_ref_offset[escape] == PCRE2_UNSET)
3242           cb->small_ref_offset[escape] = offset;
3243         }
3244       else
3245         {
3246         PUTOFFSET(offset, parsed_pattern);
3247         }
3248       okquantifier = TRUE;
3249       }
3250 
3251     /* The escape was a character class such as \d etc. or other special
3252     escape indicator such as \A or \X. Most of them generate just a single
3253     parsed item, but \P and \p are followed by a 16-bit type and a 16-bit
3254     value. They are supported only when Unicode is available. The type and
3255     value are packed into a single 32-bit value so that the whole sequence
3256     uses only two elements in the parsed_vector. This is because the same
3257     coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is
3258     set.
3259 
3260     There are also some cases where the escape sequence is followed by a name:
3261     \k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>
3262     and \g'name' are subroutine calls by name; \g{name} is a synonym for
3263     \k{name}. Note that \g<number> and \g'number' are handled by check_escape()
3264     and returned as a negative value (handled above). A name is coded as an
3265     offset into the pattern and a length. */
3266 
3267     else switch (escape)
3268       {
3269       case ESC_C:
3270 #ifdef NEVER_BACKSLASH_C
3271       errorcode = ERR85;
3272       goto ESCAPE_FAILED;
3273 #else
3274       if ((options & PCRE2_NEVER_BACKSLASH_C) != 0)
3275         {
3276         errorcode = ERR83;
3277         goto ESCAPE_FAILED;
3278         }
3279 #endif
3280       okquantifier = TRUE;
3281       *parsed_pattern++ = META_ESCAPE + escape;
3282       break;
3283 
3284       /* This is a special return that happens only in EXTRA_ALT_BSUX mode,
3285       when \u{ is not followed by hex digits and }. It requests two literal
3286       characters, u and { and we need this, as otherwise \u{ 12} (for example)
3287       would be treated as u{12} now that spaces are allowed in quantifiers. */
3288 
3289       case ESC_ub:
3290       *parsed_pattern++ = CHAR_u;
3291       PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3292       break;
3293 
3294       case ESC_X:
3295 #ifndef SUPPORT_UNICODE
3296       errorcode = ERR45;   /* Supported only with Unicode support */
3297       goto ESCAPE_FAILED;
3298 #endif
3299       case ESC_H:
3300       case ESC_h:
3301       case ESC_N:
3302       case ESC_R:
3303       case ESC_V:
3304       case ESC_v:
3305       okquantifier = TRUE;
3306       *parsed_pattern++ = META_ESCAPE + escape;
3307       break;
3308 
3309       default:  /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */
3310       *parsed_pattern++ = META_ESCAPE + escape;
3311       break;
3312 
3313       /* Escapes that may change in UCP mode. */
3314 
3315       case ESC_d:
3316       case ESC_D:
3317       case ESC_s:
3318       case ESC_S:
3319       case ESC_w:
3320       case ESC_W:
3321       okquantifier = TRUE;
3322       parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
3323         xoptions);
3324       break;
3325 
3326       /* Unicode property matching */
3327 
3328       case ESC_P:
3329       case ESC_p:
3330 #ifdef SUPPORT_UNICODE
3331         {
3332         BOOL negated;
3333         uint16_t ptype = 0, pdata = 0;
3334         if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3335           goto ESCAPE_FAILED;
3336         if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3337         *parsed_pattern++ = META_ESCAPE + escape;
3338         *parsed_pattern++ = (ptype << 16) | pdata;
3339         okquantifier = TRUE;
3340         }
3341 #else
3342       errorcode = ERR45;
3343       goto ESCAPE_FAILED;
3344 #endif
3345       break;  /* End \P and \p */
3346 
3347       /* When \g is used with quotes or angle brackets as delimiters, it is a
3348       numerical or named subroutine call, and control comes here. When used
3349       with brace delimiters it is a numerical back reference and does not come
3350       here because check_escape() returns it directly as a reference. \k is
3351       always a named back reference. */
3352 
3353       case ESC_g:
3354       case ESC_k:
3355       if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET &&
3356           *ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE))
3357         {
3358         errorcode = (escape == ESC_g)? ERR57 : ERR69;
3359         goto ESCAPE_FAILED;
3360         }
3361       terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
3362         CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
3363         CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
3364 
3365       /* For a non-braced \g, check for a numerical recursion. */
3366 
3367       if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)
3368         {
3369         PCRE2_SPTR p = ptr + 1;
3370 
3371         if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
3372             &errorcode))
3373           {
3374           if (p >= ptrend || *p != terminator)
3375             {
3376             errorcode = ERR57;
3377             goto ESCAPE_FAILED;
3378             }
3379           ptr = p;
3380           goto SET_RECURSION;
3381           }
3382         if (errorcode != 0) goto ESCAPE_FAILED;
3383         }
3384 
3385       /* Not a numerical recursion. Perl allows spaces and tabs after { and
3386       before } but not for other delimiters. */
3387 
3388       if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
3389           &errorcode, cb)) goto ESCAPE_FAILED;
3390 
3391       /* \k and \g when used with braces are back references, whereas \g used
3392       with quotes or angle brackets is a recursion */
3393 
3394       *parsed_pattern++ =
3395         (escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)?
3396           META_BACKREF_BYNAME : META_RECURSE_BYNAME;
3397       *parsed_pattern++ = namelen;
3398 
3399       PUTOFFSET(offset, parsed_pattern);
3400       okquantifier = TRUE;
3401       break;  /* End special escape processing */
3402       }
3403     break;    /* End escape sequence processing */
3404 
3405 
3406     /* ---- Single-character special items ---- */
3407 
3408     case CHAR_CIRCUMFLEX_ACCENT:
3409     *parsed_pattern++ = META_CIRCUMFLEX;
3410     break;
3411 
3412     case CHAR_DOLLAR_SIGN:
3413     *parsed_pattern++ = META_DOLLAR;
3414     break;
3415 
3416     case CHAR_DOT:
3417     *parsed_pattern++ = META_DOT;
3418     okquantifier = TRUE;
3419     break;
3420 
3421 
3422     /* ---- Single-character quantifiers ---- */
3423 
3424     case CHAR_ASTERISK:
3425     meta_quantifier = META_ASTERISK;
3426     goto CHECK_QUANTIFIER;
3427 
3428     case CHAR_PLUS:
3429     meta_quantifier = META_PLUS;
3430     goto CHECK_QUANTIFIER;
3431 
3432     case CHAR_QUESTION_MARK:
3433     meta_quantifier = META_QUERY;
3434     goto CHECK_QUANTIFIER;
3435 
3436 
3437     /* ---- Potential {n,m} quantifier ---- */
3438 
3439     case CHAR_LEFT_CURLY_BRACKET:
3440     if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,
3441         &errorcode))
3442       {
3443       if (errorcode != 0) goto FAILED;     /* Error in quantifier. */
3444       PARSED_LITERAL(c, parsed_pattern);   /* Not a quantifier */
3445       break;                               /* No more quantifier processing */
3446       }
3447     meta_quantifier = META_MINMAX;
3448     /* Fall through */
3449 
3450 
3451     /* ---- Quantifier post-processing ---- */
3452 
3453     /* Check that a quantifier is allowed after the previous item. This
3454     guarantees that there is a previous item. */
3455 
3456     CHECK_QUANTIFIER:
3457     if (!prev_okquantifier)
3458       {
3459       errorcode = ERR9;
3460       goto FAILED_BACK;
3461       }
3462 
3463     /* Most (*VERB)s are not allowed to be quantified, but an ungreedy
3464     quantifier can be useful for (*ACCEPT) - meaning "succeed on backtrack", a
3465     sort of negated (*COMMIT). We therefore allow (*ACCEPT) to be quantified by
3466     wrapping it in non-capturing brackets, but we have to allow for a preceding
3467     (*MARK) for when (*ACCEPT) has an argument. */
3468 
3469     if (*prev_parsed_item == META_ACCEPT)
3470       {
3471       uint32_t *p;
3472       for (p = parsed_pattern - 1; p >= verbstartptr; p--) p[1] = p[0];
3473       *verbstartptr = META_NOCAPTURE;
3474       parsed_pattern[1] = META_KET;
3475       parsed_pattern += 2;
3476       }
3477 
3478     /* Now we can put the quantifier into the parsed pattern vector. At this
3479     stage, we have only the basic quantifier. The check for a following + or ?
3480     modifier happens at the top of the loop, after any intervening comments
3481     have been removed. */
3482 
3483     *parsed_pattern++ = meta_quantifier;
3484     if (c == CHAR_LEFT_CURLY_BRACKET)
3485       {
3486       *parsed_pattern++ = min_repeat;
3487       *parsed_pattern++ = max_repeat;
3488       }
3489     break;
3490 
3491 
3492     /* ---- Character class ---- */
3493 
3494     case CHAR_LEFT_SQUARE_BRACKET:
3495     okquantifier = TRUE;
3496 
3497     /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
3498     used for "start of word" and "end of word". As these are otherwise illegal
3499     sequences, we don't break anything by recognizing them. They are replaced
3500     by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
3501     erroneous and are handled by the normal code below. */
3502 
3503     if (ptrend - ptr >= 6 &&
3504          (PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 ||
3505           PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0))
3506       {
3507       *parsed_pattern++ = META_ESCAPE + ESC_b;
3508 
3509       if (ptr[2] == CHAR_LESS_THAN_SIGN)
3510         {
3511         *parsed_pattern++ = META_LOOKAHEAD;
3512         }
3513       else
3514         {
3515         *parsed_pattern++ = META_LOOKBEHIND;
3516         *has_lookbehind = TRUE;
3517 
3518         /* The offset is used only for the "non-fixed length" error; this won't
3519         occur here, so just store zero. */
3520 
3521         PUTOFFSET((PCRE2_SIZE)0, parsed_pattern);
3522         }
3523 
3524       if ((options & PCRE2_UCP) == 0)
3525         *parsed_pattern++ = META_ESCAPE + ESC_w;
3526       else
3527         {
3528         *parsed_pattern++ = META_ESCAPE + ESC_p;
3529         *parsed_pattern++ = PT_WORD << 16;
3530         }
3531       *parsed_pattern++ = META_KET;
3532       ptr += 6;
3533       break;
3534       }
3535 
3536     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3537     they are encountered at the top level, so we'll do that too. */
3538 
3539     if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3540          *ptr == CHAR_EQUALS_SIGN) &&
3541         check_posix_syntax(ptr, ptrend, &tempptr))
3542       {
3543       errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;
3544       goto FAILED;
3545       }
3546 
3547     /* Process a regular character class. If the first character is '^', set
3548     the negation flag. If the first few characters (either before or after ^)
3549     are \Q\E or \E or space or tab in extended-more mode, we skip them too.
3550     This makes for compatibility with Perl. */
3551 
3552     negate_class = FALSE;
3553     while (ptr < ptrend)
3554       {
3555       GETCHARINCTEST(c, ptr);
3556       if (c == CHAR_BACKSLASH)
3557         {
3558         if (ptr < ptrend && *ptr == CHAR_E) ptr++;
3559         else if (ptrend - ptr >= 3 &&
3560              PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3561           ptr += 3;
3562         else
3563           break;
3564         }
3565       else if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3566                (c == CHAR_SPACE || c == CHAR_HT))  /* Note: just these two */
3567         continue;
3568       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3569         negate_class = TRUE;
3570       else break;
3571       }
3572 
3573     /* Now the real contents of the class; c has the first "real" character.
3574     Empty classes are permitted only if the option is set. */
3575 
3576     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3577         (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)
3578       {
3579       *parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;
3580       break;  /* End of class processing */
3581       }
3582 
3583     /* Process a non-empty class. */
3584 
3585     *parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;
3586     class_range_state = RANGE_NO;
3587 
3588     /* In an EBCDIC environment, Perl treats alphabetic ranges specially
3589     because there are holes in the encoding, and simply using the range A-Z
3590     (for example) would include the characters in the holes. This applies only
3591     to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]
3592     in this respect. In order to accommodate this, we keep track of whether
3593     character values are literal or not, and a state variable for handling
3594     ranges. */
3595 
3596     /* Loop for the contents of the class */
3597 
3598     for (;;)
3599       {
3600       BOOL char_is_literal = TRUE;
3601 
3602       /* Inside \Q...\E everything is literal except \E */
3603 
3604       if (inescq)
3605         {
3606         if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3607           {
3608           inescq = FALSE;                   /* Reset literal state */
3609           ptr++;                            /* Skip the 'E' */
3610           goto CLASS_CONTINUE;
3611           }
3612         goto CLASS_LITERAL;
3613         }
3614 
3615       /* Skip over space and tab (only) in extended-more mode. */
3616 
3617       if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3618           (c == CHAR_SPACE || c == CHAR_HT))
3619         goto CLASS_CONTINUE;
3620 
3621       /* Handle POSIX class names. Perl allows a negation extension of the
3622       form [:^name:]. A square bracket that doesn't match the syntax is
3623       treated as a literal. We also recognize the POSIX constructions
3624       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3625       5.6 and 5.8 do. */
3626 
3627       if (c == CHAR_LEFT_SQUARE_BRACKET &&
3628           ptrend - ptr >= 3 &&
3629           (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3630            *ptr == CHAR_EQUALS_SIGN) &&
3631           check_posix_syntax(ptr, ptrend, &tempptr))
3632         {
3633         BOOL posix_negate = FALSE;
3634         int posix_class;
3635 
3636         /* Perl treats a hyphen before a POSIX class as a literal, not the
3637         start of a range. However, it gives a warning in its warning mode. PCRE
3638         does not have a warning mode, so we give an error, because this is
3639         likely an error on the user's part. */
3640 
3641         if (class_range_state == RANGE_STARTED)
3642           {
3643           errorcode = ERR50;
3644           goto FAILED;
3645           }
3646 
3647         if (*ptr != CHAR_COLON)
3648           {
3649           errorcode = ERR13;
3650           goto FAILED_BACK;
3651           }
3652 
3653         if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
3654           {
3655           posix_negate = TRUE;
3656           ptr++;
3657           }
3658 
3659         posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3660         if (posix_class < 0)
3661           {
3662           errorcode = ERR30;
3663           goto FAILED;
3664           }
3665         ptr = tempptr + 2;
3666 
3667         /* Perl treats a hyphen after a POSIX class as a literal, not the
3668         start of a range. However, it gives a warning in its warning mode
3669         unless the hyphen is the last character in the class. PCRE does not
3670         have a warning mode, so we give an error, because this is likely an
3671         error on the user's part. */
3672 
3673         if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3674             ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3675           {
3676           errorcode = ERR50;
3677           goto FAILED;
3678           }
3679 
3680         /* Set "a hyphen is not the start of a range" for the -] case, and also
3681         in case the POSIX class is followed by \E or \Q\E (possibly repeated -
3682         fuzzers do that kind of thing) and *then* a hyphen. This causes that
3683         hyphen to be treated as a literal. I don't think it's worth setting up
3684         special apparatus to do otherwise. */
3685 
3686         class_range_state = RANGE_NO;
3687 
3688         /* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some
3689         of the POSIX classes are converted to use Unicode properties \p or \P
3690         or, in one case, \h or \H. The substitutes table has two values per
3691         class, containing the type and value of a \p or \P item. The special
3692         cases are specified with a negative type: a non-zero value causes \h or
3693         \H to be used, and a zero value falls through to behave like a non-UCP
3694         POSIX class. There are now also some extra options that force ASCII for
3695         some classes. */
3696 
3697 #ifdef SUPPORT_UNICODE
3698         if ((options & PCRE2_UCP) != 0 &&
3699             (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0 &&
3700             !((xoptions & PCRE2_EXTRA_ASCII_DIGIT) != 0 &&
3701               (posix_class == PC_DIGIT || posix_class == PC_XDIGIT)))
3702           {
3703           int ptype = posix_substitutes[2*posix_class];
3704           int pvalue = posix_substitutes[2*posix_class + 1];
3705 
3706           if (ptype >= 0)
3707             {
3708             *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);
3709             *parsed_pattern++ = (ptype << 16) | pvalue;
3710             goto CLASS_CONTINUE;
3711             }
3712 
3713           if (pvalue != 0)
3714             {
3715             *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h);
3716             goto CLASS_CONTINUE;
3717             }
3718 
3719           /* Fall through */
3720           }
3721 #endif  /* SUPPORT_UNICODE */
3722 
3723         /* Non-UCP POSIX class */
3724 
3725         *parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;
3726         *parsed_pattern++ = posix_class;
3727         }
3728 
3729       /* Handle potential start of range */
3730 
3731       else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)
3732         {
3733         *parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?
3734           META_RANGE_LITERAL : META_RANGE_ESCAPED;
3735         class_range_state = RANGE_STARTED;
3736         }
3737 
3738       /* Handle a literal character */
3739 
3740       else if (c != CHAR_BACKSLASH)
3741         {
3742         CLASS_LITERAL:
3743         if (class_range_state == RANGE_STARTED)
3744           {
3745           if (c == parsed_pattern[-2])       /* Optimize one-char range */
3746             parsed_pattern--;
3747           else if (parsed_pattern[-2] > c)   /* Check range is in order */
3748             {
3749             errorcode = ERR8;
3750             goto FAILED_BACK;
3751             }
3752           else
3753             {
3754             if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL)
3755               parsed_pattern[-1] = META_RANGE_ESCAPED;
3756             PARSED_LITERAL(c, parsed_pattern);
3757             }
3758           class_range_state = RANGE_NO;
3759           }
3760         else  /* Potential start of range */
3761           {
3762           class_range_state = char_is_literal?
3763             RANGE_OK_LITERAL : RANGE_OK_ESCAPED;
3764           PARSED_LITERAL(c, parsed_pattern);
3765           }
3766         }
3767 
3768       /* Handle escapes in a class */
3769 
3770       else
3771         {
3772         tempptr = ptr;
3773         escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3774           xoptions, TRUE, cb);
3775 
3776         if (errorcode != 0)
3777           {
3778           if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3779             goto FAILED;
3780           ptr = tempptr;
3781           if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3782             {
3783             GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
3784             }
3785           escape = 0;                 /* Treat as literal character */
3786           }
3787 
3788         switch(escape)
3789           {
3790           case 0:  /* Escaped character code point is in c */
3791           char_is_literal = FALSE;
3792           goto CLASS_LITERAL;      /* (a few lines above) */
3793 
3794           case ESC_b:
3795           c = CHAR_BS;    /* \b is backspace in a class */
3796           char_is_literal = FALSE;
3797           goto CLASS_LITERAL;
3798 
3799           case ESC_Q:
3800           inescq = TRUE;  /* Enter literal mode */
3801           goto CLASS_CONTINUE;
3802 
3803           case ESC_E:     /* Ignore orphan \E */
3804           goto CLASS_CONTINUE;
3805 
3806           case ESC_B:     /* Always an error in a class */
3807           case ESC_R:
3808           case ESC_X:
3809           errorcode = ERR7;
3810           ptr--;
3811           goto FAILED;
3812           }
3813 
3814         /* The second part of a range can be a single-character escape
3815         sequence (detected above), but not any of the other escapes. Perl
3816         treats a hyphen as a literal in such circumstances. However, in Perl's
3817         warning mode, a warning is given, so PCRE now faults it, as it is
3818         almost certainly a mistake on the user's part. */
3819 
3820         if (class_range_state == RANGE_STARTED)
3821           {
3822           errorcode = ERR50;
3823           goto FAILED;  /* Not CLASS_ESCAPE_FAILED; always an error */
3824           }
3825 
3826         /* Of the remaining escapes, only those that define characters are
3827         allowed in a class. None may start a range. */
3828 
3829         class_range_state = RANGE_NO;
3830         switch(escape)
3831           {
3832           case ESC_N:
3833           errorcode = ERR71;
3834           goto FAILED;
3835 
3836           case ESC_H:
3837           case ESC_h:
3838           case ESC_V:
3839           case ESC_v:
3840           *parsed_pattern++ = META_ESCAPE + escape;
3841           break;
3842 
3843           /* These escapes may be converted to Unicode property tests when
3844           PCRE2_UCP is set. */
3845 
3846           case ESC_d:
3847           case ESC_D:
3848           case ESC_s:
3849           case ESC_S:
3850           case ESC_w:
3851           case ESC_W:
3852           parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
3853             xoptions);
3854           break;
3855 
3856           /* Explicit Unicode property matching */
3857 
3858           case ESC_P:
3859           case ESC_p:
3860 #ifdef SUPPORT_UNICODE
3861             {
3862             BOOL negated;
3863             uint16_t ptype = 0, pdata = 0;
3864             if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3865               goto FAILED;
3866             if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3867             *parsed_pattern++ = META_ESCAPE + escape;
3868             *parsed_pattern++ = (ptype << 16) | pdata;
3869             }
3870 #else
3871           errorcode = ERR45;
3872           goto FAILED;
3873 #endif
3874           break;  /* End \P and \p */
3875 
3876           default:    /* All others are not allowed in a class */
3877           errorcode = ERR7;
3878           ptr--;
3879           goto FAILED;
3880           }
3881 
3882         /* Perl gives a warning unless a following hyphen is the last character
3883         in the class. PCRE throws an error. */
3884 
3885         if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3886             ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3887           {
3888           errorcode = ERR50;
3889           goto FAILED;
3890           }
3891         }
3892 
3893       /* Proceed to next thing in the class. */
3894 
3895       CLASS_CONTINUE:
3896       if (ptr >= ptrend)
3897         {
3898         errorcode = ERR6;  /* Missing terminating ']' */
3899         goto FAILED;
3900         }
3901       GETCHARINCTEST(c, ptr);
3902       if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
3903       }     /* End of class-processing loop */
3904 
3905     /* -] at the end of a class is a literal '-' */
3906 
3907     if (class_range_state == RANGE_STARTED)
3908       {
3909       parsed_pattern[-1] = CHAR_MINUS;
3910       class_range_state = RANGE_NO;
3911       }
3912 
3913     *parsed_pattern++ = META_CLASS_END;
3914     break;  /* End of character class */
3915 
3916 
3917     /* ---- Opening parenthesis ---- */
3918 
3919     case CHAR_LEFT_PARENTHESIS:
3920     if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3921 
3922     /* If ( is not followed by ? it is either a capture or a special verb or an
3923     alpha assertion or a positive non-atomic lookahead. */
3924 
3925     if (*ptr != CHAR_QUESTION_MARK)
3926       {
3927       const char *vn;
3928 
3929       /* Handle capturing brackets (or non-capturing if auto-capture is turned
3930       off). */
3931 
3932       if (*ptr != CHAR_ASTERISK)
3933         {
3934         nest_depth++;
3935         if ((options & PCRE2_NO_AUTO_CAPTURE) == 0)
3936           {
3937           if (cb->bracount >= MAX_GROUP_NUMBER)
3938             {
3939             errorcode = ERR97;
3940             goto FAILED;
3941             }
3942           cb->bracount++;
3943           *parsed_pattern++ = META_CAPTURE | cb->bracount;
3944           }
3945         else *parsed_pattern++ = META_NOCAPTURE;
3946         }
3947 
3948       /* Do nothing for (* followed by end of pattern or ) so it gives a "bad
3949       quantifier" error rather than "(*MARK) must have an argument". */
3950 
3951       else if (ptrend - ptr <= 1 || (c = ptr[1]) == CHAR_RIGHT_PARENTHESIS)
3952         break;
3953 
3954       /* Handle "alpha assertions" such as (*pla:...). Most of these are
3955       synonyms for the historical symbolic assertions, but the script run and
3956       non-atomic lookaround ones are new. They are distinguished by starting
3957       with a lower case letter. Checking both ends of the alphabet makes this
3958       work in all character codes. */
3959 
3960       else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
3961         {
3962         uint32_t meta;
3963 
3964         vn = alasnames;
3965         if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
3966           &errorcode, cb)) goto FAILED;
3967         if (ptr >= ptrend || *ptr != CHAR_COLON)
3968           {
3969           errorcode = ERR95;  /* Malformed */
3970           goto FAILED;
3971           }
3972 
3973         /* Scan the table of alpha assertion names */
3974 
3975         for (i = 0; i < alascount; i++)
3976           {
3977           if (namelen == alasmeta[i].len &&
3978               PRIV(strncmp_c8)(name, vn, namelen) == 0)
3979             break;
3980           vn += alasmeta[i].len + 1;
3981           }
3982 
3983         if (i >= alascount)
3984           {
3985           errorcode = ERR95;  /* Alpha assertion not recognized */
3986           goto FAILED;
3987           }
3988 
3989         /* Check for expecting an assertion condition. If so, only atomic
3990         lookaround assertions are valid. */
3991 
3992         meta = alasmeta[i].meta;
3993         if (prev_expect_cond_assert > 0 &&
3994             (meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT))
3995           {
3996           errorcode = (meta == META_LOOKAHEAD_NA || meta == META_LOOKBEHIND_NA)?
3997             ERR98 : ERR28;  /* (Atomic) assertion expected */
3998           goto FAILED;
3999           }
4000 
4001         /* The lookaround alphabetic synonyms can mostly be handled by jumping
4002         to the code that handles the traditional symbolic forms. */
4003 
4004         switch(meta)
4005           {
4006           default:
4007           errorcode = ERR89;  /* Unknown code; should never occur because */
4008           goto FAILED;        /* the meta values come from a table above. */
4009 
4010           case META_ATOMIC:
4011           goto ATOMIC_GROUP;
4012 
4013           case META_LOOKAHEAD:
4014           goto POSITIVE_LOOK_AHEAD;
4015 
4016           case META_LOOKAHEAD_NA:
4017           goto POSITIVE_NONATOMIC_LOOK_AHEAD;
4018 
4019           case META_LOOKAHEADNOT:
4020           goto NEGATIVE_LOOK_AHEAD;
4021 
4022           case META_LOOKBEHIND:
4023           case META_LOOKBEHINDNOT:
4024           case META_LOOKBEHIND_NA:
4025           *parsed_pattern++ = meta;
4026           ptr--;
4027           goto POST_LOOKBEHIND;
4028 
4029           /* The script run facilities are handled here. Unicode support is
4030           required (give an error if not, as this is a security issue). Always
4031           record a META_SCRIPT_RUN item. Then, for the atomic version, insert
4032           META_ATOMIC and remember that we need two META_KETs at the end. */
4033 
4034           case META_SCRIPT_RUN:
4035           case META_ATOMIC_SCRIPT_RUN:
4036 #ifdef SUPPORT_UNICODE
4037           *parsed_pattern++ = META_SCRIPT_RUN;
4038           nest_depth++;
4039           ptr++;
4040           if (meta == META_ATOMIC_SCRIPT_RUN)
4041             {
4042             *parsed_pattern++ = META_ATOMIC;
4043             if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4044             else if (++top_nest >= end_nests)
4045               {
4046               errorcode = ERR84;
4047               goto FAILED;
4048               }
4049             top_nest->nest_depth = nest_depth;
4050             top_nest->flags = NSF_ATOMICSR;
4051             top_nest->options = options & PARSE_TRACKED_OPTIONS;
4052             top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4053             }
4054           break;
4055 #else  /* SUPPORT_UNICODE */
4056           errorcode = ERR96;
4057           goto FAILED;
4058 #endif
4059           }
4060         }
4061 
4062 
4063       /* ---- Handle (*VERB) and (*VERB:NAME) ---- */
4064 
4065       else
4066         {
4067         vn = verbnames;
4068         if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
4069           &errorcode, cb)) goto FAILED;
4070         if (ptr >= ptrend || (*ptr != CHAR_COLON &&
4071                               *ptr != CHAR_RIGHT_PARENTHESIS))
4072           {
4073           errorcode = ERR60;  /* Malformed */
4074           goto FAILED;
4075           }
4076 
4077         /* Scan the table of verb names */
4078 
4079         for (i = 0; i < verbcount; i++)
4080           {
4081           if (namelen == verbs[i].len &&
4082               PRIV(strncmp_c8)(name, vn, namelen) == 0)
4083             break;
4084           vn += verbs[i].len + 1;
4085           }
4086 
4087         if (i >= verbcount)
4088           {
4089           errorcode = ERR60;  /* Verb not recognized */
4090           goto FAILED;
4091           }
4092 
4093         /* An empty argument is treated as no argument. */
4094 
4095         if (*ptr == CHAR_COLON && ptr + 1 < ptrend &&
4096              ptr[1] == CHAR_RIGHT_PARENTHESIS)
4097           ptr++;    /* Advance to the closing parens */
4098 
4099         /* Check for mandatory non-empty argument; this is (*MARK) */
4100 
4101         if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON)
4102           {
4103           errorcode = ERR66;
4104           goto FAILED;
4105           }
4106 
4107         /* Remember where this verb, possibly with a preceding (*MARK), starts,
4108         for handling quantified (*ACCEPT). */
4109 
4110         verbstartptr = parsed_pattern;
4111         okquantifier = (verbs[i].meta == META_ACCEPT);
4112 
4113         /* It appears that Perl allows any characters whatsoever, other than a
4114         closing parenthesis, to appear in arguments ("names"), so we no longer
4115         insist on letters, digits, and underscores. Perl does not, however, do
4116         any interpretation within arguments, and has no means of including a
4117         closing parenthesis. PCRE supports escape processing but only when it
4118         is requested by an option. We set inverbname TRUE here, and let the
4119         main loop take care of this so that escape and \x processing is done by
4120         the main code above. */
4121 
4122         if (*ptr++ == CHAR_COLON)   /* Skip past : or ) */
4123           {
4124           /* Some optional arguments can be treated as a preceding (*MARK) */
4125 
4126           if (verbs[i].has_arg < 0)
4127             {
4128             add_after_mark = verbs[i].meta;
4129             *parsed_pattern++ = META_MARK;
4130             }
4131 
4132           /* The remaining verbs with arguments (except *MARK) need a different
4133           opcode. */
4134 
4135           else
4136             {
4137             *parsed_pattern++ = verbs[i].meta +
4138               ((verbs[i].meta != META_MARK)? 0x00010000u:0);
4139             }
4140 
4141           /* Set up for reading the name in the main loop. */
4142 
4143           verblengthptr = parsed_pattern++;
4144           verbnamestart = ptr;
4145           inverbname = TRUE;
4146           }
4147         else  /* No verb "name" argument */
4148           {
4149           *parsed_pattern++ = verbs[i].meta;
4150           }
4151         }     /* End of (*VERB) handling */
4152       break;  /* Done with this parenthesis */
4153       }       /* End of groups that don't start with (? */
4154 
4155 
4156     /* ---- Items starting (? ---- */
4157 
4158     /* The type of item is determined by what follows (?. Handle (?| and option
4159     changes under "default" because both need a new block on the nest stack.
4160     Comments starting with (?# are handled above. Note that there is some
4161     ambiguity about the sequence (?- because if a digit follows it's a relative
4162     recursion or subroutine call whereas otherwise it's an option unsetting. */
4163 
4164     if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4165 
4166     switch(*ptr)
4167       {
4168       default:
4169       if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1]))
4170         goto RECURSION_BYNUMBER;  /* The + case is handled by CHAR_PLUS */
4171 
4172       /* We now have either (?| or a (possibly empty) option setting,
4173       optionally followed by a non-capturing group. */
4174 
4175       nest_depth++;
4176       if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4177       else if (++top_nest >= end_nests)
4178         {
4179         errorcode = ERR84;
4180         goto FAILED;
4181         }
4182       top_nest->nest_depth = nest_depth;
4183       top_nest->flags = 0;
4184       top_nest->options = options & PARSE_TRACKED_OPTIONS;
4185       top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4186 
4187       /* Start of non-capturing group that resets the capture count for each
4188       branch. */
4189 
4190       if (*ptr == CHAR_VERTICAL_LINE)
4191         {
4192         top_nest->reset_group = (uint16_t)cb->bracount;
4193         top_nest->max_group = (uint16_t)cb->bracount;
4194         top_nest->flags |= NSF_RESET;
4195         cb->external_flags |= PCRE2_DUPCAPUSED;
4196         *parsed_pattern++ = META_NOCAPTURE;
4197         ptr++;
4198         }
4199 
4200       /* Scan for options imnrsxJU and a, aD, aP, aS, aT, aW to be set or unset. */
4201 
4202       else
4203         {
4204         BOOL hyphenok = TRUE;
4205         uint32_t oldoptions = options;
4206         uint32_t oldxoptions = xoptions;
4207 
4208         top_nest->reset_group = 0;
4209         top_nest->max_group = 0;
4210         set = unset = 0;
4211         optset = &set;
4212         xset = xunset = 0;
4213         xoptset = &xset;
4214 
4215         /* ^ at the start unsets irmnsx and disables the subsequent use of - */
4216 
4217         if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)
4218           {
4219           options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE|
4220                        PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE);
4221           xoptions &= ~(PCRE2_EXTRA_CASELESS_RESTRICT);
4222           hyphenok = FALSE;
4223           ptr++;
4224           }
4225 
4226         while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&
4227                                *ptr != CHAR_COLON)
4228           {
4229           switch (*ptr++)
4230             {
4231             case CHAR_MINUS:
4232             if (!hyphenok)
4233               {
4234               errorcode = ERR94;
4235               ptr--;  /* Correct the offset */
4236               goto FAILED;
4237               }
4238             optset = &unset;
4239             xoptset = &xunset;
4240             hyphenok = FALSE;
4241             break;
4242 
4243             /* There are some two-character sequences that start with 'a'. */
4244 
4245             case CHAR_a:
4246             if (ptr < ptrend)
4247               {
4248               if (*ptr == CHAR_D)
4249                 {
4250                 *xoptset |= PCRE2_EXTRA_ASCII_BSD;
4251                 ptr++;
4252                 break;
4253                 }
4254               if (*ptr == CHAR_P)
4255                 {
4256                 *xoptset |= (PCRE2_EXTRA_ASCII_POSIX|PCRE2_EXTRA_ASCII_DIGIT);
4257                 ptr++;
4258                 break;
4259                 }
4260               if (*ptr == CHAR_S)
4261                 {
4262                 *xoptset |= PCRE2_EXTRA_ASCII_BSS;
4263                 ptr++;
4264                 break;
4265                 }
4266               if (*ptr == CHAR_T)
4267                 {
4268                 *xoptset |= PCRE2_EXTRA_ASCII_DIGIT;
4269                 ptr++;
4270                 break;
4271                 }
4272               if (*ptr == CHAR_W)
4273                 {
4274                 *xoptset |= PCRE2_EXTRA_ASCII_BSW;
4275                 ptr++;
4276                 break;
4277                 }
4278               }
4279             *xoptset |= PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|
4280                         PCRE2_EXTRA_ASCII_BSW|
4281                         PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_ASCII_POSIX;
4282             break;
4283 
4284             case CHAR_J:  /* Record that it changed in the external options */
4285             *optset |= PCRE2_DUPNAMES;
4286             cb->external_flags |= PCRE2_JCHANGED;
4287             break;
4288 
4289             case CHAR_i: *optset |= PCRE2_CASELESS; break;
4290             case CHAR_m: *optset |= PCRE2_MULTILINE; break;
4291             case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break;
4292             case CHAR_r: *xoptset|= PCRE2_EXTRA_CASELESS_RESTRICT; break;
4293             case CHAR_s: *optset |= PCRE2_DOTALL; break;
4294             case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
4295 
4296             /* If x appears twice it sets the extended extended option. */
4297 
4298             case CHAR_x:
4299             *optset |= PCRE2_EXTENDED;
4300             if (ptr < ptrend && *ptr == CHAR_x)
4301               {
4302               *optset |= PCRE2_EXTENDED_MORE;
4303               ptr++;
4304               }
4305             break;
4306 
4307             default:
4308             errorcode = ERR11;
4309             ptr--;    /* Correct the offset */
4310             goto FAILED;
4311             }
4312           }
4313 
4314         /* If we are setting extended without extended-more, ensure that any
4315         existing extended-more gets unset. Also, unsetting extended must also
4316         unset extended-more. */
4317 
4318         if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED ||
4319             (unset & PCRE2_EXTENDED) != 0)
4320           unset |= PCRE2_EXTENDED_MORE;
4321 
4322         options = (options | set) & (~unset);
4323         xoptions = (xoptions | xset) & (~xunset);
4324 
4325         /* If the options ended with ')' this is not the start of a nested
4326         group with option changes, so the options change at this level.
4327         In this case, if the previous level set up a nest block, discard the
4328         one we have just created. Otherwise adjust it for the previous level.
4329         If the options ended with ':' we are starting a non-capturing group,
4330         possibly with an options setting. */
4331 
4332         if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4333         if (*ptr++ == CHAR_RIGHT_PARENTHESIS)
4334           {
4335           nest_depth--;  /* This is not a nested group after all. */
4336           if (top_nest > (nest_save *)(cb->start_workspace) &&
4337               (top_nest-1)->nest_depth == nest_depth) top_nest--;
4338           else top_nest->nest_depth = nest_depth;
4339           }
4340         else *parsed_pattern++ = META_NOCAPTURE;
4341 
4342         /* If nothing changed, no need to record. */
4343 
4344         if (options != oldoptions || xoptions != oldxoptions)
4345           {
4346           *parsed_pattern++ = META_OPTIONS;
4347           *parsed_pattern++ = options;
4348           *parsed_pattern++ = xoptions;
4349           }
4350         }     /* End options processing */
4351       break;  /* End default case after (? */
4352 
4353 
4354       /* ---- Python syntax support ---- */
4355 
4356       case CHAR_P:
4357       if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4358 
4359       /* (?P<name> is the same as (?<name>, which defines a named group. */
4360 
4361       if (*ptr == CHAR_LESS_THAN_SIGN)
4362         {
4363         terminator = CHAR_GREATER_THAN_SIGN;
4364         goto DEFINE_NAME;
4365         }
4366 
4367       /* (?P>name) is the same as (?&name), which is a recursion or subroutine
4368       call. */
4369 
4370       if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME;
4371 
4372       /* (?P=name) is the same as \k<name>, a back reference by name. Anything
4373       else after (?P is an error. */
4374 
4375       if (*ptr != CHAR_EQUALS_SIGN)
4376         {
4377         errorcode = ERR41;
4378         goto FAILED;
4379         }
4380       if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4381           &namelen, &errorcode, cb)) goto FAILED;
4382       *parsed_pattern++ = META_BACKREF_BYNAME;
4383       *parsed_pattern++ = namelen;
4384       PUTOFFSET(offset, parsed_pattern);
4385       okquantifier = TRUE;
4386       break;   /* End of (?P processing */
4387 
4388 
4389       /* ---- Recursion/subroutine calls by number ---- */
4390 
4391       case CHAR_R:
4392       i = 0;         /* (?R) == (?R0) */
4393       ptr++;
4394       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4395         {
4396         errorcode = ERR58;
4397         goto FAILED;
4398         }
4399       goto SET_RECURSION;
4400 
4401       /* An item starting (?- followed by a digit comes here via the "default"
4402       case because (?- followed by a non-digit is an options setting. */
4403 
4404       case CHAR_PLUS:
4405       if (ptrend - ptr < 2 || !IS_DIGIT(ptr[1]))
4406         {
4407         errorcode = ERR29;   /* Missing number */
4408         goto FAILED;
4409         }
4410       /* Fall through */
4411 
4412       case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
4413       case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
4414       RECURSION_BYNUMBER:
4415       if (!read_number(&ptr, ptrend,
4416           (IS_DIGIT(*ptr))? -1:(int)(cb->bracount), /* + and - are relative */
4417           MAX_GROUP_NUMBER, ERR61,
4418           &i, &errorcode)) goto FAILED;
4419       if (i < 0)  /* NB (?0) is permitted */
4420         {
4421         errorcode = ERR15;   /* Unknown group */
4422         goto FAILED_BACK;
4423         }
4424       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4425         goto UNCLOSED_PARENTHESIS;
4426 
4427       SET_RECURSION:
4428       *parsed_pattern++ = META_RECURSE | (uint32_t)i;
4429       offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4430       ptr++;
4431       PUTOFFSET(offset, parsed_pattern);
4432       okquantifier = TRUE;
4433       break;  /* End of recursive call by number handling */
4434 
4435 
4436       /* ---- Recursion/subroutine calls by name ---- */
4437 
4438       case CHAR_AMPERSAND:
4439       RECURSE_BY_NAME:
4440       if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4441           &namelen, &errorcode, cb)) goto FAILED;
4442       *parsed_pattern++ = META_RECURSE_BYNAME;
4443       *parsed_pattern++ = namelen;
4444       PUTOFFSET(offset, parsed_pattern);
4445       okquantifier = TRUE;
4446       break;
4447 
4448       /* ---- Callout with numerical or string argument ---- */
4449 
4450       case CHAR_C:
4451       if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4452 
4453       /* If the previous item was a condition starting (?(? an assertion,
4454       optionally preceded by a callout, is expected. This is checked later on,
4455       during actual compilation. However we need to identify this kind of
4456       assertion in this pass because it must not be quantified. The value of
4457       expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4458       for a callout - still leaving a positive value that identifies the
4459       assertion. Multiple callouts or any other items will make it zero or
4460       less, which doesn't matter because they will cause an error later. */
4461 
4462       expect_cond_assert = prev_expect_cond_assert - 1;
4463 
4464       /* If previous_callout is not NULL, it means this follows a previous
4465       callout. If it was a manual callout, do nothing; this means its "length
4466       of next pattern item" field will remain zero. If it was an automatic
4467       callout, abolish it. */
4468 
4469       if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 &&
4470           previous_callout == parsed_pattern - 4 &&
4471           parsed_pattern[-1] == 255)
4472         parsed_pattern = previous_callout;
4473 
4474       /* Save for updating next pattern item length, and skip one item before
4475       completing. */
4476 
4477       previous_callout = parsed_pattern;
4478       after_manual_callout = 1;
4479 
4480       /* Handle a string argument; specific delimiter is required. */
4481 
4482       if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
4483         {
4484         PCRE2_SIZE calloutlength;
4485         PCRE2_SPTR startptr = ptr;
4486 
4487         delimiter = 0;
4488         for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
4489           {
4490           if (*ptr == PRIV(callout_start_delims)[i])
4491             {
4492             delimiter = PRIV(callout_end_delims)[i];
4493             break;
4494             }
4495           }
4496         if (delimiter == 0)
4497           {
4498           errorcode = ERR82;
4499           goto FAILED;
4500           }
4501 
4502         *parsed_pattern = META_CALLOUT_STRING;
4503         parsed_pattern += 3;   /* Skip pattern info */
4504 
4505         for (;;)
4506           {
4507           if (++ptr >= ptrend)
4508             {
4509             errorcode = ERR81;
4510             ptr = startptr;   /* To give a more useful message */
4511             goto FAILED;
4512             }
4513           if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter))
4514             break;
4515           }
4516 
4517         calloutlength = (PCRE2_SIZE)(ptr - startptr);
4518         if (calloutlength > UINT32_MAX)
4519           {
4520           errorcode = ERR72;
4521           goto FAILED;
4522           }
4523         *parsed_pattern++ = (uint32_t)calloutlength;
4524         offset = (PCRE2_SIZE)(startptr - cb->start_pattern);
4525         PUTOFFSET(offset, parsed_pattern);
4526         }
4527 
4528       /* Handle a callout with an optional numerical argument, which must be
4529       less than or equal to 255. A missing argument gives 0. */
4530 
4531       else
4532         {
4533         int n = 0;
4534         *parsed_pattern = META_CALLOUT_NUMBER;     /* Numerical callout */
4535         parsed_pattern += 3;                       /* Skip pattern info */
4536         while (ptr < ptrend && IS_DIGIT(*ptr))
4537           {
4538           n = n * 10 + *ptr++ - CHAR_0;
4539           if (n > 255)
4540             {
4541             errorcode = ERR38;
4542             goto FAILED;
4543             }
4544           }
4545         *parsed_pattern++ = n;
4546         }
4547 
4548       /* Both formats must have a closing parenthesis */
4549 
4550       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4551         {
4552         errorcode = ERR39;
4553         goto FAILED;
4554         }
4555       ptr++;
4556 
4557       /* Remember the offset to the next item in the pattern, and set a default
4558       length. This should get updated after the next item is read. */
4559 
4560       previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
4561       previous_callout[2] = 0;
4562       break;                  /* End callout */
4563 
4564 
4565       /* ---- Conditional group ---- */
4566 
4567       /* A condition can be an assertion, a number (referring to a numbered
4568       group's having been set), a name (referring to a named group), or 'R',
4569       referring to overall recursion. R<digits> and R&name are also permitted
4570       for recursion state tests. Numbers may be preceded by + or - to specify a
4571       relative group number.
4572 
4573       There are several syntaxes for testing a named group: (?(name)) is used
4574       by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4575 
4576       There are two unfortunate ambiguities. 'R' can be the recursive thing or
4577       the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be
4578       the Perl DEFINE feature or the Python named test. We look for a name
4579       first; if not found, we try the other case.
4580 
4581       For compatibility with auto-callouts, we allow a callout to be specified
4582       before a condition that is an assertion. */
4583 
4584       case CHAR_LEFT_PARENTHESIS:
4585       if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4586       nest_depth++;
4587 
4588       /* If the next character is ? or * there must be an assertion next
4589       (optionally preceded by a callout). We do not check this here, but
4590       instead we set expect_cond_assert to 2. If this is still greater than
4591       zero (callouts decrement it) when the next assertion is read, it will be
4592       marked as a condition that must not be repeated. A value greater than
4593       zero also causes checking that an assertion (possibly with callout)
4594       follows. */
4595 
4596       if (*ptr == CHAR_QUESTION_MARK || *ptr == CHAR_ASTERISK)
4597         {
4598         *parsed_pattern++ = META_COND_ASSERT;
4599         ptr--;   /* Pull pointer back to the opening parenthesis. */
4600         expect_cond_assert = 2;
4601         break;  /* End of conditional */
4602         }
4603 
4604       /* Handle (?([+-]number)... */
4605 
4606       if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
4607           &errorcode))
4608         {
4609         if (i <= 0)
4610           {
4611           errorcode = ERR15;
4612           goto FAILED;
4613           }
4614         *parsed_pattern++ = META_COND_NUMBER;
4615         offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4616         PUTOFFSET(offset, parsed_pattern);
4617         *parsed_pattern++ = i;
4618         }
4619       else if (errorcode != 0) goto FAILED;   /* Number too big */
4620 
4621       /* No number found. Handle the special case (?(VERSION[>]=n.m)... */
4622 
4623       else if (ptrend - ptr >= 10 &&
4624                PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
4625                ptr[7] != CHAR_RIGHT_PARENTHESIS)
4626         {
4627         uint32_t ge = 0;
4628         int major = 0;
4629         int minor = 0;
4630 
4631         ptr += 7;
4632         if (*ptr == CHAR_GREATER_THAN_SIGN)
4633           {
4634           ge = 1;
4635           ptr++;
4636           }
4637 
4638         /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
4639         references its argument twice. */
4640 
4641         if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
4642           goto BAD_VERSION_CONDITION;
4643 
4644         if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode))
4645           goto FAILED;
4646 
4647         if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4648         if (*ptr == CHAR_DOT)
4649           {
4650           if (++ptr >= ptrend || !IS_DIGIT(*ptr)) goto BAD_VERSION_CONDITION;
4651           minor = (*ptr++ - CHAR_0) * 10;
4652           if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4653           if (IS_DIGIT(*ptr)) minor += *ptr++ - CHAR_0;
4654           if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4655             goto BAD_VERSION_CONDITION;
4656           }
4657 
4658         *parsed_pattern++ = META_COND_VERSION;
4659         *parsed_pattern++ = ge;
4660         *parsed_pattern++ = major;
4661         *parsed_pattern++ = minor;
4662         }
4663 
4664       /* All the remaining cases now require us to read a name. We cannot at
4665       this stage distinguish ambiguous cases such as (?(R12) which might be a
4666       recursion test by number or a name, because the named groups have not yet
4667       all been identified. Those cases are treated as names, but given a
4668       different META code. */
4669 
4670       else
4671         {
4672         BOOL was_r_ampersand = FALSE;
4673 
4674         if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND)
4675           {
4676           terminator = CHAR_RIGHT_PARENTHESIS;
4677           was_r_ampersand = TRUE;
4678           ptr++;
4679           }
4680         else if (*ptr == CHAR_LESS_THAN_SIGN)
4681           terminator = CHAR_GREATER_THAN_SIGN;
4682         else if (*ptr == CHAR_APOSTROPHE)
4683           terminator = CHAR_APOSTROPHE;
4684         else
4685           {
4686           terminator = CHAR_RIGHT_PARENTHESIS;
4687           ptr--;   /* Point to char before name */
4688           }
4689         if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4690             &errorcode, cb)) goto FAILED;
4691 
4692         /* Handle (?(R&name) */
4693 
4694         if (was_r_ampersand)
4695           {
4696           *parsed_pattern = META_COND_RNAME;
4697           ptr--;   /* Back to closing parens */
4698           }
4699 
4700         /* Handle (?(name). If the name is "DEFINE" we identify it with a
4701         special code. Likewise if the name consists of R followed only by
4702         digits. Otherwise, handle it like a quoted name. */
4703 
4704         else if (terminator == CHAR_RIGHT_PARENTHESIS)
4705           {
4706           if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
4707             *parsed_pattern = META_COND_DEFINE;
4708           else
4709             {
4710             for (i = 1; i < (int)namelen; i++)
4711               if (!IS_DIGIT(name[i])) break;
4712             *parsed_pattern = (*name == CHAR_R && i >= (int)namelen)?
4713               META_COND_RNUMBER : META_COND_NAME;
4714             }
4715           ptr--;   /* Back to closing parens */
4716           }
4717 
4718         /* Handle (?('name') or (?(<name>) */
4719 
4720         else *parsed_pattern = META_COND_NAME;
4721 
4722         /* All these cases except DEFINE end with the name length and offset;
4723         DEFINE just has an offset (for the "too many branches" error). */
4724 
4725         if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen;
4726         PUTOFFSET(offset, parsed_pattern);
4727         }  /* End cases that read a name */
4728 
4729       /* Check the closing parenthesis of the condition */
4730 
4731       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4732         {
4733         errorcode = ERR24;
4734         goto FAILED;
4735         }
4736       ptr++;
4737       break;  /* End of condition processing */
4738 
4739 
4740       /* ---- Atomic group ---- */
4741 
4742       case CHAR_GREATER_THAN_SIGN:
4743       ATOMIC_GROUP:                          /* Come from (*atomic: */
4744       *parsed_pattern++ = META_ATOMIC;
4745       nest_depth++;
4746       ptr++;
4747       break;
4748 
4749 
4750       /* ---- Lookahead assertions ---- */
4751 
4752       case CHAR_EQUALS_SIGN:
4753       POSITIVE_LOOK_AHEAD:                   /* Come from (*pla: */
4754       *parsed_pattern++ = META_LOOKAHEAD;
4755       ptr++;
4756       goto POST_ASSERTION;
4757 
4758       case CHAR_ASTERISK:
4759       POSITIVE_NONATOMIC_LOOK_AHEAD:         /* Come from (?* */
4760       *parsed_pattern++ = META_LOOKAHEAD_NA;
4761       ptr++;
4762       goto POST_ASSERTION;
4763 
4764       case CHAR_EXCLAMATION_MARK:
4765       NEGATIVE_LOOK_AHEAD:                   /* Come from (*nla: */
4766       *parsed_pattern++ = META_LOOKAHEADNOT;
4767       ptr++;
4768       goto POST_ASSERTION;
4769 
4770 
4771       /* ---- Lookbehind assertions ---- */
4772 
4773       /* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?<
4774       is the start of the name of a capturing group. */
4775 
4776       case CHAR_LESS_THAN_SIGN:
4777       if (ptrend - ptr <= 1 ||
4778          (ptr[1] != CHAR_EQUALS_SIGN &&
4779           ptr[1] != CHAR_EXCLAMATION_MARK &&
4780           ptr[1] != CHAR_ASTERISK))
4781         {
4782         terminator = CHAR_GREATER_THAN_SIGN;
4783         goto DEFINE_NAME;
4784         }
4785       *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
4786         META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)?
4787         META_LOOKBEHINDNOT : META_LOOKBEHIND_NA;
4788 
4789       POST_LOOKBEHIND:           /* Come from (*plb: (*naplb: and (*nlb: */
4790       *has_lookbehind = TRUE;
4791       offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4792       PUTOFFSET(offset, parsed_pattern);
4793       ptr += 2;
4794       /* Fall through */
4795 
4796       /* If the previous item was a condition starting (?(? an assertion,
4797       optionally preceded by a callout, is expected. This is checked later on,
4798       during actual compilation. However we need to identify this kind of
4799       assertion in this pass because it must not be quantified. The value of
4800       expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4801       for a callout - still leaving a positive value that identifies the
4802       assertion. Multiple callouts or any other items will make it zero or
4803       less, which doesn't matter because they will cause an error later. */
4804 
4805       POST_ASSERTION:
4806       nest_depth++;
4807       if (prev_expect_cond_assert > 0)
4808         {
4809         if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4810         else if (++top_nest >= end_nests)
4811           {
4812           errorcode = ERR84;
4813           goto FAILED;
4814           }
4815         top_nest->nest_depth = nest_depth;
4816         top_nest->flags = NSF_CONDASSERT;
4817         top_nest->options = options & PARSE_TRACKED_OPTIONS;
4818         top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4819         }
4820       break;
4821 
4822 
4823       /* ---- Define a named group ---- */
4824 
4825       /* A named group may be defined as (?'name') or (?<name>). In the latter
4826       case we jump to DEFINE_NAME from the disambiguation of (?< above with the
4827       terminator set to '>'. */
4828 
4829       case CHAR_APOSTROPHE:
4830       terminator = CHAR_APOSTROPHE;    /* Terminator */
4831 
4832       DEFINE_NAME:
4833       if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4834           &errorcode, cb)) goto FAILED;
4835 
4836       /* We have a name for this capturing group. It is also assigned a number,
4837       which is its primary means of identification. */
4838 
4839       if (cb->bracount >= MAX_GROUP_NUMBER)
4840         {
4841         errorcode = ERR97;
4842         goto FAILED;
4843         }
4844       cb->bracount++;
4845       *parsed_pattern++ = META_CAPTURE | cb->bracount;
4846       nest_depth++;
4847 
4848       /* Check not too many names */
4849 
4850       if (cb->names_found >= MAX_NAME_COUNT)
4851         {
4852         errorcode = ERR49;
4853         goto FAILED;
4854         }
4855 
4856       /* Adjust the entry size to accommodate the longest name found. */
4857 
4858       if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
4859         cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
4860 
4861       /* Scan the list to check for duplicates. For duplicate names, if the
4862       number is the same, break the loop, which causes the name to be
4863       discarded; otherwise, if DUPNAMES is not set, give an error.
4864       If it is set, allow the name with a different number, but continue
4865       scanning in case this is a duplicate with the same number. For
4866       non-duplicate names, give an error if the number is duplicated. */
4867 
4868       isdupname = FALSE;
4869       ng = cb->named_groups;
4870       for (i = 0; i < cb->names_found; i++, ng++)
4871         {
4872         if (namelen == ng->length &&
4873             PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0)
4874           {
4875           if (ng->number == cb->bracount) break;
4876           if ((options & PCRE2_DUPNAMES) == 0)
4877             {
4878             errorcode = ERR43;
4879             goto FAILED;
4880             }
4881           isdupname = ng->isdup = TRUE;     /* Mark as a duplicate */
4882           cb->dupnames = TRUE;              /* Duplicate names exist */
4883           }
4884         else if (ng->number == cb->bracount)
4885           {
4886           errorcode = ERR65;
4887           goto FAILED;
4888           }
4889         }
4890 
4891       if (i < cb->names_found) break;   /* Ignore duplicate with same number */
4892 
4893       /* Increase the list size if necessary */
4894 
4895       if (cb->names_found >= cb->named_group_list_size)
4896         {
4897         uint32_t newsize = cb->named_group_list_size * 2;
4898         named_group *newspace =
4899           cb->cx->memctl.malloc(newsize * sizeof(named_group),
4900           cb->cx->memctl.memory_data);
4901         if (newspace == NULL)
4902           {
4903           errorcode = ERR21;
4904           goto FAILED;
4905           }
4906 
4907         memcpy(newspace, cb->named_groups,
4908           cb->named_group_list_size * sizeof(named_group));
4909         if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
4910           cb->cx->memctl.free((void *)cb->named_groups,
4911           cb->cx->memctl.memory_data);
4912         cb->named_groups = newspace;
4913         cb->named_group_list_size = newsize;
4914         }
4915 
4916       /* Add this name to the list */
4917 
4918       cb->named_groups[cb->names_found].name = name;
4919       cb->named_groups[cb->names_found].length = (uint16_t)namelen;
4920       cb->named_groups[cb->names_found].number = cb->bracount;
4921       cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname;
4922       cb->names_found++;
4923       break;
4924       }        /* End of (? switch */
4925     break;     /* End of ( handling */
4926 
4927 
4928     /* ---- Branch terminators ---- */
4929 
4930     /* Alternation: reset the capture count if we are in a (?| group. */
4931 
4932     case CHAR_VERTICAL_LINE:
4933     if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
4934         (top_nest->flags & NSF_RESET) != 0)
4935       {
4936       if (cb->bracount > top_nest->max_group)
4937         top_nest->max_group = (uint16_t)cb->bracount;
4938       cb->bracount = top_nest->reset_group;
4939       }
4940     *parsed_pattern++ = META_ALT;
4941     break;
4942 
4943     /* End of group; reset the capture count to the maximum if we are in a (?|
4944     group and/or reset the options that are tracked during parsing. Disallow
4945     quantifier for a condition that is an assertion. */
4946 
4947     case CHAR_RIGHT_PARENTHESIS:
4948     okquantifier = TRUE;
4949     if (top_nest != NULL && top_nest->nest_depth == nest_depth)
4950       {
4951       options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options;
4952       xoptions = (xoptions & ~PARSE_TRACKED_EXTRA_OPTIONS) | top_nest->xoptions;
4953       if ((top_nest->flags & NSF_RESET) != 0 &&
4954           top_nest->max_group > cb->bracount)
4955         cb->bracount = top_nest->max_group;
4956       if ((top_nest->flags & NSF_CONDASSERT) != 0)
4957         okquantifier = FALSE;
4958 
4959       if ((top_nest->flags & NSF_ATOMICSR) != 0)
4960         {
4961         *parsed_pattern++ = META_KET;
4962         }
4963 
4964       if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
4965         else top_nest--;
4966       }
4967     if (nest_depth == 0)    /* Unmatched closing parenthesis */
4968       {
4969       errorcode = ERR22;
4970       goto FAILED_BACK;
4971       }
4972     nest_depth--;
4973     *parsed_pattern++ = META_KET;
4974     break;
4975     }  /* End of switch on pattern character */
4976   }    /* End of main character scan loop */
4977 
4978 /* End of pattern reached. Check for missing ) at the end of a verb name. */
4979 
4980 if (inverbname && ptr >= ptrend)
4981   {
4982   errorcode = ERR60;
4983   goto FAILED;
4984   }
4985 
4986 /* Manage callout for the final item */
4987 
4988 PARSED_END:
4989 parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
4990   parsed_pattern, cb);
4991 
4992 /* Insert trailing items for word and line matching (features provided for the
4993 benefit of pcre2grep). */
4994 
4995 if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
4996   {
4997   *parsed_pattern++ = META_KET;
4998   *parsed_pattern++ = META_DOLLAR;
4999   }
5000 else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
5001   {
5002   *parsed_pattern++ = META_KET;
5003   *parsed_pattern++ = META_ESCAPE + ESC_b;
5004   }
5005 
5006 /* Terminate the parsed pattern, then return success if all groups are closed.
5007 Otherwise we have unclosed parentheses. */
5008 
5009 if (parsed_pattern >= parsed_pattern_end)
5010   {
5011   errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
5012   goto FAILED;
5013   }
5014 
5015 *parsed_pattern = META_END;
5016 if (nest_depth == 0) return 0;
5017 
5018 UNCLOSED_PARENTHESIS:
5019 errorcode = ERR14;
5020 
5021 /* Come here for all failures. */
5022 
5023 FAILED:
5024 cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);
5025 return errorcode;
5026 
5027 /* Some errors need to indicate the previous character. */
5028 
5029 FAILED_BACK:
5030 ptr--;
5031 goto FAILED;
5032 
5033 /* This failure happens several times. */
5034 
5035 BAD_VERSION_CONDITION:
5036 errorcode = ERR79;
5037 goto FAILED;
5038 }
5039 
5040 
5041 
5042 /*************************************************
5043 *       Find first significant opcode            *
5044 *************************************************/
5045 
5046 /* This is called by several functions that scan a compiled expression looking
5047 for a fixed first character, or an anchoring opcode etc. It skips over things
5048 that do not influence this. For some calls, it makes sense to skip negative
5049 forward and all backward assertions, and also the \b assertion; for others it
5050 does not.
5051 
5052 Arguments:
5053   code         pointer to the start of the group
5054   skipassert   TRUE if certain assertions are to be skipped
5055 
5056 Returns:       pointer to the first significant opcode
5057 */
5058 
5059 static const PCRE2_UCHAR*
first_significant_code(PCRE2_SPTR code,BOOL skipassert)5060 first_significant_code(PCRE2_SPTR code, BOOL skipassert)
5061 {
5062 for (;;)
5063   {
5064   switch ((int)*code)
5065     {
5066     case OP_ASSERT_NOT:
5067     case OP_ASSERTBACK:
5068     case OP_ASSERTBACK_NOT:
5069     case OP_ASSERTBACK_NA:
5070     if (!skipassert) return code;
5071     do code += GET(code, 1); while (*code == OP_ALT);
5072     code += PRIV(OP_lengths)[*code];
5073     break;
5074 
5075     case OP_WORD_BOUNDARY:
5076     case OP_NOT_WORD_BOUNDARY:
5077     case OP_UCP_WORD_BOUNDARY:
5078     case OP_NOT_UCP_WORD_BOUNDARY:
5079     if (!skipassert) return code;
5080     /* Fall through */
5081 
5082     case OP_CALLOUT:
5083     case OP_CREF:
5084     case OP_DNCREF:
5085     case OP_RREF:
5086     case OP_DNRREF:
5087     case OP_FALSE:
5088     case OP_TRUE:
5089     code += PRIV(OP_lengths)[*code];
5090     break;
5091 
5092     case OP_CALLOUT_STR:
5093     code += GET(code, 1 + 2*LINK_SIZE);
5094     break;
5095 
5096     case OP_SKIPZERO:
5097     code += 2 + GET(code, 2) + LINK_SIZE;
5098     break;
5099 
5100     case OP_COND:
5101     case OP_SCOND:
5102     if (code[1+LINK_SIZE] != OP_FALSE ||   /* Not DEFINE */
5103         code[GET(code, 1)] != OP_KET)      /* More than one branch */
5104       return code;
5105     code += GET(code, 1) + 1 + LINK_SIZE;
5106     break;
5107 
5108     case OP_MARK:
5109     case OP_COMMIT_ARG:
5110     case OP_PRUNE_ARG:
5111     case OP_SKIP_ARG:
5112     case OP_THEN_ARG:
5113     code += code[1] + PRIV(OP_lengths)[*code];
5114     break;
5115 
5116     default:
5117     return code;
5118     }
5119   }
5120 /* Control never reaches here */
5121 }
5122 
5123 
5124 
5125 #ifdef SUPPORT_UNICODE
5126 /*************************************************
5127 *           Get othercase range                  *
5128 *************************************************/
5129 
5130 /* This function is passed the start and end of a class range in UCP mode. For
5131 single characters the range may be just one character long. The function
5132 searches up the characters, looking for ranges of characters in the "other"
5133 case. Each call returns the next one, updating the start address. A character
5134 with multiple other cases is returned on its own with a special return value.
5135 
5136 Arguments:
5137   cptr        points to starting character value; updated
5138   d           end value
5139   ocptr       where to put start of othercase range
5140   odptr       where to put end of othercase range
5141   restricted  TRUE if caseless restriction applies
5142 
5143 Yield:        -1 when no more
5144                0 when a range is returned
5145               >0 the CASESET offset for char with multiple other cases;
5146                  for this return, *ocptr contains the original
5147 */
5148 
5149 static int
get_othercase_range(uint32_t * cptr,uint32_t d,uint32_t * ocptr,uint32_t * odptr,BOOL restricted)5150 get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr,
5151   uint32_t *odptr, BOOL restricted)
5152 {
5153 uint32_t c, othercase, next;
5154 unsigned int co;
5155 
5156 /* Find the first character that has an other case. If it has multiple other
5157 cases, return its case offset value. When CASELESS_RESTRICT is set, ignore the
5158 multi-case entries that begin with ASCII values. In 32-bit mode, a value
5159 greater than the Unicode maximum ends the range. */
5160 
5161 for (c = *cptr; c <= d; c++)
5162   {
5163 #if PCRE2_CODE_UNIT_WIDTH == 32
5164   if (c > MAX_UTF_CODE_POINT) return -1;
5165 #endif
5166   if ((co = UCD_CASESET(c)) != 0 &&
5167       (!restricted || PRIV(ucd_caseless_sets)[co] > 127))
5168     {
5169     *ocptr = c++;   /* Character that has the set */
5170     *cptr = c;      /* Rest of input range */
5171     return (int)co;
5172     }
5173 
5174    /* This is not a valid multiple-case character. Check that the single other
5175    case is different to the original. We don't need to check "restricted" here
5176    because the non-ASCII characters with multiple cases that include an ASCII
5177    character don't have a different "othercase". */
5178 
5179   if ((othercase = UCD_OTHERCASE(c)) != c) break;
5180   }
5181 
5182 if (c > d) return -1;  /* Reached end of range */
5183 
5184 /* Found a character that has a single other case. Search for the end of the
5185 range, which is either the end of the input range, or a character that has zero
5186 or more than one other cases. */
5187 
5188 *ocptr = othercase;
5189 next = othercase + 1;
5190 
5191 for (++c; c <= d; c++)
5192   {
5193   if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
5194   next++;
5195   }
5196 
5197 *odptr = next - 1;     /* End of othercase range */
5198 *cptr = c;             /* Rest of input range */
5199 return 0;
5200 }
5201 #endif  /* SUPPORT_UNICODE */
5202 
5203 
5204 
5205 /*************************************************
5206 * Add a character or range to a class (internal) *
5207 *************************************************/
5208 
5209 /* This function packages up the logic of adding a character or range of
5210 characters to a class. The character values in the arguments will be within the
5211 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
5212 called only from within the "add to class" group of functions, some of which
5213 are recursive and mutually recursive. The external entry point is
5214 add_to_class().
5215 
5216 Arguments:
5217   classbits     the bit map for characters < 256
5218   uchardptr     points to the pointer for extra data
5219   options       the options bits
5220   xoptions      the extra options bits
5221   cb            compile data
5222   start         start of range character
5223   end           end of range character
5224 
5225 Returns:        the number of < 256 characters added
5226                 the pointer to extra data is updated
5227 */
5228 
5229 static unsigned int
add_to_class_internal(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,uint32_t xoptions,compile_block * cb,uint32_t start,uint32_t end)5230 add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5231   uint32_t options, uint32_t xoptions, compile_block *cb, uint32_t start,
5232   uint32_t end)
5233 {
5234 uint32_t c;
5235 uint32_t classbits_end = (end <= 0xff ? end : 0xff);
5236 unsigned int n8 = 0;
5237 
5238 /* If caseless matching is required, scan the range and process alternate
5239 cases. In Unicode, there are 8-bit characters that have alternate cases that
5240 are greater than 255 and vice-versa (though these may be ignored if caseless
5241 restriction is in force). Sometimes we can just extend the original range. */
5242 
5243 if ((options & PCRE2_CASELESS) != 0)
5244   {
5245 #ifdef SUPPORT_UNICODE
5246   if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
5247     {
5248     int rc;
5249     uint32_t oc, od;
5250 
5251     options &= ~PCRE2_CASELESS;   /* Remove for recursive calls */
5252     c = start;
5253 
5254     while ((rc = get_othercase_range(&c, end, &oc, &od,
5255              (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)) >= 0)
5256       {
5257       /* Handle a single character that has more than one other case. */
5258 
5259       if (rc > 0) n8 += add_list_to_class_internal(classbits, uchardptr,
5260         options, xoptions, cb, PRIV(ucd_caseless_sets) + rc, oc);
5261 
5262       /* Do nothing if the other case range is within the original range. */
5263 
5264       else if (oc >= cb->class_range_start && od <= cb->class_range_end)
5265         continue;
5266 
5267       /* Extend the original range if there is overlap, noting that if oc < c,
5268       we can't have od > end because a subrange is always shorter than the
5269       basic range. Otherwise, use a recursive call to add the additional range.
5270       */
5271 
5272       else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
5273       else if (od > end && oc <= end + 1)
5274         {
5275         end = od;       /* Extend upwards */
5276         if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
5277         }
5278       else n8 += add_to_class_internal(classbits, uchardptr, options, xoptions,
5279         cb, oc, od);
5280       }
5281     }
5282   else
5283 #else
5284   (void)xoptions;   /* Avoid compiler warning */
5285 #endif  /* SUPPORT_UNICODE */
5286 
5287   /* Not UTF mode */
5288 
5289   for (c = start; c <= classbits_end; c++)
5290     {
5291     SETBIT(classbits, cb->fcc[c]);
5292     n8++;
5293     }
5294   }
5295 
5296 /* Now handle the originally supplied range. Adjust the final value according
5297 to the bit length - this means that the same lists of (e.g.) horizontal spaces
5298 can be used in all cases. */
5299 
5300 if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR)
5301   end = MAX_NON_UTF_CHAR;
5302 
5303 if (start > cb->class_range_start && end < cb->class_range_end) return n8;
5304 
5305 /* Use the bitmap for characters < 256. Otherwise use extra data.*/
5306 
5307 for (c = start; c <= classbits_end; c++)
5308   {
5309   /* Regardless of start, c will always be <= 255. */
5310   SETBIT(classbits, c);
5311   n8++;
5312   }
5313 
5314 #ifdef SUPPORT_WIDE_CHARS
5315 if (start <= 0xff) start = 0xff + 1;
5316 
5317 if (end >= start)
5318   {
5319   PCRE2_UCHAR *uchardata = *uchardptr;
5320 
5321 #ifdef SUPPORT_UNICODE
5322   if ((options & PCRE2_UTF) != 0)
5323     {
5324     if (start < end)
5325       {
5326       *uchardata++ = XCL_RANGE;
5327       uchardata += PRIV(ord2utf)(start, uchardata);
5328       uchardata += PRIV(ord2utf)(end, uchardata);
5329       }
5330     else if (start == end)
5331       {
5332       *uchardata++ = XCL_SINGLE;
5333       uchardata += PRIV(ord2utf)(start, uchardata);
5334       }
5335     }
5336   else
5337 #endif  /* SUPPORT_UNICODE */
5338 
5339   /* Without UTF support, character values are constrained by the bit length,
5340   and can only be > 256 for 16-bit and 32-bit libraries. */
5341 
5342 #if PCRE2_CODE_UNIT_WIDTH == 8
5343     {}
5344 #else
5345   if (start < end)
5346     {
5347     *uchardata++ = XCL_RANGE;
5348     *uchardata++ = start;
5349     *uchardata++ = end;
5350     }
5351   else if (start == end)
5352     {
5353     *uchardata++ = XCL_SINGLE;
5354     *uchardata++ = start;
5355     }
5356 #endif  /* PCRE2_CODE_UNIT_WIDTH == 8 */
5357   *uchardptr = uchardata;   /* Updata extra data pointer */
5358   }
5359 #else  /* SUPPORT_WIDE_CHARS */
5360   (void)uchardptr;          /* Avoid compiler warning */
5361 #endif /* SUPPORT_WIDE_CHARS */
5362 
5363 return n8;    /* Number of 8-bit characters */
5364 }
5365 
5366 
5367 
5368 #ifdef SUPPORT_UNICODE
5369 /*************************************************
5370 * Add a list of characters to a class (internal) *
5371 *************************************************/
5372 
5373 /* This function is used for adding a list of case-equivalent characters to a
5374 class when in UTF mode. This function is called only from within
5375 add_to_class_internal(), with which it is mutually recursive.
5376 
5377 Arguments:
5378   classbits     the bit map for characters < 256
5379   uchardptr     points to the pointer for extra data
5380   options       the options bits
5381   xoptions      the extra options bits
5382   cb            contains pointers to tables etc.
5383   p             points to row of 32-bit values, terminated by NOTACHAR
5384   except        character to omit; this is used when adding lists of
5385                   case-equivalent characters to avoid including the one we
5386                   already know about
5387 
5388 Returns:        the number of < 256 characters added
5389                 the pointer to extra data is updated
5390 */
5391 
5392 static unsigned int
add_list_to_class_internal(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,uint32_t xoptions,compile_block * cb,const uint32_t * p,unsigned int except)5393 add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5394   uint32_t options, uint32_t xoptions, compile_block *cb, const uint32_t *p,
5395   unsigned int except)
5396 {
5397 unsigned int n8 = 0;
5398 while (p[0] < NOTACHAR)
5399   {
5400   unsigned int n = 0;
5401   if (p[0] != except)
5402     {
5403     while(p[n+1] == p[0] + n + 1) n++;
5404     n8 += add_to_class_internal(classbits, uchardptr, options, xoptions, cb,
5405       p[0], p[n]);
5406     }
5407   p += n + 1;
5408   }
5409 return n8;
5410 }
5411 #endif
5412 
5413 
5414 
5415 /*************************************************
5416 *   External entry point for add range to class  *
5417 *************************************************/
5418 
5419 /* This function sets the overall range so that the internal functions can try
5420 to avoid duplication when handling case-independence.
5421 
5422 Arguments:
5423   classbits     the bit map for characters < 256
5424   uchardptr     points to the pointer for extra data
5425   options       the options bits
5426   xoptions      the extra options bits
5427   cb            compile data
5428   start         start of range character
5429   end           end of range character
5430 
5431 Returns:        the number of < 256 characters added
5432                 the pointer to extra data is updated
5433 */
5434 
5435 static unsigned int
add_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,uint32_t xoptions,compile_block * cb,uint32_t start,uint32_t end)5436 add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5437   uint32_t xoptions, compile_block *cb, uint32_t start, uint32_t end)
5438 {
5439 cb->class_range_start = start;
5440 cb->class_range_end = end;
5441 return add_to_class_internal(classbits, uchardptr, options, xoptions, cb,
5442   start, end);
5443 }
5444 
5445 
5446 /*************************************************
5447 *   External entry point for add list to class   *
5448 *************************************************/
5449 
5450 /* This function is used for adding a list of horizontal or vertical whitespace
5451 characters to a class. The list must be in order so that ranges of characters
5452 can be detected and handled appropriately. This function sets the overall range
5453 so that the internal functions can try to avoid duplication when handling
5454 case-independence.
5455 
5456 Arguments:
5457   classbits     the bit map for characters < 256
5458   uchardptr     points to the pointer for extra data
5459   options       the options bits
5460   xoptions      the extra options bits
5461   cb            contains pointers to tables etc.
5462   p             points to row of 32-bit values, terminated by NOTACHAR
5463   except        character to omit; this is used when adding lists of
5464                   case-equivalent characters to avoid including the one we
5465                   already know about
5466 
5467 Returns:        the number of < 256 characters added
5468                 the pointer to extra data is updated
5469 */
5470 
5471 static unsigned int
add_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,uint32_t xoptions,compile_block * cb,const uint32_t * p,unsigned int except)5472 add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5473   uint32_t xoptions, compile_block *cb, const uint32_t *p, unsigned int except)
5474 {
5475 unsigned int n8 = 0;
5476 while (p[0] < NOTACHAR)
5477   {
5478   unsigned int n = 0;
5479   if (p[0] != except)
5480     {
5481     while(p[n+1] == p[0] + n + 1) n++;
5482     cb->class_range_start = p[0];
5483     cb->class_range_end = p[n];
5484     n8 += add_to_class_internal(classbits, uchardptr, options, xoptions, cb,
5485       p[0], p[n]);
5486     }
5487   p += n + 1;
5488   }
5489 return n8;
5490 }
5491 
5492 
5493 
5494 /*************************************************
5495 *    Add characters not in a list to a class     *
5496 *************************************************/
5497 
5498 /* This function is used for adding the complement of a list of horizontal or
5499 vertical whitespace to a class. The list must be in order.
5500 
5501 Arguments:
5502   classbits     the bit map for characters < 256
5503   uchardptr     points to the pointer for extra data
5504   options       the options bits
5505   xoptions      the extra options bits
5506   cb            contains pointers to tables etc.
5507   p             points to row of 32-bit values, terminated by NOTACHAR
5508 
5509 Returns:        the number of < 256 characters added
5510                 the pointer to extra data is updated
5511 */
5512 
5513 static unsigned int
add_not_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,uint32_t xoptions,compile_block * cb,const uint32_t * p)5514 add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5515   uint32_t options, uint32_t xoptions, compile_block *cb, const uint32_t *p)
5516 {
5517 BOOL utf = (options & PCRE2_UTF) != 0;
5518 unsigned int n8 = 0;
5519 if (p[0] > 0)
5520   n8 += add_to_class(classbits, uchardptr, options, xoptions, cb, 0, p[0] - 1);
5521 while (p[0] < NOTACHAR)
5522   {
5523   while (p[1] == p[0] + 1) p++;
5524   n8 += add_to_class(classbits, uchardptr, options, xoptions, cb, p[0] + 1,
5525     (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
5526   p++;
5527   }
5528 return n8;
5529 }
5530 
5531 
5532 
5533 /*************************************************
5534 *    Find details of duplicate group names       *
5535 *************************************************/
5536 
5537 /* This is called from compile_branch() when it needs to know the index and
5538 count of duplicates in the names table when processing named backreferences,
5539 either directly, or as conditions.
5540 
5541 Arguments:
5542   name          points to the name
5543   length        the length of the name
5544   indexptr      where to put the index
5545   countptr      where to put the count of duplicates
5546   errorcodeptr  where to put an error code
5547   cb            the compile block
5548 
5549 Returns:        TRUE if OK, FALSE if not, error code set
5550 */
5551 
5552 static BOOL
find_dupname_details(PCRE2_SPTR name,uint32_t length,int * indexptr,int * countptr,int * errorcodeptr,compile_block * cb)5553 find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr,
5554   int *countptr, int *errorcodeptr, compile_block *cb)
5555 {
5556 uint32_t i, groupnumber;
5557 int count;
5558 PCRE2_UCHAR *slot = cb->name_table;
5559 
5560 /* Find the first entry in the table */
5561 
5562 for (i = 0; i < cb->names_found; i++)
5563   {
5564   if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) == 0 &&
5565       slot[IMM2_SIZE+length] == 0) break;
5566   slot += cb->name_entry_size;
5567   }
5568 
5569 /* This should not occur, because this function is called only when we know we
5570 have duplicate names. Give an internal error. */
5571 
5572 if (i >= cb->names_found)
5573   {
5574   *errorcodeptr = ERR53;
5575   cb->erroroffset = name - cb->start_pattern;
5576   return FALSE;
5577   }
5578 
5579 /* Record the index and then see how many duplicates there are, updating the
5580 backref map and maximum back reference as we do. */
5581 
5582 *indexptr = i;
5583 count = 0;
5584 
5585 for (;;)
5586   {
5587   count++;
5588   groupnumber = GET2(slot,0);
5589   cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
5590   if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
5591   if (++i >= cb->names_found) break;
5592   slot += cb->name_entry_size;
5593   if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) != 0 ||
5594     (slot+IMM2_SIZE)[length] != 0) break;
5595   }
5596 
5597 *countptr = count;
5598 return TRUE;
5599 }
5600 
5601 
5602 
5603 /*************************************************
5604 *           Compile one branch                   *
5605 *************************************************/
5606 
5607 /* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If
5608 the options are changed during the branch, the pointer is used to change the
5609 external options bits. This function is used during the pre-compile phase when
5610 we are trying to find out the amount of memory needed, as well as during the
5611 real compile phase. The value of lengthptr distinguishes the two phases.
5612 
5613 Arguments:
5614   optionsptr        pointer to the option bits
5615   xoptionsptr       pointer to the extra option bits
5616   codeptr           points to the pointer to the current code point
5617   pptrptr           points to the current parsed pattern pointer
5618   errorcodeptr      points to error code variable
5619   firstcuptr        place to put the first required code unit
5620   firstcuflagsptr   place to put the first code unit flags
5621   reqcuptr          place to put the last required code unit
5622   reqcuflagsptr     place to put the last required code unit flags
5623   bcptr             points to current branch chain
5624   open_caps         points to current capitem
5625   cb                contains pointers to tables etc.
5626   lengthptr         NULL during the real compile phase
5627                     points to length accumulator during pre-compile phase
5628 
5629 Returns:            0 There's been an error, *errorcodeptr is non-zero
5630                    +1 Success, this branch must match at least one character
5631                    -1 Success, this branch may match an empty string
5632 */
5633 
5634 static int
compile_branch(uint32_t * optionsptr,uint32_t * xoptionsptr,PCRE2_UCHAR ** codeptr,uint32_t ** pptrptr,int * errorcodeptr,uint32_t * firstcuptr,uint32_t * firstcuflagsptr,uint32_t * reqcuptr,uint32_t * reqcuflagsptr,branch_chain * bcptr,open_capitem * open_caps,compile_block * cb,PCRE2_SIZE * lengthptr)5635 compile_branch(uint32_t *optionsptr, uint32_t *xoptionsptr,
5636   PCRE2_UCHAR **codeptr, uint32_t **pptrptr, int *errorcodeptr,
5637   uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
5638   uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
5639   compile_block *cb, PCRE2_SIZE *lengthptr)
5640 {
5641 int bravalue = 0;
5642 int okreturn = -1;
5643 int group_return = 0;
5644 uint32_t repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
5645 uint32_t greedy_default, greedy_non_default;
5646 uint32_t repeat_type, op_type;
5647 uint32_t options = *optionsptr;               /* May change dynamically */
5648 uint32_t xoptions = *xoptionsptr;             /* May change dynamically */
5649 uint32_t firstcu, reqcu;
5650 uint32_t zeroreqcu, zerofirstcu;
5651 uint32_t escape;
5652 uint32_t *pptr = *pptrptr;
5653 uint32_t meta, meta_arg;
5654 uint32_t firstcuflags, reqcuflags;
5655 uint32_t zeroreqcuflags, zerofirstcuflags;
5656 uint32_t req_caseopt, reqvary, tempreqvary;
5657 PCRE2_SIZE offset = 0;
5658 PCRE2_SIZE length_prevgroup = 0;
5659 PCRE2_UCHAR *code = *codeptr;
5660 PCRE2_UCHAR *last_code = code;
5661 PCRE2_UCHAR *orig_code = code;
5662 PCRE2_UCHAR *tempcode;
5663 PCRE2_UCHAR *previous = NULL;
5664 PCRE2_UCHAR op_previous;
5665 BOOL groupsetfirstcu = FALSE;
5666 BOOL had_accept = FALSE;
5667 BOOL matched_char = FALSE;
5668 BOOL previous_matched_char = FALSE;
5669 BOOL reset_caseful = FALSE;
5670 const uint8_t *cbits = cb->cbits;
5671 uint8_t classbits[32];
5672 
5673 /* We can fish out the UTF setting once and for all into a BOOL, but we must
5674 not do this for other options (e.g. PCRE2_EXTENDED) that may change dynamically
5675 as we process the pattern. */
5676 
5677 #ifdef SUPPORT_UNICODE
5678 BOOL utf = (options & PCRE2_UTF) != 0;
5679 BOOL ucp = (options & PCRE2_UCP) != 0;
5680 #else  /* No Unicode support */
5681 BOOL utf = FALSE;
5682 #endif
5683 
5684 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
5685 class_uchardata always so that it can be passed to add_to_class() always,
5686 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
5687 alternative calls for the different cases. */
5688 
5689 PCRE2_UCHAR *class_uchardata;
5690 #ifdef SUPPORT_WIDE_CHARS
5691 BOOL xclass;
5692 PCRE2_UCHAR *class_uchardata_base;
5693 #endif
5694 
5695 /* Set up the default and non-default settings for greediness */
5696 
5697 greedy_default = ((options & PCRE2_UNGREEDY) != 0);
5698 greedy_non_default = greedy_default ^ 1;
5699 
5700 /* Initialize no first unit, no required unit. REQ_UNSET means "no char
5701 matching encountered yet". It gets changed to REQ_NONE if we hit something that
5702 matches a non-fixed first unit; reqcu just remains unset if we never find one.
5703 
5704 When we hit a repeat whose minimum is zero, we may have to adjust these values
5705 to take the zero repeat into account. This is implemented by setting them to
5706 zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
5707 item types that can be repeated set these backoff variables appropriately. */
5708 
5709 firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
5710 firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
5711 
5712 /* The variable req_caseopt contains either the REQ_CASELESS bit or zero,
5713 according to the current setting of the caseless flag. The REQ_CASELESS value
5714 leaves the lower 28 bit empty. It is added into the firstcu or reqcu variables
5715 to record the case status of the value. This is used only for ASCII characters.
5716 */
5717 
5718 req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
5719 
5720 /* Switch on next META item until the end of the branch */
5721 
5722 for (;; pptr++)
5723   {
5724 #ifdef SUPPORT_WIDE_CHARS
5725   BOOL xclass_has_prop;
5726 #endif
5727   BOOL negate_class;
5728   BOOL should_flip_negation;
5729   BOOL match_all_or_no_wide_chars;
5730   BOOL possessive_quantifier;
5731   BOOL note_group_empty;
5732   int class_has_8bitchar;
5733   uint32_t mclength;
5734   uint32_t skipunits;
5735   uint32_t subreqcu, subfirstcu;
5736   uint32_t groupnumber;
5737   uint32_t verbarglen, verbculen;
5738   uint32_t subreqcuflags, subfirstcuflags;
5739   open_capitem *oc;
5740   PCRE2_UCHAR mcbuffer[8];
5741 
5742   /* Get next META item in the pattern and its potential argument. */
5743 
5744   meta = META_CODE(*pptr);
5745   meta_arg = META_DATA(*pptr);
5746 
5747   /* If we are in the pre-compile phase, accumulate the length used for the
5748   previous cycle of this loop, unless the next item is a quantifier. */
5749 
5750   if (lengthptr != NULL)
5751     {
5752     if (code > cb->start_workspace + cb->workspace_size -
5753         WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
5754       {
5755       *errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)?
5756         ERR52 : ERR86;
5757       return 0;
5758       }
5759 
5760     /* There is at least one situation where code goes backwards: this is the
5761     case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier
5762     is processed, the whole class is eliminated. However, it is created first,
5763     so we have to allow memory for it. Therefore, don't ever reduce the length
5764     at this point. */
5765 
5766     if (code < last_code) code = last_code;
5767 
5768     /* If the next thing is not a quantifier, we add the length of the previous
5769     item into the total, and reset the code pointer to the start of the
5770     workspace. Otherwise leave the previous item available to be quantified. */
5771 
5772     if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5773       {
5774       if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))
5775         {
5776         *errorcodeptr = ERR20;   /* Integer overflow */
5777         return 0;
5778         }
5779       *lengthptr += (PCRE2_SIZE)(code - orig_code);
5780       if (*lengthptr > MAX_PATTERN_SIZE)
5781         {
5782         *errorcodeptr = ERR20;   /* Pattern is too large */
5783         return 0;
5784         }
5785       code = orig_code;
5786       }
5787 
5788     /* Remember where this code item starts so we can catch the "backwards"
5789     case above next time round. */
5790 
5791     last_code = code;
5792     }
5793 
5794   /* Process the next parsed pattern item. If it is not a quantifier, remember
5795   where it starts so that it can be quantified when a quantifier follows.
5796   Checking for the legality of quantifiers happens in parse_regex(), except for
5797   a quantifier after an assertion that is a condition. */
5798 
5799   if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5800     {
5801     previous = code;
5802     if (matched_char && !had_accept) okreturn = 1;
5803     }
5804 
5805   previous_matched_char = matched_char;
5806   matched_char = FALSE;
5807   note_group_empty = FALSE;
5808   skipunits = 0;         /* Default value for most subgroups */
5809 
5810   switch(meta)
5811     {
5812     /* ===================================================================*/
5813     /* The branch terminates at pattern end or | or ) */
5814 
5815     case META_END:
5816     case META_ALT:
5817     case META_KET:
5818     *firstcuptr = firstcu;
5819     *firstcuflagsptr = firstcuflags;
5820     *reqcuptr = reqcu;
5821     *reqcuflagsptr = reqcuflags;
5822     *codeptr = code;
5823     *pptrptr = pptr;
5824     return okreturn;
5825 
5826 
5827     /* ===================================================================*/
5828     /* Handle single-character metacharacters. In multiline mode, ^ disables
5829     the setting of any following char as a first character. */
5830 
5831     case META_CIRCUMFLEX:
5832     if ((options & PCRE2_MULTILINE) != 0)
5833       {
5834       if (firstcuflags == REQ_UNSET)
5835         zerofirstcuflags = firstcuflags = REQ_NONE;
5836       *code++ = OP_CIRCM;
5837       }
5838     else *code++ = OP_CIRC;
5839     break;
5840 
5841     case META_DOLLAR:
5842     *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
5843     break;
5844 
5845     /* There can never be a first char if '.' is first, whatever happens about
5846     repeats. The value of reqcu doesn't change either. */
5847 
5848     case META_DOT:
5849     matched_char = TRUE;
5850     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5851     zerofirstcu = firstcu;
5852     zerofirstcuflags = firstcuflags;
5853     zeroreqcu = reqcu;
5854     zeroreqcuflags = reqcuflags;
5855     *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
5856     break;
5857 
5858 
5859     /* ===================================================================*/
5860     /* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.
5861     Otherwise, an initial ']' is taken as a data character. When empty classes
5862     are allowed, [] must always fail, so generate OP_FAIL, whereas [^] must
5863     match any character, so generate OP_ALLANY. */
5864 
5865     case META_CLASS_EMPTY:
5866     case META_CLASS_EMPTY_NOT:
5867     matched_char = TRUE;
5868     *code++ = (meta == META_CLASS_EMPTY_NOT)? OP_ALLANY : OP_FAIL;
5869     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5870     zerofirstcu = firstcu;
5871     zerofirstcuflags = firstcuflags;
5872     break;
5873 
5874 
5875     /* ===================================================================*/
5876     /* Non-empty character class. If the included characters are all < 256, we
5877     build a 32-byte bitmap of the permitted characters, except in the special
5878     case where there is only one such character. For negated classes, we build
5879     the map as usual, then invert it at the end. However, we use a different
5880     opcode so that data characters > 255 can be handled correctly.
5881 
5882     If the class contains characters outside the 0-255 range, a different
5883     opcode is compiled. It may optionally have a bit map for characters < 256,
5884     but those above are explicitly listed afterwards. A flag code unit tells
5885     whether the bitmap is present, and whether this is a negated class or
5886     not. */
5887 
5888     case META_CLASS_NOT:
5889     case META_CLASS:
5890     matched_char = TRUE;
5891     negate_class = meta == META_CLASS_NOT;
5892 
5893     /* We can optimize the case of a single character in a class by generating
5894     OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's
5895     negative. In the negative case there can be no first char if this item is
5896     first, whatever repeat count may follow. In the case of reqcu, save the
5897     previous value for reinstating. */
5898 
5899     /* NOTE: at present this optimization is not effective if the only
5900     character in a class in 32-bit, non-UCP mode has its top bit set. */
5901 
5902     if (pptr[1] < META_END && pptr[2] == META_CLASS_END)
5903       {
5904 #ifdef SUPPORT_UNICODE
5905       uint32_t d;
5906 #endif
5907       uint32_t c = pptr[1];
5908 
5909       pptr += 2;                 /* Move on to class end */
5910       if (meta == META_CLASS)    /* A positive one-char class can be */
5911         {                        /* handled as a normal literal character. */
5912         meta = c;                /* Set up the character */
5913         goto NORMAL_CHAR_SET;
5914         }
5915 
5916       /* Handle a negative one-character class */
5917 
5918       zeroreqcu = reqcu;
5919       zeroreqcuflags = reqcuflags;
5920       if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5921       zerofirstcu = firstcu;
5922       zerofirstcuflags = firstcuflags;
5923 
5924       /* For caseless UTF or UCP mode, check whether this character has more
5925       than one other case. If so, generate a special OP_NOTPROP item instead of
5926       OP_NOTI. When restricted by PCRE2_EXTRA_CASELESS_RESTRICT, ignore any
5927       caseless set that starts with an ASCII character. */
5928 
5929 #ifdef SUPPORT_UNICODE
5930       if ((utf||ucp) && (options & PCRE2_CASELESS) != 0 &&
5931           (d = UCD_CASESET(c)) != 0 &&
5932           ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) == 0 ||
5933           PRIV(ucd_caseless_sets)[d] > 127))
5934         {
5935         *code++ = OP_NOTPROP;
5936         *code++ = PT_CLIST;
5937         *code++ = d;
5938         break;   /* We are finished with this class */
5939         }
5940 #endif
5941       /* Char has only one other (usable) case, or UCP not available */
5942 
5943       *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
5944       code += PUTCHAR(c, code);
5945       break;   /* We are finished with this class */
5946       }        /* End of 1-char optimization */
5947 
5948     /* Handle character classes that contain more than just one literal
5949     character. If there are exactly two characters in a positive class, see if
5950     they are case partners. This can be optimized to generate a caseless single
5951     character match (which also sets first/required code units if relevant).
5952     When casing restrictions apply, ignore a caseless set if both characters
5953     are ASCII. */
5954 
5955     if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END &&
5956         pptr[3] == META_CLASS_END)
5957       {
5958       uint32_t c = pptr[1];
5959 
5960 #ifdef SUPPORT_UNICODE
5961       if (UCD_CASESET(c) == 0 ||
5962          ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
5963          c < 128 && pptr[2] < 128))
5964 #endif
5965         {
5966         uint32_t d;
5967 
5968 #ifdef SUPPORT_UNICODE
5969         if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
5970 #endif
5971           {
5972 #if PCRE2_CODE_UNIT_WIDTH != 8
5973           if (c > 255) d = c; else
5974 #endif
5975           d = TABLE_GET(c, cb->fcc, c);
5976           }
5977 
5978         if (c != d && pptr[2] == d)
5979           {
5980           pptr += 3;                 /* Move on to class end */
5981           meta = c;
5982           if ((options & PCRE2_CASELESS) == 0)
5983             {
5984             reset_caseful = TRUE;
5985             options |= PCRE2_CASELESS;
5986             req_caseopt = REQ_CASELESS;
5987             }
5988           goto CLASS_CASELESS_CHAR;
5989           }
5990         }
5991       }
5992 
5993     /* If a non-extended class contains a negative special such as \S, we need
5994     to flip the negation flag at the end, so that support for characters > 255
5995     works correctly (they are all included in the class). An extended class may
5996     need to insert specific matching or non-matching code for wide characters.
5997     */
5998 
5999     should_flip_negation = match_all_or_no_wide_chars = FALSE;
6000 
6001     /* Extended class (xclass) will be used when characters > 255
6002     might match. */
6003 
6004 #ifdef SUPPORT_WIDE_CHARS
6005     xclass = FALSE;
6006     class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
6007     class_uchardata_base = class_uchardata;   /* Save the start */
6008 #endif
6009 
6010     /* For optimization purposes, we track some properties of the class:
6011     class_has_8bitchar will be non-zero if the class contains at least one
6012     character with a code point less than 256; xclass_has_prop will be TRUE if
6013     Unicode property checks are present in the class. */
6014 
6015     class_has_8bitchar = 0;
6016 #ifdef SUPPORT_WIDE_CHARS
6017     xclass_has_prop = FALSE;
6018 #endif
6019 
6020     /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
6021     in a temporary bit of memory, in case the class contains fewer than two
6022     8-bit characters because in that case the compiled code doesn't use the bit
6023     map. */
6024 
6025     memset(classbits, 0, 32 * sizeof(uint8_t));
6026 
6027     /* Process items until META_CLASS_END is reached. */
6028 
6029     while ((meta = *(++pptr)) != META_CLASS_END)
6030       {
6031       /* Handle POSIX classes such as [:alpha:] etc. */
6032 
6033       if (meta == META_POSIX || meta == META_POSIX_NEG)
6034         {
6035         BOOL local_negate = (meta == META_POSIX_NEG);
6036         int posix_class = *(++pptr);
6037         int taboffset, tabopt;
6038         uint8_t pbits[32];
6039 
6040         should_flip_negation = local_negate;  /* Note negative special */
6041 
6042         /* If matching is caseless, upper and lower are converted to alpha.
6043         This relies on the fact that the class table starts with alpha,
6044         lower, upper as the first 3 entries. */
6045 
6046         if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)
6047           posix_class = 0;
6048 
6049         /* When PCRE2_UCP is set, some of the POSIX classes are converted to
6050         different escape sequences that use Unicode properties \p or \P.
6051         Others that are not available via \p or \P have to generate
6052         XCL_PROP/XCL_NOTPROP directly, which is done here. */
6053 
6054 #ifdef SUPPORT_UNICODE
6055         if ((options & PCRE2_UCP) != 0 &&
6056             (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0)
6057           {
6058           switch(posix_class)
6059             {
6060             case PC_GRAPH:
6061             case PC_PRINT:
6062             case PC_PUNCT:
6063             *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
6064             *class_uchardata++ = (PCRE2_UCHAR)
6065               ((posix_class == PC_GRAPH)? PT_PXGRAPH :
6066                (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT);
6067             *class_uchardata++ = 0;
6068             xclass_has_prop = TRUE;
6069             goto CONTINUE_CLASS;
6070 
6071             /* For the other POSIX classes (ex: ascii) we are going to
6072             fall through to the non-UCP case and build a bit map for
6073             characters with code points less than 256. However, if we are in
6074             a negated POSIX class, characters with code points greater than
6075             255 must either all match or all not match, depending on whether
6076             the whole class is not or is negated. For example, for
6077             [[:^ascii:]... they must all match, whereas for [^[:^ascii:]...
6078             they must not.
6079 
6080             In the special case where there are no xclass items, this is
6081             automatically handled by the use of OP_CLASS or OP_NCLASS, but an
6082             explicit range is needed for OP_XCLASS. Setting a flag here
6083             causes the range to be generated later when it is known that
6084             OP_XCLASS is required. In the 8-bit library this is relevant only in
6085             utf mode, since no wide characters can exist otherwise. */
6086 
6087             default:
6088 #if PCRE2_CODE_UNIT_WIDTH == 8
6089             if (utf)
6090 #endif
6091             match_all_or_no_wide_chars |= local_negate;
6092             break;
6093             }
6094           }
6095 #endif  /* SUPPORT_UNICODE */
6096 
6097         /* In the non-UCP case, or when UCP makes no difference, we build the
6098         bit map for the POSIX class in a chunk of local store because we may
6099         be adding and subtracting from it, and we don't want to subtract bits
6100         that may be in the main map already. At the end we or the result into
6101         the bit map that is being built. */
6102 
6103         posix_class *= 3;
6104 
6105         /* Copy in the first table (always present) */
6106 
6107         memcpy(pbits, cbits + posix_class_maps[posix_class],
6108           32 * sizeof(uint8_t));
6109 
6110         /* If there is a second table, add or remove it as required. */
6111 
6112         taboffset = posix_class_maps[posix_class + 1];
6113         tabopt = posix_class_maps[posix_class + 2];
6114 
6115         if (taboffset >= 0)
6116           {
6117           if (tabopt >= 0)
6118             for (int i = 0; i < 32; i++) pbits[i] |= cbits[(int)i + taboffset];
6119           else
6120             for (int i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset];
6121           }
6122 
6123         /* Now see if we need to remove any special characters. An option
6124         value of 1 removes vertical space and 2 removes underscore. */
6125 
6126         if (tabopt < 0) tabopt = -tabopt;
6127         if (tabopt == 1) pbits[1] &= ~0x3c;
6128           else if (tabopt == 2) pbits[11] &= 0x7f;
6129 
6130         /* Add the POSIX table or its complement into the main table that is
6131         being built and we are done. */
6132 
6133         if (local_negate)
6134           for (int i = 0; i < 32; i++) classbits[i] |= (uint8_t)(~pbits[i]);
6135         else
6136           for (int i = 0; i < 32; i++) classbits[i] |= pbits[i];
6137 
6138         /* Every class contains at least one < 256 character. */
6139 
6140         class_has_8bitchar = 1;
6141         goto CONTINUE_CLASS;    /* End of POSIX handling */
6142         }
6143 
6144       /* Other than POSIX classes, the only items we should encounter are
6145       \d-type escapes and literal characters (possibly as ranges). */
6146 
6147       if (meta == META_BIGVALUE)
6148         {
6149         meta = *(++pptr);
6150         goto CLASS_LITERAL;
6151         }
6152 
6153       /* Any other non-literal must be an escape */
6154 
6155       if (meta >= META_END)
6156         {
6157         if (META_CODE(meta) != META_ESCAPE)
6158           {
6159 #ifdef DEBUG_SHOW_PARSED
6160           fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x "
6161                           "in character class\n", meta);
6162 #endif
6163           *errorcodeptr = ERR89;  /* Internal error - unrecognized. */
6164           return 0;
6165           }
6166         escape = META_DATA(meta);
6167 
6168         /* Every class contains at least one < 256 character. */
6169 
6170         class_has_8bitchar++;
6171 
6172         switch(escape)
6173           {
6174           case ESC_d:
6175           for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
6176           break;
6177 
6178           case ESC_D:
6179           should_flip_negation = TRUE;
6180           for (int i = 0; i < 32; i++)
6181             classbits[i] |= (uint8_t)(~cbits[i+cbit_digit]);
6182           break;
6183 
6184           case ESC_w:
6185           for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
6186           break;
6187 
6188           case ESC_W:
6189           should_flip_negation = TRUE;
6190           for (int i = 0; i < 32; i++)
6191             classbits[i] |= (uint8_t)(~cbits[i+cbit_word]);
6192           break;
6193 
6194           /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
6195           5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
6196           previously set by something earlier in the character class.
6197           Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
6198           we could just adjust the appropriate bit. From PCRE 8.34 we no
6199           longer treat \s and \S specially. */
6200 
6201           case ESC_s:
6202           for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
6203           break;
6204 
6205           case ESC_S:
6206           should_flip_negation = TRUE;
6207           for (int i = 0; i < 32; i++)
6208             classbits[i] |= (uint8_t)(~cbits[i+cbit_space]);
6209           break;
6210 
6211           /* When adding the horizontal or vertical space lists to a class, or
6212           their complements, disable PCRE2_CASELESS, because it just wastes
6213           time, and in the "not-x" UTF cases can create unwanted duplicates in
6214           the XCLASS list (provoked by characters that have more than one other
6215           case and by both cases being in the same "not-x" sublist). */
6216 
6217           case ESC_h:
6218           (void)add_list_to_class(classbits, &class_uchardata,
6219             options & ~PCRE2_CASELESS, xoptions, cb, PRIV(hspace_list),
6220               NOTACHAR);
6221           break;
6222 
6223           case ESC_H:
6224           (void)add_not_list_to_class(classbits, &class_uchardata,
6225             options & ~PCRE2_CASELESS, xoptions, cb, PRIV(hspace_list));
6226           break;
6227 
6228           case ESC_v:
6229           (void)add_list_to_class(classbits, &class_uchardata,
6230             options & ~PCRE2_CASELESS, xoptions, cb, PRIV(vspace_list),
6231               NOTACHAR);
6232           break;
6233 
6234           case ESC_V:
6235           (void)add_not_list_to_class(classbits, &class_uchardata,
6236             options & ~PCRE2_CASELESS, xoptions, cb, PRIV(vspace_list));
6237           break;
6238 
6239           /* If Unicode is not supported, \P and \p are not allowed and are
6240           faulted at parse time, so will never appear here. */
6241 
6242 #ifdef SUPPORT_UNICODE
6243           case ESC_p:
6244           case ESC_P:
6245             {
6246             uint32_t ptype = *(++pptr) >> 16;
6247             uint32_t pdata = *pptr & 0xffff;
6248             *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP;
6249             *class_uchardata++ = ptype;
6250             *class_uchardata++ = pdata;
6251             xclass_has_prop = TRUE;
6252             class_has_8bitchar--;                /* Undo! */
6253             }
6254           break;
6255 #endif
6256           }
6257 
6258         goto CONTINUE_CLASS;
6259         }  /* End handling \d-type escapes */
6260 
6261       /* A literal character may be followed by a range meta. At parse time
6262       there are checks for out-of-order characters, for ranges where the two
6263       characters are equal, and for hyphens that cannot indicate a range. At
6264       this point, therefore, no checking is needed. */
6265 
6266       else
6267         {
6268         uint32_t c, d;
6269 
6270         CLASS_LITERAL:
6271         c = d = meta;
6272 
6273         /* Remember if \r or \n were explicitly used */
6274 
6275         if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
6276 
6277         /* Process a character range */
6278 
6279         if (pptr[1] == META_RANGE_LITERAL || pptr[1] == META_RANGE_ESCAPED)
6280           {
6281 #ifdef EBCDIC
6282           BOOL range_is_literal = (pptr[1] == META_RANGE_LITERAL);
6283 #endif
6284           pptr += 2;
6285           d = *pptr;
6286           if (d == META_BIGVALUE) d = *(++pptr);
6287 
6288           /* Remember an explicit \r or \n, and add the range to the class. */
6289 
6290           if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
6291 
6292           /* In an EBCDIC environment, Perl treats alphabetic ranges specially
6293           because there are holes in the encoding, and simply using the range
6294           A-Z (for example) would include the characters in the holes. This
6295           applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
6296 
6297 #ifdef EBCDIC
6298           if (range_is_literal &&
6299                (cb->ctypes[c] & ctype_letter) != 0 &&
6300                (cb->ctypes[d] & ctype_letter) != 0 &&
6301                (c <= CHAR_z) == (d <= CHAR_z))
6302             {
6303             uint32_t uc = (d <= CHAR_z)? 0 : 64;
6304             uint32_t C = c - uc;
6305             uint32_t D = d - uc;
6306 
6307             if (C <= CHAR_i)
6308               {
6309               class_has_8bitchar +=
6310                 add_to_class(classbits, &class_uchardata, options, xoptions,
6311                   cb, C + uc, ((D < CHAR_i)? D : CHAR_i) + uc);
6312               C = CHAR_j;
6313               }
6314 
6315             if (C <= D && C <= CHAR_r)
6316               {
6317               class_has_8bitchar +=
6318                 add_to_class(classbits, &class_uchardata, options, xoptions,
6319                   cb, C + uc, ((D < CHAR_r)? D : CHAR_r) + uc);
6320               C = CHAR_s;
6321               }
6322 
6323             if (C <= D)
6324               {
6325               class_has_8bitchar +=
6326                 add_to_class(classbits, &class_uchardata, options, xoptions,
6327                   cb, C + uc, D + uc);
6328               }
6329             }
6330           else
6331 #endif
6332           /* Not an EBCDIC special range */
6333 
6334           class_has_8bitchar += add_to_class(classbits, &class_uchardata,
6335             options, xoptions, cb, c, d);
6336           goto CONTINUE_CLASS;   /* Go get the next char in the class */
6337           }  /* End of range handling */
6338 
6339 
6340         /* Handle a single character. */
6341 
6342         class_has_8bitchar +=
6343           add_to_class(classbits, &class_uchardata, options, xoptions, cb,
6344             meta, meta);
6345         }
6346 
6347       /* Continue to the next item in the class. */
6348 
6349       CONTINUE_CLASS:
6350 
6351 #ifdef SUPPORT_WIDE_CHARS
6352       /* If any wide characters or Unicode properties have been encountered,
6353       set xclass = TRUE. Then, in the pre-compile phase, accumulate the length
6354       of the extra data and reset the pointer. This is so that very large
6355       classes that contain a zillion wide characters or Unicode property tests
6356       do not overwrite the workspace (which is on the stack). */
6357 
6358       if (class_uchardata > class_uchardata_base)
6359         {
6360         xclass = TRUE;
6361         if (lengthptr != NULL)
6362           {
6363           *lengthptr += class_uchardata - class_uchardata_base;
6364           class_uchardata = class_uchardata_base;
6365           }
6366         }
6367 #endif
6368 
6369       continue;  /* Needed to avoid error when not supporting wide chars */
6370       }   /* End of main class-processing loop */
6371 
6372     /* If this class is the first thing in the branch, there can be no first
6373     char setting, whatever the repeat count. Any reqcu setting must remain
6374     unchanged after any kind of repeat. */
6375 
6376     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6377     zerofirstcu = firstcu;
6378     zerofirstcuflags = firstcuflags;
6379     zeroreqcu = reqcu;
6380     zeroreqcuflags = reqcuflags;
6381 
6382     /* If there are characters with values > 255, or Unicode property settings
6383     (\p or \P), we have to compile an extended class, with its own opcode,
6384     unless there were no property settings and there was a negated special such
6385     as \S in the class, and PCRE2_UCP is not set, because in that case all
6386     characters > 255 are in or not in the class, so any that were explicitly
6387     given as well can be ignored.
6388 
6389     In the UCP case, if certain negated POSIX classes (ex: [:^ascii:]) were
6390     present in a class, we either have to match or not match all wide
6391     characters (depending on whether the whole class is or is not negated).
6392     This requirement is indicated by match_all_or_no_wide_chars being true.
6393     We do this by including an explicit range, which works in both cases.
6394     This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there
6395     cannot be any wide characters in 8-bit non-UTF mode.
6396 
6397     When there *are* properties in a positive UTF-8 or any 16-bit or 32-bit
6398     class where \S etc is present without PCRE2_UCP, causing an extended class
6399     to be compiled, we make sure that all characters > 255 are included by
6400     forcing match_all_or_no_wide_chars to be true.
6401 
6402     If, when generating an xclass, there are no characters < 256, we can omit
6403     the bitmap in the actual compiled code. */
6404 
6405 #ifdef SUPPORT_WIDE_CHARS  /* Defined for 16/32 bits, or 8-bit with Unicode */
6406     if (xclass && (
6407 #ifdef SUPPORT_UNICODE
6408         (options & PCRE2_UCP) != 0 ||
6409 #endif
6410         xclass_has_prop || !should_flip_negation))
6411       {
6412       if (match_all_or_no_wide_chars || (
6413 #if PCRE2_CODE_UNIT_WIDTH == 8
6414            utf &&
6415 #endif
6416            should_flip_negation && !negate_class && (options & PCRE2_UCP) == 0))
6417         {
6418         *class_uchardata++ = XCL_RANGE;
6419         if (utf)   /* Will always be utf in the 8-bit library */
6420           {
6421           class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
6422           class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata);
6423           }
6424         else       /* Can only happen for the 16-bit & 32-bit libraries */
6425           {
6426 #if PCRE2_CODE_UNIT_WIDTH == 16
6427           *class_uchardata++ = 0x100;
6428           *class_uchardata++ = 0xffffu;
6429 #elif PCRE2_CODE_UNIT_WIDTH == 32
6430           *class_uchardata++ = 0x100;
6431           *class_uchardata++ = 0xffffffffu;
6432 #endif
6433           }
6434         }
6435       *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
6436       *code++ = OP_XCLASS;
6437       code += LINK_SIZE;
6438       *code = negate_class? XCL_NOT:0;
6439       if (xclass_has_prop) *code |= XCL_HASPROP;
6440 
6441       /* If the map is required, move up the extra data to make room for it;
6442       otherwise just move the code pointer to the end of the extra data. */
6443 
6444       if (class_has_8bitchar > 0)
6445         {
6446         *code++ |= XCL_MAP;
6447         (void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,
6448           CU2BYTES(class_uchardata - code));
6449         if (negate_class && !xclass_has_prop)
6450           {
6451           /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
6452           for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
6453           }
6454         memcpy(code, classbits, 32);
6455         code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
6456         }
6457       else code = class_uchardata;
6458 
6459       /* Now fill in the complete length of the item */
6460 
6461       PUT(previous, 1, (int)(code - previous));
6462       break;   /* End of class handling */
6463       }
6464 #endif  /* SUPPORT_WIDE_CHARS */
6465 
6466     /* If there are no characters > 255, or they are all to be included or
6467     excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
6468     whole class was negated and whether there were negative specials such as \S
6469     (non-UCP) in the class. Then copy the 32-byte map into the code vector,
6470     negating it if necessary. */
6471 
6472     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
6473     if (lengthptr == NULL)    /* Save time in the pre-compile phase */
6474       {
6475       if (negate_class)
6476         {
6477        /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
6478        for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
6479        }
6480       memcpy(code, classbits, 32);
6481       }
6482     code += 32 / sizeof(PCRE2_UCHAR);
6483     break;  /* End of class processing */
6484 
6485 
6486     /* ===================================================================*/
6487     /* Deal with (*VERB)s. */
6488 
6489     /* Check for open captures before ACCEPT and close those that are within
6490     the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an
6491     assertion. In the first pass, just accumulate the length required;
6492     otherwise hitting (*ACCEPT) inside many nested parentheses can cause
6493     workspace overflow. Do not set firstcu after *ACCEPT. */
6494 
6495     case META_ACCEPT:
6496     cb->had_accept = had_accept = TRUE;
6497     for (oc = open_caps;
6498          oc != NULL && oc->assert_depth >= cb->assert_depth;
6499          oc = oc->next)
6500       {
6501       if (lengthptr != NULL)
6502         {
6503         *lengthptr += CU2BYTES(1) + IMM2_SIZE;
6504         }
6505       else
6506         {
6507         *code++ = OP_CLOSE;
6508         PUT2INC(code, 0, oc->number);
6509         }
6510       }
6511     *code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6512     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6513     break;
6514 
6515     case META_PRUNE:
6516     case META_SKIP:
6517     cb->had_pruneorskip = TRUE;
6518     /* Fall through */
6519     case META_COMMIT:
6520     case META_FAIL:
6521     *code++ = verbops[(meta - META_MARK) >> 16];
6522     break;
6523 
6524     case META_THEN:
6525     cb->external_flags |= PCRE2_HASTHEN;
6526     *code++ = OP_THEN;
6527     break;
6528 
6529     /* Handle verbs with arguments. Arguments can be very long, especially in
6530     16- and 32-bit modes, and can overflow the workspace in the first pass.
6531     However, the argument length is constrained to be small enough to fit in
6532     one code unit. This check happens in parse_regex(). In the first pass,
6533     instead of putting the argument into memory, we just update the length
6534     counter and set up an empty argument. */
6535 
6536     case META_THEN_ARG:
6537     cb->external_flags |= PCRE2_HASTHEN;
6538     goto VERB_ARG;
6539 
6540     case META_PRUNE_ARG:
6541     case META_SKIP_ARG:
6542     cb->had_pruneorskip = TRUE;
6543     /* Fall through */
6544     case META_MARK:
6545     case META_COMMIT_ARG:
6546     VERB_ARG:
6547     *code++ = verbops[(meta - META_MARK) >> 16];
6548     /* The length is in characters. */
6549     verbarglen = *(++pptr);
6550     verbculen = 0;
6551     tempcode = code++;
6552     for (int i = 0; i < (int)verbarglen; i++)
6553       {
6554       meta = *(++pptr);
6555 #ifdef SUPPORT_UNICODE
6556       if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
6557 #endif
6558         {
6559         mclength = 1;
6560         mcbuffer[0] = meta;
6561         }
6562       if (lengthptr != NULL) *lengthptr += mclength; else
6563         {
6564         memcpy(code, mcbuffer, CU2BYTES(mclength));
6565         code += mclength;
6566         verbculen += mclength;
6567         }
6568       }
6569 
6570     *tempcode = verbculen;   /* Fill in the code unit length */
6571     *code++ = 0;             /* Terminating zero */
6572     break;
6573 
6574 
6575     /* ===================================================================*/
6576     /* Handle options change. The new setting must be passed back for use in
6577     subsequent branches. Reset the greedy defaults and the case value for
6578     firstcu and reqcu. */
6579 
6580     case META_OPTIONS:
6581     *optionsptr = options = *(++pptr);
6582     *xoptionsptr = xoptions = *(++pptr);
6583     greedy_default = ((options & PCRE2_UNGREEDY) != 0);
6584     greedy_non_default = greedy_default ^ 1;
6585     req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6586     break;
6587 
6588 
6589     /* ===================================================================*/
6590     /* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous
6591     because it could be a numerical check on recursion, or a name check on a
6592     group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that
6593     we can handle it either way. We first try for a name; if not found, process
6594     the number. */
6595 
6596     case META_COND_RNUMBER:   /* (?(Rdigits) */
6597     case META_COND_NAME:      /* (?(name) or (?('name') or (?(<name>) */
6598     case META_COND_RNAME:     /* (?(R&name) - test for recursion */
6599     bravalue = OP_COND;
6600       {
6601       int count, index;
6602       unsigned int i;
6603       PCRE2_SPTR name;
6604       named_group *ng = cb->named_groups;
6605       uint32_t length = *(++pptr);
6606 
6607       GETPLUSOFFSET(offset, pptr);
6608       name = cb->start_pattern + offset;
6609 
6610       /* In the first pass, the names generated in the pre-pass are available,
6611       but the main name table has not yet been created. Scan the list of names
6612       generated in the pre-pass in order to get a number and whether or not
6613       this name is duplicated. If it is not duplicated, we can handle it as a
6614       numerical group. */
6615 
6616       for (i = 0; i < cb->names_found; i++, ng++)
6617         {
6618         if (length == ng->length &&
6619             PRIV(strncmp)(name, ng->name, length) == 0)
6620           {
6621           if (!ng->isdup)
6622             {
6623             code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6624             PUT2(code, 2+LINK_SIZE, ng->number);
6625             if (ng->number > cb->top_backref) cb->top_backref = ng->number;
6626             skipunits = 1+IMM2_SIZE;
6627             goto GROUP_PROCESS_NOTE_EMPTY;
6628             }
6629           break;  /* Found a duplicated name */
6630           }
6631         }
6632 
6633       /* If the name was not found we have a bad reference, unless we are
6634       dealing with R<digits>, which is treated as a recursion test by number.
6635       */
6636 
6637       if (i >= cb->names_found)
6638         {
6639         groupnumber = 0;
6640         if (meta == META_COND_RNUMBER)
6641           {
6642           for (i = 1; i < length; i++)
6643             {
6644             groupnumber = groupnumber * 10 + name[i] - CHAR_0;
6645             if (groupnumber > MAX_GROUP_NUMBER)
6646               {
6647               *errorcodeptr = ERR61;
6648               cb->erroroffset = offset + i;
6649               return 0;
6650               }
6651             }
6652           }
6653 
6654         if (meta != META_COND_RNUMBER || groupnumber > cb->bracount)
6655           {
6656           *errorcodeptr = ERR15;
6657           cb->erroroffset = offset;
6658           return 0;
6659           }
6660 
6661         /* (?(Rdigits) is treated as a recursion test by number. A value of
6662         zero (which is the result of both (?(R) and (?(R0)) means "any", and is
6663         translated into RREF_ANY (which is 0xffff). */
6664 
6665         if (groupnumber == 0) groupnumber = RREF_ANY;
6666         code[1+LINK_SIZE] = OP_RREF;
6667         PUT2(code, 2+LINK_SIZE, groupnumber);
6668         skipunits = 1+IMM2_SIZE;
6669         goto GROUP_PROCESS_NOTE_EMPTY;
6670         }
6671 
6672       /* A duplicated name was found. Note that if an R<digits> name is found
6673       (META_COND_RNUMBER), it is a reference test, not a recursion test. */
6674 
6675       code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6676 
6677       /* We have a duplicated name. In the compile pass we have to search the
6678       main table in order to get the index and count values. */
6679 
6680       count = 0;  /* Values for first pass (avoids compiler warning) */
6681       index = 0;
6682       if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6683             &count, errorcodeptr, cb)) return 0;
6684 
6685       /* Add one to the opcode to change CREF/RREF into DNCREF/DNRREF and
6686       insert appropriate data values. */
6687 
6688       code[1+LINK_SIZE]++;
6689       skipunits = 1+2*IMM2_SIZE;
6690       PUT2(code, 2+LINK_SIZE, index);
6691       PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6692       }
6693     goto GROUP_PROCESS_NOTE_EMPTY;
6694 
6695     /* The DEFINE condition is always false. Its internal groups may never
6696     be called, so matched_char must remain false, hence the jump to
6697     GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
6698 
6699     case META_COND_DEFINE:
6700     bravalue = OP_COND;
6701     GETPLUSOFFSET(offset, pptr);
6702     code[1+LINK_SIZE] = OP_DEFINE;
6703     skipunits = 1;
6704     goto GROUP_PROCESS;
6705 
6706     /* Conditional test of a group's being set. */
6707 
6708     case META_COND_NUMBER:
6709     bravalue = OP_COND;
6710     GETPLUSOFFSET(offset, pptr);
6711     groupnumber = *(++pptr);
6712     if (groupnumber > cb->bracount)
6713       {
6714       *errorcodeptr = ERR15;
6715       cb->erroroffset = offset;
6716       return 0;
6717       }
6718     if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
6719     offset -= 2;   /* Point at initial ( for too many branches error */
6720     code[1+LINK_SIZE] = OP_CREF;
6721     skipunits = 1+IMM2_SIZE;
6722     PUT2(code, 2+LINK_SIZE, groupnumber);
6723     goto GROUP_PROCESS_NOTE_EMPTY;
6724 
6725     /* Test for the PCRE2 version. */
6726 
6727     case META_COND_VERSION:
6728     bravalue = OP_COND;
6729     if (pptr[1] > 0)
6730       code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) ||
6731         (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))?
6732           OP_TRUE : OP_FALSE;
6733     else
6734       code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])?
6735         OP_TRUE : OP_FALSE;
6736     skipunits = 1;
6737     pptr += 3;
6738     goto GROUP_PROCESS_NOTE_EMPTY;
6739 
6740     /* The condition is an assertion, possibly preceded by a callout. */
6741 
6742     case META_COND_ASSERT:
6743     bravalue = OP_COND;
6744     goto GROUP_PROCESS_NOTE_EMPTY;
6745 
6746 
6747     /* ===================================================================*/
6748     /* Handle all kinds of nested bracketed groups. The non-capturing,
6749     non-conditional cases are here; others come to GROUP_PROCESS via goto. */
6750 
6751     case META_LOOKAHEAD:
6752     bravalue = OP_ASSERT;
6753     cb->assert_depth += 1;
6754     goto GROUP_PROCESS;
6755 
6756     case META_LOOKAHEAD_NA:
6757     bravalue = OP_ASSERT_NA;
6758     cb->assert_depth += 1;
6759     goto GROUP_PROCESS;
6760 
6761     /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6762     thing to do, but Perl allows all assertions to be quantified, and when
6763     they contain capturing parentheses there may be a potential use for
6764     this feature. Not that that applies to a quantified (?!) but we allow
6765     it for uniformity. */
6766 
6767     case META_LOOKAHEADNOT:
6768     if (pptr[1] == META_KET &&
6769          (pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY))
6770       {
6771       *code++ = OP_FAIL;
6772       pptr++;
6773       }
6774     else
6775       {
6776       bravalue = OP_ASSERT_NOT;
6777       cb->assert_depth += 1;
6778       goto GROUP_PROCESS;
6779       }
6780     break;
6781 
6782     case META_LOOKBEHIND:
6783     bravalue = OP_ASSERTBACK;
6784     cb->assert_depth += 1;
6785     goto GROUP_PROCESS;
6786 
6787     case META_LOOKBEHINDNOT:
6788     bravalue = OP_ASSERTBACK_NOT;
6789     cb->assert_depth += 1;
6790     goto GROUP_PROCESS;
6791 
6792     case META_LOOKBEHIND_NA:
6793     bravalue = OP_ASSERTBACK_NA;
6794     cb->assert_depth += 1;
6795     goto GROUP_PROCESS;
6796 
6797     case META_ATOMIC:
6798     bravalue = OP_ONCE;
6799     goto GROUP_PROCESS_NOTE_EMPTY;
6800 
6801     case META_SCRIPT_RUN:
6802     bravalue = OP_SCRIPT_RUN;
6803     goto GROUP_PROCESS_NOTE_EMPTY;
6804 
6805     case META_NOCAPTURE:
6806     bravalue = OP_BRA;
6807     /* Fall through */
6808 
6809     /* Process nested bracketed regex. The nesting depth is maintained for the
6810     benefit of the stackguard function. The test for too deep nesting is now
6811     done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
6812     others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
6813     note of whether or not they may match an empty string. */
6814 
6815     GROUP_PROCESS_NOTE_EMPTY:
6816     note_group_empty = TRUE;
6817 
6818     GROUP_PROCESS:
6819     cb->parens_depth += 1;
6820     *code = bravalue;
6821     pptr++;
6822     tempcode = code;
6823     tempreqvary = cb->req_varyopt;        /* Save value before group */
6824     length_prevgroup = 0;                 /* Initialize for pre-compile phase */
6825 
6826     if ((group_return =
6827          compile_regex(
6828          options,                         /* The options state */
6829          xoptions,                        /* The extra options state */
6830          &tempcode,                       /* Where to put code (updated) */
6831          &pptr,                           /* Input pointer (updated) */
6832          errorcodeptr,                    /* Where to put an error message */
6833          skipunits,                       /* Skip over bracket number */
6834          &subfirstcu,                     /* For possible first char */
6835          &subfirstcuflags,
6836          &subreqcu,                       /* For possible last char */
6837          &subreqcuflags,
6838          bcptr,                           /* Current branch chain */
6839          open_caps,                       /* Pointer to capture stack */
6840          cb,                              /* Compile data block */
6841          (lengthptr == NULL)? NULL :      /* Actual compile phase */
6842            &length_prevgroup              /* Pre-compile phase */
6843          )) == 0)
6844       return 0;  /* Error */
6845 
6846     cb->parens_depth -= 1;
6847 
6848     /* If that was a non-conditional significant group (not an assertion, not a
6849     DEFINE) that matches at least one character, then the current item matches
6850     a character. Conditionals are handled below. */
6851 
6852     if (note_group_empty && bravalue != OP_COND && group_return > 0)
6853       matched_char = TRUE;
6854 
6855     /* If we've just compiled an assertion, pop the assert depth. */
6856 
6857     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NA)
6858       cb->assert_depth -= 1;
6859 
6860     /* At the end of compiling, code is still pointing to the start of the
6861     group, while tempcode has been updated to point past the end of the group.
6862     The parsed pattern pointer (pptr) is on the closing META_KET.
6863 
6864     If this is a conditional bracket, check that there are no more than
6865     two branches in the group, or just one if it's a DEFINE group. We do this
6866     in the real compile phase, not in the pre-pass, where the whole group may
6867     not be available. */
6868 
6869     if (bravalue == OP_COND && lengthptr == NULL)
6870       {
6871       PCRE2_UCHAR *tc = code;
6872       int condcount = 0;
6873 
6874       do {
6875          condcount++;
6876          tc += GET(tc,1);
6877          }
6878       while (*tc != OP_KET);
6879 
6880       /* A DEFINE group is never obeyed inline (the "condition" is always
6881       false). It must have only one branch. Having checked this, change the
6882       opcode to OP_FALSE. */
6883 
6884       if (code[LINK_SIZE+1] == OP_DEFINE)
6885         {
6886         if (condcount > 1)
6887           {
6888           cb->erroroffset = offset;
6889           *errorcodeptr = ERR54;
6890           return 0;
6891           }
6892         code[LINK_SIZE+1] = OP_FALSE;
6893         bravalue = OP_DEFINE;   /* A flag to suppress char handling below */
6894         }
6895 
6896       /* A "normal" conditional group. If there is just one branch, we must not
6897       make use of its firstcu or reqcu, because this is equivalent to an
6898       empty second branch. Also, it may match an empty string. If there are two
6899       branches, this item must match a character if the group must. */
6900 
6901       else
6902         {
6903         if (condcount > 2)
6904           {
6905           cb->erroroffset = offset;
6906           *errorcodeptr = ERR27;
6907           return 0;
6908           }
6909         if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
6910           else if (group_return > 0) matched_char = TRUE;
6911         }
6912       }
6913 
6914     /* In the pre-compile phase, update the length by the length of the group,
6915     less the brackets at either end. Then reduce the compiled code to just a
6916     set of non-capturing brackets so that it doesn't use much memory if it is
6917     duplicated by a quantifier. */
6918 
6919     if (lengthptr != NULL)
6920       {
6921       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
6922         {
6923         *errorcodeptr = ERR20;
6924         return 0;
6925         }
6926       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
6927       code++;   /* This already contains bravalue */
6928       PUTINC(code, 0, 1 + LINK_SIZE);
6929       *code++ = OP_KET;
6930       PUTINC(code, 0, 1 + LINK_SIZE);
6931       break;    /* No need to waste time with special character handling */
6932       }
6933 
6934     /* Otherwise update the main code pointer to the end of the group. */
6935 
6936     code = tempcode;
6937 
6938     /* For a DEFINE group, required and first character settings are not
6939     relevant. */
6940 
6941     if (bravalue == OP_DEFINE) break;
6942 
6943     /* Handle updating of the required and first code units for other types of
6944     group. Update for normal brackets of all kinds, and conditions with two
6945     branches (see code above). If the bracket is followed by a quantifier with
6946     zero repeat, we have to back off. Hence the definition of zeroreqcu and
6947     zerofirstcu outside the main loop so that they can be accessed for the back
6948     off. */
6949 
6950     zeroreqcu = reqcu;
6951     zeroreqcuflags = reqcuflags;
6952     zerofirstcu = firstcu;
6953     zerofirstcuflags = firstcuflags;
6954     groupsetfirstcu = FALSE;
6955 
6956     if (bravalue >= OP_ONCE)  /* Not an assertion */
6957       {
6958       /* If we have not yet set a firstcu in this branch, take it from the
6959       subpattern, remembering that it was set here so that a repeat of more
6960       than one can replicate it as reqcu if necessary. If the subpattern has
6961       no firstcu, set "none" for the whole branch. In both cases, a zero
6962       repeat forces firstcu to "none". */
6963 
6964       if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
6965         {
6966         if (subfirstcuflags < REQ_NONE)
6967           {
6968           firstcu = subfirstcu;
6969           firstcuflags = subfirstcuflags;
6970           groupsetfirstcu = TRUE;
6971           }
6972         else firstcuflags = REQ_NONE;
6973         zerofirstcuflags = REQ_NONE;
6974         }
6975 
6976       /* If firstcu was previously set, convert the subpattern's firstcu
6977       into reqcu if there wasn't one, using the vary flag that was in
6978       existence beforehand. */
6979 
6980       else if (subfirstcuflags < REQ_NONE && subreqcuflags >= REQ_NONE)
6981         {
6982         subreqcu = subfirstcu;
6983         subreqcuflags = subfirstcuflags | tempreqvary;
6984         }
6985 
6986       /* If the subpattern set a required code unit (or set a first code unit
6987       that isn't really the first code unit - see above), set it. */
6988 
6989       if (subreqcuflags < REQ_NONE)
6990         {
6991         reqcu = subreqcu;
6992         reqcuflags = subreqcuflags;
6993         }
6994       }
6995 
6996     /* For a forward assertion, we take the reqcu, if set, provided that the
6997     group has also set a firstcu. This can be helpful if the pattern that
6998     follows the assertion doesn't set a different char. For example, it's
6999     useful for /(?=abcde).+/. However, we can't set firstcu for an assertion
7000     because that has the wrong effect for patterns such as /(?=a)a.+/, where
7001     the "real" "a" would then become a reqcu instead of a firstcu. This is
7002     overcome by a scan at the end if there's no firstcu, looking for an
7003     asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
7004     we must only take the reqcu when the group also set a firstcu. Otherwise,
7005     in that example, 'X' ends up set for both. */
7006 
7007     else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) &&
7008              subreqcuflags < REQ_NONE && subfirstcuflags < REQ_NONE)
7009       {
7010       reqcu = subreqcu;
7011       reqcuflags = subreqcuflags;
7012       }
7013 
7014     break;  /* End of nested group handling */
7015 
7016 
7017     /* ===================================================================*/
7018     /* Handle named backreferences and recursions. */
7019 
7020     case META_BACKREF_BYNAME:
7021     case META_RECURSE_BYNAME:
7022       {
7023       int count, index;
7024       PCRE2_SPTR name;
7025       BOOL is_dupname = FALSE;
7026       named_group *ng = cb->named_groups;
7027       uint32_t length = *(++pptr);
7028 
7029       GETPLUSOFFSET(offset, pptr);
7030       name = cb->start_pattern + offset;
7031 
7032       /* In the first pass, the names generated in the pre-pass are available,
7033       but the main name table has not yet been created. Scan the list of names
7034       generated in the pre-pass in order to get a number and whether or not
7035       this name is duplicated. */
7036 
7037       groupnumber = 0;
7038       for (unsigned int i = 0; i < cb->names_found; i++, ng++)
7039         {
7040         if (length == ng->length &&
7041             PRIV(strncmp)(name, ng->name, length) == 0)
7042           {
7043           is_dupname = ng->isdup;
7044           groupnumber = ng->number;
7045 
7046           /* For a recursion, that's all that is needed. We can now go to
7047           the code that handles numerical recursion, applying it to the first
7048           group with the given name. */
7049 
7050           if (meta == META_RECURSE_BYNAME)
7051             {
7052             meta_arg = groupnumber;
7053             goto HANDLE_NUMERICAL_RECURSION;
7054             }
7055 
7056           /* For a back reference, update the back reference map and the
7057           maximum back reference. */
7058 
7059           cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
7060           if (groupnumber > cb->top_backref)
7061             cb->top_backref = groupnumber;
7062           }
7063         }
7064 
7065       /* If the name was not found we have a bad reference. */
7066 
7067       if (groupnumber == 0)
7068         {
7069         *errorcodeptr = ERR15;
7070         cb->erroroffset = offset;
7071         return 0;
7072         }
7073 
7074       /* If a back reference name is not duplicated, we can handle it as
7075       a numerical reference. */
7076 
7077       if (!is_dupname)
7078         {
7079         meta_arg = groupnumber;
7080         goto HANDLE_SINGLE_REFERENCE;
7081         }
7082 
7083       /* If a back reference name is duplicated, we generate a different
7084       opcode to a numerical back reference. In the second pass we must
7085       search for the index and count in the final name table. */
7086 
7087       count = 0;  /* Values for first pass (avoids compiler warning) */
7088       index = 0;
7089       if (lengthptr == NULL && !find_dupname_details(name, length, &index,
7090             &count, errorcodeptr, cb)) return 0;
7091 
7092       if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7093       *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
7094       PUT2INC(code, 0, index);
7095       PUT2INC(code, 0, count);
7096       }
7097     break;
7098 
7099 
7100     /* ===================================================================*/
7101     /* Handle a numerical callout. */
7102 
7103     case META_CALLOUT_NUMBER:
7104     code[0] = OP_CALLOUT;
7105     PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
7106     PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
7107     code[1 + 2*LINK_SIZE] = pptr[3];
7108     pptr += 3;
7109     code += PRIV(OP_lengths)[OP_CALLOUT];
7110     break;
7111 
7112 
7113     /* ===================================================================*/
7114     /* Handle a callout with a string argument. In the pre-pass we just compute
7115     the length without generating anything. The length in pptr[3] includes both
7116     delimiters; in the actual compile only the first one is copied, but a
7117     terminating zero is added. Any doubled delimiters within the string make
7118     this an overestimate, but it is not worth bothering about. */
7119 
7120     case META_CALLOUT_STRING:
7121     if (lengthptr != NULL)
7122       {
7123       *lengthptr += pptr[3] + (1 + 4*LINK_SIZE);
7124       pptr += 3;
7125       SKIPOFFSET(pptr);
7126       }
7127 
7128     /* In the real compile we can copy the string. The starting delimiter is
7129      included so that the client can discover it if they want. We also pass the
7130      start offset to help a script language give better error messages. */
7131 
7132     else
7133       {
7134       PCRE2_SPTR pp;
7135       uint32_t delimiter;
7136       uint32_t length = pptr[3];
7137       PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
7138 
7139       code[0] = OP_CALLOUT_STR;
7140       PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
7141       PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
7142 
7143       pptr += 3;
7144       GETPLUSOFFSET(offset, pptr);         /* Offset to string in pattern */
7145       pp = cb->start_pattern + offset;
7146       delimiter = *callout_string++ = *pp++;
7147       if (delimiter == CHAR_LEFT_CURLY_BRACKET)
7148         delimiter = CHAR_RIGHT_CURLY_BRACKET;
7149       PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1));  /* One after delimiter */
7150 
7151       /* The syntax of the pattern was checked in the parsing scan. The length
7152       includes both delimiters, but we have passed the opening one just above,
7153       so we reduce length before testing it. The test is for > 1 because we do
7154       not want to copy the final delimiter. This also ensures that pp[1] is
7155       accessible. */
7156 
7157       while (--length > 1)
7158         {
7159         if (*pp == delimiter && pp[1] == delimiter)
7160           {
7161           *callout_string++ = delimiter;
7162           pp += 2;
7163           length--;
7164           }
7165         else *callout_string++ = *pp++;
7166         }
7167       *callout_string++ = CHAR_NUL;
7168 
7169       /* Set the length of the entire item, then advance to its end. */
7170 
7171       PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code));
7172       code = callout_string;
7173       }
7174     break;
7175 
7176 
7177     /* ===================================================================*/
7178     /* Handle repetition. The different types are all sorted out in the parsing
7179     pass. */
7180 
7181     case META_MINMAX_PLUS:
7182     case META_MINMAX_QUERY:
7183     case META_MINMAX:
7184     repeat_min = *(++pptr);
7185     repeat_max = *(++pptr);
7186     goto REPEAT;
7187 
7188     case META_ASTERISK:
7189     case META_ASTERISK_PLUS:
7190     case META_ASTERISK_QUERY:
7191     repeat_min = 0;
7192     repeat_max = REPEAT_UNLIMITED;
7193     goto REPEAT;
7194 
7195     case META_PLUS:
7196     case META_PLUS_PLUS:
7197     case META_PLUS_QUERY:
7198     repeat_min = 1;
7199     repeat_max = REPEAT_UNLIMITED;
7200     goto REPEAT;
7201 
7202     case META_QUERY:
7203     case META_QUERY_PLUS:
7204     case META_QUERY_QUERY:
7205     repeat_min = 0;
7206     repeat_max = 1;
7207 
7208     REPEAT:
7209     if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
7210 
7211     /* Remember whether this is a variable length repeat, and default to
7212     single-char opcodes. */
7213 
7214     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
7215     op_type = 0;
7216 
7217     /* Adjust first and required code units for a zero repeat. */
7218 
7219     if (repeat_min == 0)
7220       {
7221       firstcu = zerofirstcu;
7222       firstcuflags = zerofirstcuflags;
7223       reqcu = zeroreqcu;
7224       reqcuflags = zeroreqcuflags;
7225       }
7226 
7227     /* Note the greediness and possessiveness. */
7228 
7229     switch (meta)
7230       {
7231       case META_MINMAX_PLUS:
7232       case META_ASTERISK_PLUS:
7233       case META_PLUS_PLUS:
7234       case META_QUERY_PLUS:
7235       repeat_type = 0;                  /* Force greedy */
7236       possessive_quantifier = TRUE;
7237       break;
7238 
7239       case META_MINMAX_QUERY:
7240       case META_ASTERISK_QUERY:
7241       case META_PLUS_QUERY:
7242       case META_QUERY_QUERY:
7243       repeat_type = greedy_non_default;
7244       possessive_quantifier = FALSE;
7245       break;
7246 
7247       default:
7248       repeat_type = greedy_default;
7249       possessive_quantifier = FALSE;
7250       break;
7251       }
7252 
7253     /* Save start of previous item, in case we have to move it up in order to
7254     insert something before it, and remember what it was. */
7255 
7256     tempcode = previous;
7257     op_previous = *previous;
7258 
7259     /* Now handle repetition for the different types of item. If the repeat
7260     minimum and the repeat maximum are both 1, we can ignore the quantifier for
7261     non-parenthesized items, as they have only one alternative. For anything in
7262     parentheses, we must not ignore if {1} is possessive. */
7263 
7264     switch (op_previous)
7265       {
7266       /* If previous was a character or negated character match, abolish the
7267       item and generate a repeat item instead. If a char item has a minimum of
7268       more than one, ensure that it is set in reqcu - it might not be if a
7269       sequence such as x{3} is the first thing in a branch because the x will
7270       have gone into firstcu instead.  */
7271 
7272       case OP_CHAR:
7273       case OP_CHARI:
7274       case OP_NOT:
7275       case OP_NOTI:
7276       if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7277       op_type = chartypeoffset[op_previous - OP_CHAR];
7278 
7279       /* Deal with UTF characters that take up more than one code unit. */
7280 
7281 #ifdef MAYBE_UTF_MULTI
7282       if (utf && NOT_FIRSTCU(code[-1]))
7283         {
7284         PCRE2_UCHAR *lastchar = code - 1;
7285         BACKCHAR(lastchar);
7286         mclength = (uint32_t)(code - lastchar);   /* Length of UTF character */
7287         memcpy(mcbuffer, lastchar, CU2BYTES(mclength));  /* Save the char */
7288         }
7289       else
7290 #endif  /* MAYBE_UTF_MULTI */
7291 
7292       /* Handle the case of a single code unit - either with no UTF support, or
7293       with UTF disabled, or for a single-code-unit UTF character. In the latter
7294       case, for a repeated positive match, get the caseless flag for the
7295       required code unit from the previous character, because a class like [Aa]
7296       sets a caseless A but by now the req_caseopt flag has been reset. */
7297 
7298         {
7299         mcbuffer[0] = code[-1];
7300         mclength = 1;
7301         if (op_previous <= OP_CHARI && repeat_min > 1)
7302           {
7303           reqcu = mcbuffer[0];
7304           reqcuflags = cb->req_varyopt;
7305           if (op_previous == OP_CHARI) reqcuflags |= REQ_CASELESS;
7306           }
7307         }
7308       goto OUTPUT_SINGLE_REPEAT;  /* Code shared with single character types */
7309 
7310       /* If previous was a character class or a back reference, we put the
7311       repeat stuff after it, but just skip the item if the repeat was {0,0}. */
7312 
7313 #ifdef SUPPORT_WIDE_CHARS
7314       case OP_XCLASS:
7315 #endif
7316       case OP_CLASS:
7317       case OP_NCLASS:
7318       case OP_REF:
7319       case OP_REFI:
7320       case OP_DNREF:
7321       case OP_DNREFI:
7322 
7323       if (repeat_max == 0)
7324         {
7325         code = previous;
7326         goto END_REPEAT;
7327         }
7328       if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7329 
7330       if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)
7331         *code++ = OP_CRSTAR + repeat_type;
7332       else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED)
7333         *code++ = OP_CRPLUS + repeat_type;
7334       else if (repeat_min == 0 && repeat_max == 1)
7335         *code++ = OP_CRQUERY + repeat_type;
7336       else
7337         {
7338         *code++ = OP_CRRANGE + repeat_type;
7339         PUT2INC(code, 0, repeat_min);
7340         if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0;  /* 2-byte encoding for max */
7341         PUT2INC(code, 0, repeat_max);
7342         }
7343       break;
7344 
7345       /* If previous is OP_FAIL, it was generated by an empty class []
7346       (PCRE2_ALLOW_EMPTY_CLASS is set). The other ways in which OP_FAIL can be
7347       generated, that is by (*FAIL) or (?!), disallow a quantifier at parse
7348       time. We can just ignore this repeat. */
7349 
7350       case OP_FAIL:
7351       goto END_REPEAT;
7352 
7353       /* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets
7354       because pcre2_match() could not handle backtracking into recursively
7355       called groups. Now that this backtracking is available, we no longer need
7356       to do this. However, we still need to replicate recursions as we do for
7357       groups so as to have independent backtracking points. We can replicate
7358       for the minimum number of repeats directly. For optional repeats we now
7359       wrap the recursion in OP_BRA brackets and make use of the bracket
7360       repetition. */
7361 
7362       case OP_RECURSE:
7363       if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7364         goto END_REPEAT;
7365 
7366       /* Generate unwrapped repeats for a non-zero minimum, except when the
7367       minimum is 1 and the maximum unlimited, because that can be handled with
7368       OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the
7369       minimum, we just need to generate the appropriate additional copies.
7370       Otherwise we need to generate one more, to simulate the situation when
7371       the minimum is zero. */
7372 
7373       if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED))
7374         {
7375         int replicate = repeat_min;
7376         if (repeat_min == repeat_max) replicate--;
7377 
7378         /* In the pre-compile phase, we don't actually do the replication. We
7379         just adjust the length as if we had. Do some paranoid checks for
7380         potential integer overflow. */
7381 
7382         if (lengthptr != NULL)
7383           {
7384           PCRE2_SIZE delta;
7385           if (PRIV(ckd_smul)(&delta, replicate, 1 + LINK_SIZE) ||
7386               OFLOW_MAX - *lengthptr < delta)
7387             {
7388             *errorcodeptr = ERR20;
7389             return 0;
7390             }
7391           *lengthptr += delta;
7392           }
7393 
7394         else for (int i = 0; i < replicate; i++)
7395           {
7396           memcpy(code, previous, CU2BYTES(1 + LINK_SIZE));
7397           previous = code;
7398           code += 1 + LINK_SIZE;
7399           }
7400 
7401         /* If the number of repeats is fixed, we are done. Otherwise, adjust
7402         the counts and fall through. */
7403 
7404         if (repeat_min == repeat_max) break;
7405         if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7406         repeat_min = 0;
7407         }
7408 
7409       /* Wrap the recursion call in OP_BRA brackets. */
7410 
7411       (void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE));
7412       op_previous = *previous = OP_BRA;
7413       PUT(previous, 1, 2 + 2*LINK_SIZE);
7414       previous[2 + 2*LINK_SIZE] = OP_KET;
7415       PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
7416       code += 2 + 2 * LINK_SIZE;
7417       length_prevgroup = 3 + 3*LINK_SIZE;
7418       group_return = -1;  /* Set "may match empty string" */
7419 
7420       /* Now treat as a repeated OP_BRA. */
7421       /* Fall through */
7422 
7423       /* If previous was a bracket group, we may have to replicate it in
7424       certain cases. Note that at this point we can encounter only the "basic"
7425       bracket opcodes such as BRA and CBRA, as this is the place where they get
7426       converted into the more special varieties such as BRAPOS and SBRA.
7427       Originally, PCRE did not allow repetition of assertions, but now it does,
7428       for Perl compatibility. */
7429 
7430       case OP_ASSERT:
7431       case OP_ASSERT_NOT:
7432       case OP_ASSERT_NA:
7433       case OP_ASSERTBACK:
7434       case OP_ASSERTBACK_NOT:
7435       case OP_ASSERTBACK_NA:
7436       case OP_ONCE:
7437       case OP_SCRIPT_RUN:
7438       case OP_BRA:
7439       case OP_CBRA:
7440       case OP_COND:
7441         {
7442         int len = (int)(code - previous);
7443         PCRE2_UCHAR *bralink = NULL;
7444         PCRE2_UCHAR *brazeroptr = NULL;
7445 
7446         if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7447           goto END_REPEAT;
7448 
7449         /* Repeating a DEFINE group (or any group where the condition is always
7450         FALSE and there is only one branch) is pointless, but Perl allows the
7451         syntax, so we just ignore the repeat. */
7452 
7453         if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
7454             previous[GET(previous, 1)] != OP_ALT)
7455           goto END_REPEAT;
7456 
7457         /* Perl allows all assertions to be quantified, and when they contain
7458         capturing parentheses and/or are optional there are potential uses for
7459         this feature. PCRE2 used to force the maximum quantifier to 1 on the
7460         invalid grounds that further repetition was never useful. This was
7461         always a bit pointless, since an assertion could be wrapped with a
7462         repeated group to achieve the effect. General repetition is now
7463         permitted, but if the maximum is unlimited it is set to one more than
7464         the minimum. */
7465 
7466         if (op_previous < OP_ONCE)    /* Assertion */
7467           {
7468           if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1;
7469           }
7470 
7471         /* The case of a zero minimum is special because of the need to stick
7472         OP_BRAZERO in front of it, and because the group appears once in the
7473         data, whereas in other cases it appears the minimum number of times. For
7474         this reason, it is simplest to treat this case separately, as otherwise
7475         the code gets far too messy. There are several special subcases when the
7476         minimum is zero. */
7477 
7478         if (repeat_min == 0)
7479           {
7480           /* If the maximum is also zero, we used to just omit the group from
7481           the output altogether, like this:
7482 
7483           ** if (repeat_max == 0)
7484           **   {
7485           **   code = previous;
7486           **   goto END_REPEAT;
7487           **   }
7488 
7489           However, that fails when a group or a subgroup within it is
7490           referenced as a subroutine from elsewhere in the pattern, so now we
7491           stick in OP_SKIPZERO in front of it so that it is skipped on
7492           execution. As we don't have a list of which groups are referenced, we
7493           cannot do this selectively.
7494 
7495           If the maximum is 1 or unlimited, we just have to stick in the
7496           BRAZERO and do no more at this point. */
7497 
7498           if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED)
7499             {
7500             (void)memmove(previous + 1, previous, CU2BYTES(len));
7501             code++;
7502             if (repeat_max == 0)
7503               {
7504               *previous++ = OP_SKIPZERO;
7505               goto END_REPEAT;
7506               }
7507             brazeroptr = previous;    /* Save for possessive optimizing */
7508             *previous++ = OP_BRAZERO + repeat_type;
7509             }
7510 
7511           /* If the maximum is greater than 1 and limited, we have to replicate
7512           in a nested fashion, sticking OP_BRAZERO before each set of brackets.
7513           The first one has to be handled carefully because it's the original
7514           copy, which has to be moved up. The remainder can be handled by code
7515           that is common with the non-zero minimum case below. We have to
7516           adjust the value of repeat_max, since one less copy is required. */
7517 
7518           else
7519             {
7520             int linkoffset;
7521             (void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
7522             code += 2 + LINK_SIZE;
7523             *previous++ = OP_BRAZERO + repeat_type;
7524             *previous++ = OP_BRA;
7525 
7526             /* We chain together the bracket link offset fields that have to be
7527             filled in later when the ends of the brackets are reached. */
7528 
7529             linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink);
7530             bralink = previous;
7531             PUTINC(previous, 0, linkoffset);
7532             }
7533 
7534           if (repeat_max != REPEAT_UNLIMITED) repeat_max--;
7535           }
7536 
7537         /* If the minimum is greater than zero, replicate the group as many
7538         times as necessary, and adjust the maximum to the number of subsequent
7539         copies that we need. */
7540 
7541         else
7542           {
7543           if (repeat_min > 1)
7544             {
7545             /* In the pre-compile phase, we don't actually do the replication.
7546             We just adjust the length as if we had. Do some paranoid checks for
7547             potential integer overflow. */
7548 
7549             if (lengthptr != NULL)
7550               {
7551               PCRE2_SIZE delta;
7552               if (PRIV(ckd_smul)(&delta, repeat_min - 1, length_prevgroup) ||
7553                   OFLOW_MAX - *lengthptr < delta)
7554                 {
7555                 *errorcodeptr = ERR20;
7556                 return 0;
7557                 }
7558               *lengthptr += delta;
7559               }
7560 
7561             /* This is compiling for real. If there is a set first code unit
7562             for the group, and we have not yet set a "required code unit", set
7563             it. */
7564 
7565             else
7566               {
7567               if (groupsetfirstcu && reqcuflags >= REQ_NONE)
7568                 {
7569                 reqcu = firstcu;
7570                 reqcuflags = firstcuflags;
7571                 }
7572               for (uint32_t i = 1; i < repeat_min; i++)
7573                 {
7574                 memcpy(code, previous, CU2BYTES(len));
7575                 code += len;
7576                 }
7577               }
7578             }
7579 
7580           if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7581           }
7582 
7583         /* This code is common to both the zero and non-zero minimum cases. If
7584         the maximum is limited, it replicates the group in a nested fashion,
7585         remembering the bracket starts on a stack. In the case of a zero
7586         minimum, the first one was set up above. In all cases the repeat_max
7587         now specifies the number of additional copies needed. Again, we must
7588         remember to replicate entries on the forward reference list. */
7589 
7590         if (repeat_max != REPEAT_UNLIMITED)
7591           {
7592           /* In the pre-compile phase, we don't actually do the replication. We
7593           just adjust the length as if we had. For each repetition we must add
7594           1 to the length for BRAZERO and for all but the last repetition we
7595           must add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
7596           paranoid checks to avoid integer overflow. */
7597 
7598           if (lengthptr != NULL && repeat_max > 0)
7599             {
7600             PCRE2_SIZE delta;
7601             if (PRIV(ckd_smul)(&delta, repeat_max,
7602                                length_prevgroup + 1 + 2 + 2*LINK_SIZE) ||
7603                 OFLOW_MAX + (2 + 2*LINK_SIZE) - *lengthptr < delta)
7604               {
7605               *errorcodeptr = ERR20;
7606               return 0;
7607               }
7608             delta -= (2 + 2*LINK_SIZE);   /* Last one doesn't nest */
7609             *lengthptr += delta;
7610             }
7611 
7612           /* This is compiling for real */
7613 
7614           else for (uint32_t i = repeat_max; i >= 1; i--)
7615             {
7616             *code++ = OP_BRAZERO + repeat_type;
7617 
7618             /* All but the final copy start a new nesting, maintaining the
7619             chain of brackets outstanding. */
7620 
7621             if (i != 1)
7622               {
7623               int linkoffset;
7624               *code++ = OP_BRA;
7625               linkoffset = (bralink == NULL)? 0 : (int)(code - bralink);
7626               bralink = code;
7627               PUTINC(code, 0, linkoffset);
7628               }
7629 
7630             memcpy(code, previous, CU2BYTES(len));
7631             code += len;
7632             }
7633 
7634           /* Now chain through the pending brackets, and fill in their length
7635           fields (which are holding the chain links pro tem). */
7636 
7637           while (bralink != NULL)
7638             {
7639             int oldlinkoffset;
7640             int linkoffset = (int)(code - bralink + 1);
7641             PCRE2_UCHAR *bra = code - linkoffset;
7642             oldlinkoffset = GET(bra, 1);
7643             bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
7644             *code++ = OP_KET;
7645             PUTINC(code, 0, linkoffset);
7646             PUT(bra, 1, linkoffset);
7647             }
7648           }
7649 
7650         /* If the maximum is unlimited, set a repeater in the final copy. For
7651         SCRIPT_RUN and ONCE brackets, that's all we need to do. However,
7652         possessively repeated ONCE brackets can be converted into non-capturing
7653         brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this
7654         saves having to deal with possessive ONCEs specially.
7655 
7656         Otherwise, when we are doing the actual compile phase, check to see
7657         whether this group is one that could match an empty string. If so,
7658         convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
7659         that runtime checking can be done. [This check is also applied to ONCE
7660         and SCRIPT_RUN groups at runtime, but in a different way.]
7661 
7662         Then, if the quantifier was possessive and the bracket is not a
7663         conditional, we convert the BRA code to the POS form, and the KET code
7664         to KETRPOS. (It turns out to be convenient at runtime to detect this
7665         kind of subpattern at both the start and at the end.) The use of
7666         special opcodes makes it possible to reduce greatly the stack usage in
7667         pcre2_match(). If the group is preceded by OP_BRAZERO, convert this to
7668         OP_BRAPOSZERO.
7669 
7670         Then, if the minimum number of matches is 1 or 0, cancel the possessive
7671         flag so that the default action below, of wrapping everything inside
7672         atomic brackets, does not happen. When the minimum is greater than 1,
7673         there will be earlier copies of the group, and so we still have to wrap
7674         the whole thing. */
7675 
7676         else
7677           {
7678           PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
7679           PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
7680 
7681           /* Convert possessive ONCE brackets to non-capturing */
7682 
7683           if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
7684 
7685           /* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need
7686           to do is to set the KET. */
7687 
7688           if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN)
7689             *ketcode = OP_KETRMAX + repeat_type;
7690 
7691           /* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs
7692           (which have been converted to non-capturing above). */
7693 
7694           else
7695             {
7696             /* In the compile phase, adjust the opcode if the group can match
7697             an empty string. For a conditional group with only one branch, the
7698             value of group_return will not show "could be empty", so we must
7699             check that separately. */
7700 
7701             if (lengthptr == NULL)
7702               {
7703               if (group_return < 0) *bracode += OP_SBRA - OP_BRA;
7704               if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
7705                 *bracode = OP_SCOND;
7706               }
7707 
7708             /* Handle possessive quantifiers. */
7709 
7710             if (possessive_quantifier)
7711               {
7712               /* For COND brackets, we wrap the whole thing in a possessively
7713               repeated non-capturing bracket, because we have not invented POS
7714               versions of the COND opcodes. */
7715 
7716               if (*bracode == OP_COND || *bracode == OP_SCOND)
7717                 {
7718                 int nlen = (int)(code - bracode);
7719                 (void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
7720                 code += 1 + LINK_SIZE;
7721                 nlen += 1 + LINK_SIZE;
7722                 *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
7723                 *code++ = OP_KETRPOS;
7724                 PUTINC(code, 0, nlen);
7725                 PUT(bracode, 1, nlen);
7726                 }
7727 
7728               /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
7729 
7730               else
7731                 {
7732                 *bracode += 1;              /* Switch to xxxPOS opcodes */
7733                 *ketcode = OP_KETRPOS;
7734                 }
7735 
7736               /* If the minimum is zero, mark it as possessive, then unset the
7737               possessive flag when the minimum is 0 or 1. */
7738 
7739               if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
7740               if (repeat_min < 2) possessive_quantifier = FALSE;
7741               }
7742 
7743             /* Non-possessive quantifier */
7744 
7745             else *ketcode = OP_KETRMAX + repeat_type;
7746             }
7747           }
7748         }
7749       break;
7750 
7751       /* If previous was a character type match (\d or similar), abolish it and
7752       create a suitable repeat item. The code is shared with single-character
7753       repeats by setting op_type to add a suitable offset into repeat_type.
7754       Note that the Unicode property types will be present only when
7755       SUPPORT_UNICODE is defined, but we don't wrap the little bits of code
7756       here because it just makes it horribly messy. */
7757 
7758       default:
7759       if (op_previous >= OP_EODN)   /* Not a character type - internal error */
7760         {
7761         *errorcodeptr = ERR10;
7762         return 0;
7763         }
7764       else
7765         {
7766         int prop_type, prop_value;
7767         PCRE2_UCHAR *oldcode;
7768 
7769         if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7770 
7771         op_type = OP_TYPESTAR - OP_STAR;      /* Use type opcodes */
7772         mclength = 0;                         /* Not a character */
7773 
7774         if (op_previous == OP_PROP || op_previous == OP_NOTPROP)
7775           {
7776           prop_type = previous[1];
7777           prop_value = previous[2];
7778           }
7779         else
7780           {
7781           /* Come here from just above with a character in mcbuffer/mclength. */
7782           OUTPUT_SINGLE_REPEAT:
7783           prop_type = prop_value = -1;
7784           }
7785 
7786         /* At this point, if prop_type == prop_value == -1 we either have a
7787         character in mcbuffer when mclength is greater than zero, or we have
7788         mclength zero, in which case there is a non-property character type in
7789         op_previous. If prop_type/value are not negative, we have a property
7790         character type in op_previous. */
7791 
7792         oldcode = code;                   /* Save where we were */
7793         code = previous;                  /* Usually overwrite previous item */
7794 
7795         /* If the maximum is zero then the minimum must also be zero; Perl allows
7796         this case, so we do too - by simply omitting the item altogether. */
7797 
7798         if (repeat_max == 0) goto END_REPEAT;
7799 
7800         /* Combine the op_type with the repeat_type */
7801 
7802         repeat_type += op_type;
7803 
7804         /* A minimum of zero is handled either as the special case * or ?, or as
7805         an UPTO, with the maximum given. */
7806 
7807         if (repeat_min == 0)
7808           {
7809           if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;
7810             else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
7811           else
7812             {
7813             *code++ = OP_UPTO + repeat_type;
7814             PUT2INC(code, 0, repeat_max);
7815             }
7816           }
7817 
7818         /* A repeat minimum of 1 is optimized into some special cases. If the
7819         maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
7820         left in place and, if the maximum is greater than 1, we use OP_UPTO with
7821         one less than the maximum. */
7822 
7823         else if (repeat_min == 1)
7824           {
7825           if (repeat_max == REPEAT_UNLIMITED)
7826             *code++ = OP_PLUS + repeat_type;
7827           else
7828             {
7829             code = oldcode;  /* Leave previous item in place */
7830             if (repeat_max == 1) goto END_REPEAT;
7831             *code++ = OP_UPTO + repeat_type;
7832             PUT2INC(code, 0, repeat_max - 1);
7833             }
7834           }
7835 
7836         /* The case {n,n} is just an EXACT, while the general case {n,m} is
7837         handled as an EXACT followed by an UPTO or STAR or QUERY. */
7838 
7839         else
7840           {
7841           *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
7842           PUT2INC(code, 0, repeat_min);
7843 
7844           /* Unless repeat_max equals repeat_min, fill in the data for EXACT,
7845           and then generate the second opcode. For a repeated Unicode property
7846           match, there are two extra values that define the required property,
7847           and mclength is set zero to indicate this. */
7848 
7849           if (repeat_max != repeat_min)
7850             {
7851             if (mclength > 0)
7852               {
7853               memcpy(code, mcbuffer, CU2BYTES(mclength));
7854               code += mclength;
7855               }
7856             else
7857               {
7858               *code++ = op_previous;
7859               if (prop_type >= 0)
7860                 {
7861                 *code++ = prop_type;
7862                 *code++ = prop_value;
7863                 }
7864               }
7865 
7866             /* Now set up the following opcode */
7867 
7868             if (repeat_max == REPEAT_UNLIMITED)
7869               *code++ = OP_STAR + repeat_type;
7870             else
7871               {
7872               repeat_max -= repeat_min;
7873               if (repeat_max == 1)
7874                 {
7875                 *code++ = OP_QUERY + repeat_type;
7876                 }
7877               else
7878                 {
7879                 *code++ = OP_UPTO + repeat_type;
7880                 PUT2INC(code, 0, repeat_max);
7881                 }
7882               }
7883             }
7884           }
7885 
7886         /* Fill in the character or character type for the final opcode. */
7887 
7888         if (mclength > 0)
7889           {
7890           memcpy(code, mcbuffer, CU2BYTES(mclength));
7891           code += mclength;
7892           }
7893         else
7894           {
7895           *code++ = op_previous;
7896           if (prop_type >= 0)
7897             {
7898             *code++ = prop_type;
7899             *code++ = prop_value;
7900             }
7901           }
7902         }
7903       break;
7904       }  /* End of switch on different op_previous values */
7905 
7906 
7907     /* If the character following a repeat is '+', possessive_quantifier is
7908     TRUE. For some opcodes, there are special alternative opcodes for this
7909     case. For anything else, we wrap the entire repeated item inside OP_ONCE
7910     brackets. Logically, the '+' notation is just syntactic sugar, taken from
7911     Sun's Java package, but the special opcodes can optimize it.
7912 
7913     Some (but not all) possessively repeated subpatterns have already been
7914     completely handled in the code just above. For them, possessive_quantifier
7915     is always FALSE at this stage. Note that the repeated item starts at
7916     tempcode, not at previous, which might be the first part of a string whose
7917     (former) last char we repeated. */
7918 
7919     if (possessive_quantifier)
7920       {
7921       int len;
7922 
7923       /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
7924       However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
7925       {5,}, or {5,10}). We skip over an EXACT item; if the length of what
7926       remains is greater than zero, there's a further opcode that can be
7927       handled. If not, do nothing, leaving the EXACT alone. */
7928 
7929       switch(*tempcode)
7930         {
7931         case OP_TYPEEXACT:
7932         tempcode += PRIV(OP_lengths)[*tempcode] +
7933           ((tempcode[1 + IMM2_SIZE] == OP_PROP
7934           || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
7935         break;
7936 
7937         /* CHAR opcodes are used for exacts whose count is 1. */
7938 
7939         case OP_CHAR:
7940         case OP_CHARI:
7941         case OP_NOT:
7942         case OP_NOTI:
7943         case OP_EXACT:
7944         case OP_EXACTI:
7945         case OP_NOTEXACT:
7946         case OP_NOTEXACTI:
7947         tempcode += PRIV(OP_lengths)[*tempcode];
7948 #ifdef SUPPORT_UNICODE
7949         if (utf && HAS_EXTRALEN(tempcode[-1]))
7950           tempcode += GET_EXTRALEN(tempcode[-1]);
7951 #endif
7952         break;
7953 
7954         /* For the class opcodes, the repeat operator appears at the end;
7955         adjust tempcode to point to it. */
7956 
7957         case OP_CLASS:
7958         case OP_NCLASS:
7959         tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
7960         break;
7961 
7962 #ifdef SUPPORT_WIDE_CHARS
7963         case OP_XCLASS:
7964         tempcode += GET(tempcode, 1);
7965         break;
7966 #endif
7967         }
7968 
7969       /* If tempcode is equal to code (which points to the end of the repeated
7970       item), it means we have skipped an EXACT item but there is no following
7971       QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
7972       all other cases, tempcode will be pointing to the repeat opcode, and will
7973       be less than code, so the value of len will be greater than 0. */
7974 
7975       len = (int)(code - tempcode);
7976       if (len > 0)
7977         {
7978         unsigned int repcode = *tempcode;
7979 
7980         /* There is a table for possessifying opcodes, all of which are less
7981         than OP_CALLOUT. A zero entry means there is no possessified version.
7982         */
7983 
7984         if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
7985           *tempcode = opcode_possessify[repcode];
7986 
7987         /* For opcodes without a special possessified version, wrap the item in
7988         ONCE brackets. */
7989 
7990         else
7991           {
7992           (void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
7993           code += 1 + LINK_SIZE;
7994           len += 1 + LINK_SIZE;
7995           tempcode[0] = OP_ONCE;
7996           *code++ = OP_KET;
7997           PUTINC(code, 0, len);
7998           PUT(tempcode, 1, len);
7999           }
8000         }
8001       }
8002 
8003     /* We set the "follows varying string" flag for subsequently encountered
8004     reqcus if it isn't already set and we have just passed a varying length
8005     item. */
8006 
8007     END_REPEAT:
8008     cb->req_varyopt |= reqvary;
8009     break;
8010 
8011 
8012     /* ===================================================================*/
8013     /* Handle a 32-bit data character with a value greater than META_END. */
8014 
8015     case META_BIGVALUE:
8016     pptr++;
8017     goto NORMAL_CHAR;
8018 
8019 
8020     /* ===============================================================*/
8021     /* Handle a back reference by number, which is the meta argument. The
8022     pattern offsets for back references to group numbers less than 10 are held
8023     in a special vector, to avoid using more than two parsed pattern elements
8024     in 64-bit environments. We only need the offset to the first occurrence,
8025     because if that doesn't fail, subsequent ones will also be OK. */
8026 
8027     case META_BACKREF:
8028     if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg];
8029       else GETPLUSOFFSET(offset, pptr);
8030 
8031     if (meta_arg > cb->bracount)
8032       {
8033       cb->erroroffset = offset;
8034       *errorcodeptr = ERR15;  /* Non-existent subpattern */
8035       return 0;
8036       }
8037 
8038     /* Come here from named backref handling when the reference is to a
8039     single group (that is, not to a duplicated name). The back reference
8040     data will have already been updated. We must disable firstcu if not
8041     set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'
8042     later. */
8043 
8044     HANDLE_SINGLE_REFERENCE:
8045     if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
8046     *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
8047     PUT2INC(code, 0, meta_arg);
8048 
8049     /* Update the map of back references, and keep the highest one. We
8050     could do this in parse_regex() for numerical back references, but not
8051     for named back references, because we don't know the numbers to which
8052     named back references refer. So we do it all in this function. */
8053 
8054     cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;
8055     if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
8056     break;
8057 
8058 
8059     /* ===============================================================*/
8060     /* Handle recursion by inserting the number of the called group (which is
8061     the meta argument) after OP_RECURSE. At the end of compiling the pattern is
8062     scanned and these numbers are replaced by offsets within the pattern. It is
8063     done like this to avoid problems with forward references and adjusting
8064     offsets when groups are duplicated and moved (as discovered in previous
8065     implementations). Note that a recursion does not have a set first
8066     character. */
8067 
8068     case META_RECURSE:
8069     GETPLUSOFFSET(offset, pptr);
8070     if (meta_arg > cb->bracount)
8071       {
8072       cb->erroroffset = offset;
8073       *errorcodeptr = ERR15;  /* Non-existent subpattern */
8074       return 0;
8075       }
8076     HANDLE_NUMERICAL_RECURSION:
8077     *code = OP_RECURSE;
8078     PUT(code, 1, meta_arg);
8079     code += 1 + LINK_SIZE;
8080     groupsetfirstcu = FALSE;
8081     cb->had_recurse = TRUE;
8082     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
8083     zerofirstcu = firstcu;
8084     zerofirstcuflags = firstcuflags;
8085     break;
8086 
8087 
8088     /* ===============================================================*/
8089     /* Handle capturing parentheses; the number is the meta argument. */
8090 
8091     case META_CAPTURE:
8092     bravalue = OP_CBRA;
8093     skipunits = IMM2_SIZE;
8094     PUT2(code, 1+LINK_SIZE, meta_arg);
8095     cb->lastcapture = meta_arg;
8096     goto GROUP_PROCESS_NOTE_EMPTY;
8097 
8098 
8099     /* ===============================================================*/
8100     /* Handle escape sequence items. For ones like \d, the ESC_values are
8101     arranged to be the same as the corresponding OP_values in the default case
8102     when PCRE2_UCP is not set (which is the only case in which they will appear
8103     here).
8104 
8105     Note: \Q and \E are never seen here, as they were dealt with in
8106     parse_regex(). Neither are numerical back references or recursions, which
8107     were turned into META_BACKREF or META_RECURSE items, respectively. \k and
8108     \g, when followed by names, are turned into META_BACKREF_BYNAME or
8109     META_RECURSE_BYNAME. */
8110 
8111     case META_ESCAPE:
8112 
8113     /* We can test for escape sequences that consume a character because their
8114     values lie between ESC_b and ESC_Z; this may have to change if any new ones
8115     are ever created. For these sequences, we disable the setting of a first
8116     character if it hasn't already been set. */
8117 
8118     if (meta_arg > ESC_b && meta_arg < ESC_Z)
8119       {
8120       matched_char = TRUE;
8121       if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
8122       }
8123 
8124     /* Set values to reset to if this is followed by a zero repeat. */
8125 
8126     zerofirstcu = firstcu;
8127     zerofirstcuflags = firstcuflags;
8128     zeroreqcu = reqcu;
8129     zeroreqcuflags = reqcuflags;
8130 
8131     /* If Unicode is not supported, \P and \p are not allowed and are
8132     faulted at parse time, so will never appear here. */
8133 
8134 #ifdef SUPPORT_UNICODE
8135     if (meta_arg == ESC_P || meta_arg == ESC_p)
8136       {
8137       uint32_t ptype = *(++pptr) >> 16;
8138       uint32_t pdata = *pptr & 0xffff;
8139 
8140       /* The special case of \p{Any} is compiled to OP_ALLANY so as to benefit
8141       from the auto-anchoring code. */
8142 
8143       if (meta_arg == ESC_p && ptype == PT_ANY)
8144         {
8145         *code++ = OP_ALLANY;
8146         }
8147       else
8148         {
8149         *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
8150         *code++ = ptype;
8151         *code++ = pdata;
8152         }
8153       break;  /* End META_ESCAPE */
8154       }
8155 #endif
8156 
8157     /* \K is forbidden in lookarounds since 10.38 because that's what Perl has
8158     done. However, there's an option, in case anyone was relying on it. */
8159 
8160     if (cb->assert_depth > 0 && meta_arg == ESC_K &&
8161         (xoptions & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) == 0)
8162       {
8163       *errorcodeptr = ERR99;
8164       return 0;
8165       }
8166 
8167     /* For the rest (including \X when Unicode is supported - if not it's
8168     faulted at parse time), the OP value is the escape value when PCRE2_UCP is
8169     not set; if it is set, most of them do not show up here because they are
8170     converted into Unicode property tests in parse_regex().
8171 
8172     In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY
8173     instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds.
8174     There are special UCP codes for \B and \b which are used in UCP mode unless
8175     "word" matching is being forced to ASCII.
8176 
8177     Note that \b and \B do a one-character lookbehind, and \A also behaves as
8178     if it does. */
8179 
8180     switch(meta_arg)
8181       {
8182       case ESC_C:
8183       cb->external_flags |= PCRE2_HASBKC;  /* Record */
8184 #if PCRE2_CODE_UNIT_WIDTH == 32
8185       meta_arg = OP_ALLANY;
8186 #else
8187       if (!utf) meta_arg = OP_ALLANY;
8188 #endif
8189       break;
8190 
8191       case ESC_B:
8192       case ESC_b:
8193       if ((options & PCRE2_UCP) != 0 && (xoptions & PCRE2_EXTRA_ASCII_BSW) == 0)
8194         meta_arg = (meta_arg == ESC_B)? OP_NOT_UCP_WORD_BOUNDARY :
8195           OP_UCP_WORD_BOUNDARY;
8196       /* Fall through */
8197 
8198       case ESC_A:
8199       if (cb->max_lookbehind == 0) cb->max_lookbehind = 1;
8200       break;
8201       }
8202 
8203     *code++ = meta_arg;
8204     break;  /* End META_ESCAPE */
8205 
8206 
8207     /* ===================================================================*/
8208     /* Handle an unrecognized meta value. A parsed pattern value less than
8209     META_END is a literal. Otherwise we have a problem. */
8210 
8211     default:
8212     if (meta >= META_END)
8213       {
8214 #ifdef DEBUG_SHOW_PARSED
8215       fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x\n", *pptr);
8216 #endif
8217       *errorcodeptr = ERR89;  /* Internal error - unrecognized. */
8218       return 0;
8219       }
8220 
8221     /* Handle a literal character. We come here by goto in the case of a
8222     32-bit, non-UTF character whose value is greater than META_END. */
8223 
8224     NORMAL_CHAR:
8225     meta = *pptr;     /* Get the full 32 bits */
8226     NORMAL_CHAR_SET:  /* Character is already in meta */
8227     matched_char = TRUE;
8228 
8229     /* For caseless UTF or UCP mode, check whether this character has more than
8230     one other case. If so, generate a special OP_PROP item instead of OP_CHARI.
8231     When casing restrictions apply, ignore caseless sets that start with an
8232     ASCII character. */
8233 
8234 #ifdef SUPPORT_UNICODE
8235     if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
8236       {
8237       uint32_t caseset = UCD_CASESET(meta);
8238       if (caseset != 0 &&
8239            ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) == 0 ||
8240            PRIV(ucd_caseless_sets)[caseset] > 127))
8241         {
8242         *code++ = OP_PROP;
8243         *code++ = PT_CLIST;
8244         *code++ = caseset;
8245         if (firstcuflags == REQ_UNSET)
8246           firstcuflags = zerofirstcuflags = REQ_NONE;
8247         break;  /* End handling this meta item */
8248         }
8249       }
8250 #endif
8251 
8252     /* Caseful matches, or caseless and not one of the multicase characters. We
8253     come here by goto in the case of a positive class that contains only
8254     case-partners of a character with just two cases; matched_char has already
8255     been set TRUE and options fudged if necessary. */
8256 
8257     CLASS_CASELESS_CHAR:
8258 
8259     /* Get the character's code units into mcbuffer, with the length in
8260     mclength. When not in UTF mode, the length is always 1. */
8261 
8262 #ifdef SUPPORT_UNICODE
8263     if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
8264 #endif
8265       {
8266       mclength = 1;
8267       mcbuffer[0] = meta;
8268       }
8269 
8270     /* Generate the appropriate code */
8271 
8272     *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
8273     memcpy(code, mcbuffer, CU2BYTES(mclength));
8274     code += mclength;
8275 
8276     /* Remember if \r or \n were seen */
8277 
8278     if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
8279       cb->external_flags |= PCRE2_HASCRORLF;
8280 
8281     /* Set the first and required code units appropriately. If no previous
8282     first code unit, set it from this character, but revert to none on a zero
8283     repeat. Otherwise, leave the firstcu value alone, and don't change it on
8284     a zero repeat. */
8285 
8286     if (firstcuflags == REQ_UNSET)
8287       {
8288       zerofirstcuflags = REQ_NONE;
8289       zeroreqcu = reqcu;
8290       zeroreqcuflags = reqcuflags;
8291 
8292       /* If the character is more than one code unit long, we can set a single
8293       firstcu only if it is not to be matched caselessly. Multiple possible
8294       starting code units may be picked up later in the studying code. */
8295 
8296       if (mclength == 1 || req_caseopt == 0)
8297         {
8298         firstcu = mcbuffer[0];
8299         firstcuflags = req_caseopt;
8300         if (mclength != 1)
8301           {
8302           reqcu = code[-1];
8303           reqcuflags = cb->req_varyopt;
8304           }
8305         }
8306       else firstcuflags = reqcuflags = REQ_NONE;
8307       }
8308 
8309     /* firstcu was previously set; we can set reqcu only if the length is
8310     1 or the matching is caseful. */
8311 
8312     else
8313       {
8314       zerofirstcu = firstcu;
8315       zerofirstcuflags = firstcuflags;
8316       zeroreqcu = reqcu;
8317       zeroreqcuflags = reqcuflags;
8318       if (mclength == 1 || req_caseopt == 0)
8319         {
8320         reqcu = code[-1];
8321         reqcuflags = req_caseopt | cb->req_varyopt;
8322         }
8323       }
8324 
8325     /* If caselessness was temporarily instated, reset it. */
8326 
8327     if (reset_caseful)
8328       {
8329       options &= ~PCRE2_CASELESS;
8330       req_caseopt = 0;
8331       reset_caseful = FALSE;
8332       }
8333 
8334     break;    /* End literal character handling */
8335     }         /* End of big switch */
8336   }           /* End of big loop */
8337 
8338 /* Control never reaches here. */
8339 }
8340 
8341 
8342 
8343 /*************************************************
8344 *   Compile regex: a sequence of alternatives    *
8345 *************************************************/
8346 
8347 /* On entry, pptr is pointing past the bracket meta, but on return it points to
8348 the closing bracket or META_END. The code variable is pointing at the code unit
8349 into which the BRA operator has been stored. This function is used during the
8350 pre-compile phase when we are trying to find out the amount of memory needed,
8351 as well as during the real compile phase. The value of lengthptr distinguishes
8352 the two phases.
8353 
8354 Arguments:
8355   options           option bits, including any changes for this subpattern
8356   xoptions          extra option bits, ditto
8357   codeptr           -> the address of the current code pointer
8358   pptrptr           -> the address of the current parsed pattern pointer
8359   errorcodeptr      -> pointer to error code variable
8360   skipunits         skip this many code units at start (for brackets and OP_COND)
8361   firstcuptr        place to put the first required code unit
8362   firstcuflagsptr   place to put the first code unit flags
8363   reqcuptr          place to put the last required code unit
8364   reqcuflagsptr     place to put the last required code unit flags
8365   bcptr             pointer to the chain of currently open branches
8366   cb                points to the data block with tables pointers etc.
8367   lengthptr         NULL during the real compile phase
8368                     points to length accumulator during pre-compile phase
8369 
8370 Returns:            0 There has been an error
8371                    +1 Success, this group must match at least one character
8372                    -1 Success, this group may match an empty string
8373 */
8374 
8375 static int
compile_regex(uint32_t options,uint32_t xoptions,PCRE2_UCHAR ** codeptr,uint32_t ** pptrptr,int * errorcodeptr,uint32_t skipunits,uint32_t * firstcuptr,uint32_t * firstcuflagsptr,uint32_t * reqcuptr,uint32_t * reqcuflagsptr,branch_chain * bcptr,open_capitem * open_caps,compile_block * cb,PCRE2_SIZE * lengthptr)8376 compile_regex(uint32_t options, uint32_t xoptions, PCRE2_UCHAR **codeptr,
8377   uint32_t **pptrptr, int *errorcodeptr, uint32_t skipunits,
8378   uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
8379   uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
8380   compile_block *cb, PCRE2_SIZE *lengthptr)
8381 {
8382 PCRE2_UCHAR *code = *codeptr;
8383 PCRE2_UCHAR *last_branch = code;
8384 PCRE2_UCHAR *start_bracket = code;
8385 BOOL lookbehind;
8386 open_capitem capitem;
8387 int capnumber = 0;
8388 int okreturn = 1;
8389 uint32_t *pptr = *pptrptr;
8390 uint32_t firstcu, reqcu;
8391 uint32_t lookbehindlength;
8392 uint32_t lookbehindminlength;
8393 uint32_t firstcuflags, reqcuflags;
8394 uint32_t branchfirstcu, branchreqcu;
8395 uint32_t branchfirstcuflags, branchreqcuflags;
8396 PCRE2_SIZE length;
8397 branch_chain bc;
8398 
8399 /* If set, call the external function that checks for stack availability. */
8400 
8401 if (cb->cx->stack_guard != NULL &&
8402     cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
8403   {
8404   *errorcodeptr= ERR33;
8405   return 0;
8406   }
8407 
8408 /* Miscellaneous initialization */
8409 
8410 bc.outer = bcptr;
8411 bc.current_branch = code;
8412 
8413 firstcu = reqcu = 0;
8414 firstcuflags = reqcuflags = REQ_UNSET;
8415 
8416 /* Accumulate the length for use in the pre-compile phase. Start with the
8417 length of the BRA and KET and any extra code units that are required at the
8418 beginning. We accumulate in a local variable to save frequent testing of
8419 lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
8420 start and end of each alternative, because compiled items are discarded during
8421 the pre-compile phase so that the workspace is not exceeded. */
8422 
8423 length = 2 + 2*LINK_SIZE + skipunits;
8424 
8425 /* Remember if this is a lookbehind assertion, and if it is, save its length
8426 and skip over the pattern offset. */
8427 
8428 lookbehind = *code == OP_ASSERTBACK ||
8429              *code == OP_ASSERTBACK_NOT ||
8430              *code == OP_ASSERTBACK_NA;
8431 
8432 if (lookbehind)
8433   {
8434   lookbehindlength = META_DATA(pptr[-1]);
8435   lookbehindminlength = *pptr;
8436   pptr += SIZEOFFSET;
8437   }
8438 else lookbehindlength = lookbehindminlength = 0;
8439 
8440 /* If this is a capturing subpattern, add to the chain of open capturing items
8441 so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
8442 need be tested here; changing this opcode to one of its variants, e.g.
8443 OP_SCBRAPOS, happens later, after the group has been compiled. */
8444 
8445 if (*code == OP_CBRA)
8446   {
8447   capnumber = GET2(code, 1 + LINK_SIZE);
8448   capitem.number = capnumber;
8449   capitem.next = open_caps;
8450   capitem.assert_depth = cb->assert_depth;
8451   open_caps = &capitem;
8452   }
8453 
8454 /* Offset is set zero to mark that this bracket is still open */
8455 
8456 PUT(code, 1, 0);
8457 code += 1 + LINK_SIZE + skipunits;
8458 
8459 /* Loop for each alternative branch */
8460 
8461 for (;;)
8462   {
8463   int branch_return;
8464 
8465   /* Insert OP_REVERSE or OP_VREVERSE if this is a lookbehind assertion. There
8466   is only a single mimimum length for the whole assertion. When the mimimum
8467   length is LOOKBEHIND_MAX it means that all branches are of fixed length,
8468   though not necessarily the same length. In this case, the original OP_REVERSE
8469   can be used. It can also be used if a branch in a variable length lookbehind
8470   has the same maximum and minimum. Otherwise, use OP_VREVERSE, which has both
8471   maximum and minimum values. */
8472 
8473   if (lookbehind && lookbehindlength > 0)
8474     {
8475     if (lookbehindminlength == LOOKBEHIND_MAX ||
8476         lookbehindminlength == lookbehindlength)
8477       {
8478       *code++ = OP_REVERSE;
8479       PUT2INC(code, 0, lookbehindlength);
8480       length += 1 + IMM2_SIZE;
8481       }
8482     else
8483       {
8484       *code++ = OP_VREVERSE;
8485       PUT2INC(code, 0, lookbehindminlength);
8486       PUT2INC(code, 0, lookbehindlength);
8487       length += 1 + 2*IMM2_SIZE;
8488       }
8489     }
8490 
8491   /* Now compile the branch; in the pre-compile phase its length gets added
8492   into the length. */
8493 
8494   if ((branch_return =
8495         compile_branch(&options, &xoptions, &code, &pptr, errorcodeptr,
8496           &branchfirstcu, &branchfirstcuflags, &branchreqcu, &branchreqcuflags,
8497           &bc, open_caps, cb, (lengthptr == NULL)? NULL : &length)) == 0)
8498     return 0;
8499 
8500   /* If a branch can match an empty string, so can the whole group. */
8501 
8502   if (branch_return < 0) okreturn = -1;
8503 
8504   /* In the real compile phase, there is some post-processing to be done. */
8505 
8506   if (lengthptr == NULL)
8507     {
8508     /* If this is the first branch, the firstcu and reqcu values for the
8509     branch become the values for the regex. */
8510 
8511     if (*last_branch != OP_ALT)
8512       {
8513       firstcu = branchfirstcu;
8514       firstcuflags = branchfirstcuflags;
8515       reqcu = branchreqcu;
8516       reqcuflags = branchreqcuflags;
8517       }
8518 
8519     /* If this is not the first branch, the first char and reqcu have to
8520     match the values from all the previous branches, except that if the
8521     previous value for reqcu didn't have REQ_VARY set, it can still match,
8522     and we set REQ_VARY for the group from this branch's value. */
8523 
8524     else
8525       {
8526       /* If we previously had a firstcu, but it doesn't match the new branch,
8527       we have to abandon the firstcu for the regex, but if there was
8528       previously no reqcu, it takes on the value of the old firstcu. */
8529 
8530       if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
8531         {
8532         if (firstcuflags < REQ_NONE)
8533           {
8534           if (reqcuflags >= REQ_NONE)
8535             {
8536             reqcu = firstcu;
8537             reqcuflags = firstcuflags;
8538             }
8539           }
8540         firstcuflags = REQ_NONE;
8541         }
8542 
8543       /* If we (now or from before) have no firstcu, a firstcu from the
8544       branch becomes a reqcu if there isn't a branch reqcu. */
8545 
8546       if (firstcuflags >= REQ_NONE && branchfirstcuflags < REQ_NONE &&
8547           branchreqcuflags >= REQ_NONE)
8548         {
8549         branchreqcu = branchfirstcu;
8550         branchreqcuflags = branchfirstcuflags;
8551         }
8552 
8553       /* Now ensure that the reqcus match */
8554 
8555       if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
8556           reqcu != branchreqcu)
8557         reqcuflags = REQ_NONE;
8558       else
8559         {
8560         reqcu = branchreqcu;
8561         reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY if present */
8562         }
8563       }
8564     }
8565 
8566   /* Handle reaching the end of the expression, either ')' or end of pattern.
8567   In the real compile phase, go back through the alternative branches and
8568   reverse the chain of offsets, with the field in the BRA item now becoming an
8569   offset to the first alternative. If there are no alternatives, it points to
8570   the end of the group. The length in the terminating ket is always the length
8571   of the whole bracketed item. Return leaving the pointer at the terminating
8572   char. */
8573 
8574   if (META_CODE(*pptr) != META_ALT)
8575     {
8576     if (lengthptr == NULL)
8577       {
8578       PCRE2_SIZE branch_length = code - last_branch;
8579       do
8580         {
8581         PCRE2_SIZE prev_length = GET(last_branch, 1);
8582         PUT(last_branch, 1, branch_length);
8583         branch_length = prev_length;
8584         last_branch -= branch_length;
8585         }
8586       while (branch_length > 0);
8587       }
8588 
8589     /* Fill in the ket */
8590 
8591     *code = OP_KET;
8592     PUT(code, 1, (int)(code - start_bracket));
8593     code += 1 + LINK_SIZE;
8594 
8595     /* Set values to pass back */
8596 
8597     *codeptr = code;
8598     *pptrptr = pptr;
8599     *firstcuptr = firstcu;
8600     *firstcuflagsptr = firstcuflags;
8601     *reqcuptr = reqcu;
8602     *reqcuflagsptr = reqcuflags;
8603     if (lengthptr != NULL)
8604       {
8605       if (OFLOW_MAX - *lengthptr < length)
8606         {
8607         *errorcodeptr = ERR20;
8608         return 0;
8609         }
8610       *lengthptr += length;
8611       }
8612     return okreturn;
8613     }
8614 
8615   /* Another branch follows. In the pre-compile phase, we can move the code
8616   pointer back to where it was for the start of the first branch. (That is,
8617   pretend that each branch is the only one.)
8618 
8619   In the real compile phase, insert an ALT node. Its length field points back
8620   to the previous branch while the bracket remains open. At the end the chain
8621   is reversed. It's done like this so that the start of the bracket has a
8622   zero offset until it is closed, making it possible to detect recursion. */
8623 
8624   if (lengthptr != NULL)
8625     {
8626     code = *codeptr + 1 + LINK_SIZE + skipunits;
8627     length += 1 + LINK_SIZE;
8628     }
8629   else
8630     {
8631     *code = OP_ALT;
8632     PUT(code, 1, (int)(code - last_branch));
8633     bc.current_branch = last_branch = code;
8634     code += 1 + LINK_SIZE;
8635     }
8636 
8637   /* Set the maximum lookbehind length for the next branch (if not in a
8638   lookbehind the value will be zero) and then advance past the vertical bar. */
8639 
8640   lookbehindlength = META_DATA(*pptr);
8641   pptr++;
8642   }
8643 /* Control never reaches here */
8644 }
8645 
8646 
8647 
8648 /*************************************************
8649 *          Check for anchored pattern            *
8650 *************************************************/
8651 
8652 /* Try to find out if this is an anchored regular expression. Consider each
8653 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8654 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8655 it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8656 be found, because ^ generates OP_CIRCM in that mode.
8657 
8658 We can also consider a regex to be anchored if OP_SOM starts all its branches.
8659 This is the code for \G, which means "match at start of match position, taking
8660 into account the match offset".
8661 
8662 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8663 because that will try the rest of the pattern at all possible matching points,
8664 so there is no point trying again.... er ....
8665 
8666 .... except when the .* appears inside capturing parentheses, and there is a
8667 subsequent back reference to those parentheses. We haven't enough information
8668 to catch that case precisely.
8669 
8670 At first, the best we could do was to detect when .* was in capturing brackets
8671 and the highest back reference was greater than or equal to that level.
8672 However, by keeping a bitmap of the first 31 back references, we can catch some
8673 of the more common cases more precisely.
8674 
8675 ... A second exception is when the .* appears inside an atomic group, because
8676 this prevents the number of characters it matches from being adjusted.
8677 
8678 Arguments:
8679   code           points to start of the compiled pattern
8680   bracket_map    a bitmap of which brackets we are inside while testing; this
8681                    handles up to substring 31; after that we just have to take
8682                    the less precise approach
8683   cb             points to the compile data block
8684   atomcount      atomic group level
8685   inassert       TRUE if in an assertion
8686 
8687 Returns:     TRUE or FALSE
8688 */
8689 
8690 static BOOL
is_anchored(PCRE2_SPTR code,uint32_t bracket_map,compile_block * cb,int atomcount,BOOL inassert)8691 is_anchored(PCRE2_SPTR code, uint32_t bracket_map, compile_block *cb,
8692   int atomcount, BOOL inassert)
8693 {
8694 do {
8695    PCRE2_SPTR scode = first_significant_code(
8696      code + PRIV(OP_lengths)[*code], FALSE);
8697    int op = *scode;
8698 
8699    /* Non-capturing brackets */
8700 
8701    if (op == OP_BRA  || op == OP_BRAPOS ||
8702        op == OP_SBRA || op == OP_SBRAPOS)
8703      {
8704      if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8705        return FALSE;
8706      }
8707 
8708    /* Capturing brackets */
8709 
8710    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8711             op == OP_SCBRA || op == OP_SCBRAPOS)
8712      {
8713      int n = GET2(scode, 1+LINK_SIZE);
8714      uint32_t new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8715      if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE;
8716      }
8717 
8718    /* Positive forward assertion */
8719 
8720    else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8721      {
8722      if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8723      }
8724 
8725    /* Condition. If there is no second branch, it can't be anchored. */
8726 
8727    else if (op == OP_COND || op == OP_SCOND)
8728      {
8729      if (scode[GET(scode,1)] != OP_ALT) return FALSE;
8730      if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8731        return FALSE;
8732      }
8733 
8734    /* Atomic groups */
8735 
8736    else if (op == OP_ONCE)
8737      {
8738      if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert))
8739        return FALSE;
8740      }
8741 
8742    /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8743    it isn't in brackets that are or may be referenced or inside an atomic
8744    group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
8745    because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
8746    with the subject "aab", which matches "b", i.e. not at the start of a line.
8747    There is also an option that disables auto-anchoring. */
8748 
8749    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8750              op == OP_TYPEPOSSTAR))
8751      {
8752      if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
8753          atomcount > 0 || cb->had_pruneorskip || inassert ||
8754          (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8755        return FALSE;
8756      }
8757 
8758    /* Check for explicit anchoring */
8759 
8760    else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8761 
8762    code += GET(code, 1);
8763    }
8764 while (*code == OP_ALT);   /* Loop for each alternative */
8765 return TRUE;
8766 }
8767 
8768 
8769 
8770 /*************************************************
8771 *         Check for starting with ^ or .*        *
8772 *************************************************/
8773 
8774 /* This is called to find out if every branch starts with ^ or .* so that
8775 "first char" processing can be done to speed things up in multiline
8776 matching and for non-DOTALL patterns that start with .* (which must start at
8777 the beginning or after \n). As in the case of is_anchored() (see above), we
8778 have to take account of back references to capturing brackets that contain .*
8779 because in that case we can't make the assumption. Also, the appearance of .*
8780 inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
8781 or *SKIP does not count, because once again the assumption no longer holds.
8782 
8783 Arguments:
8784   code           points to start of the compiled pattern or a group
8785   bracket_map    a bitmap of which brackets we are inside while testing; this
8786                    handles up to substring 31; after that we just have to take
8787                    the less precise approach
8788   cb             points to the compile data
8789   atomcount      atomic group level
8790   inassert       TRUE if in an assertion
8791 
8792 Returns:         TRUE or FALSE
8793 */
8794 
8795 static BOOL
is_startline(PCRE2_SPTR code,unsigned int bracket_map,compile_block * cb,int atomcount,BOOL inassert)8796 is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8797   int atomcount, BOOL inassert)
8798 {
8799 do {
8800    PCRE2_SPTR scode = first_significant_code(
8801      code + PRIV(OP_lengths)[*code], FALSE);
8802    int op = *scode;
8803 
8804    /* If we are at the start of a conditional assertion group, *both* the
8805    conditional assertion *and* what follows the condition must satisfy the test
8806    for start of line. Other kinds of condition fail. Note that there may be an
8807    auto-callout at the start of a condition. */
8808 
8809    if (op == OP_COND)
8810      {
8811      scode += 1 + LINK_SIZE;
8812 
8813      if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8814        else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
8815 
8816      switch (*scode)
8817        {
8818        case OP_CREF:
8819        case OP_DNCREF:
8820        case OP_RREF:
8821        case OP_DNRREF:
8822        case OP_FAIL:
8823        case OP_FALSE:
8824        case OP_TRUE:
8825        return FALSE;
8826 
8827        default:     /* Assertion */
8828        if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8829        do scode += GET(scode, 1); while (*scode == OP_ALT);
8830        scode += 1 + LINK_SIZE;
8831        break;
8832        }
8833      scode = first_significant_code(scode, FALSE);
8834      op = *scode;
8835      }
8836 
8837    /* Non-capturing brackets */
8838 
8839    if (op == OP_BRA  || op == OP_BRAPOS ||
8840        op == OP_SBRA || op == OP_SBRAPOS)
8841      {
8842      if (!is_startline(scode, bracket_map, cb, atomcount, inassert))
8843        return FALSE;
8844      }
8845 
8846    /* Capturing brackets */
8847 
8848    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8849             op == OP_SCBRA || op == OP_SCBRAPOS)
8850      {
8851      int n = GET2(scode, 1+LINK_SIZE);
8852      unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8853      if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
8854      }
8855 
8856    /* Positive forward assertions */
8857 
8858    else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8859      {
8860      if (!is_startline(scode, bracket_map, cb, atomcount, TRUE))
8861        return FALSE;
8862      }
8863 
8864    /* Atomic brackets */
8865 
8866    else if (op == OP_ONCE)
8867      {
8868      if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert))
8869        return FALSE;
8870      }
8871 
8872    /* .* means "start at start or after \n" if it isn't in atomic brackets or
8873    brackets that may be referenced or an assertion, and as long as the pattern
8874    does not contain *PRUNE or *SKIP, because these break the feature. Consider,
8875    for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
8876    i.e. not at the start of a line. There is also an option that disables this
8877    optimization. */
8878 
8879    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8880      {
8881      if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
8882          atomcount > 0 || cb->had_pruneorskip || inassert ||
8883          (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8884        return FALSE;
8885      }
8886 
8887    /* Check for explicit circumflex; anything else gives a FALSE result. Note
8888    in particular that this includes atomic brackets OP_ONCE because the number
8889    of characters matched by .* cannot be adjusted inside them. */
8890 
8891    else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8892 
8893    /* Move on to the next alternative */
8894 
8895    code += GET(code, 1);
8896    }
8897 while (*code == OP_ALT);  /* Loop for each alternative */
8898 return TRUE;
8899 }
8900 
8901 
8902 
8903 /*************************************************
8904 *   Scan compiled regex for recursion reference  *
8905 *************************************************/
8906 
8907 /* This function scans through a compiled pattern until it finds an instance of
8908 OP_RECURSE.
8909 
8910 Arguments:
8911   code        points to start of expression
8912   utf         TRUE in UTF mode
8913 
8914 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
8915 */
8916 
8917 static PCRE2_SPTR
find_recurse(PCRE2_SPTR code,BOOL utf)8918 find_recurse(PCRE2_SPTR code, BOOL utf)
8919 {
8920 for (;;)
8921   {
8922   PCRE2_UCHAR c = *code;
8923   if (c == OP_END) return NULL;
8924   if (c == OP_RECURSE) return code;
8925 
8926   /* XCLASS is used for classes that cannot be represented just by a bit map.
8927   This includes negated single high-valued characters. CALLOUT_STR is used for
8928   callouts with string arguments. In both cases the length in the table is
8929   zero; the actual length is stored in the compiled code. */
8930 
8931   if (c == OP_XCLASS) code += GET(code, 1);
8932     else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
8933 
8934   /* Otherwise, we can get the item's length from the table, except that for
8935   repeated character types, we have to test for \p and \P, which have an extra
8936   two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,
8937   we must add in its length. */
8938 
8939   else
8940     {
8941     switch(c)
8942       {
8943       case OP_TYPESTAR:
8944       case OP_TYPEMINSTAR:
8945       case OP_TYPEPLUS:
8946       case OP_TYPEMINPLUS:
8947       case OP_TYPEQUERY:
8948       case OP_TYPEMINQUERY:
8949       case OP_TYPEPOSSTAR:
8950       case OP_TYPEPOSPLUS:
8951       case OP_TYPEPOSQUERY:
8952       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
8953       break;
8954 
8955       case OP_TYPEPOSUPTO:
8956       case OP_TYPEUPTO:
8957       case OP_TYPEMINUPTO:
8958       case OP_TYPEEXACT:
8959       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
8960         code += 2;
8961       break;
8962 
8963       case OP_MARK:
8964       case OP_COMMIT_ARG:
8965       case OP_PRUNE_ARG:
8966       case OP_SKIP_ARG:
8967       case OP_THEN_ARG:
8968       code += code[1];
8969       break;
8970       }
8971 
8972     /* Add in the fixed length from the table */
8973 
8974     code += PRIV(OP_lengths)[c];
8975 
8976     /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
8977     be followed by a multi-unit character. The length in the table is a
8978     minimum, so we have to arrange to skip the extra units. */
8979 
8980 #ifdef MAYBE_UTF_MULTI
8981     if (utf) switch(c)
8982       {
8983       case OP_CHAR:
8984       case OP_CHARI:
8985       case OP_NOT:
8986       case OP_NOTI:
8987       case OP_EXACT:
8988       case OP_EXACTI:
8989       case OP_NOTEXACT:
8990       case OP_NOTEXACTI:
8991       case OP_UPTO:
8992       case OP_UPTOI:
8993       case OP_NOTUPTO:
8994       case OP_NOTUPTOI:
8995       case OP_MINUPTO:
8996       case OP_MINUPTOI:
8997       case OP_NOTMINUPTO:
8998       case OP_NOTMINUPTOI:
8999       case OP_POSUPTO:
9000       case OP_POSUPTOI:
9001       case OP_NOTPOSUPTO:
9002       case OP_NOTPOSUPTOI:
9003       case OP_STAR:
9004       case OP_STARI:
9005       case OP_NOTSTAR:
9006       case OP_NOTSTARI:
9007       case OP_MINSTAR:
9008       case OP_MINSTARI:
9009       case OP_NOTMINSTAR:
9010       case OP_NOTMINSTARI:
9011       case OP_POSSTAR:
9012       case OP_POSSTARI:
9013       case OP_NOTPOSSTAR:
9014       case OP_NOTPOSSTARI:
9015       case OP_PLUS:
9016       case OP_PLUSI:
9017       case OP_NOTPLUS:
9018       case OP_NOTPLUSI:
9019       case OP_MINPLUS:
9020       case OP_MINPLUSI:
9021       case OP_NOTMINPLUS:
9022       case OP_NOTMINPLUSI:
9023       case OP_POSPLUS:
9024       case OP_POSPLUSI:
9025       case OP_NOTPOSPLUS:
9026       case OP_NOTPOSPLUSI:
9027       case OP_QUERY:
9028       case OP_QUERYI:
9029       case OP_NOTQUERY:
9030       case OP_NOTQUERYI:
9031       case OP_MINQUERY:
9032       case OP_MINQUERYI:
9033       case OP_NOTMINQUERY:
9034       case OP_NOTMINQUERYI:
9035       case OP_POSQUERY:
9036       case OP_POSQUERYI:
9037       case OP_NOTPOSQUERY:
9038       case OP_NOTPOSQUERYI:
9039       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
9040       break;
9041       }
9042 #else
9043     (void)(utf);  /* Keep compiler happy by referencing function argument */
9044 #endif  /* MAYBE_UTF_MULTI */
9045     }
9046   }
9047 }
9048 
9049 
9050 
9051 /*************************************************
9052 *    Check for asserted fixed first code unit    *
9053 *************************************************/
9054 
9055 /* During compilation, the "first code unit" settings from forward assertions
9056 are discarded, because they can cause conflicts with actual literals that
9057 follow. However, if we end up without a first code unit setting for an
9058 unanchored pattern, it is worth scanning the regex to see if there is an
9059 initial asserted first code unit. If all branches start with the same asserted
9060 code unit, or with a non-conditional bracket all of whose alternatives start
9061 with the same asserted code unit (recurse ad lib), then we return that code
9062 unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
9063 REQ_NONE in the flags.
9064 
9065 Arguments:
9066   code       points to start of compiled pattern
9067   flags      points to the first code unit flags
9068   inassert   non-zero if in an assertion
9069 
9070 Returns:     the fixed first code unit, or 0 with REQ_NONE in flags
9071 */
9072 
9073 static uint32_t
find_firstassertedcu(PCRE2_SPTR code,uint32_t * flags,uint32_t inassert)9074 find_firstassertedcu(PCRE2_SPTR code, uint32_t *flags, uint32_t inassert)
9075 {
9076 uint32_t c = 0;
9077 uint32_t cflags = REQ_NONE;
9078 
9079 *flags = REQ_NONE;
9080 do {
9081    uint32_t d;
9082    uint32_t dflags;
9083    int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
9084              *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
9085    PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
9086    PCRE2_UCHAR op = *scode;
9087 
9088    switch(op)
9089      {
9090      default:
9091      return 0;
9092 
9093      case OP_BRA:
9094      case OP_BRAPOS:
9095      case OP_CBRA:
9096      case OP_SCBRA:
9097      case OP_CBRAPOS:
9098      case OP_SCBRAPOS:
9099      case OP_ASSERT:
9100      case OP_ASSERT_NA:
9101      case OP_ONCE:
9102      case OP_SCRIPT_RUN:
9103      d = find_firstassertedcu(scode, &dflags, inassert +
9104        ((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0));
9105      if (dflags >= REQ_NONE) return 0;
9106      if (cflags >= REQ_NONE) { c = d; cflags = dflags; }
9107        else if (c != d || cflags != dflags) return 0;
9108      break;
9109 
9110      case OP_EXACT:
9111      scode += IMM2_SIZE;
9112      /* Fall through */
9113 
9114      case OP_CHAR:
9115      case OP_PLUS:
9116      case OP_MINPLUS:
9117      case OP_POSPLUS:
9118      if (inassert == 0) return 0;
9119      if (cflags >= REQ_NONE) { c = scode[1]; cflags = 0; }
9120        else if (c != scode[1]) return 0;
9121      break;
9122 
9123      case OP_EXACTI:
9124      scode += IMM2_SIZE;
9125      /* Fall through */
9126 
9127      case OP_CHARI:
9128      case OP_PLUSI:
9129      case OP_MINPLUSI:
9130      case OP_POSPLUSI:
9131      if (inassert == 0) return 0;
9132 
9133      /* If the character is more than one code unit long, we cannot set its
9134      first code unit when matching caselessly. Later scanning may pick up
9135      multiple code units. */
9136 
9137 #ifdef SUPPORT_UNICODE
9138 #if PCRE2_CODE_UNIT_WIDTH == 8
9139      if (scode[1] >= 0x80) return 0;
9140 #elif PCRE2_CODE_UNIT_WIDTH == 16
9141      if (scode[1] >= 0xd800 && scode[1] <= 0xdfff) return 0;
9142 #endif
9143 #endif
9144 
9145      if (cflags >= REQ_NONE) { c = scode[1]; cflags = REQ_CASELESS; }
9146        else if (c != scode[1]) return 0;
9147      break;
9148      }
9149 
9150    code += GET(code, 1);
9151    }
9152 while (*code == OP_ALT);
9153 
9154 *flags = cflags;
9155 return c;
9156 }
9157 
9158 
9159 
9160 /*************************************************
9161 *     Add an entry to the name/number table      *
9162 *************************************************/
9163 
9164 /* This function is called between compiling passes to add an entry to the
9165 name/number table, maintaining alphabetical order. Checking for permitted
9166 and forbidden duplicates has already been done.
9167 
9168 Arguments:
9169   cb           the compile data block
9170   name         the name to add
9171   length       the length of the name
9172   groupno      the group number
9173   tablecount   the count of names in the table so far
9174 
9175 Returns:       nothing
9176 */
9177 
9178 static void
add_name_to_table(compile_block * cb,PCRE2_SPTR name,int length,unsigned int groupno,uint32_t tablecount)9179 add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length,
9180   unsigned int groupno, uint32_t tablecount)
9181 {
9182 uint32_t i;
9183 PCRE2_UCHAR *slot = cb->name_table;
9184 
9185 for (i = 0; i < tablecount; i++)
9186   {
9187   int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length));
9188   if (crc == 0 && slot[IMM2_SIZE+length] != 0)
9189     crc = -1; /* Current name is a substring */
9190 
9191   /* Make space in the table and break the loop for an earlier name. For a
9192   duplicate or later name, carry on. We do this for duplicates so that in the
9193   simple case (when ?(| is not used) they are in order of their numbers. In all
9194   cases they are in the order in which they appear in the pattern. */
9195 
9196   if (crc < 0)
9197     {
9198     (void)memmove(slot + cb->name_entry_size, slot,
9199       CU2BYTES((tablecount - i) * cb->name_entry_size));
9200     break;
9201     }
9202 
9203   /* Continue the loop for a later or duplicate name */
9204 
9205   slot += cb->name_entry_size;
9206   }
9207 
9208 PUT2(slot, 0, groupno);
9209 memcpy(slot + IMM2_SIZE, name, CU2BYTES(length));
9210 
9211 /* Add a terminating zero and fill the rest of the slot with zeroes so that
9212 the memory is all initialized. Otherwise valgrind moans about uninitialized
9213 memory when saving serialized compiled patterns. */
9214 
9215 memset(slot + IMM2_SIZE + length, 0,
9216   CU2BYTES(cb->name_entry_size - length - IMM2_SIZE));
9217 }
9218 
9219 
9220 
9221 /*************************************************
9222 *             Skip in parsed pattern             *
9223 *************************************************/
9224 
9225 /* This function is called to skip parts of the parsed pattern when finding the
9226 length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
9227 the end of the branch, it is called to skip over an internal lookaround or
9228 (DEFINE) group, and it is also called to skip to the end of a class, during
9229 which it will never encounter nested groups (but there's no need to have
9230 special code for that).
9231 
9232 When called to find the end of a branch or group, pptr must point to the first
9233 meta code inside the branch, not the branch-starting code. In other cases it
9234 can point to the item that causes the function to be called.
9235 
9236 Arguments:
9237   pptr       current pointer to skip from
9238   skiptype   PSKIP_CLASS when skipping to end of class
9239              PSKIP_ALT when META_ALT ends the skip
9240              PSKIP_KET when only META_KET ends the skip
9241 
9242 Returns:     new value of pptr
9243              NULL if META_END is reached - should never occur
9244                or for an unknown meta value - likewise
9245 */
9246 
9247 static uint32_t *
parsed_skip(uint32_t * pptr,uint32_t skiptype)9248 parsed_skip(uint32_t *pptr, uint32_t skiptype)
9249 {
9250 uint32_t nestlevel = 0;
9251 
9252 for (;; pptr++)
9253   {
9254   uint32_t meta = META_CODE(*pptr);
9255 
9256   switch(meta)
9257     {
9258     default:  /* Just skip over most items */
9259     if (meta < META_END) continue;  /* Literal */
9260     break;
9261 
9262     /* This should never occur. */
9263 
9264     case META_END:
9265     return NULL;
9266 
9267     /* The data for these items is variable in length. */
9268 
9269     case META_BACKREF:  /* Offset is present only if group >= 10 */
9270     if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;
9271     break;
9272 
9273     case META_ESCAPE:   /* A few escapes are followed by data items. */
9274     switch (META_DATA(*pptr))
9275       {
9276       case ESC_P:
9277       case ESC_p:
9278       pptr += 1;
9279       break;
9280 
9281       case ESC_g:
9282       case ESC_k:
9283       pptr += 1 + SIZEOFFSET;
9284       break;
9285       }
9286     break;
9287 
9288     case META_MARK:     /* Add the length of the name. */
9289     case META_COMMIT_ARG:
9290     case META_PRUNE_ARG:
9291     case META_SKIP_ARG:
9292     case META_THEN_ARG:
9293     pptr += pptr[1];
9294     break;
9295 
9296     /* These are the "active" items in this loop. */
9297 
9298     case META_CLASS_END:
9299     if (skiptype == PSKIP_CLASS) return pptr;
9300     break;
9301 
9302     case META_ATOMIC:
9303     case META_CAPTURE:
9304     case META_COND_ASSERT:
9305     case META_COND_DEFINE:
9306     case META_COND_NAME:
9307     case META_COND_NUMBER:
9308     case META_COND_RNAME:
9309     case META_COND_RNUMBER:
9310     case META_COND_VERSION:
9311     case META_LOOKAHEAD:
9312     case META_LOOKAHEADNOT:
9313     case META_LOOKAHEAD_NA:
9314     case META_LOOKBEHIND:
9315     case META_LOOKBEHINDNOT:
9316     case META_LOOKBEHIND_NA:
9317     case META_NOCAPTURE:
9318     case META_SCRIPT_RUN:
9319     nestlevel++;
9320     break;
9321 
9322     case META_ALT:
9323     if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;
9324     break;
9325 
9326     case META_KET:
9327     if (nestlevel == 0) return pptr;
9328     nestlevel--;
9329     break;
9330     }
9331 
9332   /* The extra data item length for each meta is in a table. */
9333 
9334   meta = (meta >> 16) & 0x7fff;
9335   if (meta >= sizeof(meta_extra_lengths)) return NULL;
9336   pptr += meta_extra_lengths[meta];
9337   }
9338 /* Control never reaches here */
9339 return pptr;
9340 }
9341 
9342 
9343 
9344 /*************************************************
9345 *       Find length of a parsed group            *
9346 *************************************************/
9347 
9348 /* This is called for nested groups within a branch of a lookbehind whose
9349 length is being computed. On entry, the pointer must be at the first element
9350 after the group initializing code. On exit it points to OP_KET. Caching is used
9351 to improve processing speed when the same capturing group occurs many times.
9352 
9353 Arguments:
9354   pptrptr     pointer to pointer in the parsed pattern
9355   minptr      where to return the minimum length
9356   isinline    FALSE if a reference or recursion; TRUE for inline group
9357   errcodeptr  pointer to the errorcode
9358   lcptr       pointer to the loop counter
9359   group       number of captured group or -1 for a non-capturing group
9360   recurses    chain of recurse_check to catch mutual recursion
9361   cb          pointer to the compile data
9362 
9363 Returns:      the maximum group length or a negative number
9364 */
9365 
9366 static int
get_grouplength(uint32_t ** pptrptr,int * minptr,BOOL isinline,int * errcodeptr,int * lcptr,int group,parsed_recurse_check * recurses,compile_block * cb)9367 get_grouplength(uint32_t **pptrptr, int *minptr, BOOL isinline, int *errcodeptr,
9368   int *lcptr, int group, parsed_recurse_check *recurses, compile_block *cb)
9369 {
9370 uint32_t *gi = cb->groupinfo + 2 * group;
9371 int branchlength, branchminlength;
9372 int grouplength = -1;
9373 int groupminlength = INT_MAX;
9374 
9375 /* The cache can be used only if there is no possibility of there being two
9376 groups with the same number. We do not need to set the end pointer for a group
9377 that is being processed as a back reference or recursion, but we must do so for
9378 an inline group. */
9379 
9380 if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
9381   {
9382   uint32_t groupinfo = gi[0];
9383   if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1;
9384   if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
9385     {
9386     if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
9387     *minptr = gi[1];
9388     return groupinfo & GI_FIXED_LENGTH_MASK;
9389     }
9390   }
9391 
9392 /* Scan the group. In this case we find the end pointer of necessity. */
9393 
9394 for(;;)
9395   {
9396   branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
9397     recurses, cb);
9398   if (branchlength < 0) goto ISNOTFIXED;
9399   if (branchlength > grouplength) grouplength = branchlength;
9400   if (branchminlength < groupminlength) groupminlength = branchminlength;
9401   if (**pptrptr == META_KET) break;
9402   *pptrptr += 1;   /* Skip META_ALT */
9403   }
9404 
9405 if (group > 0)
9406   {
9407   gi[0] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
9408   gi[1] = groupminlength;
9409   }
9410 
9411 *minptr = groupminlength;
9412 return grouplength;
9413 
9414 ISNOTFIXED:
9415 if (group > 0) gi[0] |= GI_NOT_FIXED_LENGTH;
9416 return -1;
9417 }
9418 
9419 
9420 
9421 /*************************************************
9422 *        Find length of a parsed branch          *
9423 *************************************************/
9424 
9425 /* Return fixed maximum and minimum lengths for a branch in a lookbehind,
9426 giving an error if the length is not limited. On entry, *pptrptr points to the
9427 first element inside the branch. On exit it is set to point to the ALT or KET.
9428 
9429 Arguments:
9430   pptrptr     pointer to pointer in the parsed pattern
9431   minptr      where to return the minimum length
9432   errcodeptr  pointer to error code
9433   lcptr       pointer to loop counter
9434   recurses    chain of recurse_check to catch mutual recursion
9435   cb          pointer to compile block
9436 
9437 Returns:      the maximum length, or a negative value on error
9438 */
9439 
9440 static int
get_branchlength(uint32_t ** pptrptr,int * minptr,int * errcodeptr,int * lcptr,parsed_recurse_check * recurses,compile_block * cb)9441 get_branchlength(uint32_t **pptrptr, int *minptr, int *errcodeptr, int *lcptr,
9442   parsed_recurse_check *recurses, compile_block *cb)
9443 {
9444 int branchlength = 0;
9445 int branchminlength = 0;
9446 int grouplength, groupminlength;
9447 uint32_t lastitemlength = 0;
9448 uint32_t lastitemminlength = 0;
9449 uint32_t *pptr = *pptrptr;
9450 PCRE2_SIZE offset;
9451 parsed_recurse_check this_recurse;
9452 
9453 /* A large and/or complex regex can take too long to process. This can happen
9454 more often when (?| groups are present in the pattern because their length
9455 cannot be cached. */
9456 
9457 if ((*lcptr)++ > 2000)
9458   {
9459   *errcodeptr = ERR35;  /* Lookbehind is too complicated */
9460   return -1;
9461   }
9462 
9463 /* Scan the branch, accumulating the length. */
9464 
9465 for (;; pptr++)
9466   {
9467   parsed_recurse_check *r;
9468   uint32_t *gptr, *gptrend;
9469   uint32_t escape;
9470   uint32_t group = 0;
9471   uint32_t itemlength = 0;
9472   uint32_t itemminlength = 0;
9473   uint32_t min, max;
9474 
9475   if (*pptr < META_END)
9476     {
9477     itemlength = itemminlength = 1;
9478     }
9479 
9480   else switch (META_CODE(*pptr))
9481     {
9482     case META_KET:
9483     case META_ALT:
9484     goto EXIT;
9485 
9486     /* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the
9487     actual termination. */
9488 
9489     case META_ACCEPT:
9490     case META_FAIL:
9491     pptr = parsed_skip(pptr, PSKIP_ALT);
9492     if (pptr == NULL) goto PARSED_SKIP_FAILED;
9493     goto EXIT;
9494 
9495     case META_MARK:
9496     case META_COMMIT_ARG:
9497     case META_PRUNE_ARG:
9498     case META_SKIP_ARG:
9499     case META_THEN_ARG:
9500     pptr += pptr[1] + 1;
9501     break;
9502 
9503     case META_CIRCUMFLEX:
9504     case META_COMMIT:
9505     case META_DOLLAR:
9506     case META_PRUNE:
9507     case META_SKIP:
9508     case META_THEN:
9509     break;
9510 
9511     case META_OPTIONS:
9512     pptr += 2;
9513     break;
9514 
9515     case META_BIGVALUE:
9516     itemlength = itemminlength = 1;
9517     pptr += 1;
9518     break;
9519 
9520     case META_CLASS:
9521     case META_CLASS_NOT:
9522     itemlength = itemminlength = 1;
9523     pptr = parsed_skip(pptr, PSKIP_CLASS);
9524     if (pptr == NULL) goto PARSED_SKIP_FAILED;
9525     break;
9526 
9527     case META_CLASS_EMPTY_NOT:
9528     case META_DOT:
9529     itemlength = itemminlength = 1;
9530     break;
9531 
9532     case META_CALLOUT_NUMBER:
9533     pptr += 3;
9534     break;
9535 
9536     case META_CALLOUT_STRING:
9537     pptr += 3 + SIZEOFFSET;
9538     break;
9539 
9540     /* Only some escapes consume a character. Of those, \R can match one or two
9541     characters, but \X is never allowed because it matches an unknown number of
9542     characters. \C is allowed only in 32-bit and non-UTF 8/16-bit modes. */
9543 
9544     case META_ESCAPE:
9545     escape = META_DATA(*pptr);
9546     if (escape == ESC_X) return -1;
9547     if (escape == ESC_R)
9548       {
9549       itemminlength = 1;
9550       itemlength = 2;
9551       }
9552     else if (escape > ESC_b && escape < ESC_Z)
9553       {
9554 #if PCRE2_CODE_UNIT_WIDTH != 32
9555       if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C)
9556         {
9557         *errcodeptr = ERR36;
9558         return -1;
9559         }
9560 #endif
9561       itemlength = itemminlength = 1;
9562       if (escape == ESC_p || escape == ESC_P) pptr++;  /* Skip prop data */
9563       }
9564     break;
9565 
9566     /* Lookaheads do not contribute to the length of this branch, but they may
9567     contain lookbehinds within them whose lengths need to be set. */
9568 
9569     case META_LOOKAHEAD:
9570     case META_LOOKAHEADNOT:
9571     case META_LOOKAHEAD_NA:
9572     *errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb, lcptr);
9573     if (*errcodeptr != 0) return -1;
9574 
9575     /* Ignore any qualifiers that follow a lookahead assertion. */
9576 
9577     switch (pptr[1])
9578       {
9579       case META_ASTERISK:
9580       case META_ASTERISK_PLUS:
9581       case META_ASTERISK_QUERY:
9582       case META_PLUS:
9583       case META_PLUS_PLUS:
9584       case META_PLUS_QUERY:
9585       case META_QUERY:
9586       case META_QUERY_PLUS:
9587       case META_QUERY_QUERY:
9588       pptr++;
9589       break;
9590 
9591       case META_MINMAX:
9592       case META_MINMAX_PLUS:
9593       case META_MINMAX_QUERY:
9594       pptr += 3;
9595       break;
9596 
9597       default:
9598       break;
9599       }
9600     break;
9601 
9602     /* A nested lookbehind does not contribute any length to this lookbehind,
9603     but must itself be checked and have its lengths set. */
9604 
9605     case META_LOOKBEHIND:
9606     case META_LOOKBEHINDNOT:
9607     case META_LOOKBEHIND_NA:
9608     if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))
9609       return -1;
9610     break;
9611 
9612     /* Back references and recursions are handled by very similar code. At this
9613     stage, the names generated in the parsing pass are available, but the main
9614     name table has not yet been created. So for the named varieties, scan the
9615     list of names in order to get the number of the first one in the pattern,
9616     and whether or not this name is duplicated. */
9617 
9618     case META_BACKREF_BYNAME:
9619     if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)
9620       goto ISNOTFIXED;
9621     /* Fall through */
9622 
9623     case META_RECURSE_BYNAME:
9624       {
9625       int i;
9626       PCRE2_SPTR name;
9627       BOOL is_dupname = FALSE;
9628       named_group *ng = cb->named_groups;
9629       uint32_t meta_code = META_CODE(*pptr);
9630       uint32_t length = *(++pptr);
9631 
9632       GETPLUSOFFSET(offset, pptr);
9633       name = cb->start_pattern + offset;
9634       for (i = 0; i < cb->names_found; i++, ng++)
9635         {
9636         if (length == ng->length && PRIV(strncmp)(name, ng->name, length) == 0)
9637           {
9638           group = ng->number;
9639           is_dupname = ng->isdup;
9640           break;
9641           }
9642         }
9643 
9644       if (group == 0)
9645         {
9646         *errcodeptr = ERR15;  /* Non-existent subpattern */
9647         cb->erroroffset = offset;
9648         return -1;
9649         }
9650 
9651       /* A numerical back reference can be fixed length if duplicate capturing
9652       groups are not being used. A non-duplicate named back reference can also
9653       be handled. */
9654 
9655       if (meta_code == META_RECURSE_BYNAME ||
9656           (!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0))
9657         goto RECURSE_OR_BACKREF_LENGTH;  /* Handle as a numbered version. */
9658       }
9659     goto ISNOTFIXED;                     /* Duplicate name or number */
9660 
9661     /* The offset values for back references < 10 are in a separate vector
9662     because otherwise they would use more than two parsed pattern elements on
9663     64-bit systems. */
9664 
9665     case META_BACKREF:
9666     if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 ||
9667         (cb->external_flags & PCRE2_DUPCAPUSED) != 0)
9668       goto ISNOTFIXED;
9669     group = META_DATA(*pptr);
9670     if (group < 10)
9671       {
9672       offset = cb->small_ref_offset[group];
9673       goto RECURSE_OR_BACKREF_LENGTH;
9674       }
9675 
9676     /* Fall through */
9677     /* For groups >= 10 - picking up group twice does no harm. */
9678 
9679     /* A true recursion implies not fixed length, but a subroutine call may
9680     be OK. Back reference "recursions" are also failed. */
9681 
9682     case META_RECURSE:
9683     group = META_DATA(*pptr);
9684     GETPLUSOFFSET(offset, pptr);
9685 
9686     RECURSE_OR_BACKREF_LENGTH:
9687     if (group > cb->bracount)
9688       {
9689       cb->erroroffset = offset;
9690       *errcodeptr = ERR15;  /* Non-existent subpattern */
9691       return -1;
9692       }
9693     if (group == 0) goto ISNOTFIXED;  /* Local recursion */
9694     for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)
9695       {
9696       if (META_CODE(*gptr) == META_BIGVALUE) gptr++;
9697         else if (*gptr == (META_CAPTURE | group)) break;
9698       }
9699 
9700     /* We must start the search for the end of the group at the first meta code
9701     inside the group. Otherwise it will be treated as an enclosed group. */
9702 
9703     gptrend = parsed_skip(gptr + 1, PSKIP_KET);
9704     if (gptrend == NULL) goto PARSED_SKIP_FAILED;
9705     if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED;  /* Local recursion */
9706     for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;
9707     if (r != NULL) goto ISNOTFIXED;   /* Mutual recursion */
9708     this_recurse.prev = recurses;
9709     this_recurse.groupptr = gptr;
9710 
9711     /* We do not need to know the position of the end of the group, that is,
9712     gptr is not used after the call to get_grouplength(). Setting the second
9713     argument FALSE stops it scanning for the end when the length can be found
9714     in the cache. */
9715 
9716     gptr++;
9717     grouplength = get_grouplength(&gptr, &groupminlength, FALSE, errcodeptr,
9718       lcptr, group, &this_recurse, cb);
9719     if (grouplength < 0)
9720       {
9721       if (*errcodeptr == 0) goto ISNOTFIXED;
9722       return -1;  /* Error already set */
9723       }
9724     itemlength = grouplength;
9725     itemminlength = groupminlength;
9726     break;
9727 
9728     /* A (DEFINE) group is never obeyed inline and so it does not contribute to
9729     the length of this branch. Skip from the following item to the next
9730     unpaired ket. */
9731 
9732     case META_COND_DEFINE:
9733     pptr = parsed_skip(pptr + 1, PSKIP_KET);
9734     break;
9735 
9736     /* Check other nested groups - advance past the initial data for each type
9737     and then seek a fixed length with get_grouplength(). */
9738 
9739     case META_COND_NAME:
9740     case META_COND_NUMBER:
9741     case META_COND_RNAME:
9742     case META_COND_RNUMBER:
9743     pptr += 2 + SIZEOFFSET;
9744     goto CHECK_GROUP;
9745 
9746     case META_COND_ASSERT:
9747     pptr += 1;
9748     goto CHECK_GROUP;
9749 
9750     case META_COND_VERSION:
9751     pptr += 4;
9752     goto CHECK_GROUP;
9753 
9754     case META_CAPTURE:
9755     group = META_DATA(*pptr);
9756     /* Fall through */
9757 
9758     case META_ATOMIC:
9759     case META_NOCAPTURE:
9760     case META_SCRIPT_RUN:
9761     pptr++;
9762     CHECK_GROUP:
9763     grouplength = get_grouplength(&pptr, &groupminlength, TRUE, errcodeptr,
9764       lcptr, group, recurses, cb);
9765     if (grouplength < 0) return -1;
9766     itemlength = grouplength;
9767     itemminlength = groupminlength;
9768     break;
9769 
9770     case META_QUERY:
9771     case META_QUERY_PLUS:
9772     case META_QUERY_QUERY:
9773     min = 0;
9774     max = 1;
9775     goto REPETITION;
9776 
9777     /* Exact repetition is OK; variable repetition is not. A repetition of zero
9778     must subtract the length that has already been added. */
9779 
9780     case META_MINMAX:
9781     case META_MINMAX_PLUS:
9782     case META_MINMAX_QUERY:
9783     min = pptr[1];
9784     max = pptr[2];
9785     pptr += 2;
9786 
9787     REPETITION:
9788     if (max != REPEAT_UNLIMITED)
9789       {
9790       if (lastitemlength != 0 &&  /* Should not occur, but just in case */
9791           max != 0 &&
9792           (INT_MAX - branchlength)/lastitemlength < max - 1)
9793         {
9794         *errcodeptr = ERR87;  /* Integer overflow; lookbehind too big */
9795         return -1;
9796         }
9797       if (min == 0) branchminlength -= lastitemminlength;
9798         else itemminlength = (min - 1) * lastitemminlength;
9799       if (max == 0) branchlength -= lastitemlength;
9800         else itemlength = (max - 1) * lastitemlength;
9801       break;
9802       }
9803     /* Fall through */
9804 
9805     /* Any other item means this branch does not have a fixed length. */
9806 
9807     default:
9808     ISNOTFIXED:
9809     *errcodeptr = ERR25;   /* Not fixed length */
9810     return -1;
9811     }
9812 
9813   /* Add the item length to the branchlength, checking for integer overflow and
9814   for the branch length exceeding the overall limit. Later, if there is at
9815   least one variable-length branch in the group, there is a test for the
9816   (smaller) variable-length branch length limit. */
9817 
9818   if (INT_MAX - branchlength < (int)itemlength ||
9819       (branchlength += itemlength) > LOOKBEHIND_MAX)
9820     {
9821     *errcodeptr = ERR87;
9822     return -1;
9823     }
9824 
9825   branchminlength += itemminlength;
9826 
9827   /* Save this item length for use if the next item is a quantifier. */
9828 
9829   lastitemlength = itemlength;
9830   lastitemminlength = itemminlength;
9831   }
9832 
9833 EXIT:
9834 *pptrptr = pptr;
9835 *minptr = branchminlength;
9836 return branchlength;
9837 
9838 PARSED_SKIP_FAILED:
9839 *errcodeptr = ERR90;
9840 return -1;
9841 }
9842 
9843 
9844 
9845 /*************************************************
9846 *        Set lengths in a lookbehind             *
9847 *************************************************/
9848 
9849 /* This function is called for each lookbehind, to set the lengths in its
9850 branches. An error occurs if any branch does not have a limited maximum length
9851 that is less than the limit (65535). On exit, the pointer must be left on the
9852 final ket.
9853 
9854 The function also maintains the max_lookbehind value. Any lookbehind branch
9855 that contains a nested lookbehind may actually look further back than the
9856 length of the branch. The additional amount is passed back from
9857 get_branchlength() as an "extra" value.
9858 
9859 Arguments:
9860   pptrptr     pointer to pointer in the parsed pattern
9861   errcodeptr  pointer to error code
9862   lcptr       pointer to loop counter
9863   recurses    chain of recurse_check to catch mutual recursion
9864   cb          pointer to compile block
9865 
9866 Returns:      TRUE if all is well
9867               FALSE otherwise, with error code and offset set
9868 */
9869 
9870 static BOOL
set_lookbehind_lengths(uint32_t ** pptrptr,int * errcodeptr,int * lcptr,parsed_recurse_check * recurses,compile_block * cb)9871 set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
9872   parsed_recurse_check *recurses, compile_block *cb)
9873 {
9874 PCRE2_SIZE offset;
9875 uint32_t *bptr = *pptrptr;
9876 uint32_t *gbptr = bptr;
9877 int maxlength = 0;
9878 int minlength = INT_MAX;
9879 BOOL variable = FALSE;
9880 
9881 READPLUSOFFSET(offset, bptr);  /* Offset for error messages */
9882 *pptrptr += SIZEOFFSET;
9883 
9884 /* Each branch can have a different maximum length, but we can keep only a
9885 single minimum for the whole group, because there's nowhere to save individual
9886 values in the META_ALT item. */
9887 
9888 do
9889   {
9890   int branchlength, branchminlength;
9891 
9892   *pptrptr += 1;
9893   branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
9894     recurses, cb);
9895 
9896   if (branchlength < 0)
9897     {
9898     /* The errorcode and offset may already be set from a nested lookbehind. */
9899     if (*errcodeptr == 0) *errcodeptr = ERR25;
9900     if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
9901     return FALSE;
9902     }
9903 
9904   if (branchlength != branchminlength) variable = TRUE;
9905   if (branchminlength < minlength) minlength = branchminlength;
9906   if (branchlength > maxlength) maxlength = branchlength;
9907   if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;
9908   *bptr |= branchlength;  /* branchlength never more than 65535 */
9909   bptr = *pptrptr;
9910   }
9911 while (*bptr == META_ALT);
9912 
9913 /* If any branch is of variable length, the whole lookbehind is of variable
9914 length. If the maximum length of any branch exceeds the maximum for variable
9915 lookbehinds, give an error. Otherwise, the minimum length is set in the word
9916 that follows the original group META value. For a fixed-length lookbehind, this
9917 is set to LOOKBEHIND_MAX, to indicate that each branch is of a fixed (but
9918 possibly different) length. */
9919 
9920 if (variable)
9921   {
9922   gbptr[1] = minlength;
9923   if ((uint32_t)maxlength > cb->max_varlookbehind)
9924     {
9925     *errcodeptr = ERR100;
9926     cb->erroroffset = offset;
9927     return FALSE;
9928     }
9929   }
9930 else gbptr[1] = LOOKBEHIND_MAX;
9931 
9932 
9933 gbptr[1] = variable? minlength : LOOKBEHIND_MAX;
9934 return TRUE;
9935 }
9936 
9937 
9938 
9939 /*************************************************
9940 *         Check parsed pattern lookbehinds       *
9941 *************************************************/
9942 
9943 /* This function is called at the end of parsing a pattern if any lookbehinds
9944 were encountered. It scans the parsed pattern for them, calling
9945 set_lookbehind_lengths() for each one. At the start, the errorcode is zero and
9946 the error offset is marked unset. The enables the functions above not to
9947 override settings from deeper nestings.
9948 
9949 This function is called recursively from get_branchlength() for lookaheads in
9950 order to process any lookbehinds that they may contain. It stops when it hits a
9951 non-nested closing parenthesis in this case, returning a pointer to it.
9952 
9953 Arguments
9954   pptr      points to where to start (start of pattern or start of lookahead)
9955   retptr    if not NULL, return the ket pointer here
9956   recurses  chain of recurse_check to catch mutual recursion
9957   cb        points to the compile block
9958   lcptr     points to loop counter
9959 
9960 Returns:    0 on success, or an errorcode (cb->erroroffset will be set)
9961 */
9962 
9963 static int
check_lookbehinds(uint32_t * pptr,uint32_t ** retptr,parsed_recurse_check * recurses,compile_block * cb,int * lcptr)9964 check_lookbehinds(uint32_t *pptr, uint32_t **retptr,
9965   parsed_recurse_check *recurses, compile_block *cb, int *lcptr)
9966 {
9967 int errorcode = 0;
9968 int nestlevel = 0;
9969 
9970 cb->erroroffset = PCRE2_UNSET;
9971 
9972 for (; *pptr != META_END; pptr++)
9973   {
9974   if (*pptr < META_END) continue;  /* Literal */
9975 
9976   switch (META_CODE(*pptr))
9977     {
9978     default:
9979     return ERR70;  /* Unrecognized meta code */
9980 
9981     case META_ESCAPE:
9982     if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
9983       pptr += 1;
9984     break;
9985 
9986     case META_KET:
9987     if (--nestlevel < 0)
9988       {
9989       if (retptr != NULL) *retptr = pptr;
9990       return 0;
9991       }
9992     break;
9993 
9994     case META_ATOMIC:
9995     case META_CAPTURE:
9996     case META_COND_ASSERT:
9997     case META_LOOKAHEAD:
9998     case META_LOOKAHEADNOT:
9999     case META_LOOKAHEAD_NA:
10000     case META_NOCAPTURE:
10001     case META_SCRIPT_RUN:
10002     nestlevel++;
10003     break;
10004 
10005     case META_ACCEPT:
10006     case META_ALT:
10007     case META_ASTERISK:
10008     case META_ASTERISK_PLUS:
10009     case META_ASTERISK_QUERY:
10010     case META_BACKREF:
10011     case META_CIRCUMFLEX:
10012     case META_CLASS:
10013     case META_CLASS_EMPTY:
10014     case META_CLASS_EMPTY_NOT:
10015     case META_CLASS_END:
10016     case META_CLASS_NOT:
10017     case META_COMMIT:
10018     case META_DOLLAR:
10019     case META_DOT:
10020     case META_FAIL:
10021     case META_PLUS:
10022     case META_PLUS_PLUS:
10023     case META_PLUS_QUERY:
10024     case META_PRUNE:
10025     case META_QUERY:
10026     case META_QUERY_PLUS:
10027     case META_QUERY_QUERY:
10028     case META_RANGE_ESCAPED:
10029     case META_RANGE_LITERAL:
10030     case META_SKIP:
10031     case META_THEN:
10032     break;
10033 
10034     case META_RECURSE:
10035     pptr += SIZEOFFSET;
10036     break;
10037 
10038     case META_BACKREF_BYNAME:
10039     case META_RECURSE_BYNAME:
10040     pptr += 1 + SIZEOFFSET;
10041     break;
10042 
10043     case META_COND_DEFINE:
10044     pptr += SIZEOFFSET;
10045     nestlevel++;
10046     break;
10047 
10048     case META_COND_NAME:
10049     case META_COND_NUMBER:
10050     case META_COND_RNAME:
10051     case META_COND_RNUMBER:
10052     pptr += 1 + SIZEOFFSET;
10053     nestlevel++;
10054     break;
10055 
10056     case META_COND_VERSION:
10057     pptr += 3;
10058     nestlevel++;
10059     break;
10060 
10061     case META_CALLOUT_STRING:
10062     pptr += 3 + SIZEOFFSET;
10063     break;
10064 
10065     case META_BIGVALUE:
10066     case META_POSIX:
10067     case META_POSIX_NEG:
10068     pptr += 1;
10069     break;
10070 
10071     case META_MINMAX:
10072     case META_MINMAX_QUERY:
10073     case META_MINMAX_PLUS:
10074     case META_OPTIONS:
10075     pptr += 2;
10076     break;
10077 
10078     case META_CALLOUT_NUMBER:
10079     pptr += 3;
10080     break;
10081 
10082     case META_MARK:
10083     case META_COMMIT_ARG:
10084     case META_PRUNE_ARG:
10085     case META_SKIP_ARG:
10086     case META_THEN_ARG:
10087     pptr += 1 + pptr[1];
10088     break;
10089 
10090     case META_LOOKBEHIND:
10091     case META_LOOKBEHINDNOT:
10092     case META_LOOKBEHIND_NA:
10093     if (!set_lookbehind_lengths(&pptr, &errorcode, lcptr, recurses, cb))
10094       return errorcode;
10095     break;
10096     }
10097   }
10098 
10099 return 0;
10100 }
10101 
10102 
10103 
10104 /*************************************************
10105 *     External function to compile a pattern     *
10106 *************************************************/
10107 
10108 /* This function reads a regular expression in the form of a string and returns
10109 a pointer to a block of store holding a compiled version of the expression.
10110 
10111 Arguments:
10112   pattern       the regular expression
10113   patlen        the length of the pattern, or PCRE2_ZERO_TERMINATED
10114   options       option bits
10115   errorptr      pointer to errorcode
10116   erroroffset   pointer to error offset
10117   ccontext      points to a compile context or is NULL
10118 
10119 Returns:        pointer to compiled data block, or NULL on error,
10120                 with errorcode and erroroffset set
10121 */
10122 
10123 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_compile(PCRE2_SPTR pattern,PCRE2_SIZE patlen,uint32_t options,int * errorptr,PCRE2_SIZE * erroroffset,pcre2_compile_context * ccontext)10124 pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
10125    int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
10126 {
10127 BOOL utf;                             /* Set TRUE for UTF mode */
10128 BOOL ucp;                             /* Set TRUE for UCP mode */
10129 BOOL has_lookbehind = FALSE;          /* Set TRUE if a lookbehind is found */
10130 BOOL zero_terminated;                 /* Set TRUE for zero-terminated pattern */
10131 pcre2_real_code *re = NULL;           /* What we will return */
10132 compile_block cb;                     /* "Static" compile-time data */
10133 const uint8_t *tables;                /* Char tables base pointer */
10134 
10135 PCRE2_UCHAR *code;                    /* Current pointer in compiled code */
10136 PCRE2_SPTR codestart;                 /* Start of compiled code */
10137 PCRE2_SPTR ptr;                       /* Current pointer in pattern */
10138 uint32_t *pptr;                       /* Current pointer in parsed pattern */
10139 
10140 PCRE2_SIZE length = 1;                /* Allow for final END opcode */
10141 PCRE2_SIZE usedlength;                /* Actual length used */
10142 PCRE2_SIZE re_blocksize;              /* Size of memory block */
10143 PCRE2_SIZE big32count = 0;            /* 32-bit literals >= 0x80000000 */
10144 PCRE2_SIZE parsed_size_needed;        /* Needed for parsed pattern */
10145 
10146 uint32_t firstcuflags, reqcuflags;    /* Type of first/req code unit */
10147 uint32_t firstcu, reqcu;              /* Value of first/req code unit */
10148 uint32_t setflags = 0;                /* NL and BSR set flags */
10149 
10150 uint32_t skipatstart;                 /* When checking (*UTF) etc */
10151 uint32_t limit_heap  = UINT32_MAX;
10152 uint32_t limit_match = UINT32_MAX;    /* Unset match limits */
10153 uint32_t limit_depth = UINT32_MAX;
10154 
10155 int newline = 0;                      /* Unset; can be set by the pattern */
10156 int bsr = 0;                          /* Unset; can be set by the pattern */
10157 int errorcode = 0;                    /* Initialize to avoid compiler warn */
10158 int regexrc;                          /* Return from compile */
10159 
10160 uint32_t i;                           /* Local loop counter */
10161 
10162 /* Comments at the head of this file explain about these variables. */
10163 
10164 uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];
10165 uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];
10166 named_group named_groups[NAMED_GROUP_LIST_SIZE];
10167 
10168 /* The workspace is used in different ways in the different compiling phases.
10169 It needs to be 16-bit aligned for the preliminary parsing scan. */
10170 
10171 uint32_t c16workspace[C16_WORK_SIZE];
10172 PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace;
10173 
10174 
10175 /* -------------- Check arguments and set up the pattern ----------------- */
10176 
10177 /* There must be error code and offset pointers. */
10178 
10179 if (errorptr == NULL || erroroffset == NULL) return NULL;
10180 *errorptr = ERR0;
10181 *erroroffset = 0;
10182 
10183 /* There must be a pattern, but NULL is allowed with zero length. */
10184 
10185 if (pattern == NULL)
10186   {
10187   if (patlen == 0) pattern = (PCRE2_SPTR)""; else
10188     {
10189     *errorptr = ERR16;
10190     return NULL;
10191     }
10192   }
10193 
10194 /* A NULL compile context means "use a default context" */
10195 
10196 if (ccontext == NULL)
10197   ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
10198 
10199 /* PCRE2_MATCH_INVALID_UTF implies UTF */
10200 
10201 if ((options & PCRE2_MATCH_INVALID_UTF) != 0) options |= PCRE2_UTF;
10202 
10203 /* Check that all undefined public option bits are zero. */
10204 
10205 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 ||
10206     (ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)
10207   {
10208   *errorptr = ERR17;
10209   return NULL;
10210   }
10211 
10212 if ((options & PCRE2_LITERAL) != 0 &&
10213     ((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 ||
10214      (ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))
10215   {
10216   *errorptr = ERR92;
10217   return NULL;
10218   }
10219 
10220 /* A zero-terminated pattern is indicated by the special length value
10221 PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */
10222 
10223 if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))
10224   patlen = PRIV(strlen)(pattern);
10225 
10226 if (patlen > ccontext->max_pattern_length)
10227   {
10228   *errorptr = ERR88;
10229   return NULL;
10230   }
10231 
10232 /* From here on, all returns from this function should end up going via the
10233 EXIT label. */
10234 
10235 
10236 /* ------------ Initialize the "static" compile data -------------- */
10237 
10238 tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
10239 
10240 cb.lcc = tables + lcc_offset;          /* Individual */
10241 cb.fcc = tables + fcc_offset;          /*   character */
10242 cb.cbits = tables + cbits_offset;      /*      tables */
10243 cb.ctypes = tables + ctypes_offset;
10244 
10245 cb.assert_depth = 0;
10246 cb.bracount = 0;
10247 cb.cx = ccontext;
10248 cb.dupnames = FALSE;
10249 cb.end_pattern = pattern + patlen;
10250 cb.erroroffset = 0;
10251 cb.external_flags = 0;
10252 cb.external_options = options;
10253 cb.groupinfo = stack_groupinfo;
10254 cb.had_recurse = FALSE;
10255 cb.lastcapture = 0;
10256 cb.max_lookbehind = 0;                               /* Max encountered */
10257 cb.max_varlookbehind = ccontext->max_varlookbehind;  /* Limit */
10258 cb.name_entry_size = 0;
10259 cb.name_table = NULL;
10260 cb.named_groups = named_groups;
10261 cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
10262 cb.names_found = 0;
10263 cb.parens_depth = 0;
10264 cb.parsed_pattern = stack_parsed_pattern;
10265 cb.req_varyopt = 0;
10266 cb.start_code = cworkspace;
10267 cb.start_pattern = pattern;
10268 cb.start_workspace = cworkspace;
10269 cb.workspace_size = COMPILE_WORK_SIZE;
10270 
10271 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
10272 references to help in deciding whether (.*) can be treated as anchored or not.
10273 */
10274 
10275 cb.top_backref = 0;
10276 cb.backref_map = 0;
10277 
10278 /* Escape sequences \1 to \9 are always back references, but as they are only
10279 two characters long, only two elements can be used in the parsed_pattern
10280 vector. The first contains the reference, and we'd like to use the second to
10281 record the offset in the pattern, so that forward references to non-existent
10282 groups can be diagnosed later with an offset. However, on 64-bit systems,
10283 PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first
10284 occurrence of \1 to \9, indexed by the second parsed_pattern value. All other
10285 references have enough space for the offset to be put into the parsed pattern.
10286 */
10287 
10288 for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET;
10289 
10290 
10291 /* --------------- Start looking at the pattern --------------- */
10292 
10293 /* Unless PCRE2_LITERAL is set, check for global one-time option settings at
10294 the start of the pattern, and remember the offset to the actual regex. With
10295 valgrind support, make the terminator of a zero-terminated pattern
10296 inaccessible. This catches bugs that would otherwise only show up for
10297 non-zero-terminated patterns. */
10298 
10299 #ifdef SUPPORT_VALGRIND
10300 if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1));
10301 #endif
10302 
10303 ptr = pattern;
10304 skipatstart = 0;
10305 
10306 if ((options & PCRE2_LITERAL) == 0)
10307   {
10308   while (patlen - skipatstart >= 2 &&
10309          ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
10310          ptr[skipatstart+1] == CHAR_ASTERISK)
10311     {
10312     for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
10313       {
10314       uint32_t c, pp;
10315       const pso *p = pso_list + i;
10316 
10317       if (patlen - skipatstart - 2 >= p->length &&
10318           PRIV(strncmp_c8)(ptr + skipatstart + 2, (char *)(p->name),
10319             p->length) == 0)
10320         {
10321         skipatstart += p->length + 2;
10322         switch(p->type)
10323           {
10324           case PSO_OPT:
10325           cb.external_options |= p->value;
10326           break;
10327 
10328           case PSO_FLG:
10329           setflags |= p->value;
10330           break;
10331 
10332           case PSO_NL:
10333           newline = p->value;
10334           setflags |= PCRE2_NL_SET;
10335           break;
10336 
10337           case PSO_BSR:
10338           bsr = p->value;
10339           setflags |= PCRE2_BSR_SET;
10340           break;
10341 
10342           case PSO_LIMM:
10343           case PSO_LIMD:
10344           case PSO_LIMH:
10345           c = 0;
10346           pp = skipatstart;
10347           if (!IS_DIGIT(ptr[pp]))
10348             {
10349             errorcode = ERR60;
10350             ptr += pp;
10351             goto HAD_EARLY_ERROR;
10352             }
10353           while (IS_DIGIT(ptr[pp]))
10354             {
10355             if (c > UINT32_MAX / 10 - 1) break;   /* Integer overflow */
10356             c = c*10 + (ptr[pp++] - CHAR_0);
10357             }
10358           if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS)
10359             {
10360             errorcode = ERR60;
10361             ptr += pp;
10362             goto HAD_EARLY_ERROR;
10363             }
10364           if (p->type == PSO_LIMH) limit_heap = c;
10365             else if (p->type == PSO_LIMM) limit_match = c;
10366             else limit_depth = c;
10367           skipatstart += pp - skipatstart;
10368           break;
10369           }
10370         break;   /* Out of the table scan loop */
10371         }
10372       }
10373     if (i >= sizeof(pso_list)/sizeof(pso)) break;   /* Out of pso loop */
10374     }
10375   }
10376 
10377 /* End of pattern-start options; advance to start of real regex. */
10378 
10379 ptr += skipatstart;
10380 
10381 /* Can't support UTF or UCP if PCRE2 was built without Unicode support. */
10382 
10383 #ifndef SUPPORT_UNICODE
10384 if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
10385   {
10386   errorcode = ERR32;
10387   goto HAD_EARLY_ERROR;
10388   }
10389 #endif
10390 
10391 /* Check UTF. We have the original options in 'options', with that value as
10392 modified by (*UTF) etc in cb->external_options. The extra option
10393 PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
10394 surrogate code points cannot be represented in UTF-16. */
10395 
10396 utf = (cb.external_options & PCRE2_UTF) != 0;
10397 if (utf)
10398   {
10399   if ((options & PCRE2_NEVER_UTF) != 0)
10400     {
10401     errorcode = ERR74;
10402     goto HAD_EARLY_ERROR;
10403     }
10404   if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
10405        (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
10406     goto HAD_ERROR;  /* Offset was set by valid_utf() */
10407 
10408 #if PCRE2_CODE_UNIT_WIDTH == 16
10409   if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
10410     {
10411     errorcode = ERR91;
10412     goto HAD_EARLY_ERROR;
10413     }
10414 #endif
10415   }
10416 
10417 /* Check UCP lockout. */
10418 
10419 ucp = (cb.external_options & PCRE2_UCP) != 0;
10420 if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
10421   {
10422   errorcode = ERR75;
10423   goto HAD_EARLY_ERROR;
10424   }
10425 
10426 /* Process the BSR setting. */
10427 
10428 if (bsr == 0) bsr = ccontext->bsr_convention;
10429 
10430 /* Process the newline setting. */
10431 
10432 if (newline == 0) newline = ccontext->newline_convention;
10433 cb.nltype = NLTYPE_FIXED;
10434 switch(newline)
10435   {
10436   case PCRE2_NEWLINE_CR:
10437   cb.nllen = 1;
10438   cb.nl[0] = CHAR_CR;
10439   break;
10440 
10441   case PCRE2_NEWLINE_LF:
10442   cb.nllen = 1;
10443   cb.nl[0] = CHAR_NL;
10444   break;
10445 
10446   case PCRE2_NEWLINE_NUL:
10447   cb.nllen = 1;
10448   cb.nl[0] = CHAR_NUL;
10449   break;
10450 
10451   case PCRE2_NEWLINE_CRLF:
10452   cb.nllen = 2;
10453   cb.nl[0] = CHAR_CR;
10454   cb.nl[1] = CHAR_NL;
10455   break;
10456 
10457   case PCRE2_NEWLINE_ANY:
10458   cb.nltype = NLTYPE_ANY;
10459   break;
10460 
10461   case PCRE2_NEWLINE_ANYCRLF:
10462   cb.nltype = NLTYPE_ANYCRLF;
10463   break;
10464 
10465   default:
10466   errorcode = ERR56;
10467   goto HAD_EARLY_ERROR;
10468   }
10469 
10470 /* Pre-scan the pattern to do two things: (1) Discover the named groups and
10471 their numerical equivalents, so that this information is always available for
10472 the remaining processing. (2) At the same time, parse the pattern and put a
10473 processed version into the parsed_pattern vector. This has escapes interpreted
10474 and comments removed (amongst other things).
10475 
10476 In all but one case, when PCRE2_AUTO_CALLOUT is not set, the number of unsigned
10477 32-bit ints in the parsed pattern is bounded by the length of the pattern plus
10478 one (for the terminator) plus four if PCRE2_EXTRA_WORD or PCRE2_EXTRA_LINE is
10479 set. The exceptional case is when running in 32-bit, non-UTF mode, when literal
10480 characters greater than META_END (0x80000000) have to be coded as two units. In
10481 this case, therefore, we scan the pattern to check for such values. */
10482 
10483 #if PCRE2_CODE_UNIT_WIDTH == 32
10484 if (!utf)
10485   {
10486   PCRE2_SPTR p;
10487   for (p = ptr; p < cb.end_pattern; p++) if (*p >= META_END) big32count++;
10488   }
10489 #endif
10490 
10491 /* Ensure that the parsed pattern buffer is big enough. When PCRE2_AUTO_CALLOUT
10492 is set we have to assume a numerical callout (4 elements) for each character
10493 plus one at the end. This is overkill, but memory is plentiful these days. For
10494 many smaller patterns the vector on the stack (which was set up above) can be
10495 used. */
10496 
10497 parsed_size_needed = patlen - skipatstart + big32count;
10498 
10499 if ((ccontext->extra_options &
10500      (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0)
10501   parsed_size_needed += 4;
10502 
10503 if ((options & PCRE2_AUTO_CALLOUT) != 0)
10504   parsed_size_needed = (parsed_size_needed + 1) * 5;
10505 
10506 if (parsed_size_needed >= PARSED_PATTERN_DEFAULT_SIZE)
10507   {
10508   uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(
10509     (parsed_size_needed + 1) * sizeof(uint32_t), ccontext->memctl.memory_data);
10510   if (heap_parsed_pattern == NULL)
10511     {
10512     *errorptr = ERR21;
10513     goto EXIT;
10514     }
10515   cb.parsed_pattern = heap_parsed_pattern;
10516   }
10517 cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed + 1;
10518 
10519 /* Do the parsing scan. */
10520 
10521 errorcode = parse_regex(ptr, cb.external_options, &has_lookbehind, &cb);
10522 if (errorcode != 0) goto HAD_CB_ERROR;
10523 
10524 /* If there are any lookbehinds, scan the parsed pattern to figure out their
10525 lengths. Workspace is needed to remember whether numbered groups are or are not
10526 of limited length, and if limited, what the minimum and maximum lengths are.
10527 This caching saves re-computing the length of any group that is referenced more
10528 than once, which is particularly relevant when recursion is involved.
10529 Unnumbered groups do not have this exposure because they cannot be referenced.
10530 If there are sufficiently few groups, the default index vector on the stack, as
10531 set up above, can be used. Otherwise we have to get/free some heap memory. The
10532 vector must be initialized to zero. */
10533 
10534 if (has_lookbehind)
10535   {
10536   int loopcount = 0;
10537   if (cb.bracount >= GROUPINFO_DEFAULT_SIZE/2)
10538     {
10539     cb.groupinfo = ccontext->memctl.malloc(
10540       (2 * (cb.bracount + 1))*sizeof(uint32_t), ccontext->memctl.memory_data);
10541     if (cb.groupinfo == NULL)
10542       {
10543       errorcode = ERR21;
10544       cb.erroroffset = 0;
10545       goto HAD_CB_ERROR;
10546       }
10547     }
10548   memset(cb.groupinfo, 0, (2 * cb.bracount + 1) * sizeof(uint32_t));
10549   errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb, &loopcount);
10550   if (errorcode != 0) goto HAD_CB_ERROR;
10551   }
10552 
10553 /* For debugging, there is a function that shows the parsed pattern vector. */
10554 
10555 #ifdef DEBUG_SHOW_PARSED
10556 fprintf(stderr, "+++ Pre-scan complete:\n");
10557 show_parsed(&cb);
10558 #endif
10559 
10560 /* For debugging capturing information this code can be enabled. */
10561 
10562 #ifdef DEBUG_SHOW_CAPTURES
10563   {
10564   named_group *ng = cb.named_groups;
10565   fprintf(stderr, "+++Captures: %d\n", cb.bracount);
10566   for (i = 0; i < cb.names_found; i++, ng++)
10567     {
10568     fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
10569     }
10570   }
10571 #endif
10572 
10573 /* Pretend to compile the pattern while actually just accumulating the amount
10574 of memory required in the 'length' variable. This behaviour is triggered by
10575 passing a non-NULL final argument to compile_regex(). We pass a block of
10576 workspace (cworkspace) for it to compile parts of the pattern into; the
10577 compiled code is discarded when it is no longer needed, so hopefully this
10578 workspace will never overflow, though there is a test for its doing so.
10579 
10580 On error, errorcode will be set non-zero, so we don't need to look at the
10581 result of the function. The initial options have been put into the cb block,
10582 but we still have to pass a separate options variable (the first argument)
10583 because the options may change as the pattern is processed. */
10584 
10585 cb.erroroffset = patlen;   /* For any subsequent errors that do not set it */
10586 pptr = cb.parsed_pattern;
10587 code = cworkspace;
10588 *code = OP_BRA;
10589 
10590 (void)compile_regex(cb.external_options, ccontext->extra_options, &code, &pptr,
10591    &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, NULL,
10592    &cb, &length);
10593 
10594 if (errorcode != 0) goto HAD_CB_ERROR;  /* Offset is in cb.erroroffset */
10595 
10596 /* This should be caught in compile_regex(), but just in case... */
10597 
10598 if (length > MAX_PATTERN_SIZE)
10599   {
10600   errorcode = ERR20;
10601   goto HAD_CB_ERROR;
10602   }
10603 
10604 /* Compute the size of, and then get and initialize, the data block for storing
10605 the compiled pattern and names table. Integer overflow should no longer be
10606 possible because nowadays we limit the maximum value of cb.names_found and
10607 cb.name_entry_size. */
10608 
10609 re_blocksize = sizeof(pcre2_real_code) +
10610   CU2BYTES(length +
10611   (PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
10612 re = (pcre2_real_code *)
10613   ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
10614 if (re == NULL)
10615   {
10616   errorcode = ERR21;
10617   goto HAD_CB_ERROR;
10618   }
10619 
10620 /* The compiler may put padding at the end of the pcre2_real_code structure in
10621 order to round it up to a multiple of 4 or 8 bytes. This means that when a
10622 compiled pattern is copied (for example, when serialized) undefined bytes are
10623 read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
10624 write to the last 8 bytes of the structure before setting the fields. */
10625 
10626 memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
10627 re->memctl = ccontext->memctl;
10628 re->tables = tables;
10629 re->executable_jit = NULL;
10630 memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
10631 re->blocksize = re_blocksize;
10632 re->magic_number = MAGIC_NUMBER;
10633 re->compile_options = options;
10634 re->overall_options = cb.external_options;
10635 re->extra_options = ccontext->extra_options;
10636 re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
10637 re->limit_heap = limit_heap;
10638 re->limit_match = limit_match;
10639 re->limit_depth = limit_depth;
10640 re->first_codeunit = 0;
10641 re->last_codeunit = 0;
10642 re->bsr_convention = bsr;
10643 re->newline_convention = newline;
10644 re->max_lookbehind = 0;
10645 re->minlength = 0;
10646 re->top_bracket = 0;
10647 re->top_backref = 0;
10648 re->name_entry_size = cb.name_entry_size;
10649 re->name_count = cb.names_found;
10650 
10651 /* The basic block is immediately followed by the name table, and the compiled
10652 code follows after that. */
10653 
10654 codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) +
10655   re->name_entry_size * re->name_count;
10656 
10657 /* Update the compile data block for the actual compile. The starting points of
10658 the name/number translation table and of the code are passed around in the
10659 compile data block. The start/end pattern and initial options are already set
10660 from the pre-compile phase, as is the name_entry_size field. */
10661 
10662 cb.parens_depth = 0;
10663 cb.assert_depth = 0;
10664 cb.lastcapture = 0;
10665 cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
10666 cb.start_code = codestart;
10667 cb.req_varyopt = 0;
10668 cb.had_accept = FALSE;
10669 cb.had_pruneorskip = FALSE;
10670 
10671 /* If any named groups were found, create the name/number table from the list
10672 created in the pre-pass. */
10673 
10674 if (cb.names_found > 0)
10675   {
10676   named_group *ng = cb.named_groups;
10677   for (i = 0; i < cb.names_found; i++, ng++)
10678     add_name_to_table(&cb, ng->name, ng->length, ng->number, i);
10679   }
10680 
10681 /* Set up a starting, non-extracting bracket, then compile the expression. On
10682 error, errorcode will be set non-zero, so we don't need to look at the result
10683 of the function here. */
10684 
10685 pptr = cb.parsed_pattern;
10686 code = (PCRE2_UCHAR *)codestart;
10687 *code = OP_BRA;
10688 regexrc = compile_regex(re->overall_options, ccontext->extra_options, &code,
10689   &pptr, &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL,
10690   NULL, &cb, NULL);
10691 if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
10692 re->top_bracket = cb.bracount;
10693 re->top_backref = cb.top_backref;
10694 re->max_lookbehind = cb.max_lookbehind;
10695 
10696 if (cb.had_accept)
10697   {
10698   reqcu = 0;                     /* Must disable after (*ACCEPT) */
10699   reqcuflags = REQ_NONE;
10700   re->flags |= PCRE2_HASACCEPT;  /* Disables minimum length */
10701   }
10702 
10703 /* Fill in the final opcode and check for disastrous overflow. If no overflow,
10704 but the estimated length exceeds the really used length, adjust the value of
10705 re->blocksize, and if valgrind support is configured, mark the extra allocated
10706 memory as unaddressable, so that any out-of-bound reads can be detected. */
10707 
10708 *code++ = OP_END;
10709 usedlength = code - codestart;
10710 if (usedlength > length) errorcode = ERR23; else
10711   {
10712   re->blocksize -= CU2BYTES(length - usedlength);
10713 #ifdef SUPPORT_VALGRIND
10714   VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
10715 #endif
10716   }
10717 
10718 /* Scan the pattern for recursion/subroutine calls and convert the group
10719 numbers into offsets. Maintain a small cache so that repeated groups containing
10720 recursions are efficiently handled. */
10721 
10722 #define RSCAN_CACHE_SIZE 8
10723 
10724 if (errorcode == 0 && cb.had_recurse)
10725   {
10726   PCRE2_UCHAR *rcode;
10727   PCRE2_SPTR rgroup;
10728   unsigned int ccount = 0;
10729   int start = RSCAN_CACHE_SIZE;
10730   recurse_cache rc[RSCAN_CACHE_SIZE];
10731 
10732   for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf);
10733        rcode != NULL;
10734        rcode = (PCRE2_UCHAR *)find_recurse(rcode + 1 + LINK_SIZE, utf))
10735     {
10736     int p, groupnumber;
10737 
10738     groupnumber = (int)GET(rcode, 1);
10739     if (groupnumber == 0) rgroup = codestart; else
10740       {
10741       PCRE2_SPTR search_from = codestart;
10742       rgroup = NULL;
10743       for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
10744         {
10745         if (groupnumber == rc[p].groupnumber)
10746           {
10747           rgroup = rc[p].group;
10748           break;
10749           }
10750 
10751         /* Group n+1 must always start to the right of group n, so we can save
10752         search time below when the new group number is greater than any of the
10753         previously found groups. */
10754 
10755         if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;
10756         }
10757 
10758       if (rgroup == NULL)
10759         {
10760         rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);
10761         if (rgroup == NULL)
10762           {
10763           errorcode = ERR53;
10764           break;
10765           }
10766         if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
10767         rc[start].groupnumber = groupnumber;
10768         rc[start].group = rgroup;
10769         if (ccount < RSCAN_CACHE_SIZE) ccount++;
10770         }
10771       }
10772 
10773     PUT(rcode, 1, rgroup - codestart);
10774     }
10775   }
10776 
10777 /* In rare debugging situations we sometimes need to look at the compiled code
10778 at this stage. */
10779 
10780 #ifdef DEBUG_CALL_PRINTINT
10781 pcre2_printint(re, stderr, TRUE);
10782 fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
10783 #endif
10784 
10785 /* Unless disabled, check whether any single character iterators can be
10786 auto-possessified. The function overwrites the appropriate opcode values, so
10787 the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
10788 used in this code because at least one compiler gives a warning about loss of
10789 "const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the
10790 function call. */
10791 
10792 if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
10793   {
10794   PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
10795   if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80;
10796   }
10797 
10798 /* Failed to compile, or error while post-processing. */
10799 
10800 if (errorcode != 0) goto HAD_CB_ERROR;
10801 
10802 /* Successful compile. If the anchored option was not passed, set it if
10803 we can determine that the pattern is anchored by virtue of ^ characters or \A
10804 or anything else, such as starting with non-atomic .* when DOTALL is set and
10805 there are no occurrences of *PRUNE or *SKIP (though there is an option to
10806 disable this case). */
10807 
10808 if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10809      is_anchored(codestart, 0, &cb, 0, FALSE))
10810   re->overall_options |= PCRE2_ANCHORED;
10811 
10812 /* Set up the first code unit or startline flag, the required code unit, and
10813 then study the pattern. This code need not be obeyed if PCRE2_NO_START_OPTIMIZE
10814 is set, as the data it would create will not be used. Note that a first code
10815 unit (but not the startline flag) is useful for anchored patterns because it
10816 can still give a quick "no match" and also avoid searching for a last code
10817 unit. */
10818 
10819 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
10820   {
10821   int minminlength = 0;  /* For minimal minlength from first/required CU */
10822 
10823   /* If we do not have a first code unit, see if there is one that is asserted
10824   (these are not saved during the compile because they can cause conflicts with
10825   actual literals that follow). */
10826 
10827   if (firstcuflags >= REQ_NONE)
10828     firstcu = find_firstassertedcu(codestart, &firstcuflags, 0);
10829 
10830   /* Save the data for a first code unit. The existence of one means the
10831   minimum length must be at least 1. */
10832 
10833   if (firstcuflags < REQ_NONE)
10834     {
10835     re->first_codeunit = firstcu;
10836     re->flags |= PCRE2_FIRSTSET;
10837     minminlength++;
10838 
10839     /* Handle caseless first code units. */
10840 
10841     if ((firstcuflags & REQ_CASELESS) != 0)
10842       {
10843       if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
10844         {
10845         if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
10846         }
10847 
10848       /* The first code unit is >= 128 in UTF or UCP mode, or >= 255 otherwise.
10849       In 8-bit UTF mode, code units in the range 128-255 are introductory code
10850       units and cannot have another case, but if UCP is set they may do. */
10851 
10852 #ifdef SUPPORT_UNICODE
10853 #if PCRE2_CODE_UNIT_WIDTH == 8
10854       else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
10855         re->flags |= PCRE2_FIRSTCASELESS;
10856 #else
10857       else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
10858                UCD_OTHERCASE(firstcu) != firstcu)
10859         re->flags |= PCRE2_FIRSTCASELESS;
10860 #endif
10861 #endif  /* SUPPORT_UNICODE */
10862       }
10863     }
10864 
10865   /* When there is no first code unit, for non-anchored patterns, see if we can
10866   set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
10867   branches start with ^ and also when all branches start with non-atomic .* for
10868   non-DOTALL matches when *PRUNE and *SKIP are not present. (There is an option
10869   that disables this case.) */
10870 
10871   else if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10872            is_startline(codestart, 0, &cb, 0, FALSE))
10873     re->flags |= PCRE2_STARTLINE;
10874 
10875   /* Handle the "required code unit", if one is set. In the UTF case we can
10876   increment the minimum minimum length only if we are sure this really is a
10877   different character and not a non-starting code unit of the first character,
10878   because the minimum length count is in characters, not code units. */
10879 
10880   if (reqcuflags < REQ_NONE)
10881     {
10882 #if PCRE2_CODE_UNIT_WIDTH == 16
10883     if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
10884         firstcuflags >= REQ_NONE ||                 /* First not set */
10885         (firstcu & 0xf800) != 0xd800 ||             /* First not surrogate */
10886         (reqcu & 0xfc00) != 0xdc00)                 /* Req not low surrogate */
10887 #elif PCRE2_CODE_UNIT_WIDTH == 8
10888     if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
10889         firstcuflags >= REQ_NONE ||                 /* First not set */
10890         (firstcu & 0x80) == 0 ||                    /* First is ASCII */
10891         (reqcu & 0x80) == 0)                        /* Req is ASCII */
10892 #endif
10893       {
10894       minminlength++;
10895       }
10896 
10897     /* In the case of an anchored pattern, set up the value only if it follows
10898     a variable length item in the pattern. */
10899 
10900     if ((re->overall_options & PCRE2_ANCHORED) == 0 ||
10901         (reqcuflags & REQ_VARY) != 0)
10902       {
10903       re->last_codeunit = reqcu;
10904       re->flags |= PCRE2_LASTSET;
10905 
10906       /* Handle caseless required code units as for first code units (above). */
10907 
10908       if ((reqcuflags & REQ_CASELESS) != 0)
10909         {
10910         if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
10911           {
10912           if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
10913           }
10914 #ifdef SUPPORT_UNICODE
10915 #if PCRE2_CODE_UNIT_WIDTH == 8
10916       else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
10917         re->flags |= PCRE2_LASTCASELESS;
10918 #else
10919       else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
10920                UCD_OTHERCASE(reqcu) != reqcu)
10921         re->flags |= PCRE2_LASTCASELESS;
10922 #endif
10923 #endif  /* SUPPORT_UNICODE */
10924         }
10925       }
10926     }
10927 
10928   /* Study the compiled pattern to set up information such as a bitmap of
10929   starting code units and a minimum matching length. */
10930 
10931   if (PRIV(study)(re) != 0)
10932     {
10933     errorcode = ERR31;
10934     goto HAD_CB_ERROR;
10935     }
10936 
10937   /* If study() set a bitmap of starting code units, it implies a minimum
10938   length of at least one. */
10939 
10940   if ((re->flags & PCRE2_FIRSTMAPSET) != 0 && minminlength == 0)
10941     minminlength = 1;
10942 
10943   /* If the minimum length set (or not set) by study() is less than the minimum
10944   implied by required code units, override it. */
10945 
10946   if (re->minlength < minminlength) re->minlength = minminlength;
10947   }   /* End of start-of-match optimizations. */
10948 
10949 /* Control ends up here in all cases. When running under valgrind, make a
10950 pattern's terminating zero defined again. If memory was obtained for the parsed
10951 version of the pattern, free it before returning. Also free the list of named
10952 groups if a larger one had to be obtained, and likewise the group information
10953 vector. */
10954 
10955 EXIT:
10956 #ifdef SUPPORT_VALGRIND
10957 if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1));
10958 #endif
10959 if (cb.parsed_pattern != stack_parsed_pattern)
10960   ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);
10961 if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
10962   ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
10963 if (cb.groupinfo != stack_groupinfo)
10964   ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
10965 return re;    /* Will be NULL after an error */
10966 
10967 /* Errors discovered in parse_regex() set the offset value in the compile
10968 block. Errors discovered before it is called must compute it from the ptr
10969 value. After parse_regex() is called, the offset in the compile block is set to
10970 the end of the pattern, but certain errors in compile_regex() may reset it if
10971 an offset is available in the parsed pattern. */
10972 
10973 HAD_CB_ERROR:
10974 ptr = pattern + cb.erroroffset;
10975 
10976 HAD_EARLY_ERROR:
10977 *erroroffset = ptr - pattern;
10978 
10979 HAD_ERROR:
10980 *errorptr = errorcode;
10981 pcre2_code_free(re);
10982 re = NULL;
10983 goto EXIT;
10984 }
10985 
10986 /* These #undefs are here to enable unity builds with CMake. */
10987 
10988 #undef NLBLOCK /* Block containing newline information */
10989 #undef PSSTART /* Field containing processed string start */
10990 #undef PSEND   /* Field containing processed string end */
10991 
10992 /* End of pcre2_compile.c */
10993