xref: /PHP-8.2/ext/pcre/pcre2lib/pcre2_compile.c (revision c4e8f652)
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10           New API code Copyright (c) 2016-2022 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45 
46 #define NLBLOCK cb             /* Block containing newline information */
47 #define PSSTART start_pattern  /* Field containing processed string start */
48 #define PSEND   end_pattern    /* Field containing processed string end */
49 
50 #include "pcre2_internal.h"
51 
52 /* In rare error cases debugging might require calling pcre2_printint(). */
53 
54 #if 0
55 #ifdef EBCDIC
56 #define PRINTABLE(c) ((c) >= 64 && (c) < 255)
57 #else
58 #define PRINTABLE(c) ((c) >= 32 && (c) < 127)
59 #endif
60 #include "pcre2_printint.c"
61 #define DEBUG_CALL_PRINTINT
62 #endif
63 
64 /* Other debugging code can be enabled by these defines. */
65 
66 /* #define DEBUG_SHOW_CAPTURES */
67 /* #define DEBUG_SHOW_PARSED */
68 
69 /* There are a few things that vary with different code unit sizes. Handle them
70 by defining macros in order to minimize #if usage. */
71 
#if PCRE2_CODE_UNIT_WIDTH == 8
/* 8-bit code units: the value is used directly as an index into xdigitab. */
#define XDIGIT(c)                xdigitab[c]
#define STRING_UTFn_RIGHTPAR     STRING_UTF8_RIGHTPAR, 5

#elif PCRE2_CODE_UNIT_WIDTH == 16
/* Wider code units: anything MAX_255() rejects yields 0xff (the "not a hex
digit" value) instead of indexing xdigitab. */
#define XDIGIT(c)                (MAX_255(c)? xdigitab[c] : 0xff)
#define STRING_UTFn_RIGHTPAR     STRING_UTF16_RIGHTPAR, 6

#else  /* 32-bit */
#define XDIGIT(c)                (MAX_255(c)? xdigitab[c] : 0xff)
#define STRING_UTFn_RIGHTPAR     STRING_UTF32_RIGHTPAR, 6
#endif
86 
87 /* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which
88 consists of uint32_t elements. Assume that if uint32_t can't hold it, two of
89 them will be able to (i.e. assume a 64-bit world). */
90 
/* In every macro below, s is the PCRE2_SIZE value (an lvalue for the GET and
READ forms) and p is a uint32_t pointer into the parsed pattern, which is
advanced by every form except READPLUSOFFSET. The "PLUS" forms read the value
that starts one element after p. SIZEOFFSET is the number of uint32_t elements
one stored value occupies. Parameters are parenthesized so that expression
arguments (for example "ptr + 1" or "a | b") expand correctly; note that p is
still evaluated more than once in several of the macros. */

#if PCRE2_SIZE_MAX <= UINT32_MAX
#define PUTOFFSET(s,p) *(p)++ = (s)
#define GETOFFSET(s,p) (s) = *(p)++
#define GETPLUSOFFSET(s,p) (s) = *(++(p))
#define READPLUSOFFSET(s,p) (s) = (p)[1]
#define SKIPOFFSET(p) (p)++
#define SIZEOFFSET 1
#else
/* Two elements per value: high 32 bits first, then low 32 bits. */
#define PUTOFFSET(s,p) \
  { *(p)++ = (uint32_t)((s) >> 32); *(p)++ = (uint32_t)((s) & 0xffffffff); }
#define GETOFFSET(s,p) \
  { (s) = ((PCRE2_SIZE)(p)[0] << 32) | (PCRE2_SIZE)(p)[1]; (p) += 2; }
#define GETPLUSOFFSET(s,p) \
  { (s) = ((PCRE2_SIZE)(p)[1] << 32) | (PCRE2_SIZE)(p)[2]; (p) += 2; }
#define READPLUSOFFSET(s,p) \
  { (s) = ((PCRE2_SIZE)(p)[1] << 32) | (PCRE2_SIZE)(p)[2]; }
#define SKIPOFFSET(p) (p) += 2
#define SIZEOFFSET 2
#endif
110 
111 /* Macros for manipulating elements of the parsed pattern vector. */
112 
/* META_CODE yields the top 16 bits (the item code) of an element, META_DATA
the low 16 bits (the item's data), and META_DIFF the distance between two
codes counted in code steps. Parameters are parenthesized so that expression
arguments such as "a | b" or "y + z" are treated as a single operand. */

#define META_CODE(x)   ((x) & 0xffff0000u)
#define META_DATA(x)   ((x) & 0x0000ffffu)
#define META_DIFF(x,y) (((x)-(y))>>16)
116 
117 /* Function prototypes (forward declarations) to allow mutual recursion */
118 
119 #ifdef SUPPORT_UNICODE
120 static unsigned int
121   add_list_to_class_internal(uint8_t *, PCRE2_UCHAR **, uint32_t,
122     compile_block *, const uint32_t *, unsigned int);
123 #endif
124 
125 static int
126   compile_regex(uint32_t, PCRE2_UCHAR **, uint32_t **, int *, uint32_t,
127     uint32_t *, uint32_t *, uint32_t *, uint32_t *, branch_chain *,
128     compile_block *, PCRE2_SIZE *);
129 
130 static int
131   get_branchlength(uint32_t **, int *, int *, parsed_recurse_check *,
132     compile_block *);
133 
134 static BOOL
135   set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,
136     compile_block *);
137 
138 static int
139   check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *,
140     compile_block *, int *);
141 
142 
143 /*************************************************
144 *      Code parameters and static tables         *
145 *************************************************/
146 
147 #define MAX_GROUP_NUMBER   65535u
148 #define MAX_REPEAT_COUNT   65535u
149 #define REPEAT_UNLIMITED   (MAX_REPEAT_COUNT+1)
150 
151 /* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in
152 different ways in the different pattern scans. The parsing and group-
153 identifying pre-scan uses it to handle nesting, and needs it to be 16-bit
154 aligned for this. Having defined the size in code units, we set up
155 C16_WORK_SIZE as the number of elements in the 16-bit vector.
156 
157 During the first compiling phase, when determining how much memory is required,
158 the regex is partly compiled into this space, but the compiled parts are
159 discarded as soon as they can be, so that hopefully there will never be an
160 overrun. The code does, however, check for an overrun, which can occur for
161 pathological patterns. The size of the workspace depends on LINK_SIZE because
162 the length of compiled items varies with this.
163 
164 In the real compile phase, this workspace is not currently used. */
165 
166 #define COMPILE_WORK_SIZE (3000*LINK_SIZE)   /* Size in code units */
167 
168 #define C16_WORK_SIZE \
169   ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t))
170 
171 /* A uint32_t vector is used for caching information about the size of
172 capturing groups, to improve performance. A default is created on the stack of
173 this size. */
174 
175 #define GROUPINFO_DEFAULT_SIZE 256
176 
177 /* The overrun tests check for a slightly smaller size so that they detect the
178 overrun before it actually does run off the end of the data block. */
179 
180 #define WORK_SIZE_SAFETY_MARGIN (100)
181 
182 /* This value determines the size of the initial vector that is used for
183 remembering named groups during the pre-compile. It is allocated on the stack,
184 but if it is too small, it is expanded, in a similar way to the workspace. The
185 value is the number of slots in the list. */
186 
187 #define NAMED_GROUP_LIST_SIZE  20
188 
189 /* The pre-compiling pass over the pattern creates a parsed pattern in a vector
190 of uint32_t. For short patterns this lives on the stack, with this size. Heap
191 memory is used for longer patterns. */
192 
193 #define PARSED_PATTERN_DEFAULT_SIZE 1024
194 
195 /* Maximum length value to check against when making sure that the variable
196 that holds the compiled pattern length does not overflow. We make it a bit less
197 than INT_MAX to allow for adding in group terminating code units, so that we
198 don't have to check them every time. */
199 
200 #define OFLOW_MAX (INT_MAX - 20)
201 
202 /* Code values for parsed patterns, which are stored in a vector of 32-bit
203 unsigned ints. Values less than META_END are literal data values. The coding
204 for identifying the item is in the top 16-bits, leaving 16 bits for the
205 additional data that some of them need. The META_CODE, META_DATA, and META_DIFF
206 macros are used to manipulate parsed pattern elements.
207 
208 NOTE: When these definitions are changed, the table of extra lengths for each
209 code (meta_extra_lengths, just below) must be updated to remain in step. */
210 
211 #define META_END              0x80000000u  /* End of pattern */
212 
213 #define META_ALT              0x80010000u  /* alternation */
214 #define META_ATOMIC           0x80020000u  /* atomic group */
215 #define META_BACKREF          0x80030000u  /* Back ref */
216 #define META_BACKREF_BYNAME   0x80040000u  /* \k'name' */
217 #define META_BIGVALUE         0x80050000u  /* Next is a literal > META_END */
218 #define META_CALLOUT_NUMBER   0x80060000u  /* (?C with numerical argument */
219 #define META_CALLOUT_STRING   0x80070000u  /* (?C with string argument */
220 #define META_CAPTURE          0x80080000u  /* Capturing parenthesis */
221 #define META_CIRCUMFLEX       0x80090000u  /* ^ metacharacter */
222 #define META_CLASS            0x800a0000u  /* start non-empty class */
223 #define META_CLASS_EMPTY      0x800b0000u  /* empty class */
224 #define META_CLASS_EMPTY_NOT  0x800c0000u  /* negative empty class */
225 #define META_CLASS_END        0x800d0000u  /* end of non-empty class */
226 #define META_CLASS_NOT        0x800e0000u  /* start non-empty negative class */
227 #define META_COND_ASSERT      0x800f0000u  /* (?(?assertion)... */
228 #define META_COND_DEFINE      0x80100000u  /* (?(DEFINE)... */
229 #define META_COND_NAME        0x80110000u  /* (?(<name>)... */
230 #define META_COND_NUMBER      0x80120000u  /* (?(digits)... */
231 #define META_COND_RNAME       0x80130000u  /* (?(R&name)... */
232 #define META_COND_RNUMBER     0x80140000u  /* (?(Rdigits)... */
233 #define META_COND_VERSION     0x80150000u  /* (?(VERSION<op>x.y)... */
234 #define META_DOLLAR           0x80160000u  /* $ metacharacter */
235 #define META_DOT              0x80170000u  /* . metacharacter */
236 #define META_ESCAPE           0x80180000u  /* \d and friends */
237 #define META_KET              0x80190000u  /* closing parenthesis */
238 #define META_NOCAPTURE        0x801a0000u  /* no capture parens */
239 #define META_OPTIONS          0x801b0000u  /* (?i) and friends */
240 #define META_POSIX            0x801c0000u  /* POSIX class item */
241 #define META_POSIX_NEG        0x801d0000u  /* negative POSIX class item */
242 #define META_RANGE_ESCAPED    0x801e0000u  /* range with at least one escape */
243 #define META_RANGE_LITERAL    0x801f0000u  /* range defined literally */
244 #define META_RECURSE          0x80200000u  /* Recursion */
245 #define META_RECURSE_BYNAME   0x80210000u  /* (?&name) */
246 #define META_SCRIPT_RUN       0x80220000u  /* (*script_run:...) */
247 
248 /* These must be kept together to make it easy to check that an assertion
249 is present where expected in a conditional group. */
250 
251 #define META_LOOKAHEAD        0x80230000u  /* (?= */
252 #define META_LOOKAHEADNOT     0x80240000u  /* (?! */
253 #define META_LOOKBEHIND       0x80250000u  /* (?<= */
254 #define META_LOOKBEHINDNOT    0x80260000u  /* (?<! */
255 
256 /* These cannot be conditions */
257 
258 #define META_LOOKAHEAD_NA     0x80270000u  /* (*napla: */
259 #define META_LOOKBEHIND_NA    0x80280000u  /* (*naplb: */
260 
261 /* These must be kept in this order, with consecutive values, and the _ARG
262 versions of COMMIT, PRUNE, SKIP, and THEN immediately after their non-argument
263 versions. */
264 
265 #define META_MARK             0x80290000u  /* (*MARK) */
266 #define META_ACCEPT           0x802a0000u  /* (*ACCEPT) */
267 #define META_FAIL             0x802b0000u  /* (*FAIL) */
268 #define META_COMMIT           0x802c0000u  /* These               */
269 #define META_COMMIT_ARG       0x802d0000u  /*   pairs             */
270 #define META_PRUNE            0x802e0000u  /*     must            */
271 #define META_PRUNE_ARG        0x802f0000u  /*       be            */
272 #define META_SKIP             0x80300000u  /*         kept        */
273 #define META_SKIP_ARG         0x80310000u  /*           in        */
274 #define META_THEN             0x80320000u  /*             this    */
275 #define META_THEN_ARG         0x80330000u  /*               order */
276 
277 /* These must be kept in groups of adjacent 3 values, and all together. */
278 
279 #define META_ASTERISK         0x80340000u  /* *  */
280 #define META_ASTERISK_PLUS    0x80350000u  /* *+ */
281 #define META_ASTERISK_QUERY   0x80360000u  /* *? */
282 #define META_PLUS             0x80370000u  /* +  */
283 #define META_PLUS_PLUS        0x80380000u  /* ++ */
284 #define META_PLUS_QUERY       0x80390000u  /* +? */
285 #define META_QUERY            0x803a0000u  /* ?  */
286 #define META_QUERY_PLUS       0x803b0000u  /* ?+ */
287 #define META_QUERY_QUERY      0x803c0000u  /* ?? */
288 #define META_MINMAX           0x803d0000u  /* {n,m}  repeat */
289 #define META_MINMAX_PLUS      0x803e0000u  /* {n,m}+ repeat */
290 #define META_MINMAX_QUERY     0x803f0000u  /* {n,m}? repeat */
291 
292 #define META_FIRST_QUANTIFIER META_ASTERISK
293 #define META_LAST_QUANTIFIER  META_MINMAX_QUERY
294 
295 /* This is a special "meta code" that is used only to distinguish (*asr: from
296 (*sr: in the table of alphabetic assertions. It is never stored in the parsed
297 pattern because (*asr: is turned into (*sr:(*atomic: at that stage. There is
298 therefore no need for it to have a length entry, so use a high value. */
299 
300 #define META_ATOMIC_SCRIPT_RUN 0x8fff0000u
301 
302 /* Table of extra lengths for each of the meta codes. Must be kept in step with
303 the definitions above. For some items these values are a basic length to which
304 a variable amount has to be added. */
305 
306 static unsigned char meta_extra_lengths[] = {
307   0,             /* META_END */
308   0,             /* META_ALT */
309   0,             /* META_ATOMIC */
310   0,             /* META_BACKREF - more if group is >= 10 */
311   1+SIZEOFFSET,  /* META_BACKREF_BYNAME */
312   1,             /* META_BIGVALUE */
313   3,             /* META_CALLOUT_NUMBER */
314   3+SIZEOFFSET,  /* META_CALLOUT_STRING */
315   0,             /* META_CAPTURE */
316   0,             /* META_CIRCUMFLEX */
317   0,             /* META_CLASS */
318   0,             /* META_CLASS_EMPTY */
319   0,             /* META_CLASS_EMPTY_NOT */
320   0,             /* META_CLASS_END */
321   0,             /* META_CLASS_NOT */
322   0,             /* META_COND_ASSERT */
323   SIZEOFFSET,    /* META_COND_DEFINE */
324   1+SIZEOFFSET,  /* META_COND_NAME */
325   1+SIZEOFFSET,  /* META_COND_NUMBER */
326   1+SIZEOFFSET,  /* META_COND_RNAME */
327   1+SIZEOFFSET,  /* META_COND_RNUMBER */
328   3,             /* META_COND_VERSION */
329   0,             /* META_DOLLAR */
330   0,             /* META_DOT */
331   0,             /* META_ESCAPE - more for ESC_P, ESC_p, ESC_g, ESC_k */
332   0,             /* META_KET */
333   0,             /* META_NOCAPTURE */
334   1,             /* META_OPTIONS */
335   1,             /* META_POSIX */
336   1,             /* META_POSIX_NEG */
337   0,             /* META_RANGE_ESCAPED */
338   0,             /* META_RANGE_LITERAL */
339   SIZEOFFSET,    /* META_RECURSE */
340   1+SIZEOFFSET,  /* META_RECURSE_BYNAME */
341   0,             /* META_SCRIPT_RUN */
342   0,             /* META_LOOKAHEAD */
343   0,             /* META_LOOKAHEADNOT */
344   SIZEOFFSET,    /* META_LOOKBEHIND */
345   SIZEOFFSET,    /* META_LOOKBEHINDNOT */
346   0,             /* META_LOOKAHEAD_NA */
347   SIZEOFFSET,    /* META_LOOKBEHIND_NA */
348   1,             /* META_MARK - plus the string length */
349   0,             /* META_ACCEPT */
350   0,             /* META_FAIL */
351   0,             /* META_COMMIT */
352   1,             /* META_COMMIT_ARG - plus the string length */
353   0,             /* META_PRUNE */
354   1,             /* META_PRUNE_ARG - plus the string length */
355   0,             /* META_SKIP */
356   1,             /* META_SKIP_ARG - plus the string length */
357   0,             /* META_THEN */
358   1,             /* META_THEN_ARG - plus the string length */
359   0,             /* META_ASTERISK */
360   0,             /* META_ASTERISK_PLUS */
361   0,             /* META_ASTERISK_QUERY */
362   0,             /* META_PLUS */
363   0,             /* META_PLUS_PLUS */
364   0,             /* META_PLUS_QUERY */
365   0,             /* META_QUERY */
366   0,             /* META_QUERY_PLUS */
367   0,             /* META_QUERY_QUERY */
368   2,             /* META_MINMAX */
369   2,             /* META_MINMAX_PLUS */
370   2              /* META_MINMAX_QUERY */
371 };
372 
373 /* Types for skipping parts of a parsed pattern. */
374 
375 enum { PSKIP_ALT, PSKIP_CLASS, PSKIP_KET };
376 
377 /* Macro for setting individual bits in class bitmaps. It took some
378 experimenting to figure out how to stop gcc 5.3.0 from warning with
379 -Wconversion. This version gets a warning:
380 
381   #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1u << ((b)&7))
382 
383 Let's hope the apparently less efficient version isn't actually so bad if the
384 compiler is clever with identical subexpressions. */
385 
/* Sets bit b in the byte bitmap a. The bitmap argument is parenthesized so
that a pointer expression such as "map + n" can be passed. Both arguments are
evaluated more than once, so they must not have side effects. */

#define SETBIT(a,b) (a)[(b)/8] = (uint8_t)((a)[(b)/8] | (1u << ((b)&7)))
387 
388 /* Values and flags for the unsigned xxcuflags variables that accompany xxcu
389 variables, which are concerned with first and required code units. A value
390 greater than or equal to REQ_NONE means "no code unit set"; otherwise the
391 matching xxcu variable is set, and the low valued bits are relevant. */
392 
393 #define REQ_UNSET     0xffffffffu  /* Not yet found anything */
394 #define REQ_NONE      0xfffffffeu  /* Found not fixed character */
395 #define REQ_CASELESS  0x00000001u  /* Code unit in xxcu is caseless */
396 #define REQ_VARY      0x00000002u  /* Code unit is followed by non-literal */
397 
398 /* These flags are used in the groupinfo vector. */
399 
400 #define GI_SET_FIXED_LENGTH    0x80000000u
401 #define GI_NOT_FIXED_LENGTH    0x40000000u
402 #define GI_FIXED_LENGTH_MASK   0x0000ffffu
403 
404 /* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
405 and is fast (a good compiler can turn it into a subtraction and unsigned
406 comparison). */
407 
408 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
409 
410 /* Table to identify hex digits. The tables in chartables are dependent on the
411 locale, and may mark arbitrary characters as digits. We want to recognize only
412 0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
413 costs 256 bytes, but it is a lot faster than doing character value tests (at
414 least in some simple cases I timed), and in some applications one wants PCRE2
415 to compile efficiently as well as match efficiently. The value in the table is
416 the binary hex digit value, or 0xff for non-hex digits. */
417 
418 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
419 UTF-8 mode. */
420 
421 #ifndef EBCDIC
/* One row per 16 character codes; the label is the code of the first entry in
the row. Only '0'-'9' (0x30-0x39), 'A'-'F' (0x41-0x46) and 'a'-'f' (0x61-0x66)
carry a digit value; every other entry is 0xff. */

static const uint8_t xdigitab[] =
  {
  /* 0x00 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x10 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x20 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x30 */ 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x40 */ 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x50 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x60 */ 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x70 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x80 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x90 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xa0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xb0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xc0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xd0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xe0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xf0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
  };
456 
457 #else
458 
459 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
460 
/* One row per 16 character codes; the label is the code of the first entry in
the row. In EBCDIC 'a'-'f' are 0x81-0x86, 'A'-'F' are 0xc1-0xc6 and '0'-'9'
are 0xf0-0xf9; every other entry is 0xff. */

static const uint8_t xdigitab[] =
  {
  /* 0x00 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x10 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x20 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x30 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x40 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x50 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x60 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x70 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x80 */ 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x90 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xa0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xb0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xc0 */ 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xd0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xe0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xf0 */ 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff
  };
495 #endif  /* EBCDIC */
496 
497 
498 /* Table for handling alphanumeric escaped characters. Positive returns are
499 simple data values; negative values are for special things like \d and so on.
500 Zero means further processing is needed (for things like \x), or the escape is
501 invalid. */
502 
503 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
504 in UTF-8 mode. It runs from '0' to 'z'. */
505 
506 #ifndef EBCDIC
/* ESCAPES_FIRST and ESCAPES_LAST are the first and last characters covered by
the escapes table below. UPPER_CASE converts a lower case letter to upper
case; its parameter is parenthesized so that an expression argument is
converted as a whole. */

#define ESCAPES_FIRST       CHAR_0
#define ESCAPES_LAST        CHAR_z
#define UPPER_CASE(c)       ((c)-32)
510 
511 static const short int escapes[] = {
512      0,                       0,
513      0,                       0,
514      0,                       0,
515      0,                       0,
516      0,                       0,
517      CHAR_COLON,              CHAR_SEMICOLON,
518      CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
519      CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
520      CHAR_COMMERCIAL_AT,      -ESC_A,
521      -ESC_B,                  -ESC_C,
522      -ESC_D,                  -ESC_E,
523      0,                       -ESC_G,
524      -ESC_H,                  0,
525      0,                       -ESC_K,
526      0,                       0,
527      -ESC_N,                  0,
528      -ESC_P,                  -ESC_Q,
529      -ESC_R,                  -ESC_S,
530      0,                       0,
531      -ESC_V,                  -ESC_W,
532      -ESC_X,                  0,
533      -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
534      CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
535      CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
536      CHAR_GRAVE_ACCENT,       CHAR_BEL,
537      -ESC_b,                  0,
538      -ESC_d,                  CHAR_ESC,
539      CHAR_FF,                 0,
540      -ESC_h,                  0,
541      0,                       -ESC_k,
542      0,                       0,
543      CHAR_LF,                 0,
544      -ESC_p,                  0,
545      CHAR_CR,                 -ESC_s,
546      CHAR_HT,                 0,
547      -ESC_v,                  -ESC_w,
548      0,                       0,
549      -ESC_z
550 };
551 
552 #else
553 
554 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
555 It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code
556 is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a
557 because it is defined as 'a', which of course picks up the ASCII value. */
558 
/* The UPPER_CASE parameter is parenthesized in both branches so that an
expression argument is converted as a whole. */

#if 'a' == 0x81                    /* Check for a real EBCDIC environment */
#define ESCAPES_FIRST       CHAR_a
#define ESCAPES_LAST        CHAR_9
#define UPPER_CASE(c)       ((c)+64)
#else                              /* Testing in an ASCII environment */
#define ESCAPES_FIRST  ((unsigned char)'\x81')   /* EBCDIC 'a' */
#define ESCAPES_LAST   ((unsigned char)'\xf9')   /* EBCDIC '9' */
#define UPPER_CASE(c)  ((c)-32)
#endif
568 
569 static const short int escapes[] = {
570 /*  80 */         CHAR_BEL, -ESC_b,       0, -ESC_d, CHAR_ESC, CHAR_FF,      0,
571 /*  88 */ -ESC_h,        0,      0,     '{',      0,        0,       0,      0,
572 /*  90 */      0,        0, -ESC_k,       0,      0,  CHAR_LF,       0, -ESC_p,
573 /*  98 */      0,  CHAR_CR,      0,     '}',      0,        0,       0,      0,
574 /*  A0 */      0,      '~', -ESC_s, CHAR_HT,      0,   -ESC_v,  -ESC_w,      0,
575 /*  A8 */      0,   -ESC_z,      0,       0,      0,      '[',       0,      0,
576 /*  B0 */      0,        0,      0,       0,      0,        0,       0,      0,
577 /*  B8 */      0,        0,      0,       0,      0,      ']',     '=',    '-',
578 /*  C0 */    '{',   -ESC_A, -ESC_B,  -ESC_C, -ESC_D,   -ESC_E,       0, -ESC_G,
579 /*  C8 */ -ESC_H,        0,      0,       0,      0,        0,       0,      0,
580 /*  D0 */    '}',        0, -ESC_K,       0,      0,   -ESC_N,       0, -ESC_P,
581 /*  D8 */ -ESC_Q,   -ESC_R,      0,       0,      0,        0,       0,      0,
582 /*  E0 */   '\\',        0, -ESC_S,       0,      0,   -ESC_V,  -ESC_W, -ESC_X,
583 /*  E8 */      0,   -ESC_Z,      0,       0,      0,        0,       0,      0,
584 /*  F0 */      0,        0,      0,       0,      0,        0,       0,      0,
585 /*  F8 */      0,        0
586 };
587 
588 /* We also need a table of characters that may follow \c in an EBCDIC
589 environment for characters 0-31. */
590 
/* Fixed 32-character list ('@', 'A'-'Z', '[', '\\', ']', '^', '_'); presumably
the position of a character in the string is the control code it maps to.
Const-qualified because the contents never need to change. */

static const unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
592 
593 #endif   /* EBCDIC */
594 
595 
596 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
597 searched linearly. Put all the names into a single string, in order to reduce
598 the number of relocations when a shared library is dynamically linked. The
599 string is built from string macros so that it works in UTF-8 mode on EBCDIC
600 platforms. */
601 
602 typedef struct verbitem {
603   unsigned int len;          /* Length of verb name */
604   uint32_t meta;             /* Base META_ code */
605   int has_arg;               /* Argument requirement */
606 } verbitem;
607 
608 static const char verbnames[] =
609   "\0"                       /* Empty name is a shorthand for MARK */
610   STRING_MARK0
611   STRING_ACCEPT0
612   STRING_F0
613   STRING_FAIL0
614   STRING_COMMIT0
615   STRING_PRUNE0
616   STRING_SKIP0
617   STRING_THEN;
618 
619 static const verbitem verbs[] = {
620   { 0, META_MARK,   +1 },  /* > 0 => must have an argument */
621   { 4, META_MARK,   +1 },
622   { 6, META_ACCEPT, -1 },  /* < 0 => Optional argument, convert to pre-MARK */
623   { 1, META_FAIL,   -1 },
624   { 4, META_FAIL,   -1 },
625   { 6, META_COMMIT,  0 },
626   { 5, META_PRUNE,   0 },  /* Optional argument; bump META code if found */
627   { 4, META_SKIP,    0 },
628   { 4, META_THEN,    0 }
629 };
630 
631 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
632 
633 /* Verb opcodes, indexed by their META code offset from META_MARK. */
634 
635 static const uint32_t verbops[] = {
636   OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,
637   OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };
638 
639 /* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. */
640 
641 typedef struct alasitem {
642   unsigned int len;          /* Length of name */
643   uint32_t meta;             /* Base META_ code */
644 } alasitem;
645 
646 static const char alasnames[] =
647   STRING_pla0
648   STRING_plb0
649   STRING_napla0
650   STRING_naplb0
651   STRING_nla0
652   STRING_nlb0
653   STRING_positive_lookahead0
654   STRING_positive_lookbehind0
655   STRING_non_atomic_positive_lookahead0
656   STRING_non_atomic_positive_lookbehind0
657   STRING_negative_lookahead0
658   STRING_negative_lookbehind0
659   STRING_atomic0
660   STRING_sr0
661   STRING_asr0
662   STRING_script_run0
663   STRING_atomic_script_run;
664 
665 static const alasitem alasmeta[] = {
666   {  3, META_LOOKAHEAD         },
667   {  3, META_LOOKBEHIND        },
668   {  5, META_LOOKAHEAD_NA      },
669   {  5, META_LOOKBEHIND_NA     },
670   {  3, META_LOOKAHEADNOT      },
671   {  3, META_LOOKBEHINDNOT     },
672   { 18, META_LOOKAHEAD         },
673   { 19, META_LOOKBEHIND        },
674   { 29, META_LOOKAHEAD_NA      },
675   { 30, META_LOOKBEHIND_NA     },
676   { 18, META_LOOKAHEADNOT      },
677   { 19, META_LOOKBEHINDNOT     },
678   {  6, META_ATOMIC            },
679   {  2, META_SCRIPT_RUN        }, /* sr = script run */
680   {  3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */
681   { 10, META_SCRIPT_RUN        }, /* script run */
682   { 17, META_ATOMIC_SCRIPT_RUN }  /* atomic script run */
683 };
684 
685 static const int alascount = sizeof(alasmeta)/sizeof(alasitem);
686 
687 /* Offsets from OP_STAR for case-independent and negative repeat opcodes. */
688 
689 static uint32_t chartypeoffset[] = {
690   OP_STAR - OP_STAR,    OP_STARI - OP_STAR,
691   OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };
692 
693 /* Tables of names of POSIX character classes and their lengths. The names are
694 now all in a single string, to reduce the number of relocations when a shared
695 library is dynamically loaded. The list of lengths is terminated by a zero
696 length entry. The first three must be alpha, lower, upper, as this is assumed
697 for handling case independence. The indices for graph, print, and punct are
698 needed, so identify them. */
699 
/* POSIX class names in one string. The position of a name in this list is the
class number used to index posix_name_lengths and the other per-class tables;
alpha, lower, upper must remain the first three. */

static const char posix_names[] =
  STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
  STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
  STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
  STRING_word0  STRING_xdigit;
705 
/* Length of each name in posix_names, in the same order; the trailing zero
marks the end of the list. */

static const uint8_t posix_name_lengths[] = {
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
708 
/* Zero-based positions of graph, print, and punct within posix_names. */

#define PC_GRAPH  8
#define PC_PRINT  9
#define PC_PUNCT 10
712 
713 /* Table of class bit maps for each POSIX class. Each class is formed from a
714 base map, with an optional addition or removal of another map. Then, for some
715 classes, there is some additional tweaking: for [:blank:] the vertical space
716 characters are removed, and for [:alpha:] and [:alnum:] the underscore
717 character is removed. The triples in the table consist of the base map offset,
718 second map offset or -1 if no second map, and a non-negative value for map
719 addition or a negative value for map subtraction (if there are two maps). The
720 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
721 remove vertical space characters, 2 => remove underscore. */
722 
/* Three ints per POSIX class, with the rows in posix_names order:
{ base map offset, second map offset or -1, tweak/sign value }. */

static const int posix_class_maps[] = {
  cbit_word,  cbit_digit, -2,             /* alpha */
  cbit_lower, -1,          0,             /* lower */
  cbit_upper, -1,          0,             /* upper */
  cbit_word,  -1,          2,             /* alnum - word without underscore */
  cbit_print, cbit_cntrl,  0,             /* ascii */
  cbit_space, -1,          1,             /* blank - a GNU extension */
  cbit_cntrl, -1,          0,             /* cntrl */
  cbit_digit, -1,          0,             /* digit */
  cbit_graph, -1,          0,             /* graph */
  cbit_print, -1,          0,             /* print */
  cbit_punct, -1,          0,             /* punct */
  cbit_space, -1,          0,             /* space */
  cbit_word,  -1,          0,             /* word - a Perl extension */
  cbit_xdigit,-1,          0              /* xdigit */
};
739 
740 #ifdef SUPPORT_UNICODE
741 
742 /* The POSIX class Unicode property substitutes that are used in UCP mode must
743 be in the order of the POSIX class names, defined above. */
744 
745 static int posix_substitutes[] = {
746   PT_GC, ucp_L,     /* alpha */
747   PT_PC, ucp_Ll,    /* lower */
748   PT_PC, ucp_Lu,    /* upper */
749   PT_ALNUM, 0,      /* alnum */
750   -1, 0,            /* ascii, treat as non-UCP */
751   -1, 1,            /* blank, treat as \h */
752   PT_PC, ucp_Cc,    /* cntrl */
753   PT_PC, ucp_Nd,    /* digit */
754   PT_PXGRAPH, 0,    /* graph */
755   PT_PXPRINT, 0,    /* print */
756   PT_PXPUNCT, 0,    /* punct */
757   PT_PXSPACE, 0,    /* space */   /* Xps is POSIX space, but from 8.34 */
758   PT_WORD, 0,       /* word  */   /* Perl and POSIX space are the same */
759   -1, 0             /* xdigit, treat as non-UCP */
760 };
761 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / (2*sizeof(uint32_t)))
762 #endif  /* SUPPORT_UNICODE */
763 
764 /* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset
765 are allowed. */
766 
/* Option bits that remain acceptable when PCRE2_LITERAL is set. */

#define PUBLIC_LITERAL_COMPILE_OPTIONS \
  (PCRE2_ANCHORED|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_ENDANCHORED| \
   PCRE2_FIRSTLINE|PCRE2_LITERAL|PCRE2_MATCH_INVALID_UTF| \
   PCRE2_NO_START_OPTIMIZE|PCRE2_NO_UTF_CHECK|PCRE2_USE_OFFSET_LIMIT|PCRE2_UTF)

/* The full option mask: the literal-compatible subset plus all the others. */

#define PUBLIC_COMPILE_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_OPTIONS| \
   PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \
   PCRE2_ALT_VERBNAMES|PCRE2_DOLLAR_ENDONLY|PCRE2_DOTALL|PCRE2_DUPNAMES| \
   PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MATCH_UNSET_BACKREF| \
   PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \
   PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \
   PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_UCP|PCRE2_UNGREEDY)

/* The same split for the PCRE2_EXTRA_ option bits: first the subset that is
compatible with PCRE2_LITERAL, then the full mask. */

#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
   (PCRE2_EXTRA_MATCH_LINE|PCRE2_EXTRA_MATCH_WORD)

#define PUBLIC_COMPILE_EXTRA_OPTIONS \
   (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \
    PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \
    PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX| \
    PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK)
789 
790 /* Compile time error code numbers. They are given names so that they can more
791 easily be tracked. When a new number is added, the tables called eint1 and
792 eint2 in pcre2posix.c may need to be updated, and a new error text must be
793 added to compile_error_texts in pcre2_error.c. Also, the error codes in
794 pcre2.h.in must be updated - their values are exactly 100 greater than these
795 values. */
796 
/* ERRn has the value COMPILE_ERROR_BASE + n; the names must stay in strict
numerical order with no gaps for that to hold. */

enum { ERR0 = COMPILE_ERROR_BASE,
       ERR1,  ERR2,  ERR3,  ERR4,  ERR5,  ERR6,  ERR7,  ERR8,  ERR9,  ERR10,
       ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
       ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30,
       ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
       ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
       ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
       ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
       ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
       ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90,
       ERR91, ERR92, ERR93, ERR94, ERR95, ERR96, ERR97, ERR98, ERR99 };
808 
809 /* This is a table of start-of-pattern options such as (*UTF) and settings such
810 as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
811 compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
812 generic and always supported. */
813 
/* Kinds of start-of-pattern item; stored in the type field of a pso entry to
say how that entry's value field is to be interpreted. */

enum { PSO_OPT,     /* Value is an option bit */
       PSO_FLG,     /* Value is a flag bit */
       PSO_NL,      /* Value is a newline type */
       PSO_BSR,     /* Value is a \R type */
       PSO_LIMH,    /* Read integer value for heap limit */
       PSO_LIMM,    /* Read integer value for match limit */
       PSO_LIMD };  /* Read integer value for depth limit */
821 
/* Description of one start-of-pattern item. */

typedef struct pso {
  const uint8_t *name;   /* Text of the item as it appears in the pattern */
  uint16_t length;       /* Length of name */
  uint16_t type;         /* One of the PSO_ values */
  uint32_t value;        /* Option/flag bit or newline/\R type; 0 for limits */
} pso;
828 
829 /* NB: STRING_UTFn_RIGHTPAR contains the length as well */
830 
/* Each row is { name, length, type, value }. The first row has only three
visible initializers because the STRING_UTFn_RIGHTPAR macro expands to both the
name and its length. */

static pso pso_list[] = {
  { (uint8_t *)STRING_UTFn_RIGHTPAR,                  PSO_OPT, PCRE2_UTF },
  { (uint8_t *)STRING_UTF_RIGHTPAR,                4, PSO_OPT, PCRE2_UTF },
  { (uint8_t *)STRING_UCP_RIGHTPAR,                4, PSO_OPT, PCRE2_UCP },
  { (uint8_t *)STRING_NOTEMPTY_RIGHTPAR,           9, PSO_FLG, PCRE2_NOTEMPTY_SET },
  { (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR,  17, PSO_FLG, PCRE2_NE_ATST_SET },
  { (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR,   16, PSO_OPT, PCRE2_NO_AUTO_POSSESS },
  { (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR },
  { (uint8_t *)STRING_NO_JIT_RIGHTPAR,             7, PSO_FLG, PCRE2_NOJIT },
  { (uint8_t *)STRING_NO_START_OPT_RIGHTPAR,      13, PSO_OPT, PCRE2_NO_START_OPTIMIZE },
  /* The limit settings end in '=' and are followed by a number in the pattern */
  { (uint8_t *)STRING_LIMIT_HEAP_EQ,              11, PSO_LIMH, 0 },
  { (uint8_t *)STRING_LIMIT_MATCH_EQ,             12, PSO_LIMM, 0 },
  { (uint8_t *)STRING_LIMIT_DEPTH_EQ,             12, PSO_LIMD, 0 },
  { (uint8_t *)STRING_LIMIT_RECURSION_EQ,         16, PSO_LIMD, 0 },
  { (uint8_t *)STRING_CR_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_CR },
  { (uint8_t *)STRING_LF_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_LF },
  { (uint8_t *)STRING_CRLF_RIGHTPAR,               5, PSO_NL,  PCRE2_NEWLINE_CRLF },
  { (uint8_t *)STRING_ANY_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_ANY },
  { (uint8_t *)STRING_NUL_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_NUL },
  { (uint8_t *)STRING_ANYCRLF_RIGHTPAR,            8, PSO_NL,  PCRE2_NEWLINE_ANYCRLF },
  { (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_ANYCRLF },
  { (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_UNICODE }
};
854 
855 /* This table is used when converting repeating opcodes into possessified
856 versions as a result of an explicit possessive quantifier such as ++. A zero
857 value means there is no possessified version - in those cases the item in
858 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
859 because all relevant opcodes are less than that. */
860 
/* The position of each entry is the value of the opcode it describes, so the
layout below must track the opcode numbering exactly. Only the greedy form of
each repeat has a non-zero entry. */

static const uint8_t opcode_possessify[] = {
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */

  0,                       /* NOTI */
  OP_POSSTAR, 0,           /* STAR, MINSTAR */
  OP_POSPLUS, 0,           /* PLUS, MINPLUS */
  OP_POSQUERY, 0,          /* QUERY, MINQUERY */
  OP_POSUPTO, 0,           /* UPTO, MINUPTO */
  0,                       /* EXACT */
  0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */

  OP_POSSTARI, 0,          /* STARI, MINSTARI */
  OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
  OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
  OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
  0,                       /* EXACTI */
  0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */

  OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
  OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
  OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
  OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
  0,                       /* NOTEXACT */
  0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */

  OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
  OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
  OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
  OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
  0,                       /* NOTEXACTI */
  0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */

  OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
  OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
  OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
  OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
  0,                       /* TYPEEXACT */
  0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */

  OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
  OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
  OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
  OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
  0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */

  0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
  0, 0,                    /* REF, REFI */
  0, 0,                    /* DNREF, DNREFI */
  0, 0                     /* RECURSE, CALLOUT */
};
912 
913 
914 #ifdef DEBUG_SHOW_PARSED
915 /*************************************************
916 *     Show the parsed pattern for debugging      *
917 *************************************************/
918 
919 /* For debugging the pre-scan, this code, which outputs the parsed data vector,
920 can be enabled. */
921 
/* Write the parsed pattern vector to stderr, one item per line. Every line
starts with "+++", the index of the item within cb->parsed_pattern, and the raw
value of the item in hex. A value below META_END is a literal and is echoed if
it is a printing ASCII character. Anything else is decoded as a META_ item,
together with the data words that follow it. Output stops after META_END, or at
the first META code that is not recognized, because the number of data words
following an unknown item cannot be determined.

Values of type uint32_t are printed with %u and PCRE2_SIZE offsets (size_t in
pcre2.h) with %zu so that each conversion matches the type of its argument.

Argument:
  cb          the compile block whose parsed_pattern vector is to be shown
*/

static void show_parsed(compile_block *cb)
{
uint32_t *pptr = cb->parsed_pattern;

for (;;)
  {
  int max, min;
  PCRE2_SIZE offset;
  uint32_t i;
  uint32_t length;
  uint32_t meta_arg = META_DATA(*pptr);

  fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr);

  if (*pptr < META_END)
    {
    if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", (int)*pptr);
    pptr++;
    }

  /* From here on pptr has already been advanced past the META item itself, so
  it points at the first data word, if there is one. */

  else switch (META_CODE(*pptr++))
    {
    default:
    fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n");
    return;

    case META_END:
    fprintf(stderr, "META_END\n");
    return;

    case META_CAPTURE:
    fprintf(stderr, "META_CAPTURE %u", meta_arg);
    break;

    case META_RECURSE:
    GETOFFSET(offset, pptr);
    fprintf(stderr, "META_RECURSE %u %zu", meta_arg, offset);
    break;

    /* Only references to groups 10 and above carry their offset in the
    vector; for smaller numbers it is taken from the compile block. */

    case META_BACKREF:
    if (meta_arg < 10)
      offset = cb->small_ref_offset[meta_arg];
    else
      GETOFFSET(offset, pptr);
    fprintf(stderr, "META_BACKREF %u %zu", meta_arg, offset);
    break;

    case META_ESCAPE:
    if (meta_arg == ESC_P || meta_arg == ESC_p)
      {
      uint32_t ptype = *pptr >> 16;
      uint32_t pvalue = *pptr++ & 0xffff;
      fprintf(stderr, "META \\%c %u %u", (meta_arg == ESC_P)? 'P':'p',
        ptype, pvalue);
      }
    else
      {
      uint32_t cc;
      /* There's just one escape we might have here that isn't negated in the
      escapes table. */
      if (meta_arg == ESC_g) cc = CHAR_g;
      else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++)
        {
        if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break;
        }
      if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;
      fprintf(stderr, "META \\%c", (int)cc);
      }
    break;

    case META_MINMAX:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}", min, max);
    else
      fprintf(stderr, "META {%d,}", min);
    break;

    case META_MINMAX_QUERY:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}?", min, max);
    else
      fprintf(stderr, "META {%d,}?", min);
    break;

    case META_MINMAX_PLUS:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}+", min, max);
    else
      fprintf(stderr, "META {%d,}+", min);
    break;

    case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break;
    case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break;
    case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break;
    case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break;
    case META_DOT: fprintf(stderr, "META_DOT"); break;
    case META_ASTERISK: fprintf(stderr, "META *"); break;
    case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break;
    case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break;
    case META_PLUS: fprintf(stderr, "META +"); break;
    case META_PLUS_QUERY: fprintf(stderr, "META +?"); break;
    case META_PLUS_PLUS: fprintf(stderr, "META ++"); break;
    case META_QUERY: fprintf(stderr, "META ?"); break;
    case META_QUERY_QUERY: fprintf(stderr, "META ??"); break;
    case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break;

    case META_ATOMIC: fprintf(stderr, "META (?>"); break;
    case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
    case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
    case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
    case META_LOOKAHEAD_NA: fprintf(stderr, "META (*napla:"); break;
    case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;
    case META_KET: fprintf(stderr, "META )"); break;
    case META_ALT: fprintf(stderr, "META | %u", meta_arg); break;

    case META_CLASS: fprintf(stderr, "META ["); break;
    case META_CLASS_NOT: fprintf(stderr, "META [^"); break;
    case META_CLASS_END: fprintf(stderr, "META ]"); break;
    case META_CLASS_EMPTY: fprintf(stderr, "META []"); break;
    case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break;

    case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;
    case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;

    case META_POSIX: fprintf(stderr, "META_POSIX %u", *pptr++); break;
    case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %u", *pptr++); break;

    case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;
    case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;
    case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break;
    case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break;
    case META_SKIP: fprintf(stderr, "META (*SKIP)"); break;
    case META_THEN: fprintf(stderr, "META (*THEN)"); break;

    case META_OPTIONS: fprintf(stderr, "META_OPTIONS 0x%02x", *pptr++); break;

    case META_LOOKBEHIND:
    fprintf(stderr, "META (?<= %u offset=", meta_arg);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_LOOKBEHIND_NA:
    fprintf(stderr, "META (*naplb: %u offset=", meta_arg);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_LOOKBEHINDNOT:
    fprintf(stderr, "META (?<! %u offset=", meta_arg);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* Three data words: two describing the next pattern item, then the
    callout number. */

    case META_CALLOUT_NUMBER:
    fprintf(stderr, "META (?C%u) next=%u/%u", pptr[2], pptr[0],
       pptr[1]);
    pptr += 3;
    break;

    case META_CALLOUT_STRING:
      {
      uint32_t patoffset = *pptr++;    /* Offset of next pattern item */
      uint32_t patlength = *pptr++;    /* Length of next pattern item */
      fprintf(stderr, "META (?Cstring) length=%u offset=", *pptr++);
      GETOFFSET(offset, pptr);
      fprintf(stderr, "%zu next=%u/%u", offset, patoffset, patlength);
      }
    break;

    case META_RECURSE_BYNAME:
    fprintf(stderr, "META (?(&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_BACKREF_BYNAME:
    fprintf(stderr, "META_BACKREF_BYNAME length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* Here the offset comes first and the group number follows it, so the
    number is read by indexing past the offset before the offset is consumed. */

    case META_COND_NUMBER:
    fprintf(stderr, "META_COND_NUMBER %u offset=", pptr[SIZEOFFSET]);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    pptr++;
    break;

    case META_COND_DEFINE:
    fprintf(stderr, "META (?(DEFINE) offset=");
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* Three data words: a zero/non-zero comparison selector, then the two
    parts of the version number. */

    case META_COND_VERSION:
    fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
    fprintf(stderr, "%u.", *pptr++);
    fprintf(stderr, "%u)", *pptr++);
    break;

    case META_COND_NAME:
    fprintf(stderr, "META (?(<name>) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_COND_RNAME:
    fprintf(stderr, "META (?(R&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* This is kept as a name, because it might be. */

    case META_COND_RNUMBER:
    fprintf(stderr, "META (?(Rnumber) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* The verbs that take an argument all share the code that prints it: a
    length word followed by that many code units. */

    case META_MARK:
    fprintf(stderr, "META (*MARK:");
    goto SHOWARG;

    case META_COMMIT_ARG:
    fprintf(stderr, "META (*COMMIT:");
    goto SHOWARG;

    case META_PRUNE_ARG:
    fprintf(stderr, "META (*PRUNE:");
    goto SHOWARG;

    case META_SKIP_ARG:
    fprintf(stderr, "META (*SKIP:");
    goto SHOWARG;

    case META_THEN_ARG:
    fprintf(stderr, "META (*THEN:");
    SHOWARG:
    length = *pptr++;
    for (i = 0; i < length; i++)
      {
      uint32_t cc = *pptr++;
      if (cc > 32 && cc < 128) fprintf(stderr, "%c", (int)cc);
        else fprintf(stderr, "\\x{%x}", cc);
      }
    fprintf(stderr, ") length=%u", length);
    break;
    }
  fprintf(stderr, "\n");
  }
}
1182 #endif  /* DEBUG_SHOW_PARSED */
1183 
1184 
1185 
1186 /*************************************************
1187 *               Copy compiled code               *
1188 *************************************************/
1189 
1190 /* Compiled JIT code cannot be copied, so the new compiled block has no
1191 associated JIT data. */
1192 
1193 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy(const pcre2_code * code)1194 pcre2_code_copy(const pcre2_code *code)
1195 {
1196 PCRE2_SIZE* ref_count;
1197 pcre2_code *newcode;
1198 
1199 if (code == NULL) return NULL;
1200 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1201 if (newcode == NULL) return NULL;
1202 memcpy(newcode, code, code->blocksize);
1203 newcode->executable_jit = NULL;
1204 
1205 /* If the code is one that has been deserialized, increment the reference count
1206 in the decoded tables. */
1207 
1208 if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1209   {
1210   ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1211   (*ref_count)++;
1212   }
1213 
1214 return newcode;
1215 }
1216 
1217 
1218 
1219 /*************************************************
1220 *     Copy compiled code and character tables    *
1221 *************************************************/
1222 
1223 /* Compiled JIT code cannot be copied, so the new compiled block has no
1224 associated JIT data. This version of code_copy also makes a separate copy of
1225 the character tables. */
1226 
1227 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy_with_tables(const pcre2_code * code)1228 pcre2_code_copy_with_tables(const pcre2_code *code)
1229 {
1230 PCRE2_SIZE* ref_count;
1231 pcre2_code *newcode;
1232 uint8_t *newtables;
1233 
1234 if (code == NULL) return NULL;
1235 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1236 if (newcode == NULL) return NULL;
1237 memcpy(newcode, code, code->blocksize);
1238 newcode->executable_jit = NULL;
1239 
1240 newtables = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE),
1241   code->memctl.memory_data);
1242 if (newtables == NULL)
1243   {
1244   code->memctl.free((void *)newcode, code->memctl.memory_data);
1245   return NULL;
1246   }
1247 memcpy(newtables, code->tables, TABLES_LENGTH);
1248 ref_count = (PCRE2_SIZE *)(newtables + TABLES_LENGTH);
1249 *ref_count = 1;
1250 
1251 newcode->tables = newtables;
1252 newcode->flags |= PCRE2_DEREF_TABLES;
1253 return newcode;
1254 }
1255 
1256 
1257 
1258 /*************************************************
1259 *               Free compiled code               *
1260 *************************************************/
1261 
1262 PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_code_free(pcre2_code * code)1263 pcre2_code_free(pcre2_code *code)
1264 {
1265 PCRE2_SIZE* ref_count;
1266 
1267 if (code != NULL)
1268   {
1269 #ifdef SUPPORT_JIT
1270   if (code->executable_jit != NULL)
1271     PRIV(jit_free)(code->executable_jit, &code->memctl);
1272 #endif
1273 
1274   if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1275     {
1276     /* Decoded tables belong to the codes after deserialization, and they must
1277     be freed when there are no more references to them. The *ref_count should
1278     always be > 0. */
1279 
1280     ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1281     if (*ref_count > 0)
1282       {
1283       (*ref_count)--;
1284       if (*ref_count == 0)
1285         code->memctl.free((void *)code->tables, code->memctl.memory_data);
1286       }
1287     }
1288 
1289   code->memctl.free(code, code->memctl.memory_data);
1290   }
1291 }
1292 
1293 
1294 
1295 /*************************************************
1296 *         Read a number, possibly signed         *
1297 *************************************************/
1298 
1299 /* This function is used to read numbers in the pattern. The initial pointer
1300 must be the sign or first digit of the number. When relative values (introduced
1301 by + or -) are allowed, they are relative group numbers, and the result must be
1302 greater than zero.
1303 
1304 Arguments:
1305   ptrptr      points to the character pointer variable
1306   ptrend      points to the end of the input string
1307   allow_sign  if < 0, sign not allowed; if >= 0, sign is relative to this
1308   max_value   the largest number allowed
1309   max_error   the error to give for an over-large number
1310   intptr      where to put the result
  errorcodeptr where to put an error code
1312 
1313 Returns:      TRUE  - a number was read
1314               FALSE - errorcode == 0 => no number was found
1315                       errorcode != 0 => an error occurred
1316 */
1317 
1318 static BOOL
read_number(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,int32_t allow_sign,uint32_t max_value,uint32_t max_error,int * intptr,int * errorcodeptr)1319 read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,
1320   uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr)
1321 {
1322 int sign = 0;
1323 uint32_t n = 0;
1324 PCRE2_SPTR ptr = *ptrptr;
1325 BOOL yield = FALSE;
1326 
1327 *errorcodeptr = 0;
1328 
1329 if (allow_sign >= 0 && ptr < ptrend)
1330   {
1331   if (*ptr == CHAR_PLUS)
1332     {
1333     sign = +1;
1334     max_value -= allow_sign;
1335     ptr++;
1336     }
1337   else if (*ptr == CHAR_MINUS)
1338     {
1339     sign = -1;
1340     ptr++;
1341     }
1342   }
1343 
1344 if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE;
1345 while (ptr < ptrend && IS_DIGIT(*ptr))
1346   {
1347   n = n * 10 + *ptr++ - CHAR_0;
1348   if (n > max_value)
1349     {
1350     *errorcodeptr = max_error;
1351     goto EXIT;
1352     }
1353   }
1354 
1355 if (allow_sign >= 0 && sign != 0)
1356   {
1357   if (n == 0)
1358     {
1359     *errorcodeptr = ERR26;  /* +0 and -0 are not allowed */
1360     goto EXIT;
1361     }
1362 
1363   if (sign > 0) n += allow_sign;
1364   else if ((int)n > allow_sign)
1365     {
1366     *errorcodeptr = ERR15;  /* Non-existent subpattern */
1367     goto EXIT;
1368     }
1369   else n = allow_sign + 1 - n;
1370   }
1371 
1372 yield = TRUE;
1373 
1374 EXIT:
1375 *intptr = n;
1376 *ptrptr = ptr;
1377 return yield;
1378 }
1379 
1380 
1381 
1382 /*************************************************
1383 *         Read repeat counts                     *
1384 *************************************************/
1385 
1386 /* Read an item of the form {n,m} and return the values if non-NULL pointers
1387 are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a
1388 larger value is used for "unlimited". We have to use signed arguments for
1389 read_number() because it is capable of returning a signed value.
1390 
1391 Arguments:
  ptrptr         points to pointer to character after '{'
  ptrend         pointer to end of input
  minp           if not NULL, pointer to uint32_t for min
  maxp           if not NULL, pointer to uint32_t for max; this is set to
                 REPEAT_UNLIMITED if no max is given
1397   errorcodeptr   points to error code variable
1398 
1399 Returns:         FALSE if not a repeat quantifier, errorcode set zero
1400                  FALSE on error, with errorcode set non-zero
1401                  TRUE on success, with pointer updated to point after '}'
1402 */
1403 
1404 static BOOL
read_repeat_counts(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,uint32_t * minp,uint32_t * maxp,int * errorcodeptr)1405 read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp,
1406   uint32_t *maxp, int *errorcodeptr)
1407 {
1408 PCRE2_SPTR p;
1409 BOOL yield = FALSE;
1410 BOOL had_comma = FALSE;
1411 int32_t min = 0;
1412 int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */
1413 
1414 /* Check the syntax */
1415 
1416 *errorcodeptr = 0;
1417 for (p = *ptrptr;; p++)
1418   {
1419   uint32_t c;
1420   if (p >= ptrend) return FALSE;
1421   c = *p;
1422   if (IS_DIGIT(c)) continue;
1423   if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1424   if (c == CHAR_COMMA)
1425     {
1426     if (had_comma) return FALSE;
1427     had_comma = TRUE;
1428     }
1429   else return FALSE;
1430   }
1431 
1432 /* The only error from read_number() is for a number that is too big. */
1433 
1434 p = *ptrptr;
1435 if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))
1436   goto EXIT;
1437 
1438 if (*p == CHAR_RIGHT_CURLY_BRACKET)
1439   {
1440   p++;
1441   max = min;
1442   }
1443 else
1444   {
1445   if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1446     {
1447     if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max,
1448         errorcodeptr))
1449       goto EXIT;
1450     if (max < min)
1451       {
1452       *errorcodeptr = ERR4;
1453       goto EXIT;
1454       }
1455     }
1456   p++;
1457   }
1458 
1459 yield = TRUE;
1460 if (minp != NULL) *minp = (uint32_t)min;
1461 if (maxp != NULL) *maxp = (uint32_t)max;
1462 
1463 /* Update the pattern pointer */
1464 
1465 EXIT:
1466 *ptrptr = p;
1467 return yield;
1468 }
1469 
1470 
1471 
1472 /*************************************************
1473 *            Handle escapes                      *
1474 *************************************************/
1475 
1476 /* This function is called when a \ has been encountered. It either returns a
1477 positive value for a simple escape such as \d, or 0 for a data character, which
1478 is placed in chptr. A backreference to group n is returned as negative n. On
1479 entry, ptr is pointing at the character after \. On exit, it points after the
1480 final code unit of the escape sequence.
1481
1482 This function is also called from pcre2_substitute() to handle escape sequences
1483 in replacement strings. In this case, the cb argument is NULL, and in the case
1484 of escapes that have further processing, only sequences that define a data
1485 character are recognised. The isclass argument is not relevant; the options
1486 argument is the final value of the compiled pattern's options.
1487
1488 Arguments:
1489   ptrptr         points to the input position pointer
1490   ptrend         points to the end of the input
1491   chptr          points to a returned data character
1492   errorcodeptr   points to the errorcode variable (containing zero)
1493   options        the current options bits
  extra_options  the current extra options bits; this function tests
                 PCRE2_EXTRA_ESCAPED_CR_IS_LF, PCRE2_EXTRA_ALT_BSUX and
                 PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
1494   isclass        TRUE if inside a character class
1495   cb             compile data block or NULL when called from pcre2_substitute()
1496
1497 Returns:         zero => a data character
1498                  positive => a special escape sequence
1499                  negative => a numerical back reference
1500                  on error, errorcodeptr is set non-zero
1501 */
1502
1503 int
PRIV(check_escape)1504 PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
1505   int *errorcodeptr, uint32_t options, uint32_t extra_options, BOOL isclass,
1506   compile_block *cb)
1507 {
1508 BOOL utf = (options & PCRE2_UTF) != 0;
1509 PCRE2_SPTR ptr = *ptrptr;
1510 uint32_t c, cc;
1511 int escape = 0;
/* i receives the escapes[] table entry in the lookup below. The final "else"
branch is entered only when that entry is zero, and the \0 octal code there
relies on this by reusing i as its digit counter without resetting it. */
1512 int i;
1513
1514 /* If backslash is at the end of the string, it's an error. */
1515
1516 if (ptr >= ptrend)
1517   {
1518   *errorcodeptr = ERR1;
1519   return 0;
1520   }
1521
1522 GETCHARINCTEST(c, ptr);         /* Get character value, increment pointer */
1523 *errorcodeptr = 0;              /* Be optimistic */
1524
1525 /* Non-alphanumerics are literals, so we just leave the value in c. An initial
1526 value test saves a memory lookup for code points outside the alphanumeric
1527 range. */
1528
1529 if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {}  /* Definitely literal */
1530
1531 /* Otherwise, do a table lookup. Non-zero values need little processing here. A
1532 positive value is a literal value for something like \n. A negative value is
1533 the negation of one of the ESC_ macros that is passed back for handling by the
1534 calling function. Some extra checking is needed for \N because only \N{U+dddd}
1535 is supported. If the value is zero, further processing is handled below. */
1536
1537 else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
1538   {
1539   if (i > 0)
1540     {
1541     c = (uint32_t)i;
1542     if (c == CHAR_CR && (extra_options & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
1543       c = CHAR_LF;
1544     }
1545   else  /* Negative table entry */
1546     {
1547     escape = -i;                    /* Else return a special escape */
1548     if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))
1549       cb->external_flags |= PCRE2_HASBKPORX;   /* Note \P, \p, or \X */
1550
1551     /* Perl supports \N{name} for character names and \N{U+dddd} for numerical
1552     Unicode code points, as well as plain \N for "not newline". PCRE does not
1553     support \N{name}. However, it does support quantification such as \N{2,3},
1554     so if \N{ is not followed by U+dddd we check for a quantifier. */
1555
1556     if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1557       {
1558       PCRE2_SPTR p = ptr + 1;
1559
1560       /* \N{U+ can be handled by the \x{ code. However, this construction is
1561       not valid in EBCDIC environments because it specifies a Unicode
1562       character, not a codepoint in the local code. For example \N{U+0041}
1563       must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode
1564       casing semantics for the entire pattern, so allow it only in UTF (i.e.
1565       Unicode) mode. */
1566
1567       if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)
1568         {
1569 #ifdef EBCDIC
1570         *errorcodeptr = ERR93;
1571 #else
1572         if (utf)
1573           {
          /* Leave ptr on the '+'; the ++ptr at COME_FROM_NU then steps past
          it to the first hex digit, just as it steps past '{' for \x{. */
1574           ptr = p + 1;
1575           escape = 0;   /* Not a fancy escape after all */
1576           goto COME_FROM_NU;
1577           }
1578         else *errorcodeptr = ERR93;
1579 #endif
1580         }
1581
1582       /* Give an error if what follows is not a quantifier, but don't override
1583       an error set by the quantifier reader (e.g. number overflow). */
1584
1585       else
1586         {
1587         if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&
1588              *errorcodeptr == 0)
1589           *errorcodeptr = ERR37;
1590         }
1591       }
1592     }
1593   }
1594
1595 /* Escapes that need further processing, including those that are unknown, have
1596 a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
1597 \o, and \x are recognized (\u and \U can never appear as they are used for case
1598 forcing). */
1599
1600 else
1601   {
1602   int s;
1603   PCRE2_SPTR oldptr;
1604   BOOL overflow;
1605   BOOL alt_bsux =
1606     ((options & PCRE2_ALT_BSUX) | (extra_options & PCRE2_EXTRA_ALT_BSUX)) != 0;
1607
1608   /* Filter calls from pcre2_substitute(). */
  /* Because only \c, \o, and \x get past this filter when cb is NULL, the
  cb->bracount references in the \g and digit cases below are never reached
  with a NULL cb. */
1609
1610   if (cb == NULL)
1611     {
1612     if (c != CHAR_c && c != CHAR_o && c != CHAR_x)
1613       {
1614       *errorcodeptr = ERR3;
1615       return 0;
1616       }
1617     alt_bsux = FALSE;   /* Do not modify \x handling */
1618     }
1619
1620   switch (c)
1621     {
1622     /* A number of Perl escapes are not handled by PCRE. We give an explicit
1623     error. */
1624
1625     case CHAR_F:
1626     case CHAR_l:
1627     case CHAR_L:
1628     *errorcodeptr = ERR37;
1629     break;
1630
1631     /* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX
1632     is set. Otherwise, \u must be followed by exactly four hex digits or, if
1633     PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.
1634     Otherwise it is a lowercase u letter. This gives some compatibility with
1635     ECMAScript (aka JavaScript). */
1636
1637     case CHAR_u:
1638     if (!alt_bsux) *errorcodeptr = ERR37; else
1639       {
1640       uint32_t xc;
1641
1642       if (ptr >= ptrend) break;
1643       if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
1644           (extra_options & PCRE2_EXTRA_ALT_BSUX) != 0)
1645         {
1646         PCRE2_SPTR hptr = ptr + 1;
1647         cc = 0;
1648
1649         while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
1650           {
1651           if ((cc & 0xf0000000) != 0)  /* Test for 32-bit overflow */
1652             {
1653             *errorcodeptr = ERR77;
1654             ptr = hptr;   /* Show where */
1655             break;        /* *hptr != } will cause another break below */
1656             }
1657           cc = (cc << 4) | xc;
1658           hptr++;
1659           }
1660
1661         if (hptr == ptr + 1 ||   /* No hex digits */
1662             hptr >= ptrend ||    /* Hit end of input */
1663             *hptr != CHAR_RIGHT_CURLY_BRACKET)  /* No } terminator */
1664           break;         /* Hex escape not recognized */
1665
1666         c = cc;          /* Accept the code point */
1667         ptr = hptr + 1;
1668         }
1669
1670       else  /* Must be exactly 4 hex digits */
1671         {
1672         if (ptrend - ptr < 4) break;               /* Less than 4 chars */
1673         if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
1674         if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
1675         cc = (cc << 4) | xc;
1676         if ((xc = XDIGIT(ptr[2])) == 0xff) break;  /* Not a hex digit */
1677         cc = (cc << 4) | xc;
1678         if ((xc = XDIGIT(ptr[3])) == 0xff) break;  /* Not a hex digit */
1679         c = (cc << 4) | xc;
1680         ptr += 4;
1681         }
1682
1683       if (utf)
1684         {
1685         if (c > 0x10ffffU) *errorcodeptr = ERR77;
1686         else
1687           if (c >= 0xd800 && c <= 0xdfff &&
1688               (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1689                 *errorcodeptr = ERR73;
1690         }
1691       else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
1692       }
1693     break;
1694
1695     /* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set,
1696     in which case it is an upper case letter. */
1697
1698     case CHAR_U:
1699     if (!alt_bsux) *errorcodeptr = ERR37;
1700     break;
1701
1702     /* In a character class, \g is just a literal "g". Outside a character
1703     class, \g must be followed by one of a number of specific things:
1704
1705     (1) A number, either plain or braced. If positive, it is an absolute
1706     backreference. If negative, it is a relative backreference. This is a Perl
1707     5.10 feature.
1708
1709     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1710     is part of Perl's movement towards a unified syntax for back references. As
1711     this is synonymous with \k{name}, we fudge it up by pretending it really
1712     was \k{name}.
1713
1714     (3) For Oniguruma compatibility we also support \g followed by a name or a
1715     number either in angle brackets or in single quotes. However, these are
1716     (possibly recursive) subroutine calls, _not_ backreferences. We return
1717     the ESC_g code.
1718
1719     Summary: Return a negative number for a numerical back reference, ESC_k for
1720     a named back reference, and ESC_g for a named or numbered subroutine call.
1721     */
1722
1723     case CHAR_g:
1724     if (isclass) break;
1725
1726     if (ptr >= ptrend)
1727       {
1728       *errorcodeptr = ERR57;
1729       break;
1730       }
1731
1732     if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE)
1733       {
1734       escape = ESC_g;
1735       break;
1736       }
1737
1738     /* If there is a brace delimiter, try to read a numerical reference. If
1739     there isn't one, assume we have a name and treat it as \k. */
1740
1741     if (*ptr == CHAR_LEFT_CURLY_BRACKET)
1742       {
1743       PCRE2_SPTR p = ptr + 1;
1744       if (!read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1745           errorcodeptr))
1746         {
1747         if (*errorcodeptr == 0) escape = ESC_k;  /* No number found */
1748         break;
1749         }
1750       if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1751         {
1752         *errorcodeptr = ERR57;
1753         break;
1754         }
1755       ptr = p + 1;
1756       }
1757
1758     /* Read an undelimited number */
1759
1760     else
1761       {
1762       if (!read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1763           errorcodeptr))
1764         {
1765         if (*errorcodeptr == 0) *errorcodeptr = ERR57;  /* No number found */
1766         break;
1767         }
1768       }
1769
1770     if (s <= 0)
1771       {
1772       *errorcodeptr = ERR15;
1773       break;
1774       }
1775
1776     escape = -s;
1777     break;
1778
1779     /* The handling of escape sequences consisting of a string of digits
1780     starting with one that is not zero is not straightforward. Perl has changed
1781     over the years. Nowadays \g{} for backreferences and \o{} for octal are
1782     recommended to avoid the ambiguities in the old syntax.
1783
1784     Outside a character class, the digits are read as a decimal number. If the
1785     number is less than 10, or if there are that many previous extracting left
1786     brackets, it is a back reference. Otherwise, up to three octal digits are
1787     read to form an escaped character code. Thus \123 is likely to be octal 123
1788     (cf \0123, which is octal 012 followed by the literal 3).
1789
1790     Inside a character class, \ followed by a digit is always either a literal
1791     8 or 9 or an octal number. */
1792
1793     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1794     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1795
1796     if (!isclass)
1797       {
1798       oldptr = ptr;
1799       ptr--;   /* Back to the digit */
1800
1801       /* As we know we are at a digit, the only possible error from
1802       read_number() is a number that is too large to be a group number. In this
1803       case we fall through and handle this as not a group reference. If we have
1804       read a small enough number, check for a back reference.
1805
1806       \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
1807       are octal escapes if there are not that many previous captures. */
1808
1809       if (read_number(&ptr, ptrend, -1, INT_MAX/10 - 1, 0, &s, errorcodeptr) &&
1810           (s < 10 || oldptr[-1] >= CHAR_8 || s <= (int)cb->bracount))
1811         {
1812         if (s > (int)MAX_GROUP_NUMBER) *errorcodeptr = ERR61;
1813           else escape = -s;     /* Indicates a back reference */
1814         break;
1815         }
1816
1817       ptr = oldptr;      /* Put the pointer back and fall through */
1818       }
1819
1820     /* Handle a digit following \ when the number is not a back reference, or
1821     we are within a character class. If the first digit is 8 or 9, Perl used to
1822     generate a binary zero and then treat the digit as a following literal. At
1823     least by Perl 5.18 this changed so as not to insert the binary zero. */
1824
1825     if (c >= CHAR_8) break;
1826
1827     /* Fall through */
1828
1829     /* \0 always starts an octal number, but we may drop through to here with a
1830     larger first octal digit. The original code used just to take the least
1831     significant 8 bits of octal numbers (I think this is what early Perls used
1832     to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1833     but no more than 3 octal digits. */
1834
1835     case CHAR_0:
1836     c -= CHAR_0;
    /* i is still zero here (this branch is taken only for a zero table
    entry, and nothing on the way to this case changes it), so the loop
    consumes at most two more octal digits after the first. */
1837     while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1838         c = c * 8 + *ptr++ - CHAR_0;
1839 #if PCRE2_CODE_UNIT_WIDTH == 8
1840     if (!utf && c > 0xff) *errorcodeptr = ERR51;
1841 #endif
1842     break;
1843
1844     /* \o is a relatively new Perl feature, supporting a more general way of
1845     specifying character codes in octal. The only supported form is \o{ddd}. */
1846
1847     case CHAR_o:
1848     if (ptr >= ptrend || *ptr++ != CHAR_LEFT_CURLY_BRACKET)
1849       {
1850       ptr--;
1851       *errorcodeptr = ERR55;
1852       }
1853     else if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1854       *errorcodeptr = ERR78;
1855     else
1856       {
1857       c = 0;
1858       overflow = FALSE;
1859       while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1860         {
1861         cc = *ptr++;
1862         if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1863 #if PCRE2_CODE_UNIT_WIDTH == 32
1864         if (c >= 0x20000000l) { overflow = TRUE; break; }
1865 #endif
1866         c = (c << 3) + (cc - CHAR_0);
1867 #if PCRE2_CODE_UNIT_WIDTH == 8
1868         if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1869 #elif PCRE2_CODE_UNIT_WIDTH == 16
1870         if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1871 #elif PCRE2_CODE_UNIT_WIDTH == 32
1872         if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1873 #endif
1874         }
1875       if (overflow)
1876         {
        /* Skip the remaining octal digits before reporting the error. */
1877         while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1878         *errorcodeptr = ERR34;
1879         }
1880       else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1881         {
1882         if (utf && c >= 0xd800 && c <= 0xdfff &&
1883             (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1884           {
1885           ptr--;
1886           *errorcodeptr = ERR73;
1887           }
1888         }
1889       else
1890         {
1891         ptr--;
1892         *errorcodeptr = ERR64;
1893         }
1894       }
1895     break;
1896
1897     /* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed
1898     by two hexadecimal digits. Otherwise it is a lowercase x letter. */
1899
1900     case CHAR_x:
1901     if (alt_bsux)
1902       {
1903       uint32_t xc;
1904       if (ptrend - ptr < 2) break;               /* Less than 2 characters */
1905       if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
1906       if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
1907       c = (cc << 4) | xc;
1908       ptr += 2;
1909       }
1910
1911     /* Handle \x in Perl's style. \x{ddd} is a character code which can be
1912     greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
1913     digits. If not, { used to be treated as a data character. However, Perl
1914     seems to read hex digits up to the first non-such, and ignore the rest, so
1915     that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1916     now gives an error. */
1917
1918     else
1919       {
1920       if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1921         {
        /* The \N{U+ code above jumps to this label with ptr on the '+'. */
1922 #ifndef EBCDIC
1923         COME_FROM_NU:
1924 #endif
1925         if (++ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1926           {
1927           *errorcodeptr = ERR78;
1928           break;
1929           }
1930         c = 0;
1931         overflow = FALSE;
1932
1933         while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff)
1934           {
1935           ptr++;
1936           if (c == 0 && cc == 0) continue;   /* Leading zeroes */
1937 #if PCRE2_CODE_UNIT_WIDTH == 32
1938           if (c >= 0x10000000l) { overflow = TRUE; break; }
1939 #endif
1940           c = (c << 4) | cc;
1941           if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
1942             {
1943             overflow = TRUE;
1944             break;
1945             }
1946           }
1947
1948         if (overflow)
1949           {
          /* Skip the remaining hex digits before reporting the error. */
1950           while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
1951           *errorcodeptr = ERR34;
1952           }
1953         else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1954           {
1955           if (utf && c >= 0xd800 && c <= 0xdfff &&
1956               (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1957             {
1958             ptr--;
1959             *errorcodeptr = ERR73;
1960             }
1961           }
1962
1963         /* If the sequence of hex digits does not end with '}', give an error.
1964         We used just to recognize this construct and fall through to the normal
1965         \x handling, but nowadays Perl gives an error, which seems much more
1966         sensible, so we do too. */
1967
1968         else
1969           {
1970           ptr--;
1971           *errorcodeptr = ERR67;
1972           }
1973         }   /* End of \x{} processing */
1974
1975       /* Read up to two hex digits after \x */
1976
1977       else
1978         {
1979         c = 0;
1980         if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break;  /* Not a hex digit */
1981         ptr++;
1982         c = cc;
1983         if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break;  /* Not a hex digit */
1984         ptr++;
1985         c = (c << 4) | cc;
1986         }     /* End of \xdd handling */
1987       }       /* End of Perl-style \x handling */
1988     break;
1989
1990     /* The handling of \c is different in ASCII and EBCDIC environments. In an
1991     ASCII (or Unicode) environment, an error is given if the character
1992     following \c is not a printable ASCII character. Otherwise, the following
1993     character is upper-cased if it is a letter, and after that the 0x40 bit is
1994     flipped. The result is the value of the escape.
1995
1996     In an EBCDIC environment the handling of \c is compatible with the
1997     specification in the perlebcdic document. The following character must be
1998     a letter or one of small number of special characters. These provide a
1999     means of defining the character values 0-31.
2000
2001     For testing the EBCDIC handling of \c in an ASCII environment, recognize
2002     the EBCDIC value of 'c' explicitly. */
2003
2004 #if defined EBCDIC && 'a' != 0x81
2005     case 0x83:
2006 #else
2007     case CHAR_c:
2008 #endif
2009     if (ptr >= ptrend)
2010       {
2011       *errorcodeptr = ERR2;
2012       break;
2013       }
2014     c = *ptr;
2015     if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
2016
2017     /* Handle \c in an ASCII/Unicode environment. */
2018
2019 #ifndef EBCDIC    /* ASCII/UTF-8 coding */
2020     if (c < 32 || c > 126)  /* Excludes all non-printable ASCII */
2021       {
2022       *errorcodeptr = ERR68;
2023       break;
2024       }
2025     c ^= 0x40;
2026
2027     /* Handle \c in an EBCDIC environment. The special case \c? is converted to
2028     255 (0xff) or 95 (0x5f) if other characters suggest we are using the
2029     POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.)
2030     The other valid sequences correspond to a list of specific characters. */
2031
2032 #else
2033     if (c == CHAR_QUESTION_MARK)
2034       c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
2035     else
2036       {
2037       for (i = 0; i < 32; i++)
2038         {
2039         if (c == ebcdic_escape_c[i]) break;
2040         }
2041       if (i < 32) c = i; else *errorcodeptr = ERR68;
2042       }
2043 #endif  /* EBCDIC */
2044
2045     ptr++;
2046     break;
2047
2048     /* Any other alphanumeric following \ is an error. Perl gives an error only
2049     if in warning mode, but PCRE doesn't have a warning mode. */
2050
2051     default:
2052     *errorcodeptr = ERR3;
2053     *ptrptr = ptr - 1;     /* Point to the character at fault */
2054     return 0;
2055     }
2056   }
2057
2058 /* Set the pointer to the next character before returning. */
2059
2060 *ptrptr = ptr;
2061 *chptr = c;
2062 return escape;
2063 }
2064 
2065 
2066 
2067 #ifdef SUPPORT_UNICODE
2068 /*************************************************
2069 *               Handle \P and \p                 *
2070 *************************************************/
2071 
2072 /* This function is called after \P or \p has been encountered, provided that
2073 PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
2074 contents of ptrptr are pointing after the P or p. On exit, it is left pointing
2075 after the final code unit of the escape sequence.
2076 
2077 Arguments:
2078   ptrptr         the pattern position pointer
2079   negptr         a boolean that is set TRUE for negation else FALSE
2080   ptypeptr       an unsigned int that is set to the type value
2081   pdataptr       an unsigned int that is set to the detailed property value
2082   errorcodeptr   the error code variable
2083   cb             the compile data
2084 
2085 Returns:         TRUE if the type value was found, or FALSE for an invalid type
2086 */
2087 
2088 static BOOL
get_ucp(PCRE2_SPTR * ptrptr,BOOL * negptr,uint16_t * ptypeptr,uint16_t * pdataptr,int * errorcodeptr,compile_block * cb)2089 get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr,
2090   uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)
2091 {
2092 PCRE2_UCHAR c;
2093 PCRE2_SIZE i, bot, top;
2094 PCRE2_SPTR ptr = *ptrptr;
2095 PCRE2_UCHAR name[50];
2096 PCRE2_UCHAR *vptr = NULL;
2097 uint16_t ptscript = PT_NOTSCRIPT;
2098 
2099 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2100 c = *ptr++;
2101 *negptr = FALSE;
2102 
2103 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
2104 negation. */
2105 
2106 if (c == CHAR_LEFT_CURLY_BRACKET)
2107   {
2108   if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2109 
2110   if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
2111     {
2112     *negptr = TRUE;
2113     ptr++;
2114     }
2115 
2116   for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
2117     {
2118     if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2119     c = *ptr++;
2120     while (c == '_' || c == '-' || isspace(c))
2121       {
2122       if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2123       c = *ptr++;
2124       }
2125     if (c == CHAR_NUL) goto ERROR_RETURN;
2126     if (c == CHAR_RIGHT_CURLY_BRACKET) break;
2127     name[i] = tolower(c);
2128     if ((c == ':' || c == '=') && vptr == NULL) vptr = name + i;
2129     }
2130 
2131   if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
2132   name[i] = 0;
2133   }
2134 
2135 /* If { doesn't follow \p or \P there is just one following character, which
2136 must be an ASCII letter. */
2137 
2138 else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0)
2139   {
2140   name[0] = tolower(c);
2141   name[1] = 0;
2142   }
2143 else goto ERROR_RETURN;
2144 
2145 *ptrptr = ptr;
2146 
2147 /* If the property contains ':' or '=' we have class name and value separately
2148 specified. The following are supported:
2149 
2150   . Bidi_Class (synonym bc), for which the property names are "bidi<name>".
2151   . Script (synonym sc) for which the property name is the script name
2152   . Script_Extensions (synonym scx), ditto
2153 
2154 As this is a small number, we currently just check the names directly. If this
2155 grows, a sorted table and a switch will be neater.
2156 
2157 For both the script properties, set a PT_xxx value so that (1) they can be
2158 distinguished and (2) invalid script names that happen to be the name of
2159 another property can be diagnosed. */
2160 
2161 if (vptr != NULL)
2162   {
2163   int offset = 0;
2164   PCRE2_UCHAR sname[8];
2165 
2166   *vptr = 0;   /* Terminate property name */
2167   if (PRIV(strcmp_c8)(name, STRING_bidiclass) == 0 ||
2168       PRIV(strcmp_c8)(name, STRING_bc) == 0)
2169     {
2170     offset = 4;
2171     sname[0] = CHAR_b;
2172     sname[1] = CHAR_i;  /* There is no strcpy_c8 function */
2173     sname[2] = CHAR_d;
2174     sname[3] = CHAR_i;
2175     }
2176 
2177   else if (PRIV(strcmp_c8)(name, STRING_script) == 0 ||
2178            PRIV(strcmp_c8)(name, STRING_sc) == 0)
2179     ptscript = PT_SC;
2180 
2181   else if (PRIV(strcmp_c8)(name, STRING_scriptextensions) == 0 ||
2182            PRIV(strcmp_c8)(name, STRING_scx) == 0)
2183     ptscript = PT_SCX;
2184 
2185   else
2186     {
2187     *errorcodeptr = ERR47;
2188     return FALSE;
2189     }
2190 
2191   /* Adjust the string in name[] as needed */
2192 
2193   memmove(name + offset, vptr + 1, (name + i - vptr)*sizeof(PCRE2_UCHAR));
2194   if (offset != 0) memmove(name, sname, offset*sizeof(PCRE2_UCHAR));
2195   }
2196 
2197 /* Search for a recognized property using binary chop. */
2198 
2199 bot = 0;
2200 top = PRIV(utt_size);
2201 
2202 while (bot < top)
2203   {
2204   int r;
2205   i = (bot + top) >> 1;
2206   r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
2207 
2208   /* When a matching property is found, some extra checking is needed when the
2209   \p{xx:yy} syntax is used and xx is either sc or scx. */
2210 
2211   if (r == 0)
2212     {
2213     *pdataptr = PRIV(utt)[i].value;
2214     if (vptr == NULL || ptscript == PT_NOTSCRIPT)
2215       {
2216       *ptypeptr = PRIV(utt)[i].type;
2217       return TRUE;
2218       }
2219 
2220     switch (PRIV(utt)[i].type)
2221       {
2222       case PT_SC:
2223       *ptypeptr = PT_SC;
2224       return TRUE;
2225 
2226       case PT_SCX:
2227       *ptypeptr = ptscript;
2228       return TRUE;
2229       }
2230 
2231     break;  /* Non-script found */
2232     }
2233 
2234   if (r > 0) bot = i + 1; else top = i;
2235   }
2236 
2237 *errorcodeptr = ERR47;   /* Unrecognized property */
2238 return FALSE;
2239 
2240 ERROR_RETURN:            /* Malformed \P or \p */
2241 *errorcodeptr = ERR46;
2242 *ptrptr = ptr;
2243 return FALSE;
2244 }
2245 #endif
2246 
2247 
2248 
2249 /*************************************************
2250 *           Check for POSIX class syntax         *
2251 *************************************************/
2252 
2253 /* This function is called when the sequence "[:" or "[." or "[=" is
2254 encountered in a character class. It checks whether this is followed by a
2255 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2256 reach an unescaped ']' without the special preceding character, return FALSE.
2257 
2258 Originally, this function only recognized a sequence of letters between the
2259 terminators, but it seems that Perl recognizes any sequence of characters,
2260 though of course unknown POSIX names are subsequently rejected. Perl gives an
2261 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2262 didn't consider this to be a POSIX class. Likewise for [:1234:].
2263 
2264 The problem in trying to be exactly like Perl is in the handling of escapes. We
2265 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2266 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2267 below handles the special cases \\ and \], but does not try to do any other
2268 escape processing. This makes it different from Perl for cases such as
2269 [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
2270 not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
2271 when Perl does, I think.
2272 
2273 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2274 It seems that the appearance of a nested POSIX class supersedes an apparent
2275 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2276 a digit. This is handled by returning FALSE if the start of a new group with
2277 the same terminator is encountered, since the next closing sequence must close
2278 the nested group, not the outer one.
2279 
2280 In Perl, unescaped square brackets may also appear as part of class names. For
2281 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2282 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2283 seem right at all. PCRE does not allow closing square brackets in POSIX class
2284 names.
2285 
2286 Arguments:
2287   ptr      pointer to the character after the initial [ (colon, dot, equals)
2288   ptrend   pointer to the end of the pattern
2289   endptr   where to return a pointer to the terminating ':', '.', or '='
2290 
2291 Returns:   TRUE or FALSE
2292 */
2293 
2294 static BOOL
check_posix_syntax(PCRE2_SPTR ptr,PCRE2_SPTR ptrend,PCRE2_SPTR * endptr)2295 check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)
2296 {
2297 PCRE2_UCHAR terminator;  /* Don't combine these lines; the Solaris cc */
2298 terminator = *ptr++;     /* compiler warns about "non-constant" initializer. */
2299 
2300 for (; ptrend - ptr >= 2; ptr++)
2301   {
2302   if (*ptr == CHAR_BACKSLASH &&
2303       (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
2304     ptr++;
2305 
2306   else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
2307             *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2308 
2309   else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2310     {
2311     *endptr = ptr;
2312     return TRUE;
2313     }
2314   }
2315 
2316 return FALSE;
2317 }
2318 
2319 
2320 
2321 /*************************************************
2322 *          Check POSIX class name                *
2323 *************************************************/
2324 
2325 /* This function is called to check the name given in a POSIX-style class entry
2326 such as [:alnum:].
2327 
2328 Arguments:
2329   ptr        points to the first letter
2330   len        the length of the name
2331 
2332 Returns:     a value representing the name, or -1 if unknown
2333 */
2334 
2335 static int
check_posix_name(PCRE2_SPTR ptr,int len)2336 check_posix_name(PCRE2_SPTR ptr, int len)
2337 {
2338 const char *pn = posix_names;
2339 int yield = 0;
2340 while (posix_name_lengths[yield] != 0)
2341   {
2342   if (len == posix_name_lengths[yield] &&
2343     PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
2344   pn += posix_name_lengths[yield] + 1;
2345   yield++;
2346   }
2347 return -1;
2348 }
2349 
2350 
2351 
2352 /*************************************************
2353 *       Read a subpattern or VERB name           *
2354 *************************************************/
2355 
2356 /* This function is called from parse_regex() below whenever it needs to read
2357 the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial
2358 pointer must be to the character before the name. If that character is '*' we
2359 are reading a verb or alpha assertion name. The pointer is updated to point
2360 after the name, for a VERB or alpha assertion name, or after tha name's
2361 terminator for a subpattern name. Returning both the offset and the name
2362 pointer is redundant information, but some callers use one and some the other,
2363 so it is simplest just to return both.
2364 
2365 Arguments:
2366   ptrptr      points to the character pointer variable
2367   ptrend      points to the end of the input string
2368   utf         true if the input is UTF-encoded
2369   terminator  the terminator of a subpattern name must be this
2370   offsetptr   where to put the offset from the start of the pattern
2371   nameptr     where to put a pointer to the name in the input
2372   namelenptr  where to put the length of the name
  errorcodeptr where to put an error code
2374   cb          pointer to the compile data block
2375 
2376 Returns:    TRUE if a name was read
2377             FALSE otherwise, with error code set
2378 */
2379 
2380 static BOOL
read_name(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,BOOL utf,uint32_t terminator,PCRE2_SIZE * offsetptr,PCRE2_SPTR * nameptr,uint32_t * namelenptr,int * errorcodeptr,compile_block * cb)2381 read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator,
2382   PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,
2383   int *errorcodeptr, compile_block *cb)
2384 {
2385 PCRE2_SPTR ptr = *ptrptr;
2386 BOOL is_group = (*ptr != CHAR_ASTERISK);
2387 
2388 if (++ptr >= ptrend)               /* No characters in name */
2389   {
2390   *errorcodeptr = is_group? ERR62: /* Subpattern name expected */
2391                             ERR60; /* Verb not recognized or malformed */
2392   goto FAILED;
2393   }
2394 
2395 *nameptr = ptr;
2396 *offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
2397 
2398 /* In UTF mode, a group name may contain letters and decimal digits as defined
2399 by Unicode properties, and underscores, but must not start with a digit. */
2400 
2401 #ifdef SUPPORT_UNICODE
2402 if (utf && is_group)
2403   {
2404   uint32_t c, type;
2405 
2406   GETCHAR(c, ptr);
2407   type = UCD_CHARTYPE(c);
2408 
2409   if (type == ucp_Nd)
2410     {
2411     *errorcodeptr = ERR44;
2412     goto FAILED;
2413     }
2414 
2415   for(;;)
2416     {
2417     if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
2418         c != CHAR_UNDERSCORE) break;
2419     ptr++;
2420     FORWARDCHARTEST(ptr, ptrend);
2421     if (ptr >= ptrend) break;
2422     GETCHAR(c, ptr);
2423     type = UCD_CHARTYPE(c);
2424     }
2425   }
2426 else
2427 #else
2428 (void)utf;  /* Avoid compiler warning */
2429 #endif      /* SUPPORT_UNICODE */
2430 
2431 /* Handle non-group names and group names in non-UTF modes. A group name must
2432 not start with a digit. If either of the others start with a digit it just
2433 won't be recognized. */
2434 
2435   {
2436   if (is_group && IS_DIGIT(*ptr))
2437     {
2438     *errorcodeptr = ERR44;
2439     goto FAILED;
2440     }
2441 
2442   while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0)
2443     {
2444     ptr++;
2445     }
2446   }
2447 
2448 /* Check name length */
2449 
2450 if (ptr > *nameptr + MAX_NAME_SIZE)
2451   {
2452   *errorcodeptr = ERR48;
2453   goto FAILED;
2454   }
2455 *namelenptr = (uint32_t)(ptr - *nameptr);
2456 
2457 /* Subpattern names must not be empty, and their terminator is checked here.
2458 (What follows a verb or alpha assertion name is checked separately.) */
2459 
2460 if (is_group)
2461   {
2462   if (ptr == *nameptr)
2463     {
2464     *errorcodeptr = ERR62;   /* Subpattern name expected */
2465     goto FAILED;
2466     }
2467   if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator)
2468     {
2469     *errorcodeptr = ERR42;
2470     goto FAILED;
2471     }
2472   ptr++;
2473   }
2474 
2475 *ptrptr = ptr;
2476 return TRUE;
2477 
2478 FAILED:
2479 *ptrptr = ptr;
2480 return FALSE;
2481 }
2482 
2483 
2484 
2485 /*************************************************
2486 *          Manage callouts at start of cycle     *
2487 *************************************************/
2488 
2489 /* At the start of a new item in parse_regex() we are able to record the
2490 details of the previous item in a prior callout, and also to set up an
2491 automatic callout if enabled. Avoid having two adjacent automatic callouts,
2492 which would otherwise happen for items such as \Q that contribute nothing to
2493 the parsed pattern.
2494 
2495 Arguments:
2496   ptr              current pattern pointer
2497   pcalloutptr      points to a pointer to previous callout, or NULL
2498   auto_callout     TRUE if auto_callouts are enabled
2499   parsed_pattern   the parsed pattern pointer
2500   cb               compile block
2501 
2502 Returns: possibly updated parsed_pattern pointer.
2503 */
2504 
2505 static uint32_t *
manage_callouts(PCRE2_SPTR ptr,uint32_t ** pcalloutptr,BOOL auto_callout,uint32_t * parsed_pattern,compile_block * cb)2506 manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,
2507   uint32_t *parsed_pattern, compile_block *cb)
2508 {
2509 uint32_t *previous_callout = *pcalloutptr;
2510 
2511 if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr -
2512   cb->start_pattern - (PCRE2_SIZE)previous_callout[1]);
2513 
2514 if (!auto_callout) previous_callout = NULL; else
2515   {
2516   if (previous_callout == NULL ||
2517       previous_callout != parsed_pattern - 4 ||
2518       previous_callout[3] != 255)
2519     {
2520     previous_callout = parsed_pattern;  /* Set up new automatic callout */
2521     parsed_pattern += 4;
2522     previous_callout[0] = META_CALLOUT_NUMBER;
2523     previous_callout[2] = 0;
2524     previous_callout[3] = 255;
2525     }
2526   previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
2527   }
2528 
2529 *pcalloutptr = previous_callout;
2530 return parsed_pattern;
2531 }
2532 
2533 
2534 
2535 /*************************************************
2536 *      Parse regex and identify named groups     *
2537 *************************************************/
2538 
2539 /* This function is called first of all. It scans the pattern and does two
2540 things: (1) It identifies capturing groups and makes a table of named capturing
2541 groups so that information about them is fully available to both the compiling
2542 scans. (2) It writes a parsed version of the pattern with comments omitted and
2543 escapes processed into the parsed_pattern vector.
2544 
2545 Arguments:
2546   ptr             points to the start of the pattern
2547   options         compiling dynamic options (may change during the scan)
2548   has_lookbehind  points to a boolean, set TRUE if a lookbehind is found
2549   cb              pointer to the compile data block
2550 
2551 Returns:   zero on success or a non-zero error code, with the
2552              error offset placed in the cb field
2553 */
2554 
2555 /* A structure and some flags for dealing with nested groups. */
2556 
/* One saved record per nested group. The meaning of the individual fields is
not established by the code visible here. NOTE(review): flags presumably holds
a combination of the NSF_ bits defined below, and options a saved copy of the
compile options - confirm against the code that fills these records in. */

struct nest_save {
  uint16_t  nest_depth;
  uint16_t  reset_group;
  uint16_t  max_group;
  uint16_t  flags;
  uint32_t  options;
};

typedef struct nest_save nest_save;

/* Flag bits; each is a distinct single bit of unsigned type. */

#define NSF_RESET          (1u << 0)
#define NSF_CONDASSERT     (1u << 1)
#define NSF_ATOMICSR       (1u << 2)
2568 
2569 /* Options that are changeable within the pattern must be tracked during
2570 parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing,
2571 but all must be tracked so that META_OPTIONS items set the correct values for
2572 the main compiling phase. */
2573 
2574 #define PARSE_TRACKED_OPTIONS (PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_DUPNAMES| \
2575   PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| \
2576   PCRE2_UNGREEDY)
2577 
2578 /* States used for analyzing ranges in character classes. The two OK values
2579 must be last. */
2580 
2581 enum { RANGE_NO, RANGE_STARTED, RANGE_OK_ESCAPED, RANGE_OK_LITERAL };
2582 
2583 /* Only in 32-bit mode can there be literals > META_END. A macro encapsulates
2584 the storing of literal values in the main parsed pattern, where they can always
2585 be quantified. */
2586 
/* Usage notes for PARSED_LITERAL(c, p):

  . Both variants assign TRUE to a variable called okquantifier, so a variable
    of that name must be in scope wherever the macro is used.

  . The non-32-bit variant expands to two separate statements that are not
    enclosed in braces. If the macro is used as the body of an if or else,
    the caller must supply the braces, or only the store through p will be
    conditional.

  . The arguments are substituted without parentheses, p is incremented by
    the expansion, and in the 32-bit variant c is evaluated more than once.
    Pass plain variable names rather than expressions with side effects.

  . In the 32-bit variant a value that is >= META_END is preceded in the
    output by a META_BIGVALUE item. */

#if PCRE2_CODE_UNIT_WIDTH == 32
#define PARSED_LITERAL(c, p) \
  { \
  if (c >= META_END) *p++ = META_BIGVALUE; \
  *p++ = c; \
  okquantifier = TRUE; \
  }
#else
#define PARSED_LITERAL(c, p) *p++ = c; okquantifier = TRUE;
#endif
2597 
2598 /* Here's the actual function. */
2599 
parse_regex(PCRE2_SPTR ptr,uint32_t options,BOOL * has_lookbehind,compile_block * cb)2600 static int parse_regex(PCRE2_SPTR ptr, uint32_t options, BOOL *has_lookbehind,
2601   compile_block *cb)
2602 {
2603 uint32_t c;
2604 uint32_t delimiter;
2605 uint32_t namelen;
2606 uint32_t class_range_state;
2607 uint32_t *verblengthptr = NULL;     /* Value avoids compiler warning */
2608 uint32_t *verbstartptr = NULL;
2609 uint32_t *previous_callout = NULL;
2610 uint32_t *parsed_pattern = cb->parsed_pattern;
2611 uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
2612 uint32_t meta_quantifier = 0;
2613 uint32_t add_after_mark = 0;
2614 uint32_t extra_options = cb->cx->extra_options;
2615 uint16_t nest_depth = 0;
2616 int after_manual_callout = 0;
2617 int expect_cond_assert = 0;
2618 int errorcode = 0;
2619 int escape;
2620 int i;
2621 BOOL inescq = FALSE;
2622 BOOL inverbname = FALSE;
2623 BOOL utf = (options & PCRE2_UTF) != 0;
2624 BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;
2625 BOOL isdupname;
2626 BOOL negate_class;
2627 BOOL okquantifier = FALSE;
2628 PCRE2_SPTR thisptr;
2629 PCRE2_SPTR name;
2630 PCRE2_SPTR ptrend = cb->end_pattern;
2631 PCRE2_SPTR verbnamestart = NULL;    /* Value avoids compiler warning */
2632 named_group *ng;
2633 nest_save *top_nest, *end_nests;
2634 
2635 /* Insert leading items for word and line matching (features provided for the
2636 benefit of pcre2grep). */
2637 
2638 if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
2639   {
2640   *parsed_pattern++ = META_CIRCUMFLEX;
2641   *parsed_pattern++ = META_NOCAPTURE;
2642   }
2643 else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
2644   {
2645   *parsed_pattern++ = META_ESCAPE + ESC_b;
2646   *parsed_pattern++ = META_NOCAPTURE;
2647   }
2648 
2649 /* If the pattern is actually a literal string, process it separately to avoid
2650 cluttering up the main loop. */
2651 
2652 if ((options & PCRE2_LITERAL) != 0)
2653   {
2654   while (ptr < ptrend)
2655     {
2656     if (parsed_pattern >= parsed_pattern_end)
2657       {
2658       errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
2659       goto FAILED;
2660       }
2661     thisptr = ptr;
2662     GETCHARINCTEST(c, ptr);
2663     if (auto_callout)
2664       parsed_pattern = manage_callouts(thisptr, &previous_callout,
2665         auto_callout, parsed_pattern, cb);
2666     PARSED_LITERAL(c, parsed_pattern);
2667     }
2668   goto PARSED_END;
2669   }
2670 
2671 /* Process a real regex which may contain meta-characters. */
2672 
2673 top_nest = NULL;
2674 end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
2675 
2676 /* The size of the nest_save structure might not be a factor of the size of the
2677 workspace. Therefore we must round down end_nests so as to correctly avoid
2678 creating a nest_save that spans the end of the workspace. */
2679 
2680 end_nests = (nest_save *)((char *)end_nests -
2681   ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
2682 
2683 /* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */
2684 
2685 if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;
2686 
2687 /* Now scan the pattern */
2688 
2689 while (ptr < ptrend)
2690   {
2691   int prev_expect_cond_assert;
2692   uint32_t min_repeat = 0, max_repeat = 0;
2693   uint32_t set, unset, *optset;
2694   uint32_t terminator;
2695   uint32_t prev_meta_quantifier;
2696   BOOL prev_okquantifier;
2697   PCRE2_SPTR tempptr;
2698   PCRE2_SIZE offset;
2699 
2700   if (parsed_pattern >= parsed_pattern_end)
2701     {
2702     errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
2703     goto FAILED;
2704     }
2705 
2706   if (nest_depth > cb->cx->parens_nest_limit)
2707     {
2708     errorcode = ERR19;
2709     goto FAILED;        /* Parentheses too deeply nested */
2710     }
2711 
2712   /* Get next input character, save its position for callout handling. */
2713 
2714   thisptr = ptr;
2715   GETCHARINCTEST(c, ptr);
2716 
2717   /* Copy quoted literals until \E, allowing for the possibility of automatic
2718   callouts, except when processing a (*VERB) "name".  */
2719 
2720   if (inescq)
2721     {
2722     if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
2723       {
2724       inescq = FALSE;
2725       ptr++;   /* Skip E */
2726       }
2727     else
2728       {
2729       if (expect_cond_assert > 0)   /* A literal is not allowed if we are */
2730         {                           /* expecting a conditional assertion, */
2731         ptr--;                      /* but an empty \Q\E sequence is OK.  */
2732         errorcode = ERR28;
2733         goto FAILED;
2734         }
2735       if (inverbname)
2736         {                          /* Don't use PARSED_LITERAL() because it */
2737 #if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
2738         if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2739 #endif
2740         *parsed_pattern++ = c;
2741         }
2742       else
2743         {
2744         if (after_manual_callout-- <= 0)
2745           parsed_pattern = manage_callouts(thisptr, &previous_callout,
2746             auto_callout, parsed_pattern, cb);
2747         PARSED_LITERAL(c, parsed_pattern);
2748         }
2749       meta_quantifier = 0;
2750       }
2751     continue;  /* Next character */
2752     }
2753 
2754   /* If we are processing the "name" part of a (*VERB:NAME) item, all
2755   characters up to the closing parenthesis are literals except when
2756   PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q
2757   and \E and escaped characters are allowed (no character types such as \d). If
2758   PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do
2759   this by not entering the special (*VERB:NAME) processing - they are then
2760   picked up below. Note that c is a character, not a code unit, so we must not
2761   use MAX_255 to test its size because MAX_255 tests code units and is assumed
2762   TRUE in 8-bit mode. */
2763 
2764   if (inverbname &&
2765        (
2766         /* EITHER: not both options set */
2767         ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) !=
2768                     (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) ||
2769 #ifdef SUPPORT_UNICODE
2770         /* OR: character > 255 AND not Unicode Pattern White Space */
2771         (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) ||
2772 #endif
2773         /* OR: not a # comment or isspace() white space */
2774         (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0
2775 #ifdef SUPPORT_UNICODE
2776         /* and not CHAR_NEL when Unicode is supported */
2777           && c != CHAR_NEL
2778 #endif
2779        )))
2780     {
2781     PCRE2_SIZE verbnamelength;
2782 
2783     switch(c)
2784       {
2785       default:                     /* Don't use PARSED_LITERAL() because it */
2786 #if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
2787       if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2788 #endif
2789       *parsed_pattern++ = c;
2790       break;
2791 
2792       case CHAR_RIGHT_PARENTHESIS:
2793       inverbname = FALSE;
2794       /* This is the length in characters */
2795       verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);
2796       /* But the limit on the length is in code units */
2797       if (ptr - verbnamestart - 1 > (int)MAX_MARK)
2798         {
2799         ptr--;
2800         errorcode = ERR76;
2801         goto FAILED;
2802         }
2803       *verblengthptr = (uint32_t)verbnamelength;
2804 
2805       /* If this name was on a verb such as (*ACCEPT) which does not continue,
2806       a (*MARK) was generated for the name. We now add the original verb as the
2807       next item. */
2808 
2809       if (add_after_mark != 0)
2810         {
2811         *parsed_pattern++ = add_after_mark;
2812         add_after_mark = 0;
2813         }
2814       break;
2815 
2816       case CHAR_BACKSLASH:
2817       if ((options & PCRE2_ALT_VERBNAMES) != 0)
2818         {
2819         escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
2820           cb->cx->extra_options, FALSE, cb);
2821         if (errorcode != 0) goto FAILED;
2822         }
2823       else escape = 0;   /* Treat all as literal */
2824 
2825       switch(escape)
2826         {
2827         case 0:                    /* Don't use PARSED_LITERAL() because it */
2828 #if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
2829         if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2830 #endif
2831         *parsed_pattern++ = c;
2832         break;
2833 
2834         case ESC_Q:
2835         inescq = TRUE;
2836         break;
2837 
2838         case ESC_E:           /* Ignore */
2839         break;
2840 
2841         default:
2842         errorcode = ERR40;    /* Invalid in verb name */
2843         goto FAILED;
2844         }
2845       }
2846     continue;   /* Next character in pattern */
2847     }
2848 
2849   /* Not a verb name character. At this point we must process everything that
2850   must not change the quantification state. This is mainly comments, but we
2851   handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as
2852   A+, as in Perl. An isolated \E is ignored. */
2853 
2854   if (c == CHAR_BACKSLASH && ptr < ptrend)
2855     {
2856     if (*ptr == CHAR_Q || *ptr == CHAR_E)
2857       {
2858       inescq = *ptr == CHAR_Q;
2859       ptr++;
2860       continue;
2861       }
2862     }
2863 
2864   /* Skip over whitespace and # comments in extended mode. Note that c is a
2865   character, not a code unit, so we must not use MAX_255 to test its size
2866   because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The
2867   whitespace characters are those designated as "Pattern White Space" by
2868   Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is
2869   U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a
2870   subset of space characters that match \h and \v. */
2871 
2872   if ((options & PCRE2_EXTENDED) != 0)
2873     {
2874     if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;
2875 #ifdef SUPPORT_UNICODE
2876     if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue;
2877 #endif
2878     if (c == CHAR_NUMBER_SIGN)
2879       {
2880       while (ptr < ptrend)
2881         {
2882         if (IS_NEWLINE(ptr))      /* For non-fixed-length newline cases, */
2883           {                       /* IS_NEWLINE sets cb->nllen. */
2884           ptr += cb->nllen;
2885           break;
2886           }
2887         ptr++;
2888 #ifdef SUPPORT_UNICODE
2889         if (utf) FORWARDCHARTEST(ptr, ptrend);
2890 #endif
2891         }
2892       continue;  /* Next character in pattern */
2893       }
2894     }
2895 
2896   /* Skip over bracketed comments */
2897 
2898   if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 &&
2899       ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
2900     {
2901     while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);
2902     if (ptr >= ptrend)
2903       {
2904       errorcode = ERR18;  /* A special error for missing ) in a comment */
2905       goto FAILED;        /* to make it easier to debug. */
2906       }
2907     ptr++;
2908     continue;  /* Next character in pattern */
2909     }
2910 
2911   /* If the next item is not a quantifier, fill in length of any previous
2912   callout and create an auto callout if required. */
2913 
2914   if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&
2915        (c != CHAR_LEFT_CURLY_BRACKET ||
2916          (tempptr = ptr,
2917          !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))
2918     {
2919     if (after_manual_callout-- <= 0)
2920       parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,
2921         parsed_pattern, cb);
2922     }
2923 
2924   /* If expect_cond_assert is 2, we have just passed (?( and are expecting an
2925   assertion, possibly preceded by a callout. If the value is 1, we have just
2926   had the callout and expect an assertion. There must be at least 3 more
2927   characters in all cases. When expect_cond_assert is 2, we know that the
2928   current character is an opening parenthesis, as otherwise we wouldn't be
2929   here. However, when it is 1, we need to check, and it's easiest just to check
2930   always. Note that expect_cond_assert may be negative, since all callouts just
2931   decrement it. */
2932 
2933   if (expect_cond_assert > 0)
2934     {
2935     BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
2936               (ptr[0] == CHAR_QUESTION_MARK || ptr[0] == CHAR_ASTERISK);
2937     if (ok)
2938       {
2939       if (ptr[0] == CHAR_ASTERISK)  /* New alpha assertion format, possibly */
2940         {
2941         ok = MAX_255(ptr[1]) && (cb->ctypes[ptr[1]] & ctype_lcletter) != 0;
2942         }
2943       else switch(ptr[1])  /* Traditional symbolic format */
2944         {
2945         case CHAR_C:
2946         ok = expect_cond_assert == 2;
2947         break;
2948 
2949         case CHAR_EQUALS_SIGN:
2950         case CHAR_EXCLAMATION_MARK:
2951         break;
2952 
2953         case CHAR_LESS_THAN_SIGN:
2954         ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;
2955         break;
2956 
2957         default:
2958         ok = FALSE;
2959         }
2960       }
2961 
2962     if (!ok)
2963       {
2964       ptr--;   /* Adjust error offset */
2965       errorcode = ERR28;
2966       goto FAILED;
2967       }
2968     }
2969 
2970   /* Remember whether we are expecting a conditional assertion, and set the
2971   default for this item. */
2972 
2973   prev_expect_cond_assert = expect_cond_assert;
2974   expect_cond_assert = 0;
2975 
2976   /* Remember quantification status for the previous significant item, then set
2977   default for this item. */
2978 
2979   prev_okquantifier = okquantifier;
2980   prev_meta_quantifier = meta_quantifier;
2981   okquantifier = FALSE;
2982   meta_quantifier = 0;
2983 
2984   /* If the previous significant item was a quantifier, adjust the parsed code
2985   if there is a following modifier. The base meta value is always followed by
2986   the PLUS and QUERY values, in that order. We do this here rather than after
2987   reading a quantifier so that intervening comments and /x whitespace can be
2988   ignored without having to replicate code. */
2989 
2990   if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS))
2991     {
2992     parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] =
2993       prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?
2994         0x00020000u : 0x00010000u);
2995     continue;  /* Next character in pattern */
2996     }
2997 
2998 
2999   /* Process the next item in the main part of a pattern. */
3000 
3001   switch(c)
3002     {
3003     default:              /* Non-special character */
3004     PARSED_LITERAL(c, parsed_pattern);
3005     break;
3006 
3007 
3008     /* ---- Escape sequence ---- */
3009 
3010     case CHAR_BACKSLASH:
3011     tempptr = ptr;
3012     escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3013       cb->cx->extra_options, FALSE, cb);
3014     if (errorcode != 0)
3015       {
3016       ESCAPE_FAILED:
3017       if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3018         goto FAILED;
3019       ptr = tempptr;
3020       if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3021         {
3022         GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
3023         }
3024       escape = 0;                 /* Treat as literal character */
3025       }
3026 
3027     /* The escape was a data escape or literal character. */
3028 
3029     if (escape == 0)
3030       {
3031       PARSED_LITERAL(c, parsed_pattern);
3032       }
3033 
3034     /* The escape was a back (or forward) reference. We keep the offset in
3035     order to give a more useful diagnostic for a bad forward reference. For
3036     references to groups numbered less than 10 we can't use more than two items
3037     in parsed_pattern because they may be just two characters in the input (and
3038     in a 64-bit world an offset may need two elements). So for them, the offset
    of the first occurrence is held in a special vector. */
3040 
3041     else if (escape < 0)
3042       {
3043       offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 1);
3044       escape = -escape;
3045       *parsed_pattern++ = META_BACKREF | (uint32_t)escape;
3046       if (escape < 10)
3047         {
3048         if (cb->small_ref_offset[escape] == PCRE2_UNSET)
3049           cb->small_ref_offset[escape] = offset;
3050         }
3051       else
3052         {
3053         PUTOFFSET(offset, parsed_pattern);
3054         }
3055       okquantifier = TRUE;
3056       }
3057 
3058     /* The escape was a character class such as \d etc. or other special
3059     escape indicator such as \A or \X. Most of them generate just a single
3060     parsed item, but \P and \p are followed by a 16-bit type and a 16-bit
3061     value. They are supported only when Unicode is available. The type and
3062     value are packed into a single 32-bit value so that the whole sequences
3063     uses only two elements in the parsed_vector. This is because the same
3064     coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is
3065     set.
3066 
3067     There are also some cases where the escape sequence is followed by a name:
3068     \k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>
3069     and \g'name' are subroutine calls by name; \g{name} is a synonym for
3070     \k{name}. Note that \g<number> and \g'number' are handled by check_escape()
3071     and returned as a negative value (handled above). A name is coded as an
3072     offset into the pattern and a length. */
3073 
3074     else switch (escape)
3075       {
3076       case ESC_C:
3077 #ifdef NEVER_BACKSLASH_C
3078       errorcode = ERR85;
3079       goto ESCAPE_FAILED;
3080 #else
3081       if ((options & PCRE2_NEVER_BACKSLASH_C) != 0)
3082         {
3083         errorcode = ERR83;
3084         goto ESCAPE_FAILED;
3085         }
3086 #endif
3087       okquantifier = TRUE;
3088       *parsed_pattern++ = META_ESCAPE + escape;
3089       break;
3090 
3091       case ESC_X:
3092 #ifndef SUPPORT_UNICODE
3093       errorcode = ERR45;   /* Supported only with Unicode support */
3094       goto ESCAPE_FAILED;
3095 #endif
3096       case ESC_H:
3097       case ESC_h:
3098       case ESC_N:
3099       case ESC_R:
3100       case ESC_V:
3101       case ESC_v:
3102       okquantifier = TRUE;
3103       *parsed_pattern++ = META_ESCAPE + escape;
3104       break;
3105 
3106       default:  /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */
3107       *parsed_pattern++ = META_ESCAPE + escape;
3108       break;
3109 
3110       /* Escapes that change in UCP mode. Note that PCRE2_UCP will never be set
3111       without Unicode support because it is checked when pcre2_compile() is
3112       called. */
3113 
3114       case ESC_d:
3115       case ESC_D:
3116       case ESC_s:
3117       case ESC_S:
3118       case ESC_w:
3119       case ESC_W:
3120       okquantifier = TRUE;
3121       if ((options & PCRE2_UCP) == 0)
3122         {
3123         *parsed_pattern++ = META_ESCAPE + escape;
3124         }
3125       else
3126         {
3127         *parsed_pattern++ = META_ESCAPE +
3128           ((escape == ESC_d || escape == ESC_s || escape == ESC_w)?
3129             ESC_p : ESC_P);
3130         switch(escape)
3131           {
3132           case ESC_d:
3133           case ESC_D:
3134           *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
3135           break;
3136 
3137           case ESC_s:
3138           case ESC_S:
3139           *parsed_pattern++ = PT_SPACE << 16;
3140           break;
3141 
3142           case ESC_w:
3143           case ESC_W:
3144           *parsed_pattern++ = PT_WORD << 16;
3145           break;
3146           }
3147         }
3148       break;
3149 
3150       /* Unicode property matching */
3151 
3152       case ESC_P:
3153       case ESC_p:
3154 #ifdef SUPPORT_UNICODE
3155         {
3156         BOOL negated;
3157         uint16_t ptype = 0, pdata = 0;
3158         if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3159           goto ESCAPE_FAILED;
3160         if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3161         *parsed_pattern++ = META_ESCAPE + escape;
3162         *parsed_pattern++ = (ptype << 16) | pdata;
3163         okquantifier = TRUE;
3164         }
3165 #else
3166       errorcode = ERR45;
3167       goto ESCAPE_FAILED;
3168 #endif
3169       break;  /* End \P and \p */
3170 
3171       /* When \g is used with quotes or angle brackets as delimiters, it is a
3172       numerical or named subroutine call, and control comes here. When used
      with brace delimiters it is a numerical back reference and does not come
3174       here because check_escape() returns it directly as a reference. \k is
3175       always a named back reference. */
3176 
3177       case ESC_g:
3178       case ESC_k:
3179       if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET &&
3180           *ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE))
3181         {
3182         errorcode = (escape == ESC_g)? ERR57 : ERR69;
3183         goto ESCAPE_FAILED;
3184         }
3185       terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
3186         CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
3187         CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
3188 
3189       /* For a non-braced \g, check for a numerical recursion. */
3190 
3191       if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)
3192         {
3193         PCRE2_SPTR p = ptr + 1;
3194 
3195         if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
3196             &errorcode))
3197           {
3198           if (p >= ptrend || *p != terminator)
3199             {
3200             errorcode = ERR57;
3201             goto ESCAPE_FAILED;
3202             }
3203           ptr = p;
3204           goto SET_RECURSION;
3205           }
3206         if (errorcode != 0) goto ESCAPE_FAILED;
3207         }
3208 
3209       /* Not a numerical recursion */
3210 
3211       if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
3212           &errorcode, cb)) goto ESCAPE_FAILED;
3213 
3214       /* \k and \g when used with braces are back references, whereas \g used
3215       with quotes or angle brackets is a recursion */
3216 
3217       *parsed_pattern++ =
3218         (escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)?
3219           META_BACKREF_BYNAME : META_RECURSE_BYNAME;
3220       *parsed_pattern++ = namelen;
3221 
3222       PUTOFFSET(offset, parsed_pattern);
3223       okquantifier = TRUE;
3224       break;  /* End special escape processing */
3225       }
3226     break;    /* End escape sequence processing */
3227 
3228 
3229     /* ---- Single-character special items ---- */
3230 
3231     case CHAR_CIRCUMFLEX_ACCENT:
3232     *parsed_pattern++ = META_CIRCUMFLEX;
3233     break;
3234 
3235     case CHAR_DOLLAR_SIGN:
3236     *parsed_pattern++ = META_DOLLAR;
3237     break;
3238 
3239     case CHAR_DOT:
3240     *parsed_pattern++ = META_DOT;
3241     okquantifier = TRUE;
3242     break;
3243 
3244 
3245     /* ---- Single-character quantifiers ---- */
3246 
3247     case CHAR_ASTERISK:
3248     meta_quantifier = META_ASTERISK;
3249     goto CHECK_QUANTIFIER;
3250 
3251     case CHAR_PLUS:
3252     meta_quantifier = META_PLUS;
3253     goto CHECK_QUANTIFIER;
3254 
3255     case CHAR_QUESTION_MARK:
3256     meta_quantifier = META_QUERY;
3257     goto CHECK_QUANTIFIER;
3258 
3259 
3260     /* ---- Potential {n,m} quantifier ---- */
3261 
3262     case CHAR_LEFT_CURLY_BRACKET:
3263     if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,
3264         &errorcode))
3265       {
3266       if (errorcode != 0) goto FAILED;     /* Error in quantifier. */
3267       PARSED_LITERAL(c, parsed_pattern);   /* Not a quantifier */
3268       break;                               /* No more quantifier processing */
3269       }
3270     meta_quantifier = META_MINMAX;
3271     /* Fall through */
3272 
3273 
3274     /* ---- Quantifier post-processing ---- */
3275 
3276     /* Check that a quantifier is allowed after the previous item. */
3277 
3278     CHECK_QUANTIFIER:
3279     if (!prev_okquantifier)
3280       {
3281       errorcode = ERR9;
3282       goto FAILED_BACK;
3283       }
3284 
3285     /* Most (*VERB)s are not allowed to be quantified, but an ungreedy
3286     quantifier can be useful for (*ACCEPT) - meaning "succeed on backtrack", a
3287     sort of negated (*COMMIT). We therefore allow (*ACCEPT) to be quantified by
3288     wrapping it in non-capturing brackets, but we have to allow for a preceding
3289     (*MARK) for when (*ACCEPT) has an argument. */
3290 
3291     if (parsed_pattern[-1] == META_ACCEPT)
3292       {
3293       uint32_t *p;
3294       for (p = parsed_pattern - 1; p >= verbstartptr; p--) p[1] = p[0];
3295       *verbstartptr = META_NOCAPTURE;
3296       parsed_pattern[1] = META_KET;
3297       parsed_pattern += 2;
3298       }
3299 
3300     /* Now we can put the quantifier into the parsed pattern vector. At this
3301     stage, we have only the basic quantifier. The check for a following + or ?
3302     modifier happens at the top of the loop, after any intervening comments
3303     have been removed. */
3304 
3305     *parsed_pattern++ = meta_quantifier;
3306     if (c == CHAR_LEFT_CURLY_BRACKET)
3307       {
3308       *parsed_pattern++ = min_repeat;
3309       *parsed_pattern++ = max_repeat;
3310       }
3311     break;
3312 
3313 
3314     /* ---- Character class ---- */
3315 
3316     case CHAR_LEFT_SQUARE_BRACKET:
3317     okquantifier = TRUE;
3318 
3319     /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
3320     used for "start of word" and "end of word". As these are otherwise illegal
3321     sequences, we don't break anything by recognizing them. They are replaced
3322     by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
3323     erroneous and are handled by the normal code below. */
3324 
3325     if (ptrend - ptr >= 6 &&
3326          (PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 ||
3327           PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0))
3328       {
3329       *parsed_pattern++ = META_ESCAPE + ESC_b;
3330 
3331       if (ptr[2] == CHAR_LESS_THAN_SIGN)
3332         {
3333         *parsed_pattern++ = META_LOOKAHEAD;
3334         }
3335       else
3336         {
3337         *parsed_pattern++ = META_LOOKBEHIND;
3338         *has_lookbehind = TRUE;
3339 
3340         /* The offset is used only for the "non-fixed length" error; this won't
3341         occur here, so just store zero. */
3342 
3343         PUTOFFSET((PCRE2_SIZE)0, parsed_pattern);
3344         }
3345 
3346       if ((options & PCRE2_UCP) == 0)
3347         *parsed_pattern++ = META_ESCAPE + ESC_w;
3348       else
3349         {
3350         *parsed_pattern++ = META_ESCAPE + ESC_p;
3351         *parsed_pattern++ = PT_WORD << 16;
3352         }
3353       *parsed_pattern++ = META_KET;
3354       ptr += 6;
3355       break;
3356       }
3357 
3358     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3359     they are encountered at the top level, so we'll do that too. */
3360 
3361     if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3362          *ptr == CHAR_EQUALS_SIGN) &&
3363         check_posix_syntax(ptr, ptrend, &tempptr))
3364       {
3365       errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;
3366       goto FAILED;
3367       }
3368 
3369     /* Process a regular character class. If the first character is '^', set
3370     the negation flag. If the first few characters (either before or after ^)
3371     are \Q\E or \E or space or tab in extended-more mode, we skip them too.
3372     This makes for compatibility with Perl. */
3373 
3374     negate_class = FALSE;
3375     while (ptr < ptrend)
3376       {
3377       GETCHARINCTEST(c, ptr);
3378       if (c == CHAR_BACKSLASH)
3379         {
3380         if (ptr < ptrend && *ptr == CHAR_E) ptr++;
3381         else if (ptrend - ptr >= 3 &&
3382              PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3383           ptr += 3;
3384         else
3385           break;
3386         }
3387       else if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3388                (c == CHAR_SPACE || c == CHAR_HT))  /* Note: just these two */
3389         continue;
3390       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3391         negate_class = TRUE;
3392       else break;
3393       }
3394 
3395     /* Now the real contents of the class; c has the first "real" character.
3396     Empty classes are permitted only if the option is set. */
3397 
3398     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3399         (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)
3400       {
3401       *parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;
3402       break;  /* End of class processing */
3403       }
3404 
3405     /* Process a non-empty class. */
3406 
3407     *parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;
3408     class_range_state = RANGE_NO;
3409 
3410     /* In an EBCDIC environment, Perl treats alphabetic ranges specially
3411     because there are holes in the encoding, and simply using the range A-Z
3412     (for example) would include the characters in the holes. This applies only
3413     to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]
3414     in this respect. In order to accommodate this, we keep track of whether
3415     character values are literal or not, and a state variable for handling
3416     ranges. */
3417 
3418     /* Loop for the contents of the class */
3419 
3420     for (;;)
3421       {
3422       BOOL char_is_literal = TRUE;
3423 
3424       /* Inside \Q...\E everything is literal except \E */
3425 
3426       if (inescq)
3427         {
3428         if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3429           {
3430           inescq = FALSE;                   /* Reset literal state */
3431           ptr++;                            /* Skip the 'E' */
3432           goto CLASS_CONTINUE;
3433           }
3434         goto CLASS_LITERAL;
3435         }
3436 
3437       /* Skip over space and tab (only) in extended-more mode. */
3438 
3439       if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3440           (c == CHAR_SPACE || c == CHAR_HT))
3441         goto CLASS_CONTINUE;
3442 
3443       /* Handle POSIX class names. Perl allows a negation extension of the
3444       form [:^name:]. A square bracket that doesn't match the syntax is
3445       treated as a literal. We also recognize the POSIX constructions
3446       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3447       5.6 and 5.8 do. */
3448 
3449       if (c == CHAR_LEFT_SQUARE_BRACKET &&
3450           ptrend - ptr >= 3 &&
3451           (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3452            *ptr == CHAR_EQUALS_SIGN) &&
3453           check_posix_syntax(ptr, ptrend, &tempptr))
3454         {
3455         BOOL posix_negate = FALSE;
3456         int posix_class;
3457 
3458         /* Perl treats a hyphen before a POSIX class as a literal, not the
3459         start of a range. However, it gives a warning in its warning mode. PCRE
3460         does not have a warning mode, so we give an error, because this is
3461         likely an error on the user's part. */
3462 
3463         if (class_range_state == RANGE_STARTED)
3464           {
3465           errorcode = ERR50;
3466           goto FAILED;
3467           }
3468 
3469         if (*ptr != CHAR_COLON)
3470           {
3471           errorcode = ERR13;
3472           goto FAILED_BACK;
3473           }
3474 
3475         if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
3476           {
3477           posix_negate = TRUE;
3478           ptr++;
3479           }
3480 
3481         posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3482         if (posix_class < 0)
3483           {
3484           errorcode = ERR30;
3485           goto FAILED;
3486           }
3487         ptr = tempptr + 2;
3488 
3489         /* Perl treats a hyphen after a POSIX class as a literal, not the
3490         start of a range. However, it gives a warning in its warning mode
3491         unless the hyphen is the last character in the class. PCRE does not
3492         have a warning mode, so we give an error, because this is likely an
3493         error on the user's part. */
3494 
3495         if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3496             ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3497           {
3498           errorcode = ERR50;
3499           goto FAILED;
3500           }
3501 
3502         /* Set "a hyphen is not the start of a range" for the -] case, and also
3503         in case the POSIX class is followed by \E or \Q\E (possibly repeated -
3504         fuzzers do that kind of thing) and *then* a hyphen. This causes that
3505         hyphen to be treated as a literal. I don't think it's worth setting up
3506         special apparatus to do otherwise. */
3507 
3508         class_range_state = RANGE_NO;
3509 
3510         /* When PCRE2_UCP is set, some of the POSIX classes are converted to
3511         use Unicode properties \p or \P or, in one case, \h or \H. The
3512         substitutes table has two values per class, containing the type and
3513         value of a \p or \P item. The special cases are specified with a
3514         negative type: a non-zero value causes \h or \H to be used, and a zero
3515         value falls through to behave like a non-UCP POSIX class. */
3516 
3517 #ifdef SUPPORT_UNICODE
3518         if ((options & PCRE2_UCP) != 0)
3519           {
3520           int ptype = posix_substitutes[2*posix_class];
3521           int pvalue = posix_substitutes[2*posix_class + 1];
3522           if (ptype >= 0)
3523             {
3524             *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);
3525             *parsed_pattern++ = (ptype << 16) | pvalue;
3526             goto CLASS_CONTINUE;
3527             }
3528 
3529           if (pvalue != 0)
3530             {
3531             *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h);
3532             goto CLASS_CONTINUE;
3533             }
3534 
3535           /* Fall through */
3536           }
3537 #endif  /* SUPPORT_UNICODE */
3538 
3539         /* Non-UCP POSIX class */
3540 
3541         *parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;
3542         *parsed_pattern++ = posix_class;
3543         }
3544 
3545       /* Handle potential start of range */
3546 
3547       else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)
3548         {
3549         *parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?
3550           META_RANGE_LITERAL : META_RANGE_ESCAPED;
3551         class_range_state = RANGE_STARTED;
3552         }
3553 
3554       /* Handle a literal character */
3555 
3556       else if (c != CHAR_BACKSLASH)
3557         {
3558         CLASS_LITERAL:
3559         if (class_range_state == RANGE_STARTED)
3560           {
3561           if (c == parsed_pattern[-2])       /* Optimize one-char range */
3562             parsed_pattern--;
3563           else if (parsed_pattern[-2] > c)   /* Check range is in order */
3564             {
3565             errorcode = ERR8;
3566             goto FAILED_BACK;
3567             }
3568           else
3569             {
3570             if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL)
3571               parsed_pattern[-1] = META_RANGE_ESCAPED;
3572             PARSED_LITERAL(c, parsed_pattern);
3573             }
3574           class_range_state = RANGE_NO;
3575           }
3576         else  /* Potential start of range */
3577           {
3578           class_range_state = char_is_literal?
3579             RANGE_OK_LITERAL : RANGE_OK_ESCAPED;
3580           PARSED_LITERAL(c, parsed_pattern);
3581           }
3582         }
3583 
3584       /* Handle escapes in a class */
3585 
3586       else
3587         {
3588         tempptr = ptr;
3589         escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3590           cb->cx->extra_options, TRUE, cb);
3591 
3592         if (errorcode != 0)
3593           {
3594           if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3595             goto FAILED;
3596           ptr = tempptr;
3597           if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3598             {
3599             GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
3600             }
3601           escape = 0;                 /* Treat as literal character */
3602           }
3603 
3604         switch(escape)
3605           {
3606           case 0:  /* Escaped character code point is in c */
3607           char_is_literal = FALSE;
3608           goto CLASS_LITERAL;
3609 
3610           case ESC_b:
3611           c = CHAR_BS;    /* \b is backspace in a class */
3612           char_is_literal = FALSE;
3613           goto CLASS_LITERAL;
3614 
3615           case ESC_Q:
3616           inescq = TRUE;  /* Enter literal mode */
3617           goto CLASS_CONTINUE;
3618 
3619           case ESC_E:     /* Ignore orphan \E */
3620           goto CLASS_CONTINUE;
3621 
3622           case ESC_B:     /* Always an error in a class */
3623           case ESC_R:
3624           case ESC_X:
3625           errorcode = ERR7;
3626           ptr--;
3627           goto FAILED;
3628           }
3629 
3630         /* The second part of a range can be a single-character escape
3631         sequence (detected above), but not any of the other escapes. Perl
3632         treats a hyphen as a literal in such circumstances. However, in Perl's
3633         warning mode, a warning is given, so PCRE now faults it, as it is
3634         almost certainly a mistake on the user's part. */
3635 
3636         if (class_range_state == RANGE_STARTED)
3637           {
3638           errorcode = ERR50;
3639           goto FAILED;  /* Not CLASS_ESCAPE_FAILED; always an error */
3640           }
3641 
3642         /* Of the remaining escapes, only those that define characters are
3643         allowed in a class. None may start a range. */
3644 
3645         class_range_state = RANGE_NO;
3646         switch(escape)
3647           {
3648           case ESC_N:
3649           errorcode = ERR71;
3650           goto FAILED;
3651 
3652           case ESC_H:
3653           case ESC_h:
3654           case ESC_V:
3655           case ESC_v:
3656           *parsed_pattern++ = META_ESCAPE + escape;
3657           break;
3658 
3659           /* These escapes are converted to Unicode property tests when
3660           PCRE2_UCP is set. */
3661 
3662           case ESC_d:
3663           case ESC_D:
3664           case ESC_s:
3665           case ESC_S:
3666           case ESC_w:
3667           case ESC_W:
3668           if ((options & PCRE2_UCP) == 0)
3669             {
3670             *parsed_pattern++ = META_ESCAPE + escape;
3671             }
3672           else
3673             {
3674             *parsed_pattern++ = META_ESCAPE +
3675               ((escape == ESC_d || escape == ESC_s || escape == ESC_w)?
3676                 ESC_p : ESC_P);
3677             switch(escape)
3678               {
3679               case ESC_d:
3680               case ESC_D:
3681               *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
3682               break;
3683 
3684               case ESC_s:
3685               case ESC_S:
3686               *parsed_pattern++ = PT_SPACE << 16;
3687               break;
3688 
3689               case ESC_w:
3690               case ESC_W:
3691               *parsed_pattern++ = PT_WORD << 16;
3692               break;
3693               }
3694             }
3695           break;
3696 
3697           /* Explicit Unicode property matching */
3698 
3699           case ESC_P:
3700           case ESC_p:
3701 #ifdef SUPPORT_UNICODE
3702             {
3703             BOOL negated;
3704             uint16_t ptype = 0, pdata = 0;
3705             if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3706               goto FAILED;
3707             if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3708             *parsed_pattern++ = META_ESCAPE + escape;
3709             *parsed_pattern++ = (ptype << 16) | pdata;
3710             }
3711 #else
3712           errorcode = ERR45;
3713           goto FAILED;
3714 #endif
3715           break;  /* End \P and \p */
3716 
3717           default:    /* All others are not allowed in a class */
3718           errorcode = ERR7;
3719           ptr--;
3720           goto FAILED;
3721           }
3722 
3723         /* Perl gives a warning unless a following hyphen is the last character
3724         in the class. PCRE throws an error. */
3725 
3726         if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3727             ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3728           {
3729           errorcode = ERR50;
3730           goto FAILED;
3731           }
3732         }
3733 
3734       /* Proceed to next thing in the class. */
3735 
3736       CLASS_CONTINUE:
3737       if (ptr >= ptrend)
3738         {
3739         errorcode = ERR6;  /* Missing terminating ']' */
3740         goto FAILED;
3741         }
3742       GETCHARINCTEST(c, ptr);
3743       if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
3744       }     /* End of class-processing loop */
3745 
3746     /* -] at the end of a class is a literal '-' */
3747 
3748     if (class_range_state == RANGE_STARTED)
3749       {
3750       parsed_pattern[-1] = CHAR_MINUS;
3751       class_range_state = RANGE_NO;
3752       }
3753 
3754     *parsed_pattern++ = META_CLASS_END;
3755     break;  /* End of character class */
3756 
3757 
3758     /* ---- Opening parenthesis ---- */
3759 
3760     case CHAR_LEFT_PARENTHESIS:
3761     if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3762 
3763     /* If ( is not followed by ? it is either a capture or a special verb or an
3764     alpha assertion or a positive non-atomic lookahead. */
3765 
3766     if (*ptr != CHAR_QUESTION_MARK)
3767       {
3768       const char *vn;
3769 
3770       /* Handle capturing brackets (or non-capturing if auto-capture is turned
3771       off). */
3772 
3773       if (*ptr != CHAR_ASTERISK)
3774         {
3775         nest_depth++;
3776         if ((options & PCRE2_NO_AUTO_CAPTURE) == 0)
3777           {
3778           if (cb->bracount >= MAX_GROUP_NUMBER)
3779             {
3780             errorcode = ERR97;
3781             goto FAILED;
3782             }
3783           cb->bracount++;
3784           *parsed_pattern++ = META_CAPTURE | cb->bracount;
3785           }
3786         else *parsed_pattern++ = META_NOCAPTURE;
3787         }
3788 
3789       /* Do nothing for (* followed by end of pattern or ) so it gives a "bad
3790       quantifier" error rather than "(*MARK) must have an argument". */
3791 
3792       else if (ptrend - ptr <= 1 || (c = ptr[1]) == CHAR_RIGHT_PARENTHESIS)
3793         break;
3794 
3795       /* Handle "alpha assertions" such as (*pla:...). Most of these are
3796       synonyms for the historical symbolic assertions, but the script run and
3797       non-atomic lookaround ones are new. They are distinguished by starting
3798       with a lower case letter. Checking both ends of the alphabet makes this
3799       work in all character codes. */
3800 
3801       else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
3802         {
3803         uint32_t meta;
3804 
3805         vn = alasnames;
3806         if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
3807           &errorcode, cb)) goto FAILED;
3808         if (ptr >= ptrend || *ptr != CHAR_COLON)
3809           {
3810           errorcode = ERR95;  /* Malformed */
3811           goto FAILED;
3812           }
3813 
3814         /* Scan the table of alpha assertion names */
3815 
3816         for (i = 0; i < alascount; i++)
3817           {
3818           if (namelen == alasmeta[i].len &&
3819               PRIV(strncmp_c8)(name, vn, namelen) == 0)
3820             break;
3821           vn += alasmeta[i].len + 1;
3822           }
3823 
3824         if (i >= alascount)
3825           {
3826           errorcode = ERR95;  /* Alpha assertion not recognized */
3827           goto FAILED;
3828           }
3829 
3830         /* Check for expecting an assertion condition. If so, only atomic
3831         lookaround assertions are valid. */
3832 
3833         meta = alasmeta[i].meta;
3834         if (prev_expect_cond_assert > 0 &&
3835             (meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT))
3836           {
3837           errorcode = (meta == META_LOOKAHEAD_NA || meta == META_LOOKBEHIND_NA)?
3838             ERR98 : ERR28;  /* (Atomic) assertion expected */
3839           goto FAILED;
3840           }
3841 
3842         /* The lookaround alphabetic synonyms can mostly be handled by jumping
3843         to the code that handles the traditional symbolic forms. */
3844 
3845         switch(meta)
3846           {
3847           default:
3848           errorcode = ERR89;  /* Unknown code; should never occur because */
3849           goto FAILED;        /* the meta values come from a table above. */
3850 
3851           case META_ATOMIC:
3852           goto ATOMIC_GROUP;
3853 
3854           case META_LOOKAHEAD:
3855           goto POSITIVE_LOOK_AHEAD;
3856 
3857           case META_LOOKAHEAD_NA:
3858           goto POSITIVE_NONATOMIC_LOOK_AHEAD;
3859 
3860           case META_LOOKAHEADNOT:
3861           goto NEGATIVE_LOOK_AHEAD;
3862 
3863           case META_LOOKBEHIND:
3864           case META_LOOKBEHINDNOT:
3865           case META_LOOKBEHIND_NA:
3866           *parsed_pattern++ = meta;
3867           ptr--;
3868           goto POST_LOOKBEHIND;
3869 
3870           /* The script run facilities are handled here. Unicode support is
3871           required (give an error if not, as this is a security issue). Always
3872           record a META_SCRIPT_RUN item. Then, for the atomic version, insert
3873           META_ATOMIC and remember that we need two META_KETs at the end. */
3874 
3875           case META_SCRIPT_RUN:
3876           case META_ATOMIC_SCRIPT_RUN:
3877 #ifdef SUPPORT_UNICODE
3878           *parsed_pattern++ = META_SCRIPT_RUN;
3879           nest_depth++;
3880           ptr++;
3881           if (meta == META_ATOMIC_SCRIPT_RUN)
3882             {
3883             *parsed_pattern++ = META_ATOMIC;
3884             if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
3885             else if (++top_nest >= end_nests)
3886               {
3887               errorcode = ERR84;
3888               goto FAILED;
3889               }
3890             top_nest->nest_depth = nest_depth;
3891             top_nest->flags = NSF_ATOMICSR;
3892             top_nest->options = options & PARSE_TRACKED_OPTIONS;
3893             }
3894           break;
3895 #else  /* SUPPORT_UNICODE */
3896           errorcode = ERR96;
3897           goto FAILED;
3898 #endif
3899           }
3900         }
3901 
3902 
3903       /* ---- Handle (*VERB) and (*VERB:NAME) ---- */
3904 
3905       else
3906         {
3907         vn = verbnames;
3908         if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
3909           &errorcode, cb)) goto FAILED;
3910         if (ptr >= ptrend || (*ptr != CHAR_COLON &&
3911                               *ptr != CHAR_RIGHT_PARENTHESIS))
3912           {
3913           errorcode = ERR60;  /* Malformed */
3914           goto FAILED;
3915           }
3916 
3917         /* Scan the table of verb names */
3918 
3919         for (i = 0; i < verbcount; i++)
3920           {
3921           if (namelen == verbs[i].len &&
3922               PRIV(strncmp_c8)(name, vn, namelen) == 0)
3923             break;
3924           vn += verbs[i].len + 1;
3925           }
3926 
3927         if (i >= verbcount)
3928           {
3929           errorcode = ERR60;  /* Verb not recognized */
3930           goto FAILED;
3931           }
3932 
3933         /* An empty argument is treated as no argument. */
3934 
3935         if (*ptr == CHAR_COLON && ptr + 1 < ptrend &&
3936              ptr[1] == CHAR_RIGHT_PARENTHESIS)
3937           ptr++;    /* Advance to the closing parens */
3938 
3939         /* Check for mandatory non-empty argument; this is (*MARK) */
3940 
3941         if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON)
3942           {
3943           errorcode = ERR66;
3944           goto FAILED;
3945           }
3946 
3947         /* Remember where this verb, possibly with a preceding (*MARK), starts,
3948         for handling quantified (*ACCEPT). */
3949 
3950         verbstartptr = parsed_pattern;
3951         okquantifier = (verbs[i].meta == META_ACCEPT);
3952 
3953         /* It appears that Perl allows any characters whatsoever, other than a
3954         closing parenthesis, to appear in arguments ("names"), so we no longer
3955         insist on letters, digits, and underscores. Perl does not, however, do
3956         any interpretation within arguments, and has no means of including a
3957         closing parenthesis. PCRE supports escape processing but only when it
3958         is requested by an option. We set inverbname TRUE here, and let the
3959         main loop take care of this so that escape and \x processing is done by
3960         the main code above. */
3961 
3962         if (*ptr++ == CHAR_COLON)   /* Skip past : or ) */
3963           {
3964           /* Some optional arguments can be treated as a preceding (*MARK) */
3965 
3966           if (verbs[i].has_arg < 0)
3967             {
3968             add_after_mark = verbs[i].meta;
3969             *parsed_pattern++ = META_MARK;
3970             }
3971 
3972           /* The remaining verbs with arguments (except *MARK) need a different
3973           opcode. */
3974 
3975           else
3976             {
3977             *parsed_pattern++ = verbs[i].meta +
3978               ((verbs[i].meta != META_MARK)? 0x00010000u:0);
3979             }
3980 
3981           /* Set up for reading the name in the main loop. */
3982 
3983           verblengthptr = parsed_pattern++;
3984           verbnamestart = ptr;
3985           inverbname = TRUE;
3986           }
3987         else  /* No verb "name" argument */
3988           {
3989           *parsed_pattern++ = verbs[i].meta;
3990           }
3991         }     /* End of (*VERB) handling */
3992       break;  /* Done with this parenthesis */
3993       }       /* End of groups that don't start with (? */
3994 
3995 
3996     /* ---- Items starting (? ---- */
3997 
3998     /* The type of item is determined by what follows (?. Handle (?| and option
3999     changes under "default" because both need a new block on the nest stack.
4000     Comments starting with (?# are handled above. Note that there is some
4001     ambiguity about the sequence (?- because if a digit follows it's a relative
4002     recursion or subroutine call whereas otherwise it's an option unsetting. */
4003 
4004     if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4005 
4006     switch(*ptr)
4007       {
4008       default:
4009       if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1]))
4010         goto RECURSION_BYNUMBER;  /* The + case is handled by CHAR_PLUS */
4011 
4012       /* We now have either (?| or a (possibly empty) option setting,
4013       optionally followed by a non-capturing group. */
4014 
4015       nest_depth++;
4016       if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4017       else if (++top_nest >= end_nests)
4018         {
4019         errorcode = ERR84;
4020         goto FAILED;
4021         }
4022       top_nest->nest_depth = nest_depth;
4023       top_nest->flags = 0;
4024       top_nest->options = options & PARSE_TRACKED_OPTIONS;
4025 
4026       /* Start of non-capturing group that resets the capture count for each
4027       branch. */
4028 
4029       if (*ptr == CHAR_VERTICAL_LINE)
4030         {
4031         top_nest->reset_group = (uint16_t)cb->bracount;
4032         top_nest->max_group = (uint16_t)cb->bracount;
4033         top_nest->flags |= NSF_RESET;
4034         cb->external_flags |= PCRE2_DUPCAPUSED;
4035         *parsed_pattern++ = META_NOCAPTURE;
4036         ptr++;
4037         }
4038 
4039       /* Scan for options imnsxJU to be set or unset. */
4040 
4041       else
4042         {
4043         BOOL hyphenok = TRUE;
4044         uint32_t oldoptions = options;
4045 
4046         top_nest->reset_group = 0;
4047         top_nest->max_group = 0;
4048         set = unset = 0;
4049         optset = &set;
4050 
4051         /* ^ at the start unsets imnsx and disables the subsequent use of - */
4052 
4053         if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)
4054           {
4055           options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE|
4056                        PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE);
4057           hyphenok = FALSE;
4058           ptr++;
4059           }
4060 
4061         while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&
4062                                *ptr != CHAR_COLON)
4063           {
4064           switch (*ptr++)
4065             {
4066             case CHAR_MINUS:
4067             if (!hyphenok)
4068               {
4069               errorcode = ERR94;
4070               ptr--;  /* Correct the offset */
4071               goto FAILED;
4072               }
4073             optset = &unset;
4074             hyphenok = FALSE;
4075             break;
4076 
4077             case CHAR_J:  /* Record that it changed in the external options */
4078             *optset |= PCRE2_DUPNAMES;
4079             cb->external_flags |= PCRE2_JCHANGED;
4080             break;
4081 
4082             case CHAR_i: *optset |= PCRE2_CASELESS; break;
4083             case CHAR_m: *optset |= PCRE2_MULTILINE; break;
4084             case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break;
4085             case CHAR_s: *optset |= PCRE2_DOTALL; break;
4086             case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
4087 
4088             /* If x appears twice it sets the extended extended option. */
4089 
4090             case CHAR_x:
4091             *optset |= PCRE2_EXTENDED;
4092             if (ptr < ptrend && *ptr == CHAR_x)
4093               {
4094               *optset |= PCRE2_EXTENDED_MORE;
4095               ptr++;
4096               }
4097             break;
4098 
4099             default:
4100             errorcode = ERR11;
4101             ptr--;    /* Correct the offset */
4102             goto FAILED;
4103             }
4104           }
4105 
4106         /* If we are setting extended without extended-more, ensure that any
4107         existing extended-more gets unset. Also, unsetting extended must also
4108         unset extended-more. */
4109 
4110         if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED ||
4111             (unset & PCRE2_EXTENDED) != 0)
4112           unset |= PCRE2_EXTENDED_MORE;
4113 
4114         options = (options | set) & (~unset);
4115 
4116         /* If the options ended with ')' this is not the start of a nested
4117         group with option changes, so the options change at this level.
4118         In this case, if the previous level set up a nest block, discard the
4119         one we have just created. Otherwise adjust it for the previous level.
4120         If the options ended with ':' we are starting a non-capturing group,
4121         possibly with an options setting. */
4122 
4123         if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4124         if (*ptr++ == CHAR_RIGHT_PARENTHESIS)
4125           {
4126           nest_depth--;  /* This is not a nested group after all. */
4127           if (top_nest > (nest_save *)(cb->start_workspace) &&
4128               (top_nest-1)->nest_depth == nest_depth) top_nest--;
4129           else top_nest->nest_depth = nest_depth;
4130           }
4131         else *parsed_pattern++ = META_NOCAPTURE;
4132 
4133         /* If nothing changed, no need to record. */
4134 
4135         if (options != oldoptions)
4136           {
4137           *parsed_pattern++ = META_OPTIONS;
4138           *parsed_pattern++ = options;
4139           }
4140         }     /* End options processing */
4141       break;  /* End default case after (? */
4142 
4143 
4144       /* ---- Python syntax support ---- */
4145 
4146       case CHAR_P:
4147       if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4148 
4149       /* (?P<name> is the same as (?<name>, which defines a named group. */
4150 
4151       if (*ptr == CHAR_LESS_THAN_SIGN)
4152         {
4153         terminator = CHAR_GREATER_THAN_SIGN;
4154         goto DEFINE_NAME;
4155         }
4156 
4157       /* (?P>name) is the same as (?&name), which is a recursion or subroutine
4158       call. */
4159 
4160       if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME;
4161 
4162       /* (?P=name) is the same as \k<name>, a back reference by name. Anything
4163       else after (?P is an error. */
4164 
4165       if (*ptr != CHAR_EQUALS_SIGN)
4166         {
4167         errorcode = ERR41;
4168         goto FAILED;
4169         }
4170       if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4171           &namelen, &errorcode, cb)) goto FAILED;
4172       *parsed_pattern++ = META_BACKREF_BYNAME;
4173       *parsed_pattern++ = namelen;
4174       PUTOFFSET(offset, parsed_pattern);
4175       okquantifier = TRUE;
4176       break;   /* End of (?P processing */
4177 
4178 
4179       /* ---- Recursion/subroutine calls by number ---- */
4180 
4181       case CHAR_R:
4182       i = 0;         /* (?R) == (?R0) */
4183       ptr++;
4184       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4185         {
4186         errorcode = ERR58;
4187         goto FAILED;
4188         }
4189       goto SET_RECURSION;
4190 
4191       /* An item starting (?- followed by a digit comes here via the "default"
4192       case because (?- followed by a non-digit is an options setting. */
4193 
4194       case CHAR_PLUS:
4195       if (ptrend - ptr < 2 || !IS_DIGIT(ptr[1]))
4196         {
4197         errorcode = ERR29;   /* Missing number */
4198         goto FAILED;
4199         }
4200       /* Fall through */
4201 
4202       case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
4203       case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
4204       RECURSION_BYNUMBER:
4205       if (!read_number(&ptr, ptrend,
4206           (IS_DIGIT(*ptr))? -1:(int)(cb->bracount), /* + and - are relative */
4207           MAX_GROUP_NUMBER, ERR61,
4208           &i, &errorcode)) goto FAILED;
4209       if (i < 0)  /* NB (?0) is permitted */
4210         {
4211         errorcode = ERR15;   /* Unknown group */
4212         goto FAILED_BACK;
4213         }
4214       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4215         goto UNCLOSED_PARENTHESIS;
4216 
4217       SET_RECURSION:
4218       *parsed_pattern++ = META_RECURSE | (uint32_t)i;
4219       offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4220       ptr++;
4221       PUTOFFSET(offset, parsed_pattern);
4222       okquantifier = TRUE;
4223       break;  /* End of recursive call by number handling */
4224 
4225 
4226       /* ---- Recursion/subroutine calls by name ---- */
4227 
4228       case CHAR_AMPERSAND:
4229       RECURSE_BY_NAME:
4230       if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4231           &namelen, &errorcode, cb)) goto FAILED;
4232       *parsed_pattern++ = META_RECURSE_BYNAME;
4233       *parsed_pattern++ = namelen;
4234       PUTOFFSET(offset, parsed_pattern);
4235       okquantifier = TRUE;
4236       break;
4237 
4238       /* ---- Callout with numerical or string argument ---- */
4239 
4240       case CHAR_C:
4241       if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4242 
4243       /* If the previous item was a condition starting (?(? an assertion,
4244       optionally preceded by a callout, is expected. This is checked later on,
4245       during actual compilation. However we need to identify this kind of
4246       assertion in this pass because it must not be qualified. The value of
4247       expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4248       for a callout - still leaving a positive value that identifies the
4249       assertion. Multiple callouts or any other items will make it zero or
4250       less, which doesn't matter because they will cause an error later. */
4251 
4252       expect_cond_assert = prev_expect_cond_assert - 1;
4253 
4254       /* If previous_callout is not NULL, it means this follows a previous
4255       callout. If it was a manual callout, do nothing; this means its "length
4256       of next pattern item" field will remain zero. If it was an automatic
4257       callout, abolish it. */
4258 
4259       if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 &&
4260           previous_callout == parsed_pattern - 4 &&
4261           parsed_pattern[-1] == 255)
4262         parsed_pattern = previous_callout;
4263 
4264       /* Save for updating next pattern item length, and skip one item before
4265       completing. */
4266 
4267       previous_callout = parsed_pattern;
4268       after_manual_callout = 1;
4269 
4270       /* Handle a string argument; specific delimiter is required. */
4271 
4272       if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
4273         {
4274         PCRE2_SIZE calloutlength;
4275         PCRE2_SPTR startptr = ptr;
4276 
4277         delimiter = 0;
4278         for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
4279           {
4280           if (*ptr == PRIV(callout_start_delims)[i])
4281             {
4282             delimiter = PRIV(callout_end_delims)[i];
4283             break;
4284             }
4285           }
4286         if (delimiter == 0)
4287           {
4288           errorcode = ERR82;
4289           goto FAILED;
4290           }
4291 
4292         *parsed_pattern = META_CALLOUT_STRING;
4293         parsed_pattern += 3;   /* Skip pattern info */
4294 
4295         for (;;)
4296           {
4297           if (++ptr >= ptrend)
4298             {
4299             errorcode = ERR81;
4300             ptr = startptr;   /* To give a more useful message */
4301             goto FAILED;
4302             }
4303           if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter))
4304             break;
4305           }
4306 
4307         calloutlength = (PCRE2_SIZE)(ptr - startptr);
4308         if (calloutlength > UINT32_MAX)
4309           {
4310           errorcode = ERR72;
4311           goto FAILED;
4312           }
4313         *parsed_pattern++ = (uint32_t)calloutlength;
4314         offset = (PCRE2_SIZE)(startptr - cb->start_pattern);
4315         PUTOFFSET(offset, parsed_pattern);
4316         }
4317 
4318       /* Handle a callout with an optional numerical argument, which must be
4319       less than or equal to 255. A missing argument gives 0. */
4320 
4321       else
4322         {
4323         int n = 0;
4324         *parsed_pattern = META_CALLOUT_NUMBER;     /* Numerical callout */
4325         parsed_pattern += 3;                       /* Skip pattern info */
4326         while (ptr < ptrend && IS_DIGIT(*ptr))
4327           {
4328           n = n * 10 + *ptr++ - CHAR_0;
4329           if (n > 255)
4330             {
4331             errorcode = ERR38;
4332             goto FAILED;
4333             }
4334           }
4335         *parsed_pattern++ = n;
4336         }
4337 
4338       /* Both formats must have a closing parenthesis */
4339 
4340       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4341         {
4342         errorcode = ERR39;
4343         goto FAILED;
4344         }
4345       ptr++;
4346 
4347       /* Remember the offset to the next item in the pattern, and set a default
4348       length. This should get updated after the next item is read. */
4349 
4350       previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
4351       previous_callout[2] = 0;
4352       break;                  /* End callout */
4353 
4354 
4355       /* ---- Conditional group ---- */
4356 
4357       /* A condition can be an assertion, a number (referring to a numbered
4358       group's having been set), a name (referring to a named group), or 'R',
4359       referring to overall recursion. R<digits> and R&name are also permitted
4360       for recursion state tests. Numbers may be preceded by + or - to specify a
4361       relative group number.
4362 
4363       There are several syntaxes for testing a named group: (?(name)) is used
4364       by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4365 
4366       There are two unfortunate ambiguities. 'R' can be the recursive thing or
4367       the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be
4368       the Perl DEFINE feature or the Python named test. We look for a name
4369       first; if not found, we try the other case.
4370 
4371       For compatibility with auto-callouts, we allow a callout to be specified
4372       before a condition that is an assertion. */
4373 
4374       case CHAR_LEFT_PARENTHESIS:
4375       if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4376       nest_depth++;
4377 
4378       /* If the next character is ? or * there must be an assertion next
4379       (optionally preceded by a callout). We do not check this here, but
4380       instead we set expect_cond_assert to 2. If this is still greater than
4381       zero (callouts decrement it) when the next assertion is read, it will be
4382       marked as a condition that must not be repeated. A value greater than
4383       zero also causes checking that an assertion (possibly with callout)
4384       follows. */
4385 
4386       if (*ptr == CHAR_QUESTION_MARK || *ptr == CHAR_ASTERISK)
4387         {
4388         *parsed_pattern++ = META_COND_ASSERT;
4389         ptr--;   /* Pull pointer back to the opening parenthesis. */
4390         expect_cond_assert = 2;
4391         break;  /* End of conditional */
4392         }
4393 
4394       /* Handle (?([+-]number)... */
4395 
4396       if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
4397           &errorcode))
4398         {
4399         if (i <= 0)
4400           {
4401           errorcode = ERR15;
4402           goto FAILED;
4403           }
4404         *parsed_pattern++ = META_COND_NUMBER;
4405         offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4406         PUTOFFSET(offset, parsed_pattern);
4407         *parsed_pattern++ = i;
4408         }
4409       else if (errorcode != 0) goto FAILED;   /* Number too big */
4410 
4411       /* No number found. Handle the special case (?(VERSION[>]=n.m)... */
4412 
4413       else if (ptrend - ptr >= 10 &&
4414                PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
4415                ptr[7] != CHAR_RIGHT_PARENTHESIS)
4416         {
4417         uint32_t ge = 0;
4418         int major = 0;
4419         int minor = 0;
4420 
4421         ptr += 7;
4422         if (*ptr == CHAR_GREATER_THAN_SIGN)
4423           {
4424           ge = 1;
4425           ptr++;
4426           }
4427 
4428         /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
4429         references its argument twice. */
4430 
4431         if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
4432           goto BAD_VERSION_CONDITION;
4433 
4434         if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode))
4435           goto FAILED;
4436 
4437         if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4438         if (*ptr == CHAR_DOT)
4439           {
4440           if (++ptr >= ptrend || !IS_DIGIT(*ptr)) goto BAD_VERSION_CONDITION;
4441           minor = (*ptr++ - CHAR_0) * 10;
4442           if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4443           if (IS_DIGIT(*ptr)) minor += *ptr++ - CHAR_0;
4444           if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4445             goto BAD_VERSION_CONDITION;
4446           }
4447 
4448         *parsed_pattern++ = META_COND_VERSION;
4449         *parsed_pattern++ = ge;
4450         *parsed_pattern++ = major;
4451         *parsed_pattern++ = minor;
4452         }
4453 
4454       /* All the remaining cases now require us to read a name. We cannot at
4455       this stage distinguish ambiguous cases such as (?(R12) which might be a
4456       recursion test by number or a name, because the named groups have not yet
4457       all been identified. Those cases are treated as names, but given a
4458       different META code. */
4459 
4460       else
4461         {
4462         BOOL was_r_ampersand = FALSE;
4463 
4464         if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND)
4465           {
4466           terminator = CHAR_RIGHT_PARENTHESIS;
4467           was_r_ampersand = TRUE;
4468           ptr++;
4469           }
4470         else if (*ptr == CHAR_LESS_THAN_SIGN)
4471           terminator = CHAR_GREATER_THAN_SIGN;
4472         else if (*ptr == CHAR_APOSTROPHE)
4473           terminator = CHAR_APOSTROPHE;
4474         else
4475           {
4476           terminator = CHAR_RIGHT_PARENTHESIS;
4477           ptr--;   /* Point to char before name */
4478           }
4479         if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4480             &errorcode, cb)) goto FAILED;
4481 
4482         /* Handle (?(R&name) */
4483 
4484         if (was_r_ampersand)
4485           {
4486           *parsed_pattern = META_COND_RNAME;
4487           ptr--;   /* Back to closing parens */
4488           }
4489 
4490         /* Handle (?(name). If the name is "DEFINE" we identify it with a
4491         special code. Likewise if the name consists of R followed only by
4492         digits. Otherwise, handle it like a quoted name. */
4493 
4494         else if (terminator == CHAR_RIGHT_PARENTHESIS)
4495           {
4496           if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
4497             *parsed_pattern = META_COND_DEFINE;
4498           else
4499             {
4500             for (i = 1; i < (int)namelen; i++)
4501               if (!IS_DIGIT(name[i])) break;
4502             *parsed_pattern = (*name == CHAR_R && i >= (int)namelen)?
4503               META_COND_RNUMBER : META_COND_NAME;
4504             }
4505           ptr--;   /* Back to closing parens */
4506           }
4507 
4508         /* Handle (?('name') or (?(<name>) */
4509 
4510         else *parsed_pattern = META_COND_NAME;
4511 
4512         /* All these cases except DEFINE end with the name length and offset;
4513         DEFINE just has an offset (for the "too many branches" error). */
4514 
4515         if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen;
4516         PUTOFFSET(offset, parsed_pattern);
4517         }  /* End cases that read a name */
4518 
4519       /* Check the closing parenthesis of the condition */
4520 
4521       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4522         {
4523         errorcode = ERR24;
4524         goto FAILED;
4525         }
4526       ptr++;
4527       break;  /* End of condition processing */
4528 
4529 
4530       /* ---- Atomic group ---- */
4531 
4532       case CHAR_GREATER_THAN_SIGN:
4533       ATOMIC_GROUP:                          /* Come from (*atomic: */
4534       *parsed_pattern++ = META_ATOMIC;
4535       nest_depth++;
4536       ptr++;
4537       break;
4538 
4539 
4540       /* ---- Lookahead assertions ---- */
4541 
4542       case CHAR_EQUALS_SIGN:
4543       POSITIVE_LOOK_AHEAD:                   /* Come from (*pla: */
4544       *parsed_pattern++ = META_LOOKAHEAD;
4545       ptr++;
4546       goto POST_ASSERTION;
4547 
4548       case CHAR_ASTERISK:
4549       POSITIVE_NONATOMIC_LOOK_AHEAD:         /* Come from (?* */
4550       *parsed_pattern++ = META_LOOKAHEAD_NA;
4551       ptr++;
4552       goto POST_ASSERTION;
4553 
4554       case CHAR_EXCLAMATION_MARK:
4555       NEGATIVE_LOOK_AHEAD:                   /* Come from (*nla: */
4556       *parsed_pattern++ = META_LOOKAHEADNOT;
4557       ptr++;
4558       goto POST_ASSERTION;
4559 
4560 
4561       /* ---- Lookbehind assertions ---- */
4562 
4563       /* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?<
4564       is the start of the name of a capturing group. */
4565 
4566       case CHAR_LESS_THAN_SIGN:
4567       if (ptrend - ptr <= 1 ||
4568          (ptr[1] != CHAR_EQUALS_SIGN &&
4569           ptr[1] != CHAR_EXCLAMATION_MARK &&
4570           ptr[1] != CHAR_ASTERISK))
4571         {
4572         terminator = CHAR_GREATER_THAN_SIGN;
4573         goto DEFINE_NAME;
4574         }
4575       *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
4576         META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)?
4577         META_LOOKBEHINDNOT : META_LOOKBEHIND_NA;
4578 
4579       POST_LOOKBEHIND:           /* Come from (*plb: (*naplb: and (*nlb: */
4580       *has_lookbehind = TRUE;
4581       offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4582       PUTOFFSET(offset, parsed_pattern);
4583       ptr += 2;
4584       /* Fall through */
4585 
4586       /* If the previous item was a condition starting (?(? an assertion,
4587       optionally preceded by a callout, is expected. This is checked later on,
4588       during actual compilation. However we need to identify this kind of
4589       assertion in this pass because it must not be qualified. The value of
4590       expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4591       for a callout - still leaving a positive value that identifies the
4592       assertion. Multiple callouts or any other items will make it zero or
4593       less, which doesn't matter because they will cause an error later. */
4594 
4595       POST_ASSERTION:
4596       nest_depth++;
4597       if (prev_expect_cond_assert > 0)
4598         {
4599         if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4600         else if (++top_nest >= end_nests)
4601           {
4602           errorcode = ERR84;
4603           goto FAILED;
4604           }
4605         top_nest->nest_depth = nest_depth;
4606         top_nest->flags = NSF_CONDASSERT;
4607         top_nest->options = options & PARSE_TRACKED_OPTIONS;
4608         }
4609       break;
4610 
4611 
4612       /* ---- Define a named group ---- */
4613 
4614       /* A named group may be defined as (?'name') or (?<name>). In the latter
4615       case we jump to DEFINE_NAME from the disambiguation of (?< above with the
4616       terminator set to '>'. */
4617 
4618       case CHAR_APOSTROPHE:
4619       terminator = CHAR_APOSTROPHE;    /* Terminator */
4620 
4621       DEFINE_NAME:
4622       if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4623           &errorcode, cb)) goto FAILED;
4624 
4625       /* We have a name for this capturing group. It is also assigned a number,
4626       which is its primary means of identification. */
4627 
4628       if (cb->bracount >= MAX_GROUP_NUMBER)
4629         {
4630         errorcode = ERR97;
4631         goto FAILED;
4632         }
4633       cb->bracount++;
4634       *parsed_pattern++ = META_CAPTURE | cb->bracount;
4635       nest_depth++;
4636 
4637       /* Check not too many names */
4638 
4639       if (cb->names_found >= MAX_NAME_COUNT)
4640         {
4641         errorcode = ERR49;
4642         goto FAILED;
4643         }
4644 
4645       /* Adjust the entry size to accommodate the longest name found. */
4646 
4647       if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
4648         cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
4649 
4650       /* Scan the list to check for duplicates. For duplicate names, if the
4651       number is the same, break the loop, which causes the name to be
4652       discarded; otherwise, if DUPNAMES is not set, give an error.
4653       If it is set, allow the name with a different number, but continue
4654       scanning in case this is a duplicate with the same number. For
4655       non-duplicate names, give an error if the number is duplicated. */
4656 
4657       isdupname = FALSE;
4658       ng = cb->named_groups;
4659       for (i = 0; i < cb->names_found; i++, ng++)
4660         {
4661         if (namelen == ng->length &&
4662             PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0)
4663           {
4664           if (ng->number == cb->bracount) break;
4665           if ((options & PCRE2_DUPNAMES) == 0)
4666             {
4667             errorcode = ERR43;
4668             goto FAILED;
4669             }
4670           isdupname = ng->isdup = TRUE;     /* Mark as a duplicate */
4671           cb->dupnames = TRUE;              /* Duplicate names exist */
4672           }
4673         else if (ng->number == cb->bracount)
4674           {
4675           errorcode = ERR65;
4676           goto FAILED;
4677           }
4678         }
4679 
4680       if (i < cb->names_found) break;   /* Ignore duplicate with same number */
4681 
4682       /* Increase the list size if necessary */
4683 
4684       if (cb->names_found >= cb->named_group_list_size)
4685         {
4686         uint32_t newsize = cb->named_group_list_size * 2;
4687         named_group *newspace =
4688           cb->cx->memctl.malloc(newsize * sizeof(named_group),
4689           cb->cx->memctl.memory_data);
4690         if (newspace == NULL)
4691           {
4692           errorcode = ERR21;
4693           goto FAILED;
4694           }
4695 
4696         memcpy(newspace, cb->named_groups,
4697           cb->named_group_list_size * sizeof(named_group));
4698         if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
4699           cb->cx->memctl.free((void *)cb->named_groups,
4700           cb->cx->memctl.memory_data);
4701         cb->named_groups = newspace;
4702         cb->named_group_list_size = newsize;
4703         }
4704 
4705       /* Add this name to the list */
4706 
4707       cb->named_groups[cb->names_found].name = name;
4708       cb->named_groups[cb->names_found].length = (uint16_t)namelen;
4709       cb->named_groups[cb->names_found].number = cb->bracount;
4710       cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname;
4711       cb->names_found++;
4712       break;
4713       }        /* End of (? switch */
4714     break;     /* End of ( handling */
4715 
4716 
4717     /* ---- Branch terminators ---- */
4718 
4719     /* Alternation: reset the capture count if we are in a (?| group. */
4720 
4721     case CHAR_VERTICAL_LINE:
4722     if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
4723         (top_nest->flags & NSF_RESET) != 0)
4724       {
4725       if (cb->bracount > top_nest->max_group)
4726         top_nest->max_group = (uint16_t)cb->bracount;
4727       cb->bracount = top_nest->reset_group;
4728       }
4729     *parsed_pattern++ = META_ALT;
4730     break;
4731 
4732     /* End of group; reset the capture count to the maximum if we are in a (?|
4733     group and/or reset the options that are tracked during parsing. Disallow
4734     quantifier for a condition that is an assertion. */
4735 
4736     case CHAR_RIGHT_PARENTHESIS:
4737     okquantifier = TRUE;
4738     if (top_nest != NULL && top_nest->nest_depth == nest_depth)
4739       {
4740       options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options;
4741       if ((top_nest->flags & NSF_RESET) != 0 &&
4742           top_nest->max_group > cb->bracount)
4743         cb->bracount = top_nest->max_group;
4744       if ((top_nest->flags & NSF_CONDASSERT) != 0)
4745         okquantifier = FALSE;
4746 
4747       if ((top_nest->flags & NSF_ATOMICSR) != 0)
4748         {
4749         *parsed_pattern++ = META_KET;
4750         }
4751 
4752       if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
4753         else top_nest--;
4754       }
4755     if (nest_depth == 0)    /* Unmatched closing parenthesis */
4756       {
4757       errorcode = ERR22;
4758       goto FAILED_BACK;
4759       }
4760     nest_depth--;
4761     *parsed_pattern++ = META_KET;
4762     break;
4763     }  /* End of switch on pattern character */
4764   }    /* End of main character scan loop */
4765 
4766 /* End of pattern reached. Check for missing ) at the end of a verb name. */
4767 
4768 if (inverbname && ptr >= ptrend)
4769   {
4770   errorcode = ERR60;
4771   goto FAILED;
4772   }
4773 
4774 /* Manage callout for the final item */
4775 
4776 PARSED_END:
4777 parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
4778   parsed_pattern, cb);
4779 
4780 /* Insert trailing items for word and line matching (features provided for the
4781 benefit of pcre2grep). */
4782 
4783 if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
4784   {
4785   *parsed_pattern++ = META_KET;
4786   *parsed_pattern++ = META_DOLLAR;
4787   }
4788 else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
4789   {
4790   *parsed_pattern++ = META_KET;
4791   *parsed_pattern++ = META_ESCAPE + ESC_b;
4792   }
4793 
4794 /* Terminate the parsed pattern, then return success if all groups are closed.
4795 Otherwise we have unclosed parentheses. */
4796 
4797 if (parsed_pattern >= parsed_pattern_end)
4798   {
4799   errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
4800   goto FAILED;
4801   }
4802 
4803 *parsed_pattern = META_END;
4804 if (nest_depth == 0) return 0;
4805 
4806 UNCLOSED_PARENTHESIS:
4807 errorcode = ERR14;
4808 
4809 /* Come here for all failures. */
4810 
4811 FAILED:
4812 cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4813 return errorcode;
4814 
4815 /* Some errors need to indicate the previous character. */
4816 
4817 FAILED_BACK:
4818 ptr--;
4819 goto FAILED;
4820 
4821 /* This failure happens several times. */
4822 
4823 BAD_VERSION_CONDITION:
4824 errorcode = ERR79;
4825 goto FAILED;
4826 }
4827 
4828 
4829 
4830 /*************************************************
4831 *       Find first significant opcode            *
4832 *************************************************/
4833 
4834 /* This is called by several functions that scan a compiled expression looking
4835 for a fixed first character, or an anchoring opcode etc. It skips over things
4836 that do not influence this. For some calls, it makes sense to skip negative
4837 forward and all backward assertions, and also the \b assertion; for others it
4838 does not.
4839 
4840 Arguments:
4841   code         pointer to the start of the group
4842   skipassert   TRUE if certain assertions are to be skipped
4843 
4844 Returns:       pointer to the first significant opcode
4845 */
4846 
4847 static const PCRE2_UCHAR*
first_significant_code(PCRE2_SPTR code,BOOL skipassert)4848 first_significant_code(PCRE2_SPTR code, BOOL skipassert)
4849 {
4850 for (;;)
4851   {
4852   switch ((int)*code)
4853     {
4854     case OP_ASSERT_NOT:
4855     case OP_ASSERTBACK:
4856     case OP_ASSERTBACK_NOT:
4857     case OP_ASSERTBACK_NA:
4858     if (!skipassert) return code;
4859     do code += GET(code, 1); while (*code == OP_ALT);
4860     code += PRIV(OP_lengths)[*code];
4861     break;
4862 
4863     case OP_WORD_BOUNDARY:
4864     case OP_NOT_WORD_BOUNDARY:
4865     if (!skipassert) return code;
4866     /* Fall through */
4867 
4868     case OP_CALLOUT:
4869     case OP_CREF:
4870     case OP_DNCREF:
4871     case OP_RREF:
4872     case OP_DNRREF:
4873     case OP_FALSE:
4874     case OP_TRUE:
4875     code += PRIV(OP_lengths)[*code];
4876     break;
4877 
4878     case OP_CALLOUT_STR:
4879     code += GET(code, 1 + 2*LINK_SIZE);
4880     break;
4881 
4882     case OP_SKIPZERO:
4883     code += 2 + GET(code, 2) + LINK_SIZE;
4884     break;
4885 
4886     case OP_COND:
4887     case OP_SCOND:
4888     if (code[1+LINK_SIZE] != OP_FALSE ||   /* Not DEFINE */
4889         code[GET(code, 1)] != OP_KET)      /* More than one branch */
4890       return code;
4891     code += GET(code, 1) + 1 + LINK_SIZE;
4892     break;
4893 
4894     case OP_MARK:
4895     case OP_COMMIT_ARG:
4896     case OP_PRUNE_ARG:
4897     case OP_SKIP_ARG:
4898     case OP_THEN_ARG:
4899     code += code[1] + PRIV(OP_lengths)[*code];
4900     break;
4901 
4902     default:
4903     return code;
4904     }
4905   }
4906 /* Control never reaches here */
4907 }
4908 
4909 
4910 
4911 #ifdef SUPPORT_UNICODE
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range in UCP mode. It
searches up the characters, looking for ranges of characters in the "other"
case. Each call returns the next one, updating the start address. A character
with multiple other cases is returned on its own with a special return value.

The scans are written so that the working character is never incremented past
the end of the input range inside a loop. With a plain "c <= d; c++" loop an
end value of 0xffffffff can never make the condition false: the increment wraps
round to zero and the scan starts again from the bottom of the code space.
NOTE(review): whether a caller can pass 0xffffffff as the end value is not
visible here (it presumably needs 32-bit code units without UTF) - confirm.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        -1 when no more (nothing is written through the pointers)
               0 when a range is returned
              >0 the CASESET offset for char with multiple other cases
                in this case, ocptr contains the original
*/

static int
get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr,
  uint32_t *odptr)
{
uint32_t c, othercase, next;
unsigned int co;

c = *cptr;
if (c > d) return -1;  /* Nothing left in the input range */

/* Find the first character that has an other case. If it has multiple other
cases, return its case offset value. */

for (;; c++)
  {
  if ((co = UCD_CASESET(c)) != 0)
    {
    *ocptr = c;        /* Character that has the set */
    *cptr = c + 1;     /* Rest of input range */
    return (int)co;
    }
  if ((othercase = UCD_OTHERCASE(c)) != c) break;

  /* The end-of-range test comes before the increment so that c cannot wrap
  when d is the largest uint32_t value. */

  if (c == d) return -1;
  }

/* Found a character that has a single other case. Search for the end of the
range, which is either the end of the input range, or a character that has a
case set, or one whose other case does not continue the run. Here c is always
the last character accepted into the run, so only c + 1 <= d is ever looked
up. */

*ocptr = othercase;
next = othercase + 1;

while (c < d)
  {
  uint32_t ahead = c + 1;
  if (UCD_CASESET(ahead) != 0 || UCD_OTHERCASE(ahead) != next) break;
  c = ahead;
  next++;
  }

*odptr = next - 1;     /* End of othercase range */
*cptr = c + 1;         /* Rest of input range */
return 0;
}
4973 #endif  /* SUPPORT_UNICODE */
4974 
4975 
4976 
4977 /*************************************************
4978 * Add a character or range to a class (internal) *
4979 *************************************************/
4980 
4981 /* This function packages up the logic of adding a character or range of
4982 characters to a class. The character values in the arguments will be within the
4983 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
4984 called only from within the "add to class" group of functions, some of which
4985 are recursive and mutually recursive. The external entry point is
4986 add_to_class().
4987 
4988 Arguments:
4989   classbits     the bit map for characters < 256
4990   uchardptr     points to the pointer for extra data
4991   options       the options word
4992   cb            compile data
4993   start         start of range character
4994   end           end of range character
4995 
4996 Returns:        the number of < 256 characters added
4997                 the pointer to extra data is updated
4998 */
4999 
5000 static unsigned int
add_to_class_internal(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,uint32_t start,uint32_t end)5001 add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5002   uint32_t options, compile_block *cb, uint32_t start, uint32_t end)
5003 {
5004 uint32_t c;
5005 uint32_t classbits_end = (end <= 0xff ? end : 0xff);
5006 unsigned int n8 = 0;
5007 
5008 /* If caseless matching is required, scan the range and process alternate
5009 cases. In Unicode, there are 8-bit characters that have alternate cases that
5010 are greater than 255 and vice-versa. Sometimes we can just extend the original
5011 range. */
5012 
5013 if ((options & PCRE2_CASELESS) != 0)
5014   {
5015 #ifdef SUPPORT_UNICODE
5016   if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
5017     {
5018     int rc;
5019     uint32_t oc, od;
5020 
5021     options &= ~PCRE2_CASELESS;   /* Remove for recursive calls */
5022     c = start;
5023 
5024     while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
5025       {
5026       /* Handle a single character that has more than one other case. */
5027 
5028       if (rc > 0) n8 += add_list_to_class_internal(classbits, uchardptr, options, cb,
5029         PRIV(ucd_caseless_sets) + rc, oc);
5030 
5031       /* Do nothing if the other case range is within the original range. */
5032 
5033       else if (oc >= cb->class_range_start && od <= cb->class_range_end) continue;
5034 
5035       /* Extend the original range if there is overlap, noting that if oc < c, we
5036       can't have od > end because a subrange is always shorter than the basic
5037       range. Otherwise, use a recursive call to add the additional range. */
5038 
5039       else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
5040       else if (od > end && oc <= end + 1)
5041         {
5042         end = od;       /* Extend upwards */
5043         if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
5044         }
5045       else n8 += add_to_class_internal(classbits, uchardptr, options, cb, oc, od);
5046       }
5047     }
5048   else
5049 #endif  /* SUPPORT_UNICODE */
5050 
5051   /* Not UTF mode */
5052 
5053   for (c = start; c <= classbits_end; c++)
5054     {
5055     SETBIT(classbits, cb->fcc[c]);
5056     n8++;
5057     }
5058   }
5059 
5060 /* Now handle the originally supplied range. Adjust the final value according
5061 to the bit length - this means that the same lists of (e.g.) horizontal spaces
5062 can be used in all cases. */
5063 
5064 if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR)
5065   end = MAX_NON_UTF_CHAR;
5066 
5067 if (start > cb->class_range_start && end < cb->class_range_end) return n8;
5068 
5069 /* Use the bitmap for characters < 256. Otherwise use extra data.*/
5070 
5071 for (c = start; c <= classbits_end; c++)
5072   {
5073   /* Regardless of start, c will always be <= 255. */
5074   SETBIT(classbits, c);
5075   n8++;
5076   }
5077 
5078 #ifdef SUPPORT_WIDE_CHARS
5079 if (start <= 0xff) start = 0xff + 1;
5080 
5081 if (end >= start)
5082   {
5083   PCRE2_UCHAR *uchardata = *uchardptr;
5084 
5085 #ifdef SUPPORT_UNICODE
5086   if ((options & PCRE2_UTF) != 0)
5087     {
5088     if (start < end)
5089       {
5090       *uchardata++ = XCL_RANGE;
5091       uchardata += PRIV(ord2utf)(start, uchardata);
5092       uchardata += PRIV(ord2utf)(end, uchardata);
5093       }
5094     else if (start == end)
5095       {
5096       *uchardata++ = XCL_SINGLE;
5097       uchardata += PRIV(ord2utf)(start, uchardata);
5098       }
5099     }
5100   else
5101 #endif  /* SUPPORT_UNICODE */
5102 
5103   /* Without UTF support, character values are constrained by the bit length,
5104   and can only be > 256 for 16-bit and 32-bit libraries. */
5105 
5106 #if PCRE2_CODE_UNIT_WIDTH == 8
5107     {}
5108 #else
5109   if (start < end)
5110     {
5111     *uchardata++ = XCL_RANGE;
5112     *uchardata++ = start;
5113     *uchardata++ = end;
5114     }
5115   else if (start == end)
5116     {
5117     *uchardata++ = XCL_SINGLE;
5118     *uchardata++ = start;
5119     }
5120 #endif  /* PCRE2_CODE_UNIT_WIDTH == 8 */
5121   *uchardptr = uchardata;   /* Updata extra data pointer */
5122   }
5123 #else  /* SUPPORT_WIDE_CHARS */
5124   (void)uchardptr;          /* Avoid compiler warning */
5125 #endif /* SUPPORT_WIDE_CHARS */
5126 
5127 return n8;    /* Number of 8-bit characters */
5128 }
5129 
5130 
5131 
5132 #ifdef SUPPORT_UNICODE
5133 /*************************************************
5134 * Add a list of characters to a class (internal) *
5135 *************************************************/
5136 
5137 /* This function is used for adding a list of case-equivalent characters to a
5138 class when in UTF mode. This function is called only from within
5139 add_to_class_internal(), with which it is mutually recursive.
5140 
5141 Arguments:
5142   classbits     the bit map for characters < 256
5143   uchardptr     points to the pointer for extra data
5144   options       the options word
5145   cb            contains pointers to tables etc.
5146   p             points to row of 32-bit values, terminated by NOTACHAR
5147   except        character to omit; this is used when adding lists of
5148                   case-equivalent characters to avoid including the one we
5149                   already know about
5150 
5151 Returns:        the number of < 256 characters added
5152                 the pointer to extra data is updated
5153 */
5154 
5155 static unsigned int
add_list_to_class_internal(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,const uint32_t * p,unsigned int except)5156 add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5157   uint32_t options, compile_block *cb, const uint32_t *p, unsigned int except)
5158 {
5159 unsigned int n8 = 0;
5160 while (p[0] < NOTACHAR)
5161   {
5162   unsigned int n = 0;
5163   if (p[0] != except)
5164     {
5165     while(p[n+1] == p[0] + n + 1) n++;
5166     n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]);
5167     }
5168   p += n + 1;
5169   }
5170 return n8;
5171 }
5172 #endif
5173 
5174 
5175 
5176 /*************************************************
5177 *   External entry point for add range to class  *
5178 *************************************************/
5179 
5180 /* This function sets the overall range so that the internal functions can try
5181 to avoid duplication when handling case-independence.
5182 
5183 Arguments:
5184   classbits     the bit map for characters < 256
5185   uchardptr     points to the pointer for extra data
5186   options       the options word
5187   cb            compile data
5188   start         start of range character
5189   end           end of range character
5190 
5191 Returns:        the number of < 256 characters added
5192                 the pointer to extra data is updated
5193 */
5194 
5195 static unsigned int
add_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,uint32_t start,uint32_t end)5196 add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5197   compile_block *cb, uint32_t start, uint32_t end)
5198 {
5199 cb->class_range_start = start;
5200 cb->class_range_end = end;
5201 return add_to_class_internal(classbits, uchardptr, options, cb, start, end);
5202 }
5203 
5204 
5205 /*************************************************
5206 *   External entry point for add list to class   *
5207 *************************************************/
5208 
5209 /* This function is used for adding a list of horizontal or vertical whitespace
5210 characters to a class. The list must be in order so that ranges of characters
5211 can be detected and handled appropriately. This function sets the overall range
5212 so that the internal functions can try to avoid duplication when handling
5213 case-independence.
5214 
5215 Arguments:
5216   classbits     the bit map for characters < 256
5217   uchardptr     points to the pointer for extra data
5218   options       the options word
5219   cb            contains pointers to tables etc.
5220   p             points to row of 32-bit values, terminated by NOTACHAR
5221   except        character to omit; this is used when adding lists of
5222                   case-equivalent characters to avoid including the one we
5223                   already know about
5224 
5225 Returns:        the number of < 256 characters added
5226                 the pointer to extra data is updated
5227 */
5228 
5229 static unsigned int
add_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,const uint32_t * p,unsigned int except)5230 add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5231   compile_block *cb, const uint32_t *p, unsigned int except)
5232 {
5233 unsigned int n8 = 0;
5234 while (p[0] < NOTACHAR)
5235   {
5236   unsigned int n = 0;
5237   if (p[0] != except)
5238     {
5239     while(p[n+1] == p[0] + n + 1) n++;
5240     cb->class_range_start = p[0];
5241     cb->class_range_end = p[n];
5242     n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]);
5243     }
5244   p += n + 1;
5245   }
5246 return n8;
5247 }
5248 
5249 
5250 
5251 /*************************************************
5252 *    Add characters not in a list to a class     *
5253 *************************************************/
5254 
5255 /* This function is used for adding the complement of a list of horizontal or
5256 vertical whitespace to a class. The list must be in order.
5257 
5258 Arguments:
5259   classbits     the bit map for characters < 256
5260   uchardptr     points to the pointer for extra data
5261   options       the options word
5262   cb            contains pointers to tables etc.
5263   p             points to row of 32-bit values, terminated by NOTACHAR
5264 
5265 Returns:        the number of < 256 characters added
5266                 the pointer to extra data is updated
5267 */
5268 
5269 static unsigned int
add_not_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,const uint32_t * p)5270 add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5271   uint32_t options, compile_block *cb, const uint32_t *p)
5272 {
5273 BOOL utf = (options & PCRE2_UTF) != 0;
5274 unsigned int n8 = 0;
5275 if (p[0] > 0)
5276   n8 += add_to_class(classbits, uchardptr, options, cb, 0, p[0] - 1);
5277 while (p[0] < NOTACHAR)
5278   {
5279   while (p[1] == p[0] + 1) p++;
5280   n8 += add_to_class(classbits, uchardptr, options, cb, p[0] + 1,
5281     (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
5282   p++;
5283   }
5284 return n8;
5285 }
5286 
5287 
5288 
5289 /*************************************************
5290 *    Find details of duplicate group names       *
5291 *************************************************/
5292 
5293 /* This is called from compile_branch() when it needs to know the index and
5294 count of duplicates in the names table when processing named backreferences,
5295 either directly, or as conditions.
5296 
5297 Arguments:
5298   name          points to the name
5299   length        the length of the name
5300   indexptr      where to put the index
5301   countptr      where to put the count of duplicates
5302   errorcodeptr  where to put an error code
5303   cb            the compile block
5304 
5305 Returns:        TRUE if OK, FALSE if not, error code set
5306 */
5307 
5308 static BOOL
find_dupname_details(PCRE2_SPTR name,uint32_t length,int * indexptr,int * countptr,int * errorcodeptr,compile_block * cb)5309 find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr,
5310   int *countptr, int *errorcodeptr, compile_block *cb)
5311 {
5312 uint32_t i, groupnumber;
5313 int count;
5314 PCRE2_UCHAR *slot = cb->name_table;
5315 
5316 /* Find the first entry in the table */
5317 
5318 for (i = 0; i < cb->names_found; i++)
5319   {
5320   if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) == 0 &&
5321       slot[IMM2_SIZE+length] == 0) break;
5322   slot += cb->name_entry_size;
5323   }
5324 
5325 /* This should not occur, because this function is called only when we know we
5326 have duplicate names. Give an internal error. */
5327 
5328 if (i >= cb->names_found)
5329   {
5330   *errorcodeptr = ERR53;
5331   cb->erroroffset = name - cb->start_pattern;
5332   return FALSE;
5333   }
5334 
5335 /* Record the index and then see how many duplicates there are, updating the
5336 backref map and maximum back reference as we do. */
5337 
5338 *indexptr = i;
5339 count = 0;
5340 
5341 for (;;)
5342   {
5343   count++;
5344   groupnumber = GET2(slot,0);
5345   cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
5346   if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
5347   if (++i >= cb->names_found) break;
5348   slot += cb->name_entry_size;
5349   if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) != 0 ||
5350     (slot+IMM2_SIZE)[length] != 0) break;
5351   }
5352 
5353 *countptr = count;
5354 return TRUE;
5355 }
5356 
5357 
5358 
5359 /*************************************************
5360 *           Compile one branch                   *
5361 *************************************************/
5362 
5363 /* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If
5364 the options are changed during the branch, the pointer is used to change the
5365 external options bits. This function is used during the pre-compile phase when
5366 we are trying to find out the amount of memory needed, as well as during the
5367 real compile phase. The value of lengthptr distinguishes the two phases.
5368 
5369 Arguments:
5370   optionsptr        pointer to the option bits
5371   codeptr           points to the pointer to the current code point
5372   pptrptr           points to the current parsed pattern pointer
5373   errorcodeptr      points to error code variable
5374   firstcuptr        place to put the first required code unit
5375   firstcuflagsptr   place to put the first code unit flags
5376   reqcuptr          place to put the last required code unit
5377   reqcuflagsptr     place to put the last required code unit flags
5378   bcptr             points to current branch chain
5379   cb                contains pointers to tables etc.
5380   lengthptr         NULL during the real compile phase
5381                     points to length accumulator during pre-compile phase
5382 
5383 Returns:            0 There's been an error, *errorcodeptr is non-zero
5384                    +1 Success, this branch must match at least one character
5385                    -1 Success, this branch may match an empty string
5386 */
5387 
5388 static int
compile_branch(uint32_t * optionsptr,PCRE2_UCHAR ** codeptr,uint32_t ** pptrptr,int * errorcodeptr,uint32_t * firstcuptr,uint32_t * firstcuflagsptr,uint32_t * reqcuptr,uint32_t * reqcuflagsptr,branch_chain * bcptr,compile_block * cb,PCRE2_SIZE * lengthptr)5389 compile_branch(uint32_t *optionsptr, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
5390   int *errorcodeptr, uint32_t *firstcuptr, uint32_t *firstcuflagsptr,
5391   uint32_t *reqcuptr, uint32_t *reqcuflagsptr, branch_chain *bcptr,
5392   compile_block *cb, PCRE2_SIZE *lengthptr)
5393 {
5394 int bravalue = 0;
5395 int okreturn = -1;
5396 int group_return = 0;
5397 uint32_t repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
5398 uint32_t greedy_default, greedy_non_default;
5399 uint32_t repeat_type, op_type;
5400 uint32_t options = *optionsptr;               /* May change dynamically */
5401 uint32_t firstcu, reqcu;
5402 uint32_t zeroreqcu, zerofirstcu;
5403 uint32_t escape;
5404 uint32_t *pptr = *pptrptr;
5405 uint32_t meta, meta_arg;
5406 uint32_t firstcuflags, reqcuflags;
5407 uint32_t zeroreqcuflags, zerofirstcuflags;
5408 uint32_t req_caseopt, reqvary, tempreqvary;
5409 PCRE2_SIZE offset = 0;
5410 PCRE2_SIZE length_prevgroup = 0;
5411 PCRE2_UCHAR *code = *codeptr;
5412 PCRE2_UCHAR *last_code = code;
5413 PCRE2_UCHAR *orig_code = code;
5414 PCRE2_UCHAR *tempcode;
5415 PCRE2_UCHAR *previous = NULL;
5416 PCRE2_UCHAR op_previous;
5417 BOOL groupsetfirstcu = FALSE;
5418 BOOL had_accept = FALSE;
5419 BOOL matched_char = FALSE;
5420 BOOL previous_matched_char = FALSE;
5421 BOOL reset_caseful = FALSE;
5422 const uint8_t *cbits = cb->cbits;
5423 uint8_t classbits[32];
5424 
5425 /* We can fish out the UTF setting once and for all into a BOOL, but we must
5426 not do this for other options (e.g. PCRE2_EXTENDED) because they may change
5427 dynamically as we process the pattern. */
5428 
5429 #ifdef SUPPORT_UNICODE
5430 BOOL utf = (options & PCRE2_UTF) != 0;
5431 BOOL ucp = (options & PCRE2_UCP) != 0;
5432 #else  /* No Unicode support */
5433 BOOL utf = FALSE;
5434 #endif
5435 
5436 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
5437 class_uchardata always so that it can be passed to add_to_class() always,
5438 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
5439 alternative calls for the different cases. */
5440 
5441 PCRE2_UCHAR *class_uchardata;
5442 #ifdef SUPPORT_WIDE_CHARS
5443 BOOL xclass;
5444 PCRE2_UCHAR *class_uchardata_base;
5445 #endif
5446 
5447 /* Set up the default and non-default settings for greediness */
5448 
5449 greedy_default = ((options & PCRE2_UNGREEDY) != 0);
5450 greedy_non_default = greedy_default ^ 1;
5451 
5452 /* Initialize no first unit, no required unit. REQ_UNSET means "no char
5453 matching encountered yet". It gets changed to REQ_NONE if we hit something that
5454 matches a non-fixed first unit; reqcu just remains unset if we never find one.
5455 
5456 When we hit a repeat whose minimum is zero, we may have to adjust these values
5457 to take the zero repeat into account. This is implemented by setting them to
5458 zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
5459 item types that can be repeated set these backoff variables appropriately. */
5460 
5461 firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
5462 firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
5463 
5464 /* The variable req_caseopt contains either the REQ_CASELESS bit or zero,
5465 according to the current setting of the caseless flag. The REQ_CASELESS value
5466 leaves the lower 28 bit empty. It is added into the firstcu or reqcu variables
5467 to record the case status of the value. This is used only for ASCII characters.
5468 */
5469 
5470 req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
5471 
5472 /* Switch on next META item until the end of the branch */
5473 
5474 for (;; pptr++)
5475   {
5476 #ifdef SUPPORT_WIDE_CHARS
5477   BOOL xclass_has_prop;
5478 #endif
5479   BOOL negate_class;
5480   BOOL should_flip_negation;
5481   BOOL match_all_or_no_wide_chars;
5482   BOOL possessive_quantifier;
5483   BOOL note_group_empty;
5484   int class_has_8bitchar;
5485   uint32_t mclength;
5486   uint32_t skipunits;
5487   uint32_t subreqcu, subfirstcu;
5488   uint32_t groupnumber;
5489   uint32_t verbarglen, verbculen;
5490   uint32_t subreqcuflags, subfirstcuflags;
5491   open_capitem *oc;
5492   PCRE2_UCHAR mcbuffer[8];
5493 
5494   /* Get next META item in the pattern and its potential argument. */
5495 
5496   meta = META_CODE(*pptr);
5497   meta_arg = META_DATA(*pptr);
5498 
5499   /* If we are in the pre-compile phase, accumulate the length used for the
5500   previous cycle of this loop, unless the next item is a quantifier. */
5501 
5502   if (lengthptr != NULL)
5503     {
5504     if (code > cb->start_workspace + cb->workspace_size -
5505         WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
5506       {
5507       *errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)?
5508         ERR52 : ERR86;
5509       return 0;
5510       }
5511 
5512     /* There is at least one situation where code goes backwards: this is the
5513     case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier
5514     is processed, the whole class is eliminated. However, it is created first,
5515     so we have to allow memory for it. Therefore, don't ever reduce the length
5516     at this point. */
5517 
5518     if (code < last_code) code = last_code;
5519 
5520     /* If the next thing is not a quantifier, we add the length of the previous
5521     item into the total, and reset the code pointer to the start of the
5522     workspace. Otherwise leave the previous item available to be quantified. */
5523 
5524     if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5525       {
5526       if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))
5527         {
5528         *errorcodeptr = ERR20;   /* Integer overflow */
5529         return 0;
5530         }
5531       *lengthptr += (PCRE2_SIZE)(code - orig_code);
5532       if (*lengthptr > MAX_PATTERN_SIZE)
5533         {
5534         *errorcodeptr = ERR20;   /* Pattern is too large */
5535         return 0;
5536         }
5537       code = orig_code;
5538       }
5539 
5540     /* Remember where this code item starts so we can catch the "backwards"
5541     case above next time round. */
5542 
5543     last_code = code;
5544     }
5545 
5546   /* Process the next parsed pattern item. If it is not a quantifier, remember
5547   where it starts so that it can be quantified when a quantifier follows.
5548   Checking for the legality of quantifiers happens in parse_regex(), except for
5549   a quantifier after an assertion that is a condition. */
5550 
5551   if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5552     {
5553     previous = code;
5554     if (matched_char && !had_accept) okreturn = 1;
5555     }
5556 
5557   previous_matched_char = matched_char;
5558   matched_char = FALSE;
5559   note_group_empty = FALSE;
5560   skipunits = 0;         /* Default value for most subgroups */
5561 
5562   switch(meta)
5563     {
5564     /* ===================================================================*/
5565     /* The branch terminates at pattern end or | or ) */
5566 
5567     case META_END:
5568     case META_ALT:
5569     case META_KET:
5570     *firstcuptr = firstcu;
5571     *firstcuflagsptr = firstcuflags;
5572     *reqcuptr = reqcu;
5573     *reqcuflagsptr = reqcuflags;
5574     *codeptr = code;
5575     *pptrptr = pptr;
5576     return okreturn;
5577 
5578 
5579     /* ===================================================================*/
5580     /* Handle single-character metacharacters. In multiline mode, ^ disables
5581     the setting of any following char as a first character. */
5582 
5583     case META_CIRCUMFLEX:
5584     if ((options & PCRE2_MULTILINE) != 0)
5585       {
5586       if (firstcuflags == REQ_UNSET)
5587         zerofirstcuflags = firstcuflags = REQ_NONE;
5588       *code++ = OP_CIRCM;
5589       }
5590     else *code++ = OP_CIRC;
5591     break;
5592 
5593     case META_DOLLAR:
5594     *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
5595     break;
5596 
5597     /* There can never be a first char if '.' is first, whatever happens about
5598     repeats. The value of reqcu doesn't change either. */
5599 
5600     case META_DOT:
5601     matched_char = TRUE;
5602     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5603     zerofirstcu = firstcu;
5604     zerofirstcuflags = firstcuflags;
5605     zeroreqcu = reqcu;
5606     zeroreqcuflags = reqcuflags;
5607     *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
5608     break;
5609 
5610 
5611     /* ===================================================================*/
5612     /* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.
5613     Otherwise, an initial ']' is taken as a data character. When empty classes
5614     are allowed, [] must always fail, so generate OP_FAIL, whereas [^] must
5615     match any character, so generate OP_ALLANY. */
5616 
5617     case META_CLASS_EMPTY:
5618     case META_CLASS_EMPTY_NOT:
5619     matched_char = TRUE;
5620     *code++ = (meta == META_CLASS_EMPTY_NOT)? OP_ALLANY : OP_FAIL;
5621     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5622     zerofirstcu = firstcu;
5623     zerofirstcuflags = firstcuflags;
5624     break;
5625 
5626 
5627     /* ===================================================================*/
5628     /* Non-empty character class. If the included characters are all < 256, we
5629     build a 32-byte bitmap of the permitted characters, except in the special
5630     case where there is only one such character. For negated classes, we build
5631     the map as usual, then invert it at the end. However, we use a different
5632     opcode so that data characters > 255 can be handled correctly.
5633 
5634     If the class contains characters outside the 0-255 range, a different
5635     opcode is compiled. It may optionally have a bit map for characters < 256,
5636     but those above are are explicitly listed afterwards. A flag code unit
5637     tells whether the bitmap is present, and whether this is a negated class or
5638     not. */
5639 
5640     case META_CLASS_NOT:
5641     case META_CLASS:
5642     matched_char = TRUE;
5643     negate_class = meta == META_CLASS_NOT;
5644 
5645     /* We can optimize the case of a single character in a class by generating
5646     OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's
5647     negative. In the negative case there can be no first char if this item is
5648     first, whatever repeat count may follow. In the case of reqcu, save the
5649     previous value for reinstating. */
5650 
5651     /* NOTE: at present this optimization is not effective if the only
5652     character in a class in 32-bit, non-UCP mode has its top bit set. */
5653 
5654     if (pptr[1] < META_END && pptr[2] == META_CLASS_END)
5655       {
5656 #ifdef SUPPORT_UNICODE
5657       uint32_t d;
5658 #endif
5659       uint32_t c = pptr[1];
5660 
5661       pptr += 2;                 /* Move on to class end */
5662       if (meta == META_CLASS)    /* A positive one-char class can be */
5663         {                        /* handled as a normal literal character. */
5664         meta = c;                /* Set up the character */
5665         goto NORMAL_CHAR_SET;
5666         }
5667 
5668       /* Handle a negative one-character class */
5669 
5670       zeroreqcu = reqcu;
5671       zeroreqcuflags = reqcuflags;
5672       if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5673       zerofirstcu = firstcu;
5674       zerofirstcuflags = firstcuflags;
5675 
5676       /* For caseless UTF or UCP mode, check whether this character has more
5677       than one other case. If so, generate a special OP_NOTPROP item instead of
5678       OP_NOTI. */
5679 
5680 #ifdef SUPPORT_UNICODE
5681       if ((utf||ucp) && (options & PCRE2_CASELESS) != 0 &&
5682           (d = UCD_CASESET(c)) != 0)
5683         {
5684         *code++ = OP_NOTPROP;
5685         *code++ = PT_CLIST;
5686         *code++ = d;
5687         break;   /* We are finished with this class */
5688         }
5689 #endif
5690       /* Char has only one other case, or UCP not available */
5691 
5692       *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
5693       code += PUTCHAR(c, code);
5694       break;   /* We are finished with this class */
5695       }        /* End of 1-char optimization */
5696 
5697     /* Handle character classes that contain more than just one literal
5698     character. If there are exactly two characters in a positive class, see if
5699     they are case partners. This can be optimized to generate a caseless single
5700     character match (which also sets first/required code units if relevant). */
5701 
5702     if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END &&
5703         pptr[3] == META_CLASS_END)
5704       {
5705       uint32_t c = pptr[1];
5706 
5707 #ifdef SUPPORT_UNICODE
5708       if (UCD_CASESET(c) == 0)
5709 #endif
5710         {
5711         uint32_t d;
5712 
5713 #ifdef SUPPORT_UNICODE
5714         if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
5715 #endif
5716           {
5717 #if PCRE2_CODE_UNIT_WIDTH != 8
5718           if (c > 255) d = c; else
5719 #endif
5720           d = TABLE_GET(c, cb->fcc, c);
5721           }
5722 
5723         if (c != d && pptr[2] == d)
5724           {
5725           pptr += 3;                 /* Move on to class end */
5726           meta = c;
5727           if ((options & PCRE2_CASELESS) == 0)
5728             {
5729             reset_caseful = TRUE;
5730             options |= PCRE2_CASELESS;
5731             req_caseopt = REQ_CASELESS;
5732             }
5733           goto CLASS_CASELESS_CHAR;
5734           }
5735         }
5736       }
5737 
5738     /* If a non-extended class contains a negative special such as \S, we need
5739     to flip the negation flag at the end, so that support for characters > 255
5740     works correctly (they are all included in the class). An extended class may
5741     need to insert specific matching or non-matching code for wide characters.
5742     */
5743 
5744     should_flip_negation = match_all_or_no_wide_chars = FALSE;
5745 
5746     /* Extended class (xclass) will be used when characters > 255
5747     might match. */
5748 
5749 #ifdef SUPPORT_WIDE_CHARS
5750     xclass = FALSE;
5751     class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
5752     class_uchardata_base = class_uchardata;   /* Save the start */
5753 #endif
5754 
5755     /* For optimization purposes, we track some properties of the class:
5756     class_has_8bitchar will be non-zero if the class contains at least one
5757     character with a code point less than 256; xclass_has_prop will be TRUE if
5758     Unicode property checks are present in the class. */
5759 
5760     class_has_8bitchar = 0;
5761 #ifdef SUPPORT_WIDE_CHARS
5762     xclass_has_prop = FALSE;
5763 #endif
5764 
5765     /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
5766     in a temporary bit of memory, in case the class contains fewer than two
5767     8-bit characters because in that case the compiled code doesn't use the bit
5768     map. */
5769 
5770     memset(classbits, 0, 32 * sizeof(uint8_t));
5771 
5772     /* Process items until META_CLASS_END is reached. */
5773 
5774     while ((meta = *(++pptr)) != META_CLASS_END)
5775       {
5776       /* Handle POSIX classes such as [:alpha:] etc. */
5777 
5778       if (meta == META_POSIX || meta == META_POSIX_NEG)
5779         {
5780         BOOL local_negate = (meta == META_POSIX_NEG);
5781         int posix_class = *(++pptr);
5782         int taboffset, tabopt;
5783         uint8_t pbits[32];
5784 
5785         should_flip_negation = local_negate;  /* Note negative special */
5786 
5787         /* If matching is caseless, upper and lower are converted to alpha.
5788         This relies on the fact that the class table starts with alpha,
5789         lower, upper as the first 3 entries. */
5790 
5791         if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)
5792           posix_class = 0;
5793 
5794         /* When PCRE2_UCP is set, some of the POSIX classes are converted to
5795         different escape sequences that use Unicode properties \p or \P.
5796         Others that are not available via \p or \P have to generate
5797         XCL_PROP/XCL_NOTPROP directly, which is done here. */
5798 
5799 #ifdef SUPPORT_UNICODE
5800         if ((options & PCRE2_UCP) != 0) switch(posix_class)
5801           {
5802           case PC_GRAPH:
5803           case PC_PRINT:
5804           case PC_PUNCT:
5805           *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
5806           *class_uchardata++ = (PCRE2_UCHAR)
5807             ((posix_class == PC_GRAPH)? PT_PXGRAPH :
5808              (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT);
5809           *class_uchardata++ = 0;
5810           xclass_has_prop = TRUE;
5811           goto CONTINUE_CLASS;
5812 
5813           /* For the other POSIX classes (ascii, xdigit) we are going to
5814           fall through to the non-UCP case and build a bit map for
5815           characters with code points less than 256. However, if we are in
5816           a negated POSIX class, characters with code points greater than
5817           255 must either all match or all not match, depending on whether
5818           the whole class is not or is negated. For example, for
5819           [[:^ascii:]... they must all match, whereas for [^[:^xdigit:]...
5820           they must not.
5821 
5822           In the special case where there are no xclass items, this is
5823           automatically handled by the use of OP_CLASS or OP_NCLASS, but an
5824           explicit range is needed for OP_XCLASS. Setting a flag here
5825           causes the range to be generated later when it is known that
5826           OP_XCLASS is required. In the 8-bit library this is relevant only in
5827           utf mode, since no wide characters can exist otherwise. */
5828 
5829           default:
5830 #if PCRE2_CODE_UNIT_WIDTH == 8
5831           if (utf)
5832 #endif
5833           match_all_or_no_wide_chars |= local_negate;
5834           break;
5835           }
5836 #endif  /* SUPPORT_UNICODE */
5837 
5838         /* In the non-UCP case, or when UCP makes no difference, we build the
5839         bit map for the POSIX class in a chunk of local store because we may
5840         be adding and subtracting from it, and we don't want to subtract bits
5841         that may be in the main map already. At the end we or the result into
5842         the bit map that is being built. */
5843 
5844         posix_class *= 3;
5845 
5846         /* Copy in the first table (always present) */
5847 
5848         memcpy(pbits, cbits + posix_class_maps[posix_class],
5849           32 * sizeof(uint8_t));
5850 
5851         /* If there is a second table, add or remove it as required. */
5852 
5853         taboffset = posix_class_maps[posix_class + 1];
5854         tabopt = posix_class_maps[posix_class + 2];
5855 
5856         if (taboffset >= 0)
5857           {
5858           if (tabopt >= 0)
5859             for (int i = 0; i < 32; i++) pbits[i] |= cbits[(int)i + taboffset];
5860           else
5861             for (int i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset];
5862           }
5863 
5864         /* Now see if we need to remove any special characters. An option
5865         value of 1 removes vertical space and 2 removes underscore. */
5866 
5867         if (tabopt < 0) tabopt = -tabopt;
5868         if (tabopt == 1) pbits[1] &= ~0x3c;
5869           else if (tabopt == 2) pbits[11] &= 0x7f;
5870 
5871         /* Add the POSIX table or its complement into the main table that is
5872         being built and we are done. */
5873 
5874         if (local_negate)
5875           for (int i = 0; i < 32; i++) classbits[i] |= (uint8_t)(~pbits[i]);
5876         else
5877           for (int i = 0; i < 32; i++) classbits[i] |= pbits[i];
5878 
5879         /* Every class contains at least one < 256 character. */
5880 
5881         class_has_8bitchar = 1;
5882         goto CONTINUE_CLASS;    /* End of POSIX handling */
5883         }
5884 
5885       /* Other than POSIX classes, the only items we should encounter are
5886       \d-type escapes and literal characters (possibly as ranges). */
5887 
5888       if (meta == META_BIGVALUE)
5889         {
5890         meta = *(++pptr);
5891         goto CLASS_LITERAL;
5892         }
5893 
5894       /* Any other non-literal must be an escape */
5895 
5896       if (meta >= META_END)
5897         {
5898         if (META_CODE(meta) != META_ESCAPE)
5899           {
5900 #ifdef DEBUG_SHOW_PARSED
5901           fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x "
5902                           "in character class\n", meta);
5903 #endif
5904           *errorcodeptr = ERR89;  /* Internal error - unrecognized. */
5905           return 0;
5906           }
5907         escape = META_DATA(meta);
5908 
5909         /* Every class contains at least one < 256 character. */
5910 
5911         class_has_8bitchar++;
5912 
5913         switch(escape)
5914           {
5915           case ESC_d:
5916           for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
5917           break;
5918 
5919           case ESC_D:
5920           should_flip_negation = TRUE;
5921           for (int i = 0; i < 32; i++)
5922             classbits[i] |= (uint8_t)(~cbits[i+cbit_digit]);
5923           break;
5924 
5925           case ESC_w:
5926           for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
5927           break;
5928 
5929           case ESC_W:
5930           should_flip_negation = TRUE;
5931           for (int i = 0; i < 32; i++)
5932             classbits[i] |= (uint8_t)(~cbits[i+cbit_word]);
5933           break;
5934 
5935           /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5936           5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5937           previously set by something earlier in the character class.
5938           Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5939           we could just adjust the appropriate bit. From PCRE 8.34 we no
5940           longer treat \s and \S specially. */
5941 
5942           case ESC_s:
5943           for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
5944           break;
5945 
5946           case ESC_S:
5947           should_flip_negation = TRUE;
5948           for (int i = 0; i < 32; i++)
5949             classbits[i] |= (uint8_t)(~cbits[i+cbit_space]);
5950           break;
5951 
5952           /* When adding the horizontal or vertical space lists to a class, or
5953           their complements, disable PCRE2_CASELESS, because it just wastes
5954           time, and in the "not-x" UTF cases can create unwanted duplicates in
5955           the XCLASS list (provoked by characters that have more than one other
5956           case and by both cases being in the same "not-x" sublist). */
5957 
5958           case ESC_h:
5959           (void)add_list_to_class(classbits, &class_uchardata,
5960             options & ~PCRE2_CASELESS, cb, PRIV(hspace_list), NOTACHAR);
5961           break;
5962 
5963           case ESC_H:
5964           (void)add_not_list_to_class(classbits, &class_uchardata,
5965             options & ~PCRE2_CASELESS, cb, PRIV(hspace_list));
5966           break;
5967 
5968           case ESC_v:
5969           (void)add_list_to_class(classbits, &class_uchardata,
5970             options & ~PCRE2_CASELESS, cb, PRIV(vspace_list), NOTACHAR);
5971           break;
5972 
5973           case ESC_V:
5974           (void)add_not_list_to_class(classbits, &class_uchardata,
5975             options & ~PCRE2_CASELESS, cb, PRIV(vspace_list));
5976           break;
5977 
5978           /* If Unicode is not supported, \P and \p are not allowed and are
5979           faulted at parse time, so will never appear here. */
5980 
5981 #ifdef SUPPORT_UNICODE
5982           case ESC_p:
5983           case ESC_P:
5984             {
5985             uint32_t ptype = *(++pptr) >> 16;
5986             uint32_t pdata = *pptr & 0xffff;
5987             *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP;
5988             *class_uchardata++ = ptype;
5989             *class_uchardata++ = pdata;
5990             xclass_has_prop = TRUE;
5991             class_has_8bitchar--;                /* Undo! */
5992             }
5993           break;
5994 #endif
5995           }
5996 
5997         goto CONTINUE_CLASS;
5998         }  /* End handling \d-type escapes */
5999 
6000       /* A literal character may be followed by a range meta. At parse time
6001       there are checks for out-of-order characters, for ranges where the two
6002       characters are equal, and for hyphens that cannot indicate a range. At
6003       this point, therefore, no checking is needed. */
6004 
6005       else
6006         {
6007         uint32_t c, d;
6008 
6009         CLASS_LITERAL:
6010         c = d = meta;
6011 
6012         /* Remember if \r or \n were explicitly used */
6013 
6014         if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
6015 
6016         /* Process a character range */
6017 
6018         if (pptr[1] == META_RANGE_LITERAL || pptr[1] == META_RANGE_ESCAPED)
6019           {
6020 #ifdef EBCDIC
6021           BOOL range_is_literal = (pptr[1] == META_RANGE_LITERAL);
6022 #endif
6023           pptr += 2;
6024           d = *pptr;
6025           if (d == META_BIGVALUE) d = *(++pptr);
6026 
6027           /* Remember an explicit \r or \n, and add the range to the class. */
6028 
6029           if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
6030 
6031           /* In an EBCDIC environment, Perl treats alphabetic ranges specially
6032           because there are holes in the encoding, and simply using the range
6033           A-Z (for example) would include the characters in the holes. This
6034           applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
6035 
6036 #ifdef EBCDIC
6037           if (range_is_literal &&
6038                (cb->ctypes[c] & ctype_letter) != 0 &&
6039                (cb->ctypes[d] & ctype_letter) != 0 &&
6040                (c <= CHAR_z) == (d <= CHAR_z))
6041             {
6042             uint32_t uc = (d <= CHAR_z)? 0 : 64;
6043             uint32_t C = c - uc;
6044             uint32_t D = d - uc;
6045 
6046             if (C <= CHAR_i)
6047               {
6048               class_has_8bitchar +=
6049                 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
6050                   ((D < CHAR_i)? D : CHAR_i) + uc);
6051               C = CHAR_j;
6052               }
6053 
6054             if (C <= D && C <= CHAR_r)
6055               {
6056               class_has_8bitchar +=
6057                 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
6058                   ((D < CHAR_r)? D : CHAR_r) + uc);
6059               C = CHAR_s;
6060               }
6061 
6062             if (C <= D)
6063               {
6064               class_has_8bitchar +=
6065                 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
6066                   D + uc);
6067               }
6068             }
6069           else
6070 #endif
6071           /* Not an EBCDIC special range */
6072 
6073           class_has_8bitchar +=
6074             add_to_class(classbits, &class_uchardata, options, cb, c, d);
6075           goto CONTINUE_CLASS;   /* Go get the next char in the class */
6076           }  /* End of range handling */
6077 
6078 
6079         /* Handle a single character. */
6080 
6081         class_has_8bitchar +=
6082           add_to_class(classbits, &class_uchardata, options, cb, meta, meta);
6083         }
6084 
6085       /* Continue to the next item in the class. */
6086 
6087       CONTINUE_CLASS:
6088 
6089 #ifdef SUPPORT_WIDE_CHARS
6090       /* If any wide characters or Unicode properties have been encountered,
6091       set xclass = TRUE. Then, in the pre-compile phase, accumulate the length
6092       of the extra data and reset the pointer. This is so that very large
6093       classes that contain a zillion wide characters or Unicode property tests
6094       do not overwrite the workspace (which is on the stack). */
6095 
6096       if (class_uchardata > class_uchardata_base)
6097         {
6098         xclass = TRUE;
6099         if (lengthptr != NULL)
6100           {
6101           *lengthptr += class_uchardata - class_uchardata_base;
6102           class_uchardata = class_uchardata_base;
6103           }
6104         }
6105 #endif
6106 
6107       continue;  /* Needed to avoid error when not supporting wide chars */
6108       }   /* End of main class-processing loop */
6109 
6110     /* If this class is the first thing in the branch, there can be no first
6111     char setting, whatever the repeat count. Any reqcu setting must remain
6112     unchanged after any kind of repeat. */
6113 
6114     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6115     zerofirstcu = firstcu;
6116     zerofirstcuflags = firstcuflags;
6117     zeroreqcu = reqcu;
6118     zeroreqcuflags = reqcuflags;
6119 
6120     /* If there are characters with values > 255, or Unicode property settings
6121     (\p or \P), we have to compile an extended class, with its own opcode,
6122     unless there were no property settings and there was a negated special such
6123     as \S in the class, and PCRE2_UCP is not set, because in that case all
6124     characters > 255 are in or not in the class, so any that were explicitly
6125     given as well can be ignored.
6126 
6127     In the UCP case, if certain negated POSIX classes ([:^ascii:] or
6128     [:^xdigit:]) were present in a class, we either have to match or not match
6129     all wide characters (depending on whether the whole class is or is not
6130     negated). This requirement is indicated by match_all_or_no_wide_chars being
6131     true. We do this by including an explicit range, which works in both cases.
6132     This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there
6133     cannot be any wide characters in 8-bit non-UTF mode.
6134 
6135     When there *are* properties in a positive UTF-8 or any 16-bit or 32-bit
6136     class where \S etc is present without PCRE2_UCP, causing an extended class
6137     to be compiled, we make sure that all characters > 255 are included by
6138     forcing match_all_or_no_wide_chars to be true.
6139 
6140     If, when generating an xclass, there are no characters < 256, we can omit
6141     the bitmap in the actual compiled code. */
6142 
6143 #ifdef SUPPORT_WIDE_CHARS  /* Defined for 16/32 bits, or 8-bit with Unicode */
6144     if (xclass && (
6145 #ifdef SUPPORT_UNICODE
6146         (options & PCRE2_UCP) != 0 ||
6147 #endif
6148         xclass_has_prop || !should_flip_negation))
6149       {
6150       if (match_all_or_no_wide_chars || (
6151 #if PCRE2_CODE_UNIT_WIDTH == 8
6152            utf &&
6153 #endif
6154            should_flip_negation && !negate_class && (options & PCRE2_UCP) == 0))
6155         {
6156         *class_uchardata++ = XCL_RANGE;
6157         if (utf)   /* Will always be utf in the 8-bit library */
6158           {
6159           class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
6160           class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata);
6161           }
6162         else       /* Can only happen for the 16-bit & 32-bit libraries */
6163           {
6164 #if PCRE2_CODE_UNIT_WIDTH == 16
6165           *class_uchardata++ = 0x100;
6166           *class_uchardata++ = 0xffffu;
6167 #elif PCRE2_CODE_UNIT_WIDTH == 32
6168           *class_uchardata++ = 0x100;
6169           *class_uchardata++ = 0xffffffffu;
6170 #endif
6171           }
6172         }
6173       *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
6174       *code++ = OP_XCLASS;
6175       code += LINK_SIZE;
6176       *code = negate_class? XCL_NOT:0;
6177       if (xclass_has_prop) *code |= XCL_HASPROP;
6178 
6179       /* If the map is required, move up the extra data to make room for it;
6180       otherwise just move the code pointer to the end of the extra data. */
6181 
6182       if (class_has_8bitchar > 0)
6183         {
6184         *code++ |= XCL_MAP;
6185         (void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,
6186           CU2BYTES(class_uchardata - code));
6187         if (negate_class && !xclass_has_prop)
6188           {
6189           /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
6190           for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
6191           }
6192         memcpy(code, classbits, 32);
6193         code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
6194         }
6195       else code = class_uchardata;
6196 
6197       /* Now fill in the complete length of the item */
6198 
6199       PUT(previous, 1, (int)(code - previous));
6200       break;   /* End of class handling */
6201       }
6202 #endif  /* SUPPORT_WIDE_CHARS */
6203 
6204     /* If there are no characters > 255, or they are all to be included or
6205     excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
6206     whole class was negated and whether there were negative specials such as \S
6207     (non-UCP) in the class. Then copy the 32-byte map into the code vector,
6208     negating it if necessary. */
6209 
6210     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
6211     if (lengthptr == NULL)    /* Save time in the pre-compile phase */
6212       {
6213       if (negate_class)
6214         {
6215        /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
6216        for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
6217        }
6218       memcpy(code, classbits, 32);
6219       }
6220     code += 32 / sizeof(PCRE2_UCHAR);
6221     break;  /* End of class processing */
6222 
6223 
6224     /* ===================================================================*/
6225     /* Deal with (*VERB)s. */
6226 
6227     /* Check for open captures before ACCEPT and close those that are within
6228     the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an
6229     assertion. In the first pass, just accumulate the length required;
6230     otherwise hitting (*ACCEPT) inside many nested parentheses can cause
6231     workspace overflow. Do not set firstcu after *ACCEPT. */
6232 
6233     case META_ACCEPT:
6234     cb->had_accept = had_accept = TRUE;
6235     for (oc = cb->open_caps;
6236          oc != NULL && oc->assert_depth >= cb->assert_depth;
6237          oc = oc->next)
6238       {
6239       if (lengthptr != NULL)
6240         {
6241         *lengthptr += CU2BYTES(1) + IMM2_SIZE;
6242         }
6243       else
6244         {
6245         *code++ = OP_CLOSE;
6246         PUT2INC(code, 0, oc->number);
6247         }
6248       }
6249     *code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6250     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6251     break;
6252 
6253     case META_PRUNE:
6254     case META_SKIP:
6255     cb->had_pruneorskip = TRUE;
6256     /* Fall through */
6257     case META_COMMIT:
6258     case META_FAIL:
6259     *code++ = verbops[(meta - META_MARK) >> 16];
6260     break;
6261 
6262     case META_THEN:
6263     cb->external_flags |= PCRE2_HASTHEN;
6264     *code++ = OP_THEN;
6265     break;
6266 
6267     /* Handle verbs with arguments. Arguments can be very long, especially in
6268     16- and 32-bit modes, and can overflow the workspace in the first pass.
6269     However, the argument length is constrained to be small enough to fit in
6270     one code unit. This check happens in parse_regex(). In the first pass,
6271     instead of putting the argument into memory, we just update the length
6272     counter and set up an empty argument. */
6273 
6274     case META_THEN_ARG:
6275     cb->external_flags |= PCRE2_HASTHEN;
6276     goto VERB_ARG;
6277 
6278     case META_PRUNE_ARG:
6279     case META_SKIP_ARG:
6280     cb->had_pruneorskip = TRUE;
6281     /* Fall through */
6282     case META_MARK:
6283     case META_COMMIT_ARG:
6284     VERB_ARG:
6285     *code++ = verbops[(meta - META_MARK) >> 16];
6286     /* The length is in characters. */
6287     verbarglen = *(++pptr);
6288     verbculen = 0;
6289     tempcode = code++;
6290     for (int i = 0; i < (int)verbarglen; i++)
6291       {
6292       meta = *(++pptr);
6293 #ifdef SUPPORT_UNICODE
6294       if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
6295 #endif
6296         {
6297         mclength = 1;
6298         mcbuffer[0] = meta;
6299         }
6300       if (lengthptr != NULL) *lengthptr += mclength; else
6301         {
6302         memcpy(code, mcbuffer, CU2BYTES(mclength));
6303         code += mclength;
6304         verbculen += mclength;
6305         }
6306       }
6307 
6308     *tempcode = verbculen;   /* Fill in the code unit length */
6309     *code++ = 0;             /* Terminating zero */
6310     break;
6311 
6312 
6313     /* ===================================================================*/
6314     /* Handle options change. The new setting must be passed back for use in
6315     subsequent branches. Reset the greedy defaults and the case value for
6316     firstcu and reqcu. */
6317 
6318     case META_OPTIONS:
6319     *optionsptr = options = *(++pptr);
6320     greedy_default = ((options & PCRE2_UNGREEDY) != 0);
6321     greedy_non_default = greedy_default ^ 1;
6322     req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6323     break;
6324 
6325 
6326     /* ===================================================================*/
6327     /* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous
6328     because it could be a numerical check on recursion, or a name check on a
6329     group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that
6330     we can handle it either way. We first try for a name; if not found, process
6331     the number. */
6332 
6333     case META_COND_RNUMBER:   /* (?(Rdigits) */
6334     case META_COND_NAME:      /* (?(name) or (?'name') or ?(<name>) */
6335     case META_COND_RNAME:     /* (?(R&name) - test for recursion */
6336     bravalue = OP_COND;
6337       {
6338       int count, index;
6339       unsigned int i;
6340       PCRE2_SPTR name;
6341       named_group *ng = cb->named_groups;
6342       uint32_t length = *(++pptr);
6343 
6344       GETPLUSOFFSET(offset, pptr);
6345       name = cb->start_pattern + offset;
6346 
6347       /* In the first pass, the names generated in the pre-pass are available,
6348       but the main name table has not yet been created. Scan the list of names
6349       generated in the pre-pass in order to get a number and whether or not
6350       this name is duplicated. If it is not duplicated, we can handle it as a
6351       numerical group. */
6352 
6353       for (i = 0; i < cb->names_found; i++, ng++)
6354         {
6355         if (length == ng->length &&
6356             PRIV(strncmp)(name, ng->name, length) == 0)
6357           {
6358           if (!ng->isdup)
6359             {
6360             code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6361             PUT2(code, 2+LINK_SIZE, ng->number);
6362             if (ng->number > cb->top_backref) cb->top_backref = ng->number;
6363             skipunits = 1+IMM2_SIZE;
6364             goto GROUP_PROCESS_NOTE_EMPTY;
6365             }
6366           break;  /* Found a duplicated name */
6367           }
6368         }
6369 
6370       /* If the name was not found we have a bad reference, unless we are
6371       dealing with R<digits>, which is treated as a recursion test by number.
6372       */
6373 
6374       if (i >= cb->names_found)
6375         {
6376         groupnumber = 0;
6377         if (meta == META_COND_RNUMBER)
6378           {
6379           for (i = 1; i < length; i++)
6380             {
6381             groupnumber = groupnumber * 10 + name[i] - CHAR_0;
6382             if (groupnumber > MAX_GROUP_NUMBER)
6383               {
6384               *errorcodeptr = ERR61;
6385               cb->erroroffset = offset + i;
6386               return 0;
6387               }
6388             }
6389           }
6390 
6391         if (meta != META_COND_RNUMBER || groupnumber > cb->bracount)
6392           {
6393           *errorcodeptr = ERR15;
6394           cb->erroroffset = offset;
6395           return 0;
6396           }
6397 
6398         /* (?(Rdigits) treated as a recursion reference by number. A value of
6399         zero (which is the result of both (?(R) and (?(R0)) means "any", and is
6400         translated into RREF_ANY (which is 0xffff). */
6401 
6402         if (groupnumber == 0) groupnumber = RREF_ANY;
6403         code[1+LINK_SIZE] = OP_RREF;
6404         PUT2(code, 2+LINK_SIZE, groupnumber);
6405         skipunits = 1+IMM2_SIZE;
6406         goto GROUP_PROCESS_NOTE_EMPTY;
6407         }
6408 
6409       /* A duplicated name was found. Note that if an R<digits> name is found
6410       (META_COND_RNUMBER), it is a reference test, not a recursion test. */
6411 
6412       code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6413 
6414       /* We have a duplicated name. In the compile pass we have to search the
6415       main table in order to get the index and count values. */
6416 
6417       count = 0;  /* Values for first pass (avoids compiler warning) */
6418       index = 0;
6419       if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6420             &count, errorcodeptr, cb)) return 0;
6421 
6422       /* Add one to the opcode to change CREF/RREF into DNCREF/DNRREF and
6423       insert appropriate data values. */
6424 
6425       code[1+LINK_SIZE]++;
6426       skipunits = 1+2*IMM2_SIZE;
6427       PUT2(code, 2+LINK_SIZE, index);
6428       PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6429       }
6430     goto GROUP_PROCESS_NOTE_EMPTY;
6431 
6432     /* The DEFINE condition is always false. Its internal groups may never
6433     be called, so matched_char must remain false, hence the jump to
6434     GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
6435 
6436     case META_COND_DEFINE:
6437     bravalue = OP_COND;
6438     GETPLUSOFFSET(offset, pptr);
6439     code[1+LINK_SIZE] = OP_DEFINE;
6440     skipunits = 1;
6441     goto GROUP_PROCESS;
6442 
6443     /* Conditional test of a group's being set. */
6444 
6445     case META_COND_NUMBER:
6446     bravalue = OP_COND;
6447     GETPLUSOFFSET(offset, pptr);
6448     groupnumber = *(++pptr);
6449     if (groupnumber > cb->bracount)
6450       {
6451       *errorcodeptr = ERR15;
6452       cb->erroroffset = offset;
6453       return 0;
6454       }
6455     if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
6456     offset -= 2;   /* Point at initial ( for too many branches error */
6457     code[1+LINK_SIZE] = OP_CREF;
6458     skipunits = 1+IMM2_SIZE;
6459     PUT2(code, 2+LINK_SIZE, groupnumber);
6460     goto GROUP_PROCESS_NOTE_EMPTY;
6461 
6462     /* Test for the PCRE2 version. */
6463 
6464     case META_COND_VERSION:
6465     bravalue = OP_COND;
6466     if (pptr[1] > 0)
6467       code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) ||
6468         (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))?
6469           OP_TRUE : OP_FALSE;
6470     else
6471       code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])?
6472         OP_TRUE : OP_FALSE;
6473     skipunits = 1;
6474     pptr += 3;
6475     goto GROUP_PROCESS_NOTE_EMPTY;
6476 
6477     /* The condition is an assertion, possibly preceded by a callout. */
6478 
6479     case META_COND_ASSERT:
6480     bravalue = OP_COND;
6481     goto GROUP_PROCESS_NOTE_EMPTY;
6482 
6483 
6484     /* ===================================================================*/
6485     /* Handle all kinds of nested bracketed groups. The non-capturing,
6486     non-conditional cases are here; others come to GROUP_PROCESS via goto. */
6487 
6488     case META_LOOKAHEAD:
6489     bravalue = OP_ASSERT;
6490     cb->assert_depth += 1;
6491     goto GROUP_PROCESS;
6492 
6493     case META_LOOKAHEAD_NA:
6494     bravalue = OP_ASSERT_NA;
6495     cb->assert_depth += 1;
6496     goto GROUP_PROCESS;
6497 
6498     /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6499     thing to do, but Perl allows all assertions to be quantified, and when
6500     they contain capturing parentheses there may be a potential use for
6501     this feature. Not that that applies to a quantified (?!) but we allow
6502     it for uniformity. */
6503 
6504     case META_LOOKAHEADNOT:
6505     if (pptr[1] == META_KET &&
6506          (pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY))
6507       {
6508       *code++ = OP_FAIL;
6509       pptr++;
6510       }
6511     else
6512       {
6513       bravalue = OP_ASSERT_NOT;
6514       cb->assert_depth += 1;
6515       goto GROUP_PROCESS;
6516       }
6517     break;
6518 
6519     case META_LOOKBEHIND:
6520     bravalue = OP_ASSERTBACK;
6521     cb->assert_depth += 1;
6522     goto GROUP_PROCESS;
6523 
6524     case META_LOOKBEHINDNOT:
6525     bravalue = OP_ASSERTBACK_NOT;
6526     cb->assert_depth += 1;
6527     goto GROUP_PROCESS;
6528 
6529     case META_LOOKBEHIND_NA:
6530     bravalue = OP_ASSERTBACK_NA;
6531     cb->assert_depth += 1;
6532     goto GROUP_PROCESS;
6533 
6534     case META_ATOMIC:
6535     bravalue = OP_ONCE;
6536     goto GROUP_PROCESS_NOTE_EMPTY;
6537 
6538     case META_SCRIPT_RUN:
6539     bravalue = OP_SCRIPT_RUN;
6540     goto GROUP_PROCESS_NOTE_EMPTY;
6541 
6542     case META_NOCAPTURE:
6543     bravalue = OP_BRA;
6544     /* Fall through */
6545 
6546     /* Process nested bracketed regex. The nesting depth is maintained for the
6547     benefit of the stackguard function. The test for too deep nesting is now
6548     done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
6549     others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
6550     note of whether or not they may match an empty string. */
6551 
6552     GROUP_PROCESS_NOTE_EMPTY:
6553     note_group_empty = TRUE;
6554 
6555     GROUP_PROCESS:
6556     cb->parens_depth += 1;
6557     *code = bravalue;
6558     pptr++;
6559     tempcode = code;
6560     tempreqvary = cb->req_varyopt;        /* Save value before group */
6561     length_prevgroup = 0;                 /* Initialize for pre-compile phase */
6562 
6563     if ((group_return =
6564          compile_regex(
6565          options,                         /* The option state */
6566          &tempcode,                       /* Where to put code (updated) */
6567          &pptr,                           /* Input pointer (updated) */
6568          errorcodeptr,                    /* Where to put an error message */
6569          skipunits,                       /* Skip over bracket number */
6570          &subfirstcu,                     /* For possible first char */
6571          &subfirstcuflags,
6572          &subreqcu,                       /* For possible last char */
6573          &subreqcuflags,
6574          bcptr,                           /* Current branch chain */
6575          cb,                              /* Compile data block */
6576          (lengthptr == NULL)? NULL :      /* Actual compile phase */
6577            &length_prevgroup              /* Pre-compile phase */
6578          )) == 0)
6579       return 0;  /* Error */
6580 
6581     cb->parens_depth -= 1;
6582 
6583     /* If that was a non-conditional significant group (not an assertion, not a
6584     DEFINE) that matches at least one character, then the current item matches
6585     a character. Conditionals are handled below. */
6586 
6587     if (note_group_empty && bravalue != OP_COND && group_return > 0)
6588       matched_char = TRUE;
6589 
6590     /* If we've just compiled an assertion, pop the assert depth. */
6591 
6592     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NA)
6593       cb->assert_depth -= 1;
6594 
6595     /* At the end of compiling, code is still pointing to the start of the
6596     group, while tempcode has been updated to point past the end of the group.
6597     The parsed pattern pointer (pptr) is on the closing META_KET.
6598 
6599     If this is a conditional bracket, check that there are no more than
6600     two branches in the group, or just one if it's a DEFINE group. We do this
6601     in the real compile phase, not in the pre-pass, where the whole group may
6602     not be available. */
6603 
6604     if (bravalue == OP_COND && lengthptr == NULL)
6605       {
6606       PCRE2_UCHAR *tc = code;
6607       int condcount = 0;
6608 
6609       do {
6610          condcount++;
6611          tc += GET(tc,1);
6612          }
6613       while (*tc != OP_KET);
6614 
6615       /* A DEFINE group is never obeyed inline (the "condition" is always
6616       false). It must have only one branch. Having checked this, change the
6617       opcode to OP_FALSE. */
6618 
6619       if (code[LINK_SIZE+1] == OP_DEFINE)
6620         {
6621         if (condcount > 1)
6622           {
6623           cb->erroroffset = offset;
6624           *errorcodeptr = ERR54;
6625           return 0;
6626           }
6627         code[LINK_SIZE+1] = OP_FALSE;
6628         bravalue = OP_DEFINE;   /* A flag to suppress char handling below */
6629         }
6630 
6631       /* A "normal" conditional group. If there is just one branch, we must not
6632       make use of its firstcu or reqcu, because this is equivalent to an
6633       empty second branch. Also, it may match an empty string. If there are two
6634       branches, this item must match a character if the group must. */
6635 
6636       else
6637         {
6638         if (condcount > 2)
6639           {
6640           cb->erroroffset = offset;
6641           *errorcodeptr = ERR27;
6642           return 0;
6643           }
6644         if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
6645           else if (group_return > 0) matched_char = TRUE;
6646         }
6647       }
6648 
6649     /* In the pre-compile phase, update the length by the length of the group,
6650     less the brackets at either end. Then reduce the compiled code to just a
6651     set of non-capturing brackets so that it doesn't use much memory if it is
6652     duplicated by a quantifier.*/
6653 
6654     if (lengthptr != NULL)
6655       {
6656       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
6657         {
6658         *errorcodeptr = ERR20;
6659         return 0;
6660         }
6661       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
6662       code++;   /* This already contains bravalue */
6663       PUTINC(code, 0, 1 + LINK_SIZE);
6664       *code++ = OP_KET;
6665       PUTINC(code, 0, 1 + LINK_SIZE);
6666       break;    /* No need to waste time with special character handling */
6667       }
6668 
6669     /* Otherwise update the main code pointer to the end of the group. */
6670 
6671     code = tempcode;
6672 
6673     /* For a DEFINE group, required and first character settings are not
6674     relevant. */
6675 
6676     if (bravalue == OP_DEFINE) break;
6677 
6678     /* Handle updating of the required and first code units for other types of
6679     group. Update for normal brackets of all kinds, and conditions with two
6680     branches (see code above). If the bracket is followed by a quantifier with
6681     zero repeat, we have to back off. Hence the definition of zeroreqcu and
6682     zerofirstcu outside the main loop so that they can be accessed for the back
6683     off. */
6684 
6685     zeroreqcu = reqcu;
6686     zeroreqcuflags = reqcuflags;
6687     zerofirstcu = firstcu;
6688     zerofirstcuflags = firstcuflags;
6689     groupsetfirstcu = FALSE;
6690 
6691     if (bravalue >= OP_ONCE)  /* Not an assertion */
6692       {
6693       /* If we have not yet set a firstcu in this branch, take it from the
6694       subpattern, remembering that it was set here so that a repeat of more
6695       than one can replicate it as reqcu if necessary. If the subpattern has
6696       no firstcu, set "none" for the whole branch. In both cases, a zero
6697       repeat forces firstcu to "none". */
6698 
6699       if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
6700         {
6701         if (subfirstcuflags < REQ_NONE)
6702           {
6703           firstcu = subfirstcu;
6704           firstcuflags = subfirstcuflags;
6705           groupsetfirstcu = TRUE;
6706           }
6707         else firstcuflags = REQ_NONE;
6708         zerofirstcuflags = REQ_NONE;
6709         }
6710 
6711       /* If firstcu was previously set, convert the subpattern's firstcu
6712       into reqcu if there wasn't one, using the vary flag that was in
6713       existence beforehand. */
6714 
6715       else if (subfirstcuflags < REQ_NONE && subreqcuflags >= REQ_NONE)
6716         {
6717         subreqcu = subfirstcu;
6718         subreqcuflags = subfirstcuflags | tempreqvary;
6719         }
6720 
6721       /* If the subpattern set a required code unit (or set a first code unit
6722       that isn't really the first code unit - see above), set it. */
6723 
6724       if (subreqcuflags < REQ_NONE)
6725         {
6726         reqcu = subreqcu;
6727         reqcuflags = subreqcuflags;
6728         }
6729       }
6730 
6731     /* For a forward assertion, we take the reqcu, if set, provided that the
6732     group has also set a firstcu. This can be helpful if the pattern that
6733     follows the assertion doesn't set a different char. For example, it's
6734     useful for /(?=abcde).+/. We can't set firstcu for an assertion, however
6735     because it leads to incorrect effect for patterns such as /(?=a)a.+/ when
6736     the "real" "a" would then become a reqcu instead of a firstcu. This is
6737     overcome by a scan at the end if there's no firstcu, looking for an
6738     asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
6739     we must only take the reqcu when the group also set a firstcu. Otherwise,
6740     in that example, 'X' ends up set for both. */
6741 
6742     else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) &&
6743              subreqcuflags < REQ_NONE && subfirstcuflags < REQ_NONE)
6744       {
6745       reqcu = subreqcu;
6746       reqcuflags = subreqcuflags;
6747       }
6748 
6749     break;  /* End of nested group handling */
6750 
6751 
6752     /* ===================================================================*/
6753     /* Handle named backreferences and recursions. */
6754 
6755     case META_BACKREF_BYNAME:
6756     case META_RECURSE_BYNAME:
6757       {
6758       int count, index;
6759       PCRE2_SPTR name;
6760       BOOL is_dupname = FALSE;
6761       named_group *ng = cb->named_groups;
6762       uint32_t length = *(++pptr);
6763 
6764       GETPLUSOFFSET(offset, pptr);
6765       name = cb->start_pattern + offset;
6766 
6767       /* In the first pass, the names generated in the pre-pass are available,
6768       but the main name table has not yet been created. Scan the list of names
6769       generated in the pre-pass in order to get a number and whether or not
6770       this name is duplicated. */
6771 
6772       groupnumber = 0;
6773       for (unsigned int i = 0; i < cb->names_found; i++, ng++)
6774         {
6775         if (length == ng->length &&
6776             PRIV(strncmp)(name, ng->name, length) == 0)
6777           {
6778           is_dupname = ng->isdup;
6779           groupnumber = ng->number;
6780 
6781           /* For a recursion, that's all that is needed. We can now go to
6782           the code that handles numerical recursion, applying it to the first
6783           group with the given name. */
6784 
6785           if (meta == META_RECURSE_BYNAME)
6786             {
6787             meta_arg = groupnumber;
6788             goto HANDLE_NUMERICAL_RECURSION;
6789             }
6790 
6791           /* For a back reference, update the back reference map and the
6792           maximum back reference. */
6793 
6794           cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
6795           if (groupnumber > cb->top_backref)
6796             cb->top_backref = groupnumber;
6797           }
6798         }
6799 
6800       /* If the name was not found we have a bad reference. */
6801 
6802       if (groupnumber == 0)
6803         {
6804         *errorcodeptr = ERR15;
6805         cb->erroroffset = offset;
6806         return 0;
6807         }
6808 
6809       /* If a back reference name is not duplicated, we can handle it as
6810       a numerical reference. */
6811 
6812       if (!is_dupname)
6813         {
6814         meta_arg = groupnumber;
6815         goto HANDLE_SINGLE_REFERENCE;
6816         }
6817 
6818       /* If a back reference name is duplicated, we generate a different
6819       opcode to a numerical back reference. In the second pass we must
6820       search for the index and count in the final name table. */
6821 
6822       count = 0;  /* Values for first pass (avoids compiler warning) */
6823       index = 0;
6824       if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6825             &count, errorcodeptr, cb)) return 0;
6826 
6827       if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6828       *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
6829       PUT2INC(code, 0, index);
6830       PUT2INC(code, 0, count);
6831       }
6832     break;
6833 
6834 
6835     /* ===================================================================*/
6836     /* Handle a numerical callout. */
6837 
6838     case META_CALLOUT_NUMBER:
6839     code[0] = OP_CALLOUT;
6840     PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
6841     PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
6842     code[1 + 2*LINK_SIZE] = pptr[3];
6843     pptr += 3;
6844     code += PRIV(OP_lengths)[OP_CALLOUT];
6845     break;
6846 
6847 
6848     /* ===================================================================*/
6849     /* Handle a callout with a string argument. In the pre-pass we just compute
6850     the length without generating anything. The length in pptr[3] includes both
6851     delimiters; in the actual compile only the first one is copied, but a
6852     terminating zero is added. Any doubled delimiters within the string make
6853     this an overestimate, but it is not worth bothering about. */
6854 
6855     case META_CALLOUT_STRING:
6856     if (lengthptr != NULL)
6857       {
6858       *lengthptr += pptr[3] + (1 + 4*LINK_SIZE);
6859       pptr += 3;
6860       SKIPOFFSET(pptr);
6861       }
6862 
6863     /* In the real compile we can copy the string. The starting delimiter is
6864      included so that the client can discover it if they want. We also pass the
6865      start offset to help a script language give better error messages. */
6866 
6867     else
6868       {
6869       PCRE2_SPTR pp;
6870       uint32_t delimiter;
6871       uint32_t length = pptr[3];
6872       PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
6873 
6874       code[0] = OP_CALLOUT_STR;
6875       PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
6876       PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
6877 
6878       pptr += 3;
6879       GETPLUSOFFSET(offset, pptr);         /* Offset to string in pattern */
6880       pp = cb->start_pattern + offset;
6881       delimiter = *callout_string++ = *pp++;
6882       if (delimiter == CHAR_LEFT_CURLY_BRACKET)
6883         delimiter = CHAR_RIGHT_CURLY_BRACKET;
6884       PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1));  /* One after delimiter */
6885 
6886       /* The syntax of the pattern was checked in the parsing scan. The length
6887       includes both delimiters, but we have passed the opening one just above,
6888       so we reduce length before testing it. The test is for > 1 because we do
6889       not want to copy the final delimiter. This also ensures that pp[1] is
6890       accessible. */
6891 
6892       while (--length > 1)
6893         {
6894         if (*pp == delimiter && pp[1] == delimiter)
6895           {
6896           *callout_string++ = delimiter;
6897           pp += 2;
6898           length--;
6899           }
6900         else *callout_string++ = *pp++;
6901         }
6902       *callout_string++ = CHAR_NUL;
6903 
6904       /* Set the length of the entire item, then advance to its end. */
6905 
6906       PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code));
6907       code = callout_string;
6908       }
6909     break;
6910 
6911 
6912     /* ===================================================================*/
6913     /* Handle repetition. The different types are all sorted out in the parsing
6914     pass. */
6915 
6916     case META_MINMAX_PLUS:
6917     case META_MINMAX_QUERY:
6918     case META_MINMAX:
6919     repeat_min = *(++pptr);
6920     repeat_max = *(++pptr);
6921     goto REPEAT;
6922 
6923     case META_ASTERISK:
6924     case META_ASTERISK_PLUS:
6925     case META_ASTERISK_QUERY:
6926     repeat_min = 0;
6927     repeat_max = REPEAT_UNLIMITED;
6928     goto REPEAT;
6929 
6930     case META_PLUS:
6931     case META_PLUS_PLUS:
6932     case META_PLUS_QUERY:
6933     repeat_min = 1;
6934     repeat_max = REPEAT_UNLIMITED;
6935     goto REPEAT;
6936 
6937     case META_QUERY:
6938     case META_QUERY_PLUS:
6939     case META_QUERY_QUERY:
6940     repeat_min = 0;
6941     repeat_max = 1;
6942 
6943     REPEAT:
6944     if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
6945 
6946     /* Remember whether this is a variable length repeat, and default to
6947     single-char opcodes. */
6948 
6949     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
6950     op_type = 0;
6951 
6952     /* Adjust first and required code units for a zero repeat. */
6953 
6954     if (repeat_min == 0)
6955       {
6956       firstcu = zerofirstcu;
6957       firstcuflags = zerofirstcuflags;
6958       reqcu = zeroreqcu;
6959       reqcuflags = zeroreqcuflags;
6960       }
6961 
6962     /* Note the greediness and possessiveness. */
6963 
6964     switch (meta)
6965       {
6966       case META_MINMAX_PLUS:
6967       case META_ASTERISK_PLUS:
6968       case META_PLUS_PLUS:
6969       case META_QUERY_PLUS:
6970       repeat_type = 0;                  /* Force greedy */
6971       possessive_quantifier = TRUE;
6972       break;
6973 
6974       case META_MINMAX_QUERY:
6975       case META_ASTERISK_QUERY:
6976       case META_PLUS_QUERY:
6977       case META_QUERY_QUERY:
6978       repeat_type = greedy_non_default;
6979       possessive_quantifier = FALSE;
6980       break;
6981 
6982       default:
6983       repeat_type = greedy_default;
6984       possessive_quantifier = FALSE;
6985       break;
6986       }
6987 
6988     /* Save start of previous item, in case we have to move it up in order to
6989     insert something before it, and remember what it was. */
6990 
6991     tempcode = previous;
6992     op_previous = *previous;
6993 
6994     /* Now handle repetition for the different types of item. If the repeat
6995     minimum and the repeat maximum are both 1, we can ignore the quantifier for
6996     non-parenthesized items, as they have only one alternative. For anything in
6997     parentheses, we must not ignore if {1} is possessive. */
6998 
6999     switch (op_previous)
7000       {
7001       /* If previous was a character or negated character match, abolish the
7002       item and generate a repeat item instead. If a char item has a minimum of
7003       more than one, ensure that it is set in reqcu - it might not be if a
7004       sequence such as x{3} is the first thing in a branch because the x will
7005       have gone into firstcu instead.  */
7006 
7007       case OP_CHAR:
7008       case OP_CHARI:
7009       case OP_NOT:
7010       case OP_NOTI:
7011       if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7012       op_type = chartypeoffset[op_previous - OP_CHAR];
7013 
7014       /* Deal with UTF characters that take up more than one code unit. */
7015 
7016 #ifdef MAYBE_UTF_MULTI
7017       if (utf && NOT_FIRSTCU(code[-1]))
7018         {
7019         PCRE2_UCHAR *lastchar = code - 1;
7020         BACKCHAR(lastchar);
7021         mclength = (uint32_t)(code - lastchar);   /* Length of UTF character */
7022         memcpy(mcbuffer, lastchar, CU2BYTES(mclength));  /* Save the char */
7023         }
7024       else
7025 #endif  /* MAYBE_UTF_MULTI */
7026 
7027       /* Handle the case of a single code unit - either with no UTF support, or
7028       with UTF disabled, or for a single-code-unit UTF character. In the latter
7029       case, for a repeated positive match, get the caseless flag for the
7030       required code unit from the previous character, because a class like [Aa]
7031       sets a caseless A but by now the req_caseopt flag has been reset. */
7032 
7033         {
7034         mcbuffer[0] = code[-1];
7035         mclength = 1;
7036         if (op_previous <= OP_CHARI && repeat_min > 1)
7037           {
7038           reqcu = mcbuffer[0];
7039           reqcuflags = cb->req_varyopt;
7040           if (op_previous == OP_CHARI) reqcuflags |= REQ_CASELESS;
7041           }
7042         }
7043       goto OUTPUT_SINGLE_REPEAT;  /* Code shared with single character types */
7044 
7045       /* If previous was a character class or a back reference, we put the
7046       repeat stuff after it, but just skip the item if the repeat was {0,0}. */
7047 
7048 #ifdef SUPPORT_WIDE_CHARS
7049       case OP_XCLASS:
7050 #endif
7051       case OP_CLASS:
7052       case OP_NCLASS:
7053       case OP_REF:
7054       case OP_REFI:
7055       case OP_DNREF:
7056       case OP_DNREFI:
7057 
7058       if (repeat_max == 0)
7059         {
7060         code = previous;
7061         goto END_REPEAT;
7062         }
7063       if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7064 
7065       if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)
7066         *code++ = OP_CRSTAR + repeat_type;
7067       else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED)
7068         *code++ = OP_CRPLUS + repeat_type;
7069       else if (repeat_min == 0 && repeat_max == 1)
7070         *code++ = OP_CRQUERY + repeat_type;
7071       else
7072         {
7073         *code++ = OP_CRRANGE + repeat_type;
7074         PUT2INC(code, 0, repeat_min);
7075         if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0;  /* 2-byte encoding for max */
7076         PUT2INC(code, 0, repeat_max);
7077         }
7078       break;
7079 
7080       /* If previous is OP_FAIL, it was generated by an empty class []
7081       (PCRE2_ALLOW_EMPTY_CLASS is set). The other ways in which OP_FAIL can be
7082       generated, that is by (*FAIL) or (?!), disallow a quantifier at parse
7083       time. We can just ignore this repeat. */
7084 
7085       case OP_FAIL:
7086       goto END_REPEAT;
7087 
7088       /* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets
7089       because pcre2_match() could not handle backtracking into recursively
7090       called groups. Now that this backtracking is available, we no longer need
7091       to do this. However, we still need to replicate recursions as we do for
7092       groups so as to have independent backtracking points. We can replicate
7093       for the minimum number of repeats directly. For optional repeats we now
7094       wrap the recursion in OP_BRA brackets and make use of the bracket
7095       repetition. */
7096 
7097       case OP_RECURSE:
7098       if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7099         goto END_REPEAT;
7100 
7101       /* Generate unwrapped repeats for a non-zero minimum, except when the
7102       minimum is 1 and the maximum unlimited, because that can be handled with
7103       OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the
7104       minimum, we just need to generate the appropriate additional copies.
7105       Otherwise we need to generate one more, to simulate the situation when
7106       the minimum is zero. */
7107 
7108       if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED))
7109         {
7110         int replicate = repeat_min;
7111         if (repeat_min == repeat_max) replicate--;
7112 
7113         /* In the pre-compile phase, we don't actually do the replication. We
7114         just adjust the length as if we had. Do some paranoid checks for
7115         potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
7116         integer type when available, otherwise double. */
7117 
7118         if (lengthptr != NULL)
7119           {
7120           PCRE2_SIZE delta = replicate*(1 + LINK_SIZE);
7121           if ((INT64_OR_DOUBLE)replicate*
7122                 (INT64_OR_DOUBLE)(1 + LINK_SIZE) >
7123                   (INT64_OR_DOUBLE)INT_MAX ||
7124               OFLOW_MAX - *lengthptr < delta)
7125             {
7126             *errorcodeptr = ERR20;
7127             return 0;
7128             }
7129           *lengthptr += delta;
7130           }
7131 
7132         else for (int i = 0; i < replicate; i++)
7133           {
7134           memcpy(code, previous, CU2BYTES(1 + LINK_SIZE));
7135           previous = code;
7136           code += 1 + LINK_SIZE;
7137           }
7138 
7139         /* If the number of repeats is fixed, we are done. Otherwise, adjust
7140         the counts and fall through. */
7141 
7142         if (repeat_min == repeat_max) break;
7143         if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7144         repeat_min = 0;
7145         }
7146 
7147       /* Wrap the recursion call in OP_BRA brackets. */
7148 
7149       (void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE));
7150       op_previous = *previous = OP_BRA;
7151       PUT(previous, 1, 2 + 2*LINK_SIZE);
7152       previous[2 + 2*LINK_SIZE] = OP_KET;
7153       PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
7154       code += 2 + 2 * LINK_SIZE;
7155       length_prevgroup = 3 + 3*LINK_SIZE;
7156       group_return = -1;  /* Set "may match empty string" */
7157 
7158       /* Now treat as a repeated OP_BRA. */
7159       /* Fall through */
7160 
7161       /* If previous was a bracket group, we may have to replicate it in
7162       certain cases. Note that at this point we can encounter only the "basic"
7163       bracket opcodes such as BRA and CBRA, as this is the place where they get
7164       converted into the more special varieties such as BRAPOS and SBRA.
7165       Originally, PCRE did not allow repetition of assertions, but now it does,
7166       for Perl compatibility. */
7167 
7168       case OP_ASSERT:
7169       case OP_ASSERT_NOT:
7170       case OP_ASSERT_NA:
7171       case OP_ASSERTBACK:
7172       case OP_ASSERTBACK_NOT:
7173       case OP_ASSERTBACK_NA:
7174       case OP_ONCE:
7175       case OP_SCRIPT_RUN:
7176       case OP_BRA:
7177       case OP_CBRA:
7178       case OP_COND:
7179         {
7180         int len = (int)(code - previous);
7181         PCRE2_UCHAR *bralink = NULL;
7182         PCRE2_UCHAR *brazeroptr = NULL;
7183 
7184         if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7185           goto END_REPEAT;
7186 
7187         /* Repeating a DEFINE group (or any group where the condition is always
7188         FALSE and there is only one branch) is pointless, but Perl allows the
7189         syntax, so we just ignore the repeat. */
7190 
7191         if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
7192             previous[GET(previous, 1)] != OP_ALT)
7193           goto END_REPEAT;
7194 
7195         /* Perl allows all assertions to be quantified, and when they contain
7196         capturing parentheses and/or are optional there are potential uses for
7197         this feature. PCRE2 used to force the maximum quantifier to 1 on the
7198         invalid grounds that further repetition was never useful. This was
7199         always a bit pointless, since an assertion could be wrapped with a
7200         repeated group to achieve the effect. General repetition is now
7201         permitted, but if the maximum is unlimited it is set to one more than
7202         the minimum. */
7203 
7204         if (op_previous < OP_ONCE)    /* Assertion */
7205           {
7206           if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1;
7207           }
7208 
7209         /* The case of a zero minimum is special because of the need to stick
7210         OP_BRAZERO in front of it, and because the group appears once in the
7211         data, whereas in other cases it appears the minimum number of times. For
7212         this reason, it is simplest to treat this case separately, as otherwise
7213         the code gets far too messy. There are several special subcases when the
7214         minimum is zero. */
7215 
7216         if (repeat_min == 0)
7217           {
7218           /* If the maximum is also zero, we used to just omit the group from
7219           the output altogether, like this:
7220 
7221           ** if (repeat_max == 0)
7222           **   {
7223           **   code = previous;
7224           **   goto END_REPEAT;
7225           **   }
7226 
7227           However, that fails when a group or a subgroup within it is
7228           referenced as a subroutine from elsewhere in the pattern, so now we
7229           stick in OP_SKIPZERO in front of it so that it is skipped on
7230           execution. As we don't have a list of which groups are referenced, we
7231           cannot do this selectively.
7232 
7233           If the maximum is 1 or unlimited, we just have to stick in the
7234           BRAZERO and do no more at this point. */
7235 
7236           if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED)
7237             {
7238             (void)memmove(previous + 1, previous, CU2BYTES(len));
7239             code++;
7240             if (repeat_max == 0)
7241               {
7242               *previous++ = OP_SKIPZERO;
7243               goto END_REPEAT;
7244               }
7245             brazeroptr = previous;    /* Save for possessive optimizing */
7246             *previous++ = OP_BRAZERO + repeat_type;
7247             }
7248 
7249           /* If the maximum is greater than 1 and limited, we have to replicate
7250           in a nested fashion, sticking OP_BRAZERO before each set of brackets.
7251           The first one has to be handled carefully because it's the original
7252           copy, which has to be moved up. The remainder can be handled by code
7253           that is common with the non-zero minimum case below. We have to
7254           adjust the value of repeat_max, since one less copy is required. */
7255 
7256           else
7257             {
7258             int linkoffset;
7259             (void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
7260             code += 2 + LINK_SIZE;
7261             *previous++ = OP_BRAZERO + repeat_type;
7262             *previous++ = OP_BRA;
7263 
7264             /* We chain together the bracket link offset fields that have to be
7265             filled in later when the ends of the brackets are reached. */
7266 
7267             linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink);
7268             bralink = previous;
7269             PUTINC(previous, 0, linkoffset);
7270             }
7271 
7272           if (repeat_max != REPEAT_UNLIMITED) repeat_max--;
7273           }
7274 
7275         /* If the minimum is greater than zero, replicate the group as many
7276         times as necessary, and adjust the maximum to the number of subsequent
7277         copies that we need. */
7278 
7279         else
7280           {
7281           if (repeat_min > 1)
7282             {
7283             /* In the pre-compile phase, we don't actually do the replication.
7284             We just adjust the length as if we had. Do some paranoid checks for
7285             potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
7286             integer type when available, otherwise double. */
7287 
7288             if (lengthptr != NULL)
7289               {
7290               PCRE2_SIZE delta = (repeat_min - 1)*length_prevgroup;
7291               if ((INT64_OR_DOUBLE)(repeat_min - 1)*
7292                     (INT64_OR_DOUBLE)length_prevgroup >
7293                       (INT64_OR_DOUBLE)INT_MAX ||
7294                   OFLOW_MAX - *lengthptr < delta)
7295                 {
7296                 *errorcodeptr = ERR20;
7297                 return 0;
7298                 }
7299               *lengthptr += delta;
7300               }
7301 
7302             /* This is compiling for real. If there is a set first code unit
7303             for the group, and we have not yet set a "required code unit", set
7304             it. */
7305 
7306             else
7307               {
7308               if (groupsetfirstcu && reqcuflags >= REQ_NONE)
7309                 {
7310                 reqcu = firstcu;
7311                 reqcuflags = firstcuflags;
7312                 }
7313               for (uint32_t i = 1; i < repeat_min; i++)
7314                 {
7315                 memcpy(code, previous, CU2BYTES(len));
7316                 code += len;
7317                 }
7318               }
7319             }
7320 
7321           if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7322           }
7323 
7324         /* This code is common to both the zero and non-zero minimum cases. If
7325         the maximum is limited, it replicates the group in a nested fashion,
7326         remembering the bracket starts on a stack. In the case of a zero
7327         minimum, the first one was set up above. In all cases the repeat_max
7328         now specifies the number of additional copies needed. Again, we must
7329         remember to replicate entries on the forward reference list. */
7330 
7331         if (repeat_max != REPEAT_UNLIMITED)
7332           {
7333           /* In the pre-compile phase, we don't actually do the replication. We
7334           just adjust the length as if we had. For each repetition we must add
7335           1 to the length for BRAZERO and for all but the last repetition we
7336           must add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
7337           paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type
7338           is a 64-bit integer type when available, otherwise double. */
7339 
7340           if (lengthptr != NULL && repeat_max > 0)
7341             {
7342             PCRE2_SIZE delta = repeat_max*(length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
7343                         2 - 2*LINK_SIZE;   /* Last one doesn't nest */
7344             if ((INT64_OR_DOUBLE)repeat_max *
7345                   (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
7346                     > (INT64_OR_DOUBLE)INT_MAX ||
7347                 OFLOW_MAX - *lengthptr < delta)
7348               {
7349               *errorcodeptr = ERR20;
7350               return 0;
7351               }
7352             *lengthptr += delta;
7353             }
7354 
7355           /* This is compiling for real */
7356 
7357           else for (uint32_t i = repeat_max; i >= 1; i--)
7358             {
7359             *code++ = OP_BRAZERO + repeat_type;
7360 
7361             /* All but the final copy start a new nesting, maintaining the
7362             chain of brackets outstanding. */
7363 
7364             if (i != 1)
7365               {
7366               int linkoffset;
7367               *code++ = OP_BRA;
7368               linkoffset = (bralink == NULL)? 0 : (int)(code - bralink);
7369               bralink = code;
7370               PUTINC(code, 0, linkoffset);
7371               }
7372 
7373             memcpy(code, previous, CU2BYTES(len));
7374             code += len;
7375             }
7376 
7377           /* Now chain through the pending brackets, and fill in their length
7378           fields (which are holding the chain links pro tem). */
7379 
7380           while (bralink != NULL)
7381             {
7382             int oldlinkoffset;
7383             int linkoffset = (int)(code - bralink + 1);
7384             PCRE2_UCHAR *bra = code - linkoffset;
7385             oldlinkoffset = GET(bra, 1);
7386             bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
7387             *code++ = OP_KET;
7388             PUTINC(code, 0, linkoffset);
7389             PUT(bra, 1, linkoffset);
7390             }
7391           }
7392 
7393         /* If the maximum is unlimited, set a repeater in the final copy. For
7394         SCRIPT_RUN and ONCE brackets, that's all we need to do. However,
7395         possessively repeated ONCE brackets can be converted into non-capturing
7396         brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this
7397         saves having to deal with possessive ONCEs specially.
7398 
7399         Otherwise, when we are doing the actual compile phase, check to see
7400         whether this group is one that could match an empty string. If so,
7401         convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
7402         that runtime checking can be done. [This check is also applied to ONCE
7403         and SCRIPT_RUN groups at runtime, but in a different way.]
7404 
7405         Then, if the quantifier was possessive and the bracket is not a
7406         conditional, we convert the BRA code to the POS form, and the KET code
7407         to KETRPOS. (It turns out to be convenient at runtime to detect this
7408         kind of subpattern at both the start and at the end.) The use of
7409         special opcodes makes it possible to reduce greatly the stack usage in
7410         pcre2_match(). If the group is preceded by OP_BRAZERO, convert this to
7411         OP_BRAPOSZERO.
7412 
7413         Then, if the minimum number of matches is 1 or 0, cancel the possessive
7414         flag so that the default action below, of wrapping everything inside
7415         atomic brackets, does not happen. When the minimum is greater than 1,
7416         there will be earlier copies of the group, and so we still have to wrap
7417         the whole thing. */
7418 
7419         else
7420           {
7421           PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
7422           PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
7423 
7424           /* Convert possessive ONCE brackets to non-capturing */
7425 
7426           if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
7427 
7428           /* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need
7429           to do is to set the KET. */
7430 
7431           if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN)
7432             *ketcode = OP_KETRMAX + repeat_type;
7433 
7434           /* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs
7435           (which have been converted to non-capturing above). */
7436 
7437           else
7438             {
7439             /* In the compile phase, adjust the opcode if the group can match
7440             an empty string. For a conditional group with only one branch, the
7441             value of group_return will not show "could be empty", so we must
7442             check that separately. */
7443 
7444             if (lengthptr == NULL)
7445               {
7446               if (group_return < 0) *bracode += OP_SBRA - OP_BRA;
7447               if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
7448                 *bracode = OP_SCOND;
7449               }
7450 
7451             /* Handle possessive quantifiers. */
7452 
7453             if (possessive_quantifier)
7454               {
7455               /* For COND brackets, we wrap the whole thing in a possessively
7456               repeated non-capturing bracket, because we have not invented POS
7457               versions of the COND opcodes. */
7458 
7459               if (*bracode == OP_COND || *bracode == OP_SCOND)
7460                 {
7461                 int nlen = (int)(code - bracode);
7462                 (void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
7463                 code += 1 + LINK_SIZE;
7464                 nlen += 1 + LINK_SIZE;
7465                 *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
7466                 *code++ = OP_KETRPOS;
7467                 PUTINC(code, 0, nlen);
7468                 PUT(bracode, 1, nlen);
7469                 }
7470 
7471               /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
7472 
7473               else
7474                 {
7475                 *bracode += 1;              /* Switch to xxxPOS opcodes */
7476                 *ketcode = OP_KETRPOS;
7477                 }
7478 
7479               /* If the minimum is zero, mark it as possessive, then unset the
7480               possessive flag when the minimum is 0 or 1. */
7481 
7482               if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
7483               if (repeat_min < 2) possessive_quantifier = FALSE;
7484               }
7485 
7486             /* Non-possessive quantifier */
7487 
7488             else *ketcode = OP_KETRMAX + repeat_type;
7489             }
7490           }
7491         }
7492       break;
7493 
7494       /* If previous was a character type match (\d or similar), abolish it and
7495       create a suitable repeat item. The code is shared with single-character
7496       repeats by setting op_type to add a suitable offset into repeat_type.
7497       Note that the Unicode property types will be present only when
7498       SUPPORT_UNICODE is defined, but we don't wrap the little bits of code
7499       here because it just makes it horribly messy. */
7500 
7501       default:
7502       if (op_previous >= OP_EODN)   /* Not a character type - internal error */
7503         {
7504         *errorcodeptr = ERR10;
7505         return 0;
7506         }
7507       else
7508         {
7509         int prop_type, prop_value;
7510         PCRE2_UCHAR *oldcode;
7511 
7512         if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7513 
7514         op_type = OP_TYPESTAR - OP_STAR;      /* Use type opcodes */
7515         mclength = 0;                         /* Not a character */
7516 
7517         if (op_previous == OP_PROP || op_previous == OP_NOTPROP)
7518           {
7519           prop_type = previous[1];
7520           prop_value = previous[2];
7521           }
7522         else
7523           {
7524           /* Come here from just above with a character in mcbuffer/mclength. */
7525           OUTPUT_SINGLE_REPEAT:
7526           prop_type = prop_value = -1;
7527           }
7528 
7529         /* At this point, if prop_type == prop_value == -1 we either have a
7530         character in mcbuffer when mclength is greater than zero, or we have
7531         mclength zero, in which case there is a non-property character type in
7532         op_previous. If prop_type/value are not negative, we have a property
7533         character type in op_previous. */
7534 
7535         oldcode = code;                   /* Save where we were */
7536         code = previous;                  /* Usually overwrite previous item */
7537 
7538         /* If the maximum is zero then the minimum must also be zero; Perl allows
7539         this case, so we do too - by simply omitting the item altogether. */
7540 
7541         if (repeat_max == 0) goto END_REPEAT;
7542 
7543         /* Combine the op_type with the repeat_type */
7544 
7545         repeat_type += op_type;
7546 
7547         /* A minimum of zero is handled either as the special case * or ?, or as
7548         an UPTO, with the maximum given. */
7549 
7550         if (repeat_min == 0)
7551           {
7552           if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;
7553             else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
7554           else
7555             {
7556             *code++ = OP_UPTO + repeat_type;
7557             PUT2INC(code, 0, repeat_max);
7558             }
7559           }
7560 
7561         /* A repeat minimum of 1 is optimized into some special cases. If the
7562         maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
7563         left in place and, if the maximum is greater than 1, we use OP_UPTO with
7564         one less than the maximum. */
7565 
7566         else if (repeat_min == 1)
7567           {
7568           if (repeat_max == REPEAT_UNLIMITED)
7569             *code++ = OP_PLUS + repeat_type;
7570           else
7571             {
7572             code = oldcode;  /* Leave previous item in place */
7573             if (repeat_max == 1) goto END_REPEAT;
7574             *code++ = OP_UPTO + repeat_type;
7575             PUT2INC(code, 0, repeat_max - 1);
7576             }
7577           }
7578 
7579         /* The case {n,n} is just an EXACT, while the general case {n,m} is
7580         handled as an EXACT followed by an UPTO or STAR or QUERY. */
7581 
7582         else
7583           {
7584           *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
7585           PUT2INC(code, 0, repeat_min);
7586 
7587           /* Unless repeat_max equals repeat_min, fill in the data for EXACT,
7588           and then generate the second opcode. For a repeated Unicode property
7589           match, there are two extra values that define the required property,
7590           and mclength is set zero to indicate this. */
7591 
7592           if (repeat_max != repeat_min)
7593             {
7594             if (mclength > 0)
7595               {
7596               memcpy(code, mcbuffer, CU2BYTES(mclength));
7597               code += mclength;
7598               }
7599             else
7600               {
7601               *code++ = op_previous;
7602               if (prop_type >= 0)
7603                 {
7604                 *code++ = prop_type;
7605                 *code++ = prop_value;
7606                 }
7607               }
7608 
7609             /* Now set up the following opcode */
7610 
7611             if (repeat_max == REPEAT_UNLIMITED)
7612               *code++ = OP_STAR + repeat_type;
7613             else
7614               {
7615               repeat_max -= repeat_min;
7616               if (repeat_max == 1)
7617                 {
7618                 *code++ = OP_QUERY + repeat_type;
7619                 }
7620               else
7621                 {
7622                 *code++ = OP_UPTO + repeat_type;
7623                 PUT2INC(code, 0, repeat_max);
7624                 }
7625               }
7626             }
7627           }
7628 
7629         /* Fill in the character or character type for the final opcode. */
7630 
7631         if (mclength > 0)
7632           {
7633           memcpy(code, mcbuffer, CU2BYTES(mclength));
7634           code += mclength;
7635           }
7636         else
7637           {
7638           *code++ = op_previous;
7639           if (prop_type >= 0)
7640             {
7641             *code++ = prop_type;
7642             *code++ = prop_value;
7643             }
7644           }
7645         }
7646       break;
7647       }  /* End of switch on different op_previous values */
7648 
7649 
7650     /* If the character following a repeat is '+', possessive_quantifier is
7651     TRUE. For some opcodes, there are special alternative opcodes for this
7652     case. For anything else, we wrap the entire repeated item inside OP_ONCE
7653     brackets. Logically, the '+' notation is just syntactic sugar, taken from
7654     Sun's Java package, but the special opcodes can optimize it.
7655 
7656     Some (but not all) possessively repeated subpatterns have already been
7657     completely handled in the code just above. For them, possessive_quantifier
7658     is always FALSE at this stage. Note that the repeated item starts at
7659     tempcode, not at previous, which might be the first part of a string whose
7660     (former) last char we repeated. */
7661 
7662     if (possessive_quantifier)
7663       {
7664       int len;
7665 
7666       /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
7667       However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
7668       {5,}, or {5,10}). We skip over an EXACT item; if the length of what
7669       remains is greater than zero, there's a further opcode that can be
7670       handled. If not, do nothing, leaving the EXACT alone. */
7671 
7672       switch(*tempcode)
7673         {
7674         case OP_TYPEEXACT:
7675         tempcode += PRIV(OP_lengths)[*tempcode] +
7676           ((tempcode[1 + IMM2_SIZE] == OP_PROP
7677           || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
7678         break;
7679 
7680         /* CHAR opcodes are used for exacts whose count is 1. */
7681 
7682         case OP_CHAR:
7683         case OP_CHARI:
7684         case OP_NOT:
7685         case OP_NOTI:
7686         case OP_EXACT:
7687         case OP_EXACTI:
7688         case OP_NOTEXACT:
7689         case OP_NOTEXACTI:
7690         tempcode += PRIV(OP_lengths)[*tempcode];
7691 #ifdef SUPPORT_UNICODE
7692         if (utf && HAS_EXTRALEN(tempcode[-1]))
7693           tempcode += GET_EXTRALEN(tempcode[-1]);
7694 #endif
7695         break;
7696 
7697         /* For the class opcodes, the repeat operator appears at the end;
7698         adjust tempcode to point to it. */
7699 
7700         case OP_CLASS:
7701         case OP_NCLASS:
7702         tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
7703         break;
7704 
7705 #ifdef SUPPORT_WIDE_CHARS
7706         case OP_XCLASS:
7707         tempcode += GET(tempcode, 1);
7708         break;
7709 #endif
7710         }
7711 
7712       /* If tempcode is equal to code (which points to the end of the repeated
7713       item), it means we have skipped an EXACT item but there is no following
7714       QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
7715       all other cases, tempcode will be pointing to the repeat opcode, and will
7716       be less than code, so the value of len will be greater than 0. */
7717 
7718       len = (int)(code - tempcode);
7719       if (len > 0)
7720         {
7721         unsigned int repcode = *tempcode;
7722 
7723         /* There is a table for possessifying opcodes, all of which are less
7724         than OP_CALLOUT. A zero entry means there is no possessified version.
7725         */
7726 
7727         if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
7728           *tempcode = opcode_possessify[repcode];
7729 
7730         /* For opcode without a special possessified version, wrap the item in
7731         ONCE brackets. */
7732 
7733         else
7734           {
7735           (void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
7736           code += 1 + LINK_SIZE;
7737           len += 1 + LINK_SIZE;
7738           tempcode[0] = OP_ONCE;
7739           *code++ = OP_KET;
7740           PUTINC(code, 0, len);
7741           PUT(tempcode, 1, len);
7742           }
7743         }
7744       }
7745 
7746     /* We set the "follows varying string" flag for subsequently encountered
7747     reqcus if it isn't already set and we have just passed a varying length
7748     item. */
7749 
7750     END_REPEAT:
7751     cb->req_varyopt |= reqvary;
7752     break;
7753 
7754 
7755     /* ===================================================================*/
7756     /* Handle a 32-bit data character with a value greater than META_END. */
7757 
7758     case META_BIGVALUE:
7759     pptr++;
7760     goto NORMAL_CHAR;
7761 
7762 
7763     /* ===============================================================*/
7764     /* Handle a back reference by number, which is the meta argument. The
7765     pattern offsets for back references to group numbers less than 10 are held
7766     in a special vector, to avoid using more than two parsed pattern elements
7767     in 64-bit environments. We only need the offset to the first occurrence,
7768     because if that doesn't fail, subsequent ones will also be OK. */
7769 
7770     case META_BACKREF:
7771     if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg];
7772       else GETPLUSOFFSET(offset, pptr);
7773 
7774     if (meta_arg > cb->bracount)
7775       {
7776       cb->erroroffset = offset;
7777       *errorcodeptr = ERR15;  /* Non-existent subpattern */
7778       return 0;
7779       }
7780 
7781     /* Come here from named backref handling when the reference is to a
7782     single group (that is, not to a duplicated name). The back reference
7783     data will have already been updated. We must disable firstcu if not
7784     set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'
7785     later. */
7786 
7787     HANDLE_SINGLE_REFERENCE:
7788     if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
7789     *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
7790     PUT2INC(code, 0, meta_arg);
7791 
7792     /* Update the map of back references, and keep the highest one. We
7793     could do this in parse_regex() for numerical back references, but not
7794     for named back references, because we don't know the numbers to which
7795     named back references refer. So we do it all in this function. */
7796 
7797     cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;
7798     if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
7799     break;
7800 
7801 
7802     /* ===============================================================*/
7803     /* Handle recursion by inserting the number of the called group (which is
7804     the meta argument) after OP_RECURSE. At the end of compiling the pattern is
7805     scanned and these numbers are replaced by offsets within the pattern. It is
7806     done like this to avoid problems with forward references and adjusting
7807     offsets when groups are duplicated and moved (as discovered in previous
7808     implementations). Note that a recursion does not have a set first
7809     character. */
7810 
7811     case META_RECURSE:
7812     GETPLUSOFFSET(offset, pptr);
7813     if (meta_arg > cb->bracount)
7814       {
7815       cb->erroroffset = offset;
7816       *errorcodeptr = ERR15;  /* Non-existent subpattern */
7817       return 0;
7818       }
7819     HANDLE_NUMERICAL_RECURSION:
7820     *code = OP_RECURSE;
7821     PUT(code, 1, meta_arg);
7822     code += 1 + LINK_SIZE;
7823     groupsetfirstcu = FALSE;
7824     cb->had_recurse = TRUE;
7825     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7826     zerofirstcu = firstcu;
7827     zerofirstcuflags = firstcuflags;
7828     break;
7829 
7830 
7831     /* ===============================================================*/
7832     /* Handle capturing parentheses; the number is the meta argument. */
7833 
7834     case META_CAPTURE:
7835     bravalue = OP_CBRA;
7836     skipunits = IMM2_SIZE;
7837     PUT2(code, 1+LINK_SIZE, meta_arg);
7838     cb->lastcapture = meta_arg;
7839     goto GROUP_PROCESS_NOTE_EMPTY;
7840 
7841 
7842     /* ===============================================================*/
7843     /* Handle escape sequence items. For ones like \d, the ESC_values are
7844     arranged to be the same as the corresponding OP_values in the default case
7845     when PCRE2_UCP is not set (which is the only case in which they will appear
7846     here).
7847 
7848     Note: \Q and \E are never seen here, as they were dealt with in
7849     parse_regex(). Neither are numerical back references or recursions, which
7850     were turned into META_BACKREF or META_RECURSE items, respectively. \k and
7851     \g, when followed by names, are turned into META_BACKREF_BYNAME or
7852     META_RECURSE_BYNAME. */
7853 
7854     case META_ESCAPE:
7855 
7856     /* We can test for escape sequences that consume a character because their
7857     values lie between ESC_b and ESC_Z; this may have to change if any new ones
7858     are ever created. For these sequences, we disable the setting of a first
7859     character if it hasn't already been set. */
7860 
7861     if (meta_arg > ESC_b && meta_arg < ESC_Z)
7862       {
7863       matched_char = TRUE;
7864       if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7865       }
7866 
7867     /* Set values to reset to if this is followed by a zero repeat. */
7868 
7869     zerofirstcu = firstcu;
7870     zerofirstcuflags = firstcuflags;
7871     zeroreqcu = reqcu;
7872     zeroreqcuflags = reqcuflags;
7873 
7874     /* If Unicode is not supported, \P and \p are not allowed and are
7875     faulted at parse time, so will never appear here. */
7876 
7877 #ifdef SUPPORT_UNICODE
7878     if (meta_arg == ESC_P || meta_arg == ESC_p)
7879       {
7880       uint32_t ptype = *(++pptr) >> 16;
7881       uint32_t pdata = *pptr & 0xffff;
7882 
7883       /* The special case of \p{Any} is compiled to OP_ALLANY so as to benefit
7884       from the auto-anchoring code. */
7885 
7886       if (meta_arg == ESC_p && ptype == PT_ANY)
7887         {
7888         *code++ = OP_ALLANY;
7889         }
7890       else
7891         {
7892         *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
7893         *code++ = ptype;
7894         *code++ = pdata;
7895         }
7896       break;  /* End META_ESCAPE */
7897       }
7898 #endif
7899 
7900     /* \K is forbidden in lookarounds since 10.38 because that's what Perl has
7901     done. However, there's an option, in case anyone was relying on it. */
7902 
7903     if (cb->assert_depth > 0 && meta_arg == ESC_K &&
7904         (cb->cx->extra_options & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) == 0)
7905       {
7906       *errorcodeptr = ERR99;
7907       return 0;
7908       }
7909 
7910     /* For the rest (including \X when Unicode is supported - if not it's
7911     faulted at parse time), the OP value is the escape value when PCRE2_UCP is
7912     not set; if it is set, these escapes do not show up here because they are
7913     converted into Unicode property tests in parse_regex(). Note that \b and \B
7914     do a one-character lookbehind, and \A also behaves as if it does. */
7915 
7916     if (meta_arg == ESC_C) cb->external_flags |= PCRE2_HASBKC; /* Record */
7917     if ((meta_arg == ESC_b || meta_arg == ESC_B || meta_arg == ESC_A) &&
7918          cb->max_lookbehind == 0)
7919       cb->max_lookbehind = 1;
7920 
7921     /* In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY
7922     instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds. */
7923 
7924 #if PCRE2_CODE_UNIT_WIDTH == 32
7925     *code++ = (meta_arg == ESC_C)? OP_ALLANY : meta_arg;
7926 #else
7927     *code++ = (!utf && meta_arg == ESC_C)? OP_ALLANY : meta_arg;
7928 #endif
7929     break;  /* End META_ESCAPE */
7930 
7931 
7932     /* ===================================================================*/
7933     /* Handle an unrecognized meta value. A parsed pattern value less than
7934     META_END is a literal. Otherwise we have a problem. */
7935 
7936     default:
7937     if (meta >= META_END)
7938       {
7939 #ifdef DEBUG_SHOW_PARSED
7940       fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x\n", *pptr);
7941 #endif
7942       *errorcodeptr = ERR89;  /* Internal error - unrecognized. */
7943       return 0;
7944       }
7945 
7946     /* Handle a literal character. We come here by goto in the case of a
7947     32-bit, non-UTF character whose value is greater than META_END. */
7948 
7949     NORMAL_CHAR:
7950     meta = *pptr;     /* Get the full 32 bits */
7951     NORMAL_CHAR_SET:  /* Character is already in meta */
7952     matched_char = TRUE;
7953 
7954     /* For caseless UTF or UCP mode, check whether this character has more than
7955     one other case. If so, generate a special OP_PROP item instead of OP_CHARI.
7956     */
7957 
7958 #ifdef SUPPORT_UNICODE
7959     if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
7960       {
7961       uint32_t caseset = UCD_CASESET(meta);
7962       if (caseset != 0)
7963         {
7964         *code++ = OP_PROP;
7965         *code++ = PT_CLIST;
7966         *code++ = caseset;
7967         if (firstcuflags == REQ_UNSET)
7968           firstcuflags = zerofirstcuflags = REQ_NONE;
7969         break;  /* End handling this meta item */
7970         }
7971       }
7972 #endif
7973 
7974     /* Caseful matches, or caseless and not one of the multicase characters. We
7975     come here by goto in the case of a positive class that contains only
7976     case-partners of a character with just two cases; matched_char has already
7977     been set TRUE and options fudged if necessary. */
7978 
7979     CLASS_CASELESS_CHAR:
7980 
7981     /* Get the character's code units into mcbuffer, with the length in
7982     mclength. When not in UTF mode, the length is always 1. */
7983 
7984 #ifdef SUPPORT_UNICODE
7985     if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
7986 #endif
7987       {
7988       mclength = 1;
7989       mcbuffer[0] = meta;
7990       }
7991 
7992     /* Generate the appropriate code */
7993 
7994     *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
7995     memcpy(code, mcbuffer, CU2BYTES(mclength));
7996     code += mclength;
7997 
7998     /* Remember if \r or \n were seen */
7999 
8000     if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
8001       cb->external_flags |= PCRE2_HASCRORLF;
8002 
8003     /* Set the first and required code units appropriately. If no previous
8004     first code unit, set it from this character, but revert to none on a zero
8005     repeat. Otherwise, leave the firstcu value alone, and don't change it on
8006     a zero repeat. */
8007 
8008     if (firstcuflags == REQ_UNSET)
8009       {
8010       zerofirstcuflags = REQ_NONE;
8011       zeroreqcu = reqcu;
8012       zeroreqcuflags = reqcuflags;
8013 
8014       /* If the character is more than one code unit long, we can set a single
8015       firstcu only if it is not to be matched caselessly. Multiple possible
8016       starting code units may be picked up later in the studying code. */
8017 
8018       if (mclength == 1 || req_caseopt == 0)
8019         {
8020         firstcu = mcbuffer[0];
8021         firstcuflags = req_caseopt;
8022         if (mclength != 1)
8023           {
8024           reqcu = code[-1];
8025           reqcuflags = cb->req_varyopt;
8026           }
8027         }
8028       else firstcuflags = reqcuflags = REQ_NONE;
8029       }
8030 
8031     /* firstcu was previously set; we can set reqcu only if the length is
8032     1 or the matching is caseful. */
8033 
8034     else
8035       {
8036       zerofirstcu = firstcu;
8037       zerofirstcuflags = firstcuflags;
8038       zeroreqcu = reqcu;
8039       zeroreqcuflags = reqcuflags;
8040       if (mclength == 1 || req_caseopt == 0)
8041         {
8042         reqcu = code[-1];
8043         reqcuflags = req_caseopt | cb->req_varyopt;
8044         }
8045       }
8046 
8047     /* If caselessness was temporarily instated, reset it. */
8048 
8049     if (reset_caseful)
8050       {
8051       options &= ~PCRE2_CASELESS;
8052       req_caseopt = 0;
8053       reset_caseful = FALSE;
8054       }
8055 
8056     break;    /* End literal character handling */
8057     }         /* End of big switch */
8058   }           /* End of big loop */
8059 
8060 /* Control never reaches here. */
8061 }
8062 
8063 
8064 
8065 /*************************************************
8066 *   Compile regex: a sequence of alternatives    *
8067 *************************************************/
8068 
8069 /* On entry, pptr is pointing past the bracket meta, but on return it points to
8070 the closing bracket or META_END. The code variable is pointing at the code unit
8071 into which the BRA operator has been stored. This function is used during the
8072 pre-compile phase when we are trying to find out the amount of memory needed,
8073 as well as during the real compile phase. The value of lengthptr distinguishes
8074 the two phases.
8075 
8076 Arguments:
8077   options           option bits, including any changes for this subpattern
8078   codeptr           -> the address of the current code pointer
8079   pptrptr           -> the address of the current parsed pattern pointer
8080   errorcodeptr      -> pointer to error code variable
8081   skipunits         skip this many code units at start (for brackets and OP_COND)
8082   firstcuptr        place to put the first required code unit
8083   firstcuflagsptr   place to put the first code unit flags
8084   reqcuptr          place to put the last required code unit
8085   reqcuflagsptr     place to put the last required code unit flags
8086   bcptr             pointer to the chain of currently open branches
8087   cb                points to the data block with tables pointers etc.
8088   lengthptr         NULL during the real compile phase
8089                     points to length accumulator during pre-compile phase
8090 
8091 Returns:            0 There has been an error
8092                    +1 Success, this group must match at least one character
8093                    -1 Success, this group may match an empty string
8094 */
8095 
8096 static int
compile_regex(uint32_t options,PCRE2_UCHAR ** codeptr,uint32_t ** pptrptr,int * errorcodeptr,uint32_t skipunits,uint32_t * firstcuptr,uint32_t * firstcuflagsptr,uint32_t * reqcuptr,uint32_t * reqcuflagsptr,branch_chain * bcptr,compile_block * cb,PCRE2_SIZE * lengthptr)8097 compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
8098   int *errorcodeptr, uint32_t skipunits, uint32_t *firstcuptr,
8099   uint32_t *firstcuflagsptr, uint32_t *reqcuptr, uint32_t *reqcuflagsptr,
8100   branch_chain *bcptr, compile_block *cb, PCRE2_SIZE *lengthptr)
8101 {
8102 PCRE2_UCHAR *code = *codeptr;
8103 PCRE2_UCHAR *last_branch = code;
8104 PCRE2_UCHAR *start_bracket = code;
8105 BOOL lookbehind;
8106 open_capitem capitem;
8107 int capnumber = 0;
8108 int okreturn = 1;
8109 uint32_t *pptr = *pptrptr;
8110 uint32_t firstcu, reqcu;
8111 uint32_t lookbehindlength;
8112 uint32_t firstcuflags, reqcuflags;
8113 uint32_t branchfirstcu, branchreqcu;
8114 uint32_t branchfirstcuflags, branchreqcuflags;
8115 PCRE2_SIZE length;
8116 branch_chain bc;
8117 
8118 /* If set, call the external function that checks for stack availability. */
8119 
8120 if (cb->cx->stack_guard != NULL &&
8121     cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
8122   {
8123   *errorcodeptr= ERR33;
8124   return 0;
8125   }
8126 
8127 /* Miscellaneous initialization */
8128 
8129 bc.outer = bcptr;
8130 bc.current_branch = code;
8131 
8132 firstcu = reqcu = 0;
8133 firstcuflags = reqcuflags = REQ_UNSET;
8134 
8135 /* Accumulate the length for use in the pre-compile phase. Start with the
8136 length of the BRA and KET and any extra code units that are required at the
8137 beginning. We accumulate in a local variable to save frequent testing of
8138 lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
8139 start and end of each alternative, because compiled items are discarded during
8140 the pre-compile phase so that the workspace is not exceeded. */
8141 
8142 length = 2 + 2*LINK_SIZE + skipunits;
8143 
8144 /* Remember if this is a lookbehind assertion, and if it is, save its length
8145 and skip over the pattern offset. */
8146 
8147 lookbehind = *code == OP_ASSERTBACK ||
8148              *code == OP_ASSERTBACK_NOT ||
8149              *code == OP_ASSERTBACK_NA;
8150 
8151 if (lookbehind)
8152   {
8153   lookbehindlength = META_DATA(pptr[-1]);
8154   pptr += SIZEOFFSET;
8155   }
8156 else lookbehindlength = 0;
8157 
8158 /* If this is a capturing subpattern, add to the chain of open capturing items
8159 so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
8160 need be tested here; changing this opcode to one of its variants, e.g.
8161 OP_SCBRAPOS, happens later, after the group has been compiled. */
8162 
8163 if (*code == OP_CBRA)
8164   {
8165   capnumber = GET2(code, 1 + LINK_SIZE);
8166   capitem.number = capnumber;
8167   capitem.next = cb->open_caps;
8168   capitem.assert_depth = cb->assert_depth;
8169   cb->open_caps = &capitem;
8170   }
8171 
8172 /* Offset is set zero to mark that this bracket is still open */
8173 
8174 PUT(code, 1, 0);
8175 code += 1 + LINK_SIZE + skipunits;
8176 
8177 /* Loop for each alternative branch */
8178 
8179 for (;;)
8180   {
8181   int branch_return;
8182 
8183   /* Insert OP_REVERSE if this is as lookbehind assertion. */
8184 
8185   if (lookbehind && lookbehindlength > 0)
8186     {
8187     *code++ = OP_REVERSE;
8188     PUTINC(code, 0, lookbehindlength);
8189     length += 1 + LINK_SIZE;
8190     }
8191 
8192   /* Now compile the branch; in the pre-compile phase its length gets added
8193   into the length. */
8194 
8195   if ((branch_return =
8196         compile_branch(&options, &code, &pptr, errorcodeptr, &branchfirstcu,
8197           &branchfirstcuflags, &branchreqcu, &branchreqcuflags, &bc,
8198           cb, (lengthptr == NULL)? NULL : &length)) == 0)
8199     return 0;
8200 
8201   /* If a branch can match an empty string, so can the whole group. */
8202 
8203   if (branch_return < 0) okreturn = -1;
8204 
8205   /* In the real compile phase, there is some post-processing to be done. */
8206 
8207   if (lengthptr == NULL)
8208     {
8209     /* If this is the first branch, the firstcu and reqcu values for the
8210     branch become the values for the regex. */
8211 
8212     if (*last_branch != OP_ALT)
8213       {
8214       firstcu = branchfirstcu;
8215       firstcuflags = branchfirstcuflags;
8216       reqcu = branchreqcu;
8217       reqcuflags = branchreqcuflags;
8218       }
8219 
8220     /* If this is not the first branch, the first char and reqcu have to
8221     match the values from all the previous branches, except that if the
8222     previous value for reqcu didn't have REQ_VARY set, it can still match,
8223     and we set REQ_VARY for the group from this branch's value. */
8224 
8225     else
8226       {
8227       /* If we previously had a firstcu, but it doesn't match the new branch,
8228       we have to abandon the firstcu for the regex, but if there was
8229       previously no reqcu, it takes on the value of the old firstcu. */
8230 
8231       if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
8232         {
8233         if (firstcuflags < REQ_NONE)
8234           {
8235           if (reqcuflags >= REQ_NONE)
8236             {
8237             reqcu = firstcu;
8238             reqcuflags = firstcuflags;
8239             }
8240           }
8241         firstcuflags = REQ_NONE;
8242         }
8243 
8244       /* If we (now or from before) have no firstcu, a firstcu from the
8245       branch becomes a reqcu if there isn't a branch reqcu. */
8246 
8247       if (firstcuflags >= REQ_NONE && branchfirstcuflags < REQ_NONE &&
8248           branchreqcuflags >= REQ_NONE)
8249         {
8250         branchreqcu = branchfirstcu;
8251         branchreqcuflags = branchfirstcuflags;
8252         }
8253 
8254       /* Now ensure that the reqcus match */
8255 
8256       if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
8257           reqcu != branchreqcu)
8258         reqcuflags = REQ_NONE;
8259       else
8260         {
8261         reqcu = branchreqcu;
8262         reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY if present */
8263         }
8264       }
8265     }
8266 
8267   /* Handle reaching the end of the expression, either ')' or end of pattern.
8268   In the real compile phase, go back through the alternative branches and
8269   reverse the chain of offsets, with the field in the BRA item now becoming an
8270   offset to the first alternative. If there are no alternatives, it points to
8271   the end of the group. The length in the terminating ket is always the length
8272   of the whole bracketed item. Return leaving the pointer at the terminating
8273   char. */
8274 
8275   if (META_CODE(*pptr) != META_ALT)
8276     {
8277     if (lengthptr == NULL)
8278       {
8279       PCRE2_SIZE branch_length = code - last_branch;
8280       do
8281         {
8282         PCRE2_SIZE prev_length = GET(last_branch, 1);
8283         PUT(last_branch, 1, branch_length);
8284         branch_length = prev_length;
8285         last_branch -= branch_length;
8286         }
8287       while (branch_length > 0);
8288       }
8289 
8290     /* Fill in the ket */
8291 
8292     *code = OP_KET;
8293     PUT(code, 1, (int)(code - start_bracket));
8294     code += 1 + LINK_SIZE;
8295 
8296     /* If it was a capturing subpattern, remove the block from the chain. */
8297 
8298     if (capnumber > 0) cb->open_caps = cb->open_caps->next;
8299 
8300     /* Set values to pass back */
8301 
8302     *codeptr = code;
8303     *pptrptr = pptr;
8304     *firstcuptr = firstcu;
8305     *firstcuflagsptr = firstcuflags;
8306     *reqcuptr = reqcu;
8307     *reqcuflagsptr = reqcuflags;
8308     if (lengthptr != NULL)
8309       {
8310       if (OFLOW_MAX - *lengthptr < length)
8311         {
8312         *errorcodeptr = ERR20;
8313         return 0;
8314         }
8315       *lengthptr += length;
8316       }
8317     return okreturn;
8318     }
8319 
8320   /* Another branch follows. In the pre-compile phase, we can move the code
8321   pointer back to where it was for the start of the first branch. (That is,
8322   pretend that each branch is the only one.)
8323 
8324   In the real compile phase, insert an ALT node. Its length field points back
8325   to the previous branch while the bracket remains open. At the end the chain
8326   is reversed. It's done like this so that the start of the bracket has a
8327   zero offset until it is closed, making it possible to detect recursion. */
8328 
8329   if (lengthptr != NULL)
8330     {
8331     code = *codeptr + 1 + LINK_SIZE + skipunits;
8332     length += 1 + LINK_SIZE;
8333     }
8334   else
8335     {
8336     *code = OP_ALT;
8337     PUT(code, 1, (int)(code - last_branch));
8338     bc.current_branch = last_branch = code;
8339     code += 1 + LINK_SIZE;
8340     }
8341 
8342   /* Set the lookbehind length (if not in a lookbehind the value will be zero)
8343   and then advance past the vertical bar. */
8344 
8345   lookbehindlength = META_DATA(*pptr);
8346   pptr++;
8347   }
8348 /* Control never reaches here */
8349 }
8350 
8351 
8352 
8353 /*************************************************
8354 *          Check for anchored pattern            *
8355 *************************************************/
8356 
8357 /* Try to find out if this is an anchored regular expression. Consider each
8358 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8359 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8360 it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8361 be found, because ^ generates OP_CIRCM in that mode.
8362 
8363 We can also consider a regex to be anchored if OP_SOM starts all its branches.
8364 This is the code for \G, which means "match at start of match position, taking
8365 into account the match offset".
8366 
8367 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8368 because that will try the rest of the pattern at all possible matching points,
8369 so there is no point trying again.... er ....
8370 
8371 .... except when the .* appears inside capturing parentheses, and there is a
8372 subsequent back reference to those parentheses. We haven't enough information
8373 to catch that case precisely.
8374 
8375 At first, the best we could do was to detect when .* was in capturing brackets
8376 and the highest back reference was greater than or equal to that level.
8377 However, by keeping a bitmap of the first 31 back references, we can catch some
8378 of the more common cases more precisely.
8379 
8380 ... A second exception is when the .* appears inside an atomic group, because
8381 this prevents the number of characters it matches from being adjusted.
8382 
8383 Arguments:
8384   code           points to start of the compiled pattern
8385   bracket_map    a bitmap of which brackets we are inside while testing; this
8386                    handles up to substring 31; after that we just have to take
8387                    the less precise approach
8388   cb             points to the compile data block
8389   atomcount      atomic group level
8390   inassert       TRUE if in an assertion
8391 
8392 Returns:     TRUE or FALSE
8393 */
8394 
8395 static BOOL
is_anchored(PCRE2_SPTR code,uint32_t bracket_map,compile_block * cb,int atomcount,BOOL inassert)8396 is_anchored(PCRE2_SPTR code, uint32_t bracket_map, compile_block *cb,
8397   int atomcount, BOOL inassert)
8398 {
8399 do {
8400    PCRE2_SPTR scode = first_significant_code(
8401      code + PRIV(OP_lengths)[*code], FALSE);
8402    int op = *scode;
8403 
8404    /* Non-capturing brackets */
8405 
8406    if (op == OP_BRA  || op == OP_BRAPOS ||
8407        op == OP_SBRA || op == OP_SBRAPOS)
8408      {
8409      if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8410        return FALSE;
8411      }
8412 
8413    /* Capturing brackets */
8414 
8415    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8416             op == OP_SCBRA || op == OP_SCBRAPOS)
8417      {
8418      int n = GET2(scode, 1+LINK_SIZE);
8419      uint32_t new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8420      if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE;
8421      }
8422 
8423    /* Positive forward assertion */
8424 
8425    else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8426      {
8427      if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8428      }
8429 
8430    /* Condition. If there is no second branch, it can't be anchored. */
8431 
8432    else if (op == OP_COND || op == OP_SCOND)
8433      {
8434      if (scode[GET(scode,1)] != OP_ALT) return FALSE;
8435      if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8436        return FALSE;
8437      }
8438 
8439    /* Atomic groups */
8440 
8441    else if (op == OP_ONCE)
8442      {
8443      if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert))
8444        return FALSE;
8445      }
8446 
8447    /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8448    it isn't in brackets that are or may be referenced or inside an atomic
8449    group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
8450    because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
8451    with the subject "aab", which matches "b", i.e. not at the start of a line.
8452    There is also an option that disables auto-anchoring. */
8453 
8454    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8455              op == OP_TYPEPOSSTAR))
8456      {
8457      if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
8458          atomcount > 0 || cb->had_pruneorskip || inassert ||
8459          (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8460        return FALSE;
8461      }
8462 
8463    /* Check for explicit anchoring */
8464 
8465    else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8466 
8467    code += GET(code, 1);
8468    }
8469 while (*code == OP_ALT);   /* Loop for each alternative */
8470 return TRUE;
8471 }
8472 
8473 
8474 
8475 /*************************************************
8476 *         Check for starting with ^ or .*        *
8477 *************************************************/
8478 
8479 /* This is called to find out if every branch starts with ^ or .* so that
8480 "first char" processing can be done to speed things up in multiline
8481 matching and for non-DOTALL patterns that start with .* (which must start at
8482 the beginning or after \n). As in the case of is_anchored() (see above), we
8483 have to take account of back references to capturing brackets that contain .*
8484 because in that case we can't make the assumption. Also, the appearance of .*
8485 inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
8486 or *SKIP does not count, because once again the assumption no longer holds.
8487 
8488 Arguments:
8489   code           points to start of the compiled pattern or a group
8490   bracket_map    a bitmap of which brackets we are inside while testing; this
8491                    handles up to substring 31; after that we just have to take
8492                    the less precise approach
8493   cb             points to the compile data
8494   atomcount      atomic group level
8495   inassert       TRUE if in an assertion
8496 
8497 Returns:         TRUE or FALSE
8498 */
8499 
8500 static BOOL
is_startline(PCRE2_SPTR code,unsigned int bracket_map,compile_block * cb,int atomcount,BOOL inassert)8501 is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8502   int atomcount, BOOL inassert)
8503 {
8504 do {
8505    PCRE2_SPTR scode = first_significant_code(
8506      code + PRIV(OP_lengths)[*code], FALSE);
8507    int op = *scode;
8508 
8509    /* If we are at the start of a conditional assertion group, *both* the
8510    conditional assertion *and* what follows the condition must satisfy the test
8511    for start of line. Other kinds of condition fail. Note that there may be an
8512    auto-callout at the start of a condition. */
8513 
8514    if (op == OP_COND)
8515      {
8516      scode += 1 + LINK_SIZE;
8517 
8518      if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8519        else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
8520 
8521      switch (*scode)
8522        {
8523        case OP_CREF:
8524        case OP_DNCREF:
8525        case OP_RREF:
8526        case OP_DNRREF:
8527        case OP_FAIL:
8528        case OP_FALSE:
8529        case OP_TRUE:
8530        return FALSE;
8531 
8532        default:     /* Assertion */
8533        if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8534        do scode += GET(scode, 1); while (*scode == OP_ALT);
8535        scode += 1 + LINK_SIZE;
8536        break;
8537        }
8538      scode = first_significant_code(scode, FALSE);
8539      op = *scode;
8540      }
8541 
8542    /* Non-capturing brackets */
8543 
8544    if (op == OP_BRA  || op == OP_BRAPOS ||
8545        op == OP_SBRA || op == OP_SBRAPOS)
8546      {
8547      if (!is_startline(scode, bracket_map, cb, atomcount, inassert))
8548        return FALSE;
8549      }
8550 
8551    /* Capturing brackets */
8552 
8553    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8554             op == OP_SCBRA || op == OP_SCBRAPOS)
8555      {
8556      int n = GET2(scode, 1+LINK_SIZE);
8557      unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8558      if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
8559      }
8560 
8561    /* Positive forward assertions */
8562 
8563    else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8564      {
8565      if (!is_startline(scode, bracket_map, cb, atomcount, TRUE))
8566        return FALSE;
8567      }
8568 
8569    /* Atomic brackets */
8570 
8571    else if (op == OP_ONCE)
8572      {
8573      if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert))
8574        return FALSE;
8575      }
8576 
8577    /* .* means "start at start or after \n" if it isn't in atomic brackets or
8578    brackets that may be referenced or an assertion, and as long as the pattern
8579    does not contain *PRUNE or *SKIP, because these break the feature. Consider,
8580    for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
8581    i.e. not at the start of a line. There is also an option that disables this
8582    optimization. */
8583 
8584    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8585      {
8586      if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
8587          atomcount > 0 || cb->had_pruneorskip || inassert ||
8588          (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8589        return FALSE;
8590      }
8591 
8592    /* Check for explicit circumflex; anything else gives a FALSE result. Note
8593    in particular that this includes atomic brackets OP_ONCE because the number
8594    of characters matched by .* cannot be adjusted inside them. */
8595 
8596    else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8597 
8598    /* Move on to the next alternative */
8599 
8600    code += GET(code, 1);
8601    }
8602 while (*code == OP_ALT);  /* Loop for each alternative */
8603 return TRUE;
8604 }
8605 
8606 
8607 
8608 /*************************************************
8609 *   Scan compiled regex for recursion reference  *
8610 *************************************************/
8611 
8612 /* This function scans through a compiled pattern until it finds an instance of
8613 OP_RECURSE.
8614 
8615 Arguments:
8616   code        points to start of expression
8617   utf         TRUE in UTF mode
8618 
8619 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
8620 */
8621 
8622 static PCRE2_SPTR
find_recurse(PCRE2_SPTR code,BOOL utf)8623 find_recurse(PCRE2_SPTR code, BOOL utf)
8624 {
8625 for (;;)
8626   {
8627   PCRE2_UCHAR c = *code;
8628   if (c == OP_END) return NULL;
8629   if (c == OP_RECURSE) return code;
8630 
8631   /* XCLASS is used for classes that cannot be represented just by a bit map.
8632   This includes negated single high-valued characters. CALLOUT_STR is used for
8633   callouts with string arguments. In both cases the length in the table is
8634   zero; the actual length is stored in the compiled code. */
8635 
8636   if (c == OP_XCLASS) code += GET(code, 1);
8637     else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
8638 
8639   /* Otherwise, we can get the item's length from the table, except that for
8640   repeated character types, we have to test for \p and \P, which have an extra
8641   two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,
8642   we must add in its length. */
8643 
8644   else
8645     {
8646     switch(c)
8647       {
8648       case OP_TYPESTAR:
8649       case OP_TYPEMINSTAR:
8650       case OP_TYPEPLUS:
8651       case OP_TYPEMINPLUS:
8652       case OP_TYPEQUERY:
8653       case OP_TYPEMINQUERY:
8654       case OP_TYPEPOSSTAR:
8655       case OP_TYPEPOSPLUS:
8656       case OP_TYPEPOSQUERY:
8657       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
8658       break;
8659 
8660       case OP_TYPEPOSUPTO:
8661       case OP_TYPEUPTO:
8662       case OP_TYPEMINUPTO:
8663       case OP_TYPEEXACT:
8664       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
8665         code += 2;
8666       break;
8667 
8668       case OP_MARK:
8669       case OP_COMMIT_ARG:
8670       case OP_PRUNE_ARG:
8671       case OP_SKIP_ARG:
8672       case OP_THEN_ARG:
8673       code += code[1];
8674       break;
8675       }
8676 
8677     /* Add in the fixed length from the table */
8678 
8679     code += PRIV(OP_lengths)[c];
8680 
8681     /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
8682     be followed by a multi-unit character. The length in the table is a
8683     minimum, so we have to arrange to skip the extra units. */
8684 
8685 #ifdef MAYBE_UTF_MULTI
8686     if (utf) switch(c)
8687       {
8688       case OP_CHAR:
8689       case OP_CHARI:
8690       case OP_NOT:
8691       case OP_NOTI:
8692       case OP_EXACT:
8693       case OP_EXACTI:
8694       case OP_NOTEXACT:
8695       case OP_NOTEXACTI:
8696       case OP_UPTO:
8697       case OP_UPTOI:
8698       case OP_NOTUPTO:
8699       case OP_NOTUPTOI:
8700       case OP_MINUPTO:
8701       case OP_MINUPTOI:
8702       case OP_NOTMINUPTO:
8703       case OP_NOTMINUPTOI:
8704       case OP_POSUPTO:
8705       case OP_POSUPTOI:
8706       case OP_NOTPOSUPTO:
8707       case OP_NOTPOSUPTOI:
8708       case OP_STAR:
8709       case OP_STARI:
8710       case OP_NOTSTAR:
8711       case OP_NOTSTARI:
8712       case OP_MINSTAR:
8713       case OP_MINSTARI:
8714       case OP_NOTMINSTAR:
8715       case OP_NOTMINSTARI:
8716       case OP_POSSTAR:
8717       case OP_POSSTARI:
8718       case OP_NOTPOSSTAR:
8719       case OP_NOTPOSSTARI:
8720       case OP_PLUS:
8721       case OP_PLUSI:
8722       case OP_NOTPLUS:
8723       case OP_NOTPLUSI:
8724       case OP_MINPLUS:
8725       case OP_MINPLUSI:
8726       case OP_NOTMINPLUS:
8727       case OP_NOTMINPLUSI:
8728       case OP_POSPLUS:
8729       case OP_POSPLUSI:
8730       case OP_NOTPOSPLUS:
8731       case OP_NOTPOSPLUSI:
8732       case OP_QUERY:
8733       case OP_QUERYI:
8734       case OP_NOTQUERY:
8735       case OP_NOTQUERYI:
8736       case OP_MINQUERY:
8737       case OP_MINQUERYI:
8738       case OP_NOTMINQUERY:
8739       case OP_NOTMINQUERYI:
8740       case OP_POSQUERY:
8741       case OP_POSQUERYI:
8742       case OP_NOTPOSQUERY:
8743       case OP_NOTPOSQUERYI:
8744       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
8745       break;
8746       }
8747 #else
8748     (void)(utf);  /* Keep compiler happy by referencing function argument */
8749 #endif  /* MAYBE_UTF_MULTI */
8750     }
8751   }
8752 }
8753 
8754 
8755 
8756 /*************************************************
8757 *    Check for asserted fixed first code unit    *
8758 *************************************************/
8759 
8760 /* During compilation, the "first code unit" settings from forward assertions
8761 are discarded, because they can cause conflicts with actual literals that
8762 follow. However, if we end up without a first code unit setting for an
8763 unanchored pattern, it is worth scanning the regex to see if there is an
8764 initial asserted first code unit. If all branches start with the same asserted
8765 code unit, or with a non-conditional bracket all of whose alternatives start
8766 with the same asserted code unit (recurse ad lib), then we return that code
8767 unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
8768 REQ_NONE in the flags.
8769 
8770 Arguments:
8771   code       points to start of compiled pattern
8772   flags      points to the first code unit flags
8773   inassert   non-zero if in an assertion
8774 
8775 Returns:     the fixed first code unit, or 0 with REQ_NONE in flags
8776 */
8777 
8778 static uint32_t
find_firstassertedcu(PCRE2_SPTR code,uint32_t * flags,uint32_t inassert)8779 find_firstassertedcu(PCRE2_SPTR code, uint32_t *flags, uint32_t inassert)
8780 {
8781 uint32_t c = 0;
8782 uint32_t cflags = REQ_NONE;
8783 
8784 *flags = REQ_NONE;
8785 do {
8786    uint32_t d;
8787    uint32_t dflags;
8788    int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
8789              *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
8790    PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
8791    PCRE2_UCHAR op = *scode;
8792 
8793    switch(op)
8794      {
8795      default:
8796      return 0;
8797 
8798      case OP_BRA:
8799      case OP_BRAPOS:
8800      case OP_CBRA:
8801      case OP_SCBRA:
8802      case OP_CBRAPOS:
8803      case OP_SCBRAPOS:
8804      case OP_ASSERT:
8805      case OP_ASSERT_NA:
8806      case OP_ONCE:
8807      case OP_SCRIPT_RUN:
8808      d = find_firstassertedcu(scode, &dflags, inassert +
8809        ((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0));
8810      if (dflags >= REQ_NONE) return 0;
8811      if (cflags >= REQ_NONE) { c = d; cflags = dflags; }
8812        else if (c != d || cflags != dflags) return 0;
8813      break;
8814 
8815      case OP_EXACT:
8816      scode += IMM2_SIZE;
8817      /* Fall through */
8818 
8819      case OP_CHAR:
8820      case OP_PLUS:
8821      case OP_MINPLUS:
8822      case OP_POSPLUS:
8823      if (inassert == 0) return 0;
8824      if (cflags >= REQ_NONE) { c = scode[1]; cflags = 0; }
8825        else if (c != scode[1]) return 0;
8826      break;
8827 
8828      case OP_EXACTI:
8829      scode += IMM2_SIZE;
8830      /* Fall through */
8831 
8832      case OP_CHARI:
8833      case OP_PLUSI:
8834      case OP_MINPLUSI:
8835      case OP_POSPLUSI:
8836      if (inassert == 0) return 0;
8837 
8838      /* If the character is more than one code unit long, we cannot set its
8839      first code unit when matching caselessly. Later scanning may pick up
8840      multiple code units. */
8841 
8842 #ifdef SUPPORT_UNICODE
8843 #if PCRE2_CODE_UNIT_WIDTH == 8
8844      if (scode[1] >= 0x80) return 0;
8845 #elif PCRE2_CODE_UNIT_WIDTH == 16
8846      if (scode[1] >= 0xd800 && scode[1] <= 0xdfff) return 0;
8847 #endif
8848 #endif
8849 
8850      if (cflags >= REQ_NONE) { c = scode[1]; cflags = REQ_CASELESS; }
8851        else if (c != scode[1]) return 0;
8852      break;
8853      }
8854 
8855    code += GET(code, 1);
8856    }
8857 while (*code == OP_ALT);
8858 
8859 *flags = cflags;
8860 return c;
8861 }
8862 
8863 
8864 
8865 /*************************************************
8866 *     Add an entry to the name/number table      *
8867 *************************************************/
8868 
8869 /* This function is called between compiling passes to add an entry to the
8870 name/number table, maintaining alphabetical order. Checking for permitted
8871 and forbidden duplicates has already been done.
8872 
8873 Arguments:
8874   cb           the compile data block
8875   name         the name to add
8876   length       the length of the name
8877   groupno      the group number
8878   tablecount   the count of names in the table so far
8879 
8880 Returns:       nothing
8881 */
8882 
8883 static void
add_name_to_table(compile_block * cb,PCRE2_SPTR name,int length,unsigned int groupno,uint32_t tablecount)8884 add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length,
8885   unsigned int groupno, uint32_t tablecount)
8886 {
8887 uint32_t i;
8888 PCRE2_UCHAR *slot = cb->name_table;
8889 
8890 for (i = 0; i < tablecount; i++)
8891   {
8892   int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length));
8893   if (crc == 0 && slot[IMM2_SIZE+length] != 0)
8894     crc = -1; /* Current name is a substring */
8895 
8896   /* Make space in the table and break the loop for an earlier name. For a
8897   duplicate or later name, carry on. We do this for duplicates so that in the
8898   simple case (when ?(| is not used) they are in order of their numbers. In all
8899   cases they are in the order in which they appear in the pattern. */
8900 
8901   if (crc < 0)
8902     {
8903     (void)memmove(slot + cb->name_entry_size, slot,
8904       CU2BYTES((tablecount - i) * cb->name_entry_size));
8905     break;
8906     }
8907 
8908   /* Continue the loop for a later or duplicate name */
8909 
8910   slot += cb->name_entry_size;
8911   }
8912 
8913 PUT2(slot, 0, groupno);
8914 memcpy(slot + IMM2_SIZE, name, CU2BYTES(length));
8915 
8916 /* Add a terminating zero and fill the rest of the slot with zeroes so that
8917 the memory is all initialized. Otherwise valgrind moans about uninitialized
8918 memory when saving serialized compiled patterns. */
8919 
8920 memset(slot + IMM2_SIZE + length, 0,
8921   CU2BYTES(cb->name_entry_size - length - IMM2_SIZE));
8922 }
8923 
8924 
8925 
8926 /*************************************************
8927 *             Skip in parsed pattern             *
8928 *************************************************/
8929 
8930 /* This function is called to skip parts of the parsed pattern when finding the
8931 length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
8932 the end of the branch, it is called to skip over an internal lookaround or
8933 (DEFINE) group, and it is also called to skip to the end of a class, during
8934 which it will never encounter nested groups (but there's no need to have
8935 special code for that).
8936 
8937 When called to find the end of a branch or group, pptr must point to the first
8938 meta code inside the branch, not the branch-starting code. In other cases it
8939 can point to the item that causes the function to be called.
8940 
8941 Arguments:
8942   pptr       current pointer to skip from
8943   skiptype   PSKIP_CLASS when skipping to end of class
8944              PSKIP_ALT when META_ALT ends the skip
8945              PSKIP_KET when only META_KET ends the skip
8946 
8947 Returns:     new value of pptr
8948              NULL if META_END is reached - should never occur
8949                or for an unknown meta value - likewise
8950 */
8951 
8952 static uint32_t *
parsed_skip(uint32_t * pptr,uint32_t skiptype)8953 parsed_skip(uint32_t *pptr, uint32_t skiptype)
8954 {
8955 uint32_t nestlevel = 0;
8956 
8957 for (;; pptr++)
8958   {
8959   uint32_t meta = META_CODE(*pptr);
8960 
8961   switch(meta)
8962     {
8963     default:  /* Just skip over most items */
8964     if (meta < META_END) continue;  /* Literal */
8965     break;
8966 
8967     /* This should never occur. */
8968 
8969     case META_END:
8970     return NULL;
8971 
8972     /* The data for these items is variable in length. */
8973 
8974     case META_BACKREF:  /* Offset is present only if group >= 10 */
8975     if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;
8976     break;
8977 
8978     case META_ESCAPE:   /* A few escapes are followed by data items. */
8979     switch (META_DATA(*pptr))
8980       {
8981       case ESC_P:
8982       case ESC_p:
8983       pptr += 1;
8984       break;
8985 
8986       case ESC_g:
8987       case ESC_k:
8988       pptr += 1 + SIZEOFFSET;
8989       break;
8990       }
8991     break;
8992 
8993     case META_MARK:     /* Add the length of the name. */
8994     case META_COMMIT_ARG:
8995     case META_PRUNE_ARG:
8996     case META_SKIP_ARG:
8997     case META_THEN_ARG:
8998     pptr += pptr[1];
8999     break;
9000 
9001     /* These are the "active" items in this loop. */
9002 
9003     case META_CLASS_END:
9004     if (skiptype == PSKIP_CLASS) return pptr;
9005     break;
9006 
9007     case META_ATOMIC:
9008     case META_CAPTURE:
9009     case META_COND_ASSERT:
9010     case META_COND_DEFINE:
9011     case META_COND_NAME:
9012     case META_COND_NUMBER:
9013     case META_COND_RNAME:
9014     case META_COND_RNUMBER:
9015     case META_COND_VERSION:
9016     case META_LOOKAHEAD:
9017     case META_LOOKAHEADNOT:
9018     case META_LOOKAHEAD_NA:
9019     case META_LOOKBEHIND:
9020     case META_LOOKBEHINDNOT:
9021     case META_LOOKBEHIND_NA:
9022     case META_NOCAPTURE:
9023     case META_SCRIPT_RUN:
9024     nestlevel++;
9025     break;
9026 
9027     case META_ALT:
9028     if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;
9029     break;
9030 
9031     case META_KET:
9032     if (nestlevel == 0) return pptr;
9033     nestlevel--;
9034     break;
9035     }
9036 
9037   /* The extra data item length for each meta is in a table. */
9038 
9039   meta = (meta >> 16) & 0x7fff;
9040   if (meta >= sizeof(meta_extra_lengths)) return NULL;
9041   pptr += meta_extra_lengths[meta];
9042   }
9043 /* Control never reaches here */
9044 return pptr;
9045 }
9046 
9047 
9048 
/*************************************************
*       Find length of a parsed group            *
*************************************************/

/* This is called for nested groups within a branch of a lookbehind whose
length is being computed. If all the branches in the nested group have the same
length, that is OK. On entry, the pointer must be at the first element after
the group initializing code. On exit it points to META_KET. Caching is used to
improve processing speed when the same capturing group occurs many times.

Arguments:
  pptrptr     pointer to pointer in the parsed pattern
  isinline    FALSE if a reference or recursion; TRUE for inline group
  errcodeptr  pointer to the errorcode
  lcptr       pointer to the loop counter
  group       number of captured group or -1 for a non-capturing group
  recurses    chain of recurse_check to catch mutual recursion
  cb          pointer to the compile data

Returns:      the group length or a negative number
*/
9070 
9071 static int
get_grouplength(uint32_t ** pptrptr,BOOL isinline,int * errcodeptr,int * lcptr,int group,parsed_recurse_check * recurses,compile_block * cb)9072 get_grouplength(uint32_t **pptrptr, BOOL isinline, int *errcodeptr, int *lcptr,
9073    int group, parsed_recurse_check *recurses, compile_block *cb)
9074 {
9075 int branchlength;
9076 int grouplength = -1;
9077 
9078 /* The cache can be used only if there is no possibility of there being two
9079 groups with the same number. We do not need to set the end pointer for a group
9080 that is being processed as a back reference or recursion, but we must do so for
9081 an inline group. */
9082 
9083 if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
9084   {
9085   uint32_t groupinfo = cb->groupinfo[group];
9086   if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1;
9087   if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
9088     {
9089     if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
9090     return groupinfo & GI_FIXED_LENGTH_MASK;
9091     }
9092   }
9093 
9094 /* Scan the group. In this case we find the end pointer of necessity. */
9095 
9096 for(;;)
9097   {
9098   branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
9099   if (branchlength < 0) goto ISNOTFIXED;
9100   if (grouplength == -1) grouplength = branchlength;
9101     else if (grouplength != branchlength) goto ISNOTFIXED;
9102   if (**pptrptr == META_KET) break;
9103   *pptrptr += 1;   /* Skip META_ALT */
9104   }
9105 
9106 if (group > 0)
9107   cb->groupinfo[group] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
9108 return grouplength;
9109 
9110 ISNOTFIXED:
9111 if (group > 0) cb->groupinfo[group] |= GI_NOT_FIXED_LENGTH;
9112 return -1;
9113 }
9114 
9115 
9116 
9117 /*************************************************
9118 *        Find length of a parsed branch          *
9119 *************************************************/
9120 
9121 /* Return a fixed length for a branch in a lookbehind, giving an error if the
9122 length is not fixed. On entry, *pptrptr points to the first element inside the
9123 branch. On exit it is set to point to the ALT or KET.
9124 
9125 Arguments:
9126   pptrptr     pointer to pointer in the parsed pattern
9127   errcodeptr  pointer to error code
9128   lcptr       pointer to loop counter
9129   recurses    chain of recurse_check to catch mutual recursion
9130   cb          pointer to compile block
9131 
9132 Returns:      the length, or a negative value on error
9133 */
9134 
9135 static int
get_branchlength(uint32_t ** pptrptr,int * errcodeptr,int * lcptr,parsed_recurse_check * recurses,compile_block * cb)9136 get_branchlength(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
9137   parsed_recurse_check *recurses, compile_block *cb)
9138 {
9139 int branchlength = 0;
9140 int grouplength;
9141 uint32_t lastitemlength = 0;
9142 uint32_t *pptr = *pptrptr;
9143 PCRE2_SIZE offset;
9144 parsed_recurse_check this_recurse;
9145 
9146 /* A large and/or complex regex can take too long to process. This can happen
9147 more often when (?| groups are present in the pattern because their length
9148 cannot be cached. */
9149 
9150 if ((*lcptr)++ > 2000)
9151   {
9152   *errcodeptr = ERR35;  /* Lookbehind is too complicated */
9153   return -1;
9154   }
9155 
9156 /* Scan the branch, accumulating the length. */
9157 
9158 for (;; pptr++)
9159   {
9160   parsed_recurse_check *r;
9161   uint32_t *gptr, *gptrend;
9162   uint32_t escape;
9163   uint32_t group = 0;
9164   uint32_t itemlength = 0;
9165 
9166   if (*pptr < META_END)
9167     {
9168     itemlength = 1;
9169     }
9170 
9171   else switch (META_CODE(*pptr))
9172     {
9173     case META_KET:
9174     case META_ALT:
9175     goto EXIT;
9176 
9177     /* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the
9178     actual termination. */
9179 
9180     case META_ACCEPT:
9181     case META_FAIL:
9182     pptr = parsed_skip(pptr, PSKIP_ALT);
9183     if (pptr == NULL) goto PARSED_SKIP_FAILED;
9184     goto EXIT;
9185 
9186     case META_MARK:
9187     case META_COMMIT_ARG:
9188     case META_PRUNE_ARG:
9189     case META_SKIP_ARG:
9190     case META_THEN_ARG:
9191     pptr += pptr[1] + 1;
9192     break;
9193 
9194     case META_CIRCUMFLEX:
9195     case META_COMMIT:
9196     case META_DOLLAR:
9197     case META_PRUNE:
9198     case META_SKIP:
9199     case META_THEN:
9200     break;
9201 
9202     case META_OPTIONS:
9203     pptr += 1;
9204     break;
9205 
9206     case META_BIGVALUE:
9207     itemlength = 1;
9208     pptr += 1;
9209     break;
9210 
9211     case META_CLASS:
9212     case META_CLASS_NOT:
9213     itemlength = 1;
9214     pptr = parsed_skip(pptr, PSKIP_CLASS);
9215     if (pptr == NULL) goto PARSED_SKIP_FAILED;
9216     break;
9217 
9218     case META_CLASS_EMPTY_NOT:
9219     case META_DOT:
9220     itemlength = 1;
9221     break;
9222 
9223     case META_CALLOUT_NUMBER:
9224     pptr += 3;
9225     break;
9226 
9227     case META_CALLOUT_STRING:
9228     pptr += 3 + SIZEOFFSET;
9229     break;
9230 
9231     /* Only some escapes consume a character. Of those, \R and \X are never
9232     allowed because they might match more than character. \C is allowed only in
9233     32-bit and non-UTF 8/16-bit modes. */
9234 
9235     case META_ESCAPE:
9236     escape = META_DATA(*pptr);
9237     if (escape == ESC_R || escape == ESC_X) return -1;
9238     if (escape > ESC_b && escape < ESC_Z)
9239       {
9240 #if PCRE2_CODE_UNIT_WIDTH != 32
9241       if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C)
9242         {
9243         *errcodeptr = ERR36;
9244         return -1;
9245         }
9246 #endif
9247       itemlength = 1;
9248       if (escape == ESC_p || escape == ESC_P) pptr++;  /* Skip prop data */
9249       }
9250     break;
9251 
9252     /* Lookaheads do not contribute to the length of this branch, but they may
9253     contain lookbehinds within them whose lengths need to be set. */
9254 
9255     case META_LOOKAHEAD:
9256     case META_LOOKAHEADNOT:
9257     case META_LOOKAHEAD_NA:
9258     *errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb, lcptr);
9259     if (*errcodeptr != 0) return -1;
9260 
9261     /* Ignore any qualifiers that follow a lookahead assertion. */
9262 
9263     switch (pptr[1])
9264       {
9265       case META_ASTERISK:
9266       case META_ASTERISK_PLUS:
9267       case META_ASTERISK_QUERY:
9268       case META_PLUS:
9269       case META_PLUS_PLUS:
9270       case META_PLUS_QUERY:
9271       case META_QUERY:
9272       case META_QUERY_PLUS:
9273       case META_QUERY_QUERY:
9274       pptr++;
9275       break;
9276 
9277       case META_MINMAX:
9278       case META_MINMAX_PLUS:
9279       case META_MINMAX_QUERY:
9280       pptr += 3;
9281       break;
9282 
9283       default:
9284       break;
9285       }
9286     break;
9287 
9288     /* A nested lookbehind does not contribute any length to this lookbehind,
9289     but must itself be checked and have its lengths set. */
9290 
9291     case META_LOOKBEHIND:
9292     case META_LOOKBEHINDNOT:
9293     case META_LOOKBEHIND_NA:
9294     if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))
9295       return -1;
9296     break;
9297 
9298     /* Back references and recursions are handled by very similar code. At this
9299     stage, the names generated in the parsing pass are available, but the main
9300     name table has not yet been created. So for the named varieties, scan the
9301     list of names in order to get the number of the first one in the pattern,
9302     and whether or not this name is duplicated. */
9303 
9304     case META_BACKREF_BYNAME:
9305     if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)
9306       goto ISNOTFIXED;
9307     /* Fall through */
9308 
9309     case META_RECURSE_BYNAME:
9310       {
9311       int i;
9312       PCRE2_SPTR name;
9313       BOOL is_dupname = FALSE;
9314       named_group *ng = cb->named_groups;
9315       uint32_t meta_code = META_CODE(*pptr);
9316       uint32_t length = *(++pptr);
9317 
9318       GETPLUSOFFSET(offset, pptr);
9319       name = cb->start_pattern + offset;
9320       for (i = 0; i < cb->names_found; i++, ng++)
9321         {
9322         if (length == ng->length && PRIV(strncmp)(name, ng->name, length) == 0)
9323           {
9324           group = ng->number;
9325           is_dupname = ng->isdup;
9326           break;
9327           }
9328         }
9329 
9330       if (group == 0)
9331         {
9332         *errcodeptr = ERR15;  /* Non-existent subpattern */
9333         cb->erroroffset = offset;
9334         return -1;
9335         }
9336 
9337       /* A numerical back reference can be fixed length if duplicate capturing
9338       groups are not being used. A non-duplicate named back reference can also
9339       be handled. */
9340 
9341       if (meta_code == META_RECURSE_BYNAME ||
9342           (!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0))
9343         goto RECURSE_OR_BACKREF_LENGTH;  /* Handle as a numbered version. */
9344       }
9345     goto ISNOTFIXED;                     /* Duplicate name or number */
9346 
9347     /* The offset values for back references < 10 are in a separate vector
9348     because otherwise they would use more than two parsed pattern elements on
9349     64-bit systems. */
9350 
9351     case META_BACKREF:
9352     if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 ||
9353         (cb->external_flags & PCRE2_DUPCAPUSED) != 0)
9354       goto ISNOTFIXED;
9355     group = META_DATA(*pptr);
9356     if (group < 10)
9357       {
9358       offset = cb->small_ref_offset[group];
9359       goto RECURSE_OR_BACKREF_LENGTH;
9360       }
9361 
9362     /* Fall through */
9363     /* For groups >= 10 - picking up group twice does no harm. */
9364 
9365     /* A true recursion implies not fixed length, but a subroutine call may
9366     be OK. Back reference "recursions" are also failed. */
9367 
9368     case META_RECURSE:
9369     group = META_DATA(*pptr);
9370     GETPLUSOFFSET(offset, pptr);
9371 
9372     RECURSE_OR_BACKREF_LENGTH:
9373     if (group > cb->bracount)
9374       {
9375       cb->erroroffset = offset;
9376       *errcodeptr = ERR15;  /* Non-existent subpattern */
9377       return -1;
9378       }
9379     if (group == 0) goto ISNOTFIXED;  /* Local recursion */
9380     for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)
9381       {
9382       if (META_CODE(*gptr) == META_BIGVALUE) gptr++;
9383         else if (*gptr == (META_CAPTURE | group)) break;
9384       }
9385 
9386     /* We must start the search for the end of the group at the first meta code
9387     inside the group. Otherwise it will be treated as an enclosed group. */
9388 
9389     gptrend = parsed_skip(gptr + 1, PSKIP_KET);
9390     if (gptrend == NULL) goto PARSED_SKIP_FAILED;
9391     if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED;  /* Local recursion */
9392     for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;
9393     if (r != NULL) goto ISNOTFIXED;   /* Mutual recursion */
9394     this_recurse.prev = recurses;
9395     this_recurse.groupptr = gptr;
9396 
9397     /* We do not need to know the position of the end of the group, that is,
9398     gptr is not used after the call to get_grouplength(). Setting the second
9399     argument FALSE stops it scanning for the end when the length can be found
9400     in the cache. */
9401 
9402     gptr++;
9403     grouplength = get_grouplength(&gptr, FALSE, errcodeptr, lcptr, group,
9404       &this_recurse, cb);
9405     if (grouplength < 0)
9406       {
9407       if (*errcodeptr == 0) goto ISNOTFIXED;
9408       return -1;  /* Error already set */
9409       }
9410     itemlength = grouplength;
9411     break;
9412 
9413     /* A (DEFINE) group is never obeyed inline and so it does not contribute to
9414     the length of this branch. Skip from the following item to the next
9415     unpaired ket. */
9416 
9417     case META_COND_DEFINE:
9418     pptr = parsed_skip(pptr + 1, PSKIP_KET);
9419     break;
9420 
9421     /* Check other nested groups - advance past the initial data for each type
9422     and then seek a fixed length with get_grouplength(). */
9423 
9424     case META_COND_NAME:
9425     case META_COND_NUMBER:
9426     case META_COND_RNAME:
9427     case META_COND_RNUMBER:
9428     pptr += 2 + SIZEOFFSET;
9429     goto CHECK_GROUP;
9430 
9431     case META_COND_ASSERT:
9432     pptr += 1;
9433     goto CHECK_GROUP;
9434 
9435     case META_COND_VERSION:
9436     pptr += 4;
9437     goto CHECK_GROUP;
9438 
9439     case META_CAPTURE:
9440     group = META_DATA(*pptr);
9441     /* Fall through */
9442 
9443     case META_ATOMIC:
9444     case META_NOCAPTURE:
9445     case META_SCRIPT_RUN:
9446     pptr++;
9447     CHECK_GROUP:
9448     grouplength = get_grouplength(&pptr, TRUE, errcodeptr, lcptr, group,
9449       recurses, cb);
9450     if (grouplength < 0) return -1;
9451     itemlength = grouplength;
9452     break;
9453 
9454     /* Exact repetition is OK; variable repetition is not. A repetition of zero
9455     must subtract the length that has already been added. */
9456 
9457     case META_MINMAX:
9458     case META_MINMAX_PLUS:
9459     case META_MINMAX_QUERY:
9460     if (pptr[1] == pptr[2])
9461       {
9462       switch(pptr[1])
9463         {
9464         case 0:
9465         branchlength -= lastitemlength;
9466         break;
9467 
9468         case 1:
9469         itemlength = 0;
9470         break;
9471 
9472         default:  /* Check for integer overflow */
9473         if (lastitemlength != 0 &&  /* Should not occur, but just in case */
9474             INT_MAX/lastitemlength < pptr[1] - 1)
9475           {
9476           *errcodeptr = ERR87;  /* Integer overflow; lookbehind too big */
9477           return -1;
9478           }
9479         itemlength = (pptr[1] - 1) * lastitemlength;
9480         break;
9481         }
9482       pptr += 2;
9483       break;
9484       }
9485     /* Fall through */
9486 
9487     /* Any other item means this branch does not have a fixed length. */
9488 
9489     default:
9490     ISNOTFIXED:
9491     *errcodeptr = ERR25;   /* Not fixed length */
9492     return -1;
9493     }
9494 
9495   /* Add the item length to the branchlength, checking for integer overflow and
9496   for the branch length exceeding the limit. */
9497 
9498   if (INT_MAX - branchlength < (int)itemlength ||
9499       (branchlength += itemlength) > LOOKBEHIND_MAX)
9500     {
9501     *errcodeptr = ERR87;
9502     return -1;
9503     }
9504 
9505   /* Save this item length for use if the next item is a quantifier. */
9506 
9507   lastitemlength = itemlength;
9508   }
9509 
9510 EXIT:
9511 *pptrptr = pptr;
9512 return branchlength;
9513 
9514 PARSED_SKIP_FAILED:
9515 *errcodeptr = ERR90;
9516 return -1;
9517 }
9518 
9519 
9520 
9521 /*************************************************
9522 *        Set lengths in a lookbehind             *
9523 *************************************************/
9524 
9525 /* This function is called for each lookbehind, to set the lengths in its
9526 branches. An error occurs if any branch does not have a fixed length that is
9527 less than the maximum (65535). On exit, the pointer must be left on the final
9528 ket.
9529 
9530 The function also maintains the max_lookbehind value. Any lookbehind branch
9531 that contains a nested lookbehind may actually look further back than the
9532 length of the branch. The additional amount is passed back from
9533 get_branchlength() as an "extra" value.
9534 
9535 Arguments:
9536   pptrptr     pointer to pointer in the parsed pattern
9537   errcodeptr  pointer to error code
9538   lcptr       pointer to loop counter
9539   recurses    chain of recurse_check to catch mutual recursion
9540   cb          pointer to compile block
9541 
9542 Returns:      TRUE if all is well
9543               FALSE otherwise, with error code and offset set
9544 */
9545 
9546 static BOOL
set_lookbehind_lengths(uint32_t ** pptrptr,int * errcodeptr,int * lcptr,parsed_recurse_check * recurses,compile_block * cb)9547 set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
9548   parsed_recurse_check *recurses, compile_block *cb)
9549 {
9550 PCRE2_SIZE offset;
9551 int branchlength;
9552 uint32_t *bptr = *pptrptr;
9553 
9554 READPLUSOFFSET(offset, bptr);  /* Offset for error messages */
9555 *pptrptr += SIZEOFFSET;
9556 
9557 do
9558   {
9559   *pptrptr += 1;
9560   branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
9561   if (branchlength < 0)
9562     {
9563     /* The errorcode and offset may already be set from a nested lookbehind. */
9564     if (*errcodeptr == 0) *errcodeptr = ERR25;
9565     if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
9566     return FALSE;
9567     }
9568   if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;
9569   *bptr |= branchlength;  /* branchlength never more than 65535 */
9570   bptr = *pptrptr;
9571   }
9572 while (*bptr == META_ALT);
9573 
9574 return TRUE;
9575 }
9576 
9577 
9578 
9579 /*************************************************
9580 *         Check parsed pattern lookbehinds       *
9581 *************************************************/
9582 
9583 /* This function is called at the end of parsing a pattern if any lookbehinds
9584 were encountered. It scans the parsed pattern for them, calling
9585 set_lookbehind_lengths() for each one. At the start, the errorcode is zero and
9586 the error offset is marked unset. This enables the functions above not to
9587 override settings from deeper nestings.
9588 
9589 This function is called recursively from get_branchlength() for lookaheads in
9590 order to process any lookbehinds that they may contain. It stops when it hits a
9591 non-nested closing parenthesis in this case, returning a pointer to it.
9592 
9593 Arguments
9594   pptr      points to where to start (start of pattern or start of lookahead)
9595   retptr    if not NULL, return the ket pointer here
9596   recurses  chain of recurse_check to catch mutual recursion
9597   cb        points to the compile block
9598   lcptr     points to loop counter
9599 
9600 Returns:    0 on success, or an errorcode (cb->erroroffset will be set)
9601 */
9602 
9603 static int
check_lookbehinds(uint32_t * pptr,uint32_t ** retptr,parsed_recurse_check * recurses,compile_block * cb,int * lcptr)9604 check_lookbehinds(uint32_t *pptr, uint32_t **retptr,
9605   parsed_recurse_check *recurses, compile_block *cb, int *lcptr)
9606 {
9607 int errorcode = 0;
9608 int nestlevel = 0;
9609 
9610 cb->erroroffset = PCRE2_UNSET;
9611 
9612 for (; *pptr != META_END; pptr++)
9613   {
9614   if (*pptr < META_END) continue;  /* Literal */
9615 
9616   switch (META_CODE(*pptr))
9617     {
9618     default:
9619     return ERR70;  /* Unrecognized meta code */
9620 
9621     case META_ESCAPE:
9622     if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
9623       pptr += 1;
9624     break;
9625 
9626     case META_KET:
9627     if (--nestlevel < 0)
9628       {
9629       if (retptr != NULL) *retptr = pptr;
9630       return 0;
9631       }
9632     break;
9633 
9634     case META_ATOMIC:
9635     case META_CAPTURE:
9636     case META_COND_ASSERT:
9637     case META_LOOKAHEAD:
9638     case META_LOOKAHEADNOT:
9639     case META_LOOKAHEAD_NA:
9640     case META_NOCAPTURE:
9641     case META_SCRIPT_RUN:
9642     nestlevel++;
9643     break;
9644 
9645     case META_ACCEPT:
9646     case META_ALT:
9647     case META_ASTERISK:
9648     case META_ASTERISK_PLUS:
9649     case META_ASTERISK_QUERY:
9650     case META_BACKREF:
9651     case META_CIRCUMFLEX:
9652     case META_CLASS:
9653     case META_CLASS_EMPTY:
9654     case META_CLASS_EMPTY_NOT:
9655     case META_CLASS_END:
9656     case META_CLASS_NOT:
9657     case META_COMMIT:
9658     case META_DOLLAR:
9659     case META_DOT:
9660     case META_FAIL:
9661     case META_PLUS:
9662     case META_PLUS_PLUS:
9663     case META_PLUS_QUERY:
9664     case META_PRUNE:
9665     case META_QUERY:
9666     case META_QUERY_PLUS:
9667     case META_QUERY_QUERY:
9668     case META_RANGE_ESCAPED:
9669     case META_RANGE_LITERAL:
9670     case META_SKIP:
9671     case META_THEN:
9672     break;
9673 
9674     case META_RECURSE:
9675     pptr += SIZEOFFSET;
9676     break;
9677 
9678     case META_BACKREF_BYNAME:
9679     case META_RECURSE_BYNAME:
9680     pptr += 1 + SIZEOFFSET;
9681     break;
9682 
9683     case META_COND_DEFINE:
9684     pptr += SIZEOFFSET;
9685     nestlevel++;
9686     break;
9687 
9688     case META_COND_NAME:
9689     case META_COND_NUMBER:
9690     case META_COND_RNAME:
9691     case META_COND_RNUMBER:
9692     pptr += 1 + SIZEOFFSET;
9693     nestlevel++;
9694     break;
9695 
9696     case META_COND_VERSION:
9697     pptr += 3;
9698     nestlevel++;
9699     break;
9700 
9701     case META_CALLOUT_STRING:
9702     pptr += 3 + SIZEOFFSET;
9703     break;
9704 
9705     case META_BIGVALUE:
9706     case META_OPTIONS:
9707     case META_POSIX:
9708     case META_POSIX_NEG:
9709     pptr += 1;
9710     break;
9711 
9712     case META_MINMAX:
9713     case META_MINMAX_QUERY:
9714     case META_MINMAX_PLUS:
9715     pptr += 2;
9716     break;
9717 
9718     case META_CALLOUT_NUMBER:
9719     pptr += 3;
9720     break;
9721 
9722     case META_MARK:
9723     case META_COMMIT_ARG:
9724     case META_PRUNE_ARG:
9725     case META_SKIP_ARG:
9726     case META_THEN_ARG:
9727     pptr += 1 + pptr[1];
9728     break;
9729 
9730     case META_LOOKBEHIND:
9731     case META_LOOKBEHINDNOT:
9732     case META_LOOKBEHIND_NA:
9733     if (!set_lookbehind_lengths(&pptr, &errorcode, lcptr, recurses, cb))
9734       return errorcode;
9735     break;
9736     }
9737   }
9738 
9739 return 0;
9740 }
9741 
9742 
9743 
9744 /*************************************************
9745 *     External function to compile a pattern     *
9746 *************************************************/
9747 
9748 /* This function reads a regular expression in the form of a string and returns
9749 a pointer to a block of store holding a compiled version of the expression.
9750 
9751 Arguments:
9752   pattern       the regular expression
9753   patlen        the length of the pattern, or PCRE2_ZERO_TERMINATED
9754   options       option bits
9755   errorptr      pointer to errorcode
9756   erroroffset   pointer to error offset
9757   ccontext      points to a compile context or is NULL
9758 
9759 Returns:        pointer to compiled data block, or NULL on error,
9760                 with errorcode and erroroffset set
9761 */
9762 
9763 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_compile(PCRE2_SPTR pattern,PCRE2_SIZE patlen,uint32_t options,int * errorptr,PCRE2_SIZE * erroroffset,pcre2_compile_context * ccontext)9764 pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
9765    int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
9766 {
9767 BOOL utf;                             /* Set TRUE for UTF mode */
9768 BOOL ucp;                             /* Set TRUE for UCP mode */
9769 BOOL has_lookbehind = FALSE;          /* Set TRUE if a lookbehind is found */
9770 BOOL zero_terminated;                 /* Set TRUE for zero-terminated pattern */
9771 pcre2_real_code *re = NULL;           /* What we will return */
9772 compile_block cb;                     /* "Static" compile-time data */
9773 const uint8_t *tables;                /* Char tables base pointer */
9774 
9775 PCRE2_UCHAR *code;                    /* Current pointer in compiled code */
9776 PCRE2_SPTR codestart;                 /* Start of compiled code */
9777 PCRE2_SPTR ptr;                       /* Current pointer in pattern */
9778 uint32_t *pptr;                       /* Current pointer in parsed pattern */
9779 
9780 PCRE2_SIZE length = 1;                /* Allow for final END opcode */
9781 PCRE2_SIZE usedlength;                /* Actual length used */
9782 PCRE2_SIZE re_blocksize;              /* Size of memory block */
9783 PCRE2_SIZE big32count = 0;            /* 32-bit literals >= 0x80000000 */
9784 PCRE2_SIZE parsed_size_needed;        /* Needed for parsed pattern */
9785 
9786 uint32_t firstcuflags, reqcuflags;    /* Type of first/req code unit */
9787 uint32_t firstcu, reqcu;              /* Value of first/req code unit */
9788 uint32_t setflags = 0;                /* NL and BSR set flags */
9789 
9790 uint32_t skipatstart;                 /* When checking (*UTF) etc */
9791 uint32_t limit_heap  = UINT32_MAX;
9792 uint32_t limit_match = UINT32_MAX;    /* Unset match limits */
9793 uint32_t limit_depth = UINT32_MAX;
9794 
9795 int newline = 0;                      /* Unset; can be set by the pattern */
9796 int bsr = 0;                          /* Unset; can be set by the pattern */
9797 int errorcode = 0;                    /* Initialize to avoid compiler warn */
9798 int regexrc;                          /* Return from compile */
9799 
9800 uint32_t i;                           /* Local loop counter */
9801 
9802 /* Comments at the head of this file explain about these variables. */
9803 
9804 uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];
9805 uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];
9806 named_group named_groups[NAMED_GROUP_LIST_SIZE];
9807 
9808 /* The workspace is used in different ways in the different compiling phases.
9809 It needs to be 16-bit aligned for the preliminary parsing scan. */
9810 
9811 uint32_t c16workspace[C16_WORK_SIZE];
9812 PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace;
9813 
9814 
9815 /* -------------- Check arguments and set up the pattern ----------------- */
9816 
9817 /* There must be error code and offset pointers. */
9818 
9819 if (errorptr == NULL || erroroffset == NULL) return NULL;
9820 *errorptr = ERR0;
9821 *erroroffset = 0;
9822 
9823 /* There must be a pattern! */
9824 
9825 if (pattern == NULL)
9826   {
9827   *errorptr = ERR16;
9828   return NULL;
9829   }
9830 
9831 /* A NULL compile context means "use a default context" */
9832 
9833 if (ccontext == NULL)
9834   ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
9835 
9836 /* PCRE2_MATCH_INVALID_UTF implies UTF */
9837 
9838 if ((options & PCRE2_MATCH_INVALID_UTF) != 0) options |= PCRE2_UTF;
9839 
9840 /* Check that all undefined public option bits are zero. */
9841 
9842 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 ||
9843     (ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)
9844   {
9845   *errorptr = ERR17;
9846   return NULL;
9847   }
9848 
9849 if ((options & PCRE2_LITERAL) != 0 &&
9850     ((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 ||
9851      (ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))
9852   {
9853   *errorptr = ERR92;
9854   return NULL;
9855   }
9856 
9857 /* A zero-terminated pattern is indicated by the special length value
9858 PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */
9859 
9860 if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))
9861   patlen = PRIV(strlen)(pattern);
9862 
9863 if (patlen > ccontext->max_pattern_length)
9864   {
9865   *errorptr = ERR88;
9866   return NULL;
9867   }
9868 
9869 /* From here on, all returns from this function should end up going via the
9870 EXIT label. */
9871 
9872 
9873 /* ------------ Initialize the "static" compile data -------------- */
9874 
9875 tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
9876 
9877 cb.lcc = tables + lcc_offset;          /* Individual */
9878 cb.fcc = tables + fcc_offset;          /*   character */
9879 cb.cbits = tables + cbits_offset;      /*      tables */
9880 cb.ctypes = tables + ctypes_offset;
9881 
9882 cb.assert_depth = 0;
9883 cb.bracount = 0;
9884 cb.cx = ccontext;
9885 cb.dupnames = FALSE;
9886 cb.end_pattern = pattern + patlen;
9887 cb.erroroffset = 0;
9888 cb.external_flags = 0;
9889 cb.external_options = options;
9890 cb.groupinfo = stack_groupinfo;
9891 cb.had_recurse = FALSE;
9892 cb.lastcapture = 0;
9893 cb.max_lookbehind = 0;
9894 cb.name_entry_size = 0;
9895 cb.name_table = NULL;
9896 cb.named_groups = named_groups;
9897 cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
9898 cb.names_found = 0;
9899 cb.open_caps = NULL;
9900 cb.parens_depth = 0;
9901 cb.parsed_pattern = stack_parsed_pattern;
9902 cb.req_varyopt = 0;
9903 cb.start_code = cworkspace;
9904 cb.start_pattern = pattern;
9905 cb.start_workspace = cworkspace;
9906 cb.workspace_size = COMPILE_WORK_SIZE;
9907 
9908 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
9909 references to help in deciding whether (.*) can be treated as anchored or not.
9910 */
9911 
9912 cb.top_backref = 0;
9913 cb.backref_map = 0;
9914 
9915 /* Escape sequences \1 to \9 are always back references, but as they are only
9916 two characters long, only two elements can be used in the parsed_pattern
9917 vector. The first contains the reference, and we'd like to use the second to
9918 record the offset in the pattern, so that forward references to non-existent
9919 groups can be diagnosed later with an offset. However, on 64-bit systems,
9920 PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first
9921 occurrence of \1 to \9, indexed by the second parsed_pattern value. All other
9922 references have enough space for the offset to be put into the parsed pattern.
9923 */
9924 
9925 for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET;
9926 
9927 
9928 /* --------------- Start looking at the pattern --------------- */
9929 
9930 /* Unless PCRE2_LITERAL is set, check for global one-time option settings at
9931 the start of the pattern, and remember the offset to the actual regex. With
9932 valgrind support, make the terminator of a zero-terminated pattern
9933 inaccessible. This catches bugs that would otherwise only show up for
9934 non-zero-terminated patterns. */
9935 
9936 #ifdef SUPPORT_VALGRIND
9937 if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1));
9938 #endif
9939 
9940 ptr = pattern;
9941 skipatstart = 0;
9942 
9943 if ((options & PCRE2_LITERAL) == 0)
9944   {
9945   while (patlen - skipatstart >= 2 &&
9946          ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
9947          ptr[skipatstart+1] == CHAR_ASTERISK)
9948     {
9949     for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
9950       {
9951       uint32_t c, pp;
9952       pso *p = pso_list + i;
9953 
9954       if (patlen - skipatstart - 2 >= p->length &&
9955           PRIV(strncmp_c8)(ptr + skipatstart + 2, (char *)(p->name),
9956             p->length) == 0)
9957         {
9958         skipatstart += p->length + 2;
9959         switch(p->type)
9960           {
9961           case PSO_OPT:
9962           cb.external_options |= p->value;
9963           break;
9964 
9965           case PSO_FLG:
9966           setflags |= p->value;
9967           break;
9968 
9969           case PSO_NL:
9970           newline = p->value;
9971           setflags |= PCRE2_NL_SET;
9972           break;
9973 
9974           case PSO_BSR:
9975           bsr = p->value;
9976           setflags |= PCRE2_BSR_SET;
9977           break;
9978 
9979           case PSO_LIMM:
9980           case PSO_LIMD:
9981           case PSO_LIMH:
9982           c = 0;
9983           pp = skipatstart;
9984           if (!IS_DIGIT(ptr[pp]))
9985             {
9986             errorcode = ERR60;
9987             ptr += pp;
9988             goto HAD_EARLY_ERROR;
9989             }
9990           while (IS_DIGIT(ptr[pp]))
9991             {
9992             if (c > UINT32_MAX / 10 - 1) break;   /* Integer overflow */
9993             c = c*10 + (ptr[pp++] - CHAR_0);
9994             }
9995           if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS)
9996             {
9997             errorcode = ERR60;
9998             ptr += pp;
9999             goto HAD_EARLY_ERROR;
10000             }
10001           if (p->type == PSO_LIMH) limit_heap = c;
10002             else if (p->type == PSO_LIMM) limit_match = c;
10003             else limit_depth = c;
10004           skipatstart += pp - skipatstart;
10005           break;
10006           }
10007         break;   /* Out of the table scan loop */
10008         }
10009       }
10010     if (i >= sizeof(pso_list)/sizeof(pso)) break;   /* Out of pso loop */
10011     }
10012   }
10013 
10014 /* End of pattern-start options; advance to start of real regex. */
10015 
10016 ptr += skipatstart;
10017 
10018 /* Can't support UTF or UCP if PCRE2 was built without Unicode support. */
10019 
10020 #ifndef SUPPORT_UNICODE
10021 if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
10022   {
10023   errorcode = ERR32;
10024   goto HAD_EARLY_ERROR;
10025   }
10026 #endif
10027 
10028 /* Check UTF. We have the original options in 'options', with that value as
10029 modified by (*UTF) etc in cb->external_options. The extra option
10030 PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
10031 surrogate code points cannot be represented in UTF-16. */
10032 
10033 utf = (cb.external_options & PCRE2_UTF) != 0;
10034 if (utf)
10035   {
10036   if ((options & PCRE2_NEVER_UTF) != 0)
10037     {
10038     errorcode = ERR74;
10039     goto HAD_EARLY_ERROR;
10040     }
10041   if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
10042        (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
10043     goto HAD_ERROR;  /* Offset was set by valid_utf() */
10044 
10045 #if PCRE2_CODE_UNIT_WIDTH == 16
10046   if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
10047     {
10048     errorcode = ERR91;
10049     goto HAD_EARLY_ERROR;
10050     }
10051 #endif
10052   }
10053 
10054 /* Check UCP lockout. */
10055 
10056 ucp = (cb.external_options & PCRE2_UCP) != 0;
10057 if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
10058   {
10059   errorcode = ERR75;
10060   goto HAD_EARLY_ERROR;
10061   }
10062 
10063 /* Process the BSR setting. */
10064 
10065 if (bsr == 0) bsr = ccontext->bsr_convention;
10066 
10067 /* Process the newline setting. */
10068 
10069 if (newline == 0) newline = ccontext->newline_convention;
10070 cb.nltype = NLTYPE_FIXED;
10071 switch(newline)
10072   {
10073   case PCRE2_NEWLINE_CR:
10074   cb.nllen = 1;
10075   cb.nl[0] = CHAR_CR;
10076   break;
10077 
10078   case PCRE2_NEWLINE_LF:
10079   cb.nllen = 1;
10080   cb.nl[0] = CHAR_NL;
10081   break;
10082 
10083   case PCRE2_NEWLINE_NUL:
10084   cb.nllen = 1;
10085   cb.nl[0] = CHAR_NUL;
10086   break;
10087 
10088   case PCRE2_NEWLINE_CRLF:
10089   cb.nllen = 2;
10090   cb.nl[0] = CHAR_CR;
10091   cb.nl[1] = CHAR_NL;
10092   break;
10093 
10094   case PCRE2_NEWLINE_ANY:
10095   cb.nltype = NLTYPE_ANY;
10096   break;
10097 
10098   case PCRE2_NEWLINE_ANYCRLF:
10099   cb.nltype = NLTYPE_ANYCRLF;
10100   break;
10101 
10102   default:
10103   errorcode = ERR56;
10104   goto HAD_EARLY_ERROR;
10105   }
10106 
10107 /* Pre-scan the pattern to do two things: (1) Discover the named groups and
10108 their numerical equivalents, so that this information is always available for
10109 the remaining processing. (2) At the same time, parse the pattern and put a
10110 processed version into the parsed_pattern vector. This has escapes interpreted
10111 and comments removed (amongst other things).
10112 
10113 In all but one case, when PCRE2_AUTO_CALLOUT is not set, the number of unsigned
10114 32-bit ints in the parsed pattern is bounded by the length of the pattern plus
10115 one (for the terminator) plus four if PCRE2_EXTRA_MATCH_WORD or
10116 PCRE2_EXTRA_MATCH_LINE is set. The exception is 32-bit, non-UTF mode, where
10117 literal characters of META_END (0x80000000) or above have to be coded as two
10118 units. In this case, therefore, we scan the pattern to check for such values. */
10119 
10120 #if PCRE2_CODE_UNIT_WIDTH == 32
10121 if (!utf)
10122   {
10123   PCRE2_SPTR p;
10124   for (p = ptr; p < cb.end_pattern; p++) if (*p >= META_END) big32count++;
10125   }
10126 #endif
10127 
10128 /* Ensure that the parsed pattern buffer is big enough. When PCRE2_AUTO_CALLOUT
10129 is set we have to assume a numerical callout (4 elements) for each character
10130 plus one at the end. This is overkill, but memory is plentiful these days. For
10131 many smaller patterns the vector on the stack (which was set up above) can be
10132 used. */
10133 
10134 parsed_size_needed = patlen - skipatstart + big32count;
10135 
10136 if ((ccontext->extra_options &
10137      (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0)
10138   parsed_size_needed += 4;
10139 
10140 if ((options & PCRE2_AUTO_CALLOUT) != 0)
10141   parsed_size_needed = (parsed_size_needed + 1) * 5;
10142 
10143 if (parsed_size_needed >= PARSED_PATTERN_DEFAULT_SIZE)
10144   {
10145   uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(
10146     (parsed_size_needed + 1) * sizeof(uint32_t), ccontext->memctl.memory_data);
10147   if (heap_parsed_pattern == NULL)
10148     {
10149     *errorptr = ERR21;
10150     goto EXIT;
10151     }
10152   cb.parsed_pattern = heap_parsed_pattern;
10153   }
10154 cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed + 1;
10155 
10156 /* Do the parsing scan. */
10157 
10158 errorcode = parse_regex(ptr, cb.external_options, &has_lookbehind, &cb);
10159 if (errorcode != 0) goto HAD_CB_ERROR;
10160 
10161 /* Workspace is needed to remember information about numbered groups: whether a
10162 group can match an empty string and what its fixed length is. This is done to
10163 avoid the possibility of recursive references causing very long compile times
10164 when checking these features. Unnumbered groups do not have this exposure since
10165 they cannot be referenced. We use an indexed vector for this purpose. If there
10166 are sufficiently few groups, the default vector on the stack, as set up above,
10167 can be used. Otherwise we have to get/free a special vector. The vector must be
10168 initialized to zero. */
10169 
10170 if (cb.bracount >= GROUPINFO_DEFAULT_SIZE)
10171   {
10172   cb.groupinfo = ccontext->memctl.malloc(
10173     (cb.bracount + 1)*sizeof(uint32_t), ccontext->memctl.memory_data);
10174   if (cb.groupinfo == NULL)
10175     {
10176     errorcode = ERR21;
10177     cb.erroroffset = 0;
10178     goto HAD_CB_ERROR;
10179     }
10180   }
10181 memset(cb.groupinfo, 0, (cb.bracount + 1) * sizeof(uint32_t));
10182 
10183 /* If there were any lookbehinds, scan the parsed pattern to figure out their
10184 lengths. */
10185 
10186 if (has_lookbehind)
10187   {
10188   int loopcount = 0;
10189   errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb, &loopcount);
10190   if (errorcode != 0) goto HAD_CB_ERROR;
10191   }
10192 
10193 /* For debugging, there is a function that shows the parsed data vector. */
10194 
10195 #ifdef DEBUG_SHOW_PARSED
10196 fprintf(stderr, "+++ Pre-scan complete:\n");
10197 show_parsed(&cb);
10198 #endif
10199 
10200 /* For debugging capturing information this code can be enabled. */
10201 
10202 #ifdef DEBUG_SHOW_CAPTURES
10203   {
10204   named_group *ng = cb.named_groups;
10205   fprintf(stderr, "+++Captures: %d\n", cb.bracount);
10206   for (i = 0; i < cb.names_found; i++, ng++)
10207     {
10208     fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
10209     }
10210   }
10211 #endif
10212 
10213 /* Pretend to compile the pattern while actually just accumulating the amount
10214 of memory required in the 'length' variable. This behaviour is triggered by
10215 passing a non-NULL final argument to compile_regex(). We pass a block of
10216 workspace (cworkspace) for it to compile parts of the pattern into; the
10217 compiled code is discarded when it is no longer needed, so hopefully this
10218 workspace will never overflow, though there is a test for its doing so.
10219 
10220 On error, errorcode will be set non-zero, so we don't need to look at the
10221 result of the function. The initial options have been put into the cb block,
10222 but we still have to pass a separate options variable (the first argument)
10223 because the options may change as the pattern is processed. */
10224 
10225 cb.erroroffset = patlen;   /* For any subsequent errors that do not set it */
10226 pptr = cb.parsed_pattern;
10227 code = cworkspace;
10228 *code = OP_BRA;
10229 
10230 (void)compile_regex(cb.external_options, &code, &pptr, &errorcode, 0, &firstcu,
10231    &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, &length);
10232 
10233 if (errorcode != 0) goto HAD_CB_ERROR;  /* Offset is in cb.erroroffset */
10234 
10235 /* This should be caught in compile_regex(), but just in case... */
10236 
10237 if (length > MAX_PATTERN_SIZE)
10238   {
10239   errorcode = ERR20;
10240   goto HAD_CB_ERROR;
10241   }
10242 
10243 /* Compute the size of, and then get and initialize, the data block for storing
10244 the compiled pattern and names table. Integer overflow should no longer be
10245 possible because nowadays we limit the maximum value of cb.names_found and
10246 cb.name_entry_size. */
10247 
10248 re_blocksize = sizeof(pcre2_real_code) +
10249   CU2BYTES(length +
10250   (PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
10251 re = (pcre2_real_code *)
10252   ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
10253 if (re == NULL)
10254   {
10255   errorcode = ERR21;
10256   goto HAD_CB_ERROR;
10257   }
10258 
10259 /* The compiler may put padding at the end of the pcre2_real_code structure in
10260 order to round it up to a multiple of 4 or 8 bytes. This means that when a
10261 compiled pattern is copied (for example, when serialized) undefined bytes are
10262 read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
10263 write to the last 8 bytes of the structure before setting the fields. */
10264 
10265 memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
10266 re->memctl = ccontext->memctl;
10267 re->tables = tables;
10268 re->executable_jit = NULL;
10269 memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
10270 re->blocksize = re_blocksize;
10271 re->magic_number = MAGIC_NUMBER;
10272 re->compile_options = options;
10273 re->overall_options = cb.external_options;
10274 re->extra_options = ccontext->extra_options;
10275 re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
10276 re->limit_heap = limit_heap;
10277 re->limit_match = limit_match;
10278 re->limit_depth = limit_depth;
10279 re->first_codeunit = 0;
10280 re->last_codeunit = 0;
10281 re->bsr_convention = bsr;
10282 re->newline_convention = newline;
10283 re->max_lookbehind = 0;
10284 re->minlength = 0;
10285 re->top_bracket = 0;
10286 re->top_backref = 0;
10287 re->name_entry_size = cb.name_entry_size;
10288 re->name_count = cb.names_found;
10289 
10290 /* The basic block is immediately followed by the name table, and the compiled
10291 code follows after that. */
10292 
10293 codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) +
10294   re->name_entry_size * re->name_count;
10295 
10296 /* Update the compile data block for the actual compile. The starting points of
10297 the name/number translation table and of the code are passed around in the
10298 compile data block. The start/end pattern and initial options are already set
10299 from the pre-compile phase, as is the name_entry_size field. */
10300 
10301 cb.parens_depth = 0;
10302 cb.assert_depth = 0;
10303 cb.lastcapture = 0;
10304 cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
10305 cb.start_code = codestart;
10306 cb.req_varyopt = 0;
10307 cb.had_accept = FALSE;
10308 cb.had_pruneorskip = FALSE;
10309 cb.open_caps = NULL;
10310 
10311 /* If any named groups were found, create the name/number table from the list
10312 created in the pre-pass. */
10313 
10314 if (cb.names_found > 0)
10315   {
10316   named_group *ng = cb.named_groups;
10317   for (i = 0; i < cb.names_found; i++, ng++)
10318     add_name_to_table(&cb, ng->name, ng->length, ng->number, i);
10319   }
10320 
10321 /* Set up a starting, non-extracting bracket, then compile the expression. On
10322 error, errorcode will be set non-zero, so we don't need to look at the result
10323 of the function here. */
10324 
10325 pptr = cb.parsed_pattern;
10326 code = (PCRE2_UCHAR *)codestart;
10327 *code = OP_BRA;
10328 regexrc = compile_regex(re->overall_options, &code, &pptr, &errorcode, 0,
10329   &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, NULL);
10330 if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
10331 re->top_bracket = cb.bracount;
10332 re->top_backref = cb.top_backref;
10333 re->max_lookbehind = cb.max_lookbehind;
10334 
10335 if (cb.had_accept)
10336   {
10337   reqcu = 0;                     /* Must disable after (*ACCEPT) */
10338   reqcuflags = REQ_NONE;
10339   re->flags |= PCRE2_HASACCEPT;  /* Disables minimum length */
10340   }
10341 
10342 /* Fill in the final opcode and check for disastrous overflow. If no overflow,
10343 but the estimated length exceeds the really used length, adjust the value of
10344 re->blocksize, and if valgrind support is configured, mark the extra allocated
10345 memory as unaddressable, so that any out-of-bound reads can be detected. */
10346 
10347 *code++ = OP_END;
10348 usedlength = code - codestart;
10349 if (usedlength > length) errorcode = ERR23; else
10350   {
10351   re->blocksize -= CU2BYTES(length - usedlength);
10352 #ifdef SUPPORT_VALGRIND
10353   VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
10354 #endif
10355   }
10356 
10357 /* Scan the pattern for recursion/subroutine calls and convert the group
10358 numbers into offsets. Maintain a small cache so that repeated groups containing
10359 recursions are efficiently handled. */
10360 
10361 #define RSCAN_CACHE_SIZE 8
10362 
10363 if (errorcode == 0 && cb.had_recurse)
10364   {
10365   PCRE2_UCHAR *rcode;
10366   PCRE2_SPTR rgroup;
10367   unsigned int ccount = 0;
10368   int start = RSCAN_CACHE_SIZE;
10369   recurse_cache rc[RSCAN_CACHE_SIZE];
10370 
10371   for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf);
10372        rcode != NULL;
10373        rcode = (PCRE2_UCHAR *)find_recurse(rcode + 1 + LINK_SIZE, utf))
10374     {
10375     int p, groupnumber;
10376 
10377     groupnumber = (int)GET(rcode, 1);
10378     if (groupnumber == 0) rgroup = codestart; else
10379       {
10380       PCRE2_SPTR search_from = codestart;
10381       rgroup = NULL;
10382       for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
10383         {
10384         if (groupnumber == rc[p].groupnumber)
10385           {
10386           rgroup = rc[p].group;
10387           break;
10388           }
10389 
10390         /* Group n+1 must always start to the right of group n, so we can save
10391         search time below when the new group number is greater than any of the
10392         previously found groups. */
10393 
10394         if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;
10395         }
10396 
10397       if (rgroup == NULL)
10398         {
10399         rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);
10400         if (rgroup == NULL)
10401           {
10402           errorcode = ERR53;
10403           break;
10404           }
10405         if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
10406         rc[start].groupnumber = groupnumber;
10407         rc[start].group = rgroup;
10408         if (ccount < RSCAN_CACHE_SIZE) ccount++;
10409         }
10410       }
10411 
10412     PUT(rcode, 1, rgroup - codestart);
10413     }
10414   }
10415 
10416 /* In rare debugging situations we sometimes need to look at the compiled code
10417 at this stage. */
10418 
10419 #ifdef DEBUG_CALL_PRINTINT
10420 pcre2_printint(re, stderr, TRUE);
10421 fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
10422 #endif
10423 
10424 /* Unless disabled, check whether any single character iterators can be
10425 auto-possessified. The function overwrites the appropriate opcode values, so
10426 the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
10427 used in this code because at least one compiler gives a warning about loss of
10428 "const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the
10429 function call. */
10430 
10431 if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
10432   {
10433   PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
10434   if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80;
10435   }
10436 
10437 /* Failed to compile, or error while post-processing. */
10438 
10439 if (errorcode != 0) goto HAD_CB_ERROR;
10440 
10441 /* Successful compile. If the anchored option was not passed, set it if
10442 we can determine that the pattern is anchored by virtue of ^ characters or \A
10443 or anything else, such as starting with non-atomic .* when DOTALL is set and
10444 there are no occurrences of *PRUNE or *SKIP (though there is an option to
10445 disable this case). */
10446 
10447 if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10448      is_anchored(codestart, 0, &cb, 0, FALSE))
10449   re->overall_options |= PCRE2_ANCHORED;
10450 
10451 /* Set up the first code unit or startline flag, the required code unit, and
10452 then study the pattern. This code need not be obeyed if PCRE2_NO_START_OPTIMIZE
10453 is set, as the data it would create will not be used. Note that a first code
10454 unit (but not the startline flag) is useful for anchored patterns because it
10455 can still give a quick "no match" and also avoid searching for a last code
10456 unit. */
10457 
10458 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
10459   {
10460   int minminlength = 0;  /* For minimal minlength from first/required CU */
10461 
10462   /* If we do not have a first code unit, see if there is one that is asserted
10463   (these are not saved during the compile because they can cause conflicts with
10464   actual literals that follow). */
10465 
10466   if (firstcuflags >= REQ_NONE)
10467     firstcu = find_firstassertedcu(codestart, &firstcuflags, 0);
10468 
10469   /* Save the data for a first code unit. The existence of one means the
10470   minimum length must be at least 1. */
10471 
10472   if (firstcuflags < REQ_NONE)
10473     {
10474     re->first_codeunit = firstcu;
10475     re->flags |= PCRE2_FIRSTSET;
10476     minminlength++;
10477 
10478     /* Handle caseless first code units. */
10479 
10480     if ((firstcuflags & REQ_CASELESS) != 0)
10481       {
10482       if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
10483         {
10484         if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
10485         }
10486 
10487       /* The first code unit is >= 128 in UTF or UCP mode, or >= 255 otherwise.
10488       In 8-bit UTF mode, codepoints in the range 128-255 are introductory code
10489       points and cannot have another case, but if UCP is set they may do. */
10490 
10491 #ifdef SUPPORT_UNICODE
10492 #if PCRE2_CODE_UNIT_WIDTH == 8
10493       else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
10494         re->flags |= PCRE2_FIRSTCASELESS;
10495 #else
10496       else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
10497                UCD_OTHERCASE(firstcu) != firstcu)
10498         re->flags |= PCRE2_FIRSTCASELESS;
10499 #endif
10500 #endif  /* SUPPORT_UNICODE */
10501       }
10502     }
10503 
10504   /* When there is no first code unit, for non-anchored patterns, see if we can
10505   set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
10506   branches start with ^ and also when all branches start with non-atomic .* for
10507   non-DOTALL matches when *PRUNE and SKIP are not present. (There is an option
10508   that disables this case.) */
10509 
10510   else if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10511            is_startline(codestart, 0, &cb, 0, FALSE))
10512     re->flags |= PCRE2_STARTLINE;
10513 
10514   /* Handle the "required code unit", if one is set. In the UTF case we can
10515   increment the minimum minimum length only if we are sure this really is a
10516   different character and not a non-starting code unit of the first character,
10517   because the minimum length count is in characters, not code units. */
10518 
10519   if (reqcuflags < REQ_NONE)
10520     {
10521 #if PCRE2_CODE_UNIT_WIDTH == 16
10522     if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
10523         firstcuflags >= REQ_NONE ||                 /* First not set */
10524         (firstcu & 0xf800) != 0xd800 ||             /* First not surrogate */
10525         (reqcu & 0xfc00) != 0xdc00)                 /* Req not low surrogate */
10526 #elif PCRE2_CODE_UNIT_WIDTH == 8
10527     if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
10528         firstcuflags >= REQ_NONE ||                 /* First not set */
10529         (firstcu & 0x80) == 0 ||                    /* First is ASCII */
10530         (reqcu & 0x80) == 0)                        /* Req is ASCII */
10531 #endif
10532       {
10533       minminlength++;
10534       }
10535 
10536     /* In the case of an anchored pattern, set up the value only if it follows
10537     a variable length item in the pattern. */
10538 
10539     if ((re->overall_options & PCRE2_ANCHORED) == 0 ||
10540         (reqcuflags & REQ_VARY) != 0)
10541       {
10542       re->last_codeunit = reqcu;
10543       re->flags |= PCRE2_LASTSET;
10544 
10545       /* Handle caseless required code units as for first code units (above). */
10546 
10547       if ((reqcuflags & REQ_CASELESS) != 0)
10548         {
10549         if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
10550           {
10551           if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
10552           }
10553 #ifdef SUPPORT_UNICODE
10554 #if PCRE2_CODE_UNIT_WIDTH == 8
10555       else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
10556         re->flags |= PCRE2_LASTCASELESS;
10557 #else
10558       else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
10559                UCD_OTHERCASE(reqcu) != reqcu)
10560         re->flags |= PCRE2_LASTCASELESS;
10561 #endif
10562 #endif  /* SUPPORT_UNICODE */
10563         }
10564       }
10565     }
10566 
10567   /* Study the compiled pattern to set up information such as a bitmap of
10568   starting code units and a minimum matching length. */
10569 
10570   if (PRIV(study)(re) != 0)
10571     {
10572     errorcode = ERR31;
10573     goto HAD_CB_ERROR;
10574     }
10575 
10576   /* If study() set a bitmap of starting code units, it implies a minimum
10577   length of at least one. */
10578 
10579   if ((re->flags & PCRE2_FIRSTMAPSET) != 0 && minminlength == 0)
10580     minminlength = 1;
10581 
10582   /* If the minimum length set (or not set) by study() is less than the minimum
10583   implied by required code units, override it. */
10584 
10585   if (re->minlength < minminlength) re->minlength = minminlength;
10586   }   /* End of start-of-match optimizations. */
10587 
10588 /* Control ends up here in all cases. When running under valgrind, make a
10589 pattern's terminating zero defined again. If memory was obtained for the parsed
10590 version of the pattern, free it before returning. Also free the list of named
10591 groups if a larger one had to be obtained, and likewise the group information
10592 vector. */
10593 
10594 EXIT:
10595 #ifdef SUPPORT_VALGRIND
10596 if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1));
10597 #endif
10598 if (cb.parsed_pattern != stack_parsed_pattern)
10599   ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);
10600 if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
10601   ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
10602 if (cb.groupinfo != stack_groupinfo)
10603   ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
10604 return re;    /* Will be NULL after an error */
10605 
10606 /* Errors discovered in parse_regex() set the offset value in the compile
10607 block. Errors discovered before it is called must compute it from the ptr
10608 value. After parse_regex() is called, the offset in the compile block is set to
10609 the end of the pattern, but certain errors in compile_regex() may reset it if
10610 an offset is available in the parsed pattern. */
10611 
10612 HAD_CB_ERROR:
10613 ptr = pattern + cb.erroroffset;
10614 
10615 HAD_EARLY_ERROR:
10616 *erroroffset = ptr - pattern;
10617 
10618 HAD_ERROR:
10619 *errorptr = errorcode;
10620 pcre2_code_free(re);
10621 re = NULL;
10622 goto EXIT;
10623 }
10624 
10625 /* These #undefs are here to enable unity builds with CMake. */
10626 
10627 #undef NLBLOCK /* Block containing newline information */
10628 #undef PSSTART /* Field containing processed string start */
10629 #undef PSEND   /* Field containing processed string end */
10630 
10631 /* End of pcre2_compile.c */
10632