xref: /PHP-8.1/ext/pcre/pcre2lib/pcre2_compile.c (revision 6008a75f)
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10           New API code Copyright (c) 2016-2021 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45 
46 #define NLBLOCK cb             /* Block containing newline information */
47 #define PSSTART start_pattern  /* Field containing processed string start */
48 #define PSEND   end_pattern    /* Field containing processed string end */
49 
50 #include "pcre2_internal.h"
51 
52 /* In rare error cases debugging might require calling pcre2_printint(). */
53 
54 #if 0
55 #ifdef EBCDIC
56 #define PRINTABLE(c) ((c) >= 64 && (c) < 255)
57 #else
58 #define PRINTABLE(c) ((c) >= 32 && (c) < 127)
59 #endif
60 #include "pcre2_printint.c"
61 #define DEBUG_CALL_PRINTINT
62 #endif
63 
64 /* Other debugging code can be enabled by these defines. */
65 
66 /* #define DEBUG_SHOW_CAPTURES */
67 /* #define DEBUG_SHOW_PARSED */
68 
69 /* There are a few things that vary with different code unit sizes. Handle them
70 by defining macros in order to minimize #if usage. */
71 
72 #if PCRE2_CODE_UNIT_WIDTH == 8
73 #define STRING_UTFn_RIGHTPAR     STRING_UTF8_RIGHTPAR, 5
74 #define XDIGIT(c)                xdigitab[c]
75 
76 #else  /* Either 16-bit or 32-bit */
77 #define XDIGIT(c)                (MAX_255(c)? xdigitab[c] : 0xff)
78 
79 #if PCRE2_CODE_UNIT_WIDTH == 16
80 #define STRING_UTFn_RIGHTPAR     STRING_UTF16_RIGHTPAR, 6
81 
82 #else  /* 32-bit */
83 #define STRING_UTFn_RIGHTPAR     STRING_UTF32_RIGHTPAR, 6
84 #endif
85 #endif
86 
87 /* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which
88 consists of uint32_t elements. Assume that if uint32_t can't hold it, two of
89 them will be able to (i.e. assume a 64-bit world). */
90 
/* Every macro parameter is parenthesized so that expression arguments such as
"*pptrptr" or "a | b" expand with the intended precedence. The arguments may be
evaluated more than once, so they must not have side effects.

  PUTOFFSET(s,p)      store s at p and advance p past it
  GETOFFSET(s,p)      load s from p and advance p past it
  GETPLUSOFFSET(s,p)  load s from the element(s) following p, leaving p on
                        the last element that was read
  READPLUSOFFSET(s,p) load s from the element(s) following p; p is unchanged
  SKIPOFFSET(p)       advance p past one stored offset
  SIZEOFFSET          number of uint32_t elements used by one offset */

#if PCRE2_SIZE_MAX <= UINT32_MAX
#define PUTOFFSET(s,p) (*(p)++ = (s))
#define GETOFFSET(s,p) ((s) = *(p)++)
#define GETPLUSOFFSET(s,p) ((s) = *(++(p)))
#define READPLUSOFFSET(s,p) ((s) = (p)[1])
#define SKIPOFFSET(p) ((p)++)
#define SIZEOFFSET 1
#else
/* The value is split into two elements, most significant half first. */
#define PUTOFFSET(s,p) \
  { *(p)++ = (uint32_t)((s) >> 32); *(p)++ = (uint32_t)((s) & 0xffffffff); }
#define GETOFFSET(s,p) \
  { (s) = ((PCRE2_SIZE)(p)[0] << 32) | (PCRE2_SIZE)(p)[1]; (p) += 2; }
#define GETPLUSOFFSET(s,p) \
  { (s) = ((PCRE2_SIZE)(p)[1] << 32) | (PCRE2_SIZE)(p)[2]; (p) += 2; }
#define READPLUSOFFSET(s,p) \
  { (s) = ((PCRE2_SIZE)(p)[1] << 32) | (PCRE2_SIZE)(p)[2]; }
#define SKIPOFFSET(p) ((p) += 2)
#define SIZEOFFSET 2
#endif
110 
111 /* Macros for manipulating elements of the parsed pattern vector. */
112 
/* META_CODE(x) extracts the item code from the top 16 bits of a parsed
pattern element, and META_DATA(x) extracts the bottom 16 bits of data.
META_DIFF(x,y) is the difference between two elements, shifted down by 16 bits
so that it can be used as an index relative to a base META code. Arguments are
parenthesized so that expressions such as "a | b" or "y + z" expand with the
intended precedence. */

#define META_CODE(x)   ((x) & 0xffff0000u)
#define META_DATA(x)   ((x) & 0x0000ffffu)
#define META_DIFF(x,y) (((x)-(y))>>16)
116 
117 /* Function declarations to allow mutual recursion */
118 
119 #ifdef SUPPORT_UNICODE
120 static unsigned int
121   add_list_to_class_internal(uint8_t *, PCRE2_UCHAR **, uint32_t,
122     compile_block *, const uint32_t *, unsigned int);
123 #endif
124 
125 static int
126   compile_regex(uint32_t, PCRE2_UCHAR **, uint32_t **, int *, uint32_t,
127     uint32_t *, int32_t *, uint32_t *, int32_t *, branch_chain *,
128     compile_block *, PCRE2_SIZE *);
129 
130 static int
131   get_branchlength(uint32_t **, int *, int *, parsed_recurse_check *,
132     compile_block *);
133 
134 static BOOL
135   set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,
136     compile_block *);
137 
138 static int
139   check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *,
140     compile_block *, int *);
141 
142 
143 /*************************************************
144 *      Code parameters and static tables         *
145 *************************************************/
146 
147 #define MAX_GROUP_NUMBER   65535u
148 #define MAX_REPEAT_COUNT   65535u
149 #define REPEAT_UNLIMITED   (MAX_REPEAT_COUNT+1)
150 
151 /* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in
152 different ways in the different pattern scans. The parsing and group-
153 identifying pre-scan uses it to handle nesting, and needs it to be 16-bit
154 aligned for this. Having defined the size in code units, we set up
155 C16_WORK_SIZE as the number of elements in the 16-bit vector.
156 
157 During the first compiling phase, when determining how much memory is required,
158 the regex is partly compiled into this space, but the compiled parts are
159 discarded as soon as they can be, so that hopefully there will never be an
160 overrun. The code does, however, check for an overrun, which can occur for
161 pathological patterns. The size of the workspace depends on LINK_SIZE because
162 the length of compiled items varies with this.
163 
164 In the real compile phase, this workspace is not currently used. */
165 
166 #define COMPILE_WORK_SIZE (3000*LINK_SIZE)   /* Size in code units */
167 
168 #define C16_WORK_SIZE \
169   ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t))
170 
171 /* A uint32_t vector is used for caching information about the size of
172 capturing groups, to improve performance. A default is created on the stack of
173 this size. */
174 
175 #define GROUPINFO_DEFAULT_SIZE 256
176 
177 /* The overrun tests check for a slightly smaller size so that they detect the
178 overrun before it actually does run off the end of the data block. */
179 
180 #define WORK_SIZE_SAFETY_MARGIN (100)
181 
182 /* This value determines the size of the initial vector that is used for
183 remembering named groups during the pre-compile. It is allocated on the stack,
184 but if it is too small, it is expanded, in a similar way to the workspace. The
185 value is the number of slots in the list. */
186 
187 #define NAMED_GROUP_LIST_SIZE  20
188 
189 /* The pre-compiling pass over the pattern creates a parsed pattern in a vector
190 of uint32_t. For short patterns this lives on the stack, with this size. Heap
191 memory is used for longer patterns. */
192 
193 #define PARSED_PATTERN_DEFAULT_SIZE 1024
194 
195 /* Maximum length value to check against when making sure that the variable
196 that holds the compiled pattern length does not overflow. We make it a bit less
197 than INT_MAX to allow for adding in group terminating code units, so that we
198 don't have to check them every time. */
199 
200 #define OFLOW_MAX (INT_MAX - 20)
201 
202 /* Code values for parsed patterns, which are stored in a vector of 32-bit
203 unsigned ints. Values less than META_END are literal data values. The coding
204 for identifying the item is in the top 16-bits, leaving 16 bits for the
205 additional data that some of them need. The META_CODE, META_DATA, and META_DIFF
206 macros are used to manipulate parsed pattern elements.
207 
208 NOTE: When these definitions are changed, the table of extra lengths for each
209 code (meta_extra_lengths, just below) must be updated to remain in step. */
210 
211 #define META_END              0x80000000u  /* End of pattern */
212 
213 #define META_ALT              0x80010000u  /* alternation */
214 #define META_ATOMIC           0x80020000u  /* atomic group */
215 #define META_BACKREF          0x80030000u  /* Back ref */
216 #define META_BACKREF_BYNAME   0x80040000u  /* \k'name' */
217 #define META_BIGVALUE         0x80050000u  /* Next is a literal > META_END */
218 #define META_CALLOUT_NUMBER   0x80060000u  /* (?C with numerical argument */
219 #define META_CALLOUT_STRING   0x80070000u  /* (?C with string argument */
220 #define META_CAPTURE          0x80080000u  /* Capturing parenthesis */
221 #define META_CIRCUMFLEX       0x80090000u  /* ^ metacharacter */
222 #define META_CLASS            0x800a0000u  /* start non-empty class */
223 #define META_CLASS_EMPTY      0x800b0000u  /* empty class */
224 #define META_CLASS_EMPTY_NOT  0x800c0000u  /* negative empty class */
225 #define META_CLASS_END        0x800d0000u  /* end of non-empty class */
226 #define META_CLASS_NOT        0x800e0000u  /* start non-empty negative class */
227 #define META_COND_ASSERT      0x800f0000u  /* (?(?assertion)... */
228 #define META_COND_DEFINE      0x80100000u  /* (?(DEFINE)... */
229 #define META_COND_NAME        0x80110000u  /* (?(<name>)... */
230 #define META_COND_NUMBER      0x80120000u  /* (?(digits)... */
231 #define META_COND_RNAME       0x80130000u  /* (?(R&name)... */
232 #define META_COND_RNUMBER     0x80140000u  /* (?(Rdigits)... */
233 #define META_COND_VERSION     0x80150000u  /* (?(VERSION<op>x.y)... */
234 #define META_DOLLAR           0x80160000u  /* $ metacharacter */
235 #define META_DOT              0x80170000u  /* . metacharacter */
236 #define META_ESCAPE           0x80180000u  /* \d and friends */
237 #define META_KET              0x80190000u  /* closing parenthesis */
238 #define META_NOCAPTURE        0x801a0000u  /* no capture parens */
239 #define META_OPTIONS          0x801b0000u  /* (?i) and friends */
240 #define META_POSIX            0x801c0000u  /* POSIX class item */
241 #define META_POSIX_NEG        0x801d0000u  /* negative POSIX class item */
242 #define META_RANGE_ESCAPED    0x801e0000u  /* range with at least one escape */
243 #define META_RANGE_LITERAL    0x801f0000u  /* range defined literally */
244 #define META_RECURSE          0x80200000u  /* Recursion */
245 #define META_RECURSE_BYNAME   0x80210000u  /* (?&name) */
246 #define META_SCRIPT_RUN       0x80220000u  /* (*script_run:...) */
247 
248 /* These must be kept together to make it easy to check that an assertion
249 is present where expected in a conditional group. */
250 
251 #define META_LOOKAHEAD        0x80230000u  /* (?= */
252 #define META_LOOKAHEADNOT     0x80240000u  /* (?! */
253 #define META_LOOKBEHIND       0x80250000u  /* (?<= */
254 #define META_LOOKBEHINDNOT    0x80260000u  /* (?<! */
255 
256 /* These cannot be conditions */
257 
258 #define META_LOOKAHEAD_NA     0x80270000u  /* (*napla: */
259 #define META_LOOKBEHIND_NA    0x80280000u  /* (*naplb: */
260 
261 /* These must be kept in this order, with consecutive values, and the _ARG
262 versions of COMMIT, PRUNE, SKIP, and THEN immediately after their non-argument
263 versions. */
264 
265 #define META_MARK             0x80290000u  /* (*MARK) */
266 #define META_ACCEPT           0x802a0000u  /* (*ACCEPT) */
267 #define META_FAIL             0x802b0000u  /* (*FAIL) */
268 #define META_COMMIT           0x802c0000u  /* These               */
269 #define META_COMMIT_ARG       0x802d0000u  /*   pairs             */
270 #define META_PRUNE            0x802e0000u  /*     must            */
271 #define META_PRUNE_ARG        0x802f0000u  /*       be            */
272 #define META_SKIP             0x80300000u  /*         kept        */
273 #define META_SKIP_ARG         0x80310000u  /*           in        */
274 #define META_THEN             0x80320000u  /*             this    */
275 #define META_THEN_ARG         0x80330000u  /*               order */
276 
277 /* These must be kept in groups of adjacent 3 values, and all together. */
278 
279 #define META_ASTERISK         0x80340000u  /* *  */
280 #define META_ASTERISK_PLUS    0x80350000u  /* *+ */
281 #define META_ASTERISK_QUERY   0x80360000u  /* *? */
282 #define META_PLUS             0x80370000u  /* +  */
283 #define META_PLUS_PLUS        0x80380000u  /* ++ */
284 #define META_PLUS_QUERY       0x80390000u  /* +? */
285 #define META_QUERY            0x803a0000u  /* ?  */
286 #define META_QUERY_PLUS       0x803b0000u  /* ?+ */
287 #define META_QUERY_QUERY      0x803c0000u  /* ?? */
288 #define META_MINMAX           0x803d0000u  /* {n,m}  repeat */
289 #define META_MINMAX_PLUS      0x803e0000u  /* {n,m}+ repeat */
290 #define META_MINMAX_QUERY     0x803f0000u  /* {n,m}? repeat */
291 
292 #define META_FIRST_QUANTIFIER META_ASTERISK
293 #define META_LAST_QUANTIFIER  META_MINMAX_QUERY
294 
295 /* This is a special "meta code" that is used only to distinguish (*asr: from
296 (*sr: in the table of alphabetic assertions. It is never stored in the parsed
297 pattern because (*asr: is turned into (*sr:(*atomic: at that stage. There is
298 therefore no need for it to have a length entry, so use a high value. */
299 
300 #define META_ATOMIC_SCRIPT_RUN 0x8fff0000u
301 
302 /* Table of extra lengths for each of the meta codes. Must be kept in step with
303 the definitions above. For some items these values are a basic length to which
304 a variable amount has to be added. */
305 
306 static unsigned char meta_extra_lengths[] = {
307   0,             /* META_END */
308   0,             /* META_ALT */
309   0,             /* META_ATOMIC */
310   0,             /* META_BACKREF - more if group is >= 10 */
311   1+SIZEOFFSET,  /* META_BACKREF_BYNAME */
312   1,             /* META_BIGVALUE */
313   3,             /* META_CALLOUT_NUMBER */
314   3+SIZEOFFSET,  /* META_CALLOUT_STRING */
315   0,             /* META_CAPTURE */
316   0,             /* META_CIRCUMFLEX */
317   0,             /* META_CLASS */
318   0,             /* META_CLASS_EMPTY */
319   0,             /* META_CLASS_EMPTY_NOT */
320   0,             /* META_CLASS_END */
321   0,             /* META_CLASS_NOT */
322   0,             /* META_COND_ASSERT */
323   SIZEOFFSET,    /* META_COND_DEFINE */
324   1+SIZEOFFSET,  /* META_COND_NAME */
325   1+SIZEOFFSET,  /* META_COND_NUMBER */
326   1+SIZEOFFSET,  /* META_COND_RNAME */
327   1+SIZEOFFSET,  /* META_COND_RNUMBER */
328   3,             /* META_COND_VERSION */
329   0,             /* META_DOLLAR */
330   0,             /* META_DOT */
331   0,             /* META_ESCAPE - more for ESC_P, ESC_p, ESC_g, ESC_k */
332   0,             /* META_KET */
333   0,             /* META_NOCAPTURE */
334   1,             /* META_OPTIONS */
335   1,             /* META_POSIX */
336   1,             /* META_POSIX_NEG */
337   0,             /* META_RANGE_ESCAPED */
338   0,             /* META_RANGE_LITERAL */
339   SIZEOFFSET,    /* META_RECURSE */
340   1+SIZEOFFSET,  /* META_RECURSE_BYNAME */
341   0,             /* META_SCRIPT_RUN */
342   0,             /* META_LOOKAHEAD */
343   0,             /* META_LOOKAHEADNOT */
344   SIZEOFFSET,    /* META_LOOKBEHIND */
345   SIZEOFFSET,    /* META_LOOKBEHINDNOT */
346   0,             /* META_LOOKAHEAD_NA */
347   SIZEOFFSET,    /* META_LOOKBEHIND_NA */
348   1,             /* META_MARK - plus the string length */
349   0,             /* META_ACCEPT */
350   0,             /* META_FAIL */
351   0,             /* META_COMMIT */
352   1,             /* META_COMMIT_ARG - plus the string length */
353   0,             /* META_PRUNE */
354   1,             /* META_PRUNE_ARG - plus the string length */
355   0,             /* META_SKIP */
356   1,             /* META_SKIP_ARG - plus the string length */
357   0,             /* META_THEN */
358   1,             /* META_THEN_ARG - plus the string length */
359   0,             /* META_ASTERISK */
360   0,             /* META_ASTERISK_PLUS */
361   0,             /* META_ASTERISK_QUERY */
362   0,             /* META_PLUS */
363   0,             /* META_PLUS_PLUS */
364   0,             /* META_PLUS_QUERY */
365   0,             /* META_QUERY */
366   0,             /* META_QUERY_PLUS */
367   0,             /* META_QUERY_QUERY */
368   2,             /* META_MINMAX */
369   2,             /* META_MINMAX_PLUS */
370   2              /* META_MINMAX_QUERY */
371 };
372 
373 /* Types for skipping parts of a parsed pattern. */
374 
375 enum { PSKIP_ALT, PSKIP_CLASS, PSKIP_KET };
376 
377 /* Macro for setting individual bits in class bitmaps. It took some
378 experimenting to figure out how to stop gcc 5.3.0 from warning with
379 -Wconversion. This version gets a warning:
380 
381   #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1u << ((b)&7))
382 
383 Let's hope the apparently less efficient version isn't actually so bad if the
384 compiler is clever with identical subexpressions. */
385 
/* Both parameters are parenthesized so that an expression such as "map + 4"
can be passed as the bitmap. Each parameter is evaluated more than once, so
arguments must not have side effects. */

#define SETBIT(a,b) (a)[(b)/8] = (uint8_t)((a)[(b)/8] | (1u << ((b)&7)))
387 
388 /* Private flags added to firstcu and reqcu. */
389 
390 #define REQ_CASELESS    (1u << 0)       /* Indicates caselessness */
391 #define REQ_VARY        (1u << 1)       /* reqcu followed non-literal item */
392 /* Negative values for the firstcu and reqcu flags */
393 #define REQ_UNSET       (-2)            /* Not yet found anything */
394 #define REQ_NONE        (-1)            /* Found not fixed char */
395 
396 /* These flags are used in the groupinfo vector. */
397 
398 #define GI_SET_FIXED_LENGTH    0x80000000u
399 #define GI_NOT_FIXED_LENGTH    0x40000000u
400 #define GI_FIXED_LENGTH_MASK   0x0000ffffu
401 
402 /* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
403 and is fast (a good compiler can turn it into a subtraction and unsigned
404 comparison). */
405 
406 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
407 
408 /* Table to identify hex digits. The tables in chartables are dependent on the
409 locale, and may mark arbitrary characters as digits. We want to recognize only
410 0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
411 costs 256 bytes, but it is a lot faster than doing character value tests (at
412 least in some simple cases I timed), and in some applications one wants PCRE2
413 to compile efficiently as well as match efficiently. The value in the table is
414 the binary hex digit value, or 0xff for non-hex digits. */
415 
416 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
417 UTF-8 mode. */
418 
419 #ifndef EBCDIC
420 static const uint8_t xdigitab[] =
421   {
422   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7 */
423   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15 */
424   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 */
425   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31 */
426   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - '  */
427   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ( - /  */
428   0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  */
429   0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /*  8 - ?  */
430   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  @ - G  */
431   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H - O  */
432   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  P - W  */
433   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  X - _  */
434   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  ` - g  */
435   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h - o  */
436   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  p - w  */
437   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  x -127 */
438   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */
439   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */
440   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */
441   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */
442   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */
443   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */
444   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */
445   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
446   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */
447   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 200-207 */
448   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */
449   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */
450   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */
451   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */
452   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */
453   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */
454 
455 #else
456 
457 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
458 
459 static const uint8_t xdigitab[] =
460   {
461   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7  0 */
462   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15    */
463   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 10 */
464   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31    */
465   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  32- 39 20 */
466   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  40- 47    */
467   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  48- 55 30 */
468   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  56- 63    */
469   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - 71 40 */
470   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  72- |     */
471   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  & - 87 50 */
472   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  88- 95    */
473   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  - -103 60 */
474   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ?     */
475   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */
476   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- "     */
477   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g  80 */
478   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h -143    */
479   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p  90 */
480   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  q -159    */
481   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x  A0 */
482   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  y -175    */
483   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ^ -183 B0 */
484   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191    */
485   0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  { - G  C0 */
486   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H -207    */
487   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  } - P  D0 */
488   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Q -223    */
489   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  \ - X  E0 */
490   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Y -239    */
491   0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  F0 */
492   0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/*  8 -255    */
493 #endif  /* EBCDIC */
494 
495 
496 /* Table for handling alphanumeric escaped characters. Positive returns are
497 simple data values; negative values are for special things like \d and so on.
498 Zero means further processing is needed (for things like \x), or the escape is
499 invalid. */
500 
501 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
502 in UTF-8 mode. It runs from '0' to 'z'. */
503 
504 #ifndef EBCDIC
505 #define ESCAPES_FIRST       CHAR_0
506 #define ESCAPES_LAST        CHAR_z
507 #define UPPER_CASE(c)       (c-32)
508 
509 static const short int escapes[] = {
510      0,                       0,
511      0,                       0,
512      0,                       0,
513      0,                       0,
514      0,                       0,
515      CHAR_COLON,              CHAR_SEMICOLON,
516      CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
517      CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
518      CHAR_COMMERCIAL_AT,      -ESC_A,
519      -ESC_B,                  -ESC_C,
520      -ESC_D,                  -ESC_E,
521      0,                       -ESC_G,
522      -ESC_H,                  0,
523      0,                       -ESC_K,
524      0,                       0,
525      -ESC_N,                  0,
526      -ESC_P,                  -ESC_Q,
527      -ESC_R,                  -ESC_S,
528      0,                       0,
529      -ESC_V,                  -ESC_W,
530      -ESC_X,                  0,
531      -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
532      CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
533      CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
534      CHAR_GRAVE_ACCENT,       CHAR_BEL,
535      -ESC_b,                  0,
536      -ESC_d,                  CHAR_ESC,
537      CHAR_FF,                 0,
538      -ESC_h,                  0,
539      0,                       -ESC_k,
540      0,                       0,
541      CHAR_LF,                 0,
542      -ESC_p,                  0,
543      CHAR_CR,                 -ESC_s,
544      CHAR_HT,                 0,
545      -ESC_v,                  -ESC_w,
546      0,                       0,
547      -ESC_z
548 };
549 
550 #else
551 
552 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
553 It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code
554 is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a
555 because it is defined as 'a', which of course picks up the ASCII value. */
556 
557 #if 'a' == 0x81                    /* Check for a real EBCDIC environment */
558 #define ESCAPES_FIRST       CHAR_a
559 #define ESCAPES_LAST        CHAR_9
560 #define UPPER_CASE(c)       (c+64)
561 #else                              /* Testing in an ASCII environment */
562 #define ESCAPES_FIRST  ((unsigned char)'\x81')   /* EBCDIC 'a' */
563 #define ESCAPES_LAST   ((unsigned char)'\xf9')   /* EBCDIC '9' */
564 #define UPPER_CASE(c)  (c-32)
565 #endif
566 
567 static const short int escapes[] = {
568 /*  80 */         CHAR_BEL, -ESC_b,       0, -ESC_d, CHAR_ESC, CHAR_FF,      0,
569 /*  88 */ -ESC_h,        0,      0,     '{',      0,        0,       0,      0,
570 /*  90 */      0,        0, -ESC_k,       0,      0,  CHAR_LF,       0, -ESC_p,
571 /*  98 */      0,  CHAR_CR,      0,     '}',      0,        0,       0,      0,
572 /*  A0 */      0,      '~', -ESC_s, CHAR_HT,      0,   -ESC_v,  -ESC_w,      0,
573 /*  A8 */      0,   -ESC_z,      0,       0,      0,      '[',       0,      0,
574 /*  B0 */      0,        0,      0,       0,      0,        0,       0,      0,
575 /*  B8 */      0,        0,      0,       0,      0,      ']',     '=',    '-',
576 /*  C0 */    '{',   -ESC_A, -ESC_B,  -ESC_C, -ESC_D,   -ESC_E,       0, -ESC_G,
577 /*  C8 */ -ESC_H,        0,      0,       0,      0,        0,       0,      0,
578 /*  D0 */    '}',        0, -ESC_K,       0,      0,   -ESC_N,       0, -ESC_P,
579 /*  D8 */ -ESC_Q,   -ESC_R,      0,       0,      0,        0,       0,      0,
580 /*  E0 */   '\\',        0, -ESC_S,       0,      0,   -ESC_V,  -ESC_W, -ESC_X,
581 /*  E8 */      0,   -ESC_Z,      0,       0,      0,        0,       0,      0,
582 /*  F0 */      0,        0,      0,       0,      0,        0,       0,      0,
583 /*  F8 */      0,        0
584 };
585 
586 /* We also need a table of characters that may follow \c in an EBCDIC
587 environment for characters 0-31. */
588 
/* The table is only ever read, so it is declared const. Entry i is the
character that, following \c, yields the value i. */

static const unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
590 
591 #endif   /* EBCDIC */
592 
593 
594 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
595 searched linearly. Put all the names into a single string, in order to reduce
596 the number of relocations when a shared library is dynamically linked. The
597 string is built from string macros so that it works in UTF-8 mode on EBCDIC
598 platforms. */
599 
600 typedef struct verbitem {
601   unsigned int len;          /* Length of verb name */
602   uint32_t meta;             /* Base META_ code */
603   int has_arg;               /* Argument requirement */
604 } verbitem;
605 
606 static const char verbnames[] =
607   "\0"                       /* Empty name is a shorthand for MARK */
608   STRING_MARK0
609   STRING_ACCEPT0
610   STRING_F0
611   STRING_FAIL0
612   STRING_COMMIT0
613   STRING_PRUNE0
614   STRING_SKIP0
615   STRING_THEN;
616 
617 static const verbitem verbs[] = {
618   { 0, META_MARK,   +1 },  /* > 0 => must have an argument */
619   { 4, META_MARK,   +1 },
620   { 6, META_ACCEPT, -1 },  /* < 0 => Optional argument, convert to pre-MARK */
621   { 1, META_FAIL,   -1 },
622   { 4, META_FAIL,   -1 },
623   { 6, META_COMMIT,  0 },
624   { 5, META_PRUNE,   0 },  /* Optional argument; bump META code if found */
625   { 4, META_SKIP,    0 },
626   { 4, META_THEN,    0 }
627 };
628 
629 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
630 
631 /* Verb opcodes, indexed by their META code offset from META_MARK. */
632 
633 static const uint32_t verbops[] = {
634   OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,
635   OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };
636 
637 /* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. */
638 
639 typedef struct alasitem {
640   unsigned int len;          /* Length of name */
641   uint32_t meta;             /* Base META_ code */
642 } alasitem;
643 
644 static const char alasnames[] =
645   STRING_pla0
646   STRING_plb0
647   STRING_napla0
648   STRING_naplb0
649   STRING_nla0
650   STRING_nlb0
651   STRING_positive_lookahead0
652   STRING_positive_lookbehind0
653   STRING_non_atomic_positive_lookahead0
654   STRING_non_atomic_positive_lookbehind0
655   STRING_negative_lookahead0
656   STRING_negative_lookbehind0
657   STRING_atomic0
658   STRING_sr0
659   STRING_asr0
660   STRING_script_run0
661   STRING_atomic_script_run;
662 
663 static const alasitem alasmeta[] = {
664   {  3, META_LOOKAHEAD         },
665   {  3, META_LOOKBEHIND        },
666   {  5, META_LOOKAHEAD_NA      },
667   {  5, META_LOOKBEHIND_NA     },
668   {  3, META_LOOKAHEADNOT      },
669   {  3, META_LOOKBEHINDNOT     },
670   { 18, META_LOOKAHEAD         },
671   { 19, META_LOOKBEHIND        },
672   { 29, META_LOOKAHEAD_NA      },
673   { 30, META_LOOKBEHIND_NA     },
674   { 18, META_LOOKAHEADNOT      },
675   { 19, META_LOOKBEHINDNOT     },
676   {  6, META_ATOMIC            },
677   {  2, META_SCRIPT_RUN        }, /* sr = script run */
678   {  3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */
679   { 10, META_SCRIPT_RUN        }, /* script run */
680   { 17, META_ATOMIC_SCRIPT_RUN }  /* atomic script run */
681 };
682 
683 static const int alascount = sizeof(alasmeta)/sizeof(alasitem);
684 
685 /* Offsets from OP_STAR for case-independent and negative repeat opcodes. */
686 
687 static uint32_t chartypeoffset[] = {
688   OP_STAR - OP_STAR,    OP_STARI - OP_STAR,
689   OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };
690 
691 /* Tables of names of POSIX character classes and their lengths. The names are
692 now all in a single string, to reduce the number of relocations when a shared
693 library is dynamically loaded. The list of lengths is terminated by a zero
694 length entry. The first three must be alpha, lower, upper, as this is assumed
695 for handling case independence. The indices for graph, print, and punct are
696 needed, so identify them. */
697 
698 static const char posix_names[] =
699   STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
700   STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
701   STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
702   STRING_word0  STRING_xdigit;
703 
/* Lengths of the POSIX class names, in order; a zero entry ends the list. */

static const uint8_t posix_name_lengths[] = {
  5,   /* alpha */
  5,   /* lower */
  5,   /* upper */
  5,   /* alnum */
  5,   /* ascii */
  5,   /* blank */
  5,   /* cntrl */
  5,   /* digit */
  5,   /* graph */
  5,   /* print */
  5,   /* punct */
  5,   /* space */
  4,   /* word */
  6,   /* xdigit */
  0    /* end of list */
};

/* Indices of the graph, print, and punct classes in the list above. */

#define PC_GRAPH  8
#define PC_PRINT  9
#define PC_PUNCT 10
710 
711 /* Table of class bit maps for each POSIX class. Each class is formed from a
712 base map, with an optional addition or removal of another map. Then, for some
713 classes, there is some additional tweaking: for [:blank:] the vertical space
714 characters are removed, and for [:alpha:] and [:alnum:] the underscore
715 character is removed. The triples in the table consist of the base map offset,
716 second map offset or -1 if no second map, and a non-negative value for map
717 addition or a negative value for map subtraction (if there are two maps). The
718 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
719 remove vertical space characters, 2 => remove underscore. */
720 
721 static const int posix_class_maps[] = {
722   cbit_word,  cbit_digit, -2,             /* alpha */
723   cbit_lower, -1,          0,             /* lower */
724   cbit_upper, -1,          0,             /* upper */
725   cbit_word,  -1,          2,             /* alnum - word without underscore */
726   cbit_print, cbit_cntrl,  0,             /* ascii */
727   cbit_space, -1,          1,             /* blank - a GNU extension */
728   cbit_cntrl, -1,          0,             /* cntrl */
729   cbit_digit, -1,          0,             /* digit */
730   cbit_graph, -1,          0,             /* graph */
731   cbit_print, -1,          0,             /* print */
732   cbit_punct, -1,          0,             /* punct */
733   cbit_space, -1,          0,             /* space */
734   cbit_word,  -1,          0,             /* word - a Perl extension */
735   cbit_xdigit,-1,          0              /* xdigit */
736 };
737 
#ifdef SUPPORT_UNICODE

/* The POSIX class Unicode property substitutes that are used in UCP mode. Each
class has a pair of values, and the pairs must be in the order of the POSIX
class names. A first value of -1 marks a class that is not replaced by a
property test: ascii and xdigit are treated as non-UCP, and blank is treated
as \h. */

static int posix_substitutes[] = {
  PT_GC, ucp_L,     /* alpha */
  PT_PC, ucp_Ll,    /* lower */
  PT_PC, ucp_Lu,    /* upper */
  PT_ALNUM, 0,      /* alnum */
  -1, 0,            /* ascii, treat as non-UCP */
  -1, 1,            /* blank, treat as \h */
  PT_PC, ucp_Cc,    /* cntrl */
  PT_PC, ucp_Nd,    /* digit */
  PT_PXGRAPH, 0,    /* graph */
  PT_PXPRINT, 0,    /* print */
  PT_PXPUNCT, 0,    /* punct */
  /* Xps is POSIX space, but from 8.34 Perl and POSIX space are the same */
  PT_PXSPACE, 0,    /* space */
  PT_WORD, 0,       /* word  */
  -1, 0             /* xdigit, treat as non-UCP */
};

/* Number of pairs in the table. The divisor is taken from the array's own
element type so that the count stays right whatever that type is. */

#define POSIX_SUBSIZE \
  (sizeof(posix_substitutes) / (2*sizeof(posix_substitutes[0])))
#endif  /* SUPPORT_UNICODE */
761 
/* Masks for checking option settings. The LITERAL masks list the only options
that are allowed when PCRE2_LITERAL is set; the full masks add the options
that are allowed otherwise. */

#define PUBLIC_LITERAL_COMPILE_OPTIONS \
  (PCRE2_ANCHORED| \
   PCRE2_AUTO_CALLOUT| \
   PCRE2_CASELESS| \
   PCRE2_ENDANCHORED| \
   PCRE2_FIRSTLINE| \
   PCRE2_LITERAL| \
   PCRE2_MATCH_INVALID_UTF| \
   PCRE2_NO_START_OPTIMIZE| \
   PCRE2_NO_UTF_CHECK| \
   PCRE2_USE_OFFSET_LIMIT| \
   PCRE2_UTF)

#define PUBLIC_COMPILE_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_OPTIONS| \
   PCRE2_ALLOW_EMPTY_CLASS| \
   PCRE2_ALT_BSUX| \
   PCRE2_ALT_CIRCUMFLEX| \
   PCRE2_ALT_VERBNAMES| \
   PCRE2_DOLLAR_ENDONLY| \
   PCRE2_DOTALL| \
   PCRE2_DUPNAMES| \
   PCRE2_EXTENDED| \
   PCRE2_EXTENDED_MORE| \
   PCRE2_MATCH_UNSET_BACKREF| \
   PCRE2_MULTILINE| \
   PCRE2_NEVER_BACKSLASH_C| \
   PCRE2_NEVER_UCP| \
   PCRE2_NEVER_UTF| \
   PCRE2_NO_AUTO_CAPTURE| \
   PCRE2_NO_AUTO_POSSESS| \
   PCRE2_NO_DOTSTAR_ANCHOR| \
   PCRE2_UCP| \
   PCRE2_UNGREEDY)

/* The corresponding masks for the extra options. */

#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
  (PCRE2_EXTRA_MATCH_LINE| \
   PCRE2_EXTRA_MATCH_WORD)

#define PUBLIC_COMPILE_EXTRA_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \
   PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES| \
   PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \
   PCRE2_EXTRA_ESCAPED_CR_IS_LF| \
   PCRE2_EXTRA_ALT_BSUX| \
   PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK)
787 
788 /* Compile time error code numbers. They are given names so that they can more
789 easily be tracked. When a new number is added, the tables called eint1 and
790 eint2 in pcre2posix.c may need to be updated, and a new error text must be
791 added to compile_error_texts in pcre2_error.c. Also, the error codes in
792 pcre2.h.in must be updated - their values are exactly 100 greater than these
793 values. */
794 
795 enum { ERR0 = COMPILE_ERROR_BASE,
796        ERR1,  ERR2,  ERR3,  ERR4,  ERR5,  ERR6,  ERR7,  ERR8,  ERR9,  ERR10,
797        ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
798        ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30,
799        ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
800        ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
801        ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
802        ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
803        ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
804        ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90,
805        ERR91, ERR92, ERR93, ERR94, ERR95, ERR96, ERR97, ERR98, ERR99 };
806 
807 /* This is a table of start-of-pattern options such as (*UTF) and settings such
808 as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
809 compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
810 generic and always supported. */
811 
812 enum { PSO_OPT,     /* Value is an option bit */
813        PSO_FLG,     /* Value is a flag bit */
814        PSO_NL,      /* Value is a newline type */
815        PSO_BSR,     /* Value is a \R type */
816        PSO_LIMH,    /* Read integer value for heap limit */
817        PSO_LIMM,    /* Read integer value for match limit */
818        PSO_LIMD };  /* Read integer value for depth limit */
819 
820 typedef struct pso {
821   const uint8_t *name;
822   uint16_t length;
823   uint16_t type;
824   uint32_t value;
825 } pso;
826 
827 /* NB: STRING_UTFn_RIGHTPAR contains the length as well */
828 
829 static pso pso_list[] = {
830   { (uint8_t *)STRING_UTFn_RIGHTPAR,                  PSO_OPT, PCRE2_UTF },
831   { (uint8_t *)STRING_UTF_RIGHTPAR,                4, PSO_OPT, PCRE2_UTF },
832   { (uint8_t *)STRING_UCP_RIGHTPAR,                4, PSO_OPT, PCRE2_UCP },
833   { (uint8_t *)STRING_NOTEMPTY_RIGHTPAR,           9, PSO_FLG, PCRE2_NOTEMPTY_SET },
834   { (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR,  17, PSO_FLG, PCRE2_NE_ATST_SET },
835   { (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR,   16, PSO_OPT, PCRE2_NO_AUTO_POSSESS },
836   { (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR },
837   { (uint8_t *)STRING_NO_JIT_RIGHTPAR,             7, PSO_FLG, PCRE2_NOJIT },
838   { (uint8_t *)STRING_NO_START_OPT_RIGHTPAR,      13, PSO_OPT, PCRE2_NO_START_OPTIMIZE },
839   { (uint8_t *)STRING_LIMIT_HEAP_EQ,              11, PSO_LIMH, 0 },
840   { (uint8_t *)STRING_LIMIT_MATCH_EQ,             12, PSO_LIMM, 0 },
841   { (uint8_t *)STRING_LIMIT_DEPTH_EQ,             12, PSO_LIMD, 0 },
842   { (uint8_t *)STRING_LIMIT_RECURSION_EQ,         16, PSO_LIMD, 0 },
843   { (uint8_t *)STRING_CR_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_CR },
844   { (uint8_t *)STRING_LF_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_LF },
845   { (uint8_t *)STRING_CRLF_RIGHTPAR,               5, PSO_NL,  PCRE2_NEWLINE_CRLF },
846   { (uint8_t *)STRING_ANY_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_ANY },
847   { (uint8_t *)STRING_NUL_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_NUL },
848   { (uint8_t *)STRING_ANYCRLF_RIGHTPAR,            8, PSO_NL,  PCRE2_NEWLINE_ANYCRLF },
849   { (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_ANYCRLF },
850   { (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_UNICODE }
851 };
852 
853 /* This table is used when converting repeating opcodes into possessified
854 versions as a result of an explicit possessive quantifier such as ++. A zero
855 value means there is no possessified version - in those cases the item in
856 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
857 because all relevant opcodes are less than that. */
858 
859 static const uint8_t opcode_possessify[] = {
860   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
861   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
862 
863   0,                       /* NOTI */
864   OP_POSSTAR, 0,           /* STAR, MINSTAR */
865   OP_POSPLUS, 0,           /* PLUS, MINPLUS */
866   OP_POSQUERY, 0,          /* QUERY, MINQUERY */
867   OP_POSUPTO, 0,           /* UPTO, MINUPTO */
868   0,                       /* EXACT */
869   0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
870 
871   OP_POSSTARI, 0,          /* STARI, MINSTARI */
872   OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
873   OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
874   OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
875   0,                       /* EXACTI */
876   0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
877 
878   OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
879   OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
880   OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
881   OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
882   0,                       /* NOTEXACT */
883   0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
884 
885   OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
886   OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
887   OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
888   OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
889   0,                       /* NOTEXACTI */
890   0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
891 
892   OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
893   OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
894   OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
895   OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
896   0,                       /* TYPEEXACT */
897   0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
898 
899   OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
900   OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
901   OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
902   OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
903   0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
904 
905   0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
906   0, 0,                    /* REF, REFI */
907   0, 0,                    /* DNREF, DNREFI */
908   0, 0                     /* RECURSE, CALLOUT */
909 };
910 
911 
#ifdef DEBUG_SHOW_PARSED
/*************************************************
*     Show the parsed pattern for debugging      *
*************************************************/

/* For debugging the pre-scan, this code, which outputs the parsed data vector,
can be enabled. Each item is written to stderr on its own line, preceded by its
index in the vector and its raw value. Output stops at META_END, or at the
first META value that is not recognized.

The parsed items are uint32_t values and the offsets are PCRE2_SIZE values, so
unsigned conversions (%u, %zu) are used for them, and a value printed as a
character is cast to int, as %c requires.

Argument:   cb   the compile block whose parsed_pattern vector is to be shown
Returns:    nothing
*/

static void show_parsed(compile_block *cb)
{
uint32_t *pptr = cb->parsed_pattern;

for (;;)
  {
  int max, min;
  PCRE2_SIZE offset;
  uint32_t i;
  uint32_t length;
  uint32_t meta_arg = META_DATA(*pptr);

  fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr);

  /* Values below META_END are literal characters; only the printable ASCII
  ones are shown as characters. */

  if (*pptr < META_END)
    {
    if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", (int)*pptr);
    pptr++;
    }

  else switch (META_CODE(*pptr++))
    {
    default:
    fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n");
    return;

    case META_END:
    fprintf(stderr, "META_END\n");
    return;

    case META_CAPTURE:
    fprintf(stderr, "META_CAPTURE %u", meta_arg);
    break;

    case META_RECURSE:
    GETOFFSET(offset, pptr);
    fprintf(stderr, "META_RECURSE %u %zu", meta_arg, offset);
    break;

    /* For group numbers below 10 the offset is not in the vector; it is taken
    from the compile block instead. */

    case META_BACKREF:
    if (meta_arg < 10)
      offset = cb->small_ref_offset[meta_arg];
    else
      GETOFFSET(offset, pptr);
    fprintf(stderr, "META_BACKREF %u %zu", meta_arg, offset);
    break;

    case META_ESCAPE:
    if (meta_arg == ESC_P || meta_arg == ESC_p)
      {
      uint32_t ptype = *pptr >> 16;
      uint32_t pvalue = *pptr++ & 0xffff;
      fprintf(stderr, "META \\%c %u %u", (meta_arg == ESC_P)? 'P':'p',
        ptype, pvalue);
      }
    else
      {
      uint32_t cc;
      /* There's just one escape we might have here that isn't negated in the
      escapes table. For the others, search the table for the letter; if it
      is not found, a question mark is shown. */
      if (meta_arg == ESC_g) cc = CHAR_g;
      else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++)
        {
        if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break;
        }
      if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;
      fprintf(stderr, "META \\%c", (int)cc);
      }
    break;

    case META_MINMAX:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}", min, max);
    else
      fprintf(stderr, "META {%d,}", min);
    break;

    case META_MINMAX_QUERY:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}?", min, max);
    else
      fprintf(stderr, "META {%d,}?", min);
    break;

    case META_MINMAX_PLUS:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}+", min, max);
    else
      fprintf(stderr, "META {%d,}+", min);
    break;

    case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break;
    case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break;
    case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break;
    case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break;
    case META_DOT: fprintf(stderr, "META_DOT"); break;
    case META_ASTERISK: fprintf(stderr, "META *"); break;
    case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break;
    case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break;
    case META_PLUS: fprintf(stderr, "META +"); break;
    case META_PLUS_QUERY: fprintf(stderr, "META +?"); break;
    case META_PLUS_PLUS: fprintf(stderr, "META ++"); break;
    case META_QUERY: fprintf(stderr, "META ?"); break;
    case META_QUERY_QUERY: fprintf(stderr, "META ??"); break;
    case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break;

    case META_ATOMIC: fprintf(stderr, "META (?>"); break;
    case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
    case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
    case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
    case META_LOOKAHEAD_NA: fprintf(stderr, "META (*napla:"); break;
    case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;
    case META_KET: fprintf(stderr, "META )"); break;
    case META_ALT: fprintf(stderr, "META | %u", meta_arg); break;

    case META_CLASS: fprintf(stderr, "META ["); break;
    case META_CLASS_NOT: fprintf(stderr, "META [^"); break;
    case META_CLASS_END: fprintf(stderr, "META ]"); break;
    case META_CLASS_EMPTY: fprintf(stderr, "META []"); break;
    case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break;

    case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;
    case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;

    case META_POSIX: fprintf(stderr, "META_POSIX %u", *pptr++); break;
    case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %u", *pptr++); break;

    case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;
    case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;
    case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break;
    case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break;
    case META_SKIP: fprintf(stderr, "META (*SKIP)"); break;
    case META_THEN: fprintf(stderr, "META (*THEN)"); break;

    case META_OPTIONS: fprintf(stderr, "META_OPTIONS 0x%02x", *pptr++); break;

    case META_LOOKBEHIND:
    fprintf(stderr, "META (?<= %u offset=", meta_arg);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_LOOKBEHIND_NA:
    fprintf(stderr, "META (*naplb: %u offset=", meta_arg);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_LOOKBEHINDNOT:
    fprintf(stderr, "META (?<! %u offset=", meta_arg);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_CALLOUT_NUMBER:
    fprintf(stderr, "META (?C%u) next=%u/%u", pptr[2], pptr[0],
       pptr[1]);
    pptr += 3;
    break;

    case META_CALLOUT_STRING:
      {
      uint32_t patoffset = *pptr++;    /* Offset of next pattern item */
      uint32_t patlength = *pptr++;    /* Length of next pattern item */
      fprintf(stderr, "META (?Cstring) length=%u offset=", *pptr++);
      GETOFFSET(offset, pptr);
      fprintf(stderr, "%zu next=%u/%u", offset, patoffset, patlength);
      }
    break;

    case META_RECURSE_BYNAME:
    fprintf(stderr, "META (?(&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_BACKREF_BYNAME:
    fprintf(stderr, "META_BACKREF_BYNAME length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* The group number follows the offset in the vector, so it is picked up
    before the offset is read, and stepped over afterwards. */

    case META_COND_NUMBER:
    fprintf(stderr, "META_COND_NUMBER %u offset=", pptr[SIZEOFFSET]);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    pptr++;
    break;

    case META_COND_DEFINE:
    fprintf(stderr, "META (?(DEFINE) offset=");
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_COND_VERSION:
    fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
    fprintf(stderr, "%u.", *pptr++);
    fprintf(stderr, "%u)", *pptr++);
    break;

    case META_COND_NAME:
    fprintf(stderr, "META (?(<name>) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_COND_RNAME:
    fprintf(stderr, "META (?(R&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* This item is held in name form (a length and an offset), because it
    might actually be a name. */

    case META_COND_RNUMBER:
    fprintf(stderr, "META (?(Rnumber) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* The verbs with arguments share the code that shows the argument: a
    length followed by that many characters. */

    case META_MARK:
    fprintf(stderr, "META (*MARK:");
    goto SHOWARG;

    case META_COMMIT_ARG:
    fprintf(stderr, "META (*COMMIT:");
    goto SHOWARG;

    case META_PRUNE_ARG:
    fprintf(stderr, "META (*PRUNE:");
    goto SHOWARG;

    case META_SKIP_ARG:
    fprintf(stderr, "META (*SKIP:");
    goto SHOWARG;

    case META_THEN_ARG:
    fprintf(stderr, "META (*THEN:");
    SHOWARG:
    length = *pptr++;
    for (i = 0; i < length; i++)
      {
      uint32_t cc = *pptr++;
      if (cc > 32 && cc < 128) fprintf(stderr, "%c", (int)cc);
        else fprintf(stderr, "\\x{%x}", cc);
      }
    fprintf(stderr, ") length=%u", length);
    break;
    }
  fprintf(stderr, "\n");
  }
}
#endif  /* DEBUG_SHOW_PARSED */
1181 
1182 
1183 
1184 /*************************************************
1185 *               Copy compiled code               *
1186 *************************************************/
1187 
1188 /* Compiled JIT code cannot be copied, so the new compiled block has no
1189 associated JIT data. */
1190 
1191 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy(const pcre2_code * code)1192 pcre2_code_copy(const pcre2_code *code)
1193 {
1194 PCRE2_SIZE* ref_count;
1195 pcre2_code *newcode;
1196 
1197 if (code == NULL) return NULL;
1198 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1199 if (newcode == NULL) return NULL;
1200 memcpy(newcode, code, code->blocksize);
1201 newcode->executable_jit = NULL;
1202 
1203 /* If the code is one that has been deserialized, increment the reference count
1204 in the decoded tables. */
1205 
1206 if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1207   {
1208   ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1209   (*ref_count)++;
1210   }
1211 
1212 return newcode;
1213 }
1214 
1215 
1216 
1217 /*************************************************
1218 *     Copy compiled code and character tables    *
1219 *************************************************/
1220 
1221 /* Compiled JIT code cannot be copied, so the new compiled block has no
1222 associated JIT data. This version of code_copy also makes a separate copy of
1223 the character tables. */
1224 
1225 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy_with_tables(const pcre2_code * code)1226 pcre2_code_copy_with_tables(const pcre2_code *code)
1227 {
1228 PCRE2_SIZE* ref_count;
1229 pcre2_code *newcode;
1230 uint8_t *newtables;
1231 
1232 if (code == NULL) return NULL;
1233 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1234 if (newcode == NULL) return NULL;
1235 memcpy(newcode, code, code->blocksize);
1236 newcode->executable_jit = NULL;
1237 
1238 newtables = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE),
1239   code->memctl.memory_data);
1240 if (newtables == NULL)
1241   {
1242   code->memctl.free((void *)newcode, code->memctl.memory_data);
1243   return NULL;
1244   }
1245 memcpy(newtables, code->tables, TABLES_LENGTH);
1246 ref_count = (PCRE2_SIZE *)(newtables + TABLES_LENGTH);
1247 *ref_count = 1;
1248 
1249 newcode->tables = newtables;
1250 newcode->flags |= PCRE2_DEREF_TABLES;
1251 return newcode;
1252 }
1253 
1254 
1255 
1256 /*************************************************
1257 *               Free compiled code               *
1258 *************************************************/
1259 
1260 PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_code_free(pcre2_code * code)1261 pcre2_code_free(pcre2_code *code)
1262 {
1263 PCRE2_SIZE* ref_count;
1264 
1265 if (code != NULL)
1266   {
1267   if (code->executable_jit != NULL)
1268     PRIV(jit_free)(code->executable_jit, &code->memctl);
1269 
1270   if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1271     {
1272     /* Decoded tables belong to the codes after deserialization, and they must
1273     be freed when there are no more references to them. The *ref_count should
1274     always be > 0. */
1275 
1276     ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1277     if (*ref_count > 0)
1278       {
1279       (*ref_count)--;
1280       if (*ref_count == 0)
1281         code->memctl.free((void *)code->tables, code->memctl.memory_data);
1282       }
1283     }
1284 
1285   code->memctl.free(code, code->memctl.memory_data);
1286   }
1287 }
1288 
1289 
1290 
1291 /*************************************************
1292 *         Read a number, possibly signed         *
1293 *************************************************/
1294 
1295 /* This function is used to read numbers in the pattern. The initial pointer
1296 must be the sign or first digit of the number. When relative values (introduced
1297 by + or -) are allowed, they are relative group numbers, and the result must be
1298 greater than zero.
1299 
1300 Arguments:
1301   ptrptr      points to the character pointer variable
1302   ptrend      points to the end of the input string
1303   allow_sign  if < 0, sign not allowed; if >= 0, sign is relative to this
1304   max_value   the largest number allowed
1305   max_error   the error to give for an over-large number
1306   intptr      where to put the result
1307   errcodeptr  where to put an error code
1308 
1309 Returns:      TRUE  - a number was read
1310               FALSE - errorcode == 0 => no number was found
1311                       errorcode != 0 => an error occurred
1312 */
1313 
1314 static BOOL
read_number(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,int32_t allow_sign,uint32_t max_value,uint32_t max_error,int * intptr,int * errorcodeptr)1315 read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,
1316   uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr)
1317 {
1318 int sign = 0;
1319 uint32_t n = 0;
1320 PCRE2_SPTR ptr = *ptrptr;
1321 BOOL yield = FALSE;
1322 
1323 *errorcodeptr = 0;
1324 
1325 if (allow_sign >= 0 && ptr < ptrend)
1326   {
1327   if (*ptr == CHAR_PLUS)
1328     {
1329     sign = +1;
1330     max_value -= allow_sign;
1331     ptr++;
1332     }
1333   else if (*ptr == CHAR_MINUS)
1334     {
1335     sign = -1;
1336     ptr++;
1337     }
1338   }
1339 
1340 if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE;
1341 while (ptr < ptrend && IS_DIGIT(*ptr))
1342   {
1343   n = n * 10 + *ptr++ - CHAR_0;
1344   if (n > max_value)
1345     {
1346     *errorcodeptr = max_error;
1347     goto EXIT;
1348     }
1349   }
1350 
1351 if (allow_sign >= 0 && sign != 0)
1352   {
1353   if (n == 0)
1354     {
1355     *errorcodeptr = ERR26;  /* +0 and -0 are not allowed */
1356     goto EXIT;
1357     }
1358 
1359   if (sign > 0) n += allow_sign;
1360   else if ((int)n > allow_sign)
1361     {
1362     *errorcodeptr = ERR15;  /* Non-existent subpattern */
1363     goto EXIT;
1364     }
1365   else n = allow_sign + 1 - n;
1366   }
1367 
1368 yield = TRUE;
1369 
1370 EXIT:
1371 *intptr = n;
1372 *ptrptr = ptr;
1373 return yield;
1374 }
1375 
1376 
1377 
1378 /*************************************************
1379 *         Read repeat counts                     *
1380 *************************************************/
1381 
1382 /* Read an item of the form {n,m} and return the values if non-NULL pointers
1383 are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a
1384 larger value is used for "unlimited". We have to use signed arguments for
1385 read_number() because it is capable of returning a signed value.
1386 
1387 Arguments:
1388   ptrptr         points to pointer to character after'{'
1389   ptrend         pointer to end of input
1390   minp           if not NULL, pointer to int for min
1391   maxp           if not NULL, pointer to int for max (-1 if no max)
1392                  returned as -1 if no max
1393   errorcodeptr   points to error code variable
1394 
1395 Returns:         FALSE if not a repeat quantifier, errorcode set zero
1396                  FALSE on error, with errorcode set non-zero
1397                  TRUE on success, with pointer updated to point after '}'
1398 */
1399 
1400 static BOOL
read_repeat_counts(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,uint32_t * minp,uint32_t * maxp,int * errorcodeptr)1401 read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp,
1402   uint32_t *maxp, int *errorcodeptr)
1403 {
1404 PCRE2_SPTR p;
1405 BOOL yield = FALSE;
1406 BOOL had_comma = FALSE;
1407 int32_t min = 0;
1408 int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */
1409 
1410 /* Check the syntax */
1411 
1412 *errorcodeptr = 0;
1413 for (p = *ptrptr;; p++)
1414   {
1415   uint32_t c;
1416   if (p >= ptrend) return FALSE;
1417   c = *p;
1418   if (IS_DIGIT(c)) continue;
1419   if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1420   if (c == CHAR_COMMA)
1421     {
1422     if (had_comma) return FALSE;
1423     had_comma = TRUE;
1424     }
1425   else return FALSE;
1426   }
1427 
1428 /* The only error from read_number() is for a number that is too big. */
1429 
1430 p = *ptrptr;
1431 if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))
1432   goto EXIT;
1433 
1434 if (*p == CHAR_RIGHT_CURLY_BRACKET)
1435   {
1436   p++;
1437   max = min;
1438   }
1439 else
1440   {
1441   if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1442     {
1443     if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max,
1444         errorcodeptr))
1445       goto EXIT;
1446     if (max < min)
1447       {
1448       *errorcodeptr = ERR4;
1449       goto EXIT;
1450       }
1451     }
1452   p++;
1453   }
1454 
1455 yield = TRUE;
1456 if (minp != NULL) *minp = (uint32_t)min;
1457 if (maxp != NULL) *maxp = (uint32_t)max;
1458 
1459 /* Update the pattern pointer */
1460 
1461 EXIT:
1462 *ptrptr = p;
1463 return yield;
1464 }
1465 
1466 
1467 
1468 /*************************************************
1469 *            Handle escapes                      *
1470 *************************************************/
1471 
1472 /* This function is called when a \ has been encountered. It either returns a
1473 positive value for a simple escape such as \d, or 0 for a data character, which
1474 is placed in chptr. A backreference to group n is returned as negative n. On
1475 entry, ptr is pointing at the character after \. On exit, it points after the
1476 final code unit of the escape sequence.
1477 
1478 This function is also called from pcre2_substitute() to handle escape sequences
1479 in replacement strings. In this case, the cb argument is NULL, and in the case
1480 of escapes that have further processing, only sequences that define a data
1481 character are recognised. The isclass argument is not relevant; the options
1482 argument is the final value of the compiled pattern's options.
1483 
1484 Arguments:
1485   ptrptr         points to the input position pointer
1486   ptrend         points to the end of the input
1487   chptr          points to a returned data character
1488   errorcodeptr   points to the errorcode variable (containing zero)
  options        the current options bits
  extra_options  the current extra options bits
1490   isclass        TRUE if inside a character class
1491   cb             compile data block or NULL when called from pcre2_substitute()
1492 
1493 Returns:         zero => a data character
1494                  positive => a special escape sequence
1495                  negative => a numerical back reference
1496                  on error, errorcodeptr is set non-zero
1497 */
1498 
1499 int
PRIV(check_escape)1500 PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
1501   int *errorcodeptr, uint32_t options, uint32_t extra_options, BOOL isclass,
1502   compile_block *cb)
1503 {
1504 BOOL utf = (options & PCRE2_UTF) != 0;
1505 PCRE2_SPTR ptr = *ptrptr;
1506 uint32_t c, cc;
1507 int escape = 0;
1508 int i;
     /* i receives the escapes[] table entry in the "else if" below. The final
     "else" branch (the switch) is entered from the top only when that entry is
     zero, so i is 0 there; the octal digit loop under "case CHAR_0" relies on
     this as its starting count. */
1509 
1510 /* If backslash is at the end of the string, it's an error. */
1511 
1512 if (ptr >= ptrend)
1513   {
1514   *errorcodeptr = ERR1;
1515   return 0;
1516   }
1517 
1518 GETCHARINCTEST(c, ptr);         /* Get character value, increment pointer */
1519 *errorcodeptr = 0;              /* Be optimistic */
1520 
1521 /* Non-alphanumerics are literals, so we just leave the value in c. An initial
1522 value test saves a memory lookup for code points outside the alphanumeric
1523 range. */
1524 
1525 if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {}  /* Definitely literal */
1526 
1527 /* Otherwise, do a table lookup. Non-zero values need little processing here. A
1528 positive value is a literal value for something like \n. A negative value is
1529 the negation of one of the ESC_ macros that is passed back for handling by the
1530 calling function. Some extra checking is needed for \N because only \N{U+dddd}
1531 is supported. If the value is zero, further processing is handled below. */
1532 
1533 else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
1534   {
1535   if (i > 0)
1536     {
1537     c = (uint32_t)i;
1538     if (c == CHAR_CR && (extra_options & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
1539       c = CHAR_LF;
1540     }
1541   else  /* Negative table entry */
1542     {
1543     escape = -i;                    /* Else return a special escape */
1544     if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))
1545       cb->external_flags |= PCRE2_HASBKPORX;   /* Note \P, \p, or \X */
1546 
1547     /* Perl supports \N{name} for character names and \N{U+dddd} for numerical
1548     Unicode code points, as well as plain \N for "not newline". PCRE does not
1549     support \N{name}. However, it does support quantification such as \N{2,3},
1550     so if \N{ is not followed by U+dddd we check for a quantifier. */
1551 
1552     if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1553       {
1554       PCRE2_SPTR p = ptr + 1;
1555 
1556       /* \N{U+ can be handled by the \x{ code. However, this construction is
1557       not valid in EBCDIC environments because it specifies a Unicode
1558       character, not a codepoint in the local code. For example \N{U+0041}
1559       must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode
1560       casing semantics for the entire pattern, so allow it only in UTF (i.e.
1561       Unicode) mode. */
1562 
1563       if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)
1564         {
1565 #ifdef EBCDIC
1566         *errorcodeptr = ERR93;
1567 #else
1568         if (utf)
1569           {
               /* p points at 'U', so ptr is left on the '+'. The ++ptr at the
               COME_FROM_NU label (inside the \x case of the switch below) then
               steps over it, exactly as it steps over '{' for \x{. */
1570           ptr = p + 1;
1571           escape = 0;   /* Not a fancy escape after all */
1572           goto COME_FROM_NU;
1573           }
1574         else *errorcodeptr = ERR93;
1575 #endif
1576         }
1577 
1578       /* Give an error if what follows is not a quantifier, but don't override
1579       an error set by the quantifier reader (e.g. number overflow). */
1580 
1581       else
1582         {
1583         if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&
1584              *errorcodeptr == 0)
1585           *errorcodeptr = ERR37;
1586         }
1587       }
1588     }
1589   }
1590 
1591 /* Escapes that need further processing, including those that are unknown, have
1592 a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
1593 \o, and \x are recognized (\u and \U can never appear as they are used for case
1594 forcing). */
1595 
1596 else
1597   {
1598   int s;
1599   PCRE2_SPTR oldptr;
1600   BOOL overflow;
1601   BOOL alt_bsux =
1602     ((options & PCRE2_ALT_BSUX) | (extra_options & PCRE2_EXTRA_ALT_BSUX)) != 0;
1603 
1604   /* Filter calls from pcre2_substitute(). */
       /* Because only c, o and x get past this filter when cb is NULL, the \g
       and digit cases below, which read cb->bracount, are never reached with a
       NULL cb. */
1605 
1606   if (cb == NULL)
1607     {
1608     if (c != CHAR_c && c != CHAR_o && c != CHAR_x)
1609       {
1610       *errorcodeptr = ERR3;
1611       return 0;
1612       }
1613     alt_bsux = FALSE;   /* Do not modify \x handling */
1614     }
1615 
1616   switch (c)
1617     {
1618     /* A number of Perl escapes are not handled by PCRE. We give an explicit
1619     error. */
1620 
1621     case CHAR_F:
1622     case CHAR_l:
1623     case CHAR_L:
1624     *errorcodeptr = ERR37;
1625     break;
1626 
1627     /* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX
1628     is set. Otherwise, \u must be followed by exactly four hex digits or, if
1629     PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.
1630     Otherwise it is a lowercase u letter. This gives some compatibility with
1631     ECMAScript (aka JavaScript). */
1632 
1633     case CHAR_u:
1634     if (!alt_bsux) *errorcodeptr = ERR37; else
1635       {
1636       uint32_t xc;
1637 
1638       if (ptr >= ptrend) break;
1639       if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
1640           (extra_options & PCRE2_EXTRA_ALT_BSUX) != 0)
1641         {
1642         PCRE2_SPTR hptr = ptr + 1;
1643         cc = 0;
1644 
1645         while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
1646           {
1647           if ((cc & 0xf0000000) != 0)  /* Test for 32-bit overflow */
1648             {
1649             *errorcodeptr = ERR77;
1650             ptr = hptr;   /* Show where */
1651             break;        /* *hptr != } will cause another break below */
1652             }
1653           cc = (cc << 4) | xc;
1654           hptr++;
1655           }
1656 
1657         if (hptr == ptr + 1 ||   /* No hex digits */
1658             hptr >= ptrend ||    /* Hit end of input */
1659             *hptr != CHAR_RIGHT_CURLY_BRACKET)  /* No } terminator */
1660           break;         /* Hex escape not recognized */
1661 
1662         c = cc;          /* Accept the code point */
1663         ptr = hptr + 1;
1664         }
1665 
1666       else  /* Must be exactly 4 hex digits */
1667         {
1668         if (ptrend - ptr < 4) break;               /* Less than 4 chars */
1669         if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
1670         if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
1671         cc = (cc << 4) | xc;
1672         if ((xc = XDIGIT(ptr[2])) == 0xff) break;  /* Not a hex digit */
1673         cc = (cc << 4) | xc;
1674         if ((xc = XDIGIT(ptr[3])) == 0xff) break;  /* Not a hex digit */
1675         c = (cc << 4) | xc;
1676         ptr += 4;
1677         }
1678 
           /* Range-check the accepted code point: UTF mode limits it to
           0x10ffff and rejects surrogates unless the extra option allows them;
           otherwise the limit is MAX_NON_UTF_CHAR. */
1679       if (utf)
1680         {
1681         if (c > 0x10ffffU) *errorcodeptr = ERR77;
1682         else
1683           if (c >= 0xd800 && c <= 0xdfff &&
1684               (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1685                 *errorcodeptr = ERR73;
1686         }
1687       else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
1688       }
1689     break;
1690 
1691     /* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set,
1692     in which case it is an upper case letter. */
1693 
1694     case CHAR_U:
1695     if (!alt_bsux) *errorcodeptr = ERR37;
1696     break;
1697 
1698     /* In a character class, \g is just a literal "g". Outside a character
1699     class, \g must be followed by one of a number of specific things:
1700 
1701     (1) A number, either plain or braced. If positive, it is an absolute
1702     backreference. If negative, it is a relative backreference. This is a Perl
1703     5.10 feature.
1704 
1705     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1706     is part of Perl's movement towards a unified syntax for back references. As
1707     this is synonymous with \k{name}, we fudge it up by pretending it really
1708     was \k{name}.
1709 
1710     (3) For Oniguruma compatibility we also support \g followed by a name or a
1711     number either in angle brackets or in single quotes. However, these are
1712     (possibly recursive) subroutine calls, _not_ backreferences. We return
1713     the ESC_g code.
1714 
1715     Summary: Return a negative number for a numerical back reference, ESC_k for
1716     a named back reference, and ESC_g for a named or numbered subroutine call.
1717     */
1718 
1719     case CHAR_g:
1720     if (isclass) break;
1721 
1722     if (ptr >= ptrend)
1723       {
1724       *errorcodeptr = ERR57;
1725       break;
1726       }
1727 
1728     if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE)
1729       {
1730       escape = ESC_g;
1731       break;
1732       }
1733 
1734     /* If there is a brace delimiter, try to read a numerical reference. If
1735     there isn't one, assume we have a name and treat it as \k. */
1736 
1737     if (*ptr == CHAR_LEFT_CURLY_BRACKET)
1738       {
1739       PCRE2_SPTR p = ptr + 1;
1740       if (!read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1741           errorcodeptr))
1742         {
1743         if (*errorcodeptr == 0) escape = ESC_k;  /* No number found */
1744         break;
1745         }
1746       if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1747         {
1748         *errorcodeptr = ERR57;
1749         break;
1750         }
1751       ptr = p + 1;
1752       }
1753 
1754     /* Read an undelimited number */
1755 
1756     else
1757       {
1758       if (!read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1759           errorcodeptr))
1760         {
1761         if (*errorcodeptr == 0) *errorcodeptr = ERR57;  /* No number found */
1762         break;
1763         }
1764       }
1765 
1766     if (s <= 0)
1767       {
1768       *errorcodeptr = ERR15;
1769       break;
1770       }
1771 
1772     escape = -s;
1773     break;
1774 
1775     /* The handling of escape sequences consisting of a string of digits
1776     starting with one that is not zero is not straightforward. Perl has changed
1777     over the years. Nowadays \g{} for backreferences and \o{} for octal are
1778     recommended to avoid the ambiguities in the old syntax.
1779 
1780     Outside a character class, the digits are read as a decimal number. If the
1781     number is less than 10, or if there are that many previous extracting left
1782     brackets, it is a back reference. Otherwise, up to three octal digits are
1783     read to form an escaped character code. Thus \123 is likely to be octal 123
1784     (cf \0123, which is octal 012 followed by the literal 3).
1785 
1786     Inside a character class, \ followed by a digit is always either a literal
1787     8 or 9 or an octal number. */
1788 
1789     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1790     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1791 
1792     if (!isclass)
1793       {
1794       oldptr = ptr;
1795       ptr--;   /* Back to the digit */
1796 
1797       /* As we know we are at a digit, the only possible error from
1798       read_number() is a number that is too large to be a group number. In this
1799       case we fall through and handle this as not a group reference. If we have
1800       read a small enough number, check for a back reference.
1801 
1802       \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
1803       are octal escapes if there are not that many previous captures. */
1804 
1805       if (read_number(&ptr, ptrend, -1, INT_MAX/10 - 1, 0, &s, errorcodeptr) &&
1806           (s < 10 || oldptr[-1] >= CHAR_8 || s <= (int)cb->bracount))
1807         {
1808         if (s > (int)MAX_GROUP_NUMBER) *errorcodeptr = ERR61;
1809           else escape = -s;     /* Indicates a back reference */
1810         break;
1811         }
1812 
1813       ptr = oldptr;      /* Put the pointer back and fall through */
1814       }
1815 
1816     /* Handle a digit following \ when the number is not a back reference, or
1817     we are within a character class. If the first digit is 8 or 9, Perl used to
1818     generate a binary zero and then treat the digit as a following literal. At
1819     least by Perl 5.18 this changed so as not to insert the binary zero. */
1820 
1821     if (c >= CHAR_8) break;
1822 
1823     /* Fall through */
1824 
1825     /* \0 always starts an octal number, but we may drop through to here with a
1826     larger first octal digit. The original code used just to take the least
1827     significant 8 bits of octal numbers (I think this is what early Perls used
1828     to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1829     but no more than 3 octal digits. */
1830 
1831     case CHAR_0:
1832     c -= CHAR_0;
         /* i is 0 on arrival here (zero table entry), so this loop consumes at
         most two further octal digits after the one already held in c. */
1833     while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1834         c = c * 8 + *ptr++ - CHAR_0;
1835 #if PCRE2_CODE_UNIT_WIDTH == 8
1836     if (!utf && c > 0xff) *errorcodeptr = ERR51;
1837 #endif
1838     break;
1839 
1840     /* \o is a relatively new Perl feature, supporting a more general way of
1841     specifying character codes in octal. The only supported form is \o{ddd}. */
1842 
1843     case CHAR_o:
1844     if (ptr >= ptrend || *ptr++ != CHAR_LEFT_CURLY_BRACKET)
1845       {
           /* NOTE(review): when ptr >= ptrend the || short-circuits and no
           increment has happened, so this decrement leaves ptr on the 'o'
           rather than on the character after it - confirm that error offset is
           intended. */
1846       ptr--;
1847       *errorcodeptr = ERR55;
1848       }
1849     else if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1850       *errorcodeptr = ERR78;
1851     else
1852       {
1853       c = 0;
1854       overflow = FALSE;
1855       while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1856         {
1857         cc = *ptr++;
1858         if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1859 #if PCRE2_CODE_UNIT_WIDTH == 32
1860         if (c >= 0x20000000l) { overflow = TRUE; break; }
1861 #endif
1862         c = (c << 3) + (cc - CHAR_0);
1863 #if PCRE2_CODE_UNIT_WIDTH == 8
1864         if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1865 #elif PCRE2_CODE_UNIT_WIDTH == 16
1866         if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1867 #elif PCRE2_CODE_UNIT_WIDTH == 32
1868         if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1869 #endif
1870         }
1871       if (overflow)
1872         {
             /* Skip the remaining octal digits before reporting the error. */
1873         while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1874         *errorcodeptr = ERR34;
1875         }
1876       else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1877         {
1878         if (utf && c >= 0xd800 && c <= 0xdfff &&
1879             (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1880           {
1881           ptr--;
1882           *errorcodeptr = ERR73;
1883           }
1884         }
1885       else
1886         {
             /* Undo the increment made by the test above so that ptr is on the
             non-octal character. NOTE(review): if the test failed because
             ptr >= ptrend, no increment occurred and ptr ends up on the last
             digit instead - confirm. */
1887         ptr--;
1888         *errorcodeptr = ERR64;
1889         }
1890       }
1891     break;
1892 
1893     /* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed
1894     by two hexadecimal digits. Otherwise it is a lowercase x letter. */
1895 
1896     case CHAR_x:
1897     if (alt_bsux)
1898       {
1899       uint32_t xc;
1900       if (ptrend - ptr < 2) break;               /* Less than 2 characters */
1901       if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
1902       if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
1903       c = (cc << 4) | xc;
1904       ptr += 2;
1905       }
1906 
1907     /* Handle \x in Perl's style. \x{ddd} is a character code which can be
1908     greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
1909     digits. If not, { used to be treated as a data character. However, Perl
1910     seems to read hex digits up to the first non-such, and ignore the rest, so
1911     that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1912     now gives an error. */
1913 
1914     else
1915       {
1916       if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1917         {
             /* Target of the goto in the \N{U+ handling above (non-EBCDIC
             builds only); everything read below is assigned after this
             point. */
1918 #ifndef EBCDIC
1919         COME_FROM_NU:
1920 #endif
1921         if (++ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1922           {
1923           *errorcodeptr = ERR78;
1924           break;
1925           }
1926         c = 0;
1927         overflow = FALSE;
1928 
1929         while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff)
1930           {
1931           ptr++;
1932           if (c == 0 && cc == 0) continue;   /* Leading zeroes */
1933 #if PCRE2_CODE_UNIT_WIDTH == 32
1934           if (c >= 0x10000000l) { overflow = TRUE; break; }
1935 #endif
1936           c = (c << 4) | cc;
1937           if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
1938             {
1939             overflow = TRUE;
1940             break;
1941             }
1942           }
1943 
1944         if (overflow)
1945           {
               /* Skip the remaining hex digits before reporting the error. */
1946           while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
1947           *errorcodeptr = ERR34;
1948           }
1949         else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1950           {
1951           if (utf && c >= 0xd800 && c <= 0xdfff &&
1952               (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1953             {
1954             ptr--;
1955             *errorcodeptr = ERR73;
1956             }
1957           }
1958 
1959         /* If the sequence of hex digits does not end with '}', give an error.
1960         We used just to recognize this construct and fall through to the normal
1961         \x handling, but nowadays Perl gives an error, which seems much more
1962         sensible, so we do too. */
1963 
1964         else
1965           {
1966           ptr--;
1967           *errorcodeptr = ERR67;
1968           }
1969         }   /* End of \x{} processing */
1970 
1971       /* Read up to two hex digits after \x */
1972 
1973       else
1974         {
1975         c = 0;
1976         if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break;  /* Not a hex digit */
1977         ptr++;
1978         c = cc;
1979         if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break;  /* Not a hex digit */
1980         ptr++;
1981         c = (c << 4) | cc;
1982         }     /* End of \xdd handling */
1983       }       /* End of Perl-style \x handling */
1984     break;
1985 
1986     /* The handling of \c is different in ASCII and EBCDIC environments. In an
1987     ASCII (or Unicode) environment, an error is given if the character
1988     following \c is not a printable ASCII character. Otherwise, the following
1989     character is upper-cased if it is a letter, and after that the 0x40 bit is
1990     flipped. The result is the value of the escape.
1991 
1992     In an EBCDIC environment the handling of \c is compatible with the
1993     specification in the perlebcdic document. The following character must be
1994     a letter or one of small number of special characters. These provide a
1995     means of defining the character values 0-31.
1996 
1997     For testing the EBCDIC handling of \c in an ASCII environment, recognize
1998     the EBCDIC value of 'c' explicitly. */
1999 
2000 #if defined EBCDIC && 'a' != 0x81
2001     case 0x83:
2002 #else
2003     case CHAR_c:
2004 #endif
2005     if (ptr >= ptrend)
2006       {
2007       *errorcodeptr = ERR2;
2008       break;
2009       }
2010     c = *ptr;
2011     if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
2012 
2013     /* Handle \c in an ASCII/Unicode environment. */
2014 
2015 #ifndef EBCDIC    /* ASCII/UTF-8 coding */
2016     if (c < 32 || c > 126)  /* Excludes all non-printable ASCII */
2017       {
2018       *errorcodeptr = ERR68;
2019       break;
2020       }
2021     c ^= 0x40;
2022 
2023     /* Handle \c in an EBCDIC environment. The special case \c? is converted to
2024     255 (0xff) or 95 (0x5f) if other characters suggest we are using the
2025     POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.)
2026     The other valid sequences correspond to a list of specific characters. */
2027 
2028 #else
2029     if (c == CHAR_QUESTION_MARK)
2030       c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
2031     else
2032       {
2033       for (i = 0; i < 32; i++)
2034         {
2035         if (c == ebcdic_escape_c[i]) break;
2036         }
2037       if (i < 32) c = i; else *errorcodeptr = ERR68;
2038       }
2039 #endif  /* EBCDIC */
2040 
2041     ptr++;
2042     break;
2043 
2044     /* Any other alphanumeric following \ is an error. Perl gives an error only
2045     if in warning mode, but PCRE doesn't have a warning mode. */
2046 
2047     default:
2048     *errorcodeptr = ERR3;
2049     *ptrptr = ptr - 1;     /* Point to the character at fault */
2050     return 0;
2051     }
2052   }
2053 
2054 /* Set the pointer to the next character before returning. */
     /* This exit is also taken on the error paths above that "break" out of the
     switch, so *chptr and the return value are still written when
     *errorcodeptr is non-zero. */
2055 
2056 *ptrptr = ptr;
2057 *chptr = c;
2058 return escape;
2059 }
2060 
2061 
2062 
2063 #ifdef SUPPORT_UNICODE
2064 /*************************************************
2065 *               Handle \P and \p                 *
2066 *************************************************/
2067 
2068 /* This function is called after \P or \p has been encountered, provided that
2069 PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
2070 contents of ptrptr are pointing after the P or p. On exit, it is left pointing
2071 after the final code unit of the escape sequence.
2072 
2073 Arguments:
2074   ptrptr         the pattern position pointer
2075   negptr         a boolean that is set TRUE for negation else FALSE
2076   ptypeptr       an unsigned int that is set to the type value
2077   pdataptr       an unsigned int that is set to the detailed property value
2078   errorcodeptr   the error code variable
2079   cb             the compile data
2080 
2081 Returns:         TRUE if the type value was found, or FALSE for an invalid type
2082 */
2083 
2084 static BOOL
get_ucp(PCRE2_SPTR * ptrptr,BOOL * negptr,uint16_t * ptypeptr,uint16_t * pdataptr,int * errorcodeptr,compile_block * cb)2085 get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr,
2086   uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)
2087 {
2088 PCRE2_UCHAR c;
2089 PCRE2_SIZE i, bot, top;
2090 PCRE2_SPTR ptr = *ptrptr;
2091 PCRE2_UCHAR name[32];
2092 
2093 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2094 c = *ptr++;
2095 *negptr = FALSE;
2096 
2097 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
2098 negation. */
2099 
2100 if (c == CHAR_LEFT_CURLY_BRACKET)
2101   {
2102   if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2103   if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
2104     {
2105     *negptr = TRUE;
2106     ptr++;
2107     }
2108   for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
2109     {
2110     if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2111     c = *ptr++;
2112     if (c == CHAR_NUL) goto ERROR_RETURN;
2113     if (c == CHAR_RIGHT_CURLY_BRACKET) break;
2114     name[i] = c;
2115     }
2116   if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
2117   name[i] = 0;
2118   }
2119 
2120 /* Otherwise there is just one following character, which must be an ASCII
2121 letter. */
2122 
2123 else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0)
2124   {
2125   name[0] = c;
2126   name[1] = 0;
2127   }
2128 else goto ERROR_RETURN;
2129 
2130 *ptrptr = ptr;
2131 
2132 /* Search for a recognized property name using binary chop. */
2133 
2134 bot = 0;
2135 top = PRIV(utt_size);
2136 
2137 while (bot < top)
2138   {
2139   int r;
2140   i = (bot + top) >> 1;
2141   r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
2142   if (r == 0)
2143     {
2144     *ptypeptr = PRIV(utt)[i].type;
2145     *pdataptr = PRIV(utt)[i].value;
2146     return TRUE;
2147     }
2148   if (r > 0) bot = i + 1; else top = i;
2149   }
2150 *errorcodeptr = ERR47;   /* Unrecognized name */
2151 return FALSE;
2152 
2153 ERROR_RETURN:            /* Malformed \P or \p */
2154 *errorcodeptr = ERR46;
2155 *ptrptr = ptr;
2156 return FALSE;
2157 }
2158 #endif
2159 
2160 
2161 
2162 /*************************************************
2163 *           Check for POSIX class syntax         *
2164 *************************************************/
2165 
2166 /* This function is called when the sequence "[:" or "[." or "[=" is
2167 encountered in a character class. It checks whether this is followed by a
2168 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2169 reach an unescaped ']' without the special preceding character, return FALSE.
2170 
2171 Originally, this function only recognized a sequence of letters between the
2172 terminators, but it seems that Perl recognizes any sequence of characters,
2173 though of course unknown POSIX names are subsequently rejected. Perl gives an
2174 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2175 didn't consider this to be a POSIX class. Likewise for [:1234:].
2176 
2177 The problem in trying to be exactly like Perl is in the handling of escapes. We
2178 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2179 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2180 below handles the special cases \\ and \], but does not try to do any other
2181 escape processing. This makes it different from Perl for cases such as
2182 [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
2183 not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
2184 when Perl does, I think.
2185 
2186 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2187 It seems that the appearance of a nested POSIX class supersedes an apparent
2188 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2189 a digit. This is handled by returning FALSE if the start of a new group with
2190 the same terminator is encountered, since the next closing sequence must close
2191 the nested group, not the outer one.
2192 
2193 In Perl, unescaped square brackets may also appear as part of class names. For
2194 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2195 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2196 seem right at all. PCRE does not allow closing square brackets in POSIX class
2197 names.
2198 
2199 Arguments:
2200   ptr      pointer to the character after the initial [ (colon, dot, equals)
2201   ptrend   pointer to the end of the pattern
2202   endptr   where to return a pointer to the terminating ':', '.', or '='
2203 
2204 Returns:   TRUE or FALSE
2205 */
2206 
2207 static BOOL
check_posix_syntax(PCRE2_SPTR ptr,PCRE2_SPTR ptrend,PCRE2_SPTR * endptr)2208 check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)
2209 {
2210 PCRE2_UCHAR terminator;  /* Don't combine these lines; the Solaris cc */
2211 terminator = *ptr++;     /* compiler warns about "non-constant" initializer. */
2212 
2213 for (; ptrend - ptr >= 2; ptr++)
2214   {
2215   if (*ptr == CHAR_BACKSLASH &&
2216       (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
2217     ptr++;
2218 
2219   else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
2220             *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2221 
2222   else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2223     {
2224     *endptr = ptr;
2225     return TRUE;
2226     }
2227   }
2228 
2229 return FALSE;
2230 }
2231 
2232 
2233 
2234 /*************************************************
2235 *          Check POSIX class name                *
2236 *************************************************/
2237 
2238 /* This function is called to check the name given in a POSIX-style class entry
2239 such as [:alnum:].
2240 
2241 Arguments:
2242   ptr        points to the first letter
2243   len        the length of the name
2244 
2245 Returns:     a value representing the name, or -1 if unknown
2246 */
2247 
2248 static int
check_posix_name(PCRE2_SPTR ptr,int len)2249 check_posix_name(PCRE2_SPTR ptr, int len)
2250 {
2251 const char *pn = posix_names;
2252 int yield = 0;
2253 while (posix_name_lengths[yield] != 0)
2254   {
2255   if (len == posix_name_lengths[yield] &&
2256     PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
2257   pn += posix_name_lengths[yield] + 1;
2258   yield++;
2259   }
2260 return -1;
2261 }
2262 
2263 
2264 
2265 /*************************************************
2266 *       Read a subpattern or VERB name           *
2267 *************************************************/
2268 
2269 /* This function is called from parse_regex() below whenever it needs to read
2270 the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial
2271 pointer must be to the character before the name. If that character is '*' we
2272 are reading a verb or alpha assertion name. The pointer is updated to point
2273 after the name, for a VERB or alpha assertion name, or after the name's
2274 terminator for a subpattern name. Returning both the offset and the name
2275 pointer is redundant information, but some callers use one and some the other,
2276 so it is simplest just to return both.
2277 
2278 Arguments:
2279   ptrptr      points to the character pointer variable
2280   ptrend      points to the end of the input string
2281   utf         true if the input is UTF-encoded
2282   terminator  the terminator of a subpattern name must be this
2283   offsetptr   where to put the offset from the start of the pattern
2284   nameptr     where to put a pointer to the name in the input
2285   namelenptr  where to put the length of the name
2286   errcodeptr  where to put an error code
2287   cb          pointer to the compile data block
2288 
2289 Returns:    TRUE if a name was read
2290             FALSE otherwise, with error code set
2291 */
2292 
2293 static BOOL
read_name(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,BOOL utf,uint32_t terminator,PCRE2_SIZE * offsetptr,PCRE2_SPTR * nameptr,uint32_t * namelenptr,int * errorcodeptr,compile_block * cb)2294 read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator,
2295   PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,
2296   int *errorcodeptr, compile_block *cb)
2297 {
2298 PCRE2_SPTR ptr = *ptrptr;
2299 BOOL is_group = (*ptr != CHAR_ASTERISK);
2300 
2301 if (++ptr >= ptrend)               /* No characters in name */
2302   {
2303   *errorcodeptr = is_group? ERR62: /* Subpattern name expected */
2304                             ERR60; /* Verb not recognized or malformed */
2305   goto FAILED;
2306   }
2307 
2308 *nameptr = ptr;
2309 *offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
2310 
2311 /* In UTF mode, a group name may contain letters and decimal digits as defined
2312 by Unicode properties, and underscores, but must not start with a digit. */
2313 
2314 #ifdef SUPPORT_UNICODE
2315 if (utf && is_group)
2316   {
2317   uint32_t c, type;
2318 
2319   GETCHAR(c, ptr);
2320   type = UCD_CHARTYPE(c);
2321 
2322   if (type == ucp_Nd)
2323     {
2324     *errorcodeptr = ERR44;
2325     goto FAILED;
2326     }
2327 
2328   for(;;)
2329     {
2330     if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
2331         c != CHAR_UNDERSCORE) break;
2332     ptr++;
2333     FORWARDCHARTEST(ptr, ptrend);
2334     if (ptr >= ptrend) break;
2335     GETCHAR(c, ptr);
2336     type = UCD_CHARTYPE(c);
2337     }
2338   }
2339 else
2340 #else
2341 (void)utf;  /* Avoid compiler warning */
2342 #endif      /* SUPPORT_UNICODE */
2343 
2344 /* Handle non-group names and group names in non-UTF modes. A group name must
2345 not start with a digit. If either of the others start with a digit it just
2346 won't be recognized. */
2347 
2348   {
2349   if (is_group && IS_DIGIT(*ptr))
2350     {
2351     *errorcodeptr = ERR44;
2352     goto FAILED;
2353     }
2354 
2355   while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0)
2356     {
2357     ptr++;
2358     }
2359   }
2360 
2361 /* Check name length */
2362 
2363 if (ptr > *nameptr + MAX_NAME_SIZE)
2364   {
2365   *errorcodeptr = ERR48;
2366   goto FAILED;
2367   }
2368 *namelenptr = (uint32_t)(ptr - *nameptr);
2369 
2370 /* Subpattern names must not be empty, and their terminator is checked here.
2371 (What follows a verb or alpha assertion name is checked separately.) */
2372 
2373 if (is_group)
2374   {
2375   if (ptr == *nameptr)
2376     {
2377     *errorcodeptr = ERR62;   /* Subpattern name expected */
2378     goto FAILED;
2379     }
2380   if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator)
2381     {
2382     *errorcodeptr = ERR42;
2383     goto FAILED;
2384     }
2385   ptr++;
2386   }
2387 
2388 *ptrptr = ptr;
2389 return TRUE;
2390 
2391 FAILED:
2392 *ptrptr = ptr;
2393 return FALSE;
2394 }
2395 
2396 
2397 
2398 /*************************************************
2399 *          Manage callouts at start of cycle     *
2400 *************************************************/
2401 
2402 /* At the start of a new item in parse_regex() we are able to record the
2403 details of the previous item in a prior callout, and also to set up an
2404 automatic callout if enabled. Avoid having two adjacent automatic callouts,
2405 which would otherwise happen for items such as \Q that contribute nothing to
2406 the parsed pattern.
2407 
2408 Arguments:
2409   ptr              current pattern pointer
2410   pcalloutptr      points to a pointer to previous callout, or NULL
2411   auto_callout     TRUE if auto_callouts are enabled
2412   parsed_pattern   the parsed pattern pointer
2413   cb               compile block
2414 
2415 Returns: possibly updated parsed_pattern pointer.
2416 */
2417 
2418 static uint32_t *
manage_callouts(PCRE2_SPTR ptr,uint32_t ** pcalloutptr,BOOL auto_callout,uint32_t * parsed_pattern,compile_block * cb)2419 manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,
2420   uint32_t *parsed_pattern, compile_block *cb)
2421 {
2422 uint32_t *previous_callout = *pcalloutptr;
2423 
2424 if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr -
2425   cb->start_pattern - (PCRE2_SIZE)previous_callout[1]);
2426 
2427 if (!auto_callout) previous_callout = NULL; else
2428   {
2429   if (previous_callout == NULL ||
2430       previous_callout != parsed_pattern - 4 ||
2431       previous_callout[3] != 255)
2432     {
2433     previous_callout = parsed_pattern;  /* Set up new automatic callout */
2434     parsed_pattern += 4;
2435     previous_callout[0] = META_CALLOUT_NUMBER;
2436     previous_callout[2] = 0;
2437     previous_callout[3] = 255;
2438     }
2439   previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
2440   }
2441 
2442 *pcalloutptr = previous_callout;
2443 return parsed_pattern;
2444 }
2445 
2446 
2447 
2448 /*************************************************
2449 *      Parse regex and identify named groups     *
2450 *************************************************/
2451 
2452 /* This function is called first of all. It scans the pattern and does two
2453 things: (1) It identifies capturing groups and makes a table of named capturing
2454 groups so that information about them is fully available to both the compiling
2455 scans. (2) It writes a parsed version of the pattern with comments omitted and
2456 escapes processed into the parsed_pattern vector.
2457 
2458 Arguments:
2459   ptr             points to the start of the pattern
2460   options         compiling dynamic options (may change during the scan)
2461   has_lookbehind  points to a boolean, set TRUE if a lookbehind is found
2462   cb              pointer to the compile data block
2463 
2464 Returns:   zero on success or a non-zero error code, with the
2465              error offset placed in the cb field
2466 */
2467 
2468 /* A structure and some flags for dealing with nested groups. */
2469 
/* One saved record per nested group. The field order and widths are kept as
they are because records of this type are laid out in the compile workspace by
parse_regex(). The per-field notes are inferred from the field names and should
be confirmed against the code that fills them in. */

struct nest_save {
  uint16_t  nest_depth;    /* presumably the nesting depth of the record */
  uint16_t  reset_group;   /* presumably a saved capture group number */
  uint16_t  max_group;     /* presumably the highest group number seen */
  uint16_t  flags;         /* presumably a combination of NSF_xxx bits */
  uint32_t  options;       /* presumably the saved option bits */
};

typedef struct nest_save nest_save;
2477 
/* Single-bit flags, one bit each so that they can be combined with bitwise OR.
They are presumably kept in the flags field of a nest_save record; the names
suggest a reset group, a conditional assertion, and an atomic script run
(to be confirmed against the code that uses them). */

#define NSF_RESET          (1u << 0)
#define NSF_CONDASSERT     (1u << 1)
#define NSF_ATOMICSR       (1u << 2)
2481 
2482 /* Options that are changeable within the pattern must be tracked during
2483 parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing,
2484 but all must be tracked so that META_OPTIONS items set the correct values for
2485 the main compiling phase. */
2486 
/* The mask of the options that are tracked while parsing, one per line. */

#define PARSE_TRACKED_OPTIONS ( \
  PCRE2_CASELESS        | \
  PCRE2_DOTALL          | \
  PCRE2_DUPNAMES        | \
  PCRE2_EXTENDED        | \
  PCRE2_EXTENDED_MORE   | \
  PCRE2_MULTILINE       | \
  PCRE2_NO_AUTO_CAPTURE | \
  PCRE2_UNGREEDY)
2490 
2491 /* States used for analyzing ranges in character classes. The two OK values
2492 must be last. */
2493 
enum {
  RANGE_NO         = 0,
  RANGE_STARTED    = 1,
  RANGE_OK_ESCAPED = 2,   /* The two OK values come last, so they are */
  RANGE_OK_LITERAL = 3    /* numerically the largest of the states.   */
};
2495 
/* Only in 32-bit mode can there be literals > META_END. A macro encapsulates
the storing of literal values in the main parsed pattern, where they can always
be quantified.

PARSED_LITERAL(c, p) stores the literal value c at *p, advances p (which must
be a modifiable pointer lvalue), and sets the variable okquantifier, which must
be in scope at the point of use, to TRUE. In 32-bit mode a value that is
greater than or equal to META_END is preceded by a META_BIGVALUE item.

Both forms are wrapped in do { } while (0) so that an invocation is exactly one
statement: it is then safe as the body of an unbraced if or else, where a bare
pair of statements would leave the second one unconditional. An invocation must
be followed by a semicolon. The arguments are parenthesized in the expansion,
but each may be evaluated more than once, so they must not have side effects. */

#if PCRE2_CODE_UNIT_WIDTH == 32
#define PARSED_LITERAL(c, p) \
  do { \
  if ((c) >= META_END) *(p)++ = META_BIGVALUE; \
  *(p)++ = (c); \
  okquantifier = TRUE; \
  } while (0)
#else
#define PARSED_LITERAL(c, p) \
  do { \
  *(p)++ = (c); \
  okquantifier = TRUE; \
  } while (0)
#endif
2510 
2511 /* Here's the actual function. */
2512 
parse_regex(PCRE2_SPTR ptr,uint32_t options,BOOL * has_lookbehind,compile_block * cb)2513 static int parse_regex(PCRE2_SPTR ptr, uint32_t options, BOOL *has_lookbehind,
2514   compile_block *cb)
2515 {
2516 uint32_t c;
2517 uint32_t delimiter;
2518 uint32_t namelen;
2519 uint32_t class_range_state;
2520 uint32_t *verblengthptr = NULL;     /* Value avoids compiler warning */
2521 uint32_t *verbstartptr = NULL;
2522 uint32_t *previous_callout = NULL;
2523 uint32_t *parsed_pattern = cb->parsed_pattern;
2524 uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
2525 uint32_t meta_quantifier = 0;
2526 uint32_t add_after_mark = 0;
2527 uint32_t extra_options = cb->cx->extra_options;
2528 uint16_t nest_depth = 0;
2529 int after_manual_callout = 0;
2530 int expect_cond_assert = 0;
2531 int errorcode = 0;
2532 int escape;
2533 int i;
2534 BOOL inescq = FALSE;
2535 BOOL inverbname = FALSE;
2536 BOOL utf = (options & PCRE2_UTF) != 0;
2537 BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;
2538 BOOL isdupname;
2539 BOOL negate_class;
2540 BOOL okquantifier = FALSE;
2541 PCRE2_SPTR thisptr;
2542 PCRE2_SPTR name;
2543 PCRE2_SPTR ptrend = cb->end_pattern;
2544 PCRE2_SPTR verbnamestart = NULL;    /* Value avoids compiler warning */
2545 named_group *ng;
2546 nest_save *top_nest, *end_nests;
2547 
2548 /* Insert leading items for word and line matching (features provided for the
2549 benefit of pcre2grep). */
2550 
2551 if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
2552   {
2553   *parsed_pattern++ = META_CIRCUMFLEX;
2554   *parsed_pattern++ = META_NOCAPTURE;
2555   }
2556 else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
2557   {
2558   *parsed_pattern++ = META_ESCAPE + ESC_b;
2559   *parsed_pattern++ = META_NOCAPTURE;
2560   }
2561 
2562 /* If the pattern is actually a literal string, process it separately to avoid
2563 cluttering up the main loop. */
2564 
2565 if ((options & PCRE2_LITERAL) != 0)
2566   {
2567   while (ptr < ptrend)
2568     {
2569     if (parsed_pattern >= parsed_pattern_end)
2570       {
2571       errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
2572       goto FAILED;
2573       }
2574     thisptr = ptr;
2575     GETCHARINCTEST(c, ptr);
2576     if (auto_callout)
2577       parsed_pattern = manage_callouts(thisptr, &previous_callout,
2578         auto_callout, parsed_pattern, cb);
2579     PARSED_LITERAL(c, parsed_pattern);
2580     }
2581   goto PARSED_END;
2582   }
2583 
2584 /* Process a real regex which may contain meta-characters. */
2585 
2586 top_nest = NULL;
2587 end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
2588 
2589 /* The size of the nest_save structure might not be a factor of the size of the
2590 workspace. Therefore we must round down end_nests so as to correctly avoid
2591 creating a nest_save that spans the end of the workspace. */
2592 
2593 end_nests = (nest_save *)((char *)end_nests -
2594   ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
2595 
2596 /* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */
2597 
2598 if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;
2599 
2600 /* Now scan the pattern */
2601 
2602 while (ptr < ptrend)
2603   {
2604   int prev_expect_cond_assert;
2605   uint32_t min_repeat, max_repeat;
2606   uint32_t set, unset, *optset;
2607   uint32_t terminator;
2608   uint32_t prev_meta_quantifier;
2609   BOOL prev_okquantifier;
2610   PCRE2_SPTR tempptr;
2611   PCRE2_SIZE offset;
2612 
2613   if (parsed_pattern >= parsed_pattern_end)
2614     {
2615     errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
2616     goto FAILED;
2617     }
2618 
2619   if (nest_depth > cb->cx->parens_nest_limit)
2620     {
2621     errorcode = ERR19;
2622     goto FAILED;        /* Parentheses too deeply nested */
2623     }
2624 
2625   /* Get next input character, save its position for callout handling. */
2626 
2627   thisptr = ptr;
2628   GETCHARINCTEST(c, ptr);
2629 
2630   /* Copy quoted literals until \E, allowing for the possibility of automatic
2631   callouts, except when processing a (*VERB) "name".  */
2632 
2633   if (inescq)
2634     {
2635     if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
2636       {
2637       inescq = FALSE;
2638       ptr++;   /* Skip E */
2639       }
2640     else
2641       {
2642       if (expect_cond_assert > 0)   /* A literal is not allowed if we are */
2643         {                           /* expecting a conditional assertion, */
2644         ptr--;                      /* but an empty \Q\E sequence is OK.  */
2645         errorcode = ERR28;
2646         goto FAILED;
2647         }
2648       if (inverbname)
2649         {                          /* Don't use PARSED_LITERAL() because it */
2650 #if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
2651         if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2652 #endif
2653         *parsed_pattern++ = c;
2654         }
2655       else
2656         {
2657         if (after_manual_callout-- <= 0)
2658           parsed_pattern = manage_callouts(thisptr, &previous_callout,
2659             auto_callout, parsed_pattern, cb);
2660         PARSED_LITERAL(c, parsed_pattern);
2661         }
2662       meta_quantifier = 0;
2663       }
2664     continue;  /* Next character */
2665     }
2666 
2667   /* If we are processing the "name" part of a (*VERB:NAME) item, all
2668   characters up to the closing parenthesis are literals except when
2669   PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q
2670   and \E and escaped characters are allowed (no character types such as \d). If
2671   PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do
2672   this by not entering the special (*VERB:NAME) processing - they are then
2673   picked up below. Note that c is a character, not a code unit, so we must not
2674   use MAX_255 to test its size because MAX_255 tests code units and is assumed
2675   TRUE in 8-bit mode. */
2676 
2677   if (inverbname &&
2678        (
2679         /* EITHER: not both options set */
2680         ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) !=
2681                     (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) ||
2682 #ifdef SUPPORT_UNICODE
2683         /* OR: character > 255 AND not Unicode Pattern White Space */
2684         (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) ||
2685 #endif
2686         /* OR: not a # comment or isspace() white space */
2687         (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0
2688 #ifdef SUPPORT_UNICODE
2689         /* and not CHAR_NEL when Unicode is supported */
2690           && c != CHAR_NEL
2691 #endif
2692        )))
2693     {
2694     PCRE2_SIZE verbnamelength;
2695 
2696     switch(c)
2697       {
2698       default:                     /* Don't use PARSED_LITERAL() because it */
2699 #if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
2700       if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2701 #endif
2702       *parsed_pattern++ = c;
2703       break;
2704 
2705       case CHAR_RIGHT_PARENTHESIS:
2706       inverbname = FALSE;
2707       /* This is the length in characters */
2708       verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);
2709       /* But the limit on the length is in code units */
2710       if (ptr - verbnamestart - 1 > (int)MAX_MARK)
2711         {
2712         ptr--;
2713         errorcode = ERR76;
2714         goto FAILED;
2715         }
2716       *verblengthptr = (uint32_t)verbnamelength;
2717 
2718       /* If this name was on a verb such as (*ACCEPT) which does not continue,
2719       a (*MARK) was generated for the name. We now add the original verb as the
2720       next item. */
2721 
2722       if (add_after_mark != 0)
2723         {
2724         *parsed_pattern++ = add_after_mark;
2725         add_after_mark = 0;
2726         }
2727       break;
2728 
2729       case CHAR_BACKSLASH:
2730       if ((options & PCRE2_ALT_VERBNAMES) != 0)
2731         {
2732         escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
2733           cb->cx->extra_options, FALSE, cb);
2734         if (errorcode != 0) goto FAILED;
2735         }
2736       else escape = 0;   /* Treat all as literal */
2737 
2738       switch(escape)
2739         {
2740         case 0:                    /* Don't use PARSED_LITERAL() because it */
2741 #if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
2742         if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2743 #endif
2744         *parsed_pattern++ = c;
2745         break;
2746 
2747         case ESC_Q:
2748         inescq = TRUE;
2749         break;
2750 
2751         case ESC_E:           /* Ignore */
2752         break;
2753 
2754         default:
2755         errorcode = ERR40;    /* Invalid in verb name */
2756         goto FAILED;
2757         }
2758       }
2759     continue;   /* Next character in pattern */
2760     }
2761 
2762   /* Not a verb name character. At this point we must process everything that
2763   must not change the quantification state. This is mainly comments, but we
2764   handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as
2765   A+, as in Perl. An isolated \E is ignored. */
2766 
2767   if (c == CHAR_BACKSLASH && ptr < ptrend)
2768     {
2769     if (*ptr == CHAR_Q || *ptr == CHAR_E)
2770       {
2771       inescq = *ptr == CHAR_Q;
2772       ptr++;
2773       continue;
2774       }
2775     }
2776 
2777   /* Skip over whitespace and # comments in extended mode. Note that c is a
2778   character, not a code unit, so we must not use MAX_255 to test its size
2779   because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The
2780   whitespace characters are those designated as "Pattern White Space" by
2781   Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is
2782   U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a
2783   subset of space characters that match \h and \v. */
2784 
2785   if ((options & PCRE2_EXTENDED) != 0)
2786     {
2787     if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;
2788 #ifdef SUPPORT_UNICODE
2789     if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue;
2790 #endif
2791     if (c == CHAR_NUMBER_SIGN)
2792       {
2793       while (ptr < ptrend)
2794         {
2795         if (IS_NEWLINE(ptr))      /* For non-fixed-length newline cases, */
2796           {                       /* IS_NEWLINE sets cb->nllen. */
2797           ptr += cb->nllen;
2798           break;
2799           }
2800         ptr++;
2801 #ifdef SUPPORT_UNICODE
2802         if (utf) FORWARDCHARTEST(ptr, ptrend);
2803 #endif
2804         }
2805       continue;  /* Next character in pattern */
2806       }
2807     }
2808 
2809   /* Skip over bracketed comments */
2810 
2811   if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 &&
2812       ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
2813     {
2814     while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);
2815     if (ptr >= ptrend)
2816       {
2817       errorcode = ERR18;  /* A special error for missing ) in a comment */
2818       goto FAILED;        /* to make it easier to debug. */
2819       }
2820     ptr++;
2821     continue;  /* Next character in pattern */
2822     }
2823 
2824   /* If the next item is not a quantifier, fill in length of any previous
2825   callout and create an auto callout if required. */
2826 
2827   if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&
2828        (c != CHAR_LEFT_CURLY_BRACKET ||
2829          (tempptr = ptr,
2830          !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))
2831     {
2832     if (after_manual_callout-- <= 0)
2833       parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,
2834         parsed_pattern, cb);
2835     }
2836 
2837   /* If expect_cond_assert is 2, we have just passed (?( and are expecting an
2838   assertion, possibly preceded by a callout. If the value is 1, we have just
2839   had the callout and expect an assertion. There must be at least 3 more
2840   characters in all cases. When expect_cond_assert is 2, we know that the
2841   current character is an opening parenthesis, as otherwise we wouldn't be
2842   here. However, when it is 1, we need to check, and it's easiest just to check
2843   always. Note that expect_cond_assert may be negative, since all callouts just
2844   decrement it. */
2845 
2846   if (expect_cond_assert > 0)
2847     {
2848     BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
2849               (ptr[0] == CHAR_QUESTION_MARK || ptr[0] == CHAR_ASTERISK);
2850     if (ok)
2851       {
2852       if (ptr[0] == CHAR_ASTERISK)  /* New alpha assertion format, possibly */
2853         {
2854         ok = MAX_255(ptr[1]) && (cb->ctypes[ptr[1]] & ctype_lcletter) != 0;
2855         }
2856       else switch(ptr[1])  /* Traditional symbolic format */
2857         {
2858         case CHAR_C:
2859         ok = expect_cond_assert == 2;
2860         break;
2861 
2862         case CHAR_EQUALS_SIGN:
2863         case CHAR_EXCLAMATION_MARK:
2864         break;
2865 
2866         case CHAR_LESS_THAN_SIGN:
2867         ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;
2868         break;
2869 
2870         default:
2871         ok = FALSE;
2872         }
2873       }
2874 
2875     if (!ok)
2876       {
2877       ptr--;   /* Adjust error offset */
2878       errorcode = ERR28;
2879       goto FAILED;
2880       }
2881     }
2882 
2883   /* Remember whether we are expecting a conditional assertion, and set the
2884   default for this item. */
2885 
2886   prev_expect_cond_assert = expect_cond_assert;
2887   expect_cond_assert = 0;
2888 
2889   /* Remember quantification status for the previous significant item, then set
2890   default for this item. */
2891 
2892   prev_okquantifier = okquantifier;
2893   prev_meta_quantifier = meta_quantifier;
2894   okquantifier = FALSE;
2895   meta_quantifier = 0;
2896 
2897   /* If the previous significant item was a quantifier, adjust the parsed code
2898   if there is a following modifier. The base meta value is always followed by
2899   the PLUS and QUERY values, in that order. We do this here rather than after
2900   reading a quantifier so that intervening comments and /x whitespace can be
2901   ignored without having to replicate code. */
2902 
2903   if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS))
2904     {
2905     parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] =
2906       prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?
2907         0x00020000u : 0x00010000u);
2908     continue;  /* Next character in pattern */
2909     }
2910 
2911 
2912   /* Process the next item in the main part of a pattern. */
2913 
2914   switch(c)
2915     {
2916     default:              /* Non-special character */
2917     PARSED_LITERAL(c, parsed_pattern);
2918     break;
2919 
2920 
2921     /* ---- Escape sequence ---- */
2922 
2923     case CHAR_BACKSLASH:
2924     tempptr = ptr;
2925     escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
2926       cb->cx->extra_options, FALSE, cb);
2927     if (errorcode != 0)
2928       {
2929       ESCAPE_FAILED:
2930       if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
2931         goto FAILED;
2932       ptr = tempptr;
2933       if (ptr >= ptrend) c = CHAR_BACKSLASH; else
2934         {
2935         GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
2936         }
2937       escape = 0;                 /* Treat as literal character */
2938       }
2939 
2940     /* The escape was a data escape or literal character. */
2941 
2942     if (escape == 0)
2943       {
2944       PARSED_LITERAL(c, parsed_pattern);
2945       }
2946 
2947     /* The escape was a back (or forward) reference. We keep the offset in
2948     order to give a more useful diagnostic for a bad forward reference. For
2949     references to groups numbered less than 10 we can't use more than two items
2950     in parsed_pattern because they may be just two characters in the input (and
2951     in a 64-bit world an offset may need two elements). So for them, the offset
2952     of the first occurrence is held in a special vector. */
2953 
2954     else if (escape < 0)
2955       {
2956       offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 1);
2957       escape = -escape;
2958       *parsed_pattern++ = META_BACKREF | (uint32_t)escape;
2959       if (escape < 10)
2960         {
2961         if (cb->small_ref_offset[escape] == PCRE2_UNSET)
2962           cb->small_ref_offset[escape] = offset;
2963         }
2964       else
2965         {
2966         PUTOFFSET(offset, parsed_pattern);
2967         }
2968       okquantifier = TRUE;
2969       }
2970 
2971     /* The escape was a character class such as \d etc. or other special
2972     escape indicator such as \A or \X. Most of them generate just a single
2973     parsed item, but \P and \p are followed by a 16-bit type and a 16-bit
2974     value. They are supported only when Unicode is available. The type and
2975     value are packed into a single 32-bit value so that the whole sequence
2976     uses only two elements in the parsed_vector. This is because the same
2977     coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is
2978     set.
2979 
2980     There are also some cases where the escape sequence is followed by a name:
2981     \k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>
2982     and \g'name' are subroutine calls by name; \g{name} is a synonym for
2983     \k{name}. Note that \g<number> and \g'number' are handled by check_escape()
2984     and returned as a negative value (handled above). A name is coded as an
2985     offset into the pattern and a length. */
2986 
2987     else switch (escape)
2988       {
2989       case ESC_C:
2990 #ifdef NEVER_BACKSLASH_C
2991       errorcode = ERR85;
2992       goto ESCAPE_FAILED;
2993 #else
2994       if ((options & PCRE2_NEVER_BACKSLASH_C) != 0)
2995         {
2996         errorcode = ERR83;
2997         goto ESCAPE_FAILED;
2998         }
2999 #endif
3000       okquantifier = TRUE;
3001       *parsed_pattern++ = META_ESCAPE + escape;
3002       break;
3003 
3004       case ESC_X:
3005 #ifndef SUPPORT_UNICODE
3006       errorcode = ERR45;   /* Supported only with Unicode support */
3007       goto ESCAPE_FAILED;
3008 #endif
3009       case ESC_H:
3010       case ESC_h:
3011       case ESC_N:
3012       case ESC_R:
3013       case ESC_V:
3014       case ESC_v:
3015       okquantifier = TRUE;
3016       *parsed_pattern++ = META_ESCAPE + escape;
3017       break;
3018 
3019       default:  /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */
3020       *parsed_pattern++ = META_ESCAPE + escape;
3021       break;
3022 
3023       /* Escapes that change in UCP mode. Note that PCRE2_UCP will never be set
3024       without Unicode support because it is checked when pcre2_compile() is
3025       called. */
3026 
3027       case ESC_d:
3028       case ESC_D:
3029       case ESC_s:
3030       case ESC_S:
3031       case ESC_w:
3032       case ESC_W:
3033       okquantifier = TRUE;
3034       if ((options & PCRE2_UCP) == 0)
3035         {
3036         *parsed_pattern++ = META_ESCAPE + escape;
3037         }
3038       else
3039         {
3040         *parsed_pattern++ = META_ESCAPE +
3041           ((escape == ESC_d || escape == ESC_s || escape == ESC_w)?
3042             ESC_p : ESC_P);
3043         switch(escape)
3044           {
3045           case ESC_d:
3046           case ESC_D:
3047           *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
3048           break;
3049 
3050           case ESC_s:
3051           case ESC_S:
3052           *parsed_pattern++ = PT_SPACE << 16;
3053           break;
3054 
3055           case ESC_w:
3056           case ESC_W:
3057           *parsed_pattern++ = PT_WORD << 16;
3058           break;
3059           }
3060         }
3061       break;
3062 
3063       /* Unicode property matching */
3064 
3065       case ESC_P:
3066       case ESC_p:
3067 #ifdef SUPPORT_UNICODE
3068         {
3069         BOOL negated;
3070         uint16_t ptype = 0, pdata = 0;
3071         if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3072           goto ESCAPE_FAILED;
3073         if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3074         *parsed_pattern++ = META_ESCAPE + escape;
3075         *parsed_pattern++ = (ptype << 16) | pdata;
3076         okquantifier = TRUE;
3077         }
3078 #else
3079       errorcode = ERR45;
3080       goto ESCAPE_FAILED;
3081 #endif
3082       break;  /* End \P and \p */
3083 
3084       /* When \g is used with quotes or angle brackets as delimiters, it is a
3085       numerical or named subroutine call, and control comes here. When used
3086       with brace delimiters it is a numerical back reference and does not come
3087       here because check_escape() returns it directly as a reference. \k is
3088       always a named back reference. */
3089 
3090       case ESC_g:
3091       case ESC_k:
3092       if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET &&
3093           *ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE))
3094         {
3095         errorcode = (escape == ESC_g)? ERR57 : ERR69;
3096         goto ESCAPE_FAILED;
3097         }
3098       terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
3099         CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
3100         CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
3101 
3102       /* For a non-braced \g, check for a numerical recursion. */
3103 
3104       if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)
3105         {
3106         PCRE2_SPTR p = ptr + 1;
3107 
3108         if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
3109             &errorcode))
3110           {
3111           if (p >= ptrend || *p != terminator)
3112             {
3113             errorcode = ERR57;
3114             goto ESCAPE_FAILED;
3115             }
3116           ptr = p;
3117           goto SET_RECURSION;
3118           }
3119         if (errorcode != 0) goto ESCAPE_FAILED;
3120         }
3121 
3122       /* Not a numerical recursion */
3123 
3124       if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
3125           &errorcode, cb)) goto ESCAPE_FAILED;
3126 
3127       /* \k and \g when used with braces are back references, whereas \g used
3128       with quotes or angle brackets is a recursion */
3129 
3130       *parsed_pattern++ =
3131         (escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)?
3132           META_BACKREF_BYNAME : META_RECURSE_BYNAME;
3133       *parsed_pattern++ = namelen;
3134 
3135       PUTOFFSET(offset, parsed_pattern);
3136       okquantifier = TRUE;
3137       break;  /* End special escape processing */
3138       }
3139     break;    /* End escape sequence processing */
3140 
3141 
3142     /* ---- Single-character special items ---- */
3143 
3144     case CHAR_CIRCUMFLEX_ACCENT:
3145     *parsed_pattern++ = META_CIRCUMFLEX;
3146     break;
3147 
3148     case CHAR_DOLLAR_SIGN:
3149     *parsed_pattern++ = META_DOLLAR;
3150     break;
3151 
3152     case CHAR_DOT:
3153     *parsed_pattern++ = META_DOT;
3154     okquantifier = TRUE;
3155     break;
3156 
3157 
3158     /* ---- Single-character quantifiers ---- */
3159 
3160     case CHAR_ASTERISK:
3161     meta_quantifier = META_ASTERISK;
3162     goto CHECK_QUANTIFIER;
3163 
3164     case CHAR_PLUS:
3165     meta_quantifier = META_PLUS;
3166     goto CHECK_QUANTIFIER;
3167 
3168     case CHAR_QUESTION_MARK:
3169     meta_quantifier = META_QUERY;
3170     goto CHECK_QUANTIFIER;
3171 
3172 
3173     /* ---- Potential {n,m} quantifier ---- */
3174 
3175     case CHAR_LEFT_CURLY_BRACKET:
3176     if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,
3177         &errorcode))
3178       {
3179       if (errorcode != 0) goto FAILED;     /* Error in quantifier. */
3180       PARSED_LITERAL(c, parsed_pattern);   /* Not a quantifier */
3181       break;                               /* No more quantifier processing */
3182       }
3183     meta_quantifier = META_MINMAX;
3184     /* Fall through */
3185 
3186 
3187     /* ---- Quantifier post-processing ---- */
3188 
3189     /* Check that a quantifier is allowed after the previous item. */
3190 
3191     CHECK_QUANTIFIER:
3192     if (!prev_okquantifier)
3193       {
3194       errorcode = ERR9;
3195       goto FAILED_BACK;
3196       }
3197 
3198     /* Most (*VERB)s are not allowed to be quantified, but an ungreedy
3199     quantifier can be useful for (*ACCEPT) - meaning "succeed on backtrack", a
3200     sort of negated (*COMMIT). We therefore allow (*ACCEPT) to be quantified by
3201     wrapping it in non-capturing brackets, but we have to allow for a preceding
3202     (*MARK) for when (*ACCEPT) has an argument. */
3203 
3204     if (parsed_pattern[-1] == META_ACCEPT)
3205       {
3206       uint32_t *p;
3207       for (p = parsed_pattern - 1; p >= verbstartptr; p--) p[1] = p[0];
3208       *verbstartptr = META_NOCAPTURE;
3209       parsed_pattern[1] = META_KET;
3210       parsed_pattern += 2;
3211       }
3212 
3213     /* Now we can put the quantifier into the parsed pattern vector. At this
3214     stage, we have only the basic quantifier. The check for a following + or ?
3215     modifier happens at the top of the loop, after any intervening comments
3216     have been removed. */
3217 
3218     *parsed_pattern++ = meta_quantifier;
3219     if (c == CHAR_LEFT_CURLY_BRACKET)
3220       {
3221       *parsed_pattern++ = min_repeat;
3222       *parsed_pattern++ = max_repeat;
3223       }
3224     break;
3225 
3226 
3227     /* ---- Character class ---- */
3228 
3229     case CHAR_LEFT_SQUARE_BRACKET:
3230     okquantifier = TRUE;
3231 
3232     /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
3233     used for "start of word" and "end of word". As these are otherwise illegal
3234     sequences, we don't break anything by recognizing them. They are replaced
3235     by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
3236     erroneous and are handled by the normal code below. */
3237 
3238     if (ptrend - ptr >= 6 &&
3239          (PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 ||
3240           PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0))
3241       {
3242       *parsed_pattern++ = META_ESCAPE + ESC_b;
3243 
3244       if (ptr[2] == CHAR_LESS_THAN_SIGN)
3245         {
3246         *parsed_pattern++ = META_LOOKAHEAD;
3247         }
3248       else
3249         {
3250         *parsed_pattern++ = META_LOOKBEHIND;
3251         *has_lookbehind = TRUE;
3252 
3253         /* The offset is used only for the "non-fixed length" error; this won't
3254         occur here, so just store zero. */
3255 
3256         PUTOFFSET((PCRE2_SIZE)0, parsed_pattern);
3257         }
3258 
3259       if ((options & PCRE2_UCP) == 0)
3260         *parsed_pattern++ = META_ESCAPE + ESC_w;
3261       else
3262         {
3263         *parsed_pattern++ = META_ESCAPE + ESC_p;
3264         *parsed_pattern++ = PT_WORD << 16;
3265         }
3266       *parsed_pattern++ = META_KET;
3267       ptr += 6;
3268       break;
3269       }
3270 
3271     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3272     they are encountered at the top level, so we'll do that too. */
3273 
3274     if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3275          *ptr == CHAR_EQUALS_SIGN) &&
3276         check_posix_syntax(ptr, ptrend, &tempptr))
3277       {
3278       errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;
3279       goto FAILED;
3280       }
3281 
3282     /* Process a regular character class. If the first character is '^', set
3283     the negation flag. If the first few characters (either before or after ^)
3284     are \Q\E or \E or space or tab in extended-more mode, we skip them too.
3285     This makes for compatibility with Perl. */
3286 
3287     negate_class = FALSE;
3288     while (ptr < ptrend)
3289       {
3290       GETCHARINCTEST(c, ptr);
3291       if (c == CHAR_BACKSLASH)
3292         {
3293         if (ptr < ptrend && *ptr == CHAR_E) ptr++;
3294         else if (ptrend - ptr >= 3 &&
3295              PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3296           ptr += 3;
3297         else
3298           break;
3299         }
3300       else if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3301                (c == CHAR_SPACE || c == CHAR_HT))  /* Note: just these two */
3302         continue;
3303       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3304         negate_class = TRUE;
3305       else break;
3306       }
3307 
3308     /* Now the real contents of the class; c has the first "real" character.
3309     Empty classes are permitted only if the option is set. */
3310 
3311     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3312         (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)
3313       {
3314       *parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;
3315       break;  /* End of class processing */
3316       }
3317 
3318     /* Process a non-empty class. */
3319 
3320     *parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;
3321     class_range_state = RANGE_NO;
3322 
3323     /* In an EBCDIC environment, Perl treats alphabetic ranges specially
3324     because there are holes in the encoding, and simply using the range A-Z
3325     (for example) would include the characters in the holes. This applies only
3326     to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]
3327     in this respect. In order to accommodate this, we keep track of whether
3328     character values are literal or not, and a state variable for handling
3329     ranges. */
3330 
3331     /* Loop for the contents of the class */
3332 
3333     for (;;)
3334       {
3335       BOOL char_is_literal = TRUE;
3336 
3337       /* Inside \Q...\E everything is literal except \E */
3338 
3339       if (inescq)
3340         {
3341         if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3342           {
3343           inescq = FALSE;                   /* Reset literal state */
3344           ptr++;                            /* Skip the 'E' */
3345           goto CLASS_CONTINUE;
3346           }
3347         goto CLASS_LITERAL;
3348         }
3349 
3350       /* Skip over space and tab (only) in extended-more mode. */
3351 
3352       if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3353           (c == CHAR_SPACE || c == CHAR_HT))
3354         goto CLASS_CONTINUE;
3355 
3356       /* Handle POSIX class names. Perl allows a negation extension of the
3357       form [:^name:]. A square bracket that doesn't match the syntax is
3358       treated as a literal. We also recognize the POSIX constructions
3359       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3360       5.6 and 5.8 do. */
3361 
3362       if (c == CHAR_LEFT_SQUARE_BRACKET &&
3363           ptrend - ptr >= 3 &&
3364           (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3365            *ptr == CHAR_EQUALS_SIGN) &&
3366           check_posix_syntax(ptr, ptrend, &tempptr))
3367         {
3368         BOOL posix_negate = FALSE;
3369         int posix_class;
3370 
3371         /* Perl treats a hyphen before a POSIX class as a literal, not the
3372         start of a range. However, it gives a warning in its warning mode. PCRE
3373         does not have a warning mode, so we give an error, because this is
3374         likely an error on the user's part. */
3375 
3376         if (class_range_state == RANGE_STARTED)
3377           {
3378           errorcode = ERR50;
3379           goto FAILED;
3380           }
3381 
3382         if (*ptr != CHAR_COLON)
3383           {
3384           errorcode = ERR13;
3385           goto FAILED_BACK;
3386           }
3387 
3388         if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
3389           {
3390           posix_negate = TRUE;
3391           ptr++;
3392           }
3393 
3394         posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3395         if (posix_class < 0)
3396           {
3397           errorcode = ERR30;
3398           goto FAILED;
3399           }
3400         ptr = tempptr + 2;
3401 
3402         /* Perl treats a hyphen after a POSIX class as a literal, not the
3403         start of a range. However, it gives a warning in its warning mode
3404         unless the hyphen is the last character in the class. PCRE does not
3405         have a warning mode, so we give an error, because this is likely an
3406         error on the user's part. */
3407 
3408         if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3409             ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3410           {
3411           errorcode = ERR50;
3412           goto FAILED;
3413           }
3414 
3415         /* Set "a hyphen is not the start of a range" for the -] case, and also
3416         in case the POSIX class is followed by \E or \Q\E (possibly repeated -
3417         fuzzers do that kind of thing) and *then* a hyphen. This causes that
3418         hyphen to be treated as a literal. I don't think it's worth setting up
3419         special apparatus to do otherwise. */
3420 
3421         class_range_state = RANGE_NO;
3422 
3423         /* When PCRE2_UCP is set, some of the POSIX classes are converted to
3424         use Unicode properties \p or \P or, in one case, \h or \H. The
3425         substitutes table has two values per class, containing the type and
3426         value of a \p or \P item. The special cases are specified with a
3427         negative type: a non-zero value causes \h or \H to be used, and a zero
3428         value falls through to behave like a non-UCP POSIX class. */
3429 
3430 #ifdef SUPPORT_UNICODE
3431         if ((options & PCRE2_UCP) != 0)
3432           {
3433           int ptype = posix_substitutes[2*posix_class];
3434           int pvalue = posix_substitutes[2*posix_class + 1];
3435           if (ptype >= 0)
3436             {
3437             *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);
3438             *parsed_pattern++ = (ptype << 16) | pvalue;
3439             goto CLASS_CONTINUE;
3440             }
3441 
3442           if (pvalue != 0)
3443             {
3444             *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h);
3445             goto CLASS_CONTINUE;
3446             }
3447 
3448           /* Fall through */
3449           }
3450 #endif  /* SUPPORT_UNICODE */
3451 
3452         /* Non-UCP POSIX class */
3453 
3454         *parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;
3455         *parsed_pattern++ = posix_class;
3456         }
3457 
3458       /* Handle potential start of range */
3459 
3460       else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)
3461         {
3462         *parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?
3463           META_RANGE_LITERAL : META_RANGE_ESCAPED;
3464         class_range_state = RANGE_STARTED;
3465         }
3466 
3467       /* Handle a literal character */
3468 
3469       else if (c != CHAR_BACKSLASH)
3470         {
3471         CLASS_LITERAL:
3472         if (class_range_state == RANGE_STARTED)
3473           {
3474           if (c == parsed_pattern[-2])       /* Optimize one-char range */
3475             parsed_pattern--;
3476           else if (parsed_pattern[-2] > c)   /* Check range is in order */
3477             {
3478             errorcode = ERR8;
3479             goto FAILED_BACK;
3480             }
3481           else
3482             {
3483             if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL)
3484               parsed_pattern[-1] = META_RANGE_ESCAPED;
3485             PARSED_LITERAL(c, parsed_pattern);
3486             }
3487           class_range_state = RANGE_NO;
3488           }
3489         else  /* Potential start of range */
3490           {
3491           class_range_state = char_is_literal?
3492             RANGE_OK_LITERAL : RANGE_OK_ESCAPED;
3493           PARSED_LITERAL(c, parsed_pattern);
3494           }
3495         }
3496 
3497       /* Handle escapes in a class */
3498 
3499       else
3500         {
3501         tempptr = ptr;
3502         escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3503           cb->cx->extra_options, TRUE, cb);
3504 
3505         if (errorcode != 0)
3506           {
3507           if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3508             goto FAILED;
3509           ptr = tempptr;
3510           if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3511             {
3512             GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
3513             }
3514           escape = 0;                 /* Treat as literal character */
3515           }
3516 
3517         switch(escape)
3518           {
3519           case 0:  /* Escaped character code point is in c */
3520           char_is_literal = FALSE;
3521           goto CLASS_LITERAL;
3522 
3523           case ESC_b:
3524           c = CHAR_BS;    /* \b is backspace in a class */
3525           char_is_literal = FALSE;
3526           goto CLASS_LITERAL;
3527 
3528           case ESC_Q:
3529           inescq = TRUE;  /* Enter literal mode */
3530           goto CLASS_CONTINUE;
3531 
3532           case ESC_E:     /* Ignore orphan \E */
3533           goto CLASS_CONTINUE;
3534 
3535           case ESC_B:     /* Always an error in a class */
3536           case ESC_R:
3537           case ESC_X:
3538           errorcode = ERR7;
3539           ptr--;
3540           goto FAILED;
3541           }
3542 
3543         /* The second part of a range can be a single-character escape
3544         sequence (detected above), but not any of the other escapes. Perl
3545         treats a hyphen as a literal in such circumstances. However, in Perl's
3546         warning mode, a warning is given, so PCRE now faults it, as it is
3547         almost certainly a mistake on the user's part. */
3548 
3549         if (class_range_state == RANGE_STARTED)
3550           {
3551           errorcode = ERR50;
3552           goto FAILED;  /* Not CLASS_ESCAPE_FAILED; always an error */
3553           }
3554 
3555         /* Of the remaining escapes, only those that define characters are
3556         allowed in a class. None may start a range. */
3557 
3558         class_range_state = RANGE_NO;
3559         switch(escape)
3560           {
3561           case ESC_N:
3562           errorcode = ERR71;
3563           goto FAILED;
3564 
3565           case ESC_H:
3566           case ESC_h:
3567           case ESC_V:
3568           case ESC_v:
3569           *parsed_pattern++ = META_ESCAPE + escape;
3570           break;
3571 
3572           /* These escapes are converted to Unicode property tests when
3573           PCRE2_UCP is set. */
3574 
3575           case ESC_d:
3576           case ESC_D:
3577           case ESC_s:
3578           case ESC_S:
3579           case ESC_w:
3580           case ESC_W:
3581           if ((options & PCRE2_UCP) == 0)
3582             {
3583             *parsed_pattern++ = META_ESCAPE + escape;
3584             }
3585           else
3586             {
3587             *parsed_pattern++ = META_ESCAPE +
3588               ((escape == ESC_d || escape == ESC_s || escape == ESC_w)?
3589                 ESC_p : ESC_P);
3590             switch(escape)
3591               {
3592               case ESC_d:
3593               case ESC_D:
3594               *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
3595               break;
3596 
3597               case ESC_s:
3598               case ESC_S:
3599               *parsed_pattern++ = PT_SPACE << 16;
3600               break;
3601 
3602               case ESC_w:
3603               case ESC_W:
3604               *parsed_pattern++ = PT_WORD << 16;
3605               break;
3606               }
3607             }
3608           break;
3609 
3610           /* Explicit Unicode property matching */
3611 
3612           case ESC_P:
3613           case ESC_p:
3614 #ifdef SUPPORT_UNICODE
3615             {
3616             BOOL negated;
3617             uint16_t ptype = 0, pdata = 0;
3618             if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3619               goto FAILED;
3620             if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3621             *parsed_pattern++ = META_ESCAPE + escape;
3622             *parsed_pattern++ = (ptype << 16) | pdata;
3623             }
3624 #else
3625           errorcode = ERR45;
3626           goto FAILED;
3627 #endif
3628           break;  /* End \P and \p */
3629 
3630           default:    /* All others are not allowed in a class */
3631           errorcode = ERR7;
3632           ptr--;
3633           goto FAILED;
3634           }
3635 
3636         /* Perl gives a warning unless a following hyphen is the last character
3637         in the class. PCRE throws an error. */
3638 
3639         if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3640             ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3641           {
3642           errorcode = ERR50;
3643           goto FAILED;
3644           }
3645         }
3646 
3647       /* Proceed to next thing in the class. */
3648 
3649       CLASS_CONTINUE:
3650       if (ptr >= ptrend)
3651         {
3652         errorcode = ERR6;  /* Missing terminating ']' */
3653         goto FAILED;
3654         }
3655       GETCHARINCTEST(c, ptr);
3656       if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
3657       }     /* End of class-processing loop */
3658 
3659     /* -] at the end of a class is a literal '-' */
3660 
3661     if (class_range_state == RANGE_STARTED)
3662       {
3663       parsed_pattern[-1] = CHAR_MINUS;
3664       class_range_state = RANGE_NO;
3665       }
3666 
3667     *parsed_pattern++ = META_CLASS_END;
3668     break;  /* End of character class */
3669 
3670 
3671     /* ---- Opening parenthesis ---- */
3672 
3673     case CHAR_LEFT_PARENTHESIS:
3674     if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3675 
3676     /* If ( is not followed by ? it is either a capture or a special verb or an
3677     alpha assertion or a positive non-atomic lookahead. */
3678 
3679     if (*ptr != CHAR_QUESTION_MARK)
3680       {
3681       const char *vn;
3682 
3683       /* Handle capturing brackets (or non-capturing if auto-capture is turned
3684       off). */
3685 
3686       if (*ptr != CHAR_ASTERISK)
3687         {
3688         nest_depth++;
3689         if ((options & PCRE2_NO_AUTO_CAPTURE) == 0)
3690           {
3691           if (cb->bracount >= MAX_GROUP_NUMBER)
3692             {
3693             errorcode = ERR97;
3694             goto FAILED;
3695             }
3696           cb->bracount++;
3697           *parsed_pattern++ = META_CAPTURE | cb->bracount;
3698           }
3699         else *parsed_pattern++ = META_NOCAPTURE;
3700         }
3701 
3702       /* Do nothing for (* followed by end of pattern or ) so it gives a "bad
3703       quantifier" error rather than "(*MARK) must have an argument". */
3704 
3705       else if (ptrend - ptr <= 1 || (c = ptr[1]) == CHAR_RIGHT_PARENTHESIS)
3706         break;
3707 
3708       /* Handle "alpha assertions" such as (*pla:...). Most of these are
3709       synonyms for the historical symbolic assertions, but the script run and
3710       non-atomic lookaround ones are new. They are distinguished by starting
3711       with a lower case letter. Checking both ends of the alphabet makes this
3712       work in all character codes. */
3713 
3714       else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
3715         {
3716         uint32_t meta;
3717 
3718         vn = alasnames;
3719         if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
3720           &errorcode, cb)) goto FAILED;
3721         if (ptr >= ptrend || *ptr != CHAR_COLON)
3722           {
3723           errorcode = ERR95;  /* Malformed */
3724           goto FAILED;
3725           }
3726 
3727         /* Scan the table of alpha assertion names */
3728 
3729         for (i = 0; i < alascount; i++)
3730           {
3731           if (namelen == alasmeta[i].len &&
3732               PRIV(strncmp_c8)(name, vn, namelen) == 0)
3733             break;
3734           vn += alasmeta[i].len + 1;
3735           }
3736 
3737         if (i >= alascount)
3738           {
3739           errorcode = ERR95;  /* Alpha assertion not recognized */
3740           goto FAILED;
3741           }
3742 
3743         /* Check for expecting an assertion condition. If so, only atomic
3744         lookaround assertions are valid. */
3745 
3746         meta = alasmeta[i].meta;
3747         if (prev_expect_cond_assert > 0 &&
3748             (meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT))
3749           {
3750           errorcode = (meta == META_LOOKAHEAD_NA || meta == META_LOOKBEHIND_NA)?
3751             ERR98 : ERR28;  /* (Atomic) assertion expected */
3752           goto FAILED;
3753           }
3754 
3755         /* The lookaround alphabetic synonyms can mostly be handled by jumping
3756         to the code that handles the traditional symbolic forms. */
3757 
3758         switch(meta)
3759           {
3760           default:
3761           errorcode = ERR89;  /* Unknown code; should never occur because */
3762           goto FAILED;        /* the meta values come from a table above. */
3763 
3764           case META_ATOMIC:
3765           goto ATOMIC_GROUP;
3766 
3767           case META_LOOKAHEAD:
3768           goto POSITIVE_LOOK_AHEAD;
3769 
3770           case META_LOOKAHEAD_NA:
3771           goto POSITIVE_NONATOMIC_LOOK_AHEAD;
3772 
3773           case META_LOOKAHEADNOT:
3774           goto NEGATIVE_LOOK_AHEAD;
3775 
3776           case META_LOOKBEHIND:
3777           case META_LOOKBEHINDNOT:
3778           case META_LOOKBEHIND_NA:
3779           *parsed_pattern++ = meta;
3780           ptr--;
3781           goto POST_LOOKBEHIND;
3782 
3783           /* The script run facilities are handled here. Unicode support is
3784           required (give an error if not, as this is a security issue). Always
3785           record a META_SCRIPT_RUN item. Then, for the atomic version, insert
3786           META_ATOMIC and remember that we need two META_KETs at the end. */
3787 
3788           case META_SCRIPT_RUN:
3789           case META_ATOMIC_SCRIPT_RUN:
3790 #ifdef SUPPORT_UNICODE
3791           *parsed_pattern++ = META_SCRIPT_RUN;
3792           nest_depth++;
3793           ptr++;
3794           if (meta == META_ATOMIC_SCRIPT_RUN)
3795             {
3796             *parsed_pattern++ = META_ATOMIC;
3797             if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
3798             else if (++top_nest >= end_nests)
3799               {
3800               errorcode = ERR84;
3801               goto FAILED;
3802               }
3803             top_nest->nest_depth = nest_depth;
3804             top_nest->flags = NSF_ATOMICSR;
3805             top_nest->options = options & PARSE_TRACKED_OPTIONS;
3806             }
3807           break;
3808 #else  /* SUPPORT_UNICODE */
3809           errorcode = ERR96;
3810           goto FAILED;
3811 #endif
3812           }
3813         }
3814 
3815 
3816       /* ---- Handle (*VERB) and (*VERB:NAME) ---- */
3817 
3818       else
3819         {
3820         vn = verbnames;
3821         if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
3822           &errorcode, cb)) goto FAILED;
3823         if (ptr >= ptrend || (*ptr != CHAR_COLON &&
3824                               *ptr != CHAR_RIGHT_PARENTHESIS))
3825           {
3826           errorcode = ERR60;  /* Malformed */
3827           goto FAILED;
3828           }
3829 
3830         /* Scan the table of verb names */
3831 
3832         for (i = 0; i < verbcount; i++)
3833           {
3834           if (namelen == verbs[i].len &&
3835               PRIV(strncmp_c8)(name, vn, namelen) == 0)
3836             break;
3837           vn += verbs[i].len + 1;
3838           }
3839 
3840         if (i >= verbcount)
3841           {
3842           errorcode = ERR60;  /* Verb not recognized */
3843           goto FAILED;
3844           }
3845 
3846         /* An empty argument is treated as no argument. */
3847 
3848         if (*ptr == CHAR_COLON && ptr + 1 < ptrend &&
3849              ptr[1] == CHAR_RIGHT_PARENTHESIS)
3850           ptr++;    /* Advance to the closing parens */
3851 
3852         /* Check for mandatory non-empty argument; this is (*MARK) */
3853 
3854         if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON)
3855           {
3856           errorcode = ERR66;
3857           goto FAILED;
3858           }
3859 
3860         /* Remember where this verb, possibly with a preceding (*MARK), starts,
3861         for handling quantified (*ACCEPT). */
3862 
3863         verbstartptr = parsed_pattern;
3864         okquantifier = (verbs[i].meta == META_ACCEPT);
3865 
3866         /* It appears that Perl allows any characters whatsoever, other than a
3867         closing parenthesis, to appear in arguments ("names"), so we no longer
3868         insist on letters, digits, and underscores. Perl does not, however, do
3869         any interpretation within arguments, and has no means of including a
3870         closing parenthesis. PCRE supports escape processing but only when it
3871         is requested by an option. We set inverbname TRUE here, and let the
3872         main loop take care of this so that escape and \x processing is done by
3873         the main code above. */
3874 
3875         if (*ptr++ == CHAR_COLON)   /* Skip past : or ) */
3876           {
3877           /* Some optional arguments can be treated as a preceding (*MARK) */
3878 
3879           if (verbs[i].has_arg < 0)
3880             {
3881             add_after_mark = verbs[i].meta;
3882             *parsed_pattern++ = META_MARK;
3883             }
3884 
3885           /* The remaining verbs with arguments (except *MARK) need a different
3886           opcode. */
3887 
3888           else
3889             {
3890             *parsed_pattern++ = verbs[i].meta +
3891               ((verbs[i].meta != META_MARK)? 0x00010000u:0);
3892             }
3893 
3894           /* Set up for reading the name in the main loop. */
3895 
3896           verblengthptr = parsed_pattern++;
3897           verbnamestart = ptr;
3898           inverbname = TRUE;
3899           }
3900         else  /* No verb "name" argument */
3901           {
3902           *parsed_pattern++ = verbs[i].meta;
3903           }
3904         }     /* End of (*VERB) handling */
3905       break;  /* Done with this parenthesis */
3906       }       /* End of groups that don't start with (? */
3907 
3908 
3909     /* ---- Items starting (? ---- */
3910 
3911     /* The type of item is determined by what follows (?. Handle (?| and option
3912     changes under "default" because both need a new block on the nest stack.
3913     Comments starting with (?# are handled above. Note that there is some
3914     ambiguity about the sequence (?- because if a digit follows it's a relative
3915     recursion or subroutine call whereas otherwise it's an option unsetting. */
3916 
3917     if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3918 
3919     switch(*ptr)
3920       {
3921       default:
3922       if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1]))
3923         goto RECURSION_BYNUMBER;  /* The + case is handled by CHAR_PLUS */
3924 
3925       /* We now have either (?| or a (possibly empty) option setting,
3926       optionally followed by a non-capturing group. */
3927 
3928       nest_depth++;
3929       if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
3930       else if (++top_nest >= end_nests)
3931         {
3932         errorcode = ERR84;
3933         goto FAILED;
3934         }
3935       top_nest->nest_depth = nest_depth;
3936       top_nest->flags = 0;
3937       top_nest->options = options & PARSE_TRACKED_OPTIONS;
3938 
3939       /* Start of non-capturing group that resets the capture count for each
3940       branch. */
3941 
3942       if (*ptr == CHAR_VERTICAL_LINE)
3943         {
3944         top_nest->reset_group = (uint16_t)cb->bracount;
3945         top_nest->max_group = (uint16_t)cb->bracount;
3946         top_nest->flags |= NSF_RESET;
3947         cb->external_flags |= PCRE2_DUPCAPUSED;
3948         *parsed_pattern++ = META_NOCAPTURE;
3949         ptr++;
3950         }
3951 
3952       /* Scan for options imnsxJU to be set or unset. */
3953 
3954       else
3955         {
3956         BOOL hyphenok = TRUE;
3957         uint32_t oldoptions = options;
3958 
3959         top_nest->reset_group = 0;
3960         top_nest->max_group = 0;
3961         set = unset = 0;
3962         optset = &set;
3963 
3964         /* ^ at the start unsets imnsx and disables the subsequent use of - */
3965 
3966         if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)
3967           {
3968           options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE|
3969                        PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE);
3970           hyphenok = FALSE;
3971           ptr++;
3972           }
3973 
3974         while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&
3975                                *ptr != CHAR_COLON)
3976           {
3977           switch (*ptr++)
3978             {
3979             case CHAR_MINUS:
3980             if (!hyphenok)
3981               {
3982               errorcode = ERR94;
3983               ptr--;  /* Correct the offset */
3984               goto FAILED;
3985               }
3986             optset = &unset;
3987             hyphenok = FALSE;
3988             break;
3989 
3990             case CHAR_J:  /* Record that it changed in the external options */
3991             *optset |= PCRE2_DUPNAMES;
3992             cb->external_flags |= PCRE2_JCHANGED;
3993             break;
3994 
3995             case CHAR_i: *optset |= PCRE2_CASELESS; break;
3996             case CHAR_m: *optset |= PCRE2_MULTILINE; break;
3997             case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break;
3998             case CHAR_s: *optset |= PCRE2_DOTALL; break;
3999             case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
4000 
4001             /* If x appears twice it sets the extended extended option. */
4002 
4003             case CHAR_x:
4004             *optset |= PCRE2_EXTENDED;
4005             if (ptr < ptrend && *ptr == CHAR_x)
4006               {
4007               *optset |= PCRE2_EXTENDED_MORE;
4008               ptr++;
4009               }
4010             break;
4011 
4012             default:
4013             errorcode = ERR11;
4014             ptr--;    /* Correct the offset */
4015             goto FAILED;
4016             }
4017           }
4018 
4019         /* If we are setting extended without extended-more, ensure that any
4020         existing extended-more gets unset. Also, unsetting extended must also
4021         unset extended-more. */
4022 
4023         if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED ||
4024             (unset & PCRE2_EXTENDED) != 0)
4025           unset |= PCRE2_EXTENDED_MORE;
4026 
4027         options = (options | set) & (~unset);
4028 
4029         /* If the options ended with ')' this is not the start of a nested
4030         group with option changes, so the options change at this level.
4031         In this case, if the previous level set up a nest block, discard the
4032         one we have just created. Otherwise adjust it for the previous level.
4033         If the options ended with ':' we are starting a non-capturing group,
4034         possibly with an options setting. */
4035 
4036         if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4037         if (*ptr++ == CHAR_RIGHT_PARENTHESIS)
4038           {
4039           nest_depth--;  /* This is not a nested group after all. */
4040           if (top_nest > (nest_save *)(cb->start_workspace) &&
4041               (top_nest-1)->nest_depth == nest_depth) top_nest--;
4042           else top_nest->nest_depth = nest_depth;
4043           }
4044         else *parsed_pattern++ = META_NOCAPTURE;
4045 
4046         /* If nothing changed, no need to record. */
4047 
4048         if (options != oldoptions)
4049           {
4050           *parsed_pattern++ = META_OPTIONS;
4051           *parsed_pattern++ = options;
4052           }
4053         }     /* End options processing */
4054       break;  /* End default case after (? */
4055 
4056 
4057       /* ---- Python syntax support ---- */
4058 
4059       case CHAR_P:
4060       if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4061 
4062       /* (?P<name> is the same as (?<name>, which defines a named group. */
4063 
4064       if (*ptr == CHAR_LESS_THAN_SIGN)
4065         {
4066         terminator = CHAR_GREATER_THAN_SIGN;
4067         goto DEFINE_NAME;
4068         }
4069 
4070       /* (?P>name) is the same as (?&name), which is a recursion or subroutine
4071       call. */
4072 
4073       if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME;
4074 
4075       /* (?P=name) is the same as \k<name>, a back reference by name. Anything
4076       else after (?P is an error. */
4077 
4078       if (*ptr != CHAR_EQUALS_SIGN)
4079         {
4080         errorcode = ERR41;
4081         goto FAILED;
4082         }
4083       if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4084           &namelen, &errorcode, cb)) goto FAILED;
4085       *parsed_pattern++ = META_BACKREF_BYNAME;
4086       *parsed_pattern++ = namelen;
4087       PUTOFFSET(offset, parsed_pattern);
4088       okquantifier = TRUE;
4089       break;   /* End of (?P processing */
4090 
4091 
4092       /* ---- Recursion/subroutine calls by number ---- */
4093 
4094       case CHAR_R:
4095       i = 0;         /* (?R) == (?R0) */
4096       ptr++;
4097       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4098         {
4099         errorcode = ERR58;
4100         goto FAILED;
4101         }
4102       goto SET_RECURSION;
4103 
4104       /* An item starting (?- followed by a digit comes here via the "default"
4105       case because (?- followed by a non-digit is an options setting. */
4106 
4107       case CHAR_PLUS:
4108       if (ptrend - ptr < 2 || !IS_DIGIT(ptr[1]))
4109         {
4110         errorcode = ERR29;   /* Missing number */
4111         goto FAILED;
4112         }
4113       /* Fall through */
4114 
4115       case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
4116       case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
4117       RECURSION_BYNUMBER:
4118       if (!read_number(&ptr, ptrend,
4119           (IS_DIGIT(*ptr))? -1:(int)(cb->bracount), /* + and - are relative */
4120           MAX_GROUP_NUMBER, ERR61,
4121           &i, &errorcode)) goto FAILED;
4122       if (i < 0)  /* NB (?0) is permitted */
4123         {
4124         errorcode = ERR15;   /* Unknown group */
4125         goto FAILED_BACK;
4126         }
4127       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4128         goto UNCLOSED_PARENTHESIS;
4129 
4130       SET_RECURSION:
4131       *parsed_pattern++ = META_RECURSE | (uint32_t)i;
4132       offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4133       ptr++;
4134       PUTOFFSET(offset, parsed_pattern);
4135       okquantifier = TRUE;
4136       break;  /* End of recursive call by number handling */
4137 
4138 
4139       /* ---- Recursion/subroutine calls by name ---- */
4140 
4141       case CHAR_AMPERSAND:
4142       RECURSE_BY_NAME:
4143       if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4144           &namelen, &errorcode, cb)) goto FAILED;
4145       *parsed_pattern++ = META_RECURSE_BYNAME;
4146       *parsed_pattern++ = namelen;
4147       PUTOFFSET(offset, parsed_pattern);
4148       okquantifier = TRUE;
4149       break;
4150 
4151       /* ---- Callout with numerical or string argument ---- */
4152 
4153       case CHAR_C:
4154       if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4155 
4156       /* If the previous item was a condition starting (?(? an assertion,
4157       optionally preceded by a callout, is expected. This is checked later on,
4158       during actual compilation. However we need to identify this kind of
4159       assertion in this pass because it must not be qualified. The value of
4160       expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4161       for a callout - still leaving a positive value that identifies the
4162       assertion. Multiple callouts or any other items will make it zero or
4163       less, which doesn't matter because they will cause an error later. */
4164 
4165       expect_cond_assert = prev_expect_cond_assert - 1;
4166 
4167       /* If previous_callout is not NULL, it means this follows a previous
4168       callout. If it was a manual callout, do nothing; this means its "length
4169       of next pattern item" field will remain zero. If it was an automatic
4170       callout, abolish it. */
4171 
4172       if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 &&
4173           previous_callout == parsed_pattern - 4 &&
4174           parsed_pattern[-1] == 255)
4175         parsed_pattern = previous_callout;
4176 
4177       /* Save for updating next pattern item length, and skip one item before
4178       completing. */
4179 
4180       previous_callout = parsed_pattern;
4181       after_manual_callout = 1;
4182 
4183       /* Handle a string argument; specific delimiter is required. */
4184 
4185       if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
4186         {
4187         PCRE2_SIZE calloutlength;
4188         PCRE2_SPTR startptr = ptr;
4189 
4190         delimiter = 0;
4191         for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
4192           {
4193           if (*ptr == PRIV(callout_start_delims)[i])
4194             {
4195             delimiter = PRIV(callout_end_delims)[i];
4196             break;
4197             }
4198           }
4199         if (delimiter == 0)
4200           {
4201           errorcode = ERR82;
4202           goto FAILED;
4203           }
4204 
4205         *parsed_pattern = META_CALLOUT_STRING;
4206         parsed_pattern += 3;   /* Skip pattern info */
4207 
4208         for (;;)
4209           {
4210           if (++ptr >= ptrend)
4211             {
4212             errorcode = ERR81;
4213             ptr = startptr;   /* To give a more useful message */
4214             goto FAILED;
4215             }
4216           if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter))
4217             break;
4218           }
4219 
4220         calloutlength = (PCRE2_SIZE)(ptr - startptr);
4221         if (calloutlength > UINT32_MAX)
4222           {
4223           errorcode = ERR72;
4224           goto FAILED;
4225           }
4226         *parsed_pattern++ = (uint32_t)calloutlength;
4227         offset = (PCRE2_SIZE)(startptr - cb->start_pattern);
4228         PUTOFFSET(offset, parsed_pattern);
4229         }
4230 
4231       /* Handle a callout with an optional numerical argument, which must be
4232       less than or equal to 255. A missing argument gives 0. */
4233 
4234       else
4235         {
4236         int n = 0;
4237         *parsed_pattern = META_CALLOUT_NUMBER;     /* Numerical callout */
4238         parsed_pattern += 3;                       /* Skip pattern info */
4239         while (ptr < ptrend && IS_DIGIT(*ptr))
4240           {
4241           n = n * 10 + *ptr++ - CHAR_0;
4242           if (n > 255)
4243             {
4244             errorcode = ERR38;
4245             goto FAILED;
4246             }
4247           }
4248         *parsed_pattern++ = n;
4249         }
4250 
4251       /* Both formats must have a closing parenthesis */
4252 
4253       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4254         {
4255         errorcode = ERR39;
4256         goto FAILED;
4257         }
4258       ptr++;
4259 
4260       /* Remember the offset to the next item in the pattern, and set a default
4261       length. This should get updated after the next item is read. */
4262 
4263       previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
4264       previous_callout[2] = 0;
4265       break;                  /* End callout */
4266 
4267 
4268       /* ---- Conditional group ---- */
4269 
4270       /* A condition can be an assertion, a number (referring to a numbered
4271       group's having been set), a name (referring to a named group), or 'R',
4272       referring to overall recursion. R<digits> and R&name are also permitted
4273       for recursion state tests. Numbers may be preceded by + or - to specify a
4274       relative group number.
4275 
4276       There are several syntaxes for testing a named group: (?(name)) is used
4277       by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4278 
4279       There are two unfortunate ambiguities. 'R' can be the recursive thing or
4280       the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be
4281       the Perl DEFINE feature or the Python named test. We look for a name
4282       first; if not found, we try the other case.
4283 
4284       For compatibility with auto-callouts, we allow a callout to be specified
4285       before a condition that is an assertion. */
4286 
4287       case CHAR_LEFT_PARENTHESIS:
4288       if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4289       nest_depth++;
4290 
4291       /* If the next character is ? or * there must be an assertion next
4292       (optionally preceded by a callout). We do not check this here, but
4293       instead we set expect_cond_assert to 2. If this is still greater than
4294       zero (callouts decrement it) when the next assertion is read, it will be
4295       marked as a condition that must not be repeated. A value greater than
4296       zero also causes checking that an assertion (possibly with callout)
4297       follows. */
4298 
4299       if (*ptr == CHAR_QUESTION_MARK || *ptr == CHAR_ASTERISK)
4300         {
4301         *parsed_pattern++ = META_COND_ASSERT;
4302         ptr--;   /* Pull pointer back to the opening parenthesis. */
4303         expect_cond_assert = 2;
4304         break;  /* End of conditional */
4305         }
4306 
4307       /* Handle (?([+-]number)... */
4308 
4309       if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
4310           &errorcode))
4311         {
4312         if (i <= 0)
4313           {
4314           errorcode = ERR15;
4315           goto FAILED;
4316           }
4317         *parsed_pattern++ = META_COND_NUMBER;
4318         offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4319         PUTOFFSET(offset, parsed_pattern);
4320         *parsed_pattern++ = i;
4321         }
4322       else if (errorcode != 0) goto FAILED;   /* Number too big */
4323 
4324       /* No number found. Handle the special case (?(VERSION[>]=n.m)... */
4325 
4326       else if (ptrend - ptr >= 10 &&
4327                PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
4328                ptr[7] != CHAR_RIGHT_PARENTHESIS)
4329         {
4330         uint32_t ge = 0;
4331         int major = 0;
4332         int minor = 0;
4333 
4334         ptr += 7;
4335         if (*ptr == CHAR_GREATER_THAN_SIGN)
4336           {
4337           ge = 1;
4338           ptr++;
4339           }
4340 
4341         /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
4342         references its argument twice. */
4343 
4344         if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
4345           goto BAD_VERSION_CONDITION;
4346 
4347         if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode))
4348           goto FAILED;
4349 
4350         if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4351         if (*ptr == CHAR_DOT)
4352           {
4353           if (++ptr >= ptrend || !IS_DIGIT(*ptr)) goto BAD_VERSION_CONDITION;
4354           minor = (*ptr++ - CHAR_0) * 10;
4355           if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4356           if (IS_DIGIT(*ptr)) minor += *ptr++ - CHAR_0;
4357           if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4358             goto BAD_VERSION_CONDITION;
4359           }
4360 
4361         *parsed_pattern++ = META_COND_VERSION;
4362         *parsed_pattern++ = ge;
4363         *parsed_pattern++ = major;
4364         *parsed_pattern++ = minor;
4365         }
4366 
4367       /* All the remaining cases now require us to read a name. We cannot at
4368       this stage distinguish ambiguous cases such as (?(R12) which might be a
4369       recursion test by number or a name, because the named groups have not yet
4370       all been identified. Those cases are treated as names, but given a
4371       different META code. */
4372 
4373       else
4374         {
4375         BOOL was_r_ampersand = FALSE;
4376 
4377         if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND)
4378           {
4379           terminator = CHAR_RIGHT_PARENTHESIS;
4380           was_r_ampersand = TRUE;
4381           ptr++;
4382           }
4383         else if (*ptr == CHAR_LESS_THAN_SIGN)
4384           terminator = CHAR_GREATER_THAN_SIGN;
4385         else if (*ptr == CHAR_APOSTROPHE)
4386           terminator = CHAR_APOSTROPHE;
4387         else
4388           {
4389           terminator = CHAR_RIGHT_PARENTHESIS;
4390           ptr--;   /* Point to char before name */
4391           }
4392         if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4393             &errorcode, cb)) goto FAILED;
4394 
4395         /* Handle (?(R&name) */
4396 
4397         if (was_r_ampersand)
4398           {
4399           *parsed_pattern = META_COND_RNAME;
4400           ptr--;   /* Back to closing parens */
4401           }
4402 
4403         /* Handle (?(name). If the name is "DEFINE" we identify it with a
4404         special code. Likewise if the name consists of R followed only by
4405         digits. Otherwise, handle it like a quoted name. */
4406 
4407         else if (terminator == CHAR_RIGHT_PARENTHESIS)
4408           {
4409           if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
4410             *parsed_pattern = META_COND_DEFINE;
4411           else
4412             {
4413             for (i = 1; i < (int)namelen; i++)
4414               if (!IS_DIGIT(name[i])) break;
4415             *parsed_pattern = (*name == CHAR_R && i >= (int)namelen)?
4416               META_COND_RNUMBER : META_COND_NAME;
4417             }
4418           ptr--;   /* Back to closing parens */
4419           }
4420 
4421         /* Handle (?('name') or (?(<name>) */
4422 
4423         else *parsed_pattern = META_COND_NAME;
4424 
4425         /* All these cases except DEFINE end with the name length and offset;
4426         DEFINE just has an offset (for the "too many branches" error). */
4427 
4428         if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen;
4429         PUTOFFSET(offset, parsed_pattern);
4430         }  /* End cases that read a name */
4431 
4432       /* Check the closing parenthesis of the condition */
4433 
4434       if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4435         {
4436         errorcode = ERR24;
4437         goto FAILED;
4438         }
4439       ptr++;
4440       break;  /* End of condition processing */
4441 
4442 
4443       /* ---- Atomic group ---- */
4444 
4445       case CHAR_GREATER_THAN_SIGN:
4446       ATOMIC_GROUP:                          /* Come from (*atomic: */
4447       *parsed_pattern++ = META_ATOMIC;
4448       nest_depth++;
4449       ptr++;
4450       break;
4451 
4452 
4453       /* ---- Lookahead assertions ---- */
4454 
4455       case CHAR_EQUALS_SIGN:
4456       POSITIVE_LOOK_AHEAD:                   /* Come from (*pla: */
4457       *parsed_pattern++ = META_LOOKAHEAD;
4458       ptr++;
4459       goto POST_ASSERTION;
4460 
4461       case CHAR_ASTERISK:
4462       POSITIVE_NONATOMIC_LOOK_AHEAD:         /* Come from (?* */
4463       *parsed_pattern++ = META_LOOKAHEAD_NA;
4464       ptr++;
4465       goto POST_ASSERTION;
4466 
4467       case CHAR_EXCLAMATION_MARK:
4468       NEGATIVE_LOOK_AHEAD:                   /* Come from (*nla: */
4469       *parsed_pattern++ = META_LOOKAHEADNOT;
4470       ptr++;
4471       goto POST_ASSERTION;
4472 
4473 
4474       /* ---- Lookbehind assertions ---- */
4475 
4476       /* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?<
4477       is the start of the name of a capturing group. */
4478 
4479       case CHAR_LESS_THAN_SIGN:
4480       if (ptrend - ptr <= 1 ||
4481          (ptr[1] != CHAR_EQUALS_SIGN &&
4482           ptr[1] != CHAR_EXCLAMATION_MARK &&
4483           ptr[1] != CHAR_ASTERISK))
4484         {
4485         terminator = CHAR_GREATER_THAN_SIGN;
4486         goto DEFINE_NAME;
4487         }
4488       *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
4489         META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)?
4490         META_LOOKBEHINDNOT : META_LOOKBEHIND_NA;
4491 
4492       POST_LOOKBEHIND:           /* Come from (*plb: (*naplb: and (*nlb: */
4493       *has_lookbehind = TRUE;
4494       offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4495       PUTOFFSET(offset, parsed_pattern);
4496       ptr += 2;
4497       /* Fall through */
4498 
4499       /* If the previous item was a condition starting (?(? an assertion,
4500       optionally preceded by a callout, is expected. This is checked later on,
4501       during actual compilation. However we need to identify this kind of
4502       assertion in this pass because it must not be qualified. The value of
4503       expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4504       for a callout - still leaving a positive value that identifies the
4505       assertion. Multiple callouts or any other items will make it zero or
4506       less, which doesn't matter because they will cause an error later. */
4507 
4508       POST_ASSERTION:
4509       nest_depth++;
4510       if (prev_expect_cond_assert > 0)
4511         {
4512         if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4513         else if (++top_nest >= end_nests)
4514           {
4515           errorcode = ERR84;
4516           goto FAILED;
4517           }
4518         top_nest->nest_depth = nest_depth;
4519         top_nest->flags = NSF_CONDASSERT;
4520         top_nest->options = options & PARSE_TRACKED_OPTIONS;
4521         }
4522       break;
4523 
4524 
4525       /* ---- Define a named group ---- */
4526 
4527       /* A named group may be defined as (?'name') or (?<name>). In the latter
4528       case we jump to DEFINE_NAME from the disambiguation of (?< above with the
4529       terminator set to '>'. */
4530 
4531       case CHAR_APOSTROPHE:
4532       terminator = CHAR_APOSTROPHE;    /* Terminator */
4533 
4534       DEFINE_NAME:
4535       if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4536           &errorcode, cb)) goto FAILED;
4537 
4538       /* We have a name for this capturing group. It is also assigned a number,
4539       which is its primary means of identification. */
4540 
4541       if (cb->bracount >= MAX_GROUP_NUMBER)
4542         {
4543         errorcode = ERR97;
4544         goto FAILED;
4545         }
4546       cb->bracount++;
4547       *parsed_pattern++ = META_CAPTURE | cb->bracount;
4548       nest_depth++;
4549 
4550       /* Check not too many names */
4551 
4552       if (cb->names_found >= MAX_NAME_COUNT)
4553         {
4554         errorcode = ERR49;
4555         goto FAILED;
4556         }
4557 
4558       /* Adjust the entry size to accommodate the longest name found. */
4559 
4560       if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
4561         cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
4562 
4563       /* Scan the list to check for duplicates. For duplicate names, if the
4564       number is the same, break the loop, which causes the name to be
4565       discarded; otherwise, if DUPNAMES is not set, give an error.
4566       If it is set, allow the name with a different number, but continue
4567       scanning in case this is a duplicate with the same number. For
4568       non-duplicate names, give an error if the number is duplicated. */
4569 
4570       isdupname = FALSE;
4571       ng = cb->named_groups;
4572       for (i = 0; i < cb->names_found; i++, ng++)
4573         {
4574         if (namelen == ng->length &&
4575             PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0)
4576           {
4577           if (ng->number == cb->bracount) break;
4578           if ((options & PCRE2_DUPNAMES) == 0)
4579             {
4580             errorcode = ERR43;
4581             goto FAILED;
4582             }
4583           isdupname = ng->isdup = TRUE;     /* Mark as a duplicate */
4584           cb->dupnames = TRUE;              /* Duplicate names exist */
4585           }
4586         else if (ng->number == cb->bracount)
4587           {
4588           errorcode = ERR65;
4589           goto FAILED;
4590           }
4591         }
4592 
4593       if (i < cb->names_found) break;   /* Ignore duplicate with same number */
4594 
4595       /* Increase the list size if necessary */
4596 
4597       if (cb->names_found >= cb->named_group_list_size)
4598         {
4599         uint32_t newsize = cb->named_group_list_size * 2;
4600         named_group *newspace =
4601           cb->cx->memctl.malloc(newsize * sizeof(named_group),
4602           cb->cx->memctl.memory_data);
4603         if (newspace == NULL)
4604           {
4605           errorcode = ERR21;
4606           goto FAILED;
4607           }
4608 
4609         memcpy(newspace, cb->named_groups,
4610           cb->named_group_list_size * sizeof(named_group));
4611         if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
4612           cb->cx->memctl.free((void *)cb->named_groups,
4613           cb->cx->memctl.memory_data);
4614         cb->named_groups = newspace;
4615         cb->named_group_list_size = newsize;
4616         }
4617 
4618       /* Add this name to the list */
4619 
4620       cb->named_groups[cb->names_found].name = name;
4621       cb->named_groups[cb->names_found].length = (uint16_t)namelen;
4622       cb->named_groups[cb->names_found].number = cb->bracount;
4623       cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname;
4624       cb->names_found++;
4625       break;
4626       }        /* End of (? switch */
4627     break;     /* End of ( handling */
4628 
4629 
4630     /* ---- Branch terminators ---- */
4631 
4632     /* Alternation: reset the capture count if we are in a (?| group. */
4633 
4634     case CHAR_VERTICAL_LINE:
4635     if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
4636         (top_nest->flags & NSF_RESET) != 0)
4637       {
4638       if (cb->bracount > top_nest->max_group)
4639         top_nest->max_group = (uint16_t)cb->bracount;
4640       cb->bracount = top_nest->reset_group;
4641       }
4642     *parsed_pattern++ = META_ALT;
4643     break;
4644 
4645     /* End of group; reset the capture count to the maximum if we are in a (?|
4646     group and/or reset the options that are tracked during parsing. Disallow
4647     quantifier for a condition that is an assertion. */
4648 
4649     case CHAR_RIGHT_PARENTHESIS:
4650     okquantifier = TRUE;
4651     if (top_nest != NULL && top_nest->nest_depth == nest_depth)
4652       {
4653       options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options;
4654       if ((top_nest->flags & NSF_RESET) != 0 &&
4655           top_nest->max_group > cb->bracount)
4656         cb->bracount = top_nest->max_group;
4657       if ((top_nest->flags & NSF_CONDASSERT) != 0)
4658         okquantifier = FALSE;
4659 
4660       if ((top_nest->flags & NSF_ATOMICSR) != 0)
4661         {
4662         *parsed_pattern++ = META_KET;
4663         }
4664 
4665       if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
4666         else top_nest--;
4667       }
4668     if (nest_depth == 0)    /* Unmatched closing parenthesis */
4669       {
4670       errorcode = ERR22;
4671       goto FAILED_BACK;
4672       }
4673     nest_depth--;
4674     *parsed_pattern++ = META_KET;
4675     break;
4676     }  /* End of switch on pattern character */
4677   }    /* End of main character scan loop */
4678 
4679 /* End of pattern reached. Check for missing ) at the end of a verb name. */
4680 
4681 if (inverbname && ptr >= ptrend)
4682   {
4683   errorcode = ERR60;
4684   goto FAILED;
4685   }
4686 
4687 /* Manage callout for the final item */
4688 
4689 PARSED_END:
4690 parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
4691   parsed_pattern, cb);
4692 
4693 /* Insert trailing items for word and line matching (features provided for the
4694 benefit of pcre2grep). */
4695 
4696 if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
4697   {
4698   *parsed_pattern++ = META_KET;
4699   *parsed_pattern++ = META_DOLLAR;
4700   }
4701 else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
4702   {
4703   *parsed_pattern++ = META_KET;
4704   *parsed_pattern++ = META_ESCAPE + ESC_b;
4705   }
4706 
4707 /* Terminate the parsed pattern, then return success if all groups are closed.
4708 Otherwise we have unclosed parentheses. */
4709 
4710 if (parsed_pattern >= parsed_pattern_end)
4711   {
4712   errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
4713   goto FAILED;
4714   }
4715 
4716 *parsed_pattern = META_END;
4717 if (nest_depth == 0) return 0;
4718 
4719 UNCLOSED_PARENTHESIS:
4720 errorcode = ERR14;
4721 
4722 /* Come here for all failures. */
4723 
4724 FAILED:
4725 cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4726 return errorcode;
4727 
4728 /* Some errors need to indicate the previous character. */
4729 
4730 FAILED_BACK:
4731 ptr--;
4732 goto FAILED;
4733 
4734 /* This failure happens several times. */
4735 
4736 BAD_VERSION_CONDITION:
4737 errorcode = ERR79;
4738 goto FAILED;
4739 }
4740 
4741 
4742 
4743 /*************************************************
4744 *       Find first significant opcode            *
4745 *************************************************/
4746 
4747 /* This is called by several functions that scan a compiled expression looking
4748 for a fixed first character, or an anchoring opcode etc. It skips over things
4749 that do not influence this. For some calls, it makes sense to skip negative
4750 forward and all backward assertions, and also the \b assertion; for others it
4751 does not.
4752 
4753 Arguments:
4754   code         pointer to the start of the group
4755   skipassert   TRUE if certain assertions are to be skipped
4756 
4757 Returns:       pointer to the first significant opcode
4758 */
4759 
4760 static const PCRE2_UCHAR*
first_significant_code(PCRE2_SPTR code,BOOL skipassert)4761 first_significant_code(PCRE2_SPTR code, BOOL skipassert)
4762 {
4763 for (;;)
4764   {
4765   switch ((int)*code)
4766     {
4767     case OP_ASSERT_NOT:
4768     case OP_ASSERTBACK:
4769     case OP_ASSERTBACK_NOT:
4770     case OP_ASSERTBACK_NA:
4771     if (!skipassert) return code;
4772     do code += GET(code, 1); while (*code == OP_ALT);
4773     code += PRIV(OP_lengths)[*code];
4774     break;
4775 
4776     case OP_WORD_BOUNDARY:
4777     case OP_NOT_WORD_BOUNDARY:
4778     if (!skipassert) return code;
4779     /* Fall through */
4780 
4781     case OP_CALLOUT:
4782     case OP_CREF:
4783     case OP_DNCREF:
4784     case OP_RREF:
4785     case OP_DNRREF:
4786     case OP_FALSE:
4787     case OP_TRUE:
4788     code += PRIV(OP_lengths)[*code];
4789     break;
4790 
4791     case OP_CALLOUT_STR:
4792     code += GET(code, 1 + 2*LINK_SIZE);
4793     break;
4794 
4795     case OP_SKIPZERO:
4796     code += 2 + GET(code, 2) + LINK_SIZE;
4797     break;
4798 
4799     case OP_COND:
4800     case OP_SCOND:
4801     if (code[1+LINK_SIZE] != OP_FALSE ||   /* Not DEFINE */
4802         code[GET(code, 1)] != OP_KET)      /* More than one branch */
4803       return code;
4804     code += GET(code, 1) + 1 + LINK_SIZE;
4805     break;
4806 
4807     case OP_MARK:
4808     case OP_COMMIT_ARG:
4809     case OP_PRUNE_ARG:
4810     case OP_SKIP_ARG:
4811     case OP_THEN_ARG:
4812     code += code[1] + PRIV(OP_lengths)[*code];
4813     break;
4814 
4815     default:
4816     return code;
4817     }
4818   }
4819 /* Control never reaches here */
4820 }
4821 
4822 
4823 
#ifdef SUPPORT_UNICODE
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given the start and end of a class range in UCP mode, this function walks
upwards through the characters looking for runs of characters in the "other"
case. Each call yields the next such run and moves the start value on, so that
repeated calls work through the whole input range. A character that has
multiple other cases is handed back on its own, signalled by a positive return
value.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        -1 when no more
               0 when a range is returned
              >0 the CASESET offset for char with multiple other cases
                in this case, ocptr contains the original
*/

static int
get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr,
  uint32_t *odptr)
{
uint32_t ch = *cptr;
uint32_t oc_first;
uint32_t expected;
unsigned int caseset;

/* Look for the first character that has any other case at all. One that
belongs to a caseless set is reported immediately by its set offset. */

for (;;)
  {
  if (ch > d) return -1;          /* Reached end of range */

  caseset = UCD_CASESET(ch);
  if (caseset != 0)
    {
    *ocptr = ch;                  /* Character that has the set */
    *cptr = ch + 1;               /* Rest of input range */
    return (int)caseset;
    }

  oc_first = UCD_OTHERCASE(ch);
  if (oc_first != ch) break;      /* Has exactly one other case */
  ch++;
  }

/* A character with a single other case has been found. The run continues for
as long as each following input character has no caseless set and its other
case is exactly one greater than that of its predecessor; it also stops at the
end of the input range. */

*ocptr = oc_first;
expected = oc_first + 1;
ch++;

while (ch <= d && UCD_CASESET(ch) == 0 && UCD_OTHERCASE(ch) == expected)
  {
  expected++;
  ch++;
  }

*odptr = expected - 1;            /* End of othercase range */
*cptr = ch;                       /* Rest of input range */
return 0;
}
#endif  /* SUPPORT_UNICODE */
4887 
4888 
4889 
4890 /*************************************************
4891 * Add a character or range to a class (internal) *
4892 *************************************************/
4893 
4894 /* This function packages up the logic of adding a character or range of
4895 characters to a class. The character values in the arguments will be within the
4896 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
4897 called only from within the "add to class" group of functions, some of which
4898 are recursive and mutually recursive. The external entry point is
4899 add_to_class().
4900 
4901 Arguments:
4902   classbits     the bit map for characters < 256
4903   uchardptr     points to the pointer for extra data
4904   options       the options word
4905   cb            compile data
4906   start         start of range character
4907   end           end of range character
4908 
4909 Returns:        the number of < 256 characters added
4910                 the pointer to extra data is updated
4911 */
4912 
4913 static unsigned int
add_to_class_internal(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,uint32_t start,uint32_t end)4914 add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
4915   uint32_t options, compile_block *cb, uint32_t start, uint32_t end)
4916 {
4917 uint32_t c;
4918 uint32_t classbits_end = (end <= 0xff ? end : 0xff);
4919 unsigned int n8 = 0;
4920 
4921 /* If caseless matching is required, scan the range and process alternate
4922 cases. In Unicode, there are 8-bit characters that have alternate cases that
4923 are greater than 255 and vice-versa. Sometimes we can just extend the original
4924 range. */
4925 
4926 if ((options & PCRE2_CASELESS) != 0)
4927   {
4928 #ifdef SUPPORT_UNICODE
4929   if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
4930     {
4931     int rc;
4932     uint32_t oc, od;
4933 
4934     options &= ~PCRE2_CASELESS;   /* Remove for recursive calls */
4935     c = start;
4936 
4937     while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
4938       {
4939       /* Handle a single character that has more than one other case. */
4940 
4941       if (rc > 0) n8 += add_list_to_class_internal(classbits, uchardptr, options, cb,
4942         PRIV(ucd_caseless_sets) + rc, oc);
4943 
4944       /* Do nothing if the other case range is within the original range. */
4945 
4946       else if (oc >= cb->class_range_start && od <= cb->class_range_end) continue;
4947 
4948       /* Extend the original range if there is overlap, noting that if oc < c, we
4949       can't have od > end because a subrange is always shorter than the basic
4950       range. Otherwise, use a recursive call to add the additional range. */
4951 
4952       else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
4953       else if (od > end && oc <= end + 1)
4954         {
4955         end = od;       /* Extend upwards */
4956         if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
4957         }
4958       else n8 += add_to_class_internal(classbits, uchardptr, options, cb, oc, od);
4959       }
4960     }
4961   else
4962 #endif  /* SUPPORT_UNICODE */
4963 
4964   /* Not UTF mode */
4965 
4966   for (c = start; c <= classbits_end; c++)
4967     {
4968     SETBIT(classbits, cb->fcc[c]);
4969     n8++;
4970     }
4971   }
4972 
4973 /* Now handle the originally supplied range. Adjust the final value according
4974 to the bit length - this means that the same lists of (e.g.) horizontal spaces
4975 can be used in all cases. */
4976 
4977 if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR)
4978   end = MAX_NON_UTF_CHAR;
4979 
4980 if (start > cb->class_range_start && end < cb->class_range_end) return n8;
4981 
4982 /* Use the bitmap for characters < 256. Otherwise use extra data.*/
4983 
4984 for (c = start; c <= classbits_end; c++)
4985   {
4986   /* Regardless of start, c will always be <= 255. */
4987   SETBIT(classbits, c);
4988   n8++;
4989   }
4990 
4991 #ifdef SUPPORT_WIDE_CHARS
4992 if (start <= 0xff) start = 0xff + 1;
4993 
4994 if (end >= start)
4995   {
4996   PCRE2_UCHAR *uchardata = *uchardptr;
4997 
4998 #ifdef SUPPORT_UNICODE
4999   if ((options & PCRE2_UTF) != 0)
5000     {
5001     if (start < end)
5002       {
5003       *uchardata++ = XCL_RANGE;
5004       uchardata += PRIV(ord2utf)(start, uchardata);
5005       uchardata += PRIV(ord2utf)(end, uchardata);
5006       }
5007     else if (start == end)
5008       {
5009       *uchardata++ = XCL_SINGLE;
5010       uchardata += PRIV(ord2utf)(start, uchardata);
5011       }
5012     }
5013   else
5014 #endif  /* SUPPORT_UNICODE */
5015 
5016   /* Without UTF support, character values are constrained by the bit length,
5017   and can only be > 256 for 16-bit and 32-bit libraries. */
5018 
5019 #if PCRE2_CODE_UNIT_WIDTH == 8
5020     {}
5021 #else
5022   if (start < end)
5023     {
5024     *uchardata++ = XCL_RANGE;
5025     *uchardata++ = start;
5026     *uchardata++ = end;
5027     }
5028   else if (start == end)
5029     {
5030     *uchardata++ = XCL_SINGLE;
5031     *uchardata++ = start;
5032     }
5033 #endif  /* PCRE2_CODE_UNIT_WIDTH == 8 */
5034   *uchardptr = uchardata;   /* Updata extra data pointer */
5035   }
5036 #else  /* SUPPORT_WIDE_CHARS */
5037   (void)uchardptr;          /* Avoid compiler warning */
5038 #endif /* SUPPORT_WIDE_CHARS */
5039 
5040 return n8;    /* Number of 8-bit characters */
5041 }
5042 
5043 
5044 
5045 #ifdef SUPPORT_UNICODE
5046 /*************************************************
5047 * Add a list of characters to a class (internal) *
5048 *************************************************/
5049 
5050 /* This function is used for adding a list of case-equivalent characters to a
5051 class when in UTF mode. This function is called only from within
5052 add_to_class_internal(), with which it is mutually recursive.
5053 
5054 Arguments:
5055   classbits     the bit map for characters < 256
5056   uchardptr     points to the pointer for extra data
5057   options       the options word
5058   cb            contains pointers to tables etc.
5059   p             points to row of 32-bit values, terminated by NOTACHAR
5060   except        character to omit; this is used when adding lists of
5061                   case-equivalent characters to avoid including the one we
5062                   already know about
5063 
5064 Returns:        the number of < 256 characters added
5065                 the pointer to extra data is updated
5066 */
5067 
5068 static unsigned int
add_list_to_class_internal(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,const uint32_t * p,unsigned int except)5069 add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5070   uint32_t options, compile_block *cb, const uint32_t *p, unsigned int except)
5071 {
5072 unsigned int n8 = 0;
5073 while (p[0] < NOTACHAR)
5074   {
5075   unsigned int n = 0;
5076   if (p[0] != except)
5077     {
5078     while(p[n+1] == p[0] + n + 1) n++;
5079     n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]);
5080     }
5081   p += n + 1;
5082   }
5083 return n8;
5084 }
5085 #endif
5086 
5087 
5088 
5089 /*************************************************
5090 *   External entry point for add range to class  *
5091 *************************************************/
5092 
5093 /* This function sets the overall range so that the internal functions can try
5094 to avoid duplication when handling case-independence.
5095 
5096 Arguments:
5097   classbits     the bit map for characters < 256
5098   uchardptr     points to the pointer for extra data
5099   options       the options word
5100   cb            compile data
5101   start         start of range character
5102   end           end of range character
5103 
5104 Returns:        the number of < 256 characters added
5105                 the pointer to extra data is updated
5106 */
5107 
5108 static unsigned int
add_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,uint32_t start,uint32_t end)5109 add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5110   compile_block *cb, uint32_t start, uint32_t end)
5111 {
5112 cb->class_range_start = start;
5113 cb->class_range_end = end;
5114 return add_to_class_internal(classbits, uchardptr, options, cb, start, end);
5115 }
5116 
5117 
5118 /*************************************************
5119 *   External entry point for add list to class   *
5120 *************************************************/
5121 
5122 /* This function is used for adding a list of horizontal or vertical whitespace
5123 characters to a class. The list must be in order so that ranges of characters
5124 can be detected and handled appropriately. This function sets the overall range
5125 so that the internal functions can try to avoid duplication when handling
5126 case-independence.
5127 
5128 Arguments:
5129   classbits     the bit map for characters < 256
5130   uchardptr     points to the pointer for extra data
5131   options       the options word
5132   cb            contains pointers to tables etc.
5133   p             points to row of 32-bit values, terminated by NOTACHAR
5134   except        character to omit; this is used when adding lists of
5135                   case-equivalent characters to avoid including the one we
5136                   already know about
5137 
5138 Returns:        the number of < 256 characters added
5139                 the pointer to extra data is updated
5140 */
5141 
5142 static unsigned int
add_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,const uint32_t * p,unsigned int except)5143 add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5144   compile_block *cb, const uint32_t *p, unsigned int except)
5145 {
5146 unsigned int n8 = 0;
5147 while (p[0] < NOTACHAR)
5148   {
5149   unsigned int n = 0;
5150   if (p[0] != except)
5151     {
5152     while(p[n+1] == p[0] + n + 1) n++;
5153     cb->class_range_start = p[0];
5154     cb->class_range_end = p[n];
5155     n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]);
5156     }
5157   p += n + 1;
5158   }
5159 return n8;
5160 }
5161 
5162 
5163 
5164 /*************************************************
5165 *    Add characters not in a list to a class     *
5166 *************************************************/
5167 
5168 /* This function is used for adding the complement of a list of horizontal or
5169 vertical whitespace to a class. The list must be in order.
5170 
5171 Arguments:
5172   classbits     the bit map for characters < 256
5173   uchardptr     points to the pointer for extra data
5174   options       the options word
5175   cb            contains pointers to tables etc.
5176   p             points to row of 32-bit values, terminated by NOTACHAR
5177 
5178 Returns:        the number of < 256 characters added
5179                 the pointer to extra data is updated
5180 */
5181 
5182 static unsigned int
add_not_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,const uint32_t * p)5183 add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5184   uint32_t options, compile_block *cb, const uint32_t *p)
5185 {
5186 BOOL utf = (options & PCRE2_UTF) != 0;
5187 unsigned int n8 = 0;
5188 if (p[0] > 0)
5189   n8 += add_to_class(classbits, uchardptr, options, cb, 0, p[0] - 1);
5190 while (p[0] < NOTACHAR)
5191   {
5192   while (p[1] == p[0] + 1) p++;
5193   n8 += add_to_class(classbits, uchardptr, options, cb, p[0] + 1,
5194     (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
5195   p++;
5196   }
5197 return n8;
5198 }
5199 
5200 
5201 
5202 /*************************************************
5203 *    Find details of duplicate group names       *
5204 *************************************************/
5205 
5206 /* This is called from compile_branch() when it needs to know the index and
5207 count of duplicates in the names table when processing named backreferences,
5208 either directly, or as conditions.
5209 
5210 Arguments:
5211   name          points to the name
5212   length        the length of the name
5213   indexptr      where to put the index
5214   countptr      where to put the count of duplicates
5215   errorcodeptr  where to put an error code
5216   cb            the compile block
5217 
5218 Returns:        TRUE if OK, FALSE if not, error code set
5219 */
5220 
5221 static BOOL
find_dupname_details(PCRE2_SPTR name,uint32_t length,int * indexptr,int * countptr,int * errorcodeptr,compile_block * cb)5222 find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr,
5223   int *countptr, int *errorcodeptr, compile_block *cb)
5224 {
5225 uint32_t i, groupnumber;
5226 int count;
5227 PCRE2_UCHAR *slot = cb->name_table;
5228 
5229 /* Find the first entry in the table */
5230 
5231 for (i = 0; i < cb->names_found; i++)
5232   {
5233   if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) == 0 &&
5234       slot[IMM2_SIZE+length] == 0) break;
5235   slot += cb->name_entry_size;
5236   }
5237 
5238 /* This should not occur, because this function is called only when we know we
5239 have duplicate names. Give an internal error. */
5240 
5241 if (i >= cb->names_found)
5242   {
5243   *errorcodeptr = ERR53;
5244   cb->erroroffset = name - cb->start_pattern;
5245   return FALSE;
5246   }
5247 
5248 /* Record the index and then see how many duplicates there are, updating the
5249 backref map and maximum back reference as we do. */
5250 
5251 *indexptr = i;
5252 count = 0;
5253 
5254 for (;;)
5255   {
5256   count++;
5257   groupnumber = GET2(slot,0);
5258   cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
5259   if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
5260   if (++i >= cb->names_found) break;
5261   slot += cb->name_entry_size;
5262   if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) != 0 ||
5263     (slot+IMM2_SIZE)[length] != 0) break;
5264   }
5265 
5266 *countptr = count;
5267 return TRUE;
5268 }
5269 
5270 
5271 
5272 /*************************************************
5273 *           Compile one branch                   *
5274 *************************************************/
5275 
5276 /* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If
5277 the options are changed during the branch, the pointer is used to change the
5278 external options bits. This function is used during the pre-compile phase when
5279 we are trying to find out the amount of memory needed, as well as during the
5280 real compile phase. The value of lengthptr distinguishes the two phases.
5281 
5282 Arguments:
5283   optionsptr        pointer to the option bits
5284   codeptr           points to the pointer to the current code point
5285   pptrptr           points to the current parsed pattern pointer
5286   errorcodeptr      points to error code variable
5287   firstcuptr        place to put the first required code unit
5288   firstcuflagsptr   place to put the first code unit flags, or a negative number
5289   reqcuptr          place to put the last required code unit
5290   reqcuflagsptr     place to put the last required code unit flags, or a negative number
5291   bcptr             points to current branch chain
5292   cb                contains pointers to tables etc.
5293   lengthptr         NULL during the real compile phase
5294                     points to length accumulator during pre-compile phase
5295 
5296 Returns:            0 There's been an error, *errorcodeptr is non-zero
5297                    +1 Success, this branch must match at least one character
5298                    -1 Success, this branch may match an empty string
5299 */
5300 
5301 static int
compile_branch(uint32_t * optionsptr,PCRE2_UCHAR ** codeptr,uint32_t ** pptrptr,int * errorcodeptr,uint32_t * firstcuptr,int32_t * firstcuflagsptr,uint32_t * reqcuptr,int32_t * reqcuflagsptr,branch_chain * bcptr,compile_block * cb,PCRE2_SIZE * lengthptr)5302 compile_branch(uint32_t *optionsptr, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
5303   int *errorcodeptr, uint32_t *firstcuptr, int32_t *firstcuflagsptr,
5304   uint32_t *reqcuptr, int32_t *reqcuflagsptr, branch_chain *bcptr,
5305   compile_block *cb, PCRE2_SIZE *lengthptr)
5306 {
5307 int bravalue = 0;
5308 int okreturn = -1;
5309 int group_return = 0;
5310 uint32_t repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
5311 uint32_t greedy_default, greedy_non_default;
5312 uint32_t repeat_type, op_type;
5313 uint32_t options = *optionsptr;               /* May change dynamically */
5314 uint32_t firstcu, reqcu;
5315 uint32_t zeroreqcu, zerofirstcu;
5316 uint32_t escape;
5317 uint32_t *pptr = *pptrptr;
5318 uint32_t meta, meta_arg;
5319 int32_t firstcuflags, reqcuflags;
5320 int32_t zeroreqcuflags, zerofirstcuflags;
5321 int32_t req_caseopt, reqvary, tempreqvary;
5322 PCRE2_SIZE offset = 0;
5323 PCRE2_SIZE length_prevgroup = 0;
5324 PCRE2_UCHAR *code = *codeptr;
5325 PCRE2_UCHAR *last_code = code;
5326 PCRE2_UCHAR *orig_code = code;
5327 PCRE2_UCHAR *tempcode;
5328 PCRE2_UCHAR *previous = NULL;
5329 PCRE2_UCHAR op_previous;
5330 BOOL groupsetfirstcu = FALSE;
5331 BOOL had_accept = FALSE;
5332 BOOL matched_char = FALSE;
5333 BOOL previous_matched_char = FALSE;
5334 BOOL reset_caseful = FALSE;
5335 const uint8_t *cbits = cb->cbits;
5336 uint8_t classbits[32];
5337 
5338 /* We can fish out the UTF setting once and for all into a BOOL, but we must
5339 not do this for other options (e.g. PCRE2_EXTENDED) because they may change
5340 dynamically as we process the pattern. */
5341 
5342 #ifdef SUPPORT_UNICODE
5343 BOOL utf = (options & PCRE2_UTF) != 0;
5344 BOOL ucp = (options & PCRE2_UCP) != 0;
5345 #else  /* No Unicode support */
5346 BOOL utf = FALSE;
5347 #endif
5348 
5349 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
5350 class_uchardata always so that it can be passed to add_to_class() always,
5351 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
5352 alternative calls for the different cases. */
5353 
5354 PCRE2_UCHAR *class_uchardata;
5355 #ifdef SUPPORT_WIDE_CHARS
5356 BOOL xclass;
5357 PCRE2_UCHAR *class_uchardata_base;
5358 #endif
5359 
5360 /* Set up the default and non-default settings for greediness */
5361 
5362 greedy_default = ((options & PCRE2_UNGREEDY) != 0);
5363 greedy_non_default = greedy_default ^ 1;
5364 
5365 /* Initialize no first unit, no required unit. REQ_UNSET means "no char
5366 matching encountered yet". It gets changed to REQ_NONE if we hit something that
5367 matches a non-fixed first unit; reqcu just remains unset if we never find one.
5368 
5369 When we hit a repeat whose minimum is zero, we may have to adjust these values
5370 to take the zero repeat into account. This is implemented by setting them to
5371 zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
5372 item types that can be repeated set these backoff variables appropriately. */
5373 
5374 firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
5375 firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
5376 
5377 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
5378 according to the current setting of the caseless flag. The REQ_CASELESS value
5379 leaves the lower 28 bit empty. It is added into the firstcu or reqcu variables
5380 to record the case status of the value. This is used only for ASCII characters.
5381 */
5382 
5383 req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS:0;
5384 
5385 /* Switch on next META item until the end of the branch */
5386 
5387 for (;; pptr++)
5388   {
5389 #ifdef SUPPORT_WIDE_CHARS
5390   BOOL xclass_has_prop;
5391 #endif
5392   BOOL negate_class;
5393   BOOL should_flip_negation;
5394   BOOL match_all_or_no_wide_chars;
5395   BOOL possessive_quantifier;
5396   BOOL note_group_empty;
5397   int class_has_8bitchar;
5398   int i;
5399   uint32_t mclength;
5400   uint32_t skipunits;
5401   uint32_t subreqcu, subfirstcu;
5402   uint32_t groupnumber;
5403   uint32_t verbarglen, verbculen;
5404   int32_t subreqcuflags, subfirstcuflags;  /* Must be signed */
5405   open_capitem *oc;
5406   PCRE2_UCHAR mcbuffer[8];
5407 
5408   /* Get next META item in the pattern and its potential argument. */
5409 
5410   meta = META_CODE(*pptr);
5411   meta_arg = META_DATA(*pptr);
5412 
5413   /* If we are in the pre-compile phase, accumulate the length used for the
5414   previous cycle of this loop, unless the next item is a quantifier. */
5415 
5416   if (lengthptr != NULL)
5417     {
5418     if (code > cb->start_workspace + cb->workspace_size -
5419         WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
5420       {
5421       *errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)?
5422         ERR52 : ERR86;
5423       return 0;
5424       }
5425 
5426     /* There is at least one situation where code goes backwards: this is the
5427     case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier
5428     is processed, the whole class is eliminated. However, it is created first,
5429     so we have to allow memory for it. Therefore, don't ever reduce the length
5430     at this point. */
5431 
5432     if (code < last_code) code = last_code;
5433 
5434     /* If the next thing is not a quantifier, we add the length of the previous
5435     item into the total, and reset the code pointer to the start of the
5436     workspace. Otherwise leave the previous item available to be quantified. */
5437 
5438     if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5439       {
5440       if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))
5441         {
5442         *errorcodeptr = ERR20;   /* Integer overflow */
5443         return 0;
5444         }
5445       *lengthptr += (PCRE2_SIZE)(code - orig_code);
5446       if (*lengthptr > MAX_PATTERN_SIZE)
5447         {
5448         *errorcodeptr = ERR20;   /* Pattern is too large */
5449         return 0;
5450         }
5451       code = orig_code;
5452       }
5453 
5454     /* Remember where this code item starts so we can catch the "backwards"
5455     case above next time round. */
5456 
5457     last_code = code;
5458     }
5459 
5460   /* Process the next parsed pattern item. If it is not a quantifier, remember
5461   where it starts so that it can be quantified when a quantifier follows.
5462   Checking for the legality of quantifiers happens in parse_regex(), except for
5463   a quantifier after an assertion that is a condition. */
5464 
5465   if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5466     {
5467     previous = code;
5468     if (matched_char && !had_accept) okreturn = 1;
5469     }
5470 
5471   previous_matched_char = matched_char;
5472   matched_char = FALSE;
5473   note_group_empty = FALSE;
5474   skipunits = 0;         /* Default value for most subgroups */
5475 
5476   switch(meta)
5477     {
5478     /* ===================================================================*/
5479     /* The branch terminates at pattern end or | or ) */
5480 
5481     case META_END:
5482     case META_ALT:
5483     case META_KET:
5484     *firstcuptr = firstcu;
5485     *firstcuflagsptr = firstcuflags;
5486     *reqcuptr = reqcu;
5487     *reqcuflagsptr = reqcuflags;
5488     *codeptr = code;
5489     *pptrptr = pptr;
5490     return okreturn;
5491 
5492 
5493     /* ===================================================================*/
5494     /* Handle single-character metacharacters. In multiline mode, ^ disables
5495     the setting of any following char as a first character. */
5496 
5497     case META_CIRCUMFLEX:
5498     if ((options & PCRE2_MULTILINE) != 0)
5499       {
5500       if (firstcuflags == REQ_UNSET)
5501         zerofirstcuflags = firstcuflags = REQ_NONE;
5502       *code++ = OP_CIRCM;
5503       }
5504     else *code++ = OP_CIRC;
5505     break;
5506 
5507     case META_DOLLAR:
5508     *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
5509     break;
5510 
5511     /* There can never be a first char if '.' is first, whatever happens about
5512     repeats. The value of reqcu doesn't change either. */
5513 
5514     case META_DOT:
5515     matched_char = TRUE;
5516     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5517     zerofirstcu = firstcu;
5518     zerofirstcuflags = firstcuflags;
5519     zeroreqcu = reqcu;
5520     zeroreqcuflags = reqcuflags;
5521     *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
5522     break;
5523 
5524 
5525     /* ===================================================================*/
5526     /* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.
5527     Otherwise, an initial ']' is taken as a data character. When empty classes
5528     are allowed, [] must always fail, so generate OP_FAIL, whereas [^] must
5529     match any character, so generate OP_ALLANY. */
5530 
5531     case META_CLASS_EMPTY:
5532     case META_CLASS_EMPTY_NOT:
5533     matched_char = TRUE;
5534     *code++ = (meta == META_CLASS_EMPTY_NOT)? OP_ALLANY : OP_FAIL;
5535     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5536     zerofirstcu = firstcu;
5537     zerofirstcuflags = firstcuflags;
5538     break;
5539 
5540 
5541     /* ===================================================================*/
5542     /* Non-empty character class. If the included characters are all < 256, we
5543     build a 32-byte bitmap of the permitted characters, except in the special
5544     case where there is only one such character. For negated classes, we build
5545     the map as usual, then invert it at the end. However, we use a different
5546     opcode so that data characters > 255 can be handled correctly.
5547 
5548     If the class contains characters outside the 0-255 range, a different
5549     opcode is compiled. It may optionally have a bit map for characters < 256,
5550     but those above are are explicitly listed afterwards. A flag code unit
5551     tells whether the bitmap is present, and whether this is a negated class or
5552     not. */
5553 
5554     case META_CLASS_NOT:
5555     case META_CLASS:
5556     matched_char = TRUE;
5557     negate_class = meta == META_CLASS_NOT;
5558 
5559     /* We can optimize the case of a single character in a class by generating
5560     OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's
5561     negative. In the negative case there can be no first char if this item is
5562     first, whatever repeat count may follow. In the case of reqcu, save the
5563     previous value for reinstating. */
5564 
5565     /* NOTE: at present this optimization is not effective if the only
5566     character in a class in 32-bit, non-UCP mode has its top bit set. */
5567 
5568     if (pptr[1] < META_END && pptr[2] == META_CLASS_END)
5569       {
5570 #ifdef SUPPORT_UNICODE
5571       uint32_t d;
5572 #endif
5573       uint32_t c = pptr[1];
5574 
5575       pptr += 2;                 /* Move on to class end */
5576       if (meta == META_CLASS)    /* A positive one-char class can be */
5577         {                        /* handled as a normal literal character. */
5578         meta = c;                /* Set up the character */
5579         goto NORMAL_CHAR_SET;
5580         }
5581 
5582       /* Handle a negative one-character class */
5583 
5584       zeroreqcu = reqcu;
5585       zeroreqcuflags = reqcuflags;
5586       if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5587       zerofirstcu = firstcu;
5588       zerofirstcuflags = firstcuflags;
5589 
5590       /* For caseless UTF or UCP mode, check whether this character has more
5591       than one other case. If so, generate a special OP_NOTPROP item instead of
5592       OP_NOTI. */
5593 
5594 #ifdef SUPPORT_UNICODE
5595       if ((utf||ucp) && (options & PCRE2_CASELESS) != 0 &&
5596           (d = UCD_CASESET(c)) != 0)
5597         {
5598         *code++ = OP_NOTPROP;
5599         *code++ = PT_CLIST;
5600         *code++ = d;
5601         break;   /* We are finished with this class */
5602         }
5603 #endif
5604       /* Char has only one other case, or UCP not available */
5605 
5606       *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
5607       code += PUTCHAR(c, code);
5608       break;   /* We are finished with this class */
5609       }        /* End of 1-char optimization */
5610 
5611     /* Handle character classes that contain more than just one literal
5612     character. If there are exactly two characters in a positive class, see if
5613     they are case partners. This can be optimized to generate a caseless single
5614     character match (which also sets first/required code units if relevant). */
5615 
5616     if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END &&
5617         pptr[3] == META_CLASS_END)
5618       {
5619       uint32_t c = pptr[1];
5620 
5621 #ifdef SUPPORT_UNICODE
5622       if (UCD_CASESET(c) == 0)
5623 #endif
5624         {
5625         uint32_t d;
5626 
5627 #ifdef SUPPORT_UNICODE
5628         if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
5629 #endif
5630           {
5631 #if PCRE2_CODE_UNIT_WIDTH != 8
5632           if (c > 255) d = c; else
5633 #endif
5634           d = TABLE_GET(c, cb->fcc, c);
5635           }
5636 
5637         if (c != d && pptr[2] == d)
5638           {
5639           pptr += 3;                 /* Move on to class end */
5640           meta = c;
5641           if ((options & PCRE2_CASELESS) == 0)
5642             {
5643             reset_caseful = TRUE;
5644             options |= PCRE2_CASELESS;
5645             req_caseopt = REQ_CASELESS;
5646             }
5647           goto CLASS_CASELESS_CHAR;
5648           }
5649         }
5650       }
5651 
5652     /* If a non-extended class contains a negative special such as \S, we need
5653     to flip the negation flag at the end, so that support for characters > 255
5654     works correctly (they are all included in the class). An extended class may
5655     need to insert specific matching or non-matching code for wide characters.
5656     */
5657 
5658     should_flip_negation = match_all_or_no_wide_chars = FALSE;
5659 
5660     /* Extended class (xclass) will be used when characters > 255
5661     might match. */
5662 
5663 #ifdef SUPPORT_WIDE_CHARS
5664     xclass = FALSE;
5665     class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
5666     class_uchardata_base = class_uchardata;   /* Save the start */
5667 #endif
5668 
5669     /* For optimization purposes, we track some properties of the class:
5670     class_has_8bitchar will be non-zero if the class contains at least one
5671     character with a code point less than 256; xclass_has_prop will be TRUE if
5672     Unicode property checks are present in the class. */
5673 
5674     class_has_8bitchar = 0;
5675 #ifdef SUPPORT_WIDE_CHARS
5676     xclass_has_prop = FALSE;
5677 #endif
5678 
5679     /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
5680     in a temporary bit of memory, in case the class contains fewer than two
5681     8-bit characters because in that case the compiled code doesn't use the bit
5682     map. */
5683 
5684     memset(classbits, 0, 32 * sizeof(uint8_t));
5685 
5686     /* Process items until META_CLASS_END is reached. */
5687 
5688     while ((meta = *(++pptr)) != META_CLASS_END)
5689       {
5690       /* Handle POSIX classes such as [:alpha:] etc. */
5691 
5692       if (meta == META_POSIX || meta == META_POSIX_NEG)
5693         {
5694         BOOL local_negate = (meta == META_POSIX_NEG);
5695         int posix_class = *(++pptr);
5696         int taboffset, tabopt;
5697         uint8_t pbits[32];
5698 
5699         should_flip_negation = local_negate;  /* Note negative special */
5700 
5701         /* If matching is caseless, upper and lower are converted to alpha.
5702         This relies on the fact that the class table starts with alpha,
5703         lower, upper as the first 3 entries. */
5704 
5705         if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)
5706           posix_class = 0;
5707 
5708         /* When PCRE2_UCP is set, some of the POSIX classes are converted to
5709         different escape sequences that use Unicode properties \p or \P.
5710         Others that are not available via \p or \P have to generate
5711         XCL_PROP/XCL_NOTPROP directly, which is done here. */
5712 
5713 #ifdef SUPPORT_UNICODE
5714         if ((options & PCRE2_UCP) != 0) switch(posix_class)
5715           {
5716           case PC_GRAPH:
5717           case PC_PRINT:
5718           case PC_PUNCT:
5719           *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
5720           *class_uchardata++ = (PCRE2_UCHAR)
5721             ((posix_class == PC_GRAPH)? PT_PXGRAPH :
5722              (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT);
5723           *class_uchardata++ = 0;
5724           xclass_has_prop = TRUE;
5725           goto CONTINUE_CLASS;
5726 
5727           /* For the other POSIX classes (ascii, xdigit) we are going to
5728           fall through to the non-UCP case and build a bit map for
5729           characters with code points less than 256. However, if we are in
5730           a negated POSIX class, characters with code points greater than
5731           255 must either all match or all not match, depending on whether
5732           the whole class is not or is negated. For example, for
5733           [[:^ascii:]... they must all match, whereas for [^[:^xdigit:]...
5734           they must not.
5735 
5736           In the special case where there are no xclass items, this is
5737           automatically handled by the use of OP_CLASS or OP_NCLASS, but an
5738           explicit range is needed for OP_XCLASS. Setting a flag here
5739           causes the range to be generated later when it is known that
5740           OP_XCLASS is required. In the 8-bit library this is relevant only in
5741           utf mode, since no wide characters can exist otherwise. */
5742 
5743           default:
5744 #if PCRE2_CODE_UNIT_WIDTH == 8
5745           if (utf)
5746 #endif
5747           match_all_or_no_wide_chars |= local_negate;
5748           break;
5749           }
5750 #endif  /* SUPPORT_UNICODE */
5751 
5752         /* In the non-UCP case, or when UCP makes no difference, we build the
5753         bit map for the POSIX class in a chunk of local store because we may
5754         be adding and subtracting from it, and we don't want to subtract bits
5755         that may be in the main map already. At the end we or the result into
5756         the bit map that is being built. */
5757 
5758         posix_class *= 3;
5759 
5760         /* Copy in the first table (always present) */
5761 
5762         memcpy(pbits, cbits + posix_class_maps[posix_class],
5763           32 * sizeof(uint8_t));
5764 
5765         /* If there is a second table, add or remove it as required. */
5766 
5767         taboffset = posix_class_maps[posix_class + 1];
5768         tabopt = posix_class_maps[posix_class + 2];
5769 
5770         if (taboffset >= 0)
5771           {
5772           if (tabopt >= 0)
5773             for (i = 0; i < 32; i++) pbits[i] |= cbits[(int)i + taboffset];
5774           else
5775             for (i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset];
5776           }
5777 
5778         /* Now see if we need to remove any special characters. An option
5779         value of 1 removes vertical space and 2 removes underscore. */
5780 
5781         if (tabopt < 0) tabopt = -tabopt;
5782         if (tabopt == 1) pbits[1] &= ~0x3c;
5783           else if (tabopt == 2) pbits[11] &= 0x7f;
5784 
5785         /* Add the POSIX table or its complement into the main table that is
5786         being built and we are done. */
5787 
5788         if (local_negate)
5789           for (i = 0; i < 32; i++) classbits[i] |= ~pbits[i];
5790         else
5791           for (i = 0; i < 32; i++) classbits[i] |= pbits[i];
5792 
5793         /* Every class contains at least one < 256 character. */
5794 
5795         class_has_8bitchar = 1;
5796         goto CONTINUE_CLASS;    /* End of POSIX handling */
5797         }
5798 
5799       /* Other than POSIX classes, the only items we should encounter are
5800       \d-type escapes and literal characters (possibly as ranges). */
5801 
5802       if (meta == META_BIGVALUE)
5803         {
5804         meta = *(++pptr);
5805         goto CLASS_LITERAL;
5806         }
5807 
5808       /* Any other non-literal must be an escape */
5809 
5810       if (meta >= META_END)
5811         {
5812         if (META_CODE(meta) != META_ESCAPE)
5813           {
5814 #ifdef DEBUG_SHOW_PARSED
5815           fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x "
5816                           "in character class\n", meta);
5817 #endif
5818           *errorcodeptr = ERR89;  /* Internal error - unrecognized. */
5819           return 0;
5820           }
5821         escape = META_DATA(meta);
5822 
5823         /* Every class contains at least one < 256 character. */
5824 
5825         class_has_8bitchar++;
5826 
5827         switch(escape)
5828           {
5829           case ESC_d:
5830           for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
5831           break;
5832 
5833           case ESC_D:
5834           should_flip_negation = TRUE;
5835           for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_digit];
5836           break;
5837 
5838           case ESC_w:
5839           for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
5840           break;
5841 
5842           case ESC_W:
5843           should_flip_negation = TRUE;
5844           for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_word];
5845           break;
5846 
5847           /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5848           5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5849           previously set by something earlier in the character class.
5850           Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5851           we could just adjust the appropriate bit. From PCRE 8.34 we no
5852           longer treat \s and \S specially. */
5853 
5854           case ESC_s:
5855           for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
5856           break;
5857 
5858           case ESC_S:
5859           should_flip_negation = TRUE;
5860           for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_space];
5861           break;
5862 
5863           /* When adding the horizontal or vertical space lists to a class, or
5864           their complements, disable PCRE2_CASELESS, because it just wastes
5865           time, and in the "not-x" UTF cases can create unwanted duplicates in
5866           the XCLASS list (provoked by characters that have more than one other
5867           case and by both cases being in the same "not-x" sublist). */
5868 
5869           case ESC_h:
5870           (void)add_list_to_class(classbits, &class_uchardata,
5871             options & ~PCRE2_CASELESS, cb, PRIV(hspace_list), NOTACHAR);
5872           break;
5873 
5874           case ESC_H:
5875           (void)add_not_list_to_class(classbits, &class_uchardata,
5876             options & ~PCRE2_CASELESS, cb, PRIV(hspace_list));
5877           break;
5878 
5879           case ESC_v:
5880           (void)add_list_to_class(classbits, &class_uchardata,
5881             options & ~PCRE2_CASELESS, cb, PRIV(vspace_list), NOTACHAR);
5882           break;
5883 
5884           case ESC_V:
5885           (void)add_not_list_to_class(classbits, &class_uchardata,
5886             options & ~PCRE2_CASELESS, cb, PRIV(vspace_list));
5887           break;
5888 
5889           /* If Unicode is not supported, \P and \p are not allowed and are
5890           faulted at parse time, so will never appear here. */
5891 
5892 #ifdef SUPPORT_UNICODE
5893           case ESC_p:
5894           case ESC_P:
5895             {
5896             uint32_t ptype = *(++pptr) >> 16;
5897             uint32_t pdata = *pptr & 0xffff;
5898             *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP;
5899             *class_uchardata++ = ptype;
5900             *class_uchardata++ = pdata;
5901             xclass_has_prop = TRUE;
5902             class_has_8bitchar--;                /* Undo! */
5903             }
5904           break;
5905 #endif
5906           }
5907 
5908         goto CONTINUE_CLASS;
5909         }  /* End handling \d-type escapes */
5910 
5911       /* A literal character may be followed by a range meta. At parse time
5912       there are checks for out-of-order characters, for ranges where the two
5913       characters are equal, and for hyphens that cannot indicate a range. At
5914       this point, therefore, no checking is needed. */
5915 
5916       else
5917         {
5918         uint32_t c, d;
5919 
5920         CLASS_LITERAL:
5921         c = d = meta;
5922 
5923         /* Remember if \r or \n were explicitly used */
5924 
5925         if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
5926 
5927         /* Process a character range */
5928 
5929         if (pptr[1] == META_RANGE_LITERAL || pptr[1] == META_RANGE_ESCAPED)
5930           {
5931 #ifdef EBCDIC
5932           BOOL range_is_literal = (pptr[1] == META_RANGE_LITERAL);
5933 #endif
5934           pptr += 2;
5935           d = *pptr;
5936           if (d == META_BIGVALUE) d = *(++pptr);
5937 
5938           /* Remember an explicit \r or \n, and add the range to the class. */
5939 
5940           if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
5941 
5942           /* In an EBCDIC environment, Perl treats alphabetic ranges specially
5943           because there are holes in the encoding, and simply using the range
5944           A-Z (for example) would include the characters in the holes. This
5945           applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
5946 
5947 #ifdef EBCDIC
5948           if (range_is_literal &&
5949                (cb->ctypes[c] & ctype_letter) != 0 &&
5950                (cb->ctypes[d] & ctype_letter) != 0 &&
5951                (c <= CHAR_z) == (d <= CHAR_z))
5952             {
5953             uint32_t uc = (d <= CHAR_z)? 0 : 64;
5954             uint32_t C = c - uc;
5955             uint32_t D = d - uc;
5956 
5957             if (C <= CHAR_i)
5958               {
5959               class_has_8bitchar +=
5960                 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
5961                   ((D < CHAR_i)? D : CHAR_i) + uc);
5962               C = CHAR_j;
5963               }
5964 
5965             if (C <= D && C <= CHAR_r)
5966               {
5967               class_has_8bitchar +=
5968                 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
5969                   ((D < CHAR_r)? D : CHAR_r) + uc);
5970               C = CHAR_s;
5971               }
5972 
5973             if (C <= D)
5974               {
5975               class_has_8bitchar +=
5976                 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
5977                   D + uc);
5978               }
5979             }
5980           else
5981 #endif
5982           /* Not an EBCDIC special range */
5983 
5984           class_has_8bitchar +=
5985             add_to_class(classbits, &class_uchardata, options, cb, c, d);
5986           goto CONTINUE_CLASS;   /* Go get the next char in the class */
5987           }  /* End of range handling */
5988 
5989 
5990         /* Handle a single character. */
5991 
5992         class_has_8bitchar +=
5993           add_to_class(classbits, &class_uchardata, options, cb, meta, meta);
5994         }
5995 
5996       /* Continue to the next item in the class. */
5997 
5998       CONTINUE_CLASS:
5999 
6000 #ifdef SUPPORT_WIDE_CHARS
6001       /* If any wide characters or Unicode properties have been encountered,
6002       set xclass = TRUE. Then, in the pre-compile phase, accumulate the length
6003       of the extra data and reset the pointer. This is so that very large
6004       classes that contain a zillion wide characters or Unicode property tests
6005       do not overwrite the workspace (which is on the stack). */
6006 
6007       if (class_uchardata > class_uchardata_base)
6008         {
6009         xclass = TRUE;
6010         if (lengthptr != NULL)
6011           {
6012           *lengthptr += class_uchardata - class_uchardata_base;
6013           class_uchardata = class_uchardata_base;
6014           }
6015         }
6016 #endif
6017 
6018       continue;  /* Needed to avoid error when not supporting wide chars */
6019       }   /* End of main class-processing loop */
6020 
6021     /* If this class is the first thing in the branch, there can be no first
6022     char setting, whatever the repeat count. Any reqcu setting must remain
6023     unchanged after any kind of repeat. */
6024 
6025     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6026     zerofirstcu = firstcu;
6027     zerofirstcuflags = firstcuflags;
6028     zeroreqcu = reqcu;
6029     zeroreqcuflags = reqcuflags;
6030 
6031     /* If there are characters with values > 255, or Unicode property settings
6032     (\p or \P), we have to compile an extended class, with its own opcode,
6033     unless there were no property settings and there was a negated special such
6034     as \S in the class, and PCRE2_UCP is not set, because in that case all
6035     characters > 255 are in or not in the class, so any that were explicitly
6036     given as well can be ignored.
6037 
6038     In the UCP case, if certain negated POSIX classes ([:^ascii:] or
6039     [:^xdigit:]) were present in a class, we either have to match or not match
6040     all wide characters (depending on whether the whole class is or is not
6041     negated). This requirement is indicated by match_all_or_no_wide_chars being
6042     true. We do this by including an explicit range, which works in both cases.
6043     This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there
6044     cannot be any wide characters in 8-bit non-UTF mode.
6045 
6046     When there *are* properties in a positive UTF-8 or any 16-bit or 32-bit
6047     class where \S etc is present without PCRE2_UCP, causing an extended class
6048     to be compiled, we make sure that all characters > 255 are included by
6049     forcing match_all_or_no_wide_chars to be true.
6050 
6051     If, when generating an xclass, there are no characters < 256, we can omit
6052     the bitmap in the actual compiled code. */
6053 
6054 #ifdef SUPPORT_WIDE_CHARS  /* Defined for 16/32 bits, or 8-bit with Unicode */
6055     if (xclass && (
6056 #ifdef SUPPORT_UNICODE
6057         (options & PCRE2_UCP) != 0 ||
6058 #endif
6059         xclass_has_prop || !should_flip_negation))
6060       {
6061       if (match_all_or_no_wide_chars || (
6062 #if PCRE2_CODE_UNIT_WIDTH == 8
6063            utf &&
6064 #endif
6065            should_flip_negation && !negate_class && (options & PCRE2_UCP) == 0))
6066         {
6067         *class_uchardata++ = XCL_RANGE;
6068         if (utf)   /* Will always be utf in the 8-bit library */
6069           {
6070           class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
6071           class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata);
6072           }
6073         else       /* Can only happen for the 16-bit & 32-bit libraries */
6074           {
6075 #if PCRE2_CODE_UNIT_WIDTH == 16
6076           *class_uchardata++ = 0x100;
6077           *class_uchardata++ = 0xffffu;
6078 #elif PCRE2_CODE_UNIT_WIDTH == 32
6079           *class_uchardata++ = 0x100;
6080           *class_uchardata++ = 0xffffffffu;
6081 #endif
6082           }
6083         }
6084       *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
6085       *code++ = OP_XCLASS;
6086       code += LINK_SIZE;
6087       *code = negate_class? XCL_NOT:0;
6088       if (xclass_has_prop) *code |= XCL_HASPROP;
6089 
6090       /* If the map is required, move up the extra data to make room for it;
6091       otherwise just move the code pointer to the end of the extra data. */
6092 
6093       if (class_has_8bitchar > 0)
6094         {
6095         *code++ |= XCL_MAP;
6096         (void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,
6097           CU2BYTES(class_uchardata - code));
6098         if (negate_class && !xclass_has_prop)
6099           {
6100           /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
6101           for (i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
6102           }
6103         memcpy(code, classbits, 32);
6104         code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
6105         }
6106       else code = class_uchardata;
6107 
6108       /* Now fill in the complete length of the item */
6109 
6110       PUT(previous, 1, (int)(code - previous));
6111       break;   /* End of class handling */
6112       }
6113 #endif  /* SUPPORT_WIDE_CHARS */
6114 
6115     /* If there are no characters > 255, or they are all to be included or
6116     excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
6117     whole class was negated and whether there were negative specials such as \S
6118     (non-UCP) in the class. Then copy the 32-byte map into the code vector,
6119     negating it if necessary. */
6120 
6121     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
6122     if (lengthptr == NULL)    /* Save time in the pre-compile phase */
6123       {
6124       if (negate_class)
6125         {
6126        /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
6127        for (i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
6128        }
6129       memcpy(code, classbits, 32);
6130       }
6131     code += 32 / sizeof(PCRE2_UCHAR);
6132     break;  /* End of class processing */
6133 
6134 
6135     /* ===================================================================*/
6136     /* Deal with (*VERB)s. */
6137 
6138     /* Check for open captures before ACCEPT and close those that are within
6139     the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an
6140     assertion. In the first pass, just accumulate the length required;
6141     otherwise hitting (*ACCEPT) inside many nested parentheses can cause
6142     workspace overflow. Do not set firstcu after *ACCEPT. */
6143 
6144     case META_ACCEPT:
6145     cb->had_accept = had_accept = TRUE;
6146     for (oc = cb->open_caps;
6147          oc != NULL && oc->assert_depth >= cb->assert_depth;
6148          oc = oc->next)
6149       {
6150       if (lengthptr != NULL)
6151         {
6152         *lengthptr += CU2BYTES(1) + IMM2_SIZE;
6153         }
6154       else
6155         {
6156         *code++ = OP_CLOSE;
6157         PUT2INC(code, 0, oc->number);
6158         }
6159       }
6160     *code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6161     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6162     break;
6163 
6164     case META_PRUNE:
6165     case META_SKIP:
6166     cb->had_pruneorskip = TRUE;
6167     /* Fall through */
6168     case META_COMMIT:
6169     case META_FAIL:
6170     *code++ = verbops[(meta - META_MARK) >> 16];
6171     break;
6172 
6173     case META_THEN:
6174     cb->external_flags |= PCRE2_HASTHEN;
6175     *code++ = OP_THEN;
6176     break;
6177 
6178     /* Handle verbs with arguments. Arguments can be very long, especially in
6179     16- and 32-bit modes, and can overflow the workspace in the first pass.
6180     However, the argument length is constrained to be small enough to fit in
6181     one code unit. This check happens in parse_regex(). In the first pass,
6182     instead of putting the argument into memory, we just update the length
6183     counter and set up an empty argument. */
6184 
6185     case META_THEN_ARG:
6186     cb->external_flags |= PCRE2_HASTHEN;
6187     goto VERB_ARG;
6188 
6189     case META_PRUNE_ARG:
6190     case META_SKIP_ARG:
6191     cb->had_pruneorskip = TRUE;
6192     /* Fall through */
6193     case META_MARK:
6194     case META_COMMIT_ARG:
6195     VERB_ARG:
6196     *code++ = verbops[(meta - META_MARK) >> 16];
6197     /* The length is in characters. */
6198     verbarglen = *(++pptr);
6199     verbculen = 0;
6200     tempcode = code++;
6201     for (i = 0; i < (int)verbarglen; i++)
6202       {
6203       meta = *(++pptr);
6204 #ifdef SUPPORT_UNICODE
6205       if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
6206 #endif
6207         {
6208         mclength = 1;
6209         mcbuffer[0] = meta;
6210         }
6211       if (lengthptr != NULL) *lengthptr += mclength; else
6212         {
6213         memcpy(code, mcbuffer, CU2BYTES(mclength));
6214         code += mclength;
6215         verbculen += mclength;
6216         }
6217       }
6218 
6219     *tempcode = verbculen;   /* Fill in the code unit length */
6220     *code++ = 0;             /* Terminating zero */
6221     break;
6222 
6223 
6224     /* ===================================================================*/
6225     /* Handle options change. The new setting must be passed back for use in
6226     subsequent branches. Reset the greedy defaults and the case value for
6227     firstcu and reqcu. */
6228 
6229     case META_OPTIONS:
6230     *optionsptr = options = *(++pptr);
6231     greedy_default = ((options & PCRE2_UNGREEDY) != 0);
6232     greedy_non_default = greedy_default ^ 1;
6233     req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6234     break;
6235 
6236 
6237     /* ===================================================================*/
6238     /* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous
6239     because it could be a numerical check on recursion, or a name check on a
6240     group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that
6241     we can handle it either way. We first try for a name; if not found, process
6242     the number. */
6243 
6244     case META_COND_RNUMBER:   /* (?(Rdigits) */
6245     case META_COND_NAME:      /* (?(name) or (?'name') or ?(<name>) */
6246     case META_COND_RNAME:     /* (?(R&name) - test for recursion */
6247     bravalue = OP_COND;
6248       {
6249       int count, index;
6250       PCRE2_SPTR name;
6251       named_group *ng = cb->named_groups;
6252       uint32_t length = *(++pptr);
6253 
6254       GETPLUSOFFSET(offset, pptr);
6255       name = cb->start_pattern + offset;
6256 
6257       /* In the first pass, the names generated in the pre-pass are available,
6258       but the main name table has not yet been created. Scan the list of names
6259       generated in the pre-pass in order to get a number and whether or not
6260       this name is duplicated. If it is not duplicated, we can handle it as a
6261       numerical group. */
6262 
6263       for (i = 0; i < cb->names_found; i++, ng++)
6264         {
6265         if (length == ng->length &&
6266             PRIV(strncmp)(name, ng->name, length) == 0)
6267           {
6268           if (!ng->isdup)
6269             {
6270             code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6271             PUT2(code, 2+LINK_SIZE, ng->number);
6272             if (ng->number > cb->top_backref) cb->top_backref = ng->number;
6273             skipunits = 1+IMM2_SIZE;
6274             goto GROUP_PROCESS_NOTE_EMPTY;
6275             }
6276           break;  /* Found a duplicated name */
6277           }
6278         }
6279 
6280       /* If the name was not found we have a bad reference, unless we are
6281       dealing with R<digits>, which is treated as a recursion test by number.
6282       */
6283 
6284       if (i >= cb->names_found)
6285         {
6286         groupnumber = 0;
6287         if (meta == META_COND_RNUMBER)
6288           {
6289           for (i = 1; i < (int)length; i++)
6290             {
6291             groupnumber = groupnumber * 10 + name[i] - CHAR_0;
6292             if (groupnumber > MAX_GROUP_NUMBER)
6293               {
6294               *errorcodeptr = ERR61;
6295               cb->erroroffset = offset + i;
6296               return 0;
6297               }
6298             }
6299           }
6300 
6301         if (meta != META_COND_RNUMBER || groupnumber > cb->bracount)
6302           {
6303           *errorcodeptr = ERR15;
6304           cb->erroroffset = offset;
6305           return 0;
6306           }
6307 
6308         /* (?(Rdigits) is treated as a recursion test by number. A value of
6309         zero (which is the result of both (?(R) and (?(R0)) means "any", and is
6310         translated into RREF_ANY (which is 0xffff). */
6311 
6312         if (groupnumber == 0) groupnumber = RREF_ANY;
6313         code[1+LINK_SIZE] = OP_RREF;
6314         PUT2(code, 2+LINK_SIZE, groupnumber);
6315         skipunits = 1+IMM2_SIZE;
6316         goto GROUP_PROCESS_NOTE_EMPTY;
6317         }
6318 
6319       /* A duplicated name was found. Note that if an R<digits> name is found
6320       (META_COND_RNUMBER), it is a reference test, not a recursion test. */
6321 
6322       code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6323 
6324       /* We have a duplicated name. In the compile pass we have to search the
6325       main table in order to get the index and count values. */
6326 
6327       count = 0;  /* Values for first pass (avoids compiler warning) */
6328       index = 0;
6329       if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6330             &count, errorcodeptr, cb)) return 0;
6331 
6332       /* Add one to the opcode to change CREF/RREF into DNCREF/DNRREF and
6333       insert appropriate data values. */
6334 
6335       code[1+LINK_SIZE]++;
6336       skipunits = 1+2*IMM2_SIZE;
6337       PUT2(code, 2+LINK_SIZE, index);
6338       PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6339       }
6340     goto GROUP_PROCESS_NOTE_EMPTY;
6341 
6342     /* The DEFINE condition is always false. Its internal groups may never
6343     be called, so matched_char must remain false, hence the jump to
6344     GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
6345 
6346     case META_COND_DEFINE:
6347     bravalue = OP_COND;
6348     GETPLUSOFFSET(offset, pptr);
6349     code[1+LINK_SIZE] = OP_DEFINE;
6350     skipunits = 1;
6351     goto GROUP_PROCESS;
6352 
6353     /* Conditional test of a group's being set. */
6354 
6355     case META_COND_NUMBER:
6356     bravalue = OP_COND;
6357     GETPLUSOFFSET(offset, pptr);
6358     groupnumber = *(++pptr);
6359     if (groupnumber > cb->bracount)
6360       {
6361       *errorcodeptr = ERR15;
6362       cb->erroroffset = offset;
6363       return 0;
6364       }
6365     if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
6366     offset -= 2;   /* Point at initial ( for too many branches error */
6367     code[1+LINK_SIZE] = OP_CREF;
6368     skipunits = 1+IMM2_SIZE;
6369     PUT2(code, 2+LINK_SIZE, groupnumber);
6370     goto GROUP_PROCESS_NOTE_EMPTY;
6371 
6372     /* Test for the PCRE2 version. */
6373 
6374     case META_COND_VERSION:
6375     bravalue = OP_COND;
6376     if (pptr[1] > 0)
6377       code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) ||
6378         (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))?
6379           OP_TRUE : OP_FALSE;
6380     else
6381       code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])?
6382         OP_TRUE : OP_FALSE;
6383     skipunits = 1;
6384     pptr += 3;
6385     goto GROUP_PROCESS_NOTE_EMPTY;
6386 
6387     /* The condition is an assertion, possibly preceded by a callout. */
6388 
6389     case META_COND_ASSERT:
6390     bravalue = OP_COND;
6391     goto GROUP_PROCESS_NOTE_EMPTY;
6392 
6393 
6394     /* ===================================================================*/
6395     /* Handle all kinds of nested bracketed groups. The non-capturing,
6396     non-conditional cases are here; others come to GROUP_PROCESS via goto. */
6397 
6398     case META_LOOKAHEAD:
6399     bravalue = OP_ASSERT;
6400     cb->assert_depth += 1;
6401     goto GROUP_PROCESS;
6402 
6403     case META_LOOKAHEAD_NA:
6404     bravalue = OP_ASSERT_NA;
6405     cb->assert_depth += 1;
6406     goto GROUP_PROCESS;
6407 
6408     /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6409     thing to do, but Perl allows all assertions to be quantified, and when
6410     they contain capturing parentheses there may be a potential use for
6411     this feature. Not that that applies to a quantified (?!) but we allow
6412     it for uniformity. */
6413 
6414     case META_LOOKAHEADNOT:
6415     if (pptr[1] == META_KET &&
6416          (pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY))
6417       {
6418       *code++ = OP_FAIL;
6419       pptr++;
6420       }
6421     else
6422       {
6423       bravalue = OP_ASSERT_NOT;
6424       cb->assert_depth += 1;
6425       goto GROUP_PROCESS;
6426       }
6427     break;
6428 
6429     case META_LOOKBEHIND:
6430     bravalue = OP_ASSERTBACK;
6431     cb->assert_depth += 1;
6432     goto GROUP_PROCESS;
6433 
6434     case META_LOOKBEHINDNOT:
6435     bravalue = OP_ASSERTBACK_NOT;
6436     cb->assert_depth += 1;
6437     goto GROUP_PROCESS;
6438 
6439     case META_LOOKBEHIND_NA:
6440     bravalue = OP_ASSERTBACK_NA;
6441     cb->assert_depth += 1;
6442     goto GROUP_PROCESS;
6443 
6444     case META_ATOMIC:
6445     bravalue = OP_ONCE;
6446     goto GROUP_PROCESS_NOTE_EMPTY;
6447 
6448     case META_SCRIPT_RUN:
6449     bravalue = OP_SCRIPT_RUN;
6450     goto GROUP_PROCESS_NOTE_EMPTY;
6451 
6452     case META_NOCAPTURE:
6453     bravalue = OP_BRA;
6454     /* Fall through */
6455 
6456     /* Process nested bracketed regex. The nesting depth is maintained for the
6457     benefit of the stackguard function. The test for too deep nesting is now
6458     done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
6459     others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
6460     note of whether or not they may match an empty string. */
6461 
6462     GROUP_PROCESS_NOTE_EMPTY:
6463     note_group_empty = TRUE;
6464 
6465     GROUP_PROCESS:
6466     cb->parens_depth += 1;
6467     *code = bravalue;
6468     pptr++;
6469     tempcode = code;
6470     tempreqvary = cb->req_varyopt;        /* Save value before group */
6471     length_prevgroup = 0;                 /* Initialize for pre-compile phase */
6472 
6473     if ((group_return =
6474          compile_regex(
6475          options,                         /* The option state */
6476          &tempcode,                       /* Where to put code (updated) */
6477          &pptr,                           /* Input pointer (updated) */
6478          errorcodeptr,                    /* Where to put an error message */
6479          skipunits,                       /* Skip over bracket number */
6480          &subfirstcu,                     /* For possible first char */
6481          &subfirstcuflags,
6482          &subreqcu,                       /* For possible last char */
6483          &subreqcuflags,
6484          bcptr,                           /* Current branch chain */
6485          cb,                              /* Compile data block */
6486          (lengthptr == NULL)? NULL :      /* Actual compile phase */
6487            &length_prevgroup              /* Pre-compile phase */
6488          )) == 0)
6489       return 0;  /* Error */
6490 
6491     cb->parens_depth -= 1;
6492 
6493     /* If that was a non-conditional significant group (not an assertion, not a
6494     DEFINE) that matches at least one character, then the current item matches
6495     a character. Conditionals are handled below. */
6496 
6497     if (note_group_empty && bravalue != OP_COND && group_return > 0)
6498       matched_char = TRUE;
6499 
6500     /* If we've just compiled an assertion, pop the assert depth. */
6501 
6502     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NA)
6503       cb->assert_depth -= 1;
6504 
6505     /* At the end of compiling, code is still pointing to the start of the
6506     group, while tempcode has been updated to point past the end of the group.
6507     The parsed pattern pointer (pptr) is on the closing META_KET.
6508 
6509     If this is a conditional bracket, check that there are no more than
6510     two branches in the group, or just one if it's a DEFINE group. We do this
6511     in the real compile phase, not in the pre-pass, where the whole group may
6512     not be available. */
6513 
6514     if (bravalue == OP_COND && lengthptr == NULL)
6515       {
6516       PCRE2_UCHAR *tc = code;
6517       int condcount = 0;
6518 
6519       do {
6520          condcount++;
6521          tc += GET(tc,1);
6522          }
6523       while (*tc != OP_KET);
6524 
6525       /* A DEFINE group is never obeyed inline (the "condition" is always
6526       false). It must have only one branch. Having checked this, change the
6527       opcode to OP_FALSE. */
6528 
6529       if (code[LINK_SIZE+1] == OP_DEFINE)
6530         {
6531         if (condcount > 1)
6532           {
6533           cb->erroroffset = offset;
6534           *errorcodeptr = ERR54;
6535           return 0;
6536           }
6537         code[LINK_SIZE+1] = OP_FALSE;
6538         bravalue = OP_DEFINE;   /* A flag to suppress char handling below */
6539         }
6540 
6541       /* A "normal" conditional group. If there is just one branch, we must not
6542       make use of its firstcu or reqcu, because this is equivalent to an
6543       empty second branch. Also, it may match an empty string. If there are two
6544       branches, this item must match a character if the group must. */
6545 
6546       else
6547         {
6548         if (condcount > 2)
6549           {
6550           cb->erroroffset = offset;
6551           *errorcodeptr = ERR27;
6552           return 0;
6553           }
6554         if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
6555           else if (group_return > 0) matched_char = TRUE;
6556         }
6557       }
6558 
6559     /* In the pre-compile phase, update the length by the length of the group,
6560     less the brackets at either end. Then reduce the compiled code to just a
6561     set of non-capturing brackets so that it doesn't use much memory if it is
6562     duplicated by a quantifier.*/
6563 
6564     if (lengthptr != NULL)
6565       {
6566       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
6567         {
6568         *errorcodeptr = ERR20;
6569         return 0;
6570         }
6571       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
6572       code++;   /* This already contains bravalue */
6573       PUTINC(code, 0, 1 + LINK_SIZE);
6574       *code++ = OP_KET;
6575       PUTINC(code, 0, 1 + LINK_SIZE);
6576       break;    /* No need to waste time with special character handling */
6577       }
6578 
6579     /* Otherwise update the main code pointer to the end of the group. */
6580 
6581     code = tempcode;
6582 
6583     /* For a DEFINE group, required and first character settings are not
6584     relevant. */
6585 
6586     if (bravalue == OP_DEFINE) break;
6587 
6588     /* Handle updating of the required and first code units for other types of
6589     group. Update for normal brackets of all kinds, and conditions with two
6590     branches (see code above). If the bracket is followed by a quantifier with
6591     zero repeat, we have to back off. Hence the definition of zeroreqcu and
6592     zerofirstcu outside the main loop so that they can be accessed for the back
6593     off. */
6594 
6595     zeroreqcu = reqcu;
6596     zeroreqcuflags = reqcuflags;
6597     zerofirstcu = firstcu;
6598     zerofirstcuflags = firstcuflags;
6599     groupsetfirstcu = FALSE;
6600 
6601     if (bravalue >= OP_ONCE)  /* Not an assertion */
6602       {
6603       /* If we have not yet set a firstcu in this branch, take it from the
6604       subpattern, remembering that it was set here so that a repeat of more
6605       than one can replicate it as reqcu if necessary. If the subpattern has
6606       no firstcu, set "none" for the whole branch. In both cases, a zero
6607       repeat forces firstcu to "none". */
6608 
6609       if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
6610         {
6611         if (subfirstcuflags >= 0)
6612           {
6613           firstcu = subfirstcu;
6614           firstcuflags = subfirstcuflags;
6615           groupsetfirstcu = TRUE;
6616           }
6617         else firstcuflags = REQ_NONE;
6618         zerofirstcuflags = REQ_NONE;
6619         }
6620 
6621       /* If firstcu was previously set, convert the subpattern's firstcu
6622       into reqcu if there wasn't one, using the vary flag that was in
6623       existence beforehand. */
6624 
6625       else if (subfirstcuflags >= 0 && subreqcuflags < 0)
6626         {
6627         subreqcu = subfirstcu;
6628         subreqcuflags = subfirstcuflags | tempreqvary;
6629         }
6630 
6631       /* If the subpattern set a required code unit (or set a first code unit
6632       that isn't really the first code unit - see above), set it. */
6633 
6634       if (subreqcuflags >= 0)
6635         {
6636         reqcu = subreqcu;
6637         reqcuflags = subreqcuflags;
6638         }
6639       }
6640 
6641     /* For a forward assertion, we take the reqcu, if set, provided that the
6642     group has also set a firstcu. This can be helpful if the pattern that
6643     follows the assertion doesn't set a different char. For example, it's
6644     useful for /(?=abcde).+/. We can't set firstcu for an assertion, however
6645     because it leads to incorrect effect for patterns such as /(?=a)a.+/ when
6646     the "real" "a" would then become a reqcu instead of a firstcu. This is
6647     overcome by a scan at the end if there's no firstcu, looking for an
6648     asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
6649     we must only take the reqcu when the group also set a firstcu. Otherwise,
6650     in that example, 'X' ends up set for both. */
6651 
6652     else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) &&
6653              subreqcuflags >= 0 && subfirstcuflags >= 0)
6654       {
6655       reqcu = subreqcu;
6656       reqcuflags = subreqcuflags;
6657       }
6658 
6659     break;  /* End of nested group handling */
6660 
6661 
6662     /* ===================================================================*/
6663     /* Handle named backreferences and recursions. */
6664 
6665     case META_BACKREF_BYNAME:
6666     case META_RECURSE_BYNAME:
6667       {
6668       int count, index;
6669       PCRE2_SPTR name;
6670       BOOL is_dupname = FALSE;
6671       named_group *ng = cb->named_groups;
6672       uint32_t length = *(++pptr);
6673 
6674       GETPLUSOFFSET(offset, pptr);
6675       name = cb->start_pattern + offset;
6676 
6677       /* In the first pass, the names generated in the pre-pass are available,
6678       but the main name table has not yet been created. Scan the list of names
6679       generated in the pre-pass in order to get a number and whether or not
6680       this name is duplicated. */
6681 
6682       groupnumber = 0;
6683       for (i = 0; i < cb->names_found; i++, ng++)
6684         {
6685         if (length == ng->length &&
6686             PRIV(strncmp)(name, ng->name, length) == 0)
6687           {
6688           is_dupname = ng->isdup;
6689           groupnumber = ng->number;
6690 
6691           /* For a recursion, that's all that is needed. We can now go to
6692           the code that handles numerical recursion, applying it to the first
6693           group with the given name. */
6694 
6695           if (meta == META_RECURSE_BYNAME)
6696             {
6697             meta_arg = groupnumber;
6698             goto HANDLE_NUMERICAL_RECURSION;
6699             }
6700 
6701           /* For a back reference, update the back reference map and the
6702           maximum back reference. */
6703 
6704           cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
6705           if (groupnumber > cb->top_backref)
6706             cb->top_backref = groupnumber;
6707           }
6708         }
6709 
6710       /* If the name was not found we have a bad reference. */
6711 
6712       if (groupnumber == 0)
6713         {
6714         *errorcodeptr = ERR15;
6715         cb->erroroffset = offset;
6716         return 0;
6717         }
6718 
6719       /* If a back reference name is not duplicated, we can handle it as
6720       a numerical reference. */
6721 
6722       if (!is_dupname)
6723         {
6724         meta_arg = groupnumber;
6725         goto HANDLE_SINGLE_REFERENCE;
6726         }
6727 
6728       /* If a back reference name is duplicated, we generate a different
6729       opcode to a numerical back reference. In the second pass we must
6730       search for the index and count in the final name table. */
6731 
6732       count = 0;  /* Values for first pass (avoids compiler warning) */
6733       index = 0;
6734       if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6735             &count, errorcodeptr, cb)) return 0;
6736 
6737       if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6738       *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
6739       PUT2INC(code, 0, index);
6740       PUT2INC(code, 0, count);
6741       }
6742     break;
6743 
6744 
6745     /* ===================================================================*/
6746     /* Handle a numerical callout. */
6747 
6748     case META_CALLOUT_NUMBER:
6749     code[0] = OP_CALLOUT;
6750     PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
6751     PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
6752     code[1 + 2*LINK_SIZE] = pptr[3];
6753     pptr += 3;
6754     code += PRIV(OP_lengths)[OP_CALLOUT];
6755     break;
6756 
6757 
6758     /* ===================================================================*/
6759     /* Handle a callout with a string argument. In the pre-pass we just compute
6760     the length without generating anything. The length in pptr[3] includes both
6761     delimiters; in the actual compile only the first one is copied, but a
6762     terminating zero is added. Any doubled delimiters within the string make
6763     this an overestimate, but it is not worth bothering about. */
6764 
6765     case META_CALLOUT_STRING:
6766     if (lengthptr != NULL)
6767       {
6768       *lengthptr += pptr[3] + (1 + 4*LINK_SIZE);
6769       pptr += 3;
6770       SKIPOFFSET(pptr);
6771       }
6772 
6773     /* In the real compile we can copy the string. The starting delimiter is
6774      included so that the client can discover it if they want. We also pass the
6775      start offset to help a script language give better error messages. */
6776 
6777     else
6778       {
6779       PCRE2_SPTR pp;
6780       uint32_t delimiter;
6781       uint32_t length = pptr[3];
6782       PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
6783 
6784       code[0] = OP_CALLOUT_STR;
6785       PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
6786       PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
6787 
6788       pptr += 3;
6789       GETPLUSOFFSET(offset, pptr);         /* Offset to string in pattern */
6790       pp = cb->start_pattern + offset;
6791       delimiter = *callout_string++ = *pp++;
6792       if (delimiter == CHAR_LEFT_CURLY_BRACKET)
6793         delimiter = CHAR_RIGHT_CURLY_BRACKET;
6794       PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1));  /* One after delimiter */
6795 
6796       /* The syntax of the pattern was checked in the parsing scan. The length
6797       includes both delimiters, but we have passed the opening one just above,
6798       so we reduce length before testing it. The test is for > 1 because we do
6799       not want to copy the final delimiter. This also ensures that pp[1] is
6800       accessible. */
6801 
6802       while (--length > 1)
6803         {
6804         if (*pp == delimiter && pp[1] == delimiter)
6805           {
6806           *callout_string++ = delimiter;
6807           pp += 2;
6808           length--;
6809           }
6810         else *callout_string++ = *pp++;
6811         }
6812       *callout_string++ = CHAR_NUL;
6813 
6814       /* Set the length of the entire item, then advance to its end. */
6815 
6816       PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code));
6817       code = callout_string;
6818       }
6819     break;
6820 
6821 
6822     /* ===================================================================*/
6823     /* Handle repetition. The different types are all sorted out in the parsing
6824     pass. */
6825 
6826     case META_MINMAX_PLUS:
6827     case META_MINMAX_QUERY:
6828     case META_MINMAX:
6829     repeat_min = *(++pptr);
6830     repeat_max = *(++pptr);
6831     goto REPEAT;
6832 
6833     case META_ASTERISK:
6834     case META_ASTERISK_PLUS:
6835     case META_ASTERISK_QUERY:
6836     repeat_min = 0;
6837     repeat_max = REPEAT_UNLIMITED;
6838     goto REPEAT;
6839 
6840     case META_PLUS:
6841     case META_PLUS_PLUS:
6842     case META_PLUS_QUERY:
6843     repeat_min = 1;
6844     repeat_max = REPEAT_UNLIMITED;
6845     goto REPEAT;
6846 
6847     case META_QUERY:
6848     case META_QUERY_PLUS:
6849     case META_QUERY_QUERY:
6850     repeat_min = 0;
6851     repeat_max = 1;
6852 
6853     REPEAT:
6854     if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
6855 
6856     /* Remember whether this is a variable length repeat, and default to
6857     single-char opcodes. */
6858 
6859     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
6860     op_type = 0;
6861 
6862     /* Adjust first and required code units for a zero repeat. */
6863 
6864     if (repeat_min == 0)
6865       {
6866       firstcu = zerofirstcu;
6867       firstcuflags = zerofirstcuflags;
6868       reqcu = zeroreqcu;
6869       reqcuflags = zeroreqcuflags;
6870       }
6871 
6872     /* Note the greediness and possessiveness. */
6873 
6874     switch (meta)
6875       {
6876       case META_MINMAX_PLUS:
6877       case META_ASTERISK_PLUS:
6878       case META_PLUS_PLUS:
6879       case META_QUERY_PLUS:
6880       repeat_type = 0;                  /* Force greedy */
6881       possessive_quantifier = TRUE;
6882       break;
6883 
6884       case META_MINMAX_QUERY:
6885       case META_ASTERISK_QUERY:
6886       case META_PLUS_QUERY:
6887       case META_QUERY_QUERY:
6888       repeat_type = greedy_non_default;
6889       possessive_quantifier = FALSE;
6890       break;
6891 
6892       default:
6893       repeat_type = greedy_default;
6894       possessive_quantifier = FALSE;
6895       break;
6896       }
6897 
6898     /* Save start of previous item, in case we have to move it up in order to
6899     insert something before it, and remember what it was. */
6900 
6901     tempcode = previous;
6902     op_previous = *previous;
6903 
6904     /* Now handle repetition for the different types of item. If the repeat
6905     minimum and the repeat maximum are both 1, we can ignore the quantifier for
6906     non-parenthesized items, as they have only one alternative. For anything in
6907     parentheses, we must not ignore if {1} is possessive. */
6908 
6909     switch (op_previous)
6910       {
6911       /* If previous was a character or negated character match, abolish the
6912       item and generate a repeat item instead. If a char item has a minimum of
6913       more than one, ensure that it is set in reqcu - it might not be if a
6914       sequence such as x{3} is the first thing in a branch because the x will
6915       have gone into firstcu instead.  */
6916 
6917       case OP_CHAR:
6918       case OP_CHARI:
6919       case OP_NOT:
6920       case OP_NOTI:
6921       if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
6922       op_type = chartypeoffset[op_previous - OP_CHAR];
6923 
6924       /* Deal with UTF characters that take up more than one code unit. */
6925 
6926 #ifdef MAYBE_UTF_MULTI
6927       if (utf && NOT_FIRSTCU(code[-1]))
6928         {
6929         PCRE2_UCHAR *lastchar = code - 1;
6930         BACKCHAR(lastchar);
6931         mclength = (uint32_t)(code - lastchar);   /* Length of UTF character */
6932         memcpy(mcbuffer, lastchar, CU2BYTES(mclength));  /* Save the char */
6933         }
6934       else
6935 #endif  /* MAYBE_UTF_MULTI */
6936 
6937       /* Handle the case of a single code unit - either with no UTF support, or
6938       with UTF disabled, or for a single-code-unit UTF character. */
6939         {
6940         mcbuffer[0] = code[-1];
6941         mclength = 1;
6942         if (op_previous <= OP_CHARI && repeat_min > 1)
6943           {
6944           reqcu = mcbuffer[0];
6945           reqcuflags = req_caseopt | cb->req_varyopt;
6946           }
6947         }
6948       goto OUTPUT_SINGLE_REPEAT;  /* Code shared with single character types */
6949 
6950       /* If previous was a character class or a back reference, we put the
6951       repeat stuff after it, but just skip the item if the repeat was {0,0}. */
6952 
6953 #ifdef SUPPORT_WIDE_CHARS
6954       case OP_XCLASS:
6955 #endif
6956       case OP_CLASS:
6957       case OP_NCLASS:
6958       case OP_REF:
6959       case OP_REFI:
6960       case OP_DNREF:
6961       case OP_DNREFI:
6962 
6963       if (repeat_max == 0)
6964         {
6965         code = previous;
6966         goto END_REPEAT;
6967         }
6968       if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
6969 
6970       if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)
6971         *code++ = OP_CRSTAR + repeat_type;
6972       else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED)
6973         *code++ = OP_CRPLUS + repeat_type;
6974       else if (repeat_min == 0 && repeat_max == 1)
6975         *code++ = OP_CRQUERY + repeat_type;
6976       else
6977         {
6978         *code++ = OP_CRRANGE + repeat_type;
6979         PUT2INC(code, 0, repeat_min);
6980         if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0;  /* 2-byte encoding for max */
6981         PUT2INC(code, 0, repeat_max);
6982         }
6983       break;
6984 
6985       /* If previous is OP_FAIL, it was generated by an empty class []
6986       (PCRE2_ALLOW_EMPTY_CLASS is set). The other ways in which OP_FAIL can be
6987       generated, that is by (*FAIL) or (?!), disallow a quantifier at parse
6988       time. We can just ignore this repeat. */
6989 
6990       case OP_FAIL:
6991       goto END_REPEAT;
6992 
6993       /* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets
6994       because pcre2_match() could not handle backtracking into recursively
6995       called groups. Now that this backtracking is available, we no longer need
6996       to do this. However, we still need to replicate recursions as we do for
6997       groups so as to have independent backtracking points. We can replicate
6998       for the minimum number of repeats directly. For optional repeats we now
6999       wrap the recursion in OP_BRA brackets and make use of the bracket
7000       repetition. */
7001 
7002       case OP_RECURSE:
7003       if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7004         goto END_REPEAT;
7005 
7006       /* Generate unwrapped repeats for a non-zero minimum, except when the
7007       minimum is 1 and the maximum unlimited, because that can be handled with
7008       OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the
7009       minimum, we just need to generate the appropriate additional copies.
7010       Otherwise we need to generate one more, to simulate the situation when
7011       the minimum is zero. */
7012 
7013       if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED))
7014         {
7015         int replicate = repeat_min;
7016         if (repeat_min == repeat_max) replicate--;
7017 
7018         /* In the pre-compile phase, we don't actually do the replication. We
7019         just adjust the length as if we had. Do some paranoid checks for
7020         potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
7021         integer type when available, otherwise double. */
7022 
7023         if (lengthptr != NULL)
7024           {
7025           PCRE2_SIZE delta = replicate*(1 + LINK_SIZE);
7026           if ((INT64_OR_DOUBLE)replicate*
7027                 (INT64_OR_DOUBLE)(1 + LINK_SIZE) >
7028                   (INT64_OR_DOUBLE)INT_MAX ||
7029               OFLOW_MAX - *lengthptr < delta)
7030             {
7031             *errorcodeptr = ERR20;
7032             return 0;
7033             }
7034           *lengthptr += delta;
7035           }
7036 
7037         else for (i = 0; i < replicate; i++)
7038           {
7039           memcpy(code, previous, CU2BYTES(1 + LINK_SIZE));
7040           previous = code;
7041           code += 1 + LINK_SIZE;
7042           }
7043 
7044         /* If the number of repeats is fixed, we are done. Otherwise, adjust
7045         the counts and fall through. */
7046 
7047         if (repeat_min == repeat_max) break;
7048         if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7049         repeat_min = 0;
7050         }
7051 
7052       /* Wrap the recursion call in OP_BRA brackets. */
7053 
7054       (void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE));
7055       op_previous = *previous = OP_BRA;
7056       PUT(previous, 1, 2 + 2*LINK_SIZE);
7057       previous[2 + 2*LINK_SIZE] = OP_KET;
7058       PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
7059       code += 2 + 2 * LINK_SIZE;
7060       length_prevgroup = 3 + 3*LINK_SIZE;
7061       group_return = -1;  /* Set "may match empty string" */
7062 
7063       /* Now treat as a repeated OP_BRA. */
7064       /* Fall through */
7065 
7066       /* If previous was a bracket group, we may have to replicate it in
7067       certain cases. Note that at this point we can encounter only the "basic"
7068       bracket opcodes such as BRA and CBRA, as this is the place where they get
7069       converted into the more special varieties such as BRAPOS and SBRA.
7070       Originally, PCRE did not allow repetition of assertions, but now it does,
7071       for Perl compatibility. */
7072 
7073       case OP_ASSERT:
7074       case OP_ASSERT_NOT:
7075       case OP_ASSERT_NA:
7076       case OP_ASSERTBACK:
7077       case OP_ASSERTBACK_NOT:
7078       case OP_ASSERTBACK_NA:
7079       case OP_ONCE:
7080       case OP_SCRIPT_RUN:
7081       case OP_BRA:
7082       case OP_CBRA:
7083       case OP_COND:
7084         {
7085         int len = (int)(code - previous);
7086         PCRE2_UCHAR *bralink = NULL;
7087         PCRE2_UCHAR *brazeroptr = NULL;
7088 
7089         if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7090           goto END_REPEAT;
7091 
7092         /* Repeating a DEFINE group (or any group where the condition is always
7093         FALSE and there is only one branch) is pointless, but Perl allows the
7094         syntax, so we just ignore the repeat. */
7095 
7096         if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
7097             previous[GET(previous, 1)] != OP_ALT)
7098           goto END_REPEAT;
7099 
7100         /* Perl allows all assertions to be quantified, and when they contain
7101         capturing parentheses and/or are optional there are potential uses for
7102         this feature. PCRE2 used to force the maximum quantifier to 1 on the
7103         invalid grounds that further repetition was never useful. This was
7104         always a bit pointless, since an assertion could be wrapped with a
7105         repeated group to achieve the effect. General repetition is now
7106         permitted, but if the maximum is unlimited it is set to one more than
7107         the minimum. */
7108 
7109         if (op_previous < OP_ONCE)    /* Assertion */
7110           {
7111           if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1;
7112           }
7113 
7114         /* The case of a zero minimum is special because of the need to stick
7115         OP_BRAZERO in front of it, and because the group appears once in the
7116         data, whereas in other cases it appears the minimum number of times. For
7117         this reason, it is simplest to treat this case separately, as otherwise
7118         the code gets far too messy. There are several special subcases when the
7119         minimum is zero. */
7120 
7121         if (repeat_min == 0)
7122           {
7123           /* If the maximum is also zero, we used to just omit the group from
7124           the output altogether, like this:
7125 
7126           ** if (repeat_max == 0)
7127           **   {
7128           **   code = previous;
7129           **   goto END_REPEAT;
7130           **   }
7131 
7132           However, that fails when a group or a subgroup within it is
7133           referenced as a subroutine from elsewhere in the pattern, so now we
7134           stick in OP_SKIPZERO in front of it so that it is skipped on
7135           execution. As we don't have a list of which groups are referenced, we
7136           cannot do this selectively.
7137 
7138           If the maximum is 1 or unlimited, we just have to stick in the
7139           BRAZERO and do no more at this point. */
7140 
7141           if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED)
7142             {
7143             (void)memmove(previous + 1, previous, CU2BYTES(len));
7144             code++;
7145             if (repeat_max == 0)
7146               {
7147               *previous++ = OP_SKIPZERO;
7148               goto END_REPEAT;
7149               }
7150             brazeroptr = previous;    /* Save for possessive optimizing */
7151             *previous++ = OP_BRAZERO + repeat_type;
7152             }
7153 
7154           /* If the maximum is greater than 1 and limited, we have to replicate
7155           in a nested fashion, sticking OP_BRAZERO before each set of brackets.
7156           The first one has to be handled carefully because it's the original
7157           copy, which has to be moved up. The remainder can be handled by code
7158           that is common with the non-zero minimum case below. We have to
7159           adjust the value of repeat_max, since one less copy is required. */
7160 
7161           else
7162             {
7163             int linkoffset;
7164             (void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
7165             code += 2 + LINK_SIZE;
7166             *previous++ = OP_BRAZERO + repeat_type;
7167             *previous++ = OP_BRA;
7168 
7169             /* We chain together the bracket link offset fields that have to be
7170             filled in later when the ends of the brackets are reached. */
7171 
7172             linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink);
7173             bralink = previous;
7174             PUTINC(previous, 0, linkoffset);
7175             }
7176 
7177           if (repeat_max != REPEAT_UNLIMITED) repeat_max--;
7178           }
7179 
7180         /* If the minimum is greater than zero, replicate the group as many
7181         times as necessary, and adjust the maximum to the number of subsequent
7182         copies that we need. */
7183 
7184         else
7185           {
7186           if (repeat_min > 1)
7187             {
7188             /* In the pre-compile phase, we don't actually do the replication.
7189             We just adjust the length as if we had. Do some paranoid checks for
7190             potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
7191             integer type when available, otherwise double. */
7192 
7193             if (lengthptr != NULL)
7194               {
7195               PCRE2_SIZE delta = (repeat_min - 1)*length_prevgroup;
7196               if ((INT64_OR_DOUBLE)(repeat_min - 1)*
7197                     (INT64_OR_DOUBLE)length_prevgroup >
7198                       (INT64_OR_DOUBLE)INT_MAX ||
7199                   OFLOW_MAX - *lengthptr < delta)
7200                 {
7201                 *errorcodeptr = ERR20;
7202                 return 0;
7203                 }
7204               *lengthptr += delta;
7205               }
7206 
7207             /* This is compiling for real. If there is a set first code unit
7208             for the group, and we have not yet set a "required code unit", set
7209             it. */
7210 
7211             else
7212               {
7213               if (groupsetfirstcu && reqcuflags < 0)
7214                 {
7215                 reqcu = firstcu;
7216                 reqcuflags = firstcuflags;
7217                 }
7218               for (i = 1; (uint32_t)i < repeat_min; i++)
7219                 {
7220                 memcpy(code, previous, CU2BYTES(len));
7221                 code += len;
7222                 }
7223               }
7224             }
7225 
7226           if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7227           }
7228 
7229         /* This code is common to both the zero and non-zero minimum cases. If
7230         the maximum is limited, it replicates the group in a nested fashion,
7231         remembering the bracket starts on a stack. In the case of a zero
7232         minimum, the first one was set up above. In all cases the repeat_max
7233         now specifies the number of additional copies needed. Again, we must
7234         remember to replicate entries on the forward reference list. */
7235 
7236         if (repeat_max != REPEAT_UNLIMITED)
7237           {
7238           /* In the pre-compile phase, we don't actually do the replication. We
7239           just adjust the length as if we had. For each repetition we must add
7240           1 to the length for BRAZERO and for all but the last repetition we
7241           must add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
7242           paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type
7243           is a 64-bit integer type when available, otherwise double. */
7244 
7245           if (lengthptr != NULL && repeat_max > 0)
7246             {
7247             PCRE2_SIZE delta = repeat_max*(length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
7248                         2 - 2*LINK_SIZE;   /* Last one doesn't nest */
7249             if ((INT64_OR_DOUBLE)repeat_max *
7250                   (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
7251                     > (INT64_OR_DOUBLE)INT_MAX ||
7252                 OFLOW_MAX - *lengthptr < delta)
7253               {
7254               *errorcodeptr = ERR20;
7255               return 0;
7256               }
7257             *lengthptr += delta;
7258             }
7259 
7260           /* This is compiling for real */
7261 
7262           else for (i = repeat_max - 1; i >= 0; i--)
7263             {
7264             *code++ = OP_BRAZERO + repeat_type;
7265 
7266             /* All but the final copy start a new nesting, maintaining the
7267             chain of brackets outstanding. */
7268 
7269             if (i != 0)
7270               {
7271               int linkoffset;
7272               *code++ = OP_BRA;
7273               linkoffset = (bralink == NULL)? 0 : (int)(code - bralink);
7274               bralink = code;
7275               PUTINC(code, 0, linkoffset);
7276               }
7277 
7278             memcpy(code, previous, CU2BYTES(len));
7279             code += len;
7280             }
7281 
7282           /* Now chain through the pending brackets, and fill in their length
7283           fields (which are holding the chain links pro tem). */
7284 
7285           while (bralink != NULL)
7286             {
7287             int oldlinkoffset;
7288             int linkoffset = (int)(code - bralink + 1);
7289             PCRE2_UCHAR *bra = code - linkoffset;
7290             oldlinkoffset = GET(bra, 1);
7291             bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
7292             *code++ = OP_KET;
7293             PUTINC(code, 0, linkoffset);
7294             PUT(bra, 1, linkoffset);
7295             }
7296           }
7297 
7298         /* If the maximum is unlimited, set a repeater in the final copy. For
7299         SCRIPT_RUN and ONCE brackets, that's all we need to do. However,
7300         possessively repeated ONCE brackets can be converted into non-capturing
7301         brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this
7302         saves having to deal with possessive ONCEs specially.
7303 
7304         Otherwise, when we are doing the actual compile phase, check to see
7305         whether this group is one that could match an empty string. If so,
7306         convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
7307         that runtime checking can be done. [This check is also applied to ONCE
7308         and SCRIPT_RUN groups at runtime, but in a different way.]
7309 
7310         Then, if the quantifier was possessive and the bracket is not a
7311         conditional, we convert the BRA code to the POS form, and the KET code
7312         to KETRPOS. (It turns out to be convenient at runtime to detect this
7313         kind of subpattern at both the start and at the end.) The use of
7314         special opcodes makes it possible to reduce greatly the stack usage in
7315         pcre2_match(). If the group is preceded by OP_BRAZERO, convert this to
7316         OP_BRAPOSZERO.
7317 
7318         Then, if the minimum number of matches is 1 or 0, cancel the possessive
7319         flag so that the default action below, of wrapping everything inside
7320         atomic brackets, does not happen. When the minimum is greater than 1,
7321         there will be earlier copies of the group, and so we still have to wrap
7322         the whole thing. */
7323 
7324         else
7325           {
7326           PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
7327           PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
7328 
7329           /* Convert possessive ONCE brackets to non-capturing */
7330 
7331           if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
7332 
7333           /* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need
7334           to do is to set the KET. */
7335 
7336           if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN)
7337             *ketcode = OP_KETRMAX + repeat_type;
7338 
7339           /* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs
7340           (which have been converted to non-capturing above). */
7341 
7342           else
7343             {
7344             /* In the compile phase, adjust the opcode if the group can match
7345             an empty string. For a conditional group with only one branch, the
7346             value of group_return will not show "could be empty", so we must
7347             check that separately. */
7348 
7349             if (lengthptr == NULL)
7350               {
7351               if (group_return < 0) *bracode += OP_SBRA - OP_BRA;
7352               if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
7353                 *bracode = OP_SCOND;
7354               }
7355 
7356             /* Handle possessive quantifiers. */
7357 
7358             if (possessive_quantifier)
7359               {
7360               /* For COND brackets, we wrap the whole thing in a possessively
7361               repeated non-capturing bracket, because we have not invented POS
7362               versions of the COND opcodes. */
7363 
7364               if (*bracode == OP_COND || *bracode == OP_SCOND)
7365                 {
7366                 int nlen = (int)(code - bracode);
7367                 (void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
7368                 code += 1 + LINK_SIZE;
7369                 nlen += 1 + LINK_SIZE;
7370                 *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
7371                 *code++ = OP_KETRPOS;
7372                 PUTINC(code, 0, nlen);
7373                 PUT(bracode, 1, nlen);
7374                 }
7375 
7376               /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
7377 
7378               else
7379                 {
7380                 *bracode += 1;              /* Switch to xxxPOS opcodes */
7381                 *ketcode = OP_KETRPOS;
7382                 }
7383 
7384               /* If the minimum is zero, mark it as possessive, then unset the
7385               possessive flag when the minimum is 0 or 1. */
7386 
7387               if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
7388               if (repeat_min < 2) possessive_quantifier = FALSE;
7389               }
7390 
7391             /* Non-possessive quantifier */
7392 
7393             else *ketcode = OP_KETRMAX + repeat_type;
7394             }
7395           }
7396         }
7397       break;
7398 
7399       /* If previous was a character type match (\d or similar), abolish it and
7400       create a suitable repeat item. The code is shared with single-character
7401       repeats by setting op_type to add a suitable offset into repeat_type.
7402       Note that the Unicode property types will be present only when
7403       SUPPORT_UNICODE is defined, but we don't wrap the little bits of code
7404       here because it just makes it horribly messy. */
7405 
7406       default:
7407       if (op_previous >= OP_EODN)   /* Not a character type - internal error */
7408         {
7409         *errorcodeptr = ERR10;
7410         return 0;
7411         }
7412       else
7413         {
7414         int prop_type, prop_value;
7415         PCRE2_UCHAR *oldcode;
7416 
7417         if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7418 
7419         op_type = OP_TYPESTAR - OP_STAR;      /* Use type opcodes */
7420         mclength = 0;                         /* Not a character */
7421 
7422         if (op_previous == OP_PROP || op_previous == OP_NOTPROP)
7423           {
7424           prop_type = previous[1];
7425           prop_value = previous[2];
7426           }
7427         else
7428           {
7429           /* Come here from just above with a character in mcbuffer/mclength. */
7430           OUTPUT_SINGLE_REPEAT:
7431           prop_type = prop_value = -1;
7432           }
7433 
7434         /* At this point, if prop_type == prop_value == -1 we either have a
7435         character in mcbuffer when mclength is greater than zero, or we have
7436         mclength zero, in which case there is a non-property character type in
7437         op_previous. If prop_type/value are not negative, we have a property
7438         character type in op_previous. */
7439 
7440         oldcode = code;                   /* Save where we were */
7441         code = previous;                  /* Usually overwrite previous item */
7442 
7443         /* If the maximum is zero then the minimum must also be zero; Perl allows
7444         this case, so we do too - by simply omitting the item altogether. */
7445 
7446         if (repeat_max == 0) goto END_REPEAT;
7447 
7448         /* Combine the op_type with the repeat_type */
7449 
7450         repeat_type += op_type;
7451 
7452         /* A minimum of zero is handled either as the special case * or ?, or as
7453         an UPTO, with the maximum given. */
7454 
7455         if (repeat_min == 0)
7456           {
7457           if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;
7458             else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
7459           else
7460             {
7461             *code++ = OP_UPTO + repeat_type;
7462             PUT2INC(code, 0, repeat_max);
7463             }
7464           }
7465 
7466         /* A repeat minimum of 1 is optimized into some special cases. If the
7467         maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
7468         left in place and, if the maximum is greater than 1, we use OP_UPTO with
7469         one less than the maximum. */
7470 
7471         else if (repeat_min == 1)
7472           {
7473           if (repeat_max == REPEAT_UNLIMITED)
7474             *code++ = OP_PLUS + repeat_type;
7475           else
7476             {
7477             code = oldcode;  /* Leave previous item in place */
7478             if (repeat_max == 1) goto END_REPEAT;
7479             *code++ = OP_UPTO + repeat_type;
7480             PUT2INC(code, 0, repeat_max - 1);
7481             }
7482           }
7483 
7484         /* The case {n,n} is just an EXACT, while the general case {n,m} is
7485         handled as an EXACT followed by an UPTO or STAR or QUERY. */
7486 
7487         else
7488           {
7489           *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
7490           PUT2INC(code, 0, repeat_min);
7491 
7492           /* Unless repeat_max equals repeat_min, fill in the data for EXACT,
7493           and then generate the second opcode. For a repeated Unicode property
7494           match, there are two extra values that define the required property,
7495           and mclength is set zero to indicate this. */
7496 
7497           if (repeat_max != repeat_min)
7498             {
7499             if (mclength > 0)
7500               {
7501               memcpy(code, mcbuffer, CU2BYTES(mclength));
7502               code += mclength;
7503               }
7504             else
7505               {
7506               *code++ = op_previous;
7507               if (prop_type >= 0)
7508                 {
7509                 *code++ = prop_type;
7510                 *code++ = prop_value;
7511                 }
7512               }
7513 
7514             /* Now set up the following opcode */
7515 
7516             if (repeat_max == REPEAT_UNLIMITED)
7517               *code++ = OP_STAR + repeat_type;
7518             else
7519               {
7520               repeat_max -= repeat_min;
7521               if (repeat_max == 1)
7522                 {
7523                 *code++ = OP_QUERY + repeat_type;
7524                 }
7525               else
7526                 {
7527                 *code++ = OP_UPTO + repeat_type;
7528                 PUT2INC(code, 0, repeat_max);
7529                 }
7530               }
7531             }
7532           }
7533 
7534         /* Fill in the character or character type for the final opcode. */
7535 
7536         if (mclength > 0)
7537           {
7538           memcpy(code, mcbuffer, CU2BYTES(mclength));
7539           code += mclength;
7540           }
7541         else
7542           {
7543           *code++ = op_previous;
7544           if (prop_type >= 0)
7545             {
7546             *code++ = prop_type;
7547             *code++ = prop_value;
7548             }
7549           }
7550         }
7551       break;
7552       }  /* End of switch on different op_previous values */
7553 
7554 
7555     /* If the character following a repeat is '+', possessive_quantifier is
7556     TRUE. For some opcodes, there are special alternative opcodes for this
7557     case. For anything else, we wrap the entire repeated item inside OP_ONCE
7558     brackets. Logically, the '+' notation is just syntactic sugar, taken from
7559     Sun's Java package, but the special opcodes can optimize it.
7560 
7561     Some (but not all) possessively repeated subpatterns have already been
7562     completely handled in the code just above. For them, possessive_quantifier
7563     is always FALSE at this stage. Note that the repeated item starts at
7564     tempcode, not at previous, which might be the first part of a string whose
7565     (former) last char we repeated. */
7566 
7567     if (possessive_quantifier)
7568       {
7569       int len;
7570 
7571       /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
7572       However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
7573       {5,}, or {5,10}). We skip over an EXACT item; if the length of what
7574       remains is greater than zero, there's a further opcode that can be
7575       handled. If not, do nothing, leaving the EXACT alone. */
7576 
7577       switch(*tempcode)
7578         {
7579         case OP_TYPEEXACT:
7580         tempcode += PRIV(OP_lengths)[*tempcode] +
7581           ((tempcode[1 + IMM2_SIZE] == OP_PROP
7582           || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
7583         break;
7584 
7585         /* CHAR opcodes are used for exacts whose count is 1. */
7586 
7587         case OP_CHAR:
7588         case OP_CHARI:
7589         case OP_NOT:
7590         case OP_NOTI:
7591         case OP_EXACT:
7592         case OP_EXACTI:
7593         case OP_NOTEXACT:
7594         case OP_NOTEXACTI:
7595         tempcode += PRIV(OP_lengths)[*tempcode];
7596 #ifdef SUPPORT_UNICODE
7597         if (utf && HAS_EXTRALEN(tempcode[-1]))
7598           tempcode += GET_EXTRALEN(tempcode[-1]);
7599 #endif
7600         break;
7601 
7602         /* For the class opcodes, the repeat operator appears at the end;
7603         adjust tempcode to point to it. */
7604 
7605         case OP_CLASS:
7606         case OP_NCLASS:
7607         tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
7608         break;
7609 
7610 #ifdef SUPPORT_WIDE_CHARS
7611         case OP_XCLASS:
7612         tempcode += GET(tempcode, 1);
7613         break;
7614 #endif
7615         }
7616 
7617       /* If tempcode is equal to code (which points to the end of the repeated
7618       item), it means we have skipped an EXACT item but there is no following
7619       QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
7620       all other cases, tempcode will be pointing to the repeat opcode, and will
7621       be less than code, so the value of len will be greater than 0. */
7622 
7623       len = (int)(code - tempcode);
7624       if (len > 0)
7625         {
7626         unsigned int repcode = *tempcode;
7627 
7628         /* There is a table for possessifying opcodes, all of which are less
7629         than OP_CALLOUT. A zero entry means there is no possessified version.
7630         */
7631 
7632         if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
7633           *tempcode = opcode_possessify[repcode];
7634 
7635         /* For opcode without a special possessified version, wrap the item in
7636         ONCE brackets. */
7637 
7638         else
7639           {
7640           (void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
7641           code += 1 + LINK_SIZE;
7642           len += 1 + LINK_SIZE;
7643           tempcode[0] = OP_ONCE;
7644           *code++ = OP_KET;
7645           PUTINC(code, 0, len);
7646           PUT(tempcode, 1, len);
7647           }
7648         }
7649       }
7650 
7651     /* We set the "follows varying string" flag for subsequently encountered
7652     reqcus if it isn't already set and we have just passed a varying length
7653     item. */
7654 
7655     END_REPEAT:
7656     cb->req_varyopt |= reqvary;
7657     break;
7658 
7659 
7660     /* ===================================================================*/
7661     /* Handle a 32-bit data character with a value greater than META_END. */
7662 
7663     case META_BIGVALUE:
7664     pptr++;
7665     goto NORMAL_CHAR;
7666 
7667 
7668     /* ===============================================================*/
7669     /* Handle a back reference by number, which is the meta argument. The
7670     pattern offsets for back references to group numbers less than 10 are held
7671     in a special vector, to avoid using more than two parsed pattern elements
7672     in 64-bit environments. We only need the offset to the first occurrence,
7673     because if that doesn't fail, subsequent ones will also be OK. */
7674 
7675     case META_BACKREF:
7676     if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg];
7677       else GETPLUSOFFSET(offset, pptr);
7678 
7679     if (meta_arg > cb->bracount)
7680       {
7681       cb->erroroffset = offset;
7682       *errorcodeptr = ERR15;  /* Non-existent subpattern */
7683       return 0;
7684       }
7685 
7686     /* Come here from named backref handling when the reference is to a
7687     single group (that is, not to a duplicated name). The back reference
7688     data will have already been updated. We must disable firstcu if not
7689     set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'
7690     later. */
7691 
7692     HANDLE_SINGLE_REFERENCE:
7693     if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
7694     *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
7695     PUT2INC(code, 0, meta_arg);
7696 
7697     /* Update the map of back references, and keep the highest one. We
7698     could do this in parse_regex() for numerical back references, but not
7699     for named back references, because we don't know the numbers to which
7700     named back references refer. So we do it all in this function. */
7701 
7702     cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;
7703     if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
7704     break;
7705 
7706 
7707     /* ===============================================================*/
7708     /* Handle recursion by inserting the number of the called group (which is
7709     the meta argument) after OP_RECURSE. At the end of compiling the pattern is
7710     scanned and these numbers are replaced by offsets within the pattern. It is
7711     done like this to avoid problems with forward references and adjusting
7712     offsets when groups are duplicated and moved (as discovered in previous
7713     implementations). Note that a recursion does not have a set first
7714     character. */
7715 
7716     case META_RECURSE:
7717     GETPLUSOFFSET(offset, pptr);
7718     if (meta_arg > cb->bracount)
7719       {
7720       cb->erroroffset = offset;
7721       *errorcodeptr = ERR15;  /* Non-existent subpattern */
7722       return 0;
7723       }
7724     HANDLE_NUMERICAL_RECURSION:
7725     *code = OP_RECURSE;
7726     PUT(code, 1, meta_arg);
7727     code += 1 + LINK_SIZE;
7728     groupsetfirstcu = FALSE;
7729     cb->had_recurse = TRUE;
7730     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7731     zerofirstcu = firstcu;
7732     zerofirstcuflags = firstcuflags;
7733     break;
7734 
7735 
7736     /* ===============================================================*/
7737     /* Handle capturing parentheses; the number is the meta argument. */
7738 
7739     case META_CAPTURE:
7740     bravalue = OP_CBRA;
7741     skipunits = IMM2_SIZE;
7742     PUT2(code, 1+LINK_SIZE, meta_arg);
7743     cb->lastcapture = meta_arg;
7744     goto GROUP_PROCESS_NOTE_EMPTY;
7745 
7746 
7747     /* ===============================================================*/
7748     /* Handle escape sequence items. For ones like \d, the ESC_values are
7749     arranged to be the same as the corresponding OP_values in the default case
7750     when PCRE2_UCP is not set (which is the only case in which they will appear
7751     here).
7752 
7753     Note: \Q and \E are never seen here, as they were dealt with in
7754     parse_pattern(). Neither are numerical back references or recursions, which
7755     were turned into META_BACKREF or META_RECURSE items, respectively. \k and
7756     \g, when followed by names, are turned into META_BACKREF_BYNAME or
7757     META_RECURSE_BYNAME. */
7758 
7759     case META_ESCAPE:
7760 
7761     /* We can test for escape sequences that consume a character because their
7762     values lie between ESC_b and ESC_Z; this may have to change if any new ones
7763     are ever created. For these sequences, we disable the setting of a first
7764     character if it hasn't already been set. */
7765 
7766     if (meta_arg > ESC_b && meta_arg < ESC_Z)
7767       {
7768       matched_char = TRUE;
7769       if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7770       }
7771 
7772     /* Set values to reset to if this is followed by a zero repeat. */
7773 
7774     zerofirstcu = firstcu;
7775     zerofirstcuflags = firstcuflags;
7776     zeroreqcu = reqcu;
7777     zeroreqcuflags = reqcuflags;
7778 
7779     /* If Unicode is not supported, \P and \p are not allowed and are
7780     faulted at parse time, so will never appear here. */
7781 
7782 #ifdef SUPPORT_UNICODE
7783     if (meta_arg == ESC_P || meta_arg == ESC_p)
7784       {
7785       uint32_t ptype = *(++pptr) >> 16;
7786       uint32_t pdata = *pptr & 0xffff;
7787 
7788       /* The special case of \p{Any} is compiled to OP_ALLANY so as to benefit
7789       from the auto-anchoring code. */
7790 
7791       if (meta_arg == ESC_p && ptype == PT_ANY)
7792         {
7793         *code++ = OP_ALLANY;
7794         }
7795       else
7796         {
7797         *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
7798         *code++ = ptype;
7799         *code++ = pdata;
7800         }
7801       break;  /* End META_ESCAPE */
7802       }
7803 #endif
7804 
7805     /* \K is forbidden in lookarounds since 10.38 because that's what Perl has
7806     done. However, there's an option, in case anyone was relying on it. */
7807 
7808     if (cb->assert_depth > 0 && meta_arg == ESC_K &&
7809         (cb->cx->extra_options & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) == 0)
7810       {
7811       *errorcodeptr = ERR99;
7812       return 0;
7813       }
7814 
7815     /* For the rest (including \X when Unicode is supported - if not it's
7816     faulted at parse time), the OP value is the escape value when PCRE2_UCP is
7817     not set; if it is set, these escapes do not show up here because they are
7818     converted into Unicode property tests in parse_regex(). Note that \b and \B
7819     do a one-character lookbehind, and \A also behaves as if it does. */
7820 
7821     if (meta_arg == ESC_C) cb->external_flags |= PCRE2_HASBKC; /* Record */
7822     if ((meta_arg == ESC_b || meta_arg == ESC_B || meta_arg == ESC_A) &&
7823          cb->max_lookbehind == 0)
7824       cb->max_lookbehind = 1;
7825 
7826     /* In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY
7827     instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds. */
7828 
7829 #if PCRE2_CODE_UNIT_WIDTH == 32
7830     *code++ = (meta_arg == ESC_C)? OP_ALLANY : meta_arg;
7831 #else
7832     *code++ = (!utf && meta_arg == ESC_C)? OP_ALLANY : meta_arg;
7833 #endif
7834     break;  /* End META_ESCAPE */
7835 
7836 
7837     /* ===================================================================*/
7838     /* Handle an unrecognized meta value. A parsed pattern value less than
7839     META_END is a literal. Otherwise we have a problem. */
7840 
7841     default:
7842     if (meta >= META_END)
7843       {
7844 #ifdef DEBUG_SHOW_PARSED
7845       fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x\n", *pptr);
7846 #endif
7847       *errorcodeptr = ERR89;  /* Internal error - unrecognized. */
7848       return 0;
7849       }
7850 
7851     /* Handle a literal character. We come here by goto in the case of a
7852     32-bit, non-UTF character whose value is greater than META_END. */
7853 
7854     NORMAL_CHAR:
7855     meta = *pptr;     /* Get the full 32 bits */
7856     NORMAL_CHAR_SET:  /* Character is already in meta */
7857     matched_char = TRUE;
7858 
7859     /* For caseless UTF or UCP mode, check whether this character has more than
7860     one other case. If so, generate a special OP_PROP item instead of OP_CHARI.
7861     */
7862 
7863 #ifdef SUPPORT_UNICODE
7864     if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
7865       {
7866       uint32_t caseset = UCD_CASESET(meta);
7867       if (caseset != 0)
7868         {
7869         *code++ = OP_PROP;
7870         *code++ = PT_CLIST;
7871         *code++ = caseset;
7872         if (firstcuflags == REQ_UNSET)
7873           firstcuflags = zerofirstcuflags = REQ_NONE;
7874         break;  /* End handling this meta item */
7875         }
7876       }
7877 #endif
7878 
7879     /* Caseful matches, or caseless and not one of the multicase characters. We
7880     come here by goto in the case of a positive class that contains only
7881     case-partners of a character with just two cases; matched_char has already
7882     been set TRUE and options fudged if necessary. */
7883 
7884     CLASS_CASELESS_CHAR:
7885 
7886     /* Get the character's code units into mcbuffer, with the length in
7887     mclength. When not in UTF mode, the length is always 1. */
7888 
7889 #ifdef SUPPORT_UNICODE
7890     if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
7891 #endif
7892       {
7893       mclength = 1;
7894       mcbuffer[0] = meta;
7895       }
7896 
7897     /* Generate the appropriate code */
7898 
7899     *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
7900     memcpy(code, mcbuffer, CU2BYTES(mclength));
7901     code += mclength;
7902 
7903     /* Remember if \r or \n were seen */
7904 
7905     if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
7906       cb->external_flags |= PCRE2_HASCRORLF;
7907 
7908     /* Set the first and required code units appropriately. If no previous
7909     first code unit, set it from this character, but revert to none on a zero
7910     repeat. Otherwise, leave the firstcu value alone, and don't change it on
7911     a zero repeat. */
7912 
7913     if (firstcuflags == REQ_UNSET)
7914       {
7915       zerofirstcuflags = REQ_NONE;
7916       zeroreqcu = reqcu;
7917       zeroreqcuflags = reqcuflags;
7918 
7919       /* If the character is more than one code unit long, we can set a single
7920       firstcu only if it is not to be matched caselessly. Multiple possible
7921       starting code units may be picked up later in the studying code. */
7922 
7923       if (mclength == 1 || req_caseopt == 0)
7924         {
7925         firstcu = mcbuffer[0];
7926         firstcuflags = req_caseopt;
7927         if (mclength != 1)
7928           {
7929           reqcu = code[-1];
7930           reqcuflags = cb->req_varyopt;
7931           }
7932         }
7933       else firstcuflags = reqcuflags = REQ_NONE;
7934       }
7935 
7936     /* firstcu was previously set; we can set reqcu only if the length is
7937     1 or the matching is caseful. */
7938 
7939     else
7940       {
7941       zerofirstcu = firstcu;
7942       zerofirstcuflags = firstcuflags;
7943       zeroreqcu = reqcu;
7944       zeroreqcuflags = reqcuflags;
7945       if (mclength == 1 || req_caseopt == 0)
7946         {
7947         reqcu = code[-1];
7948         reqcuflags = req_caseopt | cb->req_varyopt;
7949         }
7950       }
7951 
7952     /* If caselessness was temporarily instated, reset it. */
7953 
7954     if (reset_caseful)
7955       {
7956       options &= ~PCRE2_CASELESS;
7957       req_caseopt = 0;
7958       reset_caseful = FALSE;
7959       }
7960 
7961     break;    /* End literal character handling */
7962     }         /* End of big switch */
7963   }           /* End of big loop */
7964 
7965 /* Control never reaches here. */
7966 }
7967 
7968 
7969 
7970 /*************************************************
7971 *   Compile regex: a sequence of alternatives    *
7972 *************************************************/
7973 
7974 /* On entry, pptr is pointing past the bracket meta, but on return it points to
7975 the closing bracket or META_END. The code variable is pointing at the code unit
7976 into which the BRA operator has been stored. This function is used during the
7977 pre-compile phase when we are trying to find out the amount of memory needed,
7978 as well as during the real compile phase. The value of lengthptr distinguishes
7979 the two phases.
7980 
7981 Arguments:
7982   options           option bits, including any changes for this subpattern
7983   codeptr           -> the address of the current code pointer
7984   pptrptr           -> the address of the current parsed pattern pointer
7985   errorcodeptr      -> pointer to error code variable
7986   skipunits         skip this many code units at start (for brackets and OP_COND)
7987   firstcuptr        place to put the first required code unit
7988   firstcuflagsptr   place to put the first code unit flags, or a negative number
7989   reqcuptr          place to put the last required code unit
7990   reqcuflagsptr     place to put the last required code unit flags, or a negative number
7991   bcptr             pointer to the chain of currently open branches
7992   cb                points to the data block with tables pointers etc.
7993   lengthptr         NULL during the real compile phase
7994                     points to length accumulator during pre-compile phase
7995 
7996 Returns:            0 There has been an error
7997                    +1 Success, this group must match at least one character
7998                    -1 Success, this group may match an empty string
7999 */
8000 
8001 static int
compile_regex(uint32_t options,PCRE2_UCHAR ** codeptr,uint32_t ** pptrptr,int * errorcodeptr,uint32_t skipunits,uint32_t * firstcuptr,int32_t * firstcuflagsptr,uint32_t * reqcuptr,int32_t * reqcuflagsptr,branch_chain * bcptr,compile_block * cb,PCRE2_SIZE * lengthptr)8002 compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
8003   int *errorcodeptr, uint32_t skipunits, uint32_t *firstcuptr,
8004   int32_t *firstcuflagsptr, uint32_t *reqcuptr,int32_t *reqcuflagsptr,
8005   branch_chain *bcptr, compile_block *cb, PCRE2_SIZE *lengthptr)
8006 {
8007 PCRE2_UCHAR *code = *codeptr;
8008 PCRE2_UCHAR *last_branch = code;
8009 PCRE2_UCHAR *start_bracket = code;
8010 BOOL lookbehind;
8011 open_capitem capitem;
8012 int capnumber = 0;
8013 int okreturn = 1;
8014 uint32_t *pptr = *pptrptr;
8015 uint32_t firstcu, reqcu;
8016 uint32_t lookbehindlength;
8017 int32_t firstcuflags, reqcuflags;
8018 uint32_t branchfirstcu, branchreqcu;
8019 int32_t branchfirstcuflags, branchreqcuflags;
8020 PCRE2_SIZE length;
8021 branch_chain bc;
8022 
8023 /* If set, call the external function that checks for stack availability. */
8024 
8025 if (cb->cx->stack_guard != NULL &&
8026     cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
8027   {
8028   *errorcodeptr= ERR33;
8029   return 0;
8030   }
8031 
8032 /* Miscellaneous initialization */
8033 
8034 bc.outer = bcptr;
8035 bc.current_branch = code;
8036 
8037 firstcu = reqcu = 0;
8038 firstcuflags = reqcuflags = REQ_UNSET;
8039 
8040 /* Accumulate the length for use in the pre-compile phase. Start with the
8041 length of the BRA and KET and any extra code units that are required at the
8042 beginning. We accumulate in a local variable to save frequent testing of
8043 lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
8044 start and end of each alternative, because compiled items are discarded during
8045 the pre-compile phase so that the workspace is not exceeded. */
8046 
8047 length = 2 + 2*LINK_SIZE + skipunits;
8048 
8049 /* Remember if this is a lookbehind assertion, and if it is, save its length
8050 and skip over the pattern offset. */
8051 
8052 lookbehind = *code == OP_ASSERTBACK ||
8053              *code == OP_ASSERTBACK_NOT ||
8054              *code == OP_ASSERTBACK_NA;
8055 
8056 if (lookbehind)
8057   {
8058   lookbehindlength = META_DATA(pptr[-1]);
8059   pptr += SIZEOFFSET;
8060   }
8061 else lookbehindlength = 0;
8062 
8063 /* If this is a capturing subpattern, add to the chain of open capturing items
8064 so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
8065 need be tested here; changing this opcode to one of its variants, e.g.
8066 OP_SCBRAPOS, happens later, after the group has been compiled. */
8067 
8068 if (*code == OP_CBRA)
8069   {
8070   capnumber = GET2(code, 1 + LINK_SIZE);
8071   capitem.number = capnumber;
8072   capitem.next = cb->open_caps;
8073   capitem.assert_depth = cb->assert_depth;
8074   cb->open_caps = &capitem;
8075   }
8076 
8077 /* Offset is set zero to mark that this bracket is still open */
8078 
8079 PUT(code, 1, 0);
8080 code += 1 + LINK_SIZE + skipunits;
8081 
8082 /* Loop for each alternative branch */
8083 
8084 for (;;)
8085   {
8086   int branch_return;
8087 
8088   /* Insert OP_REVERSE if this is a lookbehind assertion. */
8089 
8090   if (lookbehind && lookbehindlength > 0)
8091     {
8092     *code++ = OP_REVERSE;
8093     PUTINC(code, 0, lookbehindlength);
8094     length += 1 + LINK_SIZE;
8095     }
8096 
8097   /* Now compile the branch; in the pre-compile phase its length gets added
8098   into the length. */
8099 
8100   if ((branch_return =
8101         compile_branch(&options, &code, &pptr, errorcodeptr, &branchfirstcu,
8102           &branchfirstcuflags, &branchreqcu, &branchreqcuflags, &bc,
8103           cb, (lengthptr == NULL)? NULL : &length)) == 0)
8104     return 0;
8105 
8106   /* If a branch can match an empty string, so can the whole group. */
8107 
8108   if (branch_return < 0) okreturn = -1;
8109 
8110   /* In the real compile phase, there is some post-processing to be done. */
8111 
8112   if (lengthptr == NULL)
8113     {
8114     /* If this is the first branch, the firstcu and reqcu values for the
8115     branch become the values for the regex. */
8116 
8117     if (*last_branch != OP_ALT)
8118       {
8119       firstcu = branchfirstcu;
8120       firstcuflags = branchfirstcuflags;
8121       reqcu = branchreqcu;
8122       reqcuflags = branchreqcuflags;
8123       }
8124 
8125     /* If this is not the first branch, the first char and reqcu have to
8126     match the values from all the previous branches, except that if the
8127     previous value for reqcu didn't have REQ_VARY set, it can still match,
8128     and we set REQ_VARY for the group from this branch's value. */
8129 
8130     else
8131       {
8132       /* If we previously had a firstcu, but it doesn't match the new branch,
8133       we have to abandon the firstcu for the regex, but if there was
8134       previously no reqcu, it takes on the value of the old firstcu. */
8135 
8136       if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
8137         {
8138         if (firstcuflags >= 0)
8139           {
8140           if (reqcuflags < 0)
8141             {
8142             reqcu = firstcu;
8143             reqcuflags = firstcuflags;
8144             }
8145           }
8146         firstcuflags = REQ_NONE;
8147         }
8148 
8149       /* If we (now or from before) have no firstcu, a firstcu from the
8150       branch becomes a reqcu if there isn't a branch reqcu. */
8151 
8152       if (firstcuflags < 0 && branchfirstcuflags >= 0 &&
8153           branchreqcuflags < 0)
8154         {
8155         branchreqcu = branchfirstcu;
8156         branchreqcuflags = branchfirstcuflags;
8157         }
8158 
8159       /* Now ensure that the reqcus match */
8160 
8161       if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
8162           reqcu != branchreqcu)
8163         reqcuflags = REQ_NONE;
8164       else
8165         {
8166         reqcu = branchreqcu;
8167         reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY if present */
8168         }
8169       }
8170     }
8171 
8172   /* Handle reaching the end of the expression, either ')' or end of pattern.
8173   In the real compile phase, go back through the alternative branches and
8174   reverse the chain of offsets, with the field in the BRA item now becoming an
8175   offset to the first alternative. If there are no alternatives, it points to
8176   the end of the group. The length in the terminating ket is always the length
8177   of the whole bracketed item. Return leaving the pointer at the terminating
8178   char. */
8179 
8180   if (META_CODE(*pptr) != META_ALT)
8181     {
8182     if (lengthptr == NULL)
8183       {
8184       PCRE2_SIZE branch_length = code - last_branch;
8185       do
8186         {
8187         PCRE2_SIZE prev_length = GET(last_branch, 1);
8188         PUT(last_branch, 1, branch_length);
8189         branch_length = prev_length;
8190         last_branch -= branch_length;
8191         }
8192       while (branch_length > 0);
8193       }
8194 
8195     /* Fill in the ket */
8196 
8197     *code = OP_KET;
8198     PUT(code, 1, (int)(code - start_bracket));
8199     code += 1 + LINK_SIZE;
8200 
8201     /* If it was a capturing subpattern, remove the block from the chain. */
8202 
8203     if (capnumber > 0) cb->open_caps = cb->open_caps->next;
8204 
8205     /* Set values to pass back */
8206 
8207     *codeptr = code;
8208     *pptrptr = pptr;
8209     *firstcuptr = firstcu;
8210     *firstcuflagsptr = firstcuflags;
8211     *reqcuptr = reqcu;
8212     *reqcuflagsptr = reqcuflags;
8213     if (lengthptr != NULL)
8214       {
8215       if (OFLOW_MAX - *lengthptr < length)
8216         {
8217         *errorcodeptr = ERR20;
8218         return 0;
8219         }
8220       *lengthptr += length;
8221       }
8222     return okreturn;
8223     }
8224 
8225   /* Another branch follows. In the pre-compile phase, we can move the code
8226   pointer back to where it was for the start of the first branch. (That is,
8227   pretend that each branch is the only one.)
8228 
8229   In the real compile phase, insert an ALT node. Its length field points back
8230   to the previous branch while the bracket remains open. At the end the chain
8231   is reversed. It's done like this so that the start of the bracket has a
8232   zero offset until it is closed, making it possible to detect recursion. */
8233 
8234   if (lengthptr != NULL)
8235     {
8236     code = *codeptr + 1 + LINK_SIZE + skipunits;
8237     length += 1 + LINK_SIZE;
8238     }
8239   else
8240     {
8241     *code = OP_ALT;
8242     PUT(code, 1, (int)(code - last_branch));
8243     bc.current_branch = last_branch = code;
8244     code += 1 + LINK_SIZE;
8245     }
8246 
8247   /* Set the lookbehind length (if not in a lookbehind the value will be zero)
8248   and then advance past the vertical bar. */
8249 
8250   lookbehindlength = META_DATA(*pptr);
8251   pptr++;
8252   }
8253 /* Control never reaches here */
8254 }
8255 
8256 
8257 
8258 /*************************************************
8259 *          Check for anchored pattern            *
8260 *************************************************/
8261 
8262 /* Try to find out if this is an anchored regular expression. Consider each
8263 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8264 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8265 it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8266 be found, because ^ generates OP_CIRCM in that mode.
8267 
8268 We can also consider a regex to be anchored if OP_SOM starts all its branches.
8269 This is the code for \G, which means "match at start of match position, taking
8270 into account the match offset".
8271 
8272 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8273 because that will try the rest of the pattern at all possible matching points,
8274 so there is no point trying again.... er ....
8275 
8276 .... except when the .* appears inside capturing parentheses, and there is a
8277 subsequent back reference to those parentheses. We haven't enough information
8278 to catch that case precisely.
8279 
8280 At first, the best we could do was to detect when .* was in capturing brackets
8281 and the highest back reference was greater than or equal to that level.
8282 However, by keeping a bitmap of the first 31 back references, we can catch some
8283 of the more common cases more precisely.
8284 
8285 ... A second exception is when the .* appears inside an atomic group, because
8286 this prevents the number of characters it matches from being adjusted.
8287 
8288 Arguments:
8289   code           points to start of the compiled pattern
8290   bracket_map    a bitmap of which brackets we are inside while testing; this
8291                    handles up to substring 31; after that we just have to take
8292                    the less precise approach
8293   cb             points to the compile data block
8294   atomcount      atomic group level
8295   inassert       TRUE if in an assertion
8296 
8297 Returns:     TRUE or FALSE
8298 */
8299 
8300 static BOOL
is_anchored(PCRE2_SPTR code,unsigned int bracket_map,compile_block * cb,int atomcount,BOOL inassert)8301 is_anchored(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8302   int atomcount, BOOL inassert)
8303 {
8304 do {
8305    PCRE2_SPTR scode = first_significant_code(
8306      code + PRIV(OP_lengths)[*code], FALSE);
8307    int op = *scode;
8308 
8309    /* Non-capturing brackets */
8310 
8311    if (op == OP_BRA  || op == OP_BRAPOS ||
8312        op == OP_SBRA || op == OP_SBRAPOS)
8313      {
8314      if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8315        return FALSE;
8316      }
8317 
8318    /* Capturing brackets */
8319 
8320    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8321             op == OP_SCBRA || op == OP_SCBRAPOS)
8322      {
8323      int n = GET2(scode, 1+LINK_SIZE);
8324      int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8325      if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE;
8326      }
8327 
8328    /* Positive forward assertion */
8329 
8330    else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8331      {
8332      if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8333      }
8334 
8335    /* Condition. If there is no second branch, it can't be anchored. */
8336 
8337    else if (op == OP_COND || op == OP_SCOND)
8338      {
8339      if (scode[GET(scode,1)] != OP_ALT) return FALSE;
8340      if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8341        return FALSE;
8342      }
8343 
8344    /* Atomic groups */
8345 
8346    else if (op == OP_ONCE)
8347      {
8348      if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert))
8349        return FALSE;
8350      }
8351 
8352    /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8353    it isn't in brackets that are or may be referenced or inside an atomic
8354    group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
8355    because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
8356    with the subject "aab", which matches "b", i.e. not at the start of a line.
8357    There is also an option that disables auto-anchoring. */
8358 
8359    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8360              op == OP_TYPEPOSSTAR))
8361      {
8362      if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
8363          atomcount > 0 || cb->had_pruneorskip || inassert ||
8364          (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8365        return FALSE;
8366      }
8367 
8368    /* Check for explicit anchoring */
8369 
8370    else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8371 
8372    code += GET(code, 1);
8373    }
8374 while (*code == OP_ALT);   /* Loop for each alternative */
8375 return TRUE;
8376 }
8377 
8378 
8379 
8380 /*************************************************
8381 *         Check for starting with ^ or .*        *
8382 *************************************************/
8383 
8384 /* This is called to find out if every branch starts with ^ or .* so that
8385 "first char" processing can be done to speed things up in multiline
8386 matching and for non-DOTALL patterns that start with .* (which must start at
8387 the beginning or after \n). As in the case of is_anchored() (see above), we
8388 have to take account of back references to capturing brackets that contain .*
8389 because in that case we can't make the assumption. Also, the appearance of .*
8390 inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
8391 or *SKIP does not count, because once again the assumption no longer holds.
8392 
8393 Arguments:
8394   code           points to start of the compiled pattern or a group
8395   bracket_map    a bitmap of which brackets we are inside while testing; this
8396                    handles up to substring 31; after that we just have to take
8397                    the less precise approach
8398   cb             points to the compile data
8399   atomcount      atomic group level
8400   inassert       TRUE if in an assertion
8401 
8402 Returns:         TRUE or FALSE
8403 */
8404 
8405 static BOOL
is_startline(PCRE2_SPTR code,unsigned int bracket_map,compile_block * cb,int atomcount,BOOL inassert)8406 is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8407   int atomcount, BOOL inassert)
8408 {
8409 do {
8410    PCRE2_SPTR scode = first_significant_code(
8411      code + PRIV(OP_lengths)[*code], FALSE);
8412    int op = *scode;
8413 
8414    /* If we are at the start of a conditional assertion group, *both* the
8415    conditional assertion *and* what follows the condition must satisfy the test
8416    for start of line. Other kinds of condition fail. Note that there may be an
8417    auto-callout at the start of a condition. */
8418 
8419    if (op == OP_COND)
8420      {
8421      scode += 1 + LINK_SIZE;
8422 
8423      if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8424        else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
8425 
8426      switch (*scode)
8427        {
8428        case OP_CREF:
8429        case OP_DNCREF:
8430        case OP_RREF:
8431        case OP_DNRREF:
8432        case OP_FAIL:
8433        case OP_FALSE:
8434        case OP_TRUE:
8435        return FALSE;
8436 
8437        default:     /* Assertion */
8438        if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8439        do scode += GET(scode, 1); while (*scode == OP_ALT);
8440        scode += 1 + LINK_SIZE;
8441        break;
8442        }
8443      scode = first_significant_code(scode, FALSE);
8444      op = *scode;
8445      }
8446 
8447    /* Non-capturing brackets */
8448 
8449    if (op == OP_BRA  || op == OP_BRAPOS ||
8450        op == OP_SBRA || op == OP_SBRAPOS)
8451      {
8452      if (!is_startline(scode, bracket_map, cb, atomcount, inassert))
8453        return FALSE;
8454      }
8455 
8456    /* Capturing brackets */
8457 
8458    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8459             op == OP_SCBRA || op == OP_SCBRAPOS)
8460      {
8461      int n = GET2(scode, 1+LINK_SIZE);
8462      int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8463      if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
8464      }
8465 
8466    /* Positive forward assertions */
8467 
8468    else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8469      {
8470      if (!is_startline(scode, bracket_map, cb, atomcount, TRUE))
8471        return FALSE;
8472      }
8473 
8474    /* Atomic brackets */
8475 
8476    else if (op == OP_ONCE)
8477      {
8478      if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert))
8479        return FALSE;
8480      }
8481 
8482    /* .* means "start at start or after \n" if it isn't in atomic brackets or
8483    brackets that may be referenced or an assertion, and as long as the pattern
8484    does not contain *PRUNE or *SKIP, because these break the feature. Consider,
8485    for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
8486    i.e. not at the start of a line. There is also an option that disables this
8487    optimization. */
8488 
8489    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8490      {
8491      if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
8492          atomcount > 0 || cb->had_pruneorskip || inassert ||
8493          (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8494        return FALSE;
8495      }
8496 
8497    /* Check for explicit circumflex; anything else gives a FALSE result. Note
8498    in particular that this includes atomic brackets OP_ONCE because the number
8499    of characters matched by .* cannot be adjusted inside them. */
8500 
8501    else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8502 
8503    /* Move on to the next alternative */
8504 
8505    code += GET(code, 1);
8506    }
8507 while (*code == OP_ALT);  /* Loop for each alternative */
8508 return TRUE;
8509 }
8510 
8511 
8512 
8513 /*************************************************
8514 *   Scan compiled regex for recursion reference  *
8515 *************************************************/
8516 
8517 /* This function scans through a compiled pattern until it finds an instance of
8518 OP_RECURSE.
8519 
8520 Arguments:
8521   code        points to start of expression
8522   utf         TRUE in UTF mode
8523 
8524 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
8525 */
8526 
8527 static PCRE2_SPTR
find_recurse(PCRE2_SPTR code,BOOL utf)8528 find_recurse(PCRE2_SPTR code, BOOL utf)
8529 {
8530 for (;;)
8531   {
8532   PCRE2_UCHAR c = *code;
8533   if (c == OP_END) return NULL;
8534   if (c == OP_RECURSE) return code;
8535 
8536   /* XCLASS is used for classes that cannot be represented just by a bit map.
8537   This includes negated single high-valued characters. CALLOUT_STR is used for
8538   callouts with string arguments. In both cases the length in the table is
8539   zero; the actual length is stored in the compiled code. */
8540 
8541   if (c == OP_XCLASS) code += GET(code, 1);
8542     else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
8543 
8544   /* Otherwise, we can get the item's length from the table, except that for
8545   repeated character types, we have to test for \p and \P, which have an extra
8546   two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,
8547   we must add in its length. */
8548 
8549   else
8550     {
8551     switch(c)
8552       {
8553       case OP_TYPESTAR:
8554       case OP_TYPEMINSTAR:
8555       case OP_TYPEPLUS:
8556       case OP_TYPEMINPLUS:
8557       case OP_TYPEQUERY:
8558       case OP_TYPEMINQUERY:
8559       case OP_TYPEPOSSTAR:
8560       case OP_TYPEPOSPLUS:
8561       case OP_TYPEPOSQUERY:
8562       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
8563       break;
8564 
8565       case OP_TYPEPOSUPTO:
8566       case OP_TYPEUPTO:
8567       case OP_TYPEMINUPTO:
8568       case OP_TYPEEXACT:
8569       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
8570         code += 2;
8571       break;
8572 
8573       case OP_MARK:
8574       case OP_COMMIT_ARG:
8575       case OP_PRUNE_ARG:
8576       case OP_SKIP_ARG:
8577       case OP_THEN_ARG:
8578       code += code[1];
8579       break;
8580       }
8581 
8582     /* Add in the fixed length from the table */
8583 
8584     code += PRIV(OP_lengths)[c];
8585 
8586     /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
8587     be followed by a multi-unit character. The length in the table is a
8588     minimum, so we have to arrange to skip the extra units. */
8589 
8590 #ifdef MAYBE_UTF_MULTI
8591     if (utf) switch(c)
8592       {
8593       case OP_CHAR:
8594       case OP_CHARI:
8595       case OP_NOT:
8596       case OP_NOTI:
8597       case OP_EXACT:
8598       case OP_EXACTI:
8599       case OP_NOTEXACT:
8600       case OP_NOTEXACTI:
8601       case OP_UPTO:
8602       case OP_UPTOI:
8603       case OP_NOTUPTO:
8604       case OP_NOTUPTOI:
8605       case OP_MINUPTO:
8606       case OP_MINUPTOI:
8607       case OP_NOTMINUPTO:
8608       case OP_NOTMINUPTOI:
8609       case OP_POSUPTO:
8610       case OP_POSUPTOI:
8611       case OP_NOTPOSUPTO:
8612       case OP_NOTPOSUPTOI:
8613       case OP_STAR:
8614       case OP_STARI:
8615       case OP_NOTSTAR:
8616       case OP_NOTSTARI:
8617       case OP_MINSTAR:
8618       case OP_MINSTARI:
8619       case OP_NOTMINSTAR:
8620       case OP_NOTMINSTARI:
8621       case OP_POSSTAR:
8622       case OP_POSSTARI:
8623       case OP_NOTPOSSTAR:
8624       case OP_NOTPOSSTARI:
8625       case OP_PLUS:
8626       case OP_PLUSI:
8627       case OP_NOTPLUS:
8628       case OP_NOTPLUSI:
8629       case OP_MINPLUS:
8630       case OP_MINPLUSI:
8631       case OP_NOTMINPLUS:
8632       case OP_NOTMINPLUSI:
8633       case OP_POSPLUS:
8634       case OP_POSPLUSI:
8635       case OP_NOTPOSPLUS:
8636       case OP_NOTPOSPLUSI:
8637       case OP_QUERY:
8638       case OP_QUERYI:
8639       case OP_NOTQUERY:
8640       case OP_NOTQUERYI:
8641       case OP_MINQUERY:
8642       case OP_MINQUERYI:
8643       case OP_NOTMINQUERY:
8644       case OP_NOTMINQUERYI:
8645       case OP_POSQUERY:
8646       case OP_POSQUERYI:
8647       case OP_NOTPOSQUERY:
8648       case OP_NOTPOSQUERYI:
8649       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
8650       break;
8651       }
8652 #else
8653     (void)(utf);  /* Keep compiler happy by referencing function argument */
8654 #endif  /* MAYBE_UTF_MULTI */
8655     }
8656   }
8657 }
8658 
8659 
8660 
8661 /*************************************************
8662 *    Check for asserted fixed first code unit    *
8663 *************************************************/
8664 
8665 /* During compilation, the "first code unit" settings from forward assertions
8666 are discarded, because they can cause conflicts with actual literals that
8667 follow. However, if we end up without a first code unit setting for an
8668 unanchored pattern, it is worth scanning the regex to see if there is an
8669 initial asserted first code unit. If all branches start with the same asserted
8670 code unit, or with a non-conditional bracket all of whose alternatives start
8671 with the same asserted code unit (recurse ad lib), then we return that code
8672 unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
8673 REQ_NONE in the flags.
8674 
8675 Arguments:
8676   code       points to start of compiled pattern
8677   flags      points to the first code unit flags
8678   inassert   non-zero if in an assertion
8679 
8680 Returns:     the fixed first code unit, or 0 with REQ_NONE in flags
8681 */
8682 
8683 static uint32_t
find_firstassertedcu(PCRE2_SPTR code,int32_t * flags,uint32_t inassert)8684 find_firstassertedcu(PCRE2_SPTR code, int32_t *flags, uint32_t inassert)
8685 {
8686 uint32_t c = 0;
8687 int cflags = REQ_NONE;
8688 
8689 *flags = REQ_NONE;
8690 do {
8691    uint32_t d;
8692    int dflags;
8693    int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
8694              *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
8695    PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
8696    PCRE2_UCHAR op = *scode;
8697 
8698    switch(op)
8699      {
8700      default:
8701      return 0;
8702 
8703      case OP_BRA:
8704      case OP_BRAPOS:
8705      case OP_CBRA:
8706      case OP_SCBRA:
8707      case OP_CBRAPOS:
8708      case OP_SCBRAPOS:
8709      case OP_ASSERT:
8710      case OP_ASSERT_NA:
8711      case OP_ONCE:
8712      case OP_SCRIPT_RUN:
8713      d = find_firstassertedcu(scode, &dflags, inassert +
8714        ((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0));
8715      if (dflags < 0)
8716        return 0;
8717      if (cflags < 0) { c = d; cflags = dflags; }
8718        else if (c != d || cflags != dflags) return 0;
8719      break;
8720 
8721      case OP_EXACT:
8722      scode += IMM2_SIZE;
8723      /* Fall through */
8724 
8725      case OP_CHAR:
8726      case OP_PLUS:
8727      case OP_MINPLUS:
8728      case OP_POSPLUS:
8729      if (inassert == 0) return 0;
8730      if (cflags < 0) { c = scode[1]; cflags = 0; }
8731        else if (c != scode[1]) return 0;
8732      break;
8733 
8734      case OP_EXACTI:
8735      scode += IMM2_SIZE;
8736      /* Fall through */
8737 
8738      case OP_CHARI:
8739      case OP_PLUSI:
8740      case OP_MINPLUSI:
8741      case OP_POSPLUSI:
8742      if (inassert == 0) return 0;
8743 
8744      /* If the character is more than one code unit long, we cannot set its
8745      first code unit when matching caselessly. Later scanning may pick up
8746      multiple code units. */
8747 
8748 #ifdef SUPPORT_UNICODE
8749 #if PCRE2_CODE_UNIT_WIDTH == 8
8750      if (scode[1] >= 0x80) return 0;
8751 #elif PCRE2_CODE_UNIT_WIDTH == 16
8752      if (scode[1] >= 0xd800 && scode[1] <= 0xdfff) return 0;
8753 #endif
8754 #endif
8755 
8756      if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
8757        else if (c != scode[1]) return 0;
8758      break;
8759      }
8760 
8761    code += GET(code, 1);
8762    }
8763 while (*code == OP_ALT);
8764 
8765 *flags = cflags;
8766 return c;
8767 }
8768 
8769 
8770 
8771 /*************************************************
8772 *     Add an entry to the name/number table      *
8773 *************************************************/
8774 
8775 /* This function is called between compiling passes to add an entry to the
8776 name/number table, maintaining alphabetical order. Checking for permitted
8777 and forbidden duplicates has already been done.
8778 
8779 Arguments:
8780   cb           the compile data block
8781   name         the name to add
8782   length       the length of the name
8783   groupno      the group number
8784   tablecount   the count of names in the table so far
8785 
8786 Returns:       nothing
8787 */
8788 
8789 static void
add_name_to_table(compile_block * cb,PCRE2_SPTR name,int length,unsigned int groupno,uint32_t tablecount)8790 add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length,
8791   unsigned int groupno, uint32_t tablecount)
8792 {
8793 uint32_t i;
8794 PCRE2_UCHAR *slot = cb->name_table;
8795 
8796 for (i = 0; i < tablecount; i++)
8797   {
8798   int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length));
8799   if (crc == 0 && slot[IMM2_SIZE+length] != 0)
8800     crc = -1; /* Current name is a substring */
8801 
8802   /* Make space in the table and break the loop for an earlier name. For a
8803   duplicate or later name, carry on. We do this for duplicates so that in the
8804   simple case (when ?(| is not used) they are in order of their numbers. In all
8805   cases they are in the order in which they appear in the pattern. */
8806 
8807   if (crc < 0)
8808     {
8809     (void)memmove(slot + cb->name_entry_size, slot,
8810       CU2BYTES((tablecount - i) * cb->name_entry_size));
8811     break;
8812     }
8813 
8814   /* Continue the loop for a later or duplicate name */
8815 
8816   slot += cb->name_entry_size;
8817   }
8818 
8819 PUT2(slot, 0, groupno);
8820 memcpy(slot + IMM2_SIZE, name, CU2BYTES(length));
8821 
8822 /* Add a terminating zero and fill the rest of the slot with zeroes so that
8823 the memory is all initialized. Otherwise valgrind moans about uninitialized
8824 memory when saving serialized compiled patterns. */
8825 
8826 memset(slot + IMM2_SIZE + length, 0,
8827   CU2BYTES(cb->name_entry_size - length - IMM2_SIZE));
8828 }
8829 
8830 
8831 
8832 /*************************************************
8833 *             Skip in parsed pattern             *
8834 *************************************************/
8835 
8836 /* This function is called to skip parts of the parsed pattern when finding the
8837 length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
8838 the end of the branch, it is called to skip over an internal lookaround or
8839 (DEFINE) group, and it is also called to skip to the end of a class, during
8840 which it will never encounter nested groups (but there's no need to have
8841 special code for that).
8842 
8843 When called to find the end of a branch or group, pptr must point to the first
8844 meta code inside the branch, not the branch-starting code. In other cases it
8845 can point to the item that causes the function to be called.
8846 
8847 Arguments:
8848   pptr       current pointer to skip from
8849   skiptype   PSKIP_CLASS when skipping to end of class
8850              PSKIP_ALT when META_ALT ends the skip
8851              PSKIP_KET when only META_KET ends the skip
8852 
8853 Returns:     new value of pptr
8854              NULL if META_END is reached - should never occur
8855                or for an unknown meta value - likewise
8856 */
8857 
8858 static uint32_t *
parsed_skip(uint32_t * pptr,uint32_t skiptype)8859 parsed_skip(uint32_t *pptr, uint32_t skiptype)
8860 {
8861 uint32_t nestlevel = 0;
8862 
8863 for (;; pptr++)
8864   {
8865   uint32_t meta = META_CODE(*pptr);
8866 
8867   switch(meta)
8868     {
8869     default:  /* Just skip over most items */
8870     if (meta < META_END) continue;  /* Literal */
8871     break;
8872 
8873     /* This should never occur. */
8874 
8875     case META_END:
8876     return NULL;
8877 
8878     /* The data for these items is variable in length. */
8879 
8880     case META_BACKREF:  /* Offset is present only if group >= 10 */
8881     if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;
8882     break;
8883 
8884     case META_ESCAPE:   /* A few escapes are followed by data items. */
8885     switch (META_DATA(*pptr))
8886       {
8887       case ESC_P:
8888       case ESC_p:
8889       pptr += 1;
8890       break;
8891 
8892       case ESC_g:
8893       case ESC_k:
8894       pptr += 1 + SIZEOFFSET;
8895       break;
8896       }
8897     break;
8898 
8899     case META_MARK:     /* Add the length of the name. */
8900     case META_COMMIT_ARG:
8901     case META_PRUNE_ARG:
8902     case META_SKIP_ARG:
8903     case META_THEN_ARG:
8904     pptr += pptr[1];
8905     break;
8906 
8907     /* These are the "active" items in this loop. */
8908 
8909     case META_CLASS_END:
8910     if (skiptype == PSKIP_CLASS) return pptr;
8911     break;
8912 
8913     case META_ATOMIC:
8914     case META_CAPTURE:
8915     case META_COND_ASSERT:
8916     case META_COND_DEFINE:
8917     case META_COND_NAME:
8918     case META_COND_NUMBER:
8919     case META_COND_RNAME:
8920     case META_COND_RNUMBER:
8921     case META_COND_VERSION:
8922     case META_LOOKAHEAD:
8923     case META_LOOKAHEADNOT:
8924     case META_LOOKAHEAD_NA:
8925     case META_LOOKBEHIND:
8926     case META_LOOKBEHINDNOT:
8927     case META_LOOKBEHIND_NA:
8928     case META_NOCAPTURE:
8929     case META_SCRIPT_RUN:
8930     nestlevel++;
8931     break;
8932 
8933     case META_ALT:
8934     if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;
8935     break;
8936 
8937     case META_KET:
8938     if (nestlevel == 0) return pptr;
8939     nestlevel--;
8940     break;
8941     }
8942 
8943   /* The extra data item length for each meta is in a table. */
8944 
8945   meta = (meta >> 16) & 0x7fff;
8946   if (meta >= sizeof(meta_extra_lengths)) return NULL;
8947   pptr += meta_extra_lengths[meta];
8948   }
8949 /* Control never reaches here */
8950 return pptr;
8951 }
8952 
8953 
8954 
8955 /*************************************************
8956 *       Find length of a parsed group            *
8957 *************************************************/
8958 
8959 /* This is called for nested groups within a branch of a lookbehind whose
8960 length is being computed. If all the branches in the nested group have the same
8961 length, that is OK. On entry, the pointer must be at the first element after
8962 the group initializing code. On exit it points to OP_KET. Caching is used to
8963 improve processing speed when the same capturing group occurs many times.
8964 
8965 Arguments:
8966   pptrptr     pointer to pointer in the parsed pattern
8967   isinline    FALSE if a reference or recursion; TRUE for inline group
8968   errcodeptr  pointer to the errorcode
8969   lcptr       pointer to the loop counter
8970   group       number of captured group or -1 for a non-capturing group
8971   recurses    chain of recurse_check to catch mutual recursion
8972   cb          pointer to the compile data
8973 
8974 Returns:      the group length or a negative number
8975 */
8976 
8977 static int
get_grouplength(uint32_t ** pptrptr,BOOL isinline,int * errcodeptr,int * lcptr,int group,parsed_recurse_check * recurses,compile_block * cb)8978 get_grouplength(uint32_t **pptrptr, BOOL isinline, int *errcodeptr, int *lcptr,
8979    int group, parsed_recurse_check *recurses, compile_block *cb)
8980 {
8981 int branchlength;
8982 int grouplength = -1;
8983 
8984 /* The cache can be used only if there is no possibility of there being two
8985 groups with the same number. We do not need to set the end pointer for a group
8986 that is being processed as a back reference or recursion, but we must do so for
8987 an inline group. */
8988 
8989 if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
8990   {
8991   uint32_t groupinfo = cb->groupinfo[group];
8992   if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1;
8993   if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
8994     {
8995     if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
8996     return groupinfo & GI_FIXED_LENGTH_MASK;
8997     }
8998   }
8999 
9000 /* Scan the group. In this case we find the end pointer of necessity. */
9001 
9002 for(;;)
9003   {
9004   branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
9005   if (branchlength < 0) goto ISNOTFIXED;
9006   if (grouplength == -1) grouplength = branchlength;
9007     else if (grouplength != branchlength) goto ISNOTFIXED;
9008   if (**pptrptr == META_KET) break;
9009   *pptrptr += 1;   /* Skip META_ALT */
9010   }
9011 
9012 if (group > 0)
9013   cb->groupinfo[group] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
9014 return grouplength;
9015 
9016 ISNOTFIXED:
9017 if (group > 0) cb->groupinfo[group] |= GI_NOT_FIXED_LENGTH;
9018 return -1;
9019 }
9020 
9021 
9022 
9023 /*************************************************
9024 *        Find length of a parsed branch          *
9025 *************************************************/
9026 
9027 /* Return a fixed length for a branch in a lookbehind, giving an error if the
9028 length is not fixed. On entry, *pptrptr points to the first element inside the
9029 branch. On exit it is set to point to the ALT or KET.
9030 
9031 Arguments:
9032   pptrptr     pointer to pointer in the parsed pattern
9033   errcodeptr  pointer to error code
9034   lcptr       pointer to loop counter
9035   recurses    chain of recurse_check to catch mutual recursion
9036   cb          pointer to compile block
9037 
9038 Returns:      the length, or a negative value on error
9039 */
9040 
9041 static int
get_branchlength(uint32_t ** pptrptr,int * errcodeptr,int * lcptr,parsed_recurse_check * recurses,compile_block * cb)9042 get_branchlength(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
9043   parsed_recurse_check *recurses, compile_block *cb)
9044 {
9045 int branchlength = 0;
9046 int grouplength;
9047 uint32_t lastitemlength = 0;
9048 uint32_t *pptr = *pptrptr;
9049 PCRE2_SIZE offset;
9050 parsed_recurse_check this_recurse;
9051 
9052 /* A large and/or complex regex can take too long to process. This can happen
9053 more often when (?| groups are present in the pattern because their length
9054 cannot be cached. */
9055 
9056 if ((*lcptr)++ > 2000)
9057   {
9058   *errcodeptr = ERR35;  /* Lookbehind is too complicated */
9059   return -1;
9060   }
9061 
9062 /* Scan the branch, accumulating the length. */
9063 
9064 for (;; pptr++)
9065   {
9066   parsed_recurse_check *r;
9067   uint32_t *gptr, *gptrend;
9068   uint32_t escape;
9069   uint32_t group = 0;
9070   uint32_t itemlength = 0;
9071 
9072   if (*pptr < META_END)
9073     {
9074     itemlength = 1;
9075     }
9076 
9077   else switch (META_CODE(*pptr))
9078     {
9079     case META_KET:
9080     case META_ALT:
9081     goto EXIT;
9082 
9083     /* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the
9084     actual termination. */
9085 
9086     case META_ACCEPT:
9087     case META_FAIL:
9088     pptr = parsed_skip(pptr, PSKIP_ALT);
9089     if (pptr == NULL) goto PARSED_SKIP_FAILED;
9090     goto EXIT;
9091 
9092     case META_MARK:
9093     case META_COMMIT_ARG:
9094     case META_PRUNE_ARG:
9095     case META_SKIP_ARG:
9096     case META_THEN_ARG:
9097     pptr += pptr[1] + 1;
9098     break;
9099 
9100     case META_CIRCUMFLEX:
9101     case META_COMMIT:
9102     case META_DOLLAR:
9103     case META_PRUNE:
9104     case META_SKIP:
9105     case META_THEN:
9106     break;
9107 
9108     case META_OPTIONS:
9109     pptr += 1;
9110     break;
9111 
9112     case META_BIGVALUE:
9113     itemlength = 1;
9114     pptr += 1;
9115     break;
9116 
9117     case META_CLASS:
9118     case META_CLASS_NOT:
9119     itemlength = 1;
9120     pptr = parsed_skip(pptr, PSKIP_CLASS);
9121     if (pptr == NULL) goto PARSED_SKIP_FAILED;
9122     break;
9123 
9124     case META_CLASS_EMPTY_NOT:
9125     case META_DOT:
9126     itemlength = 1;
9127     break;
9128 
9129     case META_CALLOUT_NUMBER:
9130     pptr += 3;
9131     break;
9132 
9133     case META_CALLOUT_STRING:
9134     pptr += 3 + SIZEOFFSET;
9135     break;
9136 
9137     /* Only some escapes consume a character. Of those, \R and \X are never
9138     allowed because they might match more than character. \C is allowed only in
9139     32-bit and non-UTF 8/16-bit modes. */
9140 
9141     case META_ESCAPE:
9142     escape = META_DATA(*pptr);
9143     if (escape == ESC_R || escape == ESC_X) return -1;
9144     if (escape > ESC_b && escape < ESC_Z)
9145       {
9146 #if PCRE2_CODE_UNIT_WIDTH != 32
9147       if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C)
9148         {
9149         *errcodeptr = ERR36;
9150         return -1;
9151         }
9152 #endif
9153       itemlength = 1;
9154       if (escape == ESC_p || escape == ESC_P) pptr++;  /* Skip prop data */
9155       }
9156     break;
9157 
9158     /* Lookaheads do not contribute to the length of this branch, but they may
9159     contain lookbehinds within them whose lengths need to be set. */
9160 
9161     case META_LOOKAHEAD:
9162     case META_LOOKAHEADNOT:
9163     case META_LOOKAHEAD_NA:
9164     *errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb, lcptr);
9165     if (*errcodeptr != 0) return -1;
9166 
9167     /* Ignore any qualifiers that follow a lookahead assertion. */
9168 
9169     switch (pptr[1])
9170       {
9171       case META_ASTERISK:
9172       case META_ASTERISK_PLUS:
9173       case META_ASTERISK_QUERY:
9174       case META_PLUS:
9175       case META_PLUS_PLUS:
9176       case META_PLUS_QUERY:
9177       case META_QUERY:
9178       case META_QUERY_PLUS:
9179       case META_QUERY_QUERY:
9180       pptr++;
9181       break;
9182 
9183       case META_MINMAX:
9184       case META_MINMAX_PLUS:
9185       case META_MINMAX_QUERY:
9186       pptr += 3;
9187       break;
9188 
9189       default:
9190       break;
9191       }
9192     break;
9193 
9194     /* A nested lookbehind does not contribute any length to this lookbehind,
9195     but must itself be checked and have its lengths set. */
9196 
9197     case META_LOOKBEHIND:
9198     case META_LOOKBEHINDNOT:
9199     case META_LOOKBEHIND_NA:
9200     if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))
9201       return -1;
9202     break;
9203 
9204     /* Back references and recursions are handled by very similar code. At this
9205     stage, the names generated in the parsing pass are available, but the main
9206     name table has not yet been created. So for the named varieties, scan the
9207     list of names in order to get the number of the first one in the pattern,
9208     and whether or not this name is duplicated. */
9209 
9210     case META_BACKREF_BYNAME:
9211     if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)
9212       goto ISNOTFIXED;
9213     /* Fall through */
9214 
9215     case META_RECURSE_BYNAME:
9216       {
9217       int i;
9218       PCRE2_SPTR name;
9219       BOOL is_dupname = FALSE;
9220       named_group *ng = cb->named_groups;
9221       uint32_t meta_code = META_CODE(*pptr);
9222       uint32_t length = *(++pptr);
9223 
9224       GETPLUSOFFSET(offset, pptr);
9225       name = cb->start_pattern + offset;
9226       for (i = 0; i < cb->names_found; i++, ng++)
9227         {
9228         if (length == ng->length && PRIV(strncmp)(name, ng->name, length) == 0)
9229           {
9230           group = ng->number;
9231           is_dupname = ng->isdup;
9232           break;
9233           }
9234         }
9235 
9236       if (group == 0)
9237         {
9238         *errcodeptr = ERR15;  /* Non-existent subpattern */
9239         cb->erroroffset = offset;
9240         return -1;
9241         }
9242 
9243       /* A numerical back reference can be fixed length if duplicate capturing
9244       groups are not being used. A non-duplicate named back reference can also
9245       be handled. */
9246 
9247       if (meta_code == META_RECURSE_BYNAME ||
9248           (!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0))
9249         goto RECURSE_OR_BACKREF_LENGTH;  /* Handle as a numbered version. */
9250       }
9251     goto ISNOTFIXED;                     /* Duplicate name or number */
9252 
9253     /* The offset values for back references < 10 are in a separate vector
9254     because otherwise they would use more than two parsed pattern elements on
9255     64-bit systems. */
9256 
9257     case META_BACKREF:
9258     if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 ||
9259         (cb->external_flags & PCRE2_DUPCAPUSED) != 0)
9260       goto ISNOTFIXED;
9261     group = META_DATA(*pptr);
9262     if (group < 10)
9263       {
9264       offset = cb->small_ref_offset[group];
9265       goto RECURSE_OR_BACKREF_LENGTH;
9266       }
9267 
9268     /* Fall through */
9269     /* For groups >= 10 - picking up group twice does no harm. */
9270 
9271     /* A true recursion implies not fixed length, but a subroutine call may
9272     be OK. Back reference "recursions" are also failed. */
9273 
9274     case META_RECURSE:
9275     group = META_DATA(*pptr);
9276     GETPLUSOFFSET(offset, pptr);
9277 
9278     RECURSE_OR_BACKREF_LENGTH:
9279     if (group > cb->bracount)
9280       {
9281       cb->erroroffset = offset;
9282       *errcodeptr = ERR15;  /* Non-existent subpattern */
9283       return -1;
9284       }
9285     if (group == 0) goto ISNOTFIXED;  /* Local recursion */
9286     for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)
9287       {
9288       if (META_CODE(*gptr) == META_BIGVALUE) gptr++;
9289         else if (*gptr == (META_CAPTURE | group)) break;
9290       }
9291 
9292     /* We must start the search for the end of the group at the first meta code
9293     inside the group. Otherwise it will be treated as an enclosed group. */
9294 
9295     gptrend = parsed_skip(gptr + 1, PSKIP_KET);
9296     if (gptrend == NULL) goto PARSED_SKIP_FAILED;
9297     if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED;  /* Local recursion */
9298     for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;
9299     if (r != NULL) goto ISNOTFIXED;   /* Mutual recursion */
9300     this_recurse.prev = recurses;
9301     this_recurse.groupptr = gptr;
9302 
9303     /* We do not need to know the position of the end of the group, that is,
9304     gptr is not used after the call to get_grouplength(). Setting the second
9305     argument FALSE stops it scanning for the end when the length can be found
9306     in the cache. */
9307 
9308     gptr++;
9309     grouplength = get_grouplength(&gptr, FALSE, errcodeptr, lcptr, group,
9310       &this_recurse, cb);
9311     if (grouplength < 0)
9312       {
9313       if (*errcodeptr == 0) goto ISNOTFIXED;
9314       return -1;  /* Error already set */
9315       }
9316     itemlength = grouplength;
9317     break;
9318 
9319     /* A (DEFINE) group is never obeyed inline and so it does not contribute to
9320     the length of this branch. Skip from the following item to the next
9321     unpaired ket. */
9322 
9323     case META_COND_DEFINE:
9324     pptr = parsed_skip(pptr + 1, PSKIP_KET);
9325     break;
9326 
9327     /* Check other nested groups - advance past the initial data for each type
9328     and then seek a fixed length with get_grouplength(). */
9329 
9330     case META_COND_NAME:
9331     case META_COND_NUMBER:
9332     case META_COND_RNAME:
9333     case META_COND_RNUMBER:
9334     pptr += 2 + SIZEOFFSET;
9335     goto CHECK_GROUP;
9336 
9337     case META_COND_ASSERT:
9338     pptr += 1;
9339     goto CHECK_GROUP;
9340 
9341     case META_COND_VERSION:
9342     pptr += 4;
9343     goto CHECK_GROUP;
9344 
9345     case META_CAPTURE:
9346     group = META_DATA(*pptr);
9347     /* Fall through */
9348 
9349     case META_ATOMIC:
9350     case META_NOCAPTURE:
9351     case META_SCRIPT_RUN:
9352     pptr++;
9353     CHECK_GROUP:
9354     grouplength = get_grouplength(&pptr, TRUE, errcodeptr, lcptr, group,
9355       recurses, cb);
9356     if (grouplength < 0) return -1;
9357     itemlength = grouplength;
9358     break;
9359 
9360     /* Exact repetition is OK; variable repetition is not. A repetition of zero
9361     must subtract the length that has already been added. */
9362 
9363     case META_MINMAX:
9364     case META_MINMAX_PLUS:
9365     case META_MINMAX_QUERY:
9366     if (pptr[1] == pptr[2])
9367       {
9368       switch(pptr[1])
9369         {
9370         case 0:
9371         branchlength -= lastitemlength;
9372         break;
9373 
9374         case 1:
9375         itemlength = 0;
9376         break;
9377 
9378         default:  /* Check for integer overflow */
9379         if (lastitemlength != 0 &&  /* Should not occur, but just in case */
9380             INT_MAX/lastitemlength < pptr[1] - 1)
9381           {
9382           *errcodeptr = ERR87;  /* Integer overflow; lookbehind too big */
9383           return -1;
9384           }
9385         itemlength = (pptr[1] - 1) * lastitemlength;
9386         break;
9387         }
9388       pptr += 2;
9389       break;
9390       }
9391     /* Fall through */
9392 
9393     /* Any other item means this branch does not have a fixed length. */
9394 
9395     default:
9396     ISNOTFIXED:
9397     *errcodeptr = ERR25;   /* Not fixed length */
9398     return -1;
9399     }
9400 
9401   /* Add the item length to the branchlength, checking for integer overflow and
9402   for the branch length exceeding the limit. */
9403 
9404   if (INT_MAX - branchlength < (int)itemlength ||
9405       (branchlength += itemlength) > LOOKBEHIND_MAX)
9406     {
9407     *errcodeptr = ERR87;
9408     return -1;
9409     }
9410 
9411   /* Save this item length for use if the next item is a quantifier. */
9412 
9413   lastitemlength = itemlength;
9414   }
9415 
9416 EXIT:
9417 *pptrptr = pptr;
9418 return branchlength;
9419 
9420 PARSED_SKIP_FAILED:
9421 *errcodeptr = ERR90;
9422 return -1;
9423 }
9424 
9425 
9426 
9427 /*************************************************
9428 *        Set lengths in a lookbehind             *
9429 *************************************************/
9430 
9431 /* This function is called for each lookbehind, to set the lengths in its
9432 branches. An error occurs if any branch does not have a fixed length that is
9433 less than the maximum (65535). On exit, the pointer must be left on the final
9434 ket.
9435 
9436 The function also maintains the max_lookbehind value. Any lookbehind branch
9437 that contains a nested lookbehind may actually look further back than the
9438 length of the branch. The additional amount is passed back from
9439 get_branchlength() as an "extra" value.
9440 
9441 Arguments:
9442   pptrptr     pointer to pointer in the parsed pattern
9443   errcodeptr  pointer to error code
9444   lcptr       pointer to loop counter
9445   recurses    chain of recurse_check to catch mutual recursion
9446   cb          pointer to compile block
9447 
9448 Returns:      TRUE if all is well
9449               FALSE otherwise, with error code and offset set
9450 */
9451 
9452 static BOOL
set_lookbehind_lengths(uint32_t ** pptrptr,int * errcodeptr,int * lcptr,parsed_recurse_check * recurses,compile_block * cb)9453 set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
9454   parsed_recurse_check *recurses, compile_block *cb)
9455 {
9456 PCRE2_SIZE offset;
9457 int branchlength;
9458 uint32_t *bptr = *pptrptr;
9459 
9460 READPLUSOFFSET(offset, bptr);  /* Offset for error messages */
9461 *pptrptr += SIZEOFFSET;
9462 
9463 do
9464   {
9465   *pptrptr += 1;
9466   branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
9467   if (branchlength < 0)
9468     {
9469     /* The errorcode and offset may already be set from a nested lookbehind. */
9470     if (*errcodeptr == 0) *errcodeptr = ERR25;
9471     if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
9472     return FALSE;
9473     }
9474   if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;
9475   *bptr |= branchlength;  /* branchlength never more than 65535 */
9476   bptr = *pptrptr;
9477   }
9478 while (*bptr == META_ALT);
9479 
9480 return TRUE;
9481 }
9482 
9483 
9484 
9485 /*************************************************
9486 *         Check parsed pattern lookbehinds       *
9487 *************************************************/
9488 
9489 /* This function is called at the end of parsing a pattern if any lookbehinds
9490 were encountered. It scans the parsed pattern for them, calling
9491 set_lookbehind_lengths() for each one. At the start, the errorcode is zero and
9492 the error offset is marked unset. This enables the functions above not to
9493 override settings from deeper nestings.
9494 
9495 This function is called recursively from get_branchlength() for lookaheads in
9496 order to process any lookbehinds that they may contain. It stops when it hits a
9497 non-nested closing parenthesis in this case, returning a pointer to it.
9498 
9499 Arguments
9500   pptr      points to where to start (start of pattern or start of lookahead)
9501   retptr    if not NULL, return the ket pointer here
9502   recurses  chain of recurse_check to catch mutual recursion
9503   cb        points to the compile block
9504   lcptr     points to loop counter
9505 
9506 Returns:    0 on success, or an errorcode (cb->erroroffset will be set)
9507 */
9508 
9509 static int
check_lookbehinds(uint32_t * pptr,uint32_t ** retptr,parsed_recurse_check * recurses,compile_block * cb,int * lcptr)9510 check_lookbehinds(uint32_t *pptr, uint32_t **retptr,
9511   parsed_recurse_check *recurses, compile_block *cb, int *lcptr)
9512 {
9513 int errorcode = 0;
9514 int nestlevel = 0;
9515 
9516 cb->erroroffset = PCRE2_UNSET;
9517 
9518 for (; *pptr != META_END; pptr++)
9519   {
9520   if (*pptr < META_END) continue;  /* Literal */
9521 
9522   switch (META_CODE(*pptr))
9523     {
9524     default:
9525     return ERR70;  /* Unrecognized meta code */
9526 
9527     case META_ESCAPE:
9528     if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
9529       pptr += 1;
9530     break;
9531 
9532     case META_KET:
9533     if (--nestlevel < 0)
9534       {
9535       if (retptr != NULL) *retptr = pptr;
9536       return 0;
9537       }
9538     break;
9539 
9540     case META_ATOMIC:
9541     case META_CAPTURE:
9542     case META_COND_ASSERT:
9543     case META_LOOKAHEAD:
9544     case META_LOOKAHEADNOT:
9545     case META_LOOKAHEAD_NA:
9546     case META_NOCAPTURE:
9547     case META_SCRIPT_RUN:
9548     nestlevel++;
9549     break;
9550 
9551     case META_ACCEPT:
9552     case META_ALT:
9553     case META_ASTERISK:
9554     case META_ASTERISK_PLUS:
9555     case META_ASTERISK_QUERY:
9556     case META_BACKREF:
9557     case META_CIRCUMFLEX:
9558     case META_CLASS:
9559     case META_CLASS_EMPTY:
9560     case META_CLASS_EMPTY_NOT:
9561     case META_CLASS_END:
9562     case META_CLASS_NOT:
9563     case META_COMMIT:
9564     case META_DOLLAR:
9565     case META_DOT:
9566     case META_FAIL:
9567     case META_PLUS:
9568     case META_PLUS_PLUS:
9569     case META_PLUS_QUERY:
9570     case META_PRUNE:
9571     case META_QUERY:
9572     case META_QUERY_PLUS:
9573     case META_QUERY_QUERY:
9574     case META_RANGE_ESCAPED:
9575     case META_RANGE_LITERAL:
9576     case META_SKIP:
9577     case META_THEN:
9578     break;
9579 
9580     case META_RECURSE:
9581     pptr += SIZEOFFSET;
9582     break;
9583 
9584     case META_BACKREF_BYNAME:
9585     case META_RECURSE_BYNAME:
9586     pptr += 1 + SIZEOFFSET;
9587     break;
9588 
9589     case META_COND_DEFINE:
9590     pptr += SIZEOFFSET;
9591     nestlevel++;
9592     break;
9593 
9594     case META_COND_NAME:
9595     case META_COND_NUMBER:
9596     case META_COND_RNAME:
9597     case META_COND_RNUMBER:
9598     pptr += 1 + SIZEOFFSET;
9599     nestlevel++;
9600     break;
9601 
9602     case META_COND_VERSION:
9603     pptr += 3;
9604     nestlevel++;
9605     break;
9606 
9607     case META_CALLOUT_STRING:
9608     pptr += 3 + SIZEOFFSET;
9609     break;
9610 
9611     case META_BIGVALUE:
9612     case META_OPTIONS:
9613     case META_POSIX:
9614     case META_POSIX_NEG:
9615     pptr += 1;
9616     break;
9617 
9618     case META_MINMAX:
9619     case META_MINMAX_QUERY:
9620     case META_MINMAX_PLUS:
9621     pptr += 2;
9622     break;
9623 
9624     case META_CALLOUT_NUMBER:
9625     pptr += 3;
9626     break;
9627 
9628     case META_MARK:
9629     case META_COMMIT_ARG:
9630     case META_PRUNE_ARG:
9631     case META_SKIP_ARG:
9632     case META_THEN_ARG:
9633     pptr += 1 + pptr[1];
9634     break;
9635 
9636     case META_LOOKBEHIND:
9637     case META_LOOKBEHINDNOT:
9638     case META_LOOKBEHIND_NA:
9639     if (!set_lookbehind_lengths(&pptr, &errorcode, lcptr, recurses, cb))
9640       return errorcode;
9641     break;
9642     }
9643   }
9644 
9645 return 0;
9646 }
9647 
9648 
9649 
9650 /*************************************************
9651 *     External function to compile a pattern     *
9652 *************************************************/
9653 
9654 /* This function reads a regular expression in the form of a string and returns
9655 a pointer to a block of store holding a compiled version of the expression.
9656 
9657 Arguments:
9658   pattern       the regular expression
9659   patlen        the length of the pattern, or PCRE2_ZERO_TERMINATED
9660   options       option bits
9661   errorptr      pointer to errorcode
9662   erroroffset   pointer to error offset
9663   ccontext      points to a compile context or is NULL
9664 
9665 Returns:        pointer to compiled data block, or NULL on error,
9666                 with errorcode and erroroffset set
9667 */
9668 
9669 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_compile(PCRE2_SPTR pattern,PCRE2_SIZE patlen,uint32_t options,int * errorptr,PCRE2_SIZE * erroroffset,pcre2_compile_context * ccontext)9670 pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
9671    int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
9672 {
9673 BOOL utf;                             /* Set TRUE for UTF mode */
9674 BOOL ucp;                             /* Set TRUE for UCP mode */
9675 BOOL has_lookbehind = FALSE;          /* Set TRUE if a lookbehind is found */
9676 BOOL zero_terminated;                 /* Set TRUE for zero-terminated pattern */
9677 pcre2_real_code *re = NULL;           /* What we will return */
9678 compile_block cb;                     /* "Static" compile-time data */
9679 const uint8_t *tables;                /* Char tables base pointer */
9680 
9681 PCRE2_UCHAR *code;                    /* Current pointer in compiled code */
9682 PCRE2_SPTR codestart;                 /* Start of compiled code */
9683 PCRE2_SPTR ptr;                       /* Current pointer in pattern */
9684 uint32_t *pptr;                       /* Current pointer in parsed pattern */
9685 
9686 PCRE2_SIZE length = 1;                /* Allow for final END opcode */
9687 PCRE2_SIZE usedlength;                /* Actual length used */
9688 PCRE2_SIZE re_blocksize;              /* Size of memory block */
9689 PCRE2_SIZE big32count = 0;            /* 32-bit literals >= 0x80000000 */
9690 PCRE2_SIZE parsed_size_needed;        /* Needed for parsed pattern */
9691 
9692 int32_t firstcuflags, reqcuflags;     /* Type of first/req code unit */
9693 uint32_t firstcu, reqcu;              /* Value of first/req code unit */
9694 uint32_t setflags = 0;                /* NL and BSR set flags */
9695 
9696 uint32_t skipatstart;                 /* When checking (*UTF) etc */
9697 uint32_t limit_heap  = UINT32_MAX;
9698 uint32_t limit_match = UINT32_MAX;    /* Unset match limits */
9699 uint32_t limit_depth = UINT32_MAX;
9700 
9701 int newline = 0;                      /* Unset; can be set by the pattern */
9702 int bsr = 0;                          /* Unset; can be set by the pattern */
9703 int errorcode = 0;                    /* Initialize to avoid compiler warn */
9704 int regexrc;                          /* Return from compile */
9705 
9706 uint32_t i;                           /* Local loop counter */
9707 
9708 /* Comments at the head of this file explain about these variables. */
9709 
9710 uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];
9711 uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];
9712 named_group named_groups[NAMED_GROUP_LIST_SIZE];
9713 
9714 /* The workspace is used in different ways in the different compiling phases.
9715 It needs to be 16-bit aligned for the preliminary parsing scan. */
9716 
9717 uint32_t c16workspace[C16_WORK_SIZE];
9718 PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace;
9719 
9720 
9721 /* -------------- Check arguments and set up the pattern ----------------- */
9722 
9723 /* There must be error code and offset pointers. */
9724 
9725 if (errorptr == NULL || erroroffset == NULL) return NULL;
9726 *errorptr = ERR0;
9727 *erroroffset = 0;
9728 
9729 /* There must be a pattern! */
9730 
9731 if (pattern == NULL)
9732   {
9733   *errorptr = ERR16;
9734   return NULL;
9735   }
9736 
9737 /* A NULL compile context means "use a default context" */
9738 
9739 if (ccontext == NULL)
9740   ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
9741 
9742 /* PCRE2_MATCH_INVALID_UTF implies UTF */
9743 
9744 if ((options & PCRE2_MATCH_INVALID_UTF) != 0) options |= PCRE2_UTF;
9745 
9746 /* Check that all undefined public option bits are zero. */
9747 
9748 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 ||
9749     (ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)
9750   {
9751   *errorptr = ERR17;
9752   return NULL;
9753   }
9754 
9755 if ((options & PCRE2_LITERAL) != 0 &&
9756     ((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 ||
9757      (ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))
9758   {
9759   *errorptr = ERR92;
9760   return NULL;
9761   }
9762 
9763 /* A zero-terminated pattern is indicated by the special length value
9764 PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */
9765 
9766 if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))
9767   patlen = PRIV(strlen)(pattern);
9768 
9769 if (patlen > ccontext->max_pattern_length)
9770   {
9771   *errorptr = ERR88;
9772   return NULL;
9773   }
9774 
9775 /* From here on, all returns from this function should end up going via the
9776 EXIT label. */
9777 
9778 
9779 /* ------------ Initialize the "static" compile data -------------- */
9780 
9781 tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
9782 
9783 cb.lcc = tables + lcc_offset;          /* Individual */
9784 cb.fcc = tables + fcc_offset;          /*   character */
9785 cb.cbits = tables + cbits_offset;      /*      tables */
9786 cb.ctypes = tables + ctypes_offset;
9787 
9788 cb.assert_depth = 0;
9789 cb.bracount = 0;
9790 cb.cx = ccontext;
9791 cb.dupnames = FALSE;
9792 cb.end_pattern = pattern + patlen;
9793 cb.erroroffset = 0;
9794 cb.external_flags = 0;
9795 cb.external_options = options;
9796 cb.groupinfo = stack_groupinfo;
9797 cb.had_recurse = FALSE;
9798 cb.lastcapture = 0;
9799 cb.max_lookbehind = 0;
9800 cb.name_entry_size = 0;
9801 cb.name_table = NULL;
9802 cb.named_groups = named_groups;
9803 cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
9804 cb.names_found = 0;
9805 cb.open_caps = NULL;
9806 cb.parens_depth = 0;
9807 cb.parsed_pattern = stack_parsed_pattern;
9808 cb.req_varyopt = 0;
9809 cb.start_code = cworkspace;
9810 cb.start_pattern = pattern;
9811 cb.start_workspace = cworkspace;
9812 cb.workspace_size = COMPILE_WORK_SIZE;
9813 
9814 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
9815 references to help in deciding whether (.*) can be treated as anchored or not.
9816 */
9817 
9818 cb.top_backref = 0;
9819 cb.backref_map = 0;
9820 
9821 /* Escape sequences \1 to \9 are always back references, but as they are only
9822 two characters long, only two elements can be used in the parsed_pattern
9823 vector. The first contains the reference, and we'd like to use the second to
9824 record the offset in the pattern, so that forward references to non-existent
9825 groups can be diagnosed later with an offset. However, on 64-bit systems,
9826 PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first
9827 occurrence of \1 to \9, indexed by the second parsed_pattern value. All other
9828 references have enough space for the offset to be put into the parsed pattern.
9829 */
9830 
9831 for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET;
9832 
9833 
9834 /* --------------- Start looking at the pattern --------------- */
9835 
9836 /* Unless PCRE2_LITERAL is set, check for global one-time option settings at
9837 the start of the pattern, and remember the offset to the actual regex. With
9838 valgrind support, make the terminator of a zero-terminated pattern
9839 inaccessible. This catches bugs that would otherwise only show up for
9840 non-zero-terminated patterns. */
9841 
9842 #ifdef SUPPORT_VALGRIND
9843 if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1));
9844 #endif
9845 
9846 ptr = pattern;
9847 skipatstart = 0;
9848 
9849 if ((options & PCRE2_LITERAL) == 0)
9850   {
9851   while (patlen - skipatstart >= 2 &&
9852          ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
9853          ptr[skipatstart+1] == CHAR_ASTERISK)
9854     {
9855     for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
9856       {
9857       uint32_t c, pp;
9858       pso *p = pso_list + i;
9859 
9860       if (patlen - skipatstart - 2 >= p->length &&
9861           PRIV(strncmp_c8)(ptr + skipatstart + 2, (char *)(p->name),
9862             p->length) == 0)
9863         {
9864         skipatstart += p->length + 2;
9865         switch(p->type)
9866           {
9867           case PSO_OPT:
9868           cb.external_options |= p->value;
9869           break;
9870 
9871           case PSO_FLG:
9872           setflags |= p->value;
9873           break;
9874 
9875           case PSO_NL:
9876           newline = p->value;
9877           setflags |= PCRE2_NL_SET;
9878           break;
9879 
9880           case PSO_BSR:
9881           bsr = p->value;
9882           setflags |= PCRE2_BSR_SET;
9883           break;
9884 
9885           case PSO_LIMM:
9886           case PSO_LIMD:
9887           case PSO_LIMH:
9888           c = 0;
9889           pp = skipatstart;
9890           if (!IS_DIGIT(ptr[pp]))
9891             {
9892             errorcode = ERR60;
9893             ptr += pp;
9894             goto HAD_EARLY_ERROR;
9895             }
9896           while (IS_DIGIT(ptr[pp]))
9897             {
9898             if (c > UINT32_MAX / 10 - 1) break;   /* Integer overflow */
9899             c = c*10 + (ptr[pp++] - CHAR_0);
9900             }
9901           if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS)
9902             {
9903             errorcode = ERR60;
9904             ptr += pp;
9905             goto HAD_EARLY_ERROR;
9906             }
9907           if (p->type == PSO_LIMH) limit_heap = c;
9908             else if (p->type == PSO_LIMM) limit_match = c;
9909             else limit_depth = c;
9910           skipatstart += pp - skipatstart;
9911           break;
9912           }
9913         break;   /* Out of the table scan loop */
9914         }
9915       }
9916     if (i >= sizeof(pso_list)/sizeof(pso)) break;   /* Out of pso loop */
9917     }
9918   }
9919 
9920 /* End of pattern-start options; advance to start of real regex. */
9921 
9922 ptr += skipatstart;
9923 
9924 /* Can't support UTF or UCP if PCRE2 was built without Unicode support. */
9925 
9926 #ifndef SUPPORT_UNICODE
9927 if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
9928   {
9929   errorcode = ERR32;
9930   goto HAD_EARLY_ERROR;
9931   }
9932 #endif
9933 
9934 /* Check UTF. We have the original options in 'options', with that value as
9935 modified by (*UTF) etc in cb->external_options. The extra option
9936 PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
9937 surrogate code points cannot be represented in UTF-16. */
9938 
9939 utf = (cb.external_options & PCRE2_UTF) != 0;
9940 if (utf)
9941   {
9942   if ((options & PCRE2_NEVER_UTF) != 0)
9943     {
9944     errorcode = ERR74;
9945     goto HAD_EARLY_ERROR;
9946     }
9947   if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
9948        (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
9949     goto HAD_ERROR;  /* Offset was set by valid_utf() */
9950 
9951 #if PCRE2_CODE_UNIT_WIDTH == 16
9952   if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
9953     {
9954     errorcode = ERR91;
9955     goto HAD_EARLY_ERROR;
9956     }
9957 #endif
9958   }
9959 
9960 /* Check UCP lockout. */
9961 
9962 ucp = (cb.external_options & PCRE2_UCP) != 0;
9963 if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
9964   {
9965   errorcode = ERR75;
9966   goto HAD_EARLY_ERROR;
9967   }
9968 
9969 /* Process the BSR setting. */
9970 
9971 if (bsr == 0) bsr = ccontext->bsr_convention;
9972 
9973 /* Process the newline setting. */
9974 
9975 if (newline == 0) newline = ccontext->newline_convention;
9976 cb.nltype = NLTYPE_FIXED;
9977 switch(newline)
9978   {
9979   case PCRE2_NEWLINE_CR:
9980   cb.nllen = 1;
9981   cb.nl[0] = CHAR_CR;
9982   break;
9983 
9984   case PCRE2_NEWLINE_LF:
9985   cb.nllen = 1;
9986   cb.nl[0] = CHAR_NL;
9987   break;
9988 
9989   case PCRE2_NEWLINE_NUL:
9990   cb.nllen = 1;
9991   cb.nl[0] = CHAR_NUL;
9992   break;
9993 
9994   case PCRE2_NEWLINE_CRLF:
9995   cb.nllen = 2;
9996   cb.nl[0] = CHAR_CR;
9997   cb.nl[1] = CHAR_NL;
9998   break;
9999 
10000   case PCRE2_NEWLINE_ANY:
10001   cb.nltype = NLTYPE_ANY;
10002   break;
10003 
10004   case PCRE2_NEWLINE_ANYCRLF:
10005   cb.nltype = NLTYPE_ANYCRLF;
10006   break;
10007 
10008   default:
10009   errorcode = ERR56;
10010   goto HAD_EARLY_ERROR;
10011   }
10012 
10013 /* Pre-scan the pattern to do two things: (1) Discover the named groups and
10014 their numerical equivalents, so that this information is always available for
10015 the remaining processing. (2) At the same time, parse the pattern and put a
10016 processed version into the parsed_pattern vector. This has escapes interpreted
10017 and comments removed (amongst other things).
10018 
10019 In all but one case, when PCRE2_AUTO_CALLOUT is not set, the number of unsigned
10020 32-bit ints in the parsed pattern is bounded by the pattern length plus one
10021 (for the terminator) plus four if PCRE2_EXTRA_MATCH_WORD or
10022 PCRE2_EXTRA_MATCH_LINE is set. The exception is 32-bit, non-UTF mode, where
10023 literal characters greater than or equal to META_END (0x80000000) have to be
10024 coded as two units, so here we scan the pattern to count such values. */
10025 
10026 #if PCRE2_CODE_UNIT_WIDTH == 32
10027 if (!utf)
10028   {
10029   PCRE2_SPTR p;
10030   for (p = ptr; p < cb.end_pattern; p++) if (*p >= META_END) big32count++;
10031   }
10032 #endif
10033 
10034 /* Ensure that the parsed pattern buffer is big enough. When PCRE2_AUTO_CALLOUT
10035 is set we have to assume a numerical callout (4 elements) for each character
10036 plus one at the end. This is overkill, but memory is plentiful these days. For
10037 many smaller patterns the vector on the stack (which was set up above) can be
10038 used. */
10039 
10040 parsed_size_needed = patlen - skipatstart + big32count;
10041 
10042 if ((ccontext->extra_options &
10043      (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0)
10044   parsed_size_needed += 4;
10045 
10046 if ((options & PCRE2_AUTO_CALLOUT) != 0)
10047   parsed_size_needed = (parsed_size_needed + 1) * 5;
10048 
10049 if (parsed_size_needed >= PARSED_PATTERN_DEFAULT_SIZE)
10050   {
10051   uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(
10052     (parsed_size_needed + 1) * sizeof(uint32_t), ccontext->memctl.memory_data);
10053   if (heap_parsed_pattern == NULL)
10054     {
10055     *errorptr = ERR21;
10056     goto EXIT;
10057     }
10058   cb.parsed_pattern = heap_parsed_pattern;
10059   }
10060 cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed + 1;
10061 
10062 /* Do the parsing scan. */
10063 
10064 errorcode = parse_regex(ptr, cb.external_options, &has_lookbehind, &cb);
10065 if (errorcode != 0) goto HAD_CB_ERROR;
10066 
10067 /* Workspace is needed to remember information about numbered groups: whether a
10068 group can match an empty string and what its fixed length is. This is done to
10069 avoid the possibility of recursive references causing very long compile times
10070 when checking these features. Unnumbered groups do not have this exposure since
10071 they cannot be referenced. We use an indexed vector for this purpose. If there
10072 are sufficiently few groups, the default vector on the stack, as set up above,
10073 can be used. Otherwise we have to get/free a special vector. The vector must be
10074 initialized to zero. */
10075 
10076 if (cb.bracount >= GROUPINFO_DEFAULT_SIZE)
10077   {
10078   cb.groupinfo = ccontext->memctl.malloc(
10079     (cb.bracount + 1)*sizeof(uint32_t), ccontext->memctl.memory_data);
10080   if (cb.groupinfo == NULL)
10081     {
10082     errorcode = ERR21;
10083     cb.erroroffset = 0;
10084     goto HAD_CB_ERROR;
10085     }
10086   }
10087 memset(cb.groupinfo, 0, (cb.bracount + 1) * sizeof(uint32_t));
10088 
10089 /* If there were any lookbehinds, scan the parsed pattern to figure out their
10090 lengths. */
10091 
10092 if (has_lookbehind)
10093   {
10094   int loopcount = 0;
10095   errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb, &loopcount);
10096   if (errorcode != 0) goto HAD_CB_ERROR;
10097   }
10098 
10099 /* For debugging, there is a function that shows the parsed data vector. */
10100 
10101 #ifdef DEBUG_SHOW_PARSED
10102 fprintf(stderr, "+++ Pre-scan complete:\n");
10103 show_parsed(&cb);
10104 #endif
10105 
10106 /* For debugging capturing information this code can be enabled. */
10107 
10108 #ifdef DEBUG_SHOW_CAPTURES
10109   {
10110   named_group *ng = cb.named_groups;
10111   fprintf(stderr, "+++Captures: %d\n", cb.bracount);
10112   for (i = 0; i < cb.names_found; i++, ng++)
10113     {
10114     fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
10115     }
10116   }
10117 #endif
10118 
10119 /* Pretend to compile the pattern while actually just accumulating the amount
10120 of memory required in the 'length' variable. This behaviour is triggered by
10121 passing a non-NULL final argument to compile_regex(). We pass a block of
10122 workspace (cworkspace) for it to compile parts of the pattern into; the
10123 compiled code is discarded when it is no longer needed, so hopefully this
10124 workspace will never overflow, though there is a test for its doing so.
10125 
10126 On error, errorcode will be set non-zero, so we don't need to look at the
10127 result of the function. The initial options have been put into the cb block,
10128 but we still have to pass a separate options variable (the first argument)
10129 because the options may change as the pattern is processed. */
10130 
10131 cb.erroroffset = patlen;   /* For any subsequent errors that do not set it */
10132 pptr = cb.parsed_pattern;
10133 code = cworkspace;
10134 *code = OP_BRA;
10135 
10136 (void)compile_regex(cb.external_options, &code, &pptr, &errorcode, 0, &firstcu,
10137    &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, &length);
10138 
10139 if (errorcode != 0) goto HAD_CB_ERROR;  /* Offset is in cb.erroroffset */
10140 
10141 /* This should be caught in compile_regex(), but just in case... */
10142 
10143 if (length > MAX_PATTERN_SIZE)
10144   {
10145   errorcode = ERR20;
10146   goto HAD_CB_ERROR;
10147   }
10148 
10149 /* Compute the size of, and then get and initialize, the data block for storing
10150 the compiled pattern and names table. Integer overflow should no longer be
10151 possible because nowadays we limit the maximum value of cb.names_found and
10152 cb.name_entry_size. */
10153 
10154 re_blocksize = sizeof(pcre2_real_code) +
10155   CU2BYTES(length +
10156   (PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
10157 re = (pcre2_real_code *)
10158   ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
10159 if (re == NULL)
10160   {
10161   errorcode = ERR21;
10162   goto HAD_CB_ERROR;
10163   }
10164 
10165 /* The compiler may put padding at the end of the pcre2_real_code structure in
10166 order to round it up to a multiple of 4 or 8 bytes. This means that when a
10167 compiled pattern is copied (for example, when serialized) undefined bytes are
10168 read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
10169 write to the last 8 bytes of the structure before setting the fields. */
10170 
10171 memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
10172 re->memctl = ccontext->memctl;
10173 re->tables = tables;
10174 re->executable_jit = NULL;
10175 memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
10176 re->blocksize = re_blocksize;
10177 re->magic_number = MAGIC_NUMBER;
10178 re->compile_options = options;
10179 re->overall_options = cb.external_options;
10180 re->extra_options = ccontext->extra_options;
10181 re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
10182 re->limit_heap = limit_heap;
10183 re->limit_match = limit_match;
10184 re->limit_depth = limit_depth;
10185 re->first_codeunit = 0;
10186 re->last_codeunit = 0;
10187 re->bsr_convention = bsr;
10188 re->newline_convention = newline;
10189 re->max_lookbehind = 0;
10190 re->minlength = 0;
10191 re->top_bracket = 0;
10192 re->top_backref = 0;
10193 re->name_entry_size = cb.name_entry_size;
10194 re->name_count = cb.names_found;
10195 
10196 /* The basic block is immediately followed by the name table, and the compiled
10197 code follows after that. */
10198 
10199 codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) +
10200   re->name_entry_size * re->name_count;
10201 
10202 /* Update the compile data block for the actual compile. The starting points of
10203 the name/number translation table and of the code are passed around in the
10204 compile data block. The start/end pattern and initial options are already set
10205 from the pre-compile phase, as is the name_entry_size field. */
10206 
10207 cb.parens_depth = 0;
10208 cb.assert_depth = 0;
10209 cb.lastcapture = 0;
10210 cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
10211 cb.start_code = codestart;
10212 cb.req_varyopt = 0;
10213 cb.had_accept = FALSE;
10214 cb.had_pruneorskip = FALSE;
10215 cb.open_caps = NULL;
10216 
10217 /* If any named groups were found, create the name/number table from the list
10218 created in the pre-pass. */
10219 
10220 if (cb.names_found > 0)
10221   {
10222   named_group *ng = cb.named_groups;
10223   for (i = 0; i < cb.names_found; i++, ng++)
10224     add_name_to_table(&cb, ng->name, ng->length, ng->number, i);
10225   }
10226 
10227 /* Set up a starting, non-extracting bracket, then compile the expression. On
10228 error, errorcode will be set non-zero, so we don't need to look at the result
10229 of the function here. */
10230 
10231 pptr = cb.parsed_pattern;
10232 code = (PCRE2_UCHAR *)codestart;
10233 *code = OP_BRA;
10234 regexrc = compile_regex(re->overall_options, &code, &pptr, &errorcode, 0,
10235   &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, NULL);
10236 if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
10237 re->top_bracket = cb.bracount;
10238 re->top_backref = cb.top_backref;
10239 re->max_lookbehind = cb.max_lookbehind;
10240 
10241 if (cb.had_accept)
10242   {
10243   reqcu = 0;                     /* Must disable after (*ACCEPT) */
10244   reqcuflags = REQ_NONE;
10245   re->flags |= PCRE2_HASACCEPT;  /* Disables minimum length */
10246   }
10247 
10248 /* Fill in the final opcode and check for disastrous overflow. If no overflow,
10249 but the estimated length exceeds the really used length, adjust the value of
10250 re->blocksize, and if valgrind support is configured, mark the extra allocated
10251 memory as unaddressable, so that any out-of-bound reads can be detected. */
10252 
10253 *code++ = OP_END;
10254 usedlength = code - codestart;
10255 if (usedlength > length) errorcode = ERR23; else
10256   {
10257   re->blocksize -= CU2BYTES(length - usedlength);
10258 #ifdef SUPPORT_VALGRIND
10259   VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
10260 #endif
10261   }
10262 
10263 /* Scan the pattern for recursion/subroutine calls and convert the group
10264 numbers into offsets. Maintain a small cache so that repeated groups containing
10265 recursions are efficiently handled. */
10266 
10267 #define RSCAN_CACHE_SIZE 8
10268 
10269 if (errorcode == 0 && cb.had_recurse)
10270   {
10271   PCRE2_UCHAR *rcode;
10272   PCRE2_SPTR rgroup;
10273   unsigned int ccount = 0;
10274   int start = RSCAN_CACHE_SIZE;
10275   recurse_cache rc[RSCAN_CACHE_SIZE];
10276 
10277   for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf);
10278        rcode != NULL;
10279        rcode = (PCRE2_UCHAR *)find_recurse(rcode + 1 + LINK_SIZE, utf))
10280     {
10281     int p, groupnumber;
10282 
10283     groupnumber = (int)GET(rcode, 1);
10284     if (groupnumber == 0) rgroup = codestart; else
10285       {
10286       PCRE2_SPTR search_from = codestart;
10287       rgroup = NULL;
10288       for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
10289         {
10290         if (groupnumber == rc[p].groupnumber)
10291           {
10292           rgroup = rc[p].group;
10293           break;
10294           }
10295 
10296         /* Group n+1 must always start to the right of group n, so we can save
10297         search time below when the new group number is greater than any of the
10298         previously found groups. */
10299 
10300         if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;
10301         }
10302 
10303       if (rgroup == NULL)
10304         {
10305         rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);
10306         if (rgroup == NULL)
10307           {
10308           errorcode = ERR53;
10309           break;
10310           }
10311         if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
10312         rc[start].groupnumber = groupnumber;
10313         rc[start].group = rgroup;
10314         if (ccount < RSCAN_CACHE_SIZE) ccount++;
10315         }
10316       }
10317 
10318     PUT(rcode, 1, rgroup - codestart);
10319     }
10320   }
10321 
10322 /* In rare debugging situations we sometimes need to look at the compiled code
10323 at this stage. */
10324 
10325 #ifdef DEBUG_CALL_PRINTINT
10326 pcre2_printint(re, stderr, TRUE);
10327 fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
10328 #endif
10329 
10330 /* Unless disabled, check whether any single character iterators can be
10331 auto-possessified. The function overwrites the appropriate opcode values, so
10332 the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
10333 used in this code because at least one compiler gives a warning about loss of
10334 "const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the
10335 function call. */
10336 
10337 if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
10338   {
10339   PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
10340   if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80;
10341   }
10342 
10343 /* Failed to compile, or error while post-processing. */
10344 
10345 if (errorcode != 0) goto HAD_CB_ERROR;
10346 
10347 /* Successful compile. If the anchored option was not passed, set it if
10348 we can determine that the pattern is anchored by virtue of ^ characters or \A
10349 or anything else, such as starting with non-atomic .* when DOTALL is set and
10350 there are no occurrences of *PRUNE or *SKIP (though there is an option to
10351 disable this case). */
10352 
10353 if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10354      is_anchored(codestart, 0, &cb, 0, FALSE))
10355   re->overall_options |= PCRE2_ANCHORED;
10356 
10357 /* Set up the first code unit or startline flag, the required code unit, and
10358 then study the pattern. This code need not be obeyed if PCRE2_NO_START_OPTIMIZE
10359 is set, as the data it would create will not be used. Note that a first code
10360 unit (but not the startline flag) is useful for anchored patterns because it
10361 can still give a quick "no match" and also avoid searching for a last code
10362 unit. */
10363 
10364 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
10365   {
10366   int minminlength = 0;  /* For minimal minlength from first/required CU */
10367 
10368   /* If we do not have a first code unit, see if there is one that is asserted
10369   (these are not saved during the compile because they can cause conflicts with
10370   actual literals that follow). */
10371 
10372   if (firstcuflags < 0)
10373     firstcu = find_firstassertedcu(codestart, &firstcuflags, 0);
10374 
10375   /* Save the data for a first code unit. The existence of one means the
10376   minimum length must be at least 1. */
10377 
10378   if (firstcuflags >= 0)
10379     {
10380     re->first_codeunit = firstcu;
10381     re->flags |= PCRE2_FIRSTSET;
10382     minminlength++;
10383 
10384     /* Handle caseless first code units. */
10385 
10386     if ((firstcuflags & REQ_CASELESS) != 0)
10387       {
10388       if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
10389         {
10390         if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
10391         }
10392 
10393       /* The first code unit is >= 128 in UTF or UCP mode, or >= 255 otherwise.
10394       In 8-bit UTF mode, codepoints in the range 128-255 are introductory code
10395       points and cannot have another case, but if UCP is set they may do. */
10396 
10397 #ifdef SUPPORT_UNICODE
10398 #if PCRE2_CODE_UNIT_WIDTH == 8
10399       else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
10400         re->flags |= PCRE2_FIRSTCASELESS;
10401 #else
10402       else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
10403                UCD_OTHERCASE(firstcu) != firstcu)
10404         re->flags |= PCRE2_FIRSTCASELESS;
10405 #endif
10406 #endif  /* SUPPORT_UNICODE */
10407       }
10408     }
10409 
10410   /* When there is no first code unit, for non-anchored patterns, see if we can
10411   set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
10412   branches start with ^ and also when all branches start with non-atomic .* for
10413   non-DOTALL matches when *PRUNE and *SKIP are not present. (There is an option
10414   that disables this case.) */
10415 
10416   else if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10417            is_startline(codestart, 0, &cb, 0, FALSE))
10418     re->flags |= PCRE2_STARTLINE;
10419 
10420   /* Handle the "required code unit", if one is set. In the UTF case we can
10421   increment the minimum minimum length only if we are sure this really is a
10422   different character and not a non-starting code unit of the first character,
10423   because the minimum length count is in characters, not code units. */
10424 
10425   if (reqcuflags >= 0)
10426     {
10427 #if PCRE2_CODE_UNIT_WIDTH == 16
10428     if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
10429         firstcuflags < 0 ||                         /* First not set */
10430         (firstcu & 0xf800) != 0xd800 ||             /* First not surrogate */
10431         (reqcu & 0xfc00) != 0xdc00)                 /* Req not low surrogate */
10432 #elif PCRE2_CODE_UNIT_WIDTH == 8
10433     if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
10434         firstcuflags < 0 ||                         /* First not set */
10435         (firstcu & 0x80) == 0 ||                    /* First is ASCII */
10436         (reqcu & 0x80) == 0)                        /* Req is ASCII */
10437 #endif
10438       {
10439       minminlength++;
10440       }
10441 
10442     /* In the case of an anchored pattern, set up the value only if it follows
10443     a variable length item in the pattern. */
10444 
10445     if ((re->overall_options & PCRE2_ANCHORED) == 0 ||
10446         (reqcuflags & REQ_VARY) != 0)
10447       {
10448       re->last_codeunit = reqcu;
10449       re->flags |= PCRE2_LASTSET;
10450 
10451       /* Handle caseless required code units as for first code units (above). */
10452 
10453       if ((reqcuflags & REQ_CASELESS) != 0)
10454         {
10455         if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
10456           {
10457           if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
10458           }
10459 #ifdef SUPPORT_UNICODE
10460 #if PCRE2_CODE_UNIT_WIDTH == 8
10461       else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
10462         re->flags |= PCRE2_LASTCASELESS;
10463 #else
10464       else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
10465                UCD_OTHERCASE(reqcu) != reqcu)
10466         re->flags |= PCRE2_LASTCASELESS;
10467 #endif
10468 #endif  /* SUPPORT_UNICODE */
10469         }
10470       }
10471     }
10472 
10473   /* Study the compiled pattern to set up information such as a bitmap of
10474   starting code units and a minimum matching length. */
10475 
10476   if (PRIV(study)(re) != 0)
10477     {
10478     errorcode = ERR31;
10479     goto HAD_CB_ERROR;
10480     }
10481 
10482   /* If study() set a bitmap of starting code units, it implies a minimum
10483   length of at least one. */
10484 
10485   if ((re->flags & PCRE2_FIRSTMAPSET) != 0 && minminlength == 0)
10486     minminlength = 1;
10487 
10488   /* If the minimum length set (or not set) by study() is less than the minimum
10489   implied by required code units, override it. */
10490 
10491   if (re->minlength < minminlength) re->minlength = minminlength;
10492   }   /* End of start-of-match optimizations. */
10493 
10494 /* Control ends up here in all cases. When running under valgrind, make a
10495 pattern's terminating zero defined again. If memory was obtained for the parsed
10496 version of the pattern, free it before returning. Also free the list of named
10497 groups if a larger one had to be obtained, and likewise the group information
10498 vector. */
10499 
10500 EXIT:
10501 #ifdef SUPPORT_VALGRIND
10502 if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1));
10503 #endif
10504 if (cb.parsed_pattern != stack_parsed_pattern)
10505   ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);
10506 if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
10507   ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
10508 if (cb.groupinfo != stack_groupinfo)
10509   ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
10510 return re;    /* Will be NULL after an error */
10511 
10512 /* Errors discovered in parse_regex() set the offset value in the compile
10513 block. Errors discovered before it is called must compute it from the ptr
10514 value. After parse_regex() is called, the offset in the compile block is set to
10515 the end of the pattern, but certain errors in compile_regex() may reset it if
10516 an offset is available in the parsed pattern. */
10517 
10518 HAD_CB_ERROR:
10519 ptr = pattern + cb.erroroffset;
10520 
10521 HAD_EARLY_ERROR:
10522 *erroroffset = ptr - pattern;
10523 
10524 HAD_ERROR:
10525 *errorptr = errorcode;
10526 pcre2_code_free(re);
10527 re = NULL;
10528 goto EXIT;
10529 }
10530 
10531 /* End of pcre2_compile.c */
10532