1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016-2024 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45
46 #define NLBLOCK cb /* Block containing newline information */
47 #define PSSTART start_pattern /* Field containing processed string start */
48 #define PSEND end_pattern /* Field containing processed string end */
49
50 #include "pcre2_internal.h"
51
52 /* In rare error cases debugging might require calling pcre2_printint(). */
53
#if 0  /* Change to 1 to compile in pcre2_printint() for debugging */
#ifndef EBCDIC
#define PRINTABLE(c) ((c) >= 32 && (c) < 127)   /* non-EBCDIC range */
#else
#define PRINTABLE(c) ((c) >= 64 && (c) < 255)   /* EBCDIC range */
#endif
#include "pcre2_printint.c"
#define DEBUG_CALL_PRINTINT
#endif
63
64 /* Other debugging code can be enabled by these defines. */
65
66 /* #define DEBUG_SHOW_CAPTURES */
67 /* #define DEBUG_SHOW_PARSED */
68
69 /* There are a few things that vary with different code unit sizes. Handle them
70 by defining macros in order to minimize #if usage. */
71
/* The "(*UTFn)" start-of-pattern string and its length, selected by the code
unit width. */

#if PCRE2_CODE_UNIT_WIDTH == 8
#define STRING_UTFn_RIGHTPAR STRING_UTF8_RIGHTPAR, 5
#elif PCRE2_CODE_UNIT_WIDTH == 16
#define STRING_UTFn_RIGHTPAR STRING_UTF16_RIGHTPAR, 6
#else /* 32-bit */
#define STRING_UTFn_RIGHTPAR STRING_UTF32_RIGHTPAR, 6
#endif

/* Look up a code unit in xdigitab. With 8-bit code units the value is used
directly as the index; for the wider code units anything that fails the
MAX_255() test yields 0xff, the "not a hex digit" value. */

#if PCRE2_CODE_UNIT_WIDTH == 8
#define XDIGIT(c) xdigitab[c]
#else /* Either 16-bit or 32-bit */
#define XDIGIT(c) (MAX_255(c)? xdigitab[c] : 0xff)
#endif
86
87 /* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which
88 consists of uint32_t elements. Assume that if uint32_t can't hold it, two of
89 them will be able to (i.e. assume a 64-bit world). */
90
#if PCRE2_SIZE_MAX <= UINT32_MAX

/* One uint32_t element holds a complete offset. All macro arguments are
parenthesized so that expression arguments expand safely. */

#define PUTOFFSET(s,p)      (*(p)++ = (s))
#define GETOFFSET(s,p)      ((s) = *(p)++)
#define GETPLUSOFFSET(s,p)  ((s) = *(++(p)))
#define READPLUSOFFSET(s,p) ((s) = (p)[1])
#define SKIPOFFSET(p)       ((p)++)
#define SIZEOFFSET 1

#else

/* An offset occupies two elements, most significant half first. The
multi-statement macros are wrapped in do { } while (0) so that each behaves as
a single statement, for example in an unbraced if/else. As before, each use
must be followed by a semicolon. Note that PUTOFFSET evaluates s twice.

GETPLUSOFFSET and READPLUSOFFSET read the offset that starts one element after
p; GETPLUSOFFSET then advances p by two, READPLUSOFFSET leaves p unchanged. */

#define PUTOFFSET(s,p) \
  do { \
    *(p)++ = (uint32_t)((s) >> 32); \
    *(p)++ = (uint32_t)((s) & 0xffffffff); \
  } while (0)

#define GETOFFSET(s,p) \
  do { \
    (s) = ((PCRE2_SIZE)(p)[0] << 32) | (PCRE2_SIZE)(p)[1]; \
    (p) += 2; \
  } while (0)

#define GETPLUSOFFSET(s,p) \
  do { \
    (s) = ((PCRE2_SIZE)(p)[1] << 32) | (PCRE2_SIZE)(p)[2]; \
    (p) += 2; \
  } while (0)

#define READPLUSOFFSET(s,p) \
  do { \
    (s) = ((PCRE2_SIZE)(p)[1] << 32) | (PCRE2_SIZE)(p)[2]; \
  } while (0)

#define SKIPOFFSET(p) ((p) += 2)
#define SIZEOFFSET 2

#endif
110
111 /* Macros for manipulating elements of the parsed pattern vector. */
112
/* META_CODE keeps the top 16 bits of an element (the item code), META_DATA
keeps the bottom 16 bits (the item's data), and META_DIFF gives the difference
between two elements shifted down by 16 bits. The arguments are parenthesized
so that expressions such as (a | b) or (b - c) are handled correctly. */

#define META_CODE(x)   ((x) & 0xffff0000u)
#define META_DATA(x)   ((x) & 0x0000ffffu)
#define META_DIFF(x,y) (((x)-(y))>>16)
116
/* Forward declarations of functions, to allow mutual recursion */
118
119 #ifdef SUPPORT_UNICODE
120 static unsigned int
121 add_list_to_class_internal(uint8_t *, PCRE2_UCHAR **, uint32_t, uint32_t,
122 compile_block *, const uint32_t *, unsigned int);
123 #endif
124
125 static int
126 compile_regex(uint32_t, uint32_t, PCRE2_UCHAR **, uint32_t **, int *,
127 uint32_t, uint32_t *, uint32_t *, uint32_t *, uint32_t *, branch_chain *,
128 open_capitem *, compile_block *, PCRE2_SIZE *);
129
130 static int
131 get_branchlength(uint32_t **, int *, int *, int *, parsed_recurse_check *,
132 compile_block *);
133
134 static BOOL
135 set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,
136 compile_block *);
137
138 static int
139 check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *,
140 compile_block *, int *);
141
142
143 /*************************************************
144 * Code parameters and static tables *
145 *************************************************/
146
/* Limits for group numbers and repeat counts. REPEAT_UNLIMITED is one more
than the largest repeat count. */

#define MAX_GROUP_NUMBER  65535u
#define MAX_REPEAT_COUNT  65535u
#define REPEAT_UNLIMITED  (MAX_REPEAT_COUNT+1)

/* COMPILE_WORK_SIZE is the size of the stack workspace. The different scans
of the pattern use it in different ways. The pre-scan that parses the pattern
and identifies groups uses it to handle nesting, for which it must be 16-bit
aligned; C16_WORK_SIZE gives the same space as a number of elements of a
16-bit vector.

In the first compiling phase, whose purpose is to find out how much memory is
required, the regex is partly compiled into this space. Compiled parts are
discarded as soon as they can be, so an overrun should hopefully never happen,
but pathological patterns can cause one and the code checks for it. The
workspace size depends on LINK_SIZE because the length of compiled items
varies with it.

The real compile phase does not currently use this workspace. */

#define COMPILE_WORK_SIZE (3000*LINK_SIZE)   /* Size in code units */

#define C16_WORK_SIZE \
  ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t))

/* Size of the default on-stack uint32_t vector that caches information about
the size of capturing groups, to improve performance. */

#define GROUPINFO_DEFAULT_SIZE 256

/* Overrun tests compare against a slightly smaller size, so that an overrun is
detected before the code actually runs off the end of the data block. */

#define WORK_SIZE_SAFETY_MARGIN (100)

/* Number of slots in the initial vector used for remembering named groups
during the pre-compile. The vector is allocated on the stack; if it proves too
small it is expanded, in a similar way to the workspace. */

#define NAMED_GROUP_LIST_SIZE 20

/* The pre-compiling pass turns the pattern into a parsed pattern held in a
vector of uint32_t. Short patterns use a stack vector of this size; longer
patterns use heap memory. */

#define PARSED_PATTERN_DEFAULT_SIZE 1024

/* Largest value allowed when checking that the variable holding the compiled
pattern length does not overflow. It is a bit less than INT_MAX to leave room
for adding in group terminating code units without checking them every time. */

#define OFLOW_MAX (INT_MAX - 20)
201
202 /* Code values for parsed patterns, which are stored in a vector of 32-bit
203 unsigned ints. Values less than META_END are literal data values. The coding
204 for identifying the item is in the top 16-bits, leaving 16 bits for the
205 additional data that some of them need. The META_CODE, META_DATA, and META_DIFF
206 macros are used to manipulate parsed pattern elements.
207
208 NOTE: When these definitions are changed, the table of extra lengths for each
209 code (meta_extra_lengths, just below) must be updated to remain in step. */
210
#define META_END              0x80000000u  /* End of pattern */

#define META_ALT              0x80010000u  /* alternation */
#define META_ATOMIC           0x80020000u  /* atomic group */
#define META_BACKREF          0x80030000u  /* Back ref */
#define META_BACKREF_BYNAME   0x80040000u  /* \k'name' */
#define META_BIGVALUE         0x80050000u  /* Next is a literal > META_END */
#define META_CALLOUT_NUMBER   0x80060000u  /* (?C with numerical argument */
#define META_CALLOUT_STRING   0x80070000u  /* (?C with string argument */
#define META_CAPTURE          0x80080000u  /* Capturing parenthesis */
#define META_CIRCUMFLEX       0x80090000u  /* ^ metacharacter */
#define META_CLASS            0x800a0000u  /* start non-empty class */
#define META_CLASS_EMPTY      0x800b0000u  /* empty class */
#define META_CLASS_EMPTY_NOT  0x800c0000u  /* negative empty class */
#define META_CLASS_END        0x800d0000u  /* end of non-empty class */
#define META_CLASS_NOT        0x800e0000u  /* start non-empty negative class */
#define META_COND_ASSERT      0x800f0000u  /* (?(?assertion)... */
#define META_COND_DEFINE      0x80100000u  /* (?(DEFINE)... */
#define META_COND_NAME        0x80110000u  /* (?(<name>)... */
#define META_COND_NUMBER      0x80120000u  /* (?(digits)... */
#define META_COND_RNAME       0x80130000u  /* (?(R&name)... */
#define META_COND_RNUMBER     0x80140000u  /* (?(Rdigits)... */
#define META_COND_VERSION     0x80150000u  /* (?(VERSION<op>x.y)... */
#define META_DOLLAR           0x80160000u  /* $ metacharacter */
#define META_DOT              0x80170000u  /* . metacharacter */
#define META_ESCAPE           0x80180000u  /* \d and friends */
#define META_KET              0x80190000u  /* closing parenthesis */
#define META_NOCAPTURE        0x801a0000u  /* no capture parens */
#define META_OPTIONS          0x801b0000u  /* (?i) and friends */
#define META_POSIX            0x801c0000u  /* POSIX class item */
#define META_POSIX_NEG        0x801d0000u  /* negative POSIX class item */
#define META_RANGE_ESCAPED    0x801e0000u  /* range with at least one escape */
#define META_RANGE_LITERAL    0x801f0000u  /* range defined literally */
#define META_RECURSE          0x80200000u  /* Recursion */
#define META_RECURSE_BYNAME   0x80210000u  /* (?&name) */
#define META_SCRIPT_RUN       0x80220000u  /* (*script_run:...) */

/* The next four are kept together so that it is easy to check that an
assertion is present where one is expected in a conditional group. */

#define META_LOOKAHEAD        0x80230000u  /* (?= */
#define META_LOOKAHEADNOT     0x80240000u  /* (?! */
#define META_LOOKBEHIND       0x80250000u  /* (?<= */
#define META_LOOKBEHINDNOT    0x80260000u  /* (?<! */

/* The non-atomic lookarounds cannot be conditions. */

#define META_LOOKAHEAD_NA     0x80270000u  /* (*napla: */
#define META_LOOKBEHIND_NA    0x80280000u  /* (*naplb: */

/* The verb codes must stay in this order with consecutive values. Each _ARG
version of COMMIT, PRUNE, SKIP, and THEN must come immediately after its
non-argument version. */

#define META_MARK             0x80290000u  /* (*MARK) */
#define META_ACCEPT           0x802a0000u  /* (*ACCEPT) */
#define META_FAIL             0x802b0000u  /* (*FAIL) */
#define META_COMMIT           0x802c0000u  /* (*COMMIT) */
#define META_COMMIT_ARG       0x802d0000u  /*   with argument */
#define META_PRUNE            0x802e0000u  /* (*PRUNE) */
#define META_PRUNE_ARG        0x802f0000u  /*   with argument */
#define META_SKIP             0x80300000u  /* (*SKIP) */
#define META_SKIP_ARG         0x80310000u  /*   with argument */
#define META_THEN             0x80320000u  /* (*THEN) */
#define META_THEN_ARG         0x80330000u  /*   with argument */

/* The quantifier codes must all stay together, in groups of three adjacent
values. */

#define META_ASTERISK         0x80340000u  /* *  */
#define META_ASTERISK_PLUS    0x80350000u  /* *+ */
#define META_ASTERISK_QUERY   0x80360000u  /* *? */
#define META_PLUS             0x80370000u  /* +  */
#define META_PLUS_PLUS        0x80380000u  /* ++ */
#define META_PLUS_QUERY       0x80390000u  /* +? */
#define META_QUERY            0x803a0000u  /* ?  */
#define META_QUERY_PLUS       0x803b0000u  /* ?+ */
#define META_QUERY_QUERY      0x803c0000u  /* ?? */
#define META_MINMAX           0x803d0000u  /* {n,m}  repeat */
#define META_MINMAX_PLUS      0x803e0000u  /* {n,m}+ repeat */
#define META_MINMAX_QUERY     0x803f0000u  /* {n,m}? repeat */

#define META_FIRST_QUANTIFIER META_ASTERISK
#define META_LAST_QUANTIFIER  META_MINMAX_QUERY

/* A special "meta code" whose only use is to distinguish (*asr: from (*sr: in
the table of alphabetic assertions. It is never stored in the parsed pattern,
because (*asr: is turned into (*sr:(*atomic: at that stage. It therefore needs
no length entry, so a high value is used. */

#define META_ATOMIC_SCRIPT_RUN 0x8fff0000u
301
302 /* Table of extra lengths for each of the meta codes. Must be kept in step with
303 the definitions above. For some items these values are a basic length to which
304 a variable amount has to be added. */
305
306 static unsigned char meta_extra_lengths[] = {
307 0, /* META_END */
308 0, /* META_ALT */
309 0, /* META_ATOMIC */
310 0, /* META_BACKREF - more if group is >= 10 */
311 1+SIZEOFFSET, /* META_BACKREF_BYNAME */
312 1, /* META_BIGVALUE */
313 3, /* META_CALLOUT_NUMBER */
314 3+SIZEOFFSET, /* META_CALLOUT_STRING */
315 0, /* META_CAPTURE */
316 0, /* META_CIRCUMFLEX */
317 0, /* META_CLASS */
318 0, /* META_CLASS_EMPTY */
319 0, /* META_CLASS_EMPTY_NOT */
320 0, /* META_CLASS_END */
321 0, /* META_CLASS_NOT */
322 0, /* META_COND_ASSERT */
323 SIZEOFFSET, /* META_COND_DEFINE */
324 1+SIZEOFFSET, /* META_COND_NAME */
325 1+SIZEOFFSET, /* META_COND_NUMBER */
326 1+SIZEOFFSET, /* META_COND_RNAME */
327 1+SIZEOFFSET, /* META_COND_RNUMBER */
328 3, /* META_COND_VERSION */
329 0, /* META_DOLLAR */
330 0, /* META_DOT */
331 0, /* META_ESCAPE - more for ESC_P, ESC_p, ESC_g, ESC_k */
332 0, /* META_KET */
333 0, /* META_NOCAPTURE */
334 1, /* META_OPTIONS */
335 1, /* META_POSIX */
336 1, /* META_POSIX_NEG */
337 0, /* META_RANGE_ESCAPED */
338 0, /* META_RANGE_LITERAL */
339 SIZEOFFSET, /* META_RECURSE */
340 1+SIZEOFFSET, /* META_RECURSE_BYNAME */
341 0, /* META_SCRIPT_RUN */
342 0, /* META_LOOKAHEAD */
343 0, /* META_LOOKAHEADNOT */
344 SIZEOFFSET, /* META_LOOKBEHIND */
345 SIZEOFFSET, /* META_LOOKBEHINDNOT */
346 0, /* META_LOOKAHEAD_NA */
347 SIZEOFFSET, /* META_LOOKBEHIND_NA */
348 1, /* META_MARK - plus the string length */
349 0, /* META_ACCEPT */
350 0, /* META_FAIL */
351 0, /* META_COMMIT */
352 1, /* META_COMMIT_ARG - plus the string length */
353 0, /* META_PRUNE */
354 1, /* META_PRUNE_ARG - plus the string length */
355 0, /* META_SKIP */
356 1, /* META_SKIP_ARG - plus the string length */
357 0, /* META_THEN */
358 1, /* META_THEN_ARG - plus the string length */
359 0, /* META_ASTERISK */
360 0, /* META_ASTERISK_PLUS */
361 0, /* META_ASTERISK_QUERY */
362 0, /* META_PLUS */
363 0, /* META_PLUS_PLUS */
364 0, /* META_PLUS_QUERY */
365 0, /* META_QUERY */
366 0, /* META_QUERY_PLUS */
367 0, /* META_QUERY_QUERY */
368 2, /* META_MINMAX */
369 2, /* META_MINMAX_PLUS */
370 2 /* META_MINMAX_QUERY */
371 };
372
373 /* Types for skipping parts of a parsed pattern. */
374
enum {
  PSKIP_ALT   = 0,
  PSKIP_CLASS = 1,
  PSKIP_KET   = 2
};
376
377 /* Macro for setting individual bits in class bitmaps. It took some
378 experimenting to figure out how to stop gcc 5.3.0 from warning with
379 -Wconversion. This version gets a warning:
380
381 #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1u << ((b)&7))
382
383 Let's hope the apparently less efficient version isn't actually so bad if the
384 compiler is clever with identical subexpressions. */
385
/* Set bit number b in the byte array a. The array argument is parenthesized so
that an expression such as "base + 1" can be passed. Note that b is evaluated
more than once, so it must not have side effects. */

#define SETBIT(a,b) (a)[(b)/8] = (uint8_t)((a)[(b)/8] | (1u << ((b)&7)))
387
388 /* Values and flags for the unsigned xxcuflags variables that accompany xxcu
389 variables, which are concerned with first and required code units. A value
390 greater than or equal to REQ_NONE means "no code unit set"; otherwise the
391 matching xxcu variable is set, and the low valued bits are relevant. */
392
#define REQ_UNSET     0xffffffffu  /* Not yet found anything */
#define REQ_NONE      0xfffffffeu  /* Found not fixed character */
#define REQ_CASELESS  0x00000001u  /* Code unit in xxcu is caseless */
#define REQ_VARY      0x00000002u  /* Code unit is followed by non-literal */

/* Flag bits and mask for entries in the groupinfo vector. */

#define GI_SET_FIXED_LENGTH   0x80000000u
#define GI_NOT_FIXED_LENGTH   0x40000000u
#define GI_FIXED_LENGTH_MASK  0x0000ffffu

/* A simple decimal digit test that works for ASCII/Unicode as well as EBCDIC.
It is fast, because a good compiler can reduce it to a subtraction and an
unsigned comparison. The argument is evaluated twice. */

#define IS_DIGIT(x) (!((x) < CHAR_0 || (x) > CHAR_9))
409
/* Table to identify hex digits. The tables in chartables are dependent on the
locale, and may mark arbitrary characters as digits. We want to recognize only
0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
costs 256 bytes, but it is a lot faster than doing character value tests (at
least in some simple cases I timed), and in some applications one wants PCRE2
to compile efficiently as well as match efficiently. The value in the table is
the binary hex digit value, or 0xff for non-hex digits. */
417
418 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
419 UTF-8 mode. */
420
421 #ifndef EBCDIC
/* 256 entries, laid out sixteen to a row; the comment on each row gives the
range of character codes that it covers. */

static const uint8_t xdigitab[] =
  {
  /* 0x00-0x0f */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x10-0x1f */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x20-0x2f: space to slash */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x30-0x3f: digits 0 to 9, then colon to question mark */
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x40-0x4f: at sign, A to F, then G to O */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x50-0x5f: P to underscore */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x60-0x6f: grave accent, a to f, then g to o */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x70-0x7f: p to 127 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x80-0x8f */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x90-0x9f */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xa0-0xaf */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xb0-0xbf */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xc0-0xcf */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xd0-0xdf */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xe0-0xef */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xf0-0xff */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
  };
456
457 #else
458
459 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
460
/* 256 entries, laid out sixteen to a row; the comment on each row gives the
range of EBCDIC character codes that it covers. */

static const uint8_t xdigitab[] =
  {
  /* 0x00-0x0f */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x10-0x1f */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x20-0x2f */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x30-0x3f */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x40-0x4f */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x50-0x5f */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x60-0x6f */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x70-0x7f */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x80-0x8f: a to f at 0x81-0x86 */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x90-0x9f */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xa0-0xaf */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xb0-0xbf */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xc0-0xcf: A to F at 0xc1-0xc6 */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xd0-0xdf */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xe0-0xef */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xf0-0xff: digits 0 to 9 at 0xf0-0xf9 */
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff
  };
495 #endif /* EBCDIC */
496
497
498 /* Table for handling alphanumeric escaped characters. Positive returns are
499 simple data values; negative values are for special things like \d and so on.
500 Zero means further processing is needed (for things like \x), or the escape is
501 invalid. */
502
503 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
504 in UTF-8 mode. It runs from '0' to 'z'. */
505
506 #ifndef EBCDIC
507 #define ESCAPES_FIRST CHAR_0
508 #define ESCAPES_LAST CHAR_z
509 #define UPPER_CASE(c) (c-32)
510
511 static const short int escapes[] = {
512 0, 0,
513 0, 0,
514 0, 0,
515 0, 0,
516 0, 0,
517 CHAR_COLON, CHAR_SEMICOLON,
518 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
519 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
520 CHAR_COMMERCIAL_AT, -ESC_A,
521 -ESC_B, -ESC_C,
522 -ESC_D, -ESC_E,
523 0, -ESC_G,
524 -ESC_H, 0,
525 0, -ESC_K,
526 0, 0,
527 -ESC_N, 0,
528 -ESC_P, -ESC_Q,
529 -ESC_R, -ESC_S,
530 0, 0,
531 -ESC_V, -ESC_W,
532 -ESC_X, 0,
533 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
534 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
535 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
536 CHAR_GRAVE_ACCENT, CHAR_BEL,
537 -ESC_b, 0,
538 -ESC_d, CHAR_ESC,
539 CHAR_FF, 0,
540 -ESC_h, 0,
541 0, -ESC_k,
542 0, 0,
543 CHAR_LF, 0,
544 -ESC_p, 0,
545 CHAR_CR, -ESC_s,
546 CHAR_HT, 0,
547 -ESC_v, -ESC_w,
548 0, 0,
549 -ESC_z
550 };
551
552 #else
553
554 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
555 It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code
556 is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a
557 because it is defined as 'a', which of course picks up the ASCII value. */
558
559 #if 'a' == 0x81 /* Check for a real EBCDIC environment */
560 #define ESCAPES_FIRST CHAR_a
561 #define ESCAPES_LAST CHAR_9
562 #define UPPER_CASE(c) (c+64)
563 #else /* Testing in an ASCII environment */
564 #define ESCAPES_FIRST ((unsigned char)'\x81') /* EBCDIC 'a' */
565 #define ESCAPES_LAST ((unsigned char)'\xf9') /* EBCDIC '9' */
566 #define UPPER_CASE(c) (c-32)
567 #endif
568
569 static const short int escapes[] = {
570 /* 80 */ CHAR_BEL, -ESC_b, 0, -ESC_d, CHAR_ESC, CHAR_FF, 0,
571 /* 88 */ -ESC_h, 0, 0, '{', 0, 0, 0, 0,
572 /* 90 */ 0, 0, -ESC_k, 0, 0, CHAR_LF, 0, -ESC_p,
573 /* 98 */ 0, CHAR_CR, 0, '}', 0, 0, 0, 0,
574 /* A0 */ 0, '~', -ESC_s, CHAR_HT, 0, -ESC_v, -ESC_w, 0,
575 /* A8 */ 0, -ESC_z, 0, 0, 0, '[', 0, 0,
576 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
577 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
578 /* C0 */ '{', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G,
579 /* C8 */ -ESC_H, 0, 0, 0, 0, 0, 0, 0,
580 /* D0 */ '}', 0, -ESC_K, 0, 0, -ESC_N, 0, -ESC_P,
581 /* D8 */ -ESC_Q, -ESC_R, 0, 0, 0, 0, 0, 0,
582 /* E0 */ '\\', 0, -ESC_S, 0, 0, -ESC_V, -ESC_W, -ESC_X,
583 /* E8 */ 0, -ESC_Z, 0, 0, 0, 0, 0, 0,
584 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
585 /* F8 */ 0, 0
586 };
587
588 /* We also need a table of characters that may follow \c in an EBCDIC
589 environment for characters 0-31. */
590
591 static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
592
593 #endif /* EBCDIC */
594
595
596 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
597 searched linearly. Put all the names into a single string, in order to reduce
598 the number of relocations when a shared library is dynamically linked. The
599 string is built from string macros so that it works in UTF-8 mode on EBCDIC
600 platforms. */
601
602 typedef struct verbitem {
603 unsigned int len; /* Length of verb name */
604 uint32_t meta; /* Base META_ code */
605 int has_arg; /* Argument requirement */
606 } verbitem;
607
608 static const char verbnames[] =
609 "\0" /* Empty name is a shorthand for MARK */
610 STRING_MARK0
611 STRING_ACCEPT0
612 STRING_F0
613 STRING_FAIL0
614 STRING_COMMIT0
615 STRING_PRUNE0
616 STRING_SKIP0
617 STRING_THEN;
618
619 static const verbitem verbs[] = {
620 { 0, META_MARK, +1 }, /* > 0 => must have an argument */
621 { 4, META_MARK, +1 },
622 { 6, META_ACCEPT, -1 }, /* < 0 => Optional argument, convert to pre-MARK */
623 { 1, META_FAIL, -1 },
624 { 4, META_FAIL, -1 },
625 { 6, META_COMMIT, 0 },
626 { 5, META_PRUNE, 0 }, /* Optional argument; bump META code if found */
627 { 4, META_SKIP, 0 },
628 { 4, META_THEN, 0 }
629 };
630
631 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
632
633 /* Verb opcodes, indexed by their META code offset from META_MARK. */
634
635 static const uint32_t verbops[] = {
636 OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,
637 OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };
638
639 /* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. */
640
641 typedef struct alasitem {
642 unsigned int len; /* Length of name */
643 uint32_t meta; /* Base META_ code */
644 } alasitem;
645
646 static const char alasnames[] =
647 STRING_pla0
648 STRING_plb0
649 STRING_napla0
650 STRING_naplb0
651 STRING_nla0
652 STRING_nlb0
653 STRING_positive_lookahead0
654 STRING_positive_lookbehind0
655 STRING_non_atomic_positive_lookahead0
656 STRING_non_atomic_positive_lookbehind0
657 STRING_negative_lookahead0
658 STRING_negative_lookbehind0
659 STRING_atomic0
660 STRING_sr0
661 STRING_asr0
662 STRING_script_run0
663 STRING_atomic_script_run;
664
665 static const alasitem alasmeta[] = {
666 { 3, META_LOOKAHEAD },
667 { 3, META_LOOKBEHIND },
668 { 5, META_LOOKAHEAD_NA },
669 { 5, META_LOOKBEHIND_NA },
670 { 3, META_LOOKAHEADNOT },
671 { 3, META_LOOKBEHINDNOT },
672 { 18, META_LOOKAHEAD },
673 { 19, META_LOOKBEHIND },
674 { 29, META_LOOKAHEAD_NA },
675 { 30, META_LOOKBEHIND_NA },
676 { 18, META_LOOKAHEADNOT },
677 { 19, META_LOOKBEHINDNOT },
678 { 6, META_ATOMIC },
679 { 2, META_SCRIPT_RUN }, /* sr = script run */
680 { 3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */
681 { 10, META_SCRIPT_RUN }, /* script run */
682 { 17, META_ATOMIC_SCRIPT_RUN } /* atomic script run */
683 };
684
685 static const int alascount = sizeof(alasmeta)/sizeof(alasitem);
686
687 /* Offsets from OP_STAR for case-independent and negative repeat opcodes. */
688
689 static uint32_t chartypeoffset[] = {
690 OP_STAR - OP_STAR, OP_STARI - OP_STAR,
691 OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };
692
693 /* Tables of names of POSIX character classes and their lengths. The names are
694 now all in a single string, to reduce the number of relocations when a shared
695 library is dynamically loaded. The list of lengths is terminated by a zero
696 length entry. The first three must be alpha, lower, upper, as this is assumed
697 for handling case independence. The indices for several classes are needed, so
698 identify them. */
699
700 static const char posix_names[] =
701 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
702 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
703 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
704 STRING_word0 STRING_xdigit;
705
706 static const uint8_t posix_name_lengths[] = {
707 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
708
709 #define PC_DIGIT 7
710 #define PC_GRAPH 8
711 #define PC_PRINT 9
712 #define PC_PUNCT 10
713 #define PC_XDIGIT 13
714
715 /* Table of class bit maps for each POSIX class. Each class is formed from a
716 base map, with an optional addition or removal of another map. Then, for some
717 classes, there is some additional tweaking: for [:blank:] the vertical space
718 characters are removed, and for [:alpha:] and [:alnum:] the underscore
719 character is removed. The triples in the table consist of the base map offset,
720 second map offset or -1 if no second map, and a non-negative value for map
721 addition or a negative value for map subtraction (if there are two maps). The
722 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
723 remove vertical space characters, 2 => remove underscore. */
724
725 static const int posix_class_maps[] = {
726 cbit_word, cbit_digit, -2, /* alpha */
727 cbit_lower, -1, 0, /* lower */
728 cbit_upper, -1, 0, /* upper */
729 cbit_word, -1, 2, /* alnum - word without underscore */
730 cbit_print, cbit_cntrl, 0, /* ascii */
731 cbit_space, -1, 1, /* blank - a GNU extension */
732 cbit_cntrl, -1, 0, /* cntrl */
733 cbit_digit, -1, 0, /* digit */
734 cbit_graph, -1, 0, /* graph */
735 cbit_print, -1, 0, /* print */
736 cbit_punct, -1, 0, /* punct */
737 cbit_space, -1, 0, /* space */
738 cbit_word, -1, 0, /* word - a Perl extension */
739 cbit_xdigit, -1, 0 /* xdigit */
740 };
741
742 #ifdef SUPPORT_UNICODE
743
744 /* The POSIX class Unicode property substitutes that are used in UCP mode must
745 be in the order of the POSIX class names, defined above. */
746
747 static int posix_substitutes[] = {
748 PT_GC, ucp_L, /* alpha */
749 PT_PC, ucp_Ll, /* lower */
750 PT_PC, ucp_Lu, /* upper */
751 PT_ALNUM, 0, /* alnum */
752 -1, 0, /* ascii, treat as non-UCP */
753 -1, 1, /* blank, treat as \h */
754 PT_PC, ucp_Cc, /* cntrl */
755 PT_PC, ucp_Nd, /* digit */
756 PT_PXGRAPH, 0, /* graph */
757 PT_PXPRINT, 0, /* print */
758 PT_PXPUNCT, 0, /* punct */
759 PT_PXSPACE, 0, /* space */ /* Xps is POSIX space, but from 8.34 */
760 PT_WORD, 0, /* word */ /* Perl and POSIX space are the same */
761 PT_PXXDIGIT, 0 /* xdigit */ /* Perl has additional hex digits */
762 };
763 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / (2*sizeof(uint32_t)))
764 #endif /* SUPPORT_UNICODE */
765
766 /* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset
767 are allowed. */
768
769 #define PUBLIC_LITERAL_COMPILE_OPTIONS \
770 (PCRE2_ANCHORED|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_ENDANCHORED| \
771 PCRE2_FIRSTLINE|PCRE2_LITERAL|PCRE2_MATCH_INVALID_UTF| \
772 PCRE2_NO_START_OPTIMIZE|PCRE2_NO_UTF_CHECK|PCRE2_USE_OFFSET_LIMIT|PCRE2_UTF)
773
/* The full mask of main compile option bits: everything in the literal-mode
subset, plus the bits that appear only in this mask. */

#define PUBLIC_COMPILE_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_OPTIONS| \
   PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \
   PCRE2_ALT_VERBNAMES|PCRE2_DOLLAR_ENDONLY|PCRE2_DOTALL|PCRE2_DUPNAMES| \
   PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MATCH_UNSET_BACKREF| \
   PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \
   PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \
   PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_UCP|PCRE2_UNGREEDY)
782
/* Mask of PCRE2_EXTRA_xxx bits. NOTE(review): by analogy with the main option
masks this is presumably the subset of extra options that is allowed when
PCRE2_LITERAL is set - confirm against the code that tests it. */

#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
   (PCRE2_EXTRA_MATCH_LINE|PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_CASELESS_RESTRICT)
785
/* The full mask of PCRE2_EXTRA_xxx bits: the literal-mode subset of the extra
options plus the bits that appear only in this mask. */

#define PUBLIC_COMPILE_EXTRA_OPTIONS \
   (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \
    PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \
    PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX| \
    PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK|PCRE2_EXTRA_ASCII_BSD| \
    PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW|PCRE2_EXTRA_ASCII_POSIX| \
    PCRE2_EXTRA_ASCII_DIGIT)
793
/* Compile time error code numbers. They are given names so that they can more
easily be tracked. When a new number is added, the tables called eint1 and
eint2 in pcre2posix.c may need to be updated, and a new error text must be
added to compile_error_texts in pcre2_error.c. Also, the error codes in
pcre2.h.in must be updated - their values are exactly 100 greater than these
values.

ERR0 is given the value COMPILE_ERROR_BASE and every following name is one
greater than the name before it, so ERRn is always COMPILE_ERROR_BASE + n. New
codes must therefore be appended at the end, never inserted. */

enum { ERR0 = COMPILE_ERROR_BASE,
       ERR1,  ERR2,  ERR3,  ERR4,  ERR5,  ERR6,  ERR7,  ERR8,  ERR9,  ERR10,
       ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
       ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30,
       ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
       ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
       ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
       ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
       ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
       ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90,
       ERR91, ERR92, ERR93, ERR94, ERR95, ERR96, ERR97, ERR98, ERR99, ERR100,
       ERR101 };
813
/* This is a table of start-of-pattern options such as (*UTF) and settings such
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
generic and always supported.

The enum below classifies each table entry; the class decides how the entry's
value field is to be interpreted (or, for the PSO_LIMx kinds, that an integer
has to be read from the pattern instead). */

enum { PSO_OPT,     /* Value is an option bit */
       PSO_FLG,     /* Value is a flag bit */
       PSO_NL,      /* Value is a newline type */
       PSO_BSR,     /* Value is a \R type */
       PSO_LIMH,    /* Read integer value for heap limit */
       PSO_LIMM,    /* Read integer value for match limit */
       PSO_LIMD     /* Read integer value for depth limit */
     };
827
/* One start-of-pattern item, as held in pso_list. */

typedef struct pso {
  const uint8_t *name;   /* Text of the item; in pso_list these are the
                            STRING_xxx_RIGHTPAR and STRING_xxx_EQ strings */
  uint16_t length;       /* Length of name (e.g. 4 for the "UTF)" entry) */
  uint16_t type;         /* One of the PSO_xxx values */
  uint32_t value;        /* Meaning depends on type; 0 for the PSO_LIMx kinds */
} pso;
834
/* The list of recognized start-of-pattern items. Each initializer supplies
name, length, type, value in that order.

NB: STRING_UTFn_RIGHTPAR contains the length as well, which is why the first
entry appears to have only three initializers. */

static const pso pso_list[] = {
  { (uint8_t *)STRING_UTFn_RIGHTPAR,                  PSO_OPT, PCRE2_UTF },
  { (uint8_t *)STRING_UTF_RIGHTPAR,                4, PSO_OPT, PCRE2_UTF },
  { (uint8_t *)STRING_UCP_RIGHTPAR,                4, PSO_OPT, PCRE2_UCP },
  { (uint8_t *)STRING_NOTEMPTY_RIGHTPAR,           9, PSO_FLG, PCRE2_NOTEMPTY_SET },
  { (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR,  17, PSO_FLG, PCRE2_NE_ATST_SET },
  { (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR,   16, PSO_OPT, PCRE2_NO_AUTO_POSSESS },
  { (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR },
  { (uint8_t *)STRING_NO_JIT_RIGHTPAR,             7, PSO_FLG, PCRE2_NOJIT },
  { (uint8_t *)STRING_NO_START_OPT_RIGHTPAR,      13, PSO_OPT, PCRE2_NO_START_OPTIMIZE },
  { (uint8_t *)STRING_LIMIT_HEAP_EQ,              11, PSO_LIMH, 0 },
  { (uint8_t *)STRING_LIMIT_MATCH_EQ,             12, PSO_LIMM, 0 },
  { (uint8_t *)STRING_LIMIT_DEPTH_EQ,             12, PSO_LIMD, 0 },
  { (uint8_t *)STRING_LIMIT_RECURSION_EQ,         16, PSO_LIMD, 0 },  /* Same kind as LIMIT_DEPTH */
  { (uint8_t *)STRING_CR_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_CR },
  { (uint8_t *)STRING_LF_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_LF },
  { (uint8_t *)STRING_CRLF_RIGHTPAR,               5, PSO_NL,  PCRE2_NEWLINE_CRLF },
  { (uint8_t *)STRING_ANY_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_ANY },
  { (uint8_t *)STRING_NUL_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_NUL },
  { (uint8_t *)STRING_ANYCRLF_RIGHTPAR,            8, PSO_NL,  PCRE2_NEWLINE_ANYCRLF },
  { (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_ANYCRLF },
  { (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_UNICODE }
};
860
/* This table is used when converting repeating opcodes into possessified
versions as a result of an explicit possessive quantifier such as ++. A zero
value means there is no possessified version - in those cases the item in
question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
because all relevant opcodes are less than that.

Each position corresponds to one opcode, in the order given by the comments on
the right, so the layout must be kept in step with the opcode numbering. Only
the greedy repeat opcodes have a non-zero entry; the minimizing ones, the
EXACT ones, and the opcodes that are already possessive are all zero. */

static const uint8_t opcode_possessify[] = {
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */

  0,                       /* NOTI */
  OP_POSSTAR, 0,           /* STAR, MINSTAR */
  OP_POSPLUS, 0,           /* PLUS, MINPLUS */
  OP_POSQUERY, 0,          /* QUERY, MINQUERY */
  OP_POSUPTO, 0,           /* UPTO, MINUPTO */
  0,                       /* EXACT */
  0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */

  OP_POSSTARI, 0,          /* STARI, MINSTARI */
  OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
  OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
  OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
  0,                       /* EXACTI */
  0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */

  OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
  OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
  OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
  OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
  0,                       /* NOTEXACT */
  0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */

  OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
  OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
  OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
  OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
  0,                       /* NOTEXACTI */
  0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */

  OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
  OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
  OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
  OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
  0,                       /* TYPEEXACT */
  0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */

  OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
  OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
  OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
  OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
  0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */

  0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
  0, 0,                    /* REF, REFI */
  0, 0,                    /* DNREF, DNREFI */
  0, 0                     /* RECURSE, CALLOUT */
};
918
919
920 #ifdef DEBUG_SHOW_PARSED
921 /*************************************************
922 * Show the parsed pattern for debugging *
923 *************************************************/
924
925 /* For debugging the pre-scan, this code, which outputs the parsed data vector,
926 can be enabled. */
927
show_parsed(compile_block * cb)928 static void show_parsed(compile_block *cb)
929 {
930 uint32_t *pptr = cb->parsed_pattern;
931
932 for (;;)
933 {
934 int max, min;
935 PCRE2_SIZE offset;
936 uint32_t i;
937 uint32_t length;
938 uint32_t meta_arg = META_DATA(*pptr);
939
940 fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr);
941
942 if (*pptr < META_END)
943 {
944 if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", *pptr);
945 pptr++;
946 }
947
948 else switch (META_CODE(*pptr++))
949 {
950 default:
951 fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n");
952 return;
953
954 case META_END:
955 fprintf(stderr, "META_END\n");
956 return;
957
958 case META_CAPTURE:
959 fprintf(stderr, "META_CAPTURE %d", meta_arg);
960 break;
961
962 case META_RECURSE:
963 GETOFFSET(offset, pptr);
964 fprintf(stderr, "META_RECURSE %d %zd", meta_arg, offset);
965 break;
966
967 case META_BACKREF:
968 if (meta_arg < 10)
969 offset = cb->small_ref_offset[meta_arg];
970 else
971 GETOFFSET(offset, pptr);
972 fprintf(stderr, "META_BACKREF %d %zd", meta_arg, offset);
973 break;
974
975 case META_ESCAPE:
976 if (meta_arg == ESC_P || meta_arg == ESC_p)
977 {
978 uint32_t ptype = *pptr >> 16;
979 uint32_t pvalue = *pptr++ & 0xffff;
980 fprintf(stderr, "META \\%c %d %d", (meta_arg == ESC_P)? 'P':'p',
981 ptype, pvalue);
982 }
983 else
984 {
985 uint32_t cc;
986 /* There's just one escape we might have here that isn't negated in the
987 escapes table. */
988 if (meta_arg == ESC_g) cc = CHAR_g;
989 else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++)
990 {
991 if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break;
992 }
993 if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;
994 fprintf(stderr, "META \\%c", cc);
995 }
996 break;
997
998 case META_MINMAX:
999 min = *pptr++;
1000 max = *pptr++;
1001 if (max != REPEAT_UNLIMITED)
1002 fprintf(stderr, "META {%d,%d}", min, max);
1003 else
1004 fprintf(stderr, "META {%d,}", min);
1005 break;
1006
1007 case META_MINMAX_QUERY:
1008 min = *pptr++;
1009 max = *pptr++;
1010 if (max != REPEAT_UNLIMITED)
1011 fprintf(stderr, "META {%d,%d}?", min, max);
1012 else
1013 fprintf(stderr, "META {%d,}?", min);
1014 break;
1015
1016 case META_MINMAX_PLUS:
1017 min = *pptr++;
1018 max = *pptr++;
1019 if (max != REPEAT_UNLIMITED)
1020 fprintf(stderr, "META {%d,%d}+", min, max);
1021 else
1022 fprintf(stderr, "META {%d,}+", min);
1023 break;
1024
1025 case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break;
1026 case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break;
1027 case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break;
1028 case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break;
1029 case META_DOT: fprintf(stderr, "META_DOT"); break;
1030 case META_ASTERISK: fprintf(stderr, "META *"); break;
1031 case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break;
1032 case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break;
1033 case META_PLUS: fprintf(stderr, "META +"); break;
1034 case META_PLUS_QUERY: fprintf(stderr, "META +?"); break;
1035 case META_PLUS_PLUS: fprintf(stderr, "META ++"); break;
1036 case META_QUERY: fprintf(stderr, "META ?"); break;
1037 case META_QUERY_QUERY: fprintf(stderr, "META ??"); break;
1038 case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break;
1039
1040 case META_ATOMIC: fprintf(stderr, "META (?>"); break;
1041 case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
1042 case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
1043 case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
1044 case META_LOOKAHEAD_NA: fprintf(stderr, "META (*napla:"); break;
1045 case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;
1046 case META_KET: fprintf(stderr, "META )"); break;
1047 case META_ALT: fprintf(stderr, "META | %d", meta_arg); break;
1048
1049 case META_CLASS: fprintf(stderr, "META ["); break;
1050 case META_CLASS_NOT: fprintf(stderr, "META [^"); break;
1051 case META_CLASS_END: fprintf(stderr, "META ]"); break;
1052 case META_CLASS_EMPTY: fprintf(stderr, "META []"); break;
1053 case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break;
1054
1055 case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;
1056 case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;
1057
1058 case META_POSIX: fprintf(stderr, "META_POSIX %d", *pptr++); break;
1059 case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %d", *pptr++); break;
1060
1061 case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;
1062 case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;
1063 case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break;
1064 case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break;
1065 case META_SKIP: fprintf(stderr, "META (*SKIP)"); break;
1066 case META_THEN: fprintf(stderr, "META (*THEN)"); break;
1067
1068 case META_OPTIONS:
1069 fprintf(stderr, "META_OPTIONS 0x%08x 0x%08x", pptr[0], pptr[1]);
1070 pptr += 2;
1071 break;
1072
1073 case META_LOOKBEHIND:
1074 fprintf(stderr, "META (?<= %d %d", meta_arg, *pptr);
1075 pptr += 2;
1076 break;
1077
1078 case META_LOOKBEHIND_NA:
1079 fprintf(stderr, "META (*naplb: %d %d", meta_arg, *pptr);
1080 pptr += 2;
1081 break;
1082
1083 case META_LOOKBEHINDNOT:
1084 fprintf(stderr, "META (?<! %d %d", meta_arg, *pptr);
1085 pptr += 2;
1086 break;
1087
1088 case META_CALLOUT_NUMBER:
1089 fprintf(stderr, "META (?C%d) next=%d/%d", pptr[2], pptr[0],
1090 pptr[1]);
1091 pptr += 3;
1092 break;
1093
1094 case META_CALLOUT_STRING:
1095 {
1096 uint32_t patoffset = *pptr++; /* Offset of next pattern item */
1097 uint32_t patlength = *pptr++; /* Length of next pattern item */
1098 fprintf(stderr, "META (?Cstring) length=%d offset=", *pptr++);
1099 GETOFFSET(offset, pptr);
1100 fprintf(stderr, "%zd next=%d/%d", offset, patoffset, patlength);
1101 }
1102 break;
1103
1104 case META_RECURSE_BYNAME:
1105 fprintf(stderr, "META (?(&name) length=%d offset=", *pptr++);
1106 GETOFFSET(offset, pptr);
1107 fprintf(stderr, "%zd", offset);
1108 break;
1109
1110 case META_BACKREF_BYNAME:
1111 fprintf(stderr, "META_BACKREF_BYNAME length=%d offset=", *pptr++);
1112 GETOFFSET(offset, pptr);
1113 fprintf(stderr, "%zd", offset);
1114 break;
1115
1116 case META_COND_NUMBER:
1117 fprintf(stderr, "META_COND_NUMBER %d offset=", pptr[SIZEOFFSET]);
1118 GETOFFSET(offset, pptr);
1119 fprintf(stderr, "%zd", offset);
1120 pptr++;
1121 break;
1122
1123 case META_COND_DEFINE:
1124 fprintf(stderr, "META (?(DEFINE) offset=");
1125 GETOFFSET(offset, pptr);
1126 fprintf(stderr, "%zd", offset);
1127 break;
1128
1129 case META_COND_VERSION:
1130 fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
1131 fprintf(stderr, "%d.", *pptr++);
1132 fprintf(stderr, "%d)", *pptr++);
1133 break;
1134
1135 case META_COND_NAME:
1136 fprintf(stderr, "META (?(<name>) length=%d offset=", *pptr++);
1137 GETOFFSET(offset, pptr);
1138 fprintf(stderr, "%zd", offset);
1139 break;
1140
1141 case META_COND_RNAME:
1142 fprintf(stderr, "META (?(R&name) length=%d offset=", *pptr++);
1143 GETOFFSET(offset, pptr);
1144 fprintf(stderr, "%zd", offset);
1145 break;
1146
1147 /* This is kept as a name, because it might be. */
1148
1149 case META_COND_RNUMBER:
1150 fprintf(stderr, "META (?(Rnumber) length=%d offset=", *pptr++);
1151 GETOFFSET(offset, pptr);
1152 fprintf(stderr, "%zd", offset);
1153 break;
1154
1155 case META_MARK:
1156 fprintf(stderr, "META (*MARK:");
1157 goto SHOWARG;
1158
1159 case META_COMMIT_ARG:
1160 fprintf(stderr, "META (*COMMIT:");
1161 goto SHOWARG;
1162
1163 case META_PRUNE_ARG:
1164 fprintf(stderr, "META (*PRUNE:");
1165 goto SHOWARG;
1166
1167 case META_SKIP_ARG:
1168 fprintf(stderr, "META (*SKIP:");
1169 goto SHOWARG;
1170
1171 case META_THEN_ARG:
1172 fprintf(stderr, "META (*THEN:");
1173 SHOWARG:
1174 length = *pptr++;
1175 for (i = 0; i < length; i++)
1176 {
1177 uint32_t cc = *pptr++;
1178 if (cc > 32 && cc < 128) fprintf(stderr, "%c", cc);
1179 else fprintf(stderr, "\\x{%x}", cc);
1180 }
1181 fprintf(stderr, ") length=%u", length);
1182 break;
1183 }
1184 fprintf(stderr, "\n");
1185 }
1186 return;
1187 }
1188 #endif /* DEBUG_SHOW_PARSED */
1189
1190
1191
1192 /*************************************************
1193 * Copy compiled code *
1194 *************************************************/
1195
1196 /* Compiled JIT code cannot be copied, so the new compiled block has no
1197 associated JIT data. */
1198
1199 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy(const pcre2_code * code)1200 pcre2_code_copy(const pcre2_code *code)
1201 {
1202 PCRE2_SIZE* ref_count;
1203 pcre2_code *newcode;
1204
1205 if (code == NULL) return NULL;
1206 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1207 if (newcode == NULL) return NULL;
1208 memcpy(newcode, code, code->blocksize);
1209 newcode->executable_jit = NULL;
1210
1211 /* If the code is one that has been deserialized, increment the reference count
1212 in the decoded tables. */
1213
1214 if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1215 {
1216 ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1217 (*ref_count)++;
1218 }
1219
1220 return newcode;
1221 }
1222
1223
1224
1225 /*************************************************
1226 * Copy compiled code and character tables *
1227 *************************************************/
1228
1229 /* Compiled JIT code cannot be copied, so the new compiled block has no
1230 associated JIT data. This version of code_copy also makes a separate copy of
1231 the character tables. */
1232
1233 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy_with_tables(const pcre2_code * code)1234 pcre2_code_copy_with_tables(const pcre2_code *code)
1235 {
1236 PCRE2_SIZE* ref_count;
1237 pcre2_code *newcode;
1238 uint8_t *newtables;
1239
1240 if (code == NULL) return NULL;
1241 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1242 if (newcode == NULL) return NULL;
1243 memcpy(newcode, code, code->blocksize);
1244 newcode->executable_jit = NULL;
1245
1246 newtables = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE),
1247 code->memctl.memory_data);
1248 if (newtables == NULL)
1249 {
1250 code->memctl.free((void *)newcode, code->memctl.memory_data);
1251 return NULL;
1252 }
1253 memcpy(newtables, code->tables, TABLES_LENGTH);
1254 ref_count = (PCRE2_SIZE *)(newtables + TABLES_LENGTH);
1255 *ref_count = 1;
1256
1257 newcode->tables = newtables;
1258 newcode->flags |= PCRE2_DEREF_TABLES;
1259 return newcode;
1260 }
1261
1262
1263
1264 /*************************************************
1265 * Free compiled code *
1266 *************************************************/
1267
1268 PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_code_free(pcre2_code * code)1269 pcre2_code_free(pcre2_code *code)
1270 {
1271 PCRE2_SIZE* ref_count;
1272
1273 if (code != NULL)
1274 {
1275 #ifdef SUPPORT_JIT
1276 if (code->executable_jit != NULL)
1277 PRIV(jit_free)(code->executable_jit, &code->memctl);
1278 #endif
1279
1280 if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1281 {
1282 /* Decoded tables belong to the codes after deserialization, and they must
1283 be freed when there are no more references to them. The *ref_count should
1284 always be > 0. */
1285
1286 ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1287 if (*ref_count > 0)
1288 {
1289 (*ref_count)--;
1290 if (*ref_count == 0)
1291 code->memctl.free((void *)code->tables, code->memctl.memory_data);
1292 }
1293 }
1294
1295 code->memctl.free(code, code->memctl.memory_data);
1296 }
1297 }
1298
1299
1300
1301 /*************************************************
1302 * Read a number, possibly signed *
1303 *************************************************/
1304
1305 /* This function is used to read numbers in the pattern. The initial pointer
1306 must be at the sign or first digit of the number. When relative values
1307 (introduced by + or -) are allowed, they are relative group numbers, and the
1308 result must be greater than zero.
1309
1310 Arguments:
1311 ptrptr points to the character pointer variable
1312 ptrend points to the end of the input string
1313 allow_sign if < 0, sign not allowed; if >= 0, sign is relative to this
1314 max_value the largest number allowed
1315 max_error the error to give for an over-large number
1316 intptr where to put the result
1317 errcodeptr where to put an error code
1318
1319 Returns: TRUE - a number was read
1320 FALSE - errorcode == 0 => no number was found
1321 errorcode != 0 => an error occurred
1322 */
1323
1324 static BOOL
read_number(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,int32_t allow_sign,uint32_t max_value,uint32_t max_error,int * intptr,int * errorcodeptr)1325 read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,
1326 uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr)
1327 {
1328 int sign = 0;
1329 uint32_t n = 0;
1330 PCRE2_SPTR ptr = *ptrptr;
1331 BOOL yield = FALSE;
1332
1333 *errorcodeptr = 0;
1334
1335 if (allow_sign >= 0 && ptr < ptrend)
1336 {
1337 if (*ptr == CHAR_PLUS)
1338 {
1339 sign = +1;
1340 max_value -= allow_sign;
1341 ptr++;
1342 }
1343 else if (*ptr == CHAR_MINUS)
1344 {
1345 sign = -1;
1346 ptr++;
1347 }
1348 }
1349
1350 if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE;
1351 while (ptr < ptrend && IS_DIGIT(*ptr))
1352 {
1353 n = n * 10 + *ptr++ - CHAR_0;
1354 if (n > max_value)
1355 {
1356 *errorcodeptr = max_error;
1357 goto EXIT;
1358 }
1359 }
1360
1361 if (allow_sign >= 0 && sign != 0)
1362 {
1363 if (n == 0)
1364 {
1365 *errorcodeptr = ERR26; /* +0 and -0 are not allowed */
1366 goto EXIT;
1367 }
1368
1369 if (sign > 0) n += allow_sign;
1370 else if ((int)n > allow_sign)
1371 {
1372 *errorcodeptr = ERR15; /* Non-existent subpattern */
1373 goto EXIT;
1374 }
1375 else n = allow_sign + 1 - n;
1376 }
1377
1378 yield = TRUE;
1379
1380 EXIT:
1381 *intptr = n;
1382 *ptrptr = ptr;
1383 return yield;
1384 }
1385
1386
1387
1388 /*************************************************
1389 * Read repeat counts *
1390 *************************************************/
1391
1392 /* Read an item of the form {n,m} and return the values when non-NULL pointers
1393 are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a
1394 larger value is used for "unlimited". We have to use signed arguments for
1395 read_number() because it is capable of returning a signed value. As of Perl
1396 5.34.0 either n or m may be absent, but not both. Perl also allows spaces and
1397 tabs after { and before } and between the numbers and the comma, so we do too.
1398
1399 Arguments:
1400 ptrptr points to pointer to character after '{'
1401 ptrend pointer to end of input
1402 minp if not NULL, pointer to int for min
1403 maxp if not NULL, pointer to int for max
1404 errorcodeptr points to error code variable
1405
1406 Returns: FALSE if not a repeat quantifier, errorcode set zero
1407 FALSE on error, with errorcode set non-zero
1408 TRUE on success, with pointer updated to point after '}'
1409 */
1410
1411 static BOOL
read_repeat_counts(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,uint32_t * minp,uint32_t * maxp,int * errorcodeptr)1412 read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp,
1413 uint32_t *maxp, int *errorcodeptr)
1414 {
1415 PCRE2_SPTR p = *ptrptr;
1416 PCRE2_SPTR pp;
1417 BOOL yield = FALSE;
1418 BOOL had_minimum = FALSE;
1419 int32_t min = 0;
1420 int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */
1421
1422 *errorcodeptr = 0;
1423 while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1424
1425 /* Check the syntax before interpreting. Otherwise, a non-quantifier sequence
1426 such as "X{123456ABC" would incorrectly give a "number too big in quantifier"
1427 error. */
1428
1429 pp = p;
1430 if (pp < ptrend && IS_DIGIT(*pp))
1431 {
1432 had_minimum = TRUE;
1433 while (++pp < ptrend && IS_DIGIT(*pp)) {}
1434 }
1435
1436 while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1437 if (pp >= ptrend) return FALSE;
1438
1439 if (*pp == CHAR_RIGHT_CURLY_BRACKET)
1440 {
1441 if (!had_minimum) return FALSE;
1442 }
1443 else
1444 {
1445 if (*pp++ != CHAR_COMMA) return FALSE;
1446 while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1447 if (pp >= ptrend) return FALSE;
1448 if (IS_DIGIT(*pp))
1449 {
1450 while (++pp < ptrend && IS_DIGIT(*pp)) {}
1451 }
1452 else if (!had_minimum) return FALSE;
1453 while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1454 if (pp >= ptrend || *pp != CHAR_RIGHT_CURLY_BRACKET) return FALSE;
1455 }
1456
1457 /* Now process the quantifier for real. We know it must be {n} or (n,} or {,m}
1458 or {n,m}. The only error that read_number() can return is for a number that is
1459 too big. If *errorcodeptr is returned as zero it means no number was found. */
1460
1461 /* Deal with {,m} or n too big. If we successfully read m there is no need to
1462 check m >= n because n defaults to zero. */
1463
1464 if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))
1465 {
1466 if (*errorcodeptr != 0) goto EXIT; /* n too big */
1467 p++; /* Skip comma and subsequent spaces */
1468 while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1469 if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1470 {
1471 if (*errorcodeptr != 0) goto EXIT; /* m too big */
1472 }
1473 }
1474
1475 /* Have read one number. Deal with {n} or {n,} or {n,m} */
1476
1477 else
1478 {
1479 while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1480 if (*p == CHAR_RIGHT_CURLY_BRACKET)
1481 {
1482 max = min;
1483 }
1484 else /* Handle {n,} or {n,m} */
1485 {
1486 p++; /* Skip comma and subsequent spaces */
1487 while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1488 if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1489 {
1490 if (*errorcodeptr != 0) goto EXIT; /* m too big */
1491 }
1492
1493 if (max < min)
1494 {
1495 *errorcodeptr = ERR4;
1496 goto EXIT;
1497 }
1498 }
1499 }
1500
1501 /* Valid quantifier exists */
1502
1503 while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1504 p++;
1505 yield = TRUE;
1506 if (minp != NULL) *minp = (uint32_t)min;
1507 if (maxp != NULL) *maxp = (uint32_t)max;
1508
1509 /* Update the pattern pointer */
1510
1511 EXIT:
1512 *ptrptr = p;
1513 return yield;
1514 }
1515
1516
1517
1518 /*************************************************
1519 * Handle escapes *
1520 *************************************************/
1521
1522 /* This function is called when a \ has been encountered. It either returns a
1523 positive value for a simple escape such as \d, or 0 for a data character, which
1524 is placed in chptr. A backreference to group n is returned as negative n. On
1525 entry, ptr is pointing at the character after \. On exit, it points after the
1526 final code unit of the escape sequence.
1527
1528 This function is also called from pcre2_substitute() to handle escape sequences
1529 in replacement strings. In this case, the cb argument is NULL, and in the case
1530 of escapes that have further processing, only sequences that define a data
1531 character are recognised. The isclass argument is not relevant; the options
1532 argument is the final value of the compiled pattern's options.
1533
1534 Arguments:
1535 ptrptr points to the input position pointer
1536 ptrend points to the end of the input
1537 chptr points to a returned data character
1538 errorcodeptr points to the errorcode variable (containing zero)
1539 options the current options bits
1540 xoptions the current extra options bits
1541 isclass TRUE if inside a character class
1542 cb compile data block or NULL when called from pcre2_substitute()
1543
1544 Returns: zero => a data character
1545 positive => a special escape sequence
1546 negative => a numerical back reference
1547 on error, errorcodeptr is set non-zero
1548 */
1549
1550 int
PRIV(check_escape)1551 PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
1552 int *errorcodeptr, uint32_t options, uint32_t xoptions, BOOL isclass,
1553 compile_block *cb)
1554 {
1555 BOOL utf = (options & PCRE2_UTF) != 0;
1556 BOOL alt_bsux =
1557 ((options & PCRE2_ALT_BSUX) | (xoptions & PCRE2_EXTRA_ALT_BSUX)) != 0;
1558 PCRE2_SPTR ptr = *ptrptr;
1559 uint32_t c, cc;
1560 int escape = 0;
1561 int i;
1562
1563 /* If backslash is at the end of the string, it's an error. */
1564
1565 if (ptr >= ptrend)
1566 {
1567 *errorcodeptr = ERR1;
1568 return 0;
1569 }
1570
1571 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
1572 *errorcodeptr = 0; /* Be optimistic */
1573
1574 /* Non-alphanumerics are literals, so we just leave the value in c. An initial
1575 value test saves a memory lookup for code points outside the alphanumeric
1576 range. */
1577
1578 if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {} /* Definitely literal */
1579
1580 /* Otherwise, do a table lookup. Non-zero values need little processing here. A
1581 positive value is a literal value for something like \n. A negative value is
1582 the negation of one of the ESC_ macros that is passed back for handling by the
1583 calling function. Some extra checking is needed for \N because only \N{U+dddd}
1584 is supported. If the value is zero, further processing is handled below. */
1585
1586 else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
1587 {
1588 if (i > 0)
1589 {
1590 c = (uint32_t)i;
1591 if (c == CHAR_CR && (xoptions & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
1592 c = CHAR_LF;
1593 }
1594 else /* Negative table entry */
1595 {
1596 escape = -i; /* Else return a special escape */
1597 if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))
1598 cb->external_flags |= PCRE2_HASBKPORX; /* Note \P, \p, or \X */
1599
1600 /* Perl supports \N{name} for character names and \N{U+dddd} for numerical
1601 Unicode code points, as well as plain \N for "not newline". PCRE does not
1602 support \N{name}. However, it does support quantification such as \N{2,3},
1603 so if \N{ is not followed by U+dddd we check for a quantifier. */
1604
1605 if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1606 {
1607 PCRE2_SPTR p = ptr + 1;
1608
1609 /* Perl ignores spaces and tabs after { */
1610
1611 while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1612
1613 /* \N{U+ can be handled by the \x{ code. However, this construction is
1614 not valid in EBCDIC environments because it specifies a Unicode
1615 character, not a codepoint in the local code. For example \N{U+0041}
1616 must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode
1617 casing semantics for the entire pattern, so allow it only in UTF (i.e.
1618 Unicode) mode. */
1619
1620 if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)
1621 {
1622 #ifdef EBCDIC
1623 *errorcodeptr = ERR93;
1624 #else
1625 if (utf)
1626 {
1627 ptr = p + 2;
1628 escape = 0; /* Not a fancy escape after all */
1629 goto COME_FROM_NU;
1630 }
1631 else *errorcodeptr = ERR93;
1632 #endif
1633 }
1634
1635 /* Give an error if what follows is not a quantifier, but don't override
1636 an error set by the quantifier reader (e.g. number overflow). */
1637
1638 else
1639 {
1640 if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&
1641 *errorcodeptr == 0)
1642 *errorcodeptr = ERR37;
1643 }
1644 }
1645 }
1646 }
1647
1648 /* Escapes that need further processing, including those that are unknown, have
1649 a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
1650 \o, and \x are recognized (\u and \U can never appear as they are used for case
1651 forcing). */
1652
1653 else
1654 {
1655 int s;
1656 PCRE2_SPTR oldptr;
1657 BOOL overflow;
1658
1659 /* Filter calls from pcre2_substitute(). */
1660
1661 if (cb == NULL)
1662 {
1663 if (c != CHAR_c && c != CHAR_o && c != CHAR_x)
1664 {
1665 *errorcodeptr = ERR3;
1666 return 0;
1667 }
1668 alt_bsux = FALSE; /* Do not modify \x handling */
1669 }
1670
1671 switch (c)
1672 {
1673 /* A number of Perl escapes are not handled by PCRE. We give an explicit
1674 error. */
1675
1676 case CHAR_F:
1677 case CHAR_l:
1678 case CHAR_L:
1679 *errorcodeptr = ERR37;
1680 break;
1681
1682 /* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX
1683 is set. Otherwise, \u must be followed by exactly four hex digits or, if
1684 PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.
1685 Otherwise it is a lowercase u letter. This gives some compatibility with
1686 ECMAScript (aka JavaScript). Unlike other braced items, white space is NOT
1687 allowed. When \u{ is not followed by hex digits, a special return is given
1688 because otherwise \u{ 12} (for example) would be treated as u{12}. */
1689
1690 case CHAR_u:
1691 if (!alt_bsux) *errorcodeptr = ERR37; else
1692 {
1693 uint32_t xc;
1694
1695 if (ptr >= ptrend) break;
1696 if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
1697 (xoptions & PCRE2_EXTRA_ALT_BSUX) != 0)
1698 {
1699 PCRE2_SPTR hptr = ptr + 1;
1700
1701 cc = 0;
1702 while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
1703 {
1704 if ((cc & 0xf0000000) != 0) /* Test for 32-bit overflow */
1705 {
1706 *errorcodeptr = ERR77;
1707 ptr = hptr; /* Show where */
1708 break; /* *hptr != } will cause another break below */
1709 }
1710 cc = (cc << 4) | xc;
1711 hptr++;
1712 }
1713
1714 if (hptr == ptr + 1 || /* No hex digits */
1715 hptr >= ptrend || /* Hit end of input */
1716 *hptr != CHAR_RIGHT_CURLY_BRACKET) /* No } terminator */
1717 {
1718 escape = ESC_ub; /* Special return */
1719 ptr++; /* Skip { */
1720 break; /* Hex escape not recognized */
1721 }
1722
1723 c = cc; /* Accept the code point */
1724 ptr = hptr + 1;
1725 }
1726
1727 else /* Must be exactly 4 hex digits */
1728 {
1729 if (ptrend - ptr < 4) break; /* Less than 4 chars */
1730 if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */
1731 if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
1732 cc = (cc << 4) | xc;
1733 if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */
1734 cc = (cc << 4) | xc;
1735 if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */
1736 c = (cc << 4) | xc;
1737 ptr += 4;
1738 }
1739
1740 if (utf)
1741 {
1742 if (c > 0x10ffffU) *errorcodeptr = ERR77;
1743 else
1744 if (c >= 0xd800 && c <= 0xdfff &&
1745 (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1746 *errorcodeptr = ERR73;
1747 }
1748 else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
1749 }
1750 break;
1751
1752 /* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set,
1753 in which case it is an upper case letter. */
1754
1755 case CHAR_U:
1756 if (!alt_bsux) *errorcodeptr = ERR37;
1757 break;
1758
1759 /* In a character class, \g is just a literal "g". Outside a character
1760 class, \g must be followed by one of a number of specific things:
1761
1762 (1) A number, either plain or braced. If positive, it is an absolute
1763 backreference. If negative, it is a relative backreference. This is a Perl
1764 5.10 feature.
1765
1766 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1767 is part of Perl's movement towards a unified syntax for back references. As
1768 this is synonymous with \k{name}, we fudge it up by pretending it really
1769 was \k{name}.
1770
1771 (3) For Oniguruma compatibility we also support \g followed by a name or a
1772 number either in angle brackets or in single quotes. However, these are
1773 (possibly recursive) subroutine calls, _not_ backreferences. We return
1774 the ESC_g code.
1775
1776 Summary: Return a negative number for a numerical back reference, ESC_k for
1777 a named back reference, and ESC_g for a named or numbered subroutine call.
1778 */
1779
1780 case CHAR_g:
1781 if (isclass) break;
1782
1783 if (ptr >= ptrend)
1784 {
1785 *errorcodeptr = ERR57;
1786 break;
1787 }
1788
1789 if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE)
1790 {
1791 escape = ESC_g;
1792 break;
1793 }
1794
1795 /* If there is a brace delimiter, try to read a numerical reference. If
1796 there isn't one, assume we have a name and treat it as \k. */
1797
1798 if (*ptr == CHAR_LEFT_CURLY_BRACKET)
1799 {
1800 PCRE2_SPTR p = ptr + 1;
1801
1802 while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1803 if (!read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1804 errorcodeptr))
1805 {
1806 if (*errorcodeptr == 0) escape = ESC_k; /* No number found */
1807 break;
1808 }
1809 while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1810
1811 if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1812 {
1813 *errorcodeptr = ERR57;
1814 break;
1815 }
1816 ptr = p + 1;
1817 }
1818
1819 /* Read an undelimited number */
1820
1821 else
1822 {
1823 if (!read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1824 errorcodeptr))
1825 {
1826 if (*errorcodeptr == 0) *errorcodeptr = ERR57; /* No number found */
1827 break;
1828 }
1829 }
1830
1831 if (s <= 0)
1832 {
1833 *errorcodeptr = ERR15;
1834 break;
1835 }
1836
1837 escape = -s;
1838 break;
1839
1840 /* The handling of escape sequences consisting of a string of digits
1841 starting with one that is not zero is not straightforward. Perl has changed
1842 over the years. Nowadays \g{} for backreferences and \o{} for octal are
1843 recommended to avoid the ambiguities in the old syntax.
1844
1845 Outside a character class, the digits are read as a decimal number. If the
1846 number is less than 10, or if there are that many previous extracting left
1847 brackets, it is a back reference. Otherwise, up to three octal digits are
1848 read to form an escaped character code. Thus \123 is likely to be octal 123
1849 (cf \0123, which is octal 012 followed by the literal 3).
1850
1851 Inside a character class, \ followed by a digit is always either a literal
1852 8 or 9 or an octal number. */
1853
1854 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1855 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1856
1857 if (!isclass)
1858 {
1859 oldptr = ptr;
1860 ptr--; /* Back to the digit */
1861
1862 /* As we know we are at a digit, the only possible error from
1863 read_number() is a number that is too large to be a group number. In this
case we fall through and handle this as not a group reference. If we have
1865 read a small enough number, check for a back reference.
1866
1867 \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
1868 are octal escapes if there are not that many previous captures. */
1869
1870 if (read_number(&ptr, ptrend, -1, INT_MAX/10 - 1, 0, &s, errorcodeptr) &&
1871 (s < 10 || oldptr[-1] >= CHAR_8 || s <= (int)cb->bracount))
1872 {
1873 if (s > (int)MAX_GROUP_NUMBER) *errorcodeptr = ERR61;
1874 else escape = -s; /* Indicates a back reference */
1875 break;
1876 }
1877
1878 ptr = oldptr; /* Put the pointer back and fall through */
1879 }
1880
1881 /* Handle a digit following \ when the number is not a back reference, or
1882 we are within a character class. If the first digit is 8 or 9, Perl used to
1883 generate a binary zero and then treat the digit as a following literal. At
1884 least by Perl 5.18 this changed so as not to insert the binary zero. */
1885
1886 if (c >= CHAR_8) break;
1887
1888 /* Fall through */
1889
1890 /* \0 always starts an octal number, but we may drop through to here with a
1891 larger first octal digit. The original code used just to take the least
1892 significant 8 bits of octal numbers (I think this is what early Perls used
1893 to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1894 but no more than 3 octal digits. */
1895
1896 case CHAR_0:
1897 c -= CHAR_0;
1898 while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1899 c = c * 8 + *ptr++ - CHAR_0;
1900 #if PCRE2_CODE_UNIT_WIDTH == 8
1901 if (!utf && c > 0xff) *errorcodeptr = ERR51;
1902 #endif
1903 break;
1904
1905 /* \o is a relatively new Perl feature, supporting a more general way of
1906 specifying character codes in octal. The only supported form is \o{ddd},
1907 with optional spaces or tabs after { and before }. */
1908
1909 case CHAR_o:
1910 if (ptr >= ptrend || *ptr++ != CHAR_LEFT_CURLY_BRACKET)
1911 {
1912 ptr--;
1913 *errorcodeptr = ERR55;
1914 break;
1915 }
1916
1917 while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1918 if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1919 {
1920 *errorcodeptr = ERR78;
1921 break;
1922 }
1923
1924 c = 0;
1925 overflow = FALSE;
1926 while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1927 {
1928 cc = *ptr++;
1929 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1930 #if PCRE2_CODE_UNIT_WIDTH == 32
1931 if (c >= 0x20000000l) { overflow = TRUE; break; }
1932 #endif
1933 c = (c << 3) + (cc - CHAR_0);
1934 #if PCRE2_CODE_UNIT_WIDTH == 8
1935 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1936 #elif PCRE2_CODE_UNIT_WIDTH == 16
1937 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1938 #elif PCRE2_CODE_UNIT_WIDTH == 32
1939 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1940 #endif
1941 }
1942
1943 while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1944
1945 if (overflow)
1946 {
1947 while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1948 *errorcodeptr = ERR34;
1949 }
1950 else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1951 {
1952 if (utf && c >= 0xd800 && c <= 0xdfff &&
1953 (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1954 {
1955 ptr--;
1956 *errorcodeptr = ERR73;
1957 }
1958 }
1959 else
1960 {
1961 ptr--;
1962 *errorcodeptr = ERR64;
1963 }
1964 break;
1965
1966 /* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed
1967 by two hexadecimal digits. Otherwise it is a lowercase x letter. */
1968
1969 case CHAR_x:
1970 if (alt_bsux)
1971 {
1972 uint32_t xc;
1973 if (ptrend - ptr < 2) break; /* Less than 2 characters */
1974 if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */
1975 if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
1976 c = (cc << 4) | xc;
1977 ptr += 2;
1978 }
1979
1980 /* Handle \x in Perl's style. \x{ddd} is a character code which can be
1981 greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
1982 digits. If not, { used to be treated as a data character. However, Perl
1983 seems to read hex digits up to the first non-such, and ignore the rest, so
1984 that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1985 now gives an error. */
1986
1987 else
1988 {
1989 if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1990 {
1991 ptr++;
1992 while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1993
1994 #ifndef EBCDIC
1995 COME_FROM_NU:
1996 #endif
1997 if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1998 {
1999 *errorcodeptr = ERR78;
2000 break;
2001 }
2002 c = 0;
2003 overflow = FALSE;
2004
2005 while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff)
2006 {
2007 ptr++;
2008 if (c == 0 && cc == 0) continue; /* Leading zeroes */
2009 #if PCRE2_CODE_UNIT_WIDTH == 32
2010 if (c >= 0x10000000l) { overflow = TRUE; break; }
2011 #endif
2012 c = (c << 4) | cc;
2013 if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
2014 {
2015 overflow = TRUE;
2016 break;
2017 }
2018 }
2019
2020 /* Perl ignores spaces and tabs before } */
2021
2022 while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2023
2024 /* On overflow, skip remaining hex digits */
2025
2026 if (overflow)
2027 {
2028 while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
2029 *errorcodeptr = ERR34;
2030 }
2031 else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
2032 {
2033 if (utf && c >= 0xd800 && c <= 0xdfff &&
2034 (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
2035 {
2036 ptr--;
2037 *errorcodeptr = ERR73;
2038 }
2039 }
2040
2041 /* If the sequence of hex digits (followed by optional space) does not
2042 end with '}', give an error. We used just to recognize this construct
2043 and fall through to the normal \x handling, but nowadays Perl gives an
2044 error, which seems much more sensible, so we do too. */
2045
2046 else
2047 {
2048 ptr--;
2049 *errorcodeptr = ERR67;
2050 }
2051 } /* End of \x{} processing */
2052
/* Read up to two hex digits after \x */
2054
2055 else
2056 {
2057 c = 0;
2058 if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */
2059 ptr++;
2060 c = cc;
2061 if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */
2062 ptr++;
2063 c = (c << 4) | cc;
2064 } /* End of \xdd handling */
2065 } /* End of Perl-style \x handling */
2066 break;
2067
2068 /* The handling of \c is different in ASCII and EBCDIC environments. In an
2069 ASCII (or Unicode) environment, an error is given if the character
2070 following \c is not a printable ASCII character. Otherwise, the following
2071 character is upper-cased if it is a letter, and after that the 0x40 bit is
2072 flipped. The result is the value of the escape.
2073
2074 In an EBCDIC environment the handling of \c is compatible with the
2075 specification in the perlebcdic document. The following character must be
2076 a letter or one of small number of special characters. These provide a
2077 means of defining the character values 0-31.
2078
2079 For testing the EBCDIC handling of \c in an ASCII environment, recognize
2080 the EBCDIC value of 'c' explicitly. */
2081
2082 #if defined EBCDIC && 'a' != 0x81
2083 case 0x83:
2084 #else
2085 case CHAR_c:
2086 #endif
2087 if (ptr >= ptrend)
2088 {
2089 *errorcodeptr = ERR2;
2090 break;
2091 }
2092 c = *ptr;
2093 if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
2094
2095 /* Handle \c in an ASCII/Unicode environment. */
2096
2097 #ifndef EBCDIC /* ASCII/UTF-8 coding */
2098 if (c < 32 || c > 126) /* Excludes all non-printable ASCII */
2099 {
2100 *errorcodeptr = ERR68;
2101 break;
2102 }
2103 c ^= 0x40;
2104
2105 /* Handle \c in an EBCDIC environment. The special case \c? is converted to
2106 255 (0xff) or 95 (0x5f) if other characters suggest we are using the
2107 POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.)
2108 The other valid sequences correspond to a list of specific characters. */
2109
2110 #else
2111 if (c == CHAR_QUESTION_MARK)
2112 c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
2113 else
2114 {
2115 for (i = 0; i < 32; i++)
2116 {
2117 if (c == ebcdic_escape_c[i]) break;
2118 }
2119 if (i < 32) c = i; else *errorcodeptr = ERR68;
2120 }
2121 #endif /* EBCDIC */
2122
2123 ptr++;
2124 break;
2125
2126 /* Any other alphanumeric following \ is an error. Perl gives an error only
2127 if in warning mode, but PCRE doesn't have a warning mode. */
2128
2129 default:
2130 *errorcodeptr = ERR3;
2131 *ptrptr = ptr - 1; /* Point to the character at fault */
2132 return 0;
2133 }
2134 }
2135
2136 /* Set the pointer to the next character before returning. */
2137
2138 *ptrptr = ptr;
2139 *chptr = c;
2140 return escape;
2141 }
2142
2143
2144
2145 #ifdef SUPPORT_UNICODE
2146 /*************************************************
2147 * Handle \P and \p *
2148 *************************************************/
2149
2150 /* This function is called after \P or \p has been encountered, provided that
2151 PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
2152 contents of ptrptr are pointing after the P or p. On exit, it is left pointing
2153 after the final code unit of the escape sequence.
2154
2155 Arguments:
2156 ptrptr the pattern position pointer
2157 negptr a boolean that is set TRUE for negation else FALSE
2158 ptypeptr an unsigned int that is set to the type value
2159 pdataptr an unsigned int that is set to the detailed property value
2160 errorcodeptr the error code variable
2161 cb the compile data
2162
2163 Returns: TRUE if the type value was found, or FALSE for an invalid type
2164 */
2165
2166 static BOOL
get_ucp(PCRE2_SPTR * ptrptr,BOOL * negptr,uint16_t * ptypeptr,uint16_t * pdataptr,int * errorcodeptr,compile_block * cb)2167 get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr,
2168 uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)
2169 {
2170 PCRE2_UCHAR c;
2171 PCRE2_SIZE i, bot, top;
2172 PCRE2_SPTR ptr = *ptrptr;
2173 PCRE2_UCHAR name[50];
2174 PCRE2_UCHAR *vptr = NULL;
2175 uint16_t ptscript = PT_NOTSCRIPT;
2176
2177 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2178 c = *ptr++;
2179 *negptr = FALSE;
2180
2181 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
2182 negation. */
2183
2184 if (c == CHAR_LEFT_CURLY_BRACKET)
2185 {
2186 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2187
2188 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
2189 {
2190 *negptr = TRUE;
2191 ptr++;
2192 }
2193
2194 for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
2195 {
2196 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2197 c = *ptr++;
2198 #if PCRE2_CODE_UNIT_WIDTH != 8
2199 while (c == '_' || c == '-' || (c <= 0xff && isspace(c)))
2200 #else
2201 while (c == '_' || c == '-' || isspace(c))
2202 #endif
2203 {
2204 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2205 c = *ptr++;
2206 }
2207 if (c == CHAR_NUL) goto ERROR_RETURN;
2208 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
2209 name[i] = tolower(c);
2210 if ((c == ':' || c == '=') && vptr == NULL) vptr = name + i;
2211 }
2212
2213 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
2214 name[i] = 0;
2215 }
2216
2217 /* If { doesn't follow \p or \P there is just one following character, which
2218 must be an ASCII letter. */
2219
2220 else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0)
2221 {
2222 name[0] = tolower(c);
2223 name[1] = 0;
2224 }
2225 else goto ERROR_RETURN;
2226
2227 *ptrptr = ptr;
2228
2229 /* If the property contains ':' or '=' we have class name and value separately
2230 specified. The following are supported:
2231
2232 . Bidi_Class (synonym bc), for which the property names are "bidi<name>".
2233 . Script (synonym sc) for which the property name is the script name
2234 . Script_Extensions (synonym scx), ditto
2235
2236 As this is a small number, we currently just check the names directly. If this
2237 grows, a sorted table and a switch will be neater.
2238
2239 For both the script properties, set a PT_xxx value so that (1) they can be
2240 distinguished and (2) invalid script names that happen to be the name of
2241 another property can be diagnosed. */
2242
2243 if (vptr != NULL)
2244 {
2245 int offset = 0;
2246 PCRE2_UCHAR sname[8];
2247
2248 *vptr = 0; /* Terminate property name */
2249 if (PRIV(strcmp_c8)(name, STRING_bidiclass) == 0 ||
2250 PRIV(strcmp_c8)(name, STRING_bc) == 0)
2251 {
2252 offset = 4;
2253 sname[0] = CHAR_b;
2254 sname[1] = CHAR_i; /* There is no strcpy_c8 function */
2255 sname[2] = CHAR_d;
2256 sname[3] = CHAR_i;
2257 }
2258
2259 else if (PRIV(strcmp_c8)(name, STRING_script) == 0 ||
2260 PRIV(strcmp_c8)(name, STRING_sc) == 0)
2261 ptscript = PT_SC;
2262
2263 else if (PRIV(strcmp_c8)(name, STRING_scriptextensions) == 0 ||
2264 PRIV(strcmp_c8)(name, STRING_scx) == 0)
2265 ptscript = PT_SCX;
2266
2267 else
2268 {
2269 *errorcodeptr = ERR47;
2270 return FALSE;
2271 }
2272
2273 /* Adjust the string in name[] as needed */
2274
2275 memmove(name + offset, vptr + 1, (name + i - vptr)*sizeof(PCRE2_UCHAR));
2276 if (offset != 0) memmove(name, sname, offset*sizeof(PCRE2_UCHAR));
2277 }
2278
2279 /* Search for a recognized property using binary chop. */
2280
2281 bot = 0;
2282 top = PRIV(utt_size);
2283
2284 while (bot < top)
2285 {
2286 int r;
2287 i = (bot + top) >> 1;
2288 r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
2289
2290 /* When a matching property is found, some extra checking is needed when the
2291 \p{xx:yy} syntax is used and xx is either sc or scx. */
2292
2293 if (r == 0)
2294 {
2295 *pdataptr = PRIV(utt)[i].value;
2296 if (vptr == NULL || ptscript == PT_NOTSCRIPT)
2297 {
2298 *ptypeptr = PRIV(utt)[i].type;
2299 return TRUE;
2300 }
2301
2302 switch (PRIV(utt)[i].type)
2303 {
2304 case PT_SC:
2305 *ptypeptr = PT_SC;
2306 return TRUE;
2307
2308 case PT_SCX:
2309 *ptypeptr = ptscript;
2310 return TRUE;
2311 }
2312
2313 break; /* Non-script found */
2314 }
2315
2316 if (r > 0) bot = i + 1; else top = i;
2317 }
2318
2319 *errorcodeptr = ERR47; /* Unrecognized property */
2320 return FALSE;
2321
2322 ERROR_RETURN: /* Malformed \P or \p */
2323 *errorcodeptr = ERR46;
2324 *ptrptr = ptr;
2325 return FALSE;
2326 }
2327 #endif
2328
2329
2330
2331 /*************************************************
2332 * Check for POSIX class syntax *
2333 *************************************************/
2334
2335 /* This function is called when the sequence "[:" or "[." or "[=" is
2336 encountered in a character class. It checks whether this is followed by a
2337 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2338 reach an unescaped ']' without the special preceding character, return FALSE.
2339
2340 Originally, this function only recognized a sequence of letters between the
2341 terminators, but it seems that Perl recognizes any sequence of characters,
2342 though of course unknown POSIX names are subsequently rejected. Perl gives an
2343 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2344 didn't consider this to be a POSIX class. Likewise for [:1234:].
2345
2346 The problem in trying to be exactly like Perl is in the handling of escapes. We
2347 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2348 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2349 below handles the special cases \\ and \], but does not try to do any other
2350 escape processing. This makes it different from Perl for cases such as
2351 [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
2352 not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
2353 when Perl does, I think.
2354
2355 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2356 It seems that the appearance of a nested POSIX class supersedes an apparent
2357 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2358 a digit. This is handled by returning FALSE if the start of a new group with
2359 the same terminator is encountered, since the next closing sequence must close
2360 the nested group, not the outer one.
2361
2362 In Perl, unescaped square brackets may also appear as part of class names. For
2363 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2364 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2365 seem right at all. PCRE does not allow closing square brackets in POSIX class
2366 names.
2367
2368 Arguments:
2369 ptr pointer to the character after the initial [ (colon, dot, equals)
2370 ptrend pointer to the end of the pattern
2371 endptr where to return a pointer to the terminating ':', '.', or '='
2372
2373 Returns: TRUE or FALSE
2374 */
2375
2376 static BOOL
check_posix_syntax(PCRE2_SPTR ptr,PCRE2_SPTR ptrend,PCRE2_SPTR * endptr)2377 check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)
2378 {
2379 PCRE2_UCHAR terminator; /* Don't combine these lines; the Solaris cc */
2380 terminator = *ptr++; /* compiler warns about "non-constant" initializer. */
2381
2382 for (; ptrend - ptr >= 2; ptr++)
2383 {
2384 if (*ptr == CHAR_BACKSLASH &&
2385 (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
2386 ptr++;
2387
2388 else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
2389 *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2390
2391 else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2392 {
2393 *endptr = ptr;
2394 return TRUE;
2395 }
2396 }
2397
2398 return FALSE;
2399 }
2400
2401
2402
2403 /*************************************************
2404 * Check POSIX class name *
2405 *************************************************/
2406
2407 /* This function is called to check the name given in a POSIX-style class entry
2408 such as [:alnum:].
2409
2410 Arguments:
2411 ptr points to the first letter
2412 len the length of the name
2413
2414 Returns: a value representing the name, or -1 if unknown
2415 */
2416
2417 static int
check_posix_name(PCRE2_SPTR ptr,int len)2418 check_posix_name(PCRE2_SPTR ptr, int len)
2419 {
2420 const char *pn = posix_names;
2421 int yield = 0;
2422 while (posix_name_lengths[yield] != 0)
2423 {
2424 if (len == posix_name_lengths[yield] &&
2425 PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
2426 pn += posix_name_lengths[yield] + 1;
2427 yield++;
2428 }
2429 return -1;
2430 }
2431
2432
2433
2434 /*************************************************
2435 * Read a subpattern or VERB name *
2436 *************************************************/
2437
2438 /* This function is called from parse_regex() below whenever it needs to read
2439 the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial
2440 pointer must be to the preceding character. If that character is '*' we are
2441 reading a verb or alpha assertion name. The pointer is updated to point after
the name, for a VERB or alpha assertion name, or after the name's terminator
2443 for a subpattern name. Returning both the offset and the name pointer is
2444 redundant information, but some callers use one and some the other, so it is
2445 simplest just to return both. When the name is in braces, spaces and tabs are
2446 allowed (and ignored) at either end.
2447
2448 Arguments:
2449 ptrptr points to the character pointer variable
2450 ptrend points to the end of the input string
2451 utf true if the input is UTF-encoded
2452 terminator the terminator of a subpattern name must be this
2453 offsetptr where to put the offset from the start of the pattern
2454 nameptr where to put a pointer to the name in the input
2455 namelenptr where to put the length of the name
  errorcodeptr where to put an error code
2457 cb pointer to the compile data block
2458
2459 Returns: TRUE if a name was read
2460 FALSE otherwise, with error code set
2461 */
2462
2463 static BOOL
read_name(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,BOOL utf,uint32_t terminator,PCRE2_SIZE * offsetptr,PCRE2_SPTR * nameptr,uint32_t * namelenptr,int * errorcodeptr,compile_block * cb)2464 read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator,
2465 PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,
2466 int *errorcodeptr, compile_block *cb)
2467 {
2468 PCRE2_SPTR ptr = *ptrptr;
2469 BOOL is_group = (*ptr++ != CHAR_ASTERISK);
2470 BOOL is_braced = terminator == CHAR_RIGHT_CURLY_BRACKET;
2471
2472 if (is_braced)
2473 while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2474
2475 if (ptr >= ptrend) /* No characters in name */
2476 {
2477 *errorcodeptr = is_group? ERR62: /* Subpattern name expected */
2478 ERR60; /* Verb not recognized or malformed */
2479 goto FAILED;
2480 }
2481
2482 *nameptr = ptr;
2483 *offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
2484
2485 /* In UTF mode, a group name may contain letters and decimal digits as defined
2486 by Unicode properties, and underscores, but must not start with a digit. */
2487
2488 #ifdef SUPPORT_UNICODE
2489 if (utf && is_group)
2490 {
2491 uint32_t c, type;
2492
2493 GETCHAR(c, ptr);
2494 type = UCD_CHARTYPE(c);
2495
2496 if (type == ucp_Nd)
2497 {
2498 *errorcodeptr = ERR44;
2499 goto FAILED;
2500 }
2501
2502 for(;;)
2503 {
2504 if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
2505 c != CHAR_UNDERSCORE) break;
2506 ptr++;
2507 FORWARDCHARTEST(ptr, ptrend);
2508 if (ptr >= ptrend) break;
2509 GETCHAR(c, ptr);
2510 type = UCD_CHARTYPE(c);
2511 }
2512 }
2513 else
2514 #else
2515 (void)utf; /* Avoid compiler warning */
2516 #endif /* SUPPORT_UNICODE */
2517
2518 /* Handle non-group names and group names in non-UTF modes. A group name must
2519 not start with a digit. If either of the others start with a digit it just
2520 won't be recognized. */
2521
2522 {
2523 if (is_group && IS_DIGIT(*ptr))
2524 {
2525 *errorcodeptr = ERR44;
2526 goto FAILED;
2527 }
2528
2529 while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0)
2530 {
2531 ptr++;
2532 }
2533 }
2534
2535 /* Check name length */
2536
2537 if (ptr > *nameptr + MAX_NAME_SIZE)
2538 {
2539 *errorcodeptr = ERR48;
2540 goto FAILED;
2541 }
2542 *namelenptr = (uint32_t)(ptr - *nameptr);
2543
2544 /* Subpattern names must not be empty, and their terminator is checked here.
2545 (What follows a verb or alpha assertion name is checked separately.) */
2546
2547 if (is_group)
2548 {
2549 if (ptr == *nameptr)
2550 {
2551 *errorcodeptr = ERR62; /* Subpattern name expected */
2552 goto FAILED;
2553 }
2554 if (is_braced)
2555 while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2556 if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator)
2557 {
2558 *errorcodeptr = ERR42;
2559 goto FAILED;
2560 }
2561 ptr++;
2562 }
2563
2564 *ptrptr = ptr;
2565 return TRUE;
2566
2567 FAILED:
2568 *ptrptr = ptr;
2569 return FALSE;
2570 }
2571
2572
2573
2574 /*************************************************
2575 * Manage callouts at start of cycle *
2576 *************************************************/
2577
2578 /* At the start of a new item in parse_regex() we are able to record the
2579 details of the previous item in a prior callout, and also to set up an
2580 automatic callout if enabled. Avoid having two adjacent automatic callouts,
2581 which would otherwise happen for items such as \Q that contribute nothing to
2582 the parsed pattern.
2583
2584 Arguments:
2585 ptr current pattern pointer
2586 pcalloutptr points to a pointer to previous callout, or NULL
2587 auto_callout TRUE if auto_callouts are enabled
2588 parsed_pattern the parsed pattern pointer
2589 cb compile block
2590
2591 Returns: possibly updated parsed_pattern pointer.
2592 */
2593
2594 static uint32_t *
manage_callouts(PCRE2_SPTR ptr,uint32_t ** pcalloutptr,BOOL auto_callout,uint32_t * parsed_pattern,compile_block * cb)2595 manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,
2596 uint32_t *parsed_pattern, compile_block *cb)
2597 {
2598 uint32_t *previous_callout = *pcalloutptr;
2599
2600 if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr -
2601 cb->start_pattern - (PCRE2_SIZE)previous_callout[1]);
2602
2603 if (!auto_callout) previous_callout = NULL; else
2604 {
2605 if (previous_callout == NULL ||
2606 previous_callout != parsed_pattern - 4 ||
2607 previous_callout[3] != 255)
2608 {
2609 previous_callout = parsed_pattern; /* Set up new automatic callout */
2610 parsed_pattern += 4;
2611 previous_callout[0] = META_CALLOUT_NUMBER;
2612 previous_callout[2] = 0;
2613 previous_callout[3] = 255;
2614 }
2615 previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
2616 }
2617
2618 *pcalloutptr = previous_callout;
2619 return parsed_pattern;
2620 }
2621
2622
2623
2624 /*************************************************
2625 * Handle \d, \D, \s, \S, \w, \W *
2626 *************************************************/
2627
2628 /* This function is called from parse_regex() below, both for freestanding
2629 escapes, and those within classes, to handle those escapes that may change when
2630 Unicode property support is requested. Note that PCRE2_UCP will never be set
2631 without Unicode support because that is checked when pcre2_compile() is called.
2632
2633 Arguments:
2634 escape the ESC_... value
2635 parsed_pattern where to add the code
2636 options options bits
2637 xoptions extra options bits
2638
2639 Returns: updated value of parsed_pattern
2640 */
2641 static uint32_t *
handle_escdsw(int escape,uint32_t * parsed_pattern,uint32_t options,uint32_t xoptions)2642 handle_escdsw(int escape, uint32_t *parsed_pattern, uint32_t options,
2643 uint32_t xoptions)
2644 {
2645 uint32_t ascii_option = 0;
2646 uint32_t prop = ESC_p;
2647
2648 switch(escape)
2649 {
2650 case ESC_D:
2651 prop = ESC_P;
2652 /* Fall through */
2653 case ESC_d:
2654 ascii_option = PCRE2_EXTRA_ASCII_BSD;
2655 break;
2656
2657 case ESC_S:
2658 prop = ESC_P;
2659 /* Fall through */
2660 case ESC_s:
2661 ascii_option = PCRE2_EXTRA_ASCII_BSS;
2662 break;
2663
2664 case ESC_W:
2665 prop = ESC_P;
2666 /* Fall through */
2667 case ESC_w:
2668 ascii_option = PCRE2_EXTRA_ASCII_BSW;
2669 break;
2670 }
2671
2672 if ((options & PCRE2_UCP) == 0 || (xoptions & ascii_option) != 0)
2673 {
2674 *parsed_pattern++ = META_ESCAPE + escape;
2675 }
2676 else
2677 {
2678 *parsed_pattern++ = META_ESCAPE + prop;
2679 switch(escape)
2680 {
2681 case ESC_d:
2682 case ESC_D:
2683 *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
2684 break;
2685
2686 case ESC_s:
2687 case ESC_S:
2688 *parsed_pattern++ = PT_SPACE << 16;
2689 break;
2690
2691 case ESC_w:
2692 case ESC_W:
2693 *parsed_pattern++ = PT_WORD << 16;
2694 break;
2695 }
2696 }
2697
2698 return parsed_pattern;
2699 }
2700
2701
2702
2703 /*************************************************
2704 * Parse regex and identify named groups *
2705 *************************************************/
2706
2707 /* This function is called first of all. It scans the pattern and does two
2708 things: (1) It identifies capturing groups and makes a table of named capturing
2709 groups so that information about them is fully available to both the compiling
2710 scans. (2) It writes a parsed version of the pattern with comments omitted and
2711 escapes processed into the parsed_pattern vector.
2712
2713 Arguments:
2714 ptr points to the start of the pattern
2715 options compiling dynamic options (may change during the scan)
2716 has_lookbehind points to a boolean, set TRUE if a lookbehind is found
2717 cb pointer to the compile data block
2718
2719 Returns: zero on success or a non-zero error code, with the
2720 error offset placed in the cb field
2721 */
2722
/* Record used when dealing with nested groups, and the flag bits that go with
it. The members are four 16-bit values followed by two 32-bit values, in the
order shown. */

struct nest_save {
  uint16_t nest_depth;
  uint16_t reset_group;
  uint16_t max_group;
  uint16_t flags;       /* presumably a combination of the NSF_ bits below */
  uint32_t options;
  uint32_t xoptions;
};

typedef struct nest_save nest_save;

/* Flag bits: each value is a distinct single bit. */

#define NSF_RESET          (1u << 0)
#define NSF_CONDASSERT     (1u << 1)
#define NSF_ATOMICSR       (1u << 2)
2737
/* Options that can be changed from within a pattern have to be followed while
parsing. A few of them (PCRE2_EXTENDED is one) are dealt with completely by the
parser, but every one of them must be tracked so that the META_OPTIONS items
carry the right values into the main compiling phase. */

#define PARSE_TRACKED_OPTIONS ( \
  PCRE2_CASELESS | \
  PCRE2_DOTALL | \
  PCRE2_DUPNAMES | \
  PCRE2_EXTENDED | \
  PCRE2_EXTENDED_MORE | \
  PCRE2_MULTILINE | \
  PCRE2_NO_AUTO_CAPTURE | \
  PCRE2_UNGREEDY)

#define PARSE_TRACKED_EXTRA_OPTIONS ( \
  PCRE2_EXTRA_CASELESS_RESTRICT | \
  PCRE2_EXTRA_ASCII_BSD | \
  PCRE2_EXTRA_ASCII_BSS | \
  PCRE2_EXTRA_ASCII_BSW | \
  PCRE2_EXTRA_ASCII_DIGIT | \
  PCRE2_EXTRA_ASCII_POSIX)
2750
/* States for recognizing ranges while a character class is being analyzed.
The parser tests for "state >= RANGE_OK_ESCAPED", so the two OK values have to
stay at the end with the highest numbers. */

enum {
  RANGE_NO = 0,
  RANGE_STARTED = 1,
  RANGE_OK_ESCAPED = 2,
  RANGE_OK_LITERAL = 3
};
2755
/* This macro stores a literal value in the main parsed pattern, where a
literal can always be quantified, so it also sets okquantifier. Literals whose
value is META_END or more can occur only in 32-bit mode; there the value is
preceded by a META_BIGVALUE item.

Points to note when using the macro: p is advanced past what is stored; a
variable called okquantifier must be in scope; the form used for 8-bit and
16-bit code units expands to two separate statements that are not enclosed in
braces; the 32-bit form evaluates c more than once. */

#if PCRE2_CODE_UNIT_WIDTH != 32
#define PARSED_LITERAL(c, p) *p++ = c; okquantifier = TRUE;
#else
#define PARSED_LITERAL(c, p) \
  { \
  if (c >= META_END) *p++ = META_BIGVALUE; \
  *p++ = c; \
  okquantifier = TRUE; \
  }
#endif
2770
2771 /* Here's the actual function. */
2772
parse_regex(PCRE2_SPTR ptr,uint32_t options,BOOL * has_lookbehind,compile_block * cb)2773 static int parse_regex(PCRE2_SPTR ptr, uint32_t options, BOOL *has_lookbehind,
2774 compile_block *cb)
2775 {
2776 uint32_t c;
2777 uint32_t delimiter;
2778 uint32_t namelen;
2779 uint32_t class_range_state;
2780 uint32_t *verblengthptr = NULL; /* Value avoids compiler warning */
2781 uint32_t *verbstartptr = NULL;
2782 uint32_t *previous_callout = NULL;
2783 uint32_t *parsed_pattern = cb->parsed_pattern;
2784 uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
2785 uint32_t *this_parsed_item = NULL;
2786 uint32_t *prev_parsed_item = NULL;
2787 uint32_t meta_quantifier = 0;
2788 uint32_t add_after_mark = 0;
2789 uint32_t xoptions = cb->cx->extra_options;
2790 uint16_t nest_depth = 0;
2791 int after_manual_callout = 0;
2792 int expect_cond_assert = 0;
2793 int errorcode = 0;
2794 int escape;
2795 int i;
2796 BOOL inescq = FALSE;
2797 BOOL inverbname = FALSE;
2798 BOOL utf = (options & PCRE2_UTF) != 0;
2799 BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;
2800 BOOL isdupname;
2801 BOOL negate_class;
2802 BOOL okquantifier = FALSE;
2803 PCRE2_SPTR thisptr;
2804 PCRE2_SPTR name;
2805 PCRE2_SPTR ptrend = cb->end_pattern;
2806 PCRE2_SPTR verbnamestart = NULL; /* Value avoids compiler warning */
2807 named_group *ng;
2808 nest_save *top_nest, *end_nests;
2809
2810 /* Insert leading items for word and line matching (features provided for the
2811 benefit of pcre2grep). */
2812
2813 if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
2814 {
2815 *parsed_pattern++ = META_CIRCUMFLEX;
2816 *parsed_pattern++ = META_NOCAPTURE;
2817 }
2818 else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
2819 {
2820 *parsed_pattern++ = META_ESCAPE + ESC_b;
2821 *parsed_pattern++ = META_NOCAPTURE;
2822 }
2823
2824 /* If the pattern is actually a literal string, process it separately to avoid
2825 cluttering up the main loop. */
2826
2827 if ((options & PCRE2_LITERAL) != 0)
2828 {
2829 while (ptr < ptrend)
2830 {
2831 if (parsed_pattern >= parsed_pattern_end)
2832 {
2833 errorcode = ERR63; /* Internal error (parsed pattern overflow) */
2834 goto FAILED;
2835 }
2836 thisptr = ptr;
2837 GETCHARINCTEST(c, ptr);
2838 if (auto_callout)
2839 parsed_pattern = manage_callouts(thisptr, &previous_callout,
2840 auto_callout, parsed_pattern, cb);
2841 PARSED_LITERAL(c, parsed_pattern);
2842 }
2843 goto PARSED_END;
2844 }
2845
2846 /* Process a real regex which may contain meta-characters. */
2847
2848 top_nest = NULL;
2849 end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
2850
2851 /* The size of the nest_save structure might not be a factor of the size of the
2852 workspace. Therefore we must round down end_nests so as to correctly avoid
2853 creating a nest_save that spans the end of the workspace. */
2854
2855 end_nests = (nest_save *)((char *)end_nests -
2856 ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
2857
2858 /* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */
2859
2860 if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;
2861
2862 /* Now scan the pattern */
2863
2864 while (ptr < ptrend)
2865 {
2866 int prev_expect_cond_assert;
2867 uint32_t min_repeat = 0, max_repeat = 0;
2868 uint32_t set, unset, *optset;
2869 uint32_t xset, xunset, *xoptset;
2870 uint32_t terminator;
2871 uint32_t prev_meta_quantifier;
2872 BOOL prev_okquantifier;
2873 PCRE2_SPTR tempptr;
2874 PCRE2_SIZE offset;
2875
2876 if (parsed_pattern >= parsed_pattern_end)
2877 {
2878 errorcode = ERR63; /* Internal error (parsed pattern overflow) */
2879 goto FAILED;
2880 }
2881
2882 if (nest_depth > cb->cx->parens_nest_limit)
2883 {
2884 errorcode = ERR19;
2885 goto FAILED; /* Parentheses too deeply nested */
2886 }
2887
2888 /* If the last time round this loop something was added, parsed_pattern will
2889 no longer be equal to this_parsed_item. Remember where the previous item
2890 started and reset for the next item. Note that sometimes round the loop,
2891 nothing gets added (e.g. for ignored white space). */
2892
2893 if (this_parsed_item != parsed_pattern)
2894 {
2895 prev_parsed_item = this_parsed_item;
2896 this_parsed_item = parsed_pattern;
2897 }
2898
2899 /* Get next input character, save its position for callout handling. */
2900
2901 thisptr = ptr;
2902 GETCHARINCTEST(c, ptr);
2903
2904 /* Copy quoted literals until \E, allowing for the possibility of automatic
2905 callouts, except when processing a (*VERB) "name". */
2906
2907 if (inescq)
2908 {
2909 if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
2910 {
2911 inescq = FALSE;
2912 ptr++; /* Skip E */
2913 }
2914 else
2915 {
2916 if (expect_cond_assert > 0) /* A literal is not allowed if we are */
2917 { /* expecting a conditional assertion, */
2918 ptr--; /* but an empty \Q\E sequence is OK. */
2919 errorcode = ERR28;
2920 goto FAILED;
2921 }
2922 if (inverbname)
2923 { /* Don't use PARSED_LITERAL() because it */
2924 #if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */
2925 if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2926 #endif
2927 *parsed_pattern++ = c;
2928 }
2929 else
2930 {
2931 if (after_manual_callout-- <= 0)
2932 parsed_pattern = manage_callouts(thisptr, &previous_callout,
2933 auto_callout, parsed_pattern, cb);
2934 PARSED_LITERAL(c, parsed_pattern);
2935 }
2936 meta_quantifier = 0;
2937 }
2938 continue; /* Next character */
2939 }
2940
2941 /* If we are processing the "name" part of a (*VERB:NAME) item, all
2942 characters up to the closing parenthesis are literals except when
2943 PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q
2944 and \E and escaped characters are allowed (no character types such as \d). If
2945 PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do
2946 this by not entering the special (*VERB:NAME) processing - they are then
2947 picked up below. Note that c is a character, not a code unit, so we must not
2948 use MAX_255 to test its size because MAX_255 tests code units and is assumed
2949 TRUE in 8-bit mode. */
2950
2951 if (inverbname &&
2952 (
2953 /* EITHER: not both options set */
2954 ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) !=
2955 (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) ||
2956 #ifdef SUPPORT_UNICODE
2957 /* OR: character > 255 AND not Unicode Pattern White Space */
2958 (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) ||
2959 #endif
2960 /* OR: not a # comment or isspace() white space */
2961 (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0
2962 #ifdef SUPPORT_UNICODE
2963 /* and not CHAR_NEL when Unicode is supported */
2964 && c != CHAR_NEL
2965 #endif
2966 )))
2967 {
2968 PCRE2_SIZE verbnamelength;
2969
2970 switch(c)
2971 {
2972 default: /* Don't use PARSED_LITERAL() because it */
2973 #if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */
2974 if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2975 #endif
2976 *parsed_pattern++ = c;
2977 break;
2978
2979 case CHAR_RIGHT_PARENTHESIS:
2980 inverbname = FALSE;
2981 /* This is the length in characters */
2982 verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);
2983 /* But the limit on the length is in code units */
2984 if (ptr - verbnamestart - 1 > (int)MAX_MARK)
2985 {
2986 ptr--;
2987 errorcode = ERR76;
2988 goto FAILED;
2989 }
2990 *verblengthptr = (uint32_t)verbnamelength;
2991
2992 /* If this name was on a verb such as (*ACCEPT) which does not continue,
2993 a (*MARK) was generated for the name. We now add the original verb as the
2994 next item. */
2995
2996 if (add_after_mark != 0)
2997 {
2998 *parsed_pattern++ = add_after_mark;
2999 add_after_mark = 0;
3000 }
3001 break;
3002
3003 case CHAR_BACKSLASH:
3004 if ((options & PCRE2_ALT_VERBNAMES) != 0)
3005 {
3006 escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3007 xoptions, FALSE, cb);
3008 if (errorcode != 0) goto FAILED;
3009 }
3010 else escape = 0; /* Treat all as literal */
3011
3012 switch(escape)
3013 {
3014 case 0: /* Don't use PARSED_LITERAL() because it */
3015 #if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */
3016 if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3017 #endif
3018 *parsed_pattern++ = c;
3019 break;
3020
3021 case ESC_ub:
3022 *parsed_pattern++ = CHAR_u;
3023 PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3024 break;
3025
3026 case ESC_Q:
3027 inescq = TRUE;
3028 break;
3029
3030 case ESC_E: /* Ignore */
3031 break;
3032
3033 default:
3034 errorcode = ERR40; /* Invalid in verb name */
3035 goto FAILED;
3036 }
3037 }
3038 continue; /* Next character in pattern */
3039 }
3040
3041 /* Not a verb name character. At this point we must process everything that
3042 must not change the quantification state. This is mainly comments, but we
3043 handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as
3044 A+, as in Perl. An isolated \E is ignored. */
3045
3046 if (c == CHAR_BACKSLASH && ptr < ptrend)
3047 {
3048 if (*ptr == CHAR_Q || *ptr == CHAR_E)
3049 {
3050 inescq = *ptr == CHAR_Q;
3051 ptr++;
3052 continue;
3053 }
3054 }
3055
3056 /* Skip over whitespace and # comments in extended mode. Note that c is a
3057 character, not a code unit, so we must not use MAX_255 to test its size
3058 because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The
3059 whitespace characters are those designated as "Pattern White Space" by
3060 Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is
3061 U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a
3062 subset of space characters that match \h and \v. */
3063
3064 if ((options & PCRE2_EXTENDED) != 0)
3065 {
3066 if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;
3067 #ifdef SUPPORT_UNICODE
3068 if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue;
3069 #endif
3070 if (c == CHAR_NUMBER_SIGN)
3071 {
3072 while (ptr < ptrend)
3073 {
3074 if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */
3075 { /* IS_NEWLINE sets cb->nllen. */
3076 ptr += cb->nllen;
3077 break;
3078 }
3079 ptr++;
3080 #ifdef SUPPORT_UNICODE
3081 if (utf) FORWARDCHARTEST(ptr, ptrend);
3082 #endif
3083 }
3084 continue; /* Next character in pattern */
3085 }
3086 }
3087
3088 /* Skip over bracketed comments */
3089
3090 if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 &&
3091 ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
3092 {
3093 while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);
3094 if (ptr >= ptrend)
3095 {
3096 errorcode = ERR18; /* A special error for missing ) in a comment */
3097 goto FAILED; /* to make it easier to debug. */
3098 }
3099 ptr++;
3100 continue; /* Next character in pattern */
3101 }
3102
3103 /* If the next item is not a quantifier, fill in length of any previous
3104 callout and create an auto callout if required. */
3105
3106 if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&
3107 (c != CHAR_LEFT_CURLY_BRACKET ||
3108 (tempptr = ptr,
3109 !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))
3110 {
3111 if (after_manual_callout-- <= 0)
3112 {
3113 parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,
3114 parsed_pattern, cb);
3115 this_parsed_item = parsed_pattern; /* New start for current item */
3116 }
3117 }
3118
3119 /* If expect_cond_assert is 2, we have just passed (?( and are expecting an
3120 assertion, possibly preceded by a callout. If the value is 1, we have just
3121 had the callout and expect an assertion. There must be at least 3 more
3122 characters in all cases. When expect_cond_assert is 2, we know that the
3123 current character is an opening parenthesis, as otherwise we wouldn't be
3124 here. However, when it is 1, we need to check, and it's easiest just to check
3125 always. Note that expect_cond_assert may be negative, since all callouts just
3126 decrement it. */
3127
3128 if (expect_cond_assert > 0)
3129 {
3130 BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
3131 (ptr[0] == CHAR_QUESTION_MARK || ptr[0] == CHAR_ASTERISK);
3132 if (ok)
3133 {
3134 if (ptr[0] == CHAR_ASTERISK) /* New alpha assertion format, possibly */
3135 {
3136 ok = MAX_255(ptr[1]) && (cb->ctypes[ptr[1]] & ctype_lcletter) != 0;
3137 }
3138 else switch(ptr[1]) /* Traditional symbolic format */
3139 {
3140 case CHAR_C:
3141 ok = expect_cond_assert == 2;
3142 break;
3143
3144 case CHAR_EQUALS_SIGN:
3145 case CHAR_EXCLAMATION_MARK:
3146 break;
3147
3148 case CHAR_LESS_THAN_SIGN:
3149 ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;
3150 break;
3151
3152 default:
3153 ok = FALSE;
3154 }
3155 }
3156
3157 if (!ok)
3158 {
3159 ptr--; /* Adjust error offset */
3160 errorcode = ERR28;
3161 goto FAILED;
3162 }
3163 }
3164
3165 /* Remember whether we are expecting a conditional assertion, and set the
3166 default for this item. */
3167
3168 prev_expect_cond_assert = expect_cond_assert;
3169 expect_cond_assert = 0;
3170
3171 /* Remember quantification status for the previous significant item, then set
3172 default for this item. */
3173
3174 prev_okquantifier = okquantifier;
3175 prev_meta_quantifier = meta_quantifier;
3176 okquantifier = FALSE;
3177 meta_quantifier = 0;
3178
3179 /* If the previous significant item was a quantifier, adjust the parsed code
3180 if there is a following modifier. The base meta value is always followed by
3181 the PLUS and QUERY values, in that order. We do this here rather than after
3182 reading a quantifier so that intervening comments and /x whitespace can be
3183 ignored without having to replicate code. */
3184
3185 if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS))
3186 {
3187 parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] =
3188 prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?
3189 0x00020000u : 0x00010000u);
3190 continue; /* Next character in pattern */
3191 }
3192
3193 /* Process the next item in the main part of a pattern. */
3194
3195 switch(c)
3196 {
3197 default: /* Non-special character */
3198 PARSED_LITERAL(c, parsed_pattern);
3199 break;
3200
3201
3202 /* ---- Escape sequence ---- */
3203
3204 case CHAR_BACKSLASH:
3205 tempptr = ptr;
3206 escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3207 xoptions, FALSE, cb);
3208 if (errorcode != 0)
3209 {
3210 ESCAPE_FAILED:
3211 if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3212 goto FAILED;
3213 ptr = tempptr;
3214 if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3215 {
3216 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
3217 }
3218 escape = 0; /* Treat as literal character */
3219 }
3220
3221 /* The escape was a data escape or literal character. */
3222
3223 if (escape == 0)
3224 {
3225 PARSED_LITERAL(c, parsed_pattern);
3226 }
3227
3228 /* The escape was a back (or forward) reference. We keep the offset in
3229 order to give a more useful diagnostic for a bad forward reference. For
3230 references to groups numbered less than 10 we can't use more than two items
3231 in parsed_pattern because they may be just two characters in the input (and
3232 in a 64-bit world an offset may need two elements). So for them, the offset
3233 of the first occurrence is held in a special vector. */
3234
3235 else if (escape < 0)
3236 {
3237 offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 1);
3238 escape = -escape;
3239 *parsed_pattern++ = META_BACKREF | (uint32_t)escape;
3240 if (escape < 10)
3241 {
3242 if (cb->small_ref_offset[escape] == PCRE2_UNSET)
3243 cb->small_ref_offset[escape] = offset;
3244 }
3245 else
3246 {
3247 PUTOFFSET(offset, parsed_pattern);
3248 }
3249 okquantifier = TRUE;
3250 }
3251
3252 /* The escape was a character class such as \d etc. or other special
3253 escape indicator such as \A or \X. Most of them generate just a single
3254 parsed item, but \P and \p are followed by a 16-bit type and a 16-bit
3255 value. They are supported only when Unicode is available. The type and
3256 value are packed into a single 32-bit value so that the whole sequence
3257 uses only two elements in the parsed_vector. This is because the same
3258 coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is
3259 set.
3260
3261 There are also some cases where the escape sequence is followed by a name:
3262 \k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>
3263 and \g'name' are subroutine calls by name; \g{name} is a synonym for
3264 \k{name}. Note that \g<number> and \g'number' are handled by check_escape()
3265 and returned as a negative value (handled above). A name is coded as an
3266 offset into the pattern and a length. */
3267
3268 else switch (escape)
3269 {
3270 case ESC_C:
3271 #ifdef NEVER_BACKSLASH_C
3272 errorcode = ERR85;
3273 goto ESCAPE_FAILED;
3274 #else
3275 if ((options & PCRE2_NEVER_BACKSLASH_C) != 0)
3276 {
3277 errorcode = ERR83;
3278 goto ESCAPE_FAILED;
3279 }
3280 #endif
3281 okquantifier = TRUE;
3282 *parsed_pattern++ = META_ESCAPE + escape;
3283 break;
3284
3285 /* This is a special return that happens only in EXTRA_ALT_BSUX mode,
3286 when \u{ is not followed by hex digits and }. It requests two literal
3287 characters, u and { and we need this, as otherwise \u{ 12} (for example)
3288 would be treated as u{12} now that spaces are allowed in quantifiers. */
3289
3290 case ESC_ub:
3291 *parsed_pattern++ = CHAR_u;
3292 PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3293 break;
3294
3295 case ESC_X:
3296 #ifndef SUPPORT_UNICODE
3297 errorcode = ERR45; /* Supported only with Unicode support */
3298 goto ESCAPE_FAILED;
3299 #endif
3300 case ESC_H:
3301 case ESC_h:
3302 case ESC_N:
3303 case ESC_R:
3304 case ESC_V:
3305 case ESC_v:
3306 okquantifier = TRUE;
3307 *parsed_pattern++ = META_ESCAPE + escape;
3308 break;
3309
3310 default: /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */
3311 *parsed_pattern++ = META_ESCAPE + escape;
3312 break;
3313
3314 /* Escapes that may change in UCP mode. */
3315
3316 case ESC_d:
3317 case ESC_D:
3318 case ESC_s:
3319 case ESC_S:
3320 case ESC_w:
3321 case ESC_W:
3322 okquantifier = TRUE;
3323 parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
3324 xoptions);
3325 break;
3326
3327 /* Unicode property matching */
3328
3329 case ESC_P:
3330 case ESC_p:
3331 #ifdef SUPPORT_UNICODE
3332 {
3333 BOOL negated;
3334 uint16_t ptype = 0, pdata = 0;
3335 if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3336 goto ESCAPE_FAILED;
3337 if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3338 *parsed_pattern++ = META_ESCAPE + escape;
3339 *parsed_pattern++ = (ptype << 16) | pdata;
3340 okquantifier = TRUE;
3341 }
3342 #else
3343 errorcode = ERR45;
3344 goto ESCAPE_FAILED;
3345 #endif
3346 break; /* End \P and \p */
3347
3348 /* When \g is used with quotes or angle brackets as delimiters, it is a
3349 numerical or named subroutine call, and control comes here. When used
3350 with brace delimiters it is a numerical back reference and does not come
3351 here because check_escape() returns it directly as a reference. \k is
3352 always a named back reference. */
3353
3354 case ESC_g:
3355 case ESC_k:
3356 if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET &&
3357 *ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE))
3358 {
3359 errorcode = (escape == ESC_g)? ERR57 : ERR69;
3360 goto ESCAPE_FAILED;
3361 }
3362 terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
3363 CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
3364 CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
3365
3366 /* For a non-braced \g, check for a numerical recursion. */
3367
3368 if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)
3369 {
3370 PCRE2_SPTR p = ptr + 1;
3371
3372 if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
3373 &errorcode))
3374 {
3375 if (p >= ptrend || *p != terminator)
3376 {
3377 errorcode = ERR57;
3378 goto ESCAPE_FAILED;
3379 }
3380 ptr = p;
3381 goto SET_RECURSION;
3382 }
3383 if (errorcode != 0) goto ESCAPE_FAILED;
3384 }
3385
3386 /* Not a numerical recursion. Perl allows spaces and tabs after { and
3387 before } but not for other delimiters. */
3388
3389 if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
3390 &errorcode, cb)) goto ESCAPE_FAILED;
3391
3392 /* \k and \g when used with braces are back references, whereas \g used
3393 with quotes or angle brackets is a recursion */
3394
3395 *parsed_pattern++ =
3396 (escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)?
3397 META_BACKREF_BYNAME : META_RECURSE_BYNAME;
3398 *parsed_pattern++ = namelen;
3399
3400 PUTOFFSET(offset, parsed_pattern);
3401 okquantifier = TRUE;
3402 break; /* End special escape processing */
3403 }
3404 break; /* End escape sequence processing */
3405
3406
3407 /* ---- Single-character special items ---- */
3408
3409 case CHAR_CIRCUMFLEX_ACCENT:
3410 *parsed_pattern++ = META_CIRCUMFLEX;
3411 break;
3412
3413 case CHAR_DOLLAR_SIGN:
3414 *parsed_pattern++ = META_DOLLAR;
3415 break;
3416
3417 case CHAR_DOT:
3418 *parsed_pattern++ = META_DOT;
3419 okquantifier = TRUE;
3420 break;
3421
3422
3423 /* ---- Single-character quantifiers ---- */
3424
3425 case CHAR_ASTERISK:
3426 meta_quantifier = META_ASTERISK;
3427 goto CHECK_QUANTIFIER;
3428
3429 case CHAR_PLUS:
3430 meta_quantifier = META_PLUS;
3431 goto CHECK_QUANTIFIER;
3432
3433 case CHAR_QUESTION_MARK:
3434 meta_quantifier = META_QUERY;
3435 goto CHECK_QUANTIFIER;
3436
3437
3438 /* ---- Potential {n,m} quantifier ---- */
3439
3440 case CHAR_LEFT_CURLY_BRACKET:
3441 if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,
3442 &errorcode))
3443 {
3444 if (errorcode != 0) goto FAILED; /* Error in quantifier. */
3445 PARSED_LITERAL(c, parsed_pattern); /* Not a quantifier */
3446 break; /* No more quantifier processing */
3447 }
3448 meta_quantifier = META_MINMAX;
3449 /* Fall through */
3450
3451
3452 /* ---- Quantifier post-processing ---- */
3453
3454 /* Check that a quantifier is allowed after the previous item. This
3455 guarantees that there is a previous item. */
3456
3457 CHECK_QUANTIFIER:
3458 if (!prev_okquantifier)
3459 {
3460 errorcode = ERR9;
3461 goto FAILED_BACK;
3462 }
3463
3464 /* Most (*VERB)s are not allowed to be quantified, but an ungreedy
3465 quantifier can be useful for (*ACCEPT) - meaning "succeed on backtrack", a
3466 sort of negated (*COMMIT). We therefore allow (*ACCEPT) to be quantified by
3467 wrapping it in non-capturing brackets, but we have to allow for a preceding
3468 (*MARK) for when (*ACCEPT) has an argument. */
3469
3470 if (*prev_parsed_item == META_ACCEPT)
3471 {
3472 uint32_t *p;
3473 for (p = parsed_pattern - 1; p >= verbstartptr; p--) p[1] = p[0];
3474 *verbstartptr = META_NOCAPTURE;
3475 parsed_pattern[1] = META_KET;
3476 parsed_pattern += 2;
3477 }
3478
3479 /* Now we can put the quantifier into the parsed pattern vector. At this
3480 stage, we have only the basic quantifier. The check for a following + or ?
3481 modifier happens at the top of the loop, after any intervening comments
3482 have been removed. */
3483
3484 *parsed_pattern++ = meta_quantifier;
3485 if (c == CHAR_LEFT_CURLY_BRACKET)
3486 {
3487 *parsed_pattern++ = min_repeat;
3488 *parsed_pattern++ = max_repeat;
3489 }
3490 break;
3491
3492
3493 /* ---- Character class ---- */
3494
3495 case CHAR_LEFT_SQUARE_BRACKET:
3496 okquantifier = TRUE;
3497
3498 /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
3499 used for "start of word" and "end of word". As these are otherwise illegal
3500 sequences, we don't break anything by recognizing them. They are replaced
3501 by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
3502 erroneous and are handled by the normal code below. */
3503
3504 if (ptrend - ptr >= 6 &&
3505 (PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 ||
3506 PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0))
3507 {
3508 *parsed_pattern++ = META_ESCAPE + ESC_b;
3509
3510 if (ptr[2] == CHAR_LESS_THAN_SIGN)
3511 {
3512 *parsed_pattern++ = META_LOOKAHEAD;
3513 }
3514 else
3515 {
3516 *parsed_pattern++ = META_LOOKBEHIND;
3517 *has_lookbehind = TRUE;
3518
3519 /* The offset is used only for the "non-fixed length" error; this won't
3520 occur here, so just store zero. */
3521
3522 PUTOFFSET((PCRE2_SIZE)0, parsed_pattern);
3523 }
3524
3525 if ((options & PCRE2_UCP) == 0)
3526 *parsed_pattern++ = META_ESCAPE + ESC_w;
3527 else
3528 {
3529 *parsed_pattern++ = META_ESCAPE + ESC_p;
3530 *parsed_pattern++ = PT_WORD << 16;
3531 }
3532 *parsed_pattern++ = META_KET;
3533 ptr += 6;
3534 break;
3535 }
3536
3537 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3538 they are encountered at the top level, so we'll do that too. */
3539
3540 if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3541 *ptr == CHAR_EQUALS_SIGN) &&
3542 check_posix_syntax(ptr, ptrend, &tempptr))
3543 {
3544 errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;
3545 goto FAILED;
3546 }
3547
3548 /* Process a regular character class. If the first character is '^', set
3549 the negation flag. If the first few characters (either before or after ^)
3550 are \Q\E or \E or space or tab in extended-more mode, we skip them too.
3551 This makes for compatibility with Perl. */
3552
3553 negate_class = FALSE;
3554 while (ptr < ptrend)
3555 {
3556 GETCHARINCTEST(c, ptr);
3557 if (c == CHAR_BACKSLASH)
3558 {
3559 if (ptr < ptrend && *ptr == CHAR_E) ptr++;
3560 else if (ptrend - ptr >= 3 &&
3561 PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3562 ptr += 3;
3563 else
3564 break;
3565 }
3566 else if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3567 (c == CHAR_SPACE || c == CHAR_HT)) /* Note: just these two */
3568 continue;
3569 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3570 negate_class = TRUE;
3571 else break;
3572 }
3573
3574 /* Now the real contents of the class; c has the first "real" character.
3575 Empty classes are permitted only if the option is set. */
3576
3577 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3578 (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)
3579 {
3580 *parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;
3581 break; /* End of class processing */
3582 }
3583
3584 /* Process a non-empty class. */
3585
3586 *parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;
3587 class_range_state = RANGE_NO;
3588
3589 /* In an EBCDIC environment, Perl treats alphabetic ranges specially
3590 because there are holes in the encoding, and simply using the range A-Z
3591 (for example) would include the characters in the holes. This applies only
3592 to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]
3593 in this respect. In order to accommodate this, we keep track of whether
3594 character values are literal or not, and a state variable for handling
3595 ranges. */
3596
3597 /* Loop for the contents of the class */
3598
3599 for (;;)
3600 {
3601 BOOL char_is_literal = TRUE;
3602
3603 /* Inside \Q...\E everything is literal except \E */
3604
3605 if (inescq)
3606 {
3607 if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3608 {
3609 inescq = FALSE; /* Reset literal state */
3610 ptr++; /* Skip the 'E' */
3611 goto CLASS_CONTINUE;
3612 }
3613 goto CLASS_LITERAL;
3614 }
3615
3616 /* Skip over space and tab (only) in extended-more mode. */
3617
3618 if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3619 (c == CHAR_SPACE || c == CHAR_HT))
3620 goto CLASS_CONTINUE;
3621
3622 /* Handle POSIX class names. Perl allows a negation extension of the
3623 form [:^name:]. A square bracket that doesn't match the syntax is
3624 treated as a literal. We also recognize the POSIX constructions
3625 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3626 5.6 and 5.8 do. */
3627
3628 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3629 ptrend - ptr >= 3 &&
3630 (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3631 *ptr == CHAR_EQUALS_SIGN) &&
3632 check_posix_syntax(ptr, ptrend, &tempptr))
3633 {
3634 BOOL posix_negate = FALSE;
3635 int posix_class;
3636
3637 /* Perl treats a hyphen before a POSIX class as a literal, not the
3638 start of a range. However, it gives a warning in its warning mode. PCRE
3639 does not have a warning mode, so we give an error, because this is
3640 likely an error on the user's part. */
3641
3642 if (class_range_state == RANGE_STARTED)
3643 {
3644 errorcode = ERR50;
3645 goto FAILED;
3646 }
3647
3648 if (*ptr != CHAR_COLON)
3649 {
3650 errorcode = ERR13;
3651 goto FAILED_BACK;
3652 }
3653
3654 if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
3655 {
3656 posix_negate = TRUE;
3657 ptr++;
3658 }
3659
3660 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3661 if (posix_class < 0)
3662 {
3663 errorcode = ERR30;
3664 goto FAILED;
3665 }
3666 ptr = tempptr + 2;
3667
3668 /* Perl treats a hyphen after a POSIX class as a literal, not the
3669 start of a range. However, it gives a warning in its warning mode
3670 unless the hyphen is the last character in the class. PCRE does not
3671 have a warning mode, so we give an error, because this is likely an
3672 error on the user's part. */
3673
3674 if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3675 ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3676 {
3677 errorcode = ERR50;
3678 goto FAILED;
3679 }
3680
3681 /* Set "a hyphen is not the start of a range" for the -] case, and also
3682 in case the POSIX class is followed by \E or \Q\E (possibly repeated -
3683 fuzzers do that kind of thing) and *then* a hyphen. This causes that
3684 hyphen to be treated as a literal. I don't think it's worth setting up
3685 special apparatus to do otherwise. */
3686
3687 class_range_state = RANGE_NO;
3688
3689 /* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some
3690 of the POSIX classes are converted to use Unicode properties \p or \P
3691 or, in one case, \h or \H. The substitutes table has two values per
3692 class, containing the type and value of a \p or \P item. The special
3693 cases are specified with a negative type: a non-zero value causes \h or
3694 \H to be used, and a zero value falls through to behave like a non-UCP
3695 POSIX class. There are now also some extra options that force ASCII for
3696 some classes. */
3697
3698 #ifdef SUPPORT_UNICODE
3699 if ((options & PCRE2_UCP) != 0 &&
3700 (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0 &&
3701 !((xoptions & PCRE2_EXTRA_ASCII_DIGIT) != 0 &&
3702 (posix_class == PC_DIGIT || posix_class == PC_XDIGIT)))
3703 {
3704 int ptype = posix_substitutes[2*posix_class];
3705 int pvalue = posix_substitutes[2*posix_class + 1];
3706
3707 if (ptype >= 0)
3708 {
3709 *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);
3710 *parsed_pattern++ = (ptype << 16) | pvalue;
3711 goto CLASS_CONTINUE;
3712 }
3713
3714 if (pvalue != 0)
3715 {
3716 *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h);
3717 goto CLASS_CONTINUE;
3718 }
3719
3720 /* Fall through */
3721 }
3722 #endif /* SUPPORT_UNICODE */
3723
3724 /* Non-UCP POSIX class */
3725
3726 *parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;
3727 *parsed_pattern++ = posix_class;
3728 }
3729
3730 /* Handle potential start of range */
3731
3732 else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)
3733 {
3734 *parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?
3735 META_RANGE_LITERAL : META_RANGE_ESCAPED;
3736 class_range_state = RANGE_STARTED;
3737 }
3738
3739 /* Handle a literal character */
3740
3741 else if (c != CHAR_BACKSLASH)
3742 {
3743 CLASS_LITERAL:
3744 if (class_range_state == RANGE_STARTED)
3745 {
3746 if (c == parsed_pattern[-2]) /* Optimize one-char range */
3747 parsed_pattern--;
3748 else if (parsed_pattern[-2] > c) /* Check range is in order */
3749 {
3750 errorcode = ERR8;
3751 goto FAILED_BACK;
3752 }
3753 else
3754 {
3755 if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL)
3756 parsed_pattern[-1] = META_RANGE_ESCAPED;
3757 PARSED_LITERAL(c, parsed_pattern);
3758 }
3759 class_range_state = RANGE_NO;
3760 }
3761 else /* Potential start of range */
3762 {
3763 class_range_state = char_is_literal?
3764 RANGE_OK_LITERAL : RANGE_OK_ESCAPED;
3765 PARSED_LITERAL(c, parsed_pattern);
3766 }
3767 }
3768
3769 /* Handle escapes in a class */
3770
3771 else
3772 {
3773 tempptr = ptr;
3774 escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3775 xoptions, TRUE, cb);
3776
3777 if (errorcode != 0)
3778 {
3779 if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3780 goto FAILED;
3781 ptr = tempptr;
3782 if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3783 {
3784 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
3785 }
3786 escape = 0; /* Treat as literal character */
3787 }
3788
3789 switch(escape)
3790 {
3791 case 0: /* Escaped character code point is in c */
3792 char_is_literal = FALSE;
3793 goto CLASS_LITERAL; /* (a few lines above) */
3794
3795 case ESC_b:
3796 c = CHAR_BS; /* \b is backspace in a class */
3797 char_is_literal = FALSE;
3798 goto CLASS_LITERAL;
3799
3800 case ESC_Q:
3801 inescq = TRUE; /* Enter literal mode */
3802 goto CLASS_CONTINUE;
3803
3804 case ESC_E: /* Ignore orphan \E */
3805 goto CLASS_CONTINUE;
3806
3807 case ESC_B: /* Always an error in a class */
3808 case ESC_R:
3809 case ESC_X:
3810 errorcode = ERR7;
3811 ptr--;
3812 goto FAILED;
3813 }
3814
3815 /* The second part of a range can be a single-character escape
3816 sequence (detected above), but not any of the other escapes. Perl
3817 treats a hyphen as a literal in such circumstances. However, in Perl's
3818 warning mode, a warning is given, so PCRE now faults it, as it is
3819 almost certainly a mistake on the user's part. */
3820
3821 if (class_range_state == RANGE_STARTED)
3822 {
3823 errorcode = ERR50;
3824 goto FAILED; /* Not CLASS_ESCAPE_FAILED; always an error */
3825 }
3826
3827 /* Of the remaining escapes, only those that define characters are
3828 allowed in a class. None may start a range. */
3829
3830 class_range_state = RANGE_NO;
3831 switch(escape)
3832 {
3833 case ESC_N:
3834 errorcode = ERR71;
3835 goto FAILED;
3836
3837 case ESC_H:
3838 case ESC_h:
3839 case ESC_V:
3840 case ESC_v:
3841 *parsed_pattern++ = META_ESCAPE + escape;
3842 break;
3843
3844 /* These escapes may be converted to Unicode property tests when
3845 PCRE2_UCP is set. */
3846
3847 case ESC_d:
3848 case ESC_D:
3849 case ESC_s:
3850 case ESC_S:
3851 case ESC_w:
3852 case ESC_W:
3853 parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
3854 xoptions);
3855 break;
3856
3857 /* Explicit Unicode property matching */
3858
3859 case ESC_P:
3860 case ESC_p:
3861 #ifdef SUPPORT_UNICODE
3862 {
3863 BOOL negated;
3864 uint16_t ptype = 0, pdata = 0;
3865 if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3866 goto FAILED;
3867 if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3868 *parsed_pattern++ = META_ESCAPE + escape;
3869 *parsed_pattern++ = (ptype << 16) | pdata;
3870 }
3871 #else
3872 errorcode = ERR45;
3873 goto FAILED;
3874 #endif
3875 break; /* End \P and \p */
3876
3877 default: /* All others are not allowed in a class */
3878 errorcode = ERR7;
3879 ptr--;
3880 goto FAILED;
3881 }
3882
3883 /* Perl gives a warning unless a following hyphen is the last character
3884 in the class. PCRE throws an error. */
3885
3886 if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3887 ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3888 {
3889 errorcode = ERR50;
3890 goto FAILED;
3891 }
3892 }
3893
3894 /* Proceed to next thing in the class. */
3895
3896 CLASS_CONTINUE:
3897 if (ptr >= ptrend)
3898 {
3899 errorcode = ERR6; /* Missing terminating ']' */
3900 goto FAILED;
3901 }
3902 GETCHARINCTEST(c, ptr);
3903 if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
3904 } /* End of class-processing loop */
3905
3906 /* -] at the end of a class is a literal '-' */
3907
3908 if (class_range_state == RANGE_STARTED)
3909 {
3910 parsed_pattern[-1] = CHAR_MINUS;
3911 class_range_state = RANGE_NO;
3912 }
3913
3914 *parsed_pattern++ = META_CLASS_END;
3915 break; /* End of character class */
3916
3917
3918 /* ---- Opening parenthesis ---- */
3919
3920 case CHAR_LEFT_PARENTHESIS:
3921 if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3922
3923 /* If ( is not followed by ? it is either a capture or a special verb or an
3924 alpha assertion or a positive non-atomic lookahead. */
3925
3926 if (*ptr != CHAR_QUESTION_MARK)
3927 {
3928 const char *vn;
3929
3930 /* Handle capturing brackets (or non-capturing if auto-capture is turned
3931 off). */
3932
3933 if (*ptr != CHAR_ASTERISK)
3934 {
3935 nest_depth++;
3936 if ((options & PCRE2_NO_AUTO_CAPTURE) == 0)
3937 {
3938 if (cb->bracount >= MAX_GROUP_NUMBER)
3939 {
3940 errorcode = ERR97;
3941 goto FAILED;
3942 }
3943 cb->bracount++;
3944 *parsed_pattern++ = META_CAPTURE | cb->bracount;
3945 }
3946 else *parsed_pattern++ = META_NOCAPTURE;
3947 }
3948
3949 /* Do nothing for (* followed by end of pattern or ) so it gives a "bad
3950 quantifier" error rather than "(*MARK) must have an argument". */
3951
3952 else if (ptrend - ptr <= 1 || (c = ptr[1]) == CHAR_RIGHT_PARENTHESIS)
3953 break;
3954
3955 /* Handle "alpha assertions" such as (*pla:...). Most of these are
3956 synonyms for the historical symbolic assertions, but the script run and
3957 non-atomic lookaround ones are new. They are distinguished by starting
3958 with a lower case letter. The test below looks the character up in the
3959 character type table (ctype_lcletter) rather than comparing code points. */
3960
3961 else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
3962 {
3963 uint32_t meta;
3964
3965 vn = alasnames;
3966 if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
3967 &errorcode, cb)) goto FAILED;
3968 if (ptr >= ptrend || *ptr != CHAR_COLON)
3969 {
3970 errorcode = ERR95; /* Malformed */
3971 goto FAILED;
3972 }
3973
3974 /* Scan the table of alpha assertion names */
3975
3976 for (i = 0; i < alascount; i++)
3977 {
3978 if (namelen == alasmeta[i].len &&
3979 PRIV(strncmp_c8)(name, vn, namelen) == 0)
3980 break;
3981 vn += alasmeta[i].len + 1;
3982 }
3983
3984 if (i >= alascount)
3985 {
3986 errorcode = ERR95; /* Alpha assertion not recognized */
3987 goto FAILED;
3988 }
3989
3990 /* Check for expecting an assertion condition. If so, only atomic
3991 lookaround assertions are valid. */
3992
3993 meta = alasmeta[i].meta;
3994 if (prev_expect_cond_assert > 0 &&
3995 (meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT))
3996 {
3997 errorcode = (meta == META_LOOKAHEAD_NA || meta == META_LOOKBEHIND_NA)?
3998 ERR98 : ERR28; /* (Atomic) assertion expected */
3999 goto FAILED;
4000 }
4001
4002 /* The lookaround alphabetic synonyms can mostly be handled by jumping
4003 to the code that handles the traditional symbolic forms. */
4004
4005 switch(meta)
4006 {
4007 default:
4008 errorcode = ERR89; /* Unknown code; should never occur because */
4009 goto FAILED; /* the meta values come from a table above. */
4010
4011 case META_ATOMIC:
4012 goto ATOMIC_GROUP;
4013
4014 case META_LOOKAHEAD:
4015 goto POSITIVE_LOOK_AHEAD;
4016
4017 case META_LOOKAHEAD_NA:
4018 goto POSITIVE_NONATOMIC_LOOK_AHEAD;
4019
4020 case META_LOOKAHEADNOT:
4021 goto NEGATIVE_LOOK_AHEAD;
4022
4023 case META_LOOKBEHIND:
4024 case META_LOOKBEHINDNOT:
4025 case META_LOOKBEHIND_NA:
4026 *parsed_pattern++ = meta;
4027 ptr--;
4028 goto POST_LOOKBEHIND;
4029
4030 /* The script run facilities are handled here. Unicode support is
4031 required (give an error if not, as this is a security issue). Always
4032 record a META_SCRIPT_RUN item. Then, for the atomic version, insert
4033 META_ATOMIC and remember that we need two META_KETs at the end. */
4034
4035 case META_SCRIPT_RUN:
4036 case META_ATOMIC_SCRIPT_RUN:
4037 #ifdef SUPPORT_UNICODE
4038 *parsed_pattern++ = META_SCRIPT_RUN;
4039 nest_depth++;
4040 ptr++;
4041 if (meta == META_ATOMIC_SCRIPT_RUN)
4042 {
4043 *parsed_pattern++ = META_ATOMIC;
4044 if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4045 else if (++top_nest >= end_nests)
4046 {
4047 errorcode = ERR84;
4048 goto FAILED;
4049 }
4050 top_nest->nest_depth = nest_depth;
4051 top_nest->flags = NSF_ATOMICSR;
4052 top_nest->options = options & PARSE_TRACKED_OPTIONS;
4053 top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4054 }
4055 break;
4056 #else /* SUPPORT_UNICODE */
4057 errorcode = ERR96;
4058 goto FAILED;
4059 #endif
4060 }
4061 }
4062
4063
4064 /* ---- Handle (*VERB) and (*VERB:NAME) ---- */
4065
4066 else
4067 {
4068 vn = verbnames;
4069 if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
4070 &errorcode, cb)) goto FAILED;
4071 if (ptr >= ptrend || (*ptr != CHAR_COLON &&
4072 *ptr != CHAR_RIGHT_PARENTHESIS))
4073 {
4074 errorcode = ERR60; /* Malformed */
4075 goto FAILED;
4076 }
4077
4078 /* Scan the table of verb names */
4079
4080 for (i = 0; i < verbcount; i++)
4081 {
4082 if (namelen == verbs[i].len &&
4083 PRIV(strncmp_c8)(name, vn, namelen) == 0)
4084 break;
4085 vn += verbs[i].len + 1;
4086 }
4087
4088 if (i >= verbcount)
4089 {
4090 errorcode = ERR60; /* Verb not recognized */
4091 goto FAILED;
4092 }
4093
4094 /* An empty argument is treated as no argument. */
4095
4096 if (*ptr == CHAR_COLON && ptr + 1 < ptrend &&
4097 ptr[1] == CHAR_RIGHT_PARENTHESIS)
4098 ptr++; /* Advance to the closing parens */
4099
4100 /* Check for mandatory non-empty argument; this is (*MARK) */
4101
4102 if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON)
4103 {
4104 errorcode = ERR66;
4105 goto FAILED;
4106 }
4107
4108 /* Remember where this verb, possibly with a preceding (*MARK), starts,
4109 for handling quantified (*ACCEPT). */
4110
4111 verbstartptr = parsed_pattern;
4112 okquantifier = (verbs[i].meta == META_ACCEPT);
4113
4114 /* It appears that Perl allows any characters whatsoever, other than a
4115 closing parenthesis, to appear in arguments ("names"), so we no longer
4116 insist on letters, digits, and underscores. Perl does not, however, do
4117 any interpretation within arguments, and has no means of including a
4118 closing parenthesis. PCRE supports escape processing but only when it
4119 is requested by an option. We set inverbname TRUE here, and let the
4120 main loop take care of this so that escape and \x processing is done by
4121 the main code above. */
4122
4123 if (*ptr++ == CHAR_COLON) /* Skip past : or ) */
4124 {
4125 /* Some optional arguments can be treated as a preceding (*MARK) */
4126
4127 if (verbs[i].has_arg < 0)
4128 {
4129 add_after_mark = verbs[i].meta;
4130 *parsed_pattern++ = META_MARK;
4131 }
4132
4133 /* The remaining verbs with arguments (except *MARK) need a different
4134 opcode. */
4135
4136 else
4137 {
4138 *parsed_pattern++ = verbs[i].meta +
4139 ((verbs[i].meta != META_MARK)? 0x00010000u:0);
4140 }
4141
4142 /* Set up for reading the name in the main loop. */
4143
4144 verblengthptr = parsed_pattern++;
4145 verbnamestart = ptr;
4146 inverbname = TRUE;
4147 }
4148 else /* No verb "name" argument */
4149 {
4150 *parsed_pattern++ = verbs[i].meta;
4151 }
4152 } /* End of (*VERB) handling */
4153 break; /* Done with this parenthesis */
4154 } /* End of groups that don't start with (? */
4155
4156
4157 /* ---- Items starting (? ---- */
4158
4159 /* The type of item is determined by what follows (?. Handle (?| and option
4160 changes under "default" because both need a new block on the nest stack.
4161 Comments starting with (?# are handled above. Note that there is some
4162 ambiguity about the sequence (?- because if a digit follows it's a relative
4163 recursion or subroutine call whereas otherwise it's an option unsetting. */
4164
4165 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4166
4167 switch(*ptr)
4168 {
4169 default:
4170 if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1]))
4171 goto RECURSION_BYNUMBER; /* The + case is handled by CHAR_PLUS */
4172
4173 /* We now have either (?| or a (possibly empty) option setting,
4174 optionally followed by a non-capturing group. */
4175
4176 nest_depth++;
4177 if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4178 else if (++top_nest >= end_nests)
4179 {
4180 errorcode = ERR84;
4181 goto FAILED;
4182 }
4183 top_nest->nest_depth = nest_depth;
4184 top_nest->flags = 0;
4185 top_nest->options = options & PARSE_TRACKED_OPTIONS;
4186 top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4187
4188 /* Start of non-capturing group that resets the capture count for each
4189 branch. */
4190
4191 if (*ptr == CHAR_VERTICAL_LINE)
4192 {
4193 top_nest->reset_group = (uint16_t)cb->bracount;
4194 top_nest->max_group = (uint16_t)cb->bracount;
4195 top_nest->flags |= NSF_RESET;
4196 cb->external_flags |= PCRE2_DUPCAPUSED;
4197 *parsed_pattern++ = META_NOCAPTURE;
4198 ptr++;
4199 }
4200
4201 /* Scan for options aimnrsxJU (including the two-character forms aD, aP, aS, aT, aW, and xx) to be set or unset. */
4202
4203 else
4204 {
4205 BOOL hyphenok = TRUE;
4206 uint32_t oldoptions = options;
4207 uint32_t oldxoptions = xoptions;
4208
4209 top_nest->reset_group = 0;
4210 top_nest->max_group = 0;
4211 set = unset = 0;
4212 optset = &set;
4213 xset = xunset = 0;
4214 xoptset = &xset;
4215
4216 /* ^ at the start unsets irmnsx and disables the subsequent use of - */
4217
4218 if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)
4219 {
4220 options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE|
4221 PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE);
4222 xoptions &= ~(PCRE2_EXTRA_CASELESS_RESTRICT);
4223 hyphenok = FALSE;
4224 ptr++;
4225 }
4226
4227 while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&
4228 *ptr != CHAR_COLON)
4229 {
4230 switch (*ptr++)
4231 {
4232 case CHAR_MINUS:
4233 if (!hyphenok)
4234 {
4235 errorcode = ERR94;
4236 ptr--; /* Correct the offset */
4237 goto FAILED;
4238 }
4239 optset = &unset;
4240 xoptset = &xunset;
4241 hyphenok = FALSE;
4242 break;
4243
4244 /* There are some two-character sequences that start with 'a'. */
4245
4246 case CHAR_a:
4247 if (ptr < ptrend)
4248 {
4249 if (*ptr == CHAR_D)
4250 {
4251 *xoptset |= PCRE2_EXTRA_ASCII_BSD;
4252 ptr++;
4253 break;
4254 }
4255 if (*ptr == CHAR_P)
4256 {
4257 *xoptset |= (PCRE2_EXTRA_ASCII_POSIX|PCRE2_EXTRA_ASCII_DIGIT);
4258 ptr++;
4259 break;
4260 }
4261 if (*ptr == CHAR_S)
4262 {
4263 *xoptset |= PCRE2_EXTRA_ASCII_BSS;
4264 ptr++;
4265 break;
4266 }
4267 if (*ptr == CHAR_T)
4268 {
4269 *xoptset |= PCRE2_EXTRA_ASCII_DIGIT;
4270 ptr++;
4271 break;
4272 }
4273 if (*ptr == CHAR_W)
4274 {
4275 *xoptset |= PCRE2_EXTRA_ASCII_BSW;
4276 ptr++;
4277 break;
4278 }
4279 }
4280 *xoptset |= PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|
4281 PCRE2_EXTRA_ASCII_BSW|
4282 PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_ASCII_POSIX;
4283 break;
4284
4285 case CHAR_J: /* Record that it changed in the external options */
4286 *optset |= PCRE2_DUPNAMES;
4287 cb->external_flags |= PCRE2_JCHANGED;
4288 break;
4289
4290 case CHAR_i: *optset |= PCRE2_CASELESS; break;
4291 case CHAR_m: *optset |= PCRE2_MULTILINE; break;
4292 case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break;
4293 case CHAR_r: *xoptset|= PCRE2_EXTRA_CASELESS_RESTRICT; break;
4294 case CHAR_s: *optset |= PCRE2_DOTALL; break;
4295 case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
4296
4297 /* If x appears twice it sets the extended extended option. */
4298
4299 case CHAR_x:
4300 *optset |= PCRE2_EXTENDED;
4301 if (ptr < ptrend && *ptr == CHAR_x)
4302 {
4303 *optset |= PCRE2_EXTENDED_MORE;
4304 ptr++;
4305 }
4306 break;
4307
4308 default:
4309 errorcode = ERR11;
4310 ptr--; /* Correct the offset */
4311 goto FAILED;
4312 }
4313 }
4314
4315 /* If we are setting extended without extended-more, ensure that any
4316 existing extended-more gets unset. Also, unsetting extended must also
4317 unset extended-more. */
4318
4319 if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED ||
4320 (unset & PCRE2_EXTENDED) != 0)
4321 unset |= PCRE2_EXTENDED_MORE;
4322
4323 options = (options | set) & (~unset);
4324 xoptions = (xoptions | xset) & (~xunset);
4325
4326 /* If the options ended with ')' this is not the start of a nested
4327 group with option changes, so the options change at this level.
4328 In this case, if the previous level set up a nest block, discard the
4329 one we have just created. Otherwise adjust it for the previous level.
4330 If the options ended with ':' we are starting a non-capturing group,
4331 possibly with an options setting. */
4332
4333 if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4334 if (*ptr++ == CHAR_RIGHT_PARENTHESIS)
4335 {
4336 nest_depth--; /* This is not a nested group after all. */
4337 if (top_nest > (nest_save *)(cb->start_workspace) &&
4338 (top_nest-1)->nest_depth == nest_depth) top_nest--;
4339 else top_nest->nest_depth = nest_depth;
4340 }
4341 else *parsed_pattern++ = META_NOCAPTURE;
4342
4343 /* If nothing changed, no need to record. */
4344
4345 if (options != oldoptions || xoptions != oldxoptions)
4346 {
4347 *parsed_pattern++ = META_OPTIONS;
4348 *parsed_pattern++ = options;
4349 *parsed_pattern++ = xoptions;
4350 }
4351 } /* End options processing */
4352 break; /* End default case after (? */
4353
4354
4355 /* ---- Python syntax support ---- */
4356
4357 case CHAR_P:
4358 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4359
4360 /* (?P<name> is the same as (?<name>, which defines a named group. */
4361
4362 if (*ptr == CHAR_LESS_THAN_SIGN)
4363 {
4364 terminator = CHAR_GREATER_THAN_SIGN;
4365 goto DEFINE_NAME;
4366 }
4367
4368 /* (?P>name) is the same as (?&name), which is a recursion or subroutine
4369 call. */
4370
4371 if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME;
4372
4373 /* (?P=name) is the same as \k<name>, a back reference by name. Anything
4374 else after (?P is an error. */
4375
4376 if (*ptr != CHAR_EQUALS_SIGN)
4377 {
4378 errorcode = ERR41;
4379 goto FAILED;
4380 }
4381 if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4382 &namelen, &errorcode, cb)) goto FAILED;
4383 *parsed_pattern++ = META_BACKREF_BYNAME;
4384 *parsed_pattern++ = namelen;
4385 PUTOFFSET(offset, parsed_pattern);
4386 okquantifier = TRUE;
4387 break; /* End of (?P processing */
4388
4389
4390 /* ---- Recursion/subroutine calls by number ---- */
4391
4392 case CHAR_R:
4393 i = 0; /* (?R) == (?R0) */
4394 ptr++;
4395 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4396 {
4397 errorcode = ERR58;
4398 goto FAILED;
4399 }
4400 goto SET_RECURSION;
4401
4402 /* An item starting (?- followed by a digit comes here via the "default"
4403 case because (?- followed by a non-digit is an options setting. */
4404
4405 case CHAR_PLUS:
4406 if (ptrend - ptr < 2 || !IS_DIGIT(ptr[1]))
4407 {
4408 errorcode = ERR29; /* Missing number */
4409 goto FAILED;
4410 }
4411 /* Fall through */
4412
4413 case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
4414 case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
4415 RECURSION_BYNUMBER:
4416 if (!read_number(&ptr, ptrend,
4417 (IS_DIGIT(*ptr))? -1:(int)(cb->bracount), /* + and - are relative */
4418 MAX_GROUP_NUMBER, ERR61,
4419 &i, &errorcode)) goto FAILED;
4420 if (i < 0) /* NB (?0) is permitted */
4421 {
4422 errorcode = ERR15; /* Unknown group */
4423 goto FAILED_BACK;
4424 }
4425 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4426 goto UNCLOSED_PARENTHESIS;
4427
4428 SET_RECURSION:
4429 *parsed_pattern++ = META_RECURSE | (uint32_t)i;
4430 offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4431 ptr++;
4432 PUTOFFSET(offset, parsed_pattern);
4433 okquantifier = TRUE;
4434 break; /* End of recursive call by number handling */
4435
4436
4437 /* ---- Recursion/subroutine calls by name ---- */
4438
4439 case CHAR_AMPERSAND:
4440 RECURSE_BY_NAME:
4441 if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4442 &namelen, &errorcode, cb)) goto FAILED;
4443 *parsed_pattern++ = META_RECURSE_BYNAME;
4444 *parsed_pattern++ = namelen;
4445 PUTOFFSET(offset, parsed_pattern);
4446 okquantifier = TRUE;
4447 break;
4448
4449 /* ---- Callout with numerical or string argument ---- */
4450
4451 case CHAR_C:
4452 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4453
4454 /* If the previous item was a condition starting (?(? an assertion,
4455 optionally preceded by a callout, is expected. This is checked later on,
4456 during actual compilation. However we need to identify this kind of
4457 assertion in this pass because it must not be quantified. The value of
4458 expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4459 for a callout - still leaving a positive value that identifies the
4460 assertion. Multiple callouts or any other items will make it zero or
4461 less, which doesn't matter because they will cause an error later. */
4462
4463 expect_cond_assert = prev_expect_cond_assert - 1;
4464
4465 /* If previous_callout is not NULL, it means this follows a previous
4466 callout. If it was a manual callout, do nothing; this means its "length
4467 of next pattern item" field will remain zero. If it was an automatic
4468 callout, abolish it. */
4469
4470 if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 &&
4471 previous_callout == parsed_pattern - 4 &&
4472 parsed_pattern[-1] == 255)
4473 parsed_pattern = previous_callout;
4474
4475 /* Save for updating next pattern item length, and skip one item before
4476 completing. */
4477
4478 previous_callout = parsed_pattern;
4479 after_manual_callout = 1;
4480
4481 /* Handle a string argument; specific delimiter is required. */
4482
4483 if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
4484 {
4485 PCRE2_SIZE calloutlength;
4486 PCRE2_SPTR startptr = ptr;
4487
4488 delimiter = 0;
4489 for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
4490 {
4491 if (*ptr == PRIV(callout_start_delims)[i])
4492 {
4493 delimiter = PRIV(callout_end_delims)[i];
4494 break;
4495 }
4496 }
4497 if (delimiter == 0)
4498 {
4499 errorcode = ERR82;
4500 goto FAILED;
4501 }
4502
4503 *parsed_pattern = META_CALLOUT_STRING;
4504 parsed_pattern += 3; /* Skip pattern info */
4505
4506 for (;;)
4507 {
4508 if (++ptr >= ptrend)
4509 {
4510 errorcode = ERR81;
4511 ptr = startptr; /* To give a more useful message */
4512 goto FAILED;
4513 }
4514 if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter))
4515 break;
4516 }
4517
4518 calloutlength = (PCRE2_SIZE)(ptr - startptr);
4519 if (calloutlength > UINT32_MAX)
4520 {
4521 errorcode = ERR72;
4522 goto FAILED;
4523 }
4524 *parsed_pattern++ = (uint32_t)calloutlength;
4525 offset = (PCRE2_SIZE)(startptr - cb->start_pattern);
4526 PUTOFFSET(offset, parsed_pattern);
4527 }
4528
4529 /* Handle a callout with an optional numerical argument, which must be
4530 less than or equal to 255. A missing argument gives 0. */
4531
4532 else
4533 {
4534 int n = 0;
4535 *parsed_pattern = META_CALLOUT_NUMBER; /* Numerical callout */
4536 parsed_pattern += 3; /* Skip pattern info */
4537 while (ptr < ptrend && IS_DIGIT(*ptr))
4538 {
4539 n = n * 10 + *ptr++ - CHAR_0;
4540 if (n > 255)
4541 {
4542 errorcode = ERR38;
4543 goto FAILED;
4544 }
4545 }
4546 *parsed_pattern++ = n;
4547 }
4548
4549 /* Both formats must have a closing parenthesis */
4550
4551 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4552 {
4553 errorcode = ERR39;
4554 goto FAILED;
4555 }
4556 ptr++;
4557
4558 /* Remember the offset to the next item in the pattern, and set a default
4559 length. This should get updated after the next item is read. */
4560
4561 previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
4562 previous_callout[2] = 0;
4563 break; /* End callout */
4564
4565
4566 /* ---- Conditional group ---- */
4567
4568 /* A condition can be an assertion, a number (referring to a numbered
4569 group's having been set), a name (referring to a named group), or 'R',
4570 referring to overall recursion. R<digits> and R&name are also permitted
4571 for recursion state tests. Numbers may be preceded by + or - to specify a
4572 relative group number.
4573
4574 There are several syntaxes for testing a named group: (?(name)) is used
4575 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4576
4577 There are two unfortunate ambiguities. 'R' can be the recursive thing or
4578 the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be
4579 the Perl DEFINE feature or the Python named test. We look for a name
4580 first; if not found, we try the other case.
4581
4582 For compatibility with auto-callouts, we allow a callout to be specified
4583 before a condition that is an assertion. */
4584
4585 case CHAR_LEFT_PARENTHESIS:
4586 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4587 nest_depth++;
4588
4589 /* If the next character is ? or * there must be an assertion next
4590 (optionally preceded by a callout). We do not check this here, but
4591 instead we set expect_cond_assert to 2. If this is still greater than
4592 zero (callouts decrement it) when the next assertion is read, it will be
4593 marked as a condition that must not be repeated. A value greater than
4594 zero also causes checking that an assertion (possibly with callout)
4595 follows. */
4596
4597 if (*ptr == CHAR_QUESTION_MARK || *ptr == CHAR_ASTERISK)
4598 {
4599 *parsed_pattern++ = META_COND_ASSERT;
4600 ptr--; /* Pull pointer back to the opening parenthesis. */
4601 expect_cond_assert = 2;
4602 break; /* End of conditional */
4603 }
4604
4605 /* Handle (?([+-]number)... */
4606
4607 if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
4608 &errorcode))
4609 {
4610 if (i <= 0)
4611 {
4612 errorcode = ERR15;
4613 goto FAILED;
4614 }
4615 *parsed_pattern++ = META_COND_NUMBER;
4616 offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4617 PUTOFFSET(offset, parsed_pattern);
4618 *parsed_pattern++ = i;
4619 }
4620 else if (errorcode != 0) goto FAILED; /* Number too big */
4621
4622 /* No number found. Handle the special case (?(VERSION[>]=n.m)... */
4623
4624 else if (ptrend - ptr >= 10 &&
4625 PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
4626 ptr[7] != CHAR_RIGHT_PARENTHESIS)
4627 {
4628 uint32_t ge = 0;
4629 int major = 0;
4630 int minor = 0;
4631
4632 ptr += 7;
4633 if (*ptr == CHAR_GREATER_THAN_SIGN)
4634 {
4635 ge = 1;
4636 ptr++;
4637 }
4638
4639 /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
4640 references its argument twice. */
4641
4642 if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
4643 goto BAD_VERSION_CONDITION;
4644
4645 if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode))
4646 goto FAILED;
4647
4648 if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4649 if (*ptr == CHAR_DOT)
4650 {
4651 if (++ptr >= ptrend || !IS_DIGIT(*ptr)) goto BAD_VERSION_CONDITION;
4652 minor = (*ptr++ - CHAR_0) * 10;
4653 if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4654 if (IS_DIGIT(*ptr)) minor += *ptr++ - CHAR_0;
4655 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4656 goto BAD_VERSION_CONDITION;
4657 }
4658
4659 *parsed_pattern++ = META_COND_VERSION;
4660 *parsed_pattern++ = ge;
4661 *parsed_pattern++ = major;
4662 *parsed_pattern++ = minor;
4663 }
4664
4665 /* All the remaining cases now require us to read a name. We cannot at
4666 this stage distinguish ambiguous cases such as (?(R12) which might be a
4667 recursion test by number or a name, because the named groups have not yet
4668 all been identified. Those cases are treated as names, but given a
4669 different META code. */
4670
4671 else
4672 {
4673 BOOL was_r_ampersand = FALSE;
4674
4675 if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND)
4676 {
4677 terminator = CHAR_RIGHT_PARENTHESIS;
4678 was_r_ampersand = TRUE;
4679 ptr++;
4680 }
4681 else if (*ptr == CHAR_LESS_THAN_SIGN)
4682 terminator = CHAR_GREATER_THAN_SIGN;
4683 else if (*ptr == CHAR_APOSTROPHE)
4684 terminator = CHAR_APOSTROPHE;
4685 else
4686 {
4687 terminator = CHAR_RIGHT_PARENTHESIS;
4688 ptr--; /* Point to char before name */
4689 }
4690 if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4691 &errorcode, cb)) goto FAILED;
4692
4693 /* Handle (?(R&name) */
4694
4695 if (was_r_ampersand)
4696 {
4697 *parsed_pattern = META_COND_RNAME;
4698 ptr--; /* Back to closing parens */
4699 }
4700
4701 /* Handle (?(name). If the name is "DEFINE" we identify it with a
4702 special code. Likewise if the name consists of R followed only by
4703 digits. Otherwise, handle it like a quoted name. */
4704
4705 else if (terminator == CHAR_RIGHT_PARENTHESIS)
4706 {
4707 if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
4708 *parsed_pattern = META_COND_DEFINE;
4709 else
4710 {
4711 for (i = 1; i < (int)namelen; i++)
4712 if (!IS_DIGIT(name[i])) break;
4713 *parsed_pattern = (*name == CHAR_R && i >= (int)namelen)?
4714 META_COND_RNUMBER : META_COND_NAME;
4715 }
4716 ptr--; /* Back to closing parens */
4717 }
4718
4719 /* Handle (?('name') or (?(<name>) */
4720
4721 else *parsed_pattern = META_COND_NAME;
4722
4723 /* All these cases except DEFINE end with the name length and offset;
4724 DEFINE just has an offset (for the "too many branches" error). */
4725
4726 if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen;
4727 PUTOFFSET(offset, parsed_pattern);
4728 } /* End cases that read a name */
4729
4730 /* Check the closing parenthesis of the condition */
4731
4732 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4733 {
4734 errorcode = ERR24;
4735 goto FAILED;
4736 }
4737 ptr++;
4738 break; /* End of condition processing */
4739
4740
4741 /* ---- Atomic group ---- */
4742
4743 case CHAR_GREATER_THAN_SIGN:
4744 ATOMIC_GROUP: /* Come from (*atomic: */
4745 *parsed_pattern++ = META_ATOMIC;
4746 nest_depth++;
4747 ptr++;
4748 break;
4749
4750
4751 /* ---- Lookahead assertions ---- */
4752
4753 case CHAR_EQUALS_SIGN:
4754 POSITIVE_LOOK_AHEAD: /* Come from (*pla: */
4755 *parsed_pattern++ = META_LOOKAHEAD;
4756 ptr++;
4757 goto POST_ASSERTION;
4758
4759 case CHAR_ASTERISK:
4760 POSITIVE_NONATOMIC_LOOK_AHEAD: /* Come from (?* */
4761 *parsed_pattern++ = META_LOOKAHEAD_NA;
4762 ptr++;
4763 goto POST_ASSERTION;
4764
4765 case CHAR_EXCLAMATION_MARK:
4766 NEGATIVE_LOOK_AHEAD: /* Come from (*nla: */
4767 *parsed_pattern++ = META_LOOKAHEADNOT;
4768 ptr++;
4769 goto POST_ASSERTION;
4770
4771
4772 /* ---- Lookbehind assertions ---- */
4773
4774 /* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?<
4775 is the start of the name of a capturing group. */
4776
4777 case CHAR_LESS_THAN_SIGN:
4778 if (ptrend - ptr <= 1 ||
4779 (ptr[1] != CHAR_EQUALS_SIGN &&
4780 ptr[1] != CHAR_EXCLAMATION_MARK &&
4781 ptr[1] != CHAR_ASTERISK))
4782 {
4783 terminator = CHAR_GREATER_THAN_SIGN;
4784 goto DEFINE_NAME;
4785 }
4786 *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
4787 META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)?
4788 META_LOOKBEHINDNOT : META_LOOKBEHIND_NA;
4789
4790 POST_LOOKBEHIND: /* Come from (*plb: (*naplb: and (*nlb: */
4791 *has_lookbehind = TRUE;
4792 offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4793 PUTOFFSET(offset, parsed_pattern);
4794 ptr += 2;
4795 /* Fall through */
4796
4797 /* If the previous item was a condition starting (?(? an assertion,
4798 optionally preceded by a callout, is expected. This is checked later on,
4799 during actual compilation. However we need to identify this kind of
4800 assertion in this pass because it must not be qualified. The value of
4801 expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4802 for a callout - still leaving a positive value that identifies the
4803 assertion. Multiple callouts or any other items will make it zero or
4804 less, which doesn't matter because they will cause an error later. */
4805
4806 POST_ASSERTION:
4807 nest_depth++;
4808 if (prev_expect_cond_assert > 0)
4809 {
4810 if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4811 else if (++top_nest >= end_nests)
4812 {
4813 errorcode = ERR84;
4814 goto FAILED;
4815 }
4816 top_nest->nest_depth = nest_depth;
4817 top_nest->flags = NSF_CONDASSERT;
4818 top_nest->options = options & PARSE_TRACKED_OPTIONS;
4819 top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4820 }
4821 break;
4822
4823
4824 /* ---- Define a named group ---- */
4825
4826 /* A named group may be defined as (?'name') or (?<name>). In the latter
4827 case we jump to DEFINE_NAME from the disambiguation of (?< above with the
4828 terminator set to '>'. */
4829
4830 case CHAR_APOSTROPHE:
4831 terminator = CHAR_APOSTROPHE; /* Terminator */
4832
4833 DEFINE_NAME:
4834 if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4835 &errorcode, cb)) goto FAILED;
4836
4837 /* We have a name for this capturing group. It is also assigned a number,
4838 which is its primary means of identification. */
4839
4840 if (cb->bracount >= MAX_GROUP_NUMBER)
4841 {
4842 errorcode = ERR97;
4843 goto FAILED;
4844 }
4845 cb->bracount++;
4846 *parsed_pattern++ = META_CAPTURE | cb->bracount;
4847 nest_depth++;
4848
4849 /* Check not too many names */
4850
4851 if (cb->names_found >= MAX_NAME_COUNT)
4852 {
4853 errorcode = ERR49;
4854 goto FAILED;
4855 }
4856
4857 /* Adjust the entry size to accommodate the longest name found. */
4858
4859 if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
4860 cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
4861
4862 /* Scan the list to check for duplicates. For duplicate names, if the
4863 number is the same, break the loop, which causes the name to be
4864 discarded; otherwise, if DUPNAMES is not set, give an error.
4865 If it is set, allow the name with a different number, but continue
4866 scanning in case this is a duplicate with the same number. For
4867 non-duplicate names, give an error if the number is duplicated. */
4868
4869 isdupname = FALSE;
4870 ng = cb->named_groups;
4871 for (i = 0; i < cb->names_found; i++, ng++)
4872 {
4873 if (namelen == ng->length &&
4874 PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0)
4875 {
4876 if (ng->number == cb->bracount) break;
4877 if ((options & PCRE2_DUPNAMES) == 0)
4878 {
4879 errorcode = ERR43;
4880 goto FAILED;
4881 }
4882 isdupname = ng->isdup = TRUE; /* Mark as a duplicate */
4883 cb->dupnames = TRUE; /* Duplicate names exist */
4884 }
4885 else if (ng->number == cb->bracount)
4886 {
4887 errorcode = ERR65;
4888 goto FAILED;
4889 }
4890 }
4891
4892 if (i < cb->names_found) break; /* Ignore duplicate with same number */
4893
4894 /* Increase the list size if necessary */
4895
4896 if (cb->names_found >= cb->named_group_list_size)
4897 {
4898 uint32_t newsize = cb->named_group_list_size * 2;
4899 named_group *newspace =
4900 cb->cx->memctl.malloc(newsize * sizeof(named_group),
4901 cb->cx->memctl.memory_data);
4902 if (newspace == NULL)
4903 {
4904 errorcode = ERR21;
4905 goto FAILED;
4906 }
4907
4908 memcpy(newspace, cb->named_groups,
4909 cb->named_group_list_size * sizeof(named_group));
4910 if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
4911 cb->cx->memctl.free((void *)cb->named_groups,
4912 cb->cx->memctl.memory_data);
4913 cb->named_groups = newspace;
4914 cb->named_group_list_size = newsize;
4915 }
4916
4917 /* Add this name to the list */
4918
4919 cb->named_groups[cb->names_found].name = name;
4920 cb->named_groups[cb->names_found].length = (uint16_t)namelen;
4921 cb->named_groups[cb->names_found].number = cb->bracount;
4922 cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname;
4923 cb->names_found++;
4924 break;
4925 } /* End of (? switch */
4926 break; /* End of ( handling */
4927
4928
4929 /* ---- Branch terminators ---- */
4930
4931 /* Alternation: reset the capture count if we are in a (?| group. */
4932
4933 case CHAR_VERTICAL_LINE:
4934 if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
4935 (top_nest->flags & NSF_RESET) != 0)
4936 {
4937 if (cb->bracount > top_nest->max_group)
4938 top_nest->max_group = (uint16_t)cb->bracount;
4939 cb->bracount = top_nest->reset_group;
4940 }
4941 *parsed_pattern++ = META_ALT;
4942 break;
4943
4944 /* End of group; reset the capture count to the maximum if we are in a (?|
4945 group and/or reset the options that are tracked during parsing. Disallow
4946 quantifier for a condition that is an assertion. */
4947
4948 case CHAR_RIGHT_PARENTHESIS:
4949 okquantifier = TRUE;
4950 if (top_nest != NULL && top_nest->nest_depth == nest_depth)
4951 {
4952 options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options;
4953 xoptions = (xoptions & ~PARSE_TRACKED_EXTRA_OPTIONS) | top_nest->xoptions;
4954 if ((top_nest->flags & NSF_RESET) != 0 &&
4955 top_nest->max_group > cb->bracount)
4956 cb->bracount = top_nest->max_group;
4957 if ((top_nest->flags & NSF_CONDASSERT) != 0)
4958 okquantifier = FALSE;
4959
4960 if ((top_nest->flags & NSF_ATOMICSR) != 0)
4961 {
4962 *parsed_pattern++ = META_KET;
4963 }
4964
4965 if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
4966 else top_nest--;
4967 }
4968 if (nest_depth == 0) /* Unmatched closing parenthesis */
4969 {
4970 errorcode = ERR22;
4971 goto FAILED_BACK;
4972 }
4973 nest_depth--;
4974 *parsed_pattern++ = META_KET;
4975 break;
4976 } /* End of switch on pattern character */
4977 } /* End of main character scan loop */
4978
4979 /* End of pattern reached. Check for missing ) at the end of a verb name. */
4980
4981 if (inverbname && ptr >= ptrend)
4982 {
4983 errorcode = ERR60;
4984 goto FAILED;
4985 }
4986
4987 /* Manage callout for the final item */
4988
4989 PARSED_END:
4990 parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
4991 parsed_pattern, cb);
4992
4993 /* Insert trailing items for word and line matching (features provided for the
4994 benefit of pcre2grep). */
4995
4996 if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
4997 {
4998 *parsed_pattern++ = META_KET;
4999 *parsed_pattern++ = META_DOLLAR;
5000 }
5001 else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
5002 {
5003 *parsed_pattern++ = META_KET;
5004 *parsed_pattern++ = META_ESCAPE + ESC_b;
5005 }
5006
5007 /* Terminate the parsed pattern, then return success if all groups are closed.
5008 Otherwise we have unclosed parentheses. */
5009
5010 if (parsed_pattern >= parsed_pattern_end)
5011 {
5012 errorcode = ERR63; /* Internal error (parsed pattern overflow) */
5013 goto FAILED;
5014 }
5015
5016 *parsed_pattern = META_END;
5017 if (nest_depth == 0) return 0;
5018
5019 UNCLOSED_PARENTHESIS:
5020 errorcode = ERR14;
5021
5022 /* Come here for all failures. */
5023
5024 FAILED:
5025 cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);
5026 return errorcode;
5027
5028 /* Some errors need to indicate the previous character. */
5029
5030 FAILED_BACK:
5031 ptr--;
5032 goto FAILED;
5033
5034 /* This failure happens several times. */
5035
5036 BAD_VERSION_CONDITION:
5037 errorcode = ERR79;
5038 goto FAILED;
5039 }
5040
5041
5042
5043 /*************************************************
5044 * Find first significant opcode *
5045 *************************************************/
5046
5047 /* This is called by several functions that scan a compiled expression looking
5048 for a fixed first character, or an anchoring opcode etc. It skips over things
5049 that do not influence this. For some calls, it makes sense to skip negative
5050 forward and all backward assertions, and also the \b assertion; for others it
5051 does not.
5052
5053 Arguments:
5054 code pointer to the start of the group
5055 skipassert TRUE if certain assertions are to be skipped
5056
5057 Returns: pointer to the first significant opcode
5058 */
5059
5060 static const PCRE2_UCHAR*
first_significant_code(PCRE2_SPTR code,BOOL skipassert)5061 first_significant_code(PCRE2_SPTR code, BOOL skipassert)
5062 {
5063 for (;;)
5064 {
5065 switch ((int)*code)
5066 {
5067 case OP_ASSERT_NOT:
5068 case OP_ASSERTBACK:
5069 case OP_ASSERTBACK_NOT:
5070 case OP_ASSERTBACK_NA:
5071 if (!skipassert) return code;
5072 do code += GET(code, 1); while (*code == OP_ALT);
5073 code += PRIV(OP_lengths)[*code];
5074 break;
5075
5076 case OP_WORD_BOUNDARY:
5077 case OP_NOT_WORD_BOUNDARY:
5078 case OP_UCP_WORD_BOUNDARY:
5079 case OP_NOT_UCP_WORD_BOUNDARY:
5080 if (!skipassert) return code;
5081 /* Fall through */
5082
5083 case OP_CALLOUT:
5084 case OP_CREF:
5085 case OP_DNCREF:
5086 case OP_RREF:
5087 case OP_DNRREF:
5088 case OP_FALSE:
5089 case OP_TRUE:
5090 code += PRIV(OP_lengths)[*code];
5091 break;
5092
5093 case OP_CALLOUT_STR:
5094 code += GET(code, 1 + 2*LINK_SIZE);
5095 break;
5096
5097 case OP_SKIPZERO:
5098 code += 2 + GET(code, 2) + LINK_SIZE;
5099 break;
5100
5101 case OP_COND:
5102 case OP_SCOND:
5103 if (code[1+LINK_SIZE] != OP_FALSE || /* Not DEFINE */
5104 code[GET(code, 1)] != OP_KET) /* More than one branch */
5105 return code;
5106 code += GET(code, 1) + 1 + LINK_SIZE;
5107 break;
5108
5109 case OP_MARK:
5110 case OP_COMMIT_ARG:
5111 case OP_PRUNE_ARG:
5112 case OP_SKIP_ARG:
5113 case OP_THEN_ARG:
5114 code += code[1] + PRIV(OP_lengths)[*code];
5115 break;
5116
5117 default:
5118 return code;
5119 }
5120 }
5121 /* Control never reaches here */
5122 }
5123
5124
5125
5126 #ifdef SUPPORT_UNICODE
5127 /*************************************************
5128 * Get othercase range *
5129 *************************************************/
5130
5131 /* This function is passed the start and end of a class range in UCP mode. For
5132 single characters the range may be just one character long. The function
5133 searches up the characters, looking for ranges of characters in the "other"
5134 case. Each call returns the next one, updating the start address. A character
5135 with multiple other cases is returned on its own with a special return value.
5136
5137 Arguments:
5138 cptr points to starting character value; updated
5139 d end value
5140 ocptr where to put start of othercase range
5141 odptr where to put end of othercase range
5142 restricted TRUE if caseless restriction applies
5143
5144 Yield: -1 when no more
5145 0 when a range is returned
5146 >0 the CASESET offset for char with multiple other cases;
5147 for this return, *ocptr contains the original
5148 */
5149
5150 static int
get_othercase_range(uint32_t * cptr,uint32_t d,uint32_t * ocptr,uint32_t * odptr,BOOL restricted)5151 get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr,
5152 uint32_t *odptr, BOOL restricted)
5153 {
5154 uint32_t c, othercase, next;
5155 unsigned int co;
5156
5157 /* Find the first character that has an other case. If it has multiple other
5158 cases, return its case offset value. When CASELESS_RESTRICT is set, ignore the
5159 multi-case entries that begin with ASCII values. In 32-bit mode, a value
5160 greater than the Unicode maximum ends the range. */
5161
5162 for (c = *cptr; c <= d; c++)
5163 {
5164 #if PCRE2_CODE_UNIT_WIDTH == 32
5165 if (c > MAX_UTF_CODE_POINT) return -1;
5166 #endif
5167 if ((co = UCD_CASESET(c)) != 0 &&
5168 (!restricted || PRIV(ucd_caseless_sets)[co] > 127))
5169 {
5170 *ocptr = c++; /* Character that has the set */
5171 *cptr = c; /* Rest of input range */
5172 return (int)co;
5173 }
5174
5175 /* This is not a valid multiple-case character. Check that the single other
5176 case is different to the original. We don't need to check "restricted" here
5177 because the non-ASCII characters with multiple cases that include an ASCII
5178 character don't have a different "othercase". */
5179
5180 if ((othercase = UCD_OTHERCASE(c)) != c) break;
5181 }
5182
5183 if (c > d) return -1; /* Reached end of range */
5184
5185 /* Found a character that has a single other case. Search for the end of the
5186 range, which is either the end of the input range, or a character that has zero
5187 or more than one other cases. */
5188
5189 *ocptr = othercase;
5190 next = othercase + 1;
5191
5192 for (++c; c <= d; c++)
5193 {
5194 if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
5195 next++;
5196 }
5197
5198 *odptr = next - 1; /* End of othercase range */
5199 *cptr = c; /* Rest of input range */
5200 return 0;
5201 }
5202 #endif /* SUPPORT_UNICODE */
5203
5204
5205
5206 /*************************************************
5207 * Add a character or range to a class (internal) *
5208 *************************************************/
5209
5210 /* This function packages up the logic of adding a character or range of
5211 characters to a class. The character values in the arguments will be within the
5212 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
5213 called only from within the "add to class" group of functions, some of which
5214 are recursive and mutually recursive. The external entry point is
5215 add_to_class().
5216
5217 Arguments:
5218 classbits the bit map for characters < 256
5219 uchardptr points to the pointer for extra data
5220 options the options bits
5221 xoptions the extra options bits
5222 cb compile data
5223 start start of range character
5224 end end of range character
5225
5226 Returns: the number of < 256 characters added
5227 the pointer to extra data is updated
5228 */
5229
5230 static unsigned int
add_to_class_internal(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,uint32_t xoptions,compile_block * cb,uint32_t start,uint32_t end)5231 add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5232 uint32_t options, uint32_t xoptions, compile_block *cb, uint32_t start,
5233 uint32_t end)
5234 {
5235 uint32_t c;
5236 uint32_t classbits_end = (end <= 0xff ? end : 0xff);
5237 unsigned int n8 = 0;
5238
5239 /* If caseless matching is required, scan the range and process alternate
5240 cases. In Unicode, there are 8-bit characters that have alternate cases that
5241 are greater than 255 and vice-versa (though these may be ignored if caseless
5242 restriction is in force). Sometimes we can just extend the original range. */
5243
5244 if ((options & PCRE2_CASELESS) != 0)
5245 {
5246 #ifdef SUPPORT_UNICODE
5247 if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
5248 {
5249 int rc;
5250 uint32_t oc, od;
5251
5252 options &= ~PCRE2_CASELESS; /* Remove for recursive calls */
5253 c = start;
5254
5255 while ((rc = get_othercase_range(&c, end, &oc, &od,
5256 (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)) >= 0)
5257 {
5258 /* Handle a single character that has more than one other case. */
5259
5260 if (rc > 0) n8 += add_list_to_class_internal(classbits, uchardptr,
5261 options, xoptions, cb, PRIV(ucd_caseless_sets) + rc, oc);
5262
5263 /* Do nothing if the other case range is within the original range. */
5264
5265 else if (oc >= cb->class_range_start && od <= cb->class_range_end)
5266 continue;
5267
5268 /* Extend the original range if there is overlap, noting that if oc < c,
5269 we can't have od > end because a subrange is always shorter than the
5270 basic range. Otherwise, use a recursive call to add the additional range.
5271 */
5272
5273 else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
5274 else if (od > end && oc <= end + 1)
5275 {
5276 end = od; /* Extend upwards */
5277 if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
5278 }
5279 else n8 += add_to_class_internal(classbits, uchardptr, options, xoptions,
5280 cb, oc, od);
5281 }
5282 }
5283 else
5284 #else
5285 (void)xoptions; /* Avoid compiler warning */
5286 #endif /* SUPPORT_UNICODE */
5287
5288 /* Not UTF mode */
5289
5290 for (c = start; c <= classbits_end; c++)
5291 {
5292 SETBIT(classbits, cb->fcc[c]);
5293 n8++;
5294 }
5295 }
5296
5297 /* Now handle the originally supplied range. Adjust the final value according
5298 to the bit length - this means that the same lists of (e.g.) horizontal spaces
5299 can be used in all cases. */
5300
5301 if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR)
5302 end = MAX_NON_UTF_CHAR;
5303
5304 if (start > cb->class_range_start && end < cb->class_range_end) return n8;
5305
5306 /* Use the bitmap for characters < 256. Otherwise use extra data.*/
5307
5308 for (c = start; c <= classbits_end; c++)
5309 {
5310 /* Regardless of start, c will always be <= 255. */
5311 SETBIT(classbits, c);
5312 n8++;
5313 }
5314
5315 #ifdef SUPPORT_WIDE_CHARS
5316 if (start <= 0xff) start = 0xff + 1;
5317
5318 if (end >= start)
5319 {
5320 PCRE2_UCHAR *uchardata = *uchardptr;
5321
5322 #ifdef SUPPORT_UNICODE
5323 if ((options & PCRE2_UTF) != 0)
5324 {
5325 if (start < end)
5326 {
5327 *uchardata++ = XCL_RANGE;
5328 uchardata += PRIV(ord2utf)(start, uchardata);
5329 uchardata += PRIV(ord2utf)(end, uchardata);
5330 }
5331 else if (start == end)
5332 {
5333 *uchardata++ = XCL_SINGLE;
5334 uchardata += PRIV(ord2utf)(start, uchardata);
5335 }
5336 }
5337 else
5338 #endif /* SUPPORT_UNICODE */
5339
5340 /* Without UTF support, character values are constrained by the bit length,
5341 and can only be > 256 for 16-bit and 32-bit libraries. */
5342
5343 #if PCRE2_CODE_UNIT_WIDTH == 8
5344 {}
5345 #else
5346 if (start < end)
5347 {
5348 *uchardata++ = XCL_RANGE;
5349 *uchardata++ = start;
5350 *uchardata++ = end;
5351 }
5352 else if (start == end)
5353 {
5354 *uchardata++ = XCL_SINGLE;
5355 *uchardata++ = start;
5356 }
5357 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
5358 *uchardptr = uchardata; /* Updata extra data pointer */
5359 }
5360 #else /* SUPPORT_WIDE_CHARS */
5361 (void)uchardptr; /* Avoid compiler warning */
5362 #endif /* SUPPORT_WIDE_CHARS */
5363
5364 return n8; /* Number of 8-bit characters */
5365 }
5366
5367
5368
5369 #ifdef SUPPORT_UNICODE
5370 /*************************************************
5371 * Add a list of characters to a class (internal) *
5372 *************************************************/
5373
5374 /* This function is used for adding a list of case-equivalent characters to a
5375 class when in UTF mode. This function is called only from within
5376 add_to_class_internal(), with which it is mutually recursive.
5377
5378 Arguments:
5379 classbits the bit map for characters < 256
5380 uchardptr points to the pointer for extra data
5381 options the options bits
5382 xoptions the extra options bits
5383 cb contains pointers to tables etc.
5384 p points to row of 32-bit values, terminated by NOTACHAR
5385 except character to omit; this is used when adding lists of
5386 case-equivalent characters to avoid including the one we
5387 already know about
5388
5389 Returns: the number of < 256 characters added
5390 the pointer to extra data is updated
5391 */
5392
5393 static unsigned int
add_list_to_class_internal(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,uint32_t xoptions,compile_block * cb,const uint32_t * p,unsigned int except)5394 add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5395 uint32_t options, uint32_t xoptions, compile_block *cb, const uint32_t *p,
5396 unsigned int except)
5397 {
5398 unsigned int n8 = 0;
5399 while (p[0] < NOTACHAR)
5400 {
5401 unsigned int n = 0;
5402 if (p[0] != except)
5403 {
5404 while(p[n+1] == p[0] + n + 1) n++;
5405 n8 += add_to_class_internal(classbits, uchardptr, options, xoptions, cb,
5406 p[0], p[n]);
5407 }
5408 p += n + 1;
5409 }
5410 return n8;
5411 }
5412 #endif
5413
5414
5415
5416 /*************************************************
5417 * External entry point for add range to class *
5418 *************************************************/
5419
5420 /* This function sets the overall range so that the internal functions can try
5421 to avoid duplication when handling case-independence.
5422
5423 Arguments:
5424 classbits the bit map for characters < 256
5425 uchardptr points to the pointer for extra data
5426 options the options bits
5427 xoptions the extra options bits
5428 cb compile data
5429 start start of range character
5430 end end of range character
5431
5432 Returns: the number of < 256 characters added
5433 the pointer to extra data is updated
5434 */
5435
5436 static unsigned int
add_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,uint32_t xoptions,compile_block * cb,uint32_t start,uint32_t end)5437 add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5438 uint32_t xoptions, compile_block *cb, uint32_t start, uint32_t end)
5439 {
5440 cb->class_range_start = start;
5441 cb->class_range_end = end;
5442 return add_to_class_internal(classbits, uchardptr, options, xoptions, cb,
5443 start, end);
5444 }
5445
5446
5447 /*************************************************
5448 * External entry point for add list to class *
5449 *************************************************/
5450
5451 /* This function is used for adding a list of horizontal or vertical whitespace
5452 characters to a class. The list must be in order so that ranges of characters
5453 can be detected and handled appropriately. This function sets the overall range
5454 so that the internal functions can try to avoid duplication when handling
5455 case-independence.
5456
5457 Arguments:
5458 classbits the bit map for characters < 256
5459 uchardptr points to the pointer for extra data
5460 options the options bits
5461 xoptions the extra options bits
5462 cb contains pointers to tables etc.
5463 p points to row of 32-bit values, terminated by NOTACHAR
5464 except character to omit; this is used when adding lists of
5465 case-equivalent characters to avoid including the one we
5466 already know about
5467
5468 Returns: the number of < 256 characters added
5469 the pointer to extra data is updated
5470 */
5471
5472 static unsigned int
add_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,uint32_t xoptions,compile_block * cb,const uint32_t * p,unsigned int except)5473 add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5474 uint32_t xoptions, compile_block *cb, const uint32_t *p, unsigned int except)
5475 {
5476 unsigned int n8 = 0;
5477 while (p[0] < NOTACHAR)
5478 {
5479 unsigned int n = 0;
5480 if (p[0] != except)
5481 {
5482 while(p[n+1] == p[0] + n + 1) n++;
5483 cb->class_range_start = p[0];
5484 cb->class_range_end = p[n];
5485 n8 += add_to_class_internal(classbits, uchardptr, options, xoptions, cb,
5486 p[0], p[n]);
5487 }
5488 p += n + 1;
5489 }
5490 return n8;
5491 }
5492
5493
5494
5495 /*************************************************
5496 * Add characters not in a list to a class *
5497 *************************************************/
5498
5499 /* This function is used for adding the complement of a list of horizontal or
5500 vertical whitespace to a class. The list must be in order.
5501
5502 Arguments:
5503 classbits the bit map for characters < 256
5504 uchardptr points to the pointer for extra data
5505 options the options bits
5506 xoptions the extra options bits
5507 cb contains pointers to tables etc.
5508 p points to row of 32-bit values, terminated by NOTACHAR
5509
5510 Returns: the number of < 256 characters added
5511 the pointer to extra data is updated
5512 */
5513
5514 static unsigned int
add_not_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,uint32_t xoptions,compile_block * cb,const uint32_t * p)5515 add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5516 uint32_t options, uint32_t xoptions, compile_block *cb, const uint32_t *p)
5517 {
5518 BOOL utf = (options & PCRE2_UTF) != 0;
5519 unsigned int n8 = 0;
5520 if (p[0] > 0)
5521 n8 += add_to_class(classbits, uchardptr, options, xoptions, cb, 0, p[0] - 1);
5522 while (p[0] < NOTACHAR)
5523 {
5524 while (p[1] == p[0] + 1) p++;
5525 n8 += add_to_class(classbits, uchardptr, options, xoptions, cb, p[0] + 1,
5526 (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
5527 p++;
5528 }
5529 return n8;
5530 }
5531
5532
5533
5534 /*************************************************
5535 * Find details of duplicate group names *
5536 *************************************************/
5537
5538 /* This is called from compile_branch() when it needs to know the index and
5539 count of duplicates in the names table when processing named backreferences,
5540 either directly, or as conditions.
5541
5542 Arguments:
5543 name points to the name
5544 length the length of the name
5545 indexptr where to put the index
5546 countptr where to put the count of duplicates
5547 errorcodeptr where to put an error code
5548 cb the compile block
5549
5550 Returns: TRUE if OK, FALSE if not, error code set
5551 */
5552
5553 static BOOL
find_dupname_details(PCRE2_SPTR name,uint32_t length,int * indexptr,int * countptr,int * errorcodeptr,compile_block * cb)5554 find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr,
5555 int *countptr, int *errorcodeptr, compile_block *cb)
5556 {
5557 uint32_t i, groupnumber;
5558 int count;
5559 PCRE2_UCHAR *slot = cb->name_table;
5560
5561 /* Find the first entry in the table */
5562
5563 for (i = 0; i < cb->names_found; i++)
5564 {
5565 if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) == 0 &&
5566 slot[IMM2_SIZE+length] == 0) break;
5567 slot += cb->name_entry_size;
5568 }
5569
5570 /* This should not occur, because this function is called only when we know we
5571 have duplicate names. Give an internal error. */
5572
5573 if (i >= cb->names_found)
5574 {
5575 *errorcodeptr = ERR53;
5576 cb->erroroffset = name - cb->start_pattern;
5577 return FALSE;
5578 }
5579
5580 /* Record the index and then see how many duplicates there are, updating the
5581 backref map and maximum back reference as we do. */
5582
5583 *indexptr = i;
5584 count = 0;
5585
5586 for (;;)
5587 {
5588 count++;
5589 groupnumber = GET2(slot,0);
5590 cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
5591 if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
5592 if (++i >= cb->names_found) break;
5593 slot += cb->name_entry_size;
5594 if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) != 0 ||
5595 (slot+IMM2_SIZE)[length] != 0) break;
5596 }
5597
5598 *countptr = count;
5599 return TRUE;
5600 }
5601
5602
5603
5604 /*************************************************
5605 * Compile one branch *
5606 *************************************************/
5607
5608 /* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If
5609 the options are changed during the branch, the pointer is used to change the
5610 external options bits. This function is used during the pre-compile phase when
5611 we are trying to find out the amount of memory needed, as well as during the
5612 real compile phase. The value of lengthptr distinguishes the two phases.
5613
5614 Arguments:
5615 optionsptr pointer to the option bits
5616 xoptionsptr pointer to the extra option bits
5617 codeptr points to the pointer to the current code point
5618 pptrptr points to the current parsed pattern pointer
5619 errorcodeptr points to error code variable
5620 firstcuptr place to put the first required code unit
5621 firstcuflagsptr place to put the first code unit flags
5622 reqcuptr place to put the last required code unit
5623 reqcuflagsptr place to put the last required code unit flags
5624 bcptr points to current branch chain
5625 open_caps points to current capitem
5626 cb contains pointers to tables etc.
5627 lengthptr NULL during the real compile phase
5628 points to length accumulator during pre-compile phase
5629
5630 Returns: 0 There's been an error, *errorcodeptr is non-zero
5631 +1 Success, this branch must match at least one character
5632 -1 Success, this branch may match an empty string
5633 */
5634
5635 static int
compile_branch(uint32_t * optionsptr,uint32_t * xoptionsptr,PCRE2_UCHAR ** codeptr,uint32_t ** pptrptr,int * errorcodeptr,uint32_t * firstcuptr,uint32_t * firstcuflagsptr,uint32_t * reqcuptr,uint32_t * reqcuflagsptr,branch_chain * bcptr,open_capitem * open_caps,compile_block * cb,PCRE2_SIZE * lengthptr)5636 compile_branch(uint32_t *optionsptr, uint32_t *xoptionsptr,
5637 PCRE2_UCHAR **codeptr, uint32_t **pptrptr, int *errorcodeptr,
5638 uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
5639 uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
5640 compile_block *cb, PCRE2_SIZE *lengthptr)
5641 {
5642 int bravalue = 0;
5643 int okreturn = -1;
5644 int group_return = 0;
5645 uint32_t repeat_min = 0, repeat_max = 0; /* To please picky compilers */
5646 uint32_t greedy_default, greedy_non_default;
5647 uint32_t repeat_type, op_type;
5648 uint32_t options = *optionsptr; /* May change dynamically */
5649 uint32_t xoptions = *xoptionsptr; /* May change dynamically */
5650 uint32_t firstcu, reqcu;
5651 uint32_t zeroreqcu, zerofirstcu;
5652 uint32_t escape;
5653 uint32_t *pptr = *pptrptr;
5654 uint32_t meta, meta_arg;
5655 uint32_t firstcuflags, reqcuflags;
5656 uint32_t zeroreqcuflags, zerofirstcuflags;
5657 uint32_t req_caseopt, reqvary, tempreqvary;
5658 PCRE2_SIZE offset = 0;
5659 PCRE2_SIZE length_prevgroup = 0;
5660 PCRE2_UCHAR *code = *codeptr;
5661 PCRE2_UCHAR *last_code = code;
5662 PCRE2_UCHAR *orig_code = code;
5663 PCRE2_UCHAR *tempcode;
5664 PCRE2_UCHAR *previous = NULL;
5665 PCRE2_UCHAR op_previous;
5666 BOOL groupsetfirstcu = FALSE;
5667 BOOL had_accept = FALSE;
5668 BOOL matched_char = FALSE;
5669 BOOL previous_matched_char = FALSE;
5670 BOOL reset_caseful = FALSE;
5671 const uint8_t *cbits = cb->cbits;
5672 uint8_t classbits[32];
5673
5674 /* We can fish out the UTF setting once and for all into a BOOL, but we must
5675 not do this for other options (e.g. PCRE2_EXTENDED) that may change dynamically
5676 as we process the pattern. */
5677
5678 #ifdef SUPPORT_UNICODE
5679 BOOL utf = (options & PCRE2_UTF) != 0;
5680 BOOL ucp = (options & PCRE2_UCP) != 0;
5681 #else /* No Unicode support */
5682 BOOL utf = FALSE;
5683 #endif
5684
5685 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
5686 class_uchardata always so that it can be passed to add_to_class() always,
5687 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
5688 alternative calls for the different cases. */
5689
5690 PCRE2_UCHAR *class_uchardata;
5691 #ifdef SUPPORT_WIDE_CHARS
5692 BOOL xclass;
5693 PCRE2_UCHAR *class_uchardata_base;
5694 #endif
5695
5696 /* Set up the default and non-default settings for greediness */
5697
5698 greedy_default = ((options & PCRE2_UNGREEDY) != 0);
5699 greedy_non_default = greedy_default ^ 1;
5700
5701 /* Initialize no first unit, no required unit. REQ_UNSET means "no char
5702 matching encountered yet". It gets changed to REQ_NONE if we hit something that
5703 matches a non-fixed first unit; reqcu just remains unset if we never find one.
5704
5705 When we hit a repeat whose minimum is zero, we may have to adjust these values
5706 to take the zero repeat into account. This is implemented by setting them to
5707 zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
5708 item types that can be repeated set these backoff variables appropriately. */
5709
5710 firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
5711 firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
5712
5713 /* The variable req_caseopt contains either the REQ_CASELESS bit or zero,
5714 according to the current setting of the caseless flag. The REQ_CASELESS value
5715 leaves the lower 28 bits empty. It is added into the firstcu or reqcu variables
5716 to record the case status of the value. This is used only for ASCII characters.
5717 */
5718
5719 req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
5720
5721 /* Switch on next META item until the end of the branch */
5722
5723 for (;; pptr++)
5724 {
5725 #ifdef SUPPORT_WIDE_CHARS
5726 BOOL xclass_has_prop;
5727 #endif
5728 BOOL negate_class;
5729 BOOL should_flip_negation;
5730 BOOL match_all_or_no_wide_chars;
5731 BOOL possessive_quantifier;
5732 BOOL note_group_empty;
5733 int class_has_8bitchar;
5734 uint32_t mclength;
5735 uint32_t skipunits;
5736 uint32_t subreqcu, subfirstcu;
5737 uint32_t groupnumber;
5738 uint32_t verbarglen, verbculen;
5739 uint32_t subreqcuflags, subfirstcuflags;
5740 open_capitem *oc;
5741 PCRE2_UCHAR mcbuffer[8];
5742
5743 /* Get next META item in the pattern and its potential argument. */
5744
5745 meta = META_CODE(*pptr);
5746 meta_arg = META_DATA(*pptr);
5747
5748 /* If we are in the pre-compile phase, accumulate the length used for the
5749 previous cycle of this loop, unless the next item is a quantifier. */
5750
5751 if (lengthptr != NULL)
5752 {
5753 if (code > cb->start_workspace + cb->workspace_size -
5754 WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
5755 {
5756 *errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)?
5757 ERR52 : ERR86;
5758 return 0;
5759 }
5760
5761 /* There is at least one situation where code goes backwards: this is the
5762 case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier
5763 is processed, the whole class is eliminated. However, it is created first,
5764 so we have to allow memory for it. Therefore, don't ever reduce the length
5765 at this point. */
5766
5767 if (code < last_code) code = last_code;
5768
5769 /* If the next thing is not a quantifier, we add the length of the previous
5770 item into the total, and reset the code pointer to the start of the
5771 workspace. Otherwise leave the previous item available to be quantified. */
5772
5773 if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5774 {
5775 if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))
5776 {
5777 *errorcodeptr = ERR20; /* Integer overflow */
5778 return 0;
5779 }
5780 *lengthptr += (PCRE2_SIZE)(code - orig_code);
5781 if (*lengthptr > MAX_PATTERN_SIZE)
5782 {
5783 *errorcodeptr = ERR20; /* Pattern is too large */
5784 return 0;
5785 }
5786 code = orig_code;
5787 }
5788
5789 /* Remember where this code item starts so we can catch the "backwards"
5790 case above next time round. */
5791
5792 last_code = code;
5793 }
5794
5795 /* Process the next parsed pattern item. If it is not a quantifier, remember
5796 where it starts so that it can be quantified when a quantifier follows.
5797 Checking for the legality of quantifiers happens in parse_regex(), except for
5798 a quantifier after an assertion that is a condition. */
5799
5800 if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5801 {
5802 previous = code;
5803 if (matched_char && !had_accept) okreturn = 1;
5804 }
5805
5806 previous_matched_char = matched_char;
5807 matched_char = FALSE;
5808 note_group_empty = FALSE;
5809 skipunits = 0; /* Default value for most subgroups */
5810
5811 switch(meta)
5812 {
5813 /* ===================================================================*/
5814 /* The branch terminates at pattern end or | or ) */
5815
5816 case META_END:
5817 case META_ALT:
5818 case META_KET:
5819 *firstcuptr = firstcu;
5820 *firstcuflagsptr = firstcuflags;
5821 *reqcuptr = reqcu;
5822 *reqcuflagsptr = reqcuflags;
5823 *codeptr = code;
5824 *pptrptr = pptr;
5825 return okreturn;
5826
5827
5828 /* ===================================================================*/
5829 /* Handle single-character metacharacters. In multiline mode, ^ disables
5830 the setting of any following char as a first character. */
5831
5832 case META_CIRCUMFLEX:
5833 if ((options & PCRE2_MULTILINE) != 0)
5834 {
5835 if (firstcuflags == REQ_UNSET)
5836 zerofirstcuflags = firstcuflags = REQ_NONE;
5837 *code++ = OP_CIRCM;
5838 }
5839 else *code++ = OP_CIRC;
5840 break;
5841
5842 case META_DOLLAR:
5843 *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
5844 break;
5845
5846 /* There can never be a first char if '.' is first, whatever happens about
5847 repeats. The value of reqcu doesn't change either. */
5848
5849 case META_DOT:
5850 matched_char = TRUE;
5851 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5852 zerofirstcu = firstcu;
5853 zerofirstcuflags = firstcuflags;
5854 zeroreqcu = reqcu;
5855 zeroreqcuflags = reqcuflags;
5856 *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
5857 break;
5858
5859
5860 /* ===================================================================*/
5861 /* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.
5862 Otherwise, an initial ']' is taken as a data character. When empty classes
5863 are allowed, [] must always fail, so generate OP_FAIL, whereas [^] must
5864 match any character, so generate OP_ALLANY. */
5865
5866 case META_CLASS_EMPTY:
5867 case META_CLASS_EMPTY_NOT:
5868 matched_char = TRUE;
5869 *code++ = (meta == META_CLASS_EMPTY_NOT)? OP_ALLANY : OP_FAIL;
5870 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5871 zerofirstcu = firstcu;
5872 zerofirstcuflags = firstcuflags;
5873 break;
5874
5875
5876 /* ===================================================================*/
5877 /* Non-empty character class. If the included characters are all < 256, we
5878 build a 32-byte bitmap of the permitted characters, except in the special
5879 case where there is only one such character. For negated classes, we build
5880 the map as usual, then invert it at the end. However, we use a different
5881 opcode so that data characters > 255 can be handled correctly.
5882
5883 If the class contains characters outside the 0-255 range, a different
5884 opcode is compiled. It may optionally have a bit map for characters < 256,
5885 but those above are explicitly listed afterwards. A flag code unit tells
5886 whether the bitmap is present, and whether this is a negated class or
5887 not. */
5888
5889 case META_CLASS_NOT:
5890 case META_CLASS:
5891 matched_char = TRUE;
5892 negate_class = meta == META_CLASS_NOT;
5893
5894 /* We can optimize the case of a single character in a class by generating
5895 OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's
5896 negative. In the negative case there can be no first char if this item is
5897 first, whatever repeat count may follow. In the case of reqcu, save the
5898 previous value for reinstating. */
5899
5900 /* NOTE: at present this optimization is not effective if the only
5901 character in a class in 32-bit, non-UCP mode has its top bit set. */
5902
5903 if (pptr[1] < META_END && pptr[2] == META_CLASS_END)
5904 {
5905 #ifdef SUPPORT_UNICODE
5906 uint32_t d;
5907 #endif
5908 uint32_t c = pptr[1];
5909
5910 pptr += 2; /* Move on to class end */
5911 if (meta == META_CLASS) /* A positive one-char class can be */
5912 { /* handled as a normal literal character. */
5913 meta = c; /* Set up the character */
5914 goto NORMAL_CHAR_SET;
5915 }
5916
5917 /* Handle a negative one-character class */
5918
5919 zeroreqcu = reqcu;
5920 zeroreqcuflags = reqcuflags;
5921 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5922 zerofirstcu = firstcu;
5923 zerofirstcuflags = firstcuflags;
5924
5925 /* For caseless UTF or UCP mode, check whether this character has more
5926 than one other case. If so, generate a special OP_NOTPROP item instead of
5927 OP_NOTI. When restricted by PCRE2_EXTRA_CASELESS_RESTRICT, ignore any
5928 caseless set that starts with an ASCII character. */
5929
5930 #ifdef SUPPORT_UNICODE
5931 if ((utf||ucp) && (options & PCRE2_CASELESS) != 0 &&
5932 (d = UCD_CASESET(c)) != 0 &&
5933 ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) == 0 ||
5934 PRIV(ucd_caseless_sets)[d] > 127))
5935 {
5936 *code++ = OP_NOTPROP;
5937 *code++ = PT_CLIST;
5938 *code++ = d;
5939 break; /* We are finished with this class */
5940 }
5941 #endif
5942 /* Char has only one other (usable) case, or UCP not available */
5943
5944 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
5945 code += PUTCHAR(c, code);
5946 break; /* We are finished with this class */
5947 } /* End of 1-char optimization */
5948
5949 /* Handle character classes that contain more than just one literal
5950 character. If there are exactly two characters in a positive class, see if
5951 they are case partners. This can be optimized to generate a caseless single
5952 character match (which also sets first/required code units if relevant).
5953 When casing restrictions apply, ignore a caseless set if both characters
5954 are ASCII. */
5955
5956 if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END &&
5957 pptr[3] == META_CLASS_END)
5958 {
5959 uint32_t c = pptr[1];
5960
5961 #ifdef SUPPORT_UNICODE
5962 if (UCD_CASESET(c) == 0 ||
5963 ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
5964 c < 128 && pptr[2] < 128))
5965 #endif
5966 {
5967 uint32_t d;
5968
5969 #ifdef SUPPORT_UNICODE
5970 if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
5971 #endif
5972 {
5973 #if PCRE2_CODE_UNIT_WIDTH != 8
5974 if (c > 255) d = c; else
5975 #endif
5976 d = TABLE_GET(c, cb->fcc, c);
5977 }
5978
5979 if (c != d && pptr[2] == d)
5980 {
5981 pptr += 3; /* Move on to class end */
5982 meta = c;
5983 if ((options & PCRE2_CASELESS) == 0)
5984 {
5985 reset_caseful = TRUE;
5986 options |= PCRE2_CASELESS;
5987 req_caseopt = REQ_CASELESS;
5988 }
5989 goto CLASS_CASELESS_CHAR;
5990 }
5991 }
5992 }
5993
5994 /* If a non-extended class contains a negative special such as \S, we need
5995 to flip the negation flag at the end, so that support for characters > 255
5996 works correctly (they are all included in the class). An extended class may
5997 need to insert specific matching or non-matching code for wide characters.
5998 */
5999
6000 should_flip_negation = match_all_or_no_wide_chars = FALSE;
6001
6002 /* Extended class (xclass) will be used when characters > 255
6003 might match. */
6004
6005 #ifdef SUPPORT_WIDE_CHARS
6006 xclass = FALSE;
6007 class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
6008 class_uchardata_base = class_uchardata; /* Save the start */
6009 #endif
6010
6011 /* For optimization purposes, we track some properties of the class:
6012 class_has_8bitchar will be non-zero if the class contains at least one
6013 character with a code point less than 256; xclass_has_prop will be TRUE if
6014 Unicode property checks are present in the class. */
6015
6016 class_has_8bitchar = 0;
6017 #ifdef SUPPORT_WIDE_CHARS
6018 xclass_has_prop = FALSE;
6019 #endif
6020
6021 /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
6022 in a temporary bit of memory, in case the class contains fewer than two
6023 8-bit characters because in that case the compiled code doesn't use the bit
6024 map. */
6025
6026 memset(classbits, 0, 32 * sizeof(uint8_t));
6027
6028 /* Process items until META_CLASS_END is reached. */
6029
6030 while ((meta = *(++pptr)) != META_CLASS_END)
6031 {
6032 /* Handle POSIX classes such as [:alpha:] etc. */
6033
6034 if (meta == META_POSIX || meta == META_POSIX_NEG)
6035 {
6036 BOOL local_negate = (meta == META_POSIX_NEG);
6037 int posix_class = *(++pptr);
6038 int taboffset, tabopt;
6039 uint8_t pbits[32];
6040
6041 should_flip_negation = local_negate; /* Note negative special */
6042
6043 /* If matching is caseless, upper and lower are converted to alpha.
6044 This relies on the fact that the class table starts with alpha,
6045 lower, upper as the first 3 entries. */
6046
6047 if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)
6048 posix_class = 0;
6049
6050 /* When PCRE2_UCP is set, some of the POSIX classes are converted to
6051 different escape sequences that use Unicode properties \p or \P.
6052 Others that are not available via \p or \P have to generate
6053 XCL_PROP/XCL_NOTPROP directly, which is done here. */
6054
6055 #ifdef SUPPORT_UNICODE
6056 if ((options & PCRE2_UCP) != 0 &&
6057 (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0)
6058 {
6059 switch(posix_class)
6060 {
6061 case PC_GRAPH:
6062 case PC_PRINT:
6063 case PC_PUNCT:
6064 *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
6065 *class_uchardata++ = (PCRE2_UCHAR)
6066 ((posix_class == PC_GRAPH)? PT_PXGRAPH :
6067 (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT);
6068 *class_uchardata++ = 0;
6069 xclass_has_prop = TRUE;
6070 goto CONTINUE_CLASS;
6071
6072 /* For the other POSIX classes (ex: ascii) we are going to
6073 fall through to the non-UCP case and build a bit map for
6074 characters with code points less than 256. However, if we are in
6075 a negated POSIX class, characters with code points greater than
6076 255 must either all match or all not match, depending on whether
6077 the whole class is not or is negated. For example, for
6078 [[:^ascii:]... they must all match, whereas for [^[:^ascii:]...
6079 they must not.
6080
6081 In the special case where there are no xclass items, this is
6082 automatically handled by the use of OP_CLASS or OP_NCLASS, but an
6083 explicit range is needed for OP_XCLASS. Setting a flag here
6084 causes the range to be generated later when it is known that
6085 OP_XCLASS is required. In the 8-bit library this is relevant only in
6086 utf mode, since no wide characters can exist otherwise. */
6087
6088 default:
6089 #if PCRE2_CODE_UNIT_WIDTH == 8
6090 if (utf)
6091 #endif
6092 match_all_or_no_wide_chars |= local_negate;
6093 break;
6094 }
6095 }
6096 #endif /* SUPPORT_UNICODE */
6097
6098 /* In the non-UCP case, or when UCP makes no difference, we build the
6099 bit map for the POSIX class in a chunk of local store because we may
6100 be adding and subtracting from it, and we don't want to subtract bits
6101 that may be in the main map already. At the end we or the result into
6102 the bit map that is being built. */
6103
6104 posix_class *= 3;
6105
6106 /* Copy in the first table (always present) */
6107
6108 memcpy(pbits, cbits + posix_class_maps[posix_class],
6109 32 * sizeof(uint8_t));
6110
6111 /* If there is a second table, add or remove it as required. */
6112
6113 taboffset = posix_class_maps[posix_class + 1];
6114 tabopt = posix_class_maps[posix_class + 2];
6115
6116 if (taboffset >= 0)
6117 {
6118 if (tabopt >= 0)
6119 for (int i = 0; i < 32; i++) pbits[i] |= cbits[(int)i + taboffset];
6120 else
6121 for (int i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset];
6122 }
6123
6124 /* Now see if we need to remove any special characters. An option
6125 value of 1 removes vertical space and 2 removes underscore. */
6126
6127 if (tabopt < 0) tabopt = -tabopt;
6128 if (tabopt == 1) pbits[1] &= ~0x3c;
6129 else if (tabopt == 2) pbits[11] &= 0x7f;
6130
6131 /* Add the POSIX table or its complement into the main table that is
6132 being built and we are done. */
6133
6134 if (local_negate)
6135 for (int i = 0; i < 32; i++) classbits[i] |= (uint8_t)(~pbits[i]);
6136 else
6137 for (int i = 0; i < 32; i++) classbits[i] |= pbits[i];
6138
6139 /* Every class contains at least one < 256 character. */
6140
6141 class_has_8bitchar = 1;
6142 goto CONTINUE_CLASS; /* End of POSIX handling */
6143 }
6144
6145 /* Other than POSIX classes, the only items we should encounter are
6146 \d-type escapes and literal characters (possibly as ranges). */
6147
6148 if (meta == META_BIGVALUE)
6149 {
6150 meta = *(++pptr);
6151 goto CLASS_LITERAL;
6152 }
6153
6154 /* Any other non-literal must be an escape */
6155
6156 if (meta >= META_END)
6157 {
6158 if (META_CODE(meta) != META_ESCAPE)
6159 {
6160 #ifdef DEBUG_SHOW_PARSED
6161 fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x "
6162 "in character class\n", meta);
6163 #endif
6164 *errorcodeptr = ERR89; /* Internal error - unrecognized. */
6165 return 0;
6166 }
6167 escape = META_DATA(meta);
6168
6169 /* Every class contains at least one < 256 character. */
6170
6171 class_has_8bitchar++;
6172
6173 switch(escape)
6174 {
6175 case ESC_d:
6176 for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
6177 break;
6178
6179 case ESC_D:
6180 should_flip_negation = TRUE;
6181 for (int i = 0; i < 32; i++)
6182 classbits[i] |= (uint8_t)(~cbits[i+cbit_digit]);
6183 break;
6184
6185 case ESC_w:
6186 for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
6187 break;
6188
6189 case ESC_W:
6190 should_flip_negation = TRUE;
6191 for (int i = 0; i < 32; i++)
6192 classbits[i] |= (uint8_t)(~cbits[i+cbit_word]);
6193 break;
6194
6195 /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
6196 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
6197 previously set by something earlier in the character class.
6198 Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
6199 we could just adjust the appropriate bit. From PCRE 8.34 we no
6200 longer treat \s and \S specially. */
6201
6202 case ESC_s:
6203 for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
6204 break;
6205
6206 case ESC_S:
6207 should_flip_negation = TRUE;
6208 for (int i = 0; i < 32; i++)
6209 classbits[i] |= (uint8_t)(~cbits[i+cbit_space]);
6210 break;
6211
6212 /* When adding the horizontal or vertical space lists to a class, or
6213 their complements, disable PCRE2_CASELESS, because it just wastes
6214 time, and in the "not-x" UTF cases can create unwanted duplicates in
6215 the XCLASS list (provoked by characters that have more than one other
6216 case and by both cases being in the same "not-x" sublist). */
6217
6218 case ESC_h:
6219 (void)add_list_to_class(classbits, &class_uchardata,
6220 options & ~PCRE2_CASELESS, xoptions, cb, PRIV(hspace_list),
6221 NOTACHAR);
6222 break;
6223
6224 case ESC_H:
6225 (void)add_not_list_to_class(classbits, &class_uchardata,
6226 options & ~PCRE2_CASELESS, xoptions, cb, PRIV(hspace_list));
6227 break;
6228
6229 case ESC_v:
6230 (void)add_list_to_class(classbits, &class_uchardata,
6231 options & ~PCRE2_CASELESS, xoptions, cb, PRIV(vspace_list),
6232 NOTACHAR);
6233 break;
6234
6235 case ESC_V:
6236 (void)add_not_list_to_class(classbits, &class_uchardata,
6237 options & ~PCRE2_CASELESS, xoptions, cb, PRIV(vspace_list));
6238 break;
6239
6240 /* If Unicode is not supported, \P and \p are not allowed and are
6241 faulted at parse time, so will never appear here. */
6242
6243 #ifdef SUPPORT_UNICODE
6244 case ESC_p:
6245 case ESC_P:
6246 {
6247 uint32_t ptype = *(++pptr) >> 16;
6248 uint32_t pdata = *pptr & 0xffff;
6249 *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP;
6250 *class_uchardata++ = ptype;
6251 *class_uchardata++ = pdata;
6252 xclass_has_prop = TRUE;
6253 class_has_8bitchar--; /* Undo! */
6254 }
6255 break;
6256 #endif
6257 }
6258
6259 goto CONTINUE_CLASS;
6260 } /* End handling \d-type escapes */
6261
6262 /* A literal character may be followed by a range meta. At parse time
6263 there are checks for out-of-order characters, for ranges where the two
6264 characters are equal, and for hyphens that cannot indicate a range. At
6265 this point, therefore, no checking is needed. */
6266
6267 else
6268 {
6269 uint32_t c, d;
6270
6271 CLASS_LITERAL:
6272 c = d = meta;
6273
6274 /* Remember if \r or \n were explicitly used */
6275
6276 if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
6277
6278 /* Process a character range */
6279
6280 if (pptr[1] == META_RANGE_LITERAL || pptr[1] == META_RANGE_ESCAPED)
6281 {
6282 #ifdef EBCDIC
6283 BOOL range_is_literal = (pptr[1] == META_RANGE_LITERAL);
6284 #endif
6285 pptr += 2;
6286 d = *pptr;
6287 if (d == META_BIGVALUE) d = *(++pptr);
6288
6289 /* Remember an explicit \r or \n, and add the range to the class. */
6290
6291 if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
6292
6293 /* In an EBCDIC environment, Perl treats alphabetic ranges specially
6294 because there are holes in the encoding, and simply using the range
6295 A-Z (for example) would include the characters in the holes. This
6296 applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
6297
6298 #ifdef EBCDIC
6299 if (range_is_literal &&
6300 (cb->ctypes[c] & ctype_letter) != 0 &&
6301 (cb->ctypes[d] & ctype_letter) != 0 &&
6302 (c <= CHAR_z) == (d <= CHAR_z))
6303 {
6304 uint32_t uc = (d <= CHAR_z)? 0 : 64;
6305 uint32_t C = c - uc;
6306 uint32_t D = d - uc;
6307
6308 if (C <= CHAR_i)
6309 {
6310 class_has_8bitchar +=
6311 add_to_class(classbits, &class_uchardata, options, xoptions,
6312 cb, C + uc, ((D < CHAR_i)? D : CHAR_i) + uc);
6313 C = CHAR_j;
6314 }
6315
6316 if (C <= D && C <= CHAR_r)
6317 {
6318 class_has_8bitchar +=
6319 add_to_class(classbits, &class_uchardata, options, xoptions,
6320 cb, C + uc, ((D < CHAR_r)? D : CHAR_r) + uc);
6321 C = CHAR_s;
6322 }
6323
6324 if (C <= D)
6325 {
6326 class_has_8bitchar +=
6327 add_to_class(classbits, &class_uchardata, options, xoptions,
6328 cb, C + uc, D + uc);
6329 }
6330 }
6331 else
6332 #endif
6333 /* Not an EBCDIC special range */
6334
6335 class_has_8bitchar += add_to_class(classbits, &class_uchardata,
6336 options, xoptions, cb, c, d);
6337 goto CONTINUE_CLASS; /* Go get the next char in the class */
6338 } /* End of range handling */
6339
6340
6341 /* Handle a single character. */
6342
6343 class_has_8bitchar +=
6344 add_to_class(classbits, &class_uchardata, options, xoptions, cb,
6345 meta, meta);
6346 }
6347
6348 /* Continue to the next item in the class. */
6349
6350 CONTINUE_CLASS:
6351
6352 #ifdef SUPPORT_WIDE_CHARS
6353 /* If any wide characters or Unicode properties have been encountered,
6354 set xclass = TRUE. Then, in the pre-compile phase, accumulate the length
6355 of the extra data and reset the pointer. This is so that very large
6356 classes that contain a zillion wide characters or Unicode property tests
6357 do not overwrite the workspace (which is on the stack). */
6358
6359 if (class_uchardata > class_uchardata_base)
6360 {
6361 xclass = TRUE;
6362 if (lengthptr != NULL)
6363 {
6364 *lengthptr += class_uchardata - class_uchardata_base;
6365 class_uchardata = class_uchardata_base;
6366 }
6367 }
6368 #endif
6369
6370 continue; /* Needed to avoid error when not supporting wide chars */
6371 } /* End of main class-processing loop */
6372
6373 /* If this class is the first thing in the branch, there can be no first
6374 char setting, whatever the repeat count. Any reqcu setting must remain
6375 unchanged after any kind of repeat. */
6376
6377 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6378 zerofirstcu = firstcu;
6379 zerofirstcuflags = firstcuflags;
6380 zeroreqcu = reqcu;
6381 zeroreqcuflags = reqcuflags;
6382
6383 /* If there are characters with values > 255, or Unicode property settings
6384 (\p or \P), we have to compile an extended class, with its own opcode,
6385 unless there were no property settings and there was a negated special such
6386 as \S in the class, and PCRE2_UCP is not set, because in that case all
6387 characters > 255 are in or not in the class, so any that were explicitly
6388 given as well can be ignored.
6389
6390 In the UCP case, if certain negated POSIX classes (ex: [:^ascii:]) were
6391 present in a class, we either have to match or not match all wide
6392 characters (depending on whether the whole class is or is not negated).
6393 This requirement is indicated by match_all_or_no_wide_chars being true.
6394 We do this by including an explicit range, which works in both cases.
6395 This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there
6396 cannot be any wide characters in 8-bit non-UTF mode.
6397
6398 When there *are* properties in a positive UTF-8 or any 16-bit or 32-bit
6399 class where \S etc is present without PCRE2_UCP, causing an extended class
6400 to be compiled, we make sure that all characters > 255 are included by
6401 forcing match_all_or_no_wide_chars to be true.
6402
6403 If, when generating an xclass, there are no characters < 256, we can omit
6404 the bitmap in the actual compiled code. */
6405
6406 #ifdef SUPPORT_WIDE_CHARS /* Defined for 16/32 bits, or 8-bit with Unicode */
6407 if (xclass && (
6408 #ifdef SUPPORT_UNICODE
6409 (options & PCRE2_UCP) != 0 ||
6410 #endif
6411 xclass_has_prop || !should_flip_negation))
6412 {
6413 if (match_all_or_no_wide_chars || (
6414 #if PCRE2_CODE_UNIT_WIDTH == 8
6415 utf &&
6416 #endif
6417 should_flip_negation && !negate_class && (options & PCRE2_UCP) == 0))
6418 {
6419 *class_uchardata++ = XCL_RANGE;
6420 if (utf) /* Will always be utf in the 8-bit library */
6421 {
6422 class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
6423 class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata);
6424 }
6425 else /* Can only happen for the 16-bit & 32-bit libraries */
6426 {
6427 #if PCRE2_CODE_UNIT_WIDTH == 16
6428 *class_uchardata++ = 0x100;
6429 *class_uchardata++ = 0xffffu;
6430 #elif PCRE2_CODE_UNIT_WIDTH == 32
6431 *class_uchardata++ = 0x100;
6432 *class_uchardata++ = 0xffffffffu;
6433 #endif
6434 }
6435 }
6436 *class_uchardata++ = XCL_END; /* Marks the end of extra data */
6437 *code++ = OP_XCLASS;
6438 code += LINK_SIZE;
6439 *code = negate_class? XCL_NOT:0;
6440 if (xclass_has_prop) *code |= XCL_HASPROP;
6441
6442 /* If the map is required, move up the extra data to make room for it;
6443 otherwise just move the code pointer to the end of the extra data. */
6444
6445 if (class_has_8bitchar > 0)
6446 {
6447 *code++ |= XCL_MAP;
6448 (void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,
6449 CU2BYTES(class_uchardata - code));
6450 if (negate_class && !xclass_has_prop)
6451 {
6452 /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
6453 for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
6454 }
6455 memcpy(code, classbits, 32);
6456 code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
6457 }
6458 else code = class_uchardata;
6459
6460 /* Now fill in the complete length of the item */
6461
6462 PUT(previous, 1, (int)(code - previous));
6463 break; /* End of class handling */
6464 }
6465 #endif /* SUPPORT_WIDE_CHARS */
6466
6467 /* If there are no characters > 255, or they are all to be included or
6468 excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
6469 whole class was negated and whether there were negative specials such as \S
6470 (non-UCP) in the class. Then copy the 32-byte map into the code vector,
6471 negating it if necessary. */
6472
6473 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
6474 if (lengthptr == NULL) /* Save time in the pre-compile phase */
6475 {
6476 if (negate_class)
6477 {
6478 /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
6479 for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
6480 }
6481 memcpy(code, classbits, 32);
6482 }
6483 code += 32 / sizeof(PCRE2_UCHAR);
6484 break; /* End of class processing */
6485
6486
6487 /* ===================================================================*/
6488 /* Deal with (*VERB)s. */
6489
6490 /* Check for open captures before ACCEPT and close those that are within
6491 the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an
6492 assertion. In the first pass, just accumulate the length required;
6493 otherwise hitting (*ACCEPT) inside many nested parentheses can cause
6494 workspace overflow. Do not set firstcu after *ACCEPT. */
6495
6496 case META_ACCEPT:
6497 cb->had_accept = had_accept = TRUE;
6498 for (oc = open_caps;
6499 oc != NULL && oc->assert_depth >= cb->assert_depth;
6500 oc = oc->next)
6501 {
6502 if (lengthptr != NULL)
6503 {
6504 *lengthptr += CU2BYTES(1) + IMM2_SIZE;
6505 }
6506 else
6507 {
6508 *code++ = OP_CLOSE;
6509 PUT2INC(code, 0, oc->number);
6510 }
6511 }
6512 *code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6513 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6514 break;
6515
6516 case META_PRUNE:
6517 case META_SKIP:
6518 cb->had_pruneorskip = TRUE;
6519 /* Fall through */
6520 case META_COMMIT:
6521 case META_FAIL:
6522 *code++ = verbops[(meta - META_MARK) >> 16];
6523 break;
6524
6525 case META_THEN:
6526 cb->external_flags |= PCRE2_HASTHEN;
6527 *code++ = OP_THEN;
6528 break;
6529
6530 /* Handle verbs with arguments. Arguments can be very long, especially in
6531 16- and 32-bit modes, and can overflow the workspace in the first pass.
6532 However, the argument length is constrained to be small enough to fit in
6533 one code unit. This check happens in parse_regex(). In the first pass,
6534 instead of putting the argument into memory, we just update the length
6535 counter and set up an empty argument. */
6536
6537 case META_THEN_ARG:
6538 cb->external_flags |= PCRE2_HASTHEN;
6539 goto VERB_ARG;
6540
6541 case META_PRUNE_ARG:
6542 case META_SKIP_ARG:
6543 cb->had_pruneorskip = TRUE;
6544 /* Fall through */
6545 case META_MARK:
6546 case META_COMMIT_ARG:
6547 VERB_ARG:
6548 *code++ = verbops[(meta - META_MARK) >> 16];
6549 /* The length is in characters. */
6550 verbarglen = *(++pptr);
6551 verbculen = 0;
6552 tempcode = code++;
6553 for (int i = 0; i < (int)verbarglen; i++)
6554 {
6555 meta = *(++pptr);
6556 #ifdef SUPPORT_UNICODE
6557 if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
6558 #endif
6559 {
6560 mclength = 1;
6561 mcbuffer[0] = meta;
6562 }
6563 if (lengthptr != NULL) *lengthptr += mclength; else
6564 {
6565 memcpy(code, mcbuffer, CU2BYTES(mclength));
6566 code += mclength;
6567 verbculen += mclength;
6568 }
6569 }
6570
6571 *tempcode = verbculen; /* Fill in the code unit length */
6572 *code++ = 0; /* Terminating zero */
6573 break;
6574
6575
6576 /* ===================================================================*/
6577 /* Handle options change. The new setting must be passed back for use in
6578 subsequent branches. Reset the greedy defaults and the case value for
6579 firstcu and reqcu. */
6580
6581 case META_OPTIONS:
6582 *optionsptr = options = *(++pptr);
6583 *xoptionsptr = xoptions = *(++pptr);
6584 greedy_default = ((options & PCRE2_UNGREEDY) != 0);
6585 greedy_non_default = greedy_default ^ 1;
6586 req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6587 break;
6588
6589
6590 /* ===================================================================*/
6591 /* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous
6592 because it could be a numerical check on recursion, or a name check on a
6593 group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that
6594 we can handle it either way. We first try for a name; if not found, process
6595 the number. */
6596
6597 case META_COND_RNUMBER: /* (?(Rdigits) */
6598 case META_COND_NAME: /* (?(name) or (?'name') or ?(<name>) */
6599 case META_COND_RNAME: /* (?(R&name) - test for recursion */
6600 bravalue = OP_COND;
6601 {
6602 int count, index;
6603 unsigned int i;
6604 PCRE2_SPTR name;
6605 named_group *ng = cb->named_groups;
6606 uint32_t length = *(++pptr);
6607
6608 GETPLUSOFFSET(offset, pptr);
6609 name = cb->start_pattern + offset;
6610
6611 /* In the first pass, the names generated in the pre-pass are available,
6612 but the main name table has not yet been created. Scan the list of names
6613 generated in the pre-pass in order to get a number and whether or not
6614 this name is duplicated. If it is not duplicated, we can handle it as a
6615 numerical group. */
6616
6617 for (i = 0; i < cb->names_found; i++, ng++)
6618 {
6619 if (length == ng->length &&
6620 PRIV(strncmp)(name, ng->name, length) == 0)
6621 {
6622 if (!ng->isdup)
6623 {
6624 code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6625 PUT2(code, 2+LINK_SIZE, ng->number);
6626 if (ng->number > cb->top_backref) cb->top_backref = ng->number;
6627 skipunits = 1+IMM2_SIZE;
6628 goto GROUP_PROCESS_NOTE_EMPTY;
6629 }
6630 break; /* Found a duplicated name */
6631 }
6632 }
6633
6634 /* If the name was not found we have a bad reference, unless we are
6635 dealing with R<digits>, which is treated as a recursion test by number.
6636 */
6637
6638 if (i >= cb->names_found)
6639 {
6640 groupnumber = 0;
6641 if (meta == META_COND_RNUMBER)
6642 {
6643 for (i = 1; i < length; i++)
6644 {
6645 groupnumber = groupnumber * 10 + name[i] - CHAR_0;
6646 if (groupnumber > MAX_GROUP_NUMBER)
6647 {
6648 *errorcodeptr = ERR61;
6649 cb->erroroffset = offset + i;
6650 return 0;
6651 }
6652 }
6653 }
6654
6655 if (meta != META_COND_RNUMBER || groupnumber > cb->bracount)
6656 {
6657 *errorcodeptr = ERR15;
6658 cb->erroroffset = offset;
6659 return 0;
6660 }
6661
6662 /* (?(Rdigits) treated as a recursion test by number. A value of
6663 zero (which is the result of both (?(R) and (?(R0)) means "any", and is
6664 translated into RREF_ANY (which is 0xffff). */
6665
6666 if (groupnumber == 0) groupnumber = RREF_ANY;
6667 code[1+LINK_SIZE] = OP_RREF;
6668 PUT2(code, 2+LINK_SIZE, groupnumber);
6669 skipunits = 1+IMM2_SIZE;
6670 goto GROUP_PROCESS_NOTE_EMPTY;
6671 }
6672
6673 /* A duplicated name was found. Note that if an R<digits> name is found
6674 (META_COND_RNUMBER), it is a reference test, not a recursion test. */
6675
6676 code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6677
6678 /* We have a duplicated name. In the compile pass we have to search the
6679 main table in order to get the index and count values. */
6680
6681 count = 0; /* Values for first pass (avoids compiler warning) */
6682 index = 0;
6683 if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6684 &count, errorcodeptr, cb)) return 0;
6685
6686 /* Add one to the opcode to change CREF/RREF into DNCREF/DNRREF and
6687 insert appropriate data values. */
6688
6689 code[1+LINK_SIZE]++;
6690 skipunits = 1+2*IMM2_SIZE;
6691 PUT2(code, 2+LINK_SIZE, index);
6692 PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6693 }
6694 goto GROUP_PROCESS_NOTE_EMPTY;
6695
6696 /* The DEFINE condition is always false. Its internal groups may never
6697 be called, so matched_char must remain false, hence the jump to
6698 GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
6699
6700 case META_COND_DEFINE:
6701 bravalue = OP_COND;
6702 GETPLUSOFFSET(offset, pptr);
6703 code[1+LINK_SIZE] = OP_DEFINE;
6704 skipunits = 1;
6705 goto GROUP_PROCESS;
6706
6707 /* Conditional test of a group's being set. */
6708
6709 case META_COND_NUMBER:
6710 bravalue = OP_COND;
6711 GETPLUSOFFSET(offset, pptr);
6712 groupnumber = *(++pptr);
6713 if (groupnumber > cb->bracount)
6714 {
6715 *errorcodeptr = ERR15;
6716 cb->erroroffset = offset;
6717 return 0;
6718 }
6719 if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
6720 offset -= 2; /* Point at initial ( for too many branches error */
6721 code[1+LINK_SIZE] = OP_CREF;
6722 skipunits = 1+IMM2_SIZE;
6723 PUT2(code, 2+LINK_SIZE, groupnumber);
6724 goto GROUP_PROCESS_NOTE_EMPTY;
6725
6726 /* Test for the PCRE2 version. */
6727
6728 case META_COND_VERSION:
6729 bravalue = OP_COND;
6730 if (pptr[1] > 0)
6731 code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) ||
6732 (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))?
6733 OP_TRUE : OP_FALSE;
6734 else
6735 code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])?
6736 OP_TRUE : OP_FALSE;
6737 skipunits = 1;
6738 pptr += 3;
6739 goto GROUP_PROCESS_NOTE_EMPTY;
6740
6741 /* The condition is an assertion, possibly preceded by a callout. */
6742
6743 case META_COND_ASSERT:
6744 bravalue = OP_COND;
6745 goto GROUP_PROCESS_NOTE_EMPTY;
6746
6747
6748 /* ===================================================================*/
6749 /* Handle all kinds of nested bracketed groups. The non-capturing,
6750 non-conditional cases are here; others come to GROUP_PROCESS via goto. */
6751
6752 case META_LOOKAHEAD:
6753 bravalue = OP_ASSERT;
6754 cb->assert_depth += 1;
6755 goto GROUP_PROCESS;
6756
6757 case META_LOOKAHEAD_NA:
6758 bravalue = OP_ASSERT_NA;
6759 cb->assert_depth += 1;
6760 goto GROUP_PROCESS;
6761
6762 /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6763 thing to do, but Perl allows all assertions to be quantified, and when
6764 they contain capturing parentheses there may be a potential use for
6765 this feature. Not that that applies to a quantified (?!) but we allow
6766 it for uniformity. */
6767
6768 case META_LOOKAHEADNOT:
6769 if (pptr[1] == META_KET &&
6770 (pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY))
6771 {
6772 *code++ = OP_FAIL;
6773 pptr++;
6774 }
6775 else
6776 {
6777 bravalue = OP_ASSERT_NOT;
6778 cb->assert_depth += 1;
6779 goto GROUP_PROCESS;
6780 }
6781 break;
6782
6783 case META_LOOKBEHIND:
6784 bravalue = OP_ASSERTBACK;
6785 cb->assert_depth += 1;
6786 goto GROUP_PROCESS;
6787
6788 case META_LOOKBEHINDNOT:
6789 bravalue = OP_ASSERTBACK_NOT;
6790 cb->assert_depth += 1;
6791 goto GROUP_PROCESS;
6792
6793 case META_LOOKBEHIND_NA:
6794 bravalue = OP_ASSERTBACK_NA;
6795 cb->assert_depth += 1;
6796 goto GROUP_PROCESS;
6797
6798 case META_ATOMIC:
6799 bravalue = OP_ONCE;
6800 goto GROUP_PROCESS_NOTE_EMPTY;
6801
6802 case META_SCRIPT_RUN:
6803 bravalue = OP_SCRIPT_RUN;
6804 goto GROUP_PROCESS_NOTE_EMPTY;
6805
6806 case META_NOCAPTURE:
6807 bravalue = OP_BRA;
6808 /* Fall through */
6809
6810 /* Process nested bracketed regex. The nesting depth is maintained for the
6811 benefit of the stackguard function. The test for too deep nesting is now
6812 done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
6813 others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
6814 note of whether or not they may match an empty string. */
6815
6816 GROUP_PROCESS_NOTE_EMPTY:
6817 note_group_empty = TRUE;
6818
6819 GROUP_PROCESS:
6820 cb->parens_depth += 1;
6821 *code = bravalue;
6822 pptr++;
6823 tempcode = code;
6824 tempreqvary = cb->req_varyopt; /* Save value before group */
6825 length_prevgroup = 0; /* Initialize for pre-compile phase */
6826
6827 if ((group_return =
6828 compile_regex(
6829 options, /* The options state */
6830 xoptions, /* The extra options state */
6831 &tempcode, /* Where to put code (updated) */
6832 &pptr, /* Input pointer (updated) */
6833 errorcodeptr, /* Where to put an error message */
6834 skipunits, /* Skip over bracket number */
6835 &subfirstcu, /* For possible first char */
6836 &subfirstcuflags,
6837 &subreqcu, /* For possible last char */
6838 &subreqcuflags,
6839 bcptr, /* Current branch chain */
6840 open_caps, /* Pointer to capture stack */
6841 cb, /* Compile data block */
6842 (lengthptr == NULL)? NULL : /* Actual compile phase */
6843 &length_prevgroup /* Pre-compile phase */
6844 )) == 0)
6845 return 0; /* Error */
6846
6847 cb->parens_depth -= 1;
6848
6849 /* If that was a non-conditional significant group (not an assertion, not a
6850 DEFINE) that matches at least one character, then the current item matches
6851 a character. Conditionals are handled below. */
6852
6853 if (note_group_empty && bravalue != OP_COND && group_return > 0)
6854 matched_char = TRUE;
6855
6856 /* If we've just compiled an assertion, pop the assert depth. */
6857
6858 if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NA)
6859 cb->assert_depth -= 1;
6860
6861 /* At the end of compiling, code is still pointing to the start of the
6862 group, while tempcode has been updated to point past the end of the group.
6863 The parsed pattern pointer (pptr) is on the closing META_KET.
6864
6865 If this is a conditional bracket, check that there are no more than
6866 two branches in the group, or just one if it's a DEFINE group. We do this
6867 in the real compile phase, not in the pre-pass, where the whole group may
6868 not be available. */
6869
6870 if (bravalue == OP_COND && lengthptr == NULL)
6871 {
6872 PCRE2_UCHAR *tc = code;
6873 int condcount = 0;
6874
6875 do {
6876 condcount++;
6877 tc += GET(tc,1);
6878 }
6879 while (*tc != OP_KET);
6880
6881 /* A DEFINE group is never obeyed inline (the "condition" is always
6882 false). It must have only one branch. Having checked this, change the
6883 opcode to OP_FALSE. */
6884
6885 if (code[LINK_SIZE+1] == OP_DEFINE)
6886 {
6887 if (condcount > 1)
6888 {
6889 cb->erroroffset = offset;
6890 *errorcodeptr = ERR54;
6891 return 0;
6892 }
6893 code[LINK_SIZE+1] = OP_FALSE;
6894 bravalue = OP_DEFINE; /* A flag to suppress char handling below */
6895 }
6896
6897 /* A "normal" conditional group. If there is just one branch, we must not
6898 make use of its firstcu or reqcu, because this is equivalent to an
6899 empty second branch. Also, it may match an empty string. If there are two
6900 branches, this item must match a character if the group must. */
6901
6902 else
6903 {
6904 if (condcount > 2)
6905 {
6906 cb->erroroffset = offset;
6907 *errorcodeptr = ERR27;
6908 return 0;
6909 }
6910 if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
6911 else if (group_return > 0) matched_char = TRUE;
6912 }
6913 }
6914
6915 /* In the pre-compile phase, update the length by the length of the group,
6916 less the brackets at either end. Then reduce the compiled code to just a
6917 set of non-capturing brackets so that it doesn't use much memory if it is
6918 duplicated by a quantifier.*/
6919
6920 if (lengthptr != NULL)
6921 {
6922 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
6923 {
6924 *errorcodeptr = ERR20;
6925 return 0;
6926 }
6927 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
6928 code++; /* This already contains bravalue */
6929 PUTINC(code, 0, 1 + LINK_SIZE);
6930 *code++ = OP_KET;
6931 PUTINC(code, 0, 1 + LINK_SIZE);
6932 break; /* No need to waste time with special character handling */
6933 }
6934
6935 /* Otherwise update the main code pointer to the end of the group. */
6936
6937 code = tempcode;
6938
6939 /* For a DEFINE group, required and first character settings are not
6940 relevant. */
6941
6942 if (bravalue == OP_DEFINE) break;
6943
6944 /* Handle updating of the required and first code units for other types of
6945 group. Update for normal brackets of all kinds, and conditions with two
6946 branches (see code above). If the bracket is followed by a quantifier with
6947 zero repeat, we have to back off. Hence the definition of zeroreqcu and
6948 zerofirstcu outside the main loop so that they can be accessed for the back
6949 off. */
6950
6951 zeroreqcu = reqcu;
6952 zeroreqcuflags = reqcuflags;
6953 zerofirstcu = firstcu;
6954 zerofirstcuflags = firstcuflags;
6955 groupsetfirstcu = FALSE;
6956
6957 if (bravalue >= OP_ONCE) /* Not an assertion */
6958 {
6959 /* If we have not yet set a firstcu in this branch, take it from the
6960 subpattern, remembering that it was set here so that a repeat of more
6961 than one can replicate it as reqcu if necessary. If the subpattern has
6962 no firstcu, set "none" for the whole branch. In both cases, a zero
6963 repeat forces firstcu to "none". */
6964
6965 if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
6966 {
6967 if (subfirstcuflags < REQ_NONE)
6968 {
6969 firstcu = subfirstcu;
6970 firstcuflags = subfirstcuflags;
6971 groupsetfirstcu = TRUE;
6972 }
6973 else firstcuflags = REQ_NONE;
6974 zerofirstcuflags = REQ_NONE;
6975 }
6976
6977 /* If firstcu was previously set, convert the subpattern's firstcu
6978 into reqcu if there wasn't one, using the vary flag that was in
6979 existence beforehand. */
6980
6981 else if (subfirstcuflags < REQ_NONE && subreqcuflags >= REQ_NONE)
6982 {
6983 subreqcu = subfirstcu;
6984 subreqcuflags = subfirstcuflags | tempreqvary;
6985 }
6986
6987 /* If the subpattern set a required code unit (or set a first code unit
6988 that isn't really the first code unit - see above), set it. */
6989
6990 if (subreqcuflags < REQ_NONE)
6991 {
6992 reqcu = subreqcu;
6993 reqcuflags = subreqcuflags;
6994 }
6995 }
6996
6997 /* For a forward assertion, we take the reqcu, if set, provided that the
6998 group has also set a firstcu. This can be helpful if the pattern that
6999 follows the assertion doesn't set a different char. For example, it's
7000 useful for /(?=abcde).+/. We can't set firstcu for an assertion, however
7001 because it leads to an incorrect effect for patterns such as /(?=a)a.+/ when
7002 the "real" "a" would then become a reqcu instead of a firstcu. This is
7003 overcome by a scan at the end if there's no firstcu, looking for an
7004 asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
7005 we must only take the reqcu when the group also set a firstcu. Otherwise,
7006 in that example, 'X' ends up set for both. */
7007
7008 else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) &&
7009 subreqcuflags < REQ_NONE && subfirstcuflags < REQ_NONE)
7010 {
7011 reqcu = subreqcu;
7012 reqcuflags = subreqcuflags;
7013 }
7014
7015 break; /* End of nested group handling */
7016
7017
7018 /* ===================================================================*/
7019 /* Handle named backreferences and recursions. */
7020
7021 case META_BACKREF_BYNAME:
7022 case META_RECURSE_BYNAME:
7023 {
7024 int count, index;
7025 PCRE2_SPTR name;
7026 BOOL is_dupname = FALSE;
7027 named_group *ng = cb->named_groups;
7028 uint32_t length = *(++pptr);
7029
7030 GETPLUSOFFSET(offset, pptr);
7031 name = cb->start_pattern + offset;
7032
7033 /* In the first pass, the names generated in the pre-pass are available,
7034 but the main name table has not yet been created. Scan the list of names
7035 generated in the pre-pass in order to get a number and whether or not
7036 this name is duplicated. */
7037
7038 groupnumber = 0;
7039 for (unsigned int i = 0; i < cb->names_found; i++, ng++)
7040 {
7041 if (length == ng->length &&
7042 PRIV(strncmp)(name, ng->name, length) == 0)
7043 {
7044 is_dupname = ng->isdup;
7045 groupnumber = ng->number;
7046
7047 /* For a recursion, that's all that is needed. We can now go to
7048 the code that handles numerical recursion, applying it to the first
7049 group with the given name. */
7050
7051 if (meta == META_RECURSE_BYNAME)
7052 {
7053 meta_arg = groupnumber;
7054 goto HANDLE_NUMERICAL_RECURSION;
7055 }
7056
7057 /* For a back reference, update the back reference map and the
7058 maximum back reference. */
7059
7060 cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
7061 if (groupnumber > cb->top_backref)
7062 cb->top_backref = groupnumber;
7063 }
7064 }
7065
7066 /* If the name was not found we have a bad reference. */
7067
7068 if (groupnumber == 0)
7069 {
7070 *errorcodeptr = ERR15;
7071 cb->erroroffset = offset;
7072 return 0;
7073 }
7074
7075 /* If a back reference name is not duplicated, we can handle it as
7076 a numerical reference. */
7077
7078 if (!is_dupname)
7079 {
7080 meta_arg = groupnumber;
7081 goto HANDLE_SINGLE_REFERENCE;
7082 }
7083
7084 /* If a back reference name is duplicated, we generate a different
7085 opcode to a numerical back reference. In the second pass we must
7086 search for the index and count in the final name table. */
7087
7088 count = 0; /* Values for first pass (avoids compiler warning) */
7089 index = 0;
7090 if (lengthptr == NULL && !find_dupname_details(name, length, &index,
7091 &count, errorcodeptr, cb)) return 0;
7092
7093 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7094 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
7095 PUT2INC(code, 0, index);
7096 PUT2INC(code, 0, count);
7097 }
7098 break;
7099
7100
7101 /* ===================================================================*/
7102 /* Handle a numerical callout. */
7103
7104 case META_CALLOUT_NUMBER:
7105 code[0] = OP_CALLOUT;
7106 PUT(code, 1, pptr[1]); /* Offset to next pattern item */
7107 PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */
7108 code[1 + 2*LINK_SIZE] = pptr[3];
7109 pptr += 3;
7110 code += PRIV(OP_lengths)[OP_CALLOUT];
7111 break;
7112
7113
7114 /* ===================================================================*/
7115 /* Handle a callout with a string argument. In the pre-pass we just compute
7116 the length without generating anything. The length in pptr[3] includes both
7117 delimiters; in the actual compile only the first one is copied, but a
7118 terminating zero is added. Any doubled delimiters within the string make
7119 this an overestimate, but it is not worth bothering about. */
7120
7121 case META_CALLOUT_STRING:
7122 if (lengthptr != NULL)
7123 {
7124 *lengthptr += pptr[3] + (1 + 4*LINK_SIZE);
7125 pptr += 3;
7126 SKIPOFFSET(pptr);
7127 }
7128
7129 /* In the real compile we can copy the string. The starting delimiter is
7130 included so that the client can discover it if they want. We also pass the
7131 start offset to help a script language give better error messages. */
7132
7133 else
7134 {
7135 PCRE2_SPTR pp;
7136 uint32_t delimiter;
7137 uint32_t length = pptr[3];
7138 PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
7139
7140 code[0] = OP_CALLOUT_STR;
7141 PUT(code, 1, pptr[1]); /* Offset to next pattern item */
7142 PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */
7143
7144 pptr += 3;
7145 GETPLUSOFFSET(offset, pptr); /* Offset to string in pattern */
7146 pp = cb->start_pattern + offset;
7147 delimiter = *callout_string++ = *pp++;
7148 if (delimiter == CHAR_LEFT_CURLY_BRACKET)
7149 delimiter = CHAR_RIGHT_CURLY_BRACKET;
7150 PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1)); /* One after delimiter */
7151
7152 /* The syntax of the pattern was checked in the parsing scan. The length
7153 includes both delimiters, but we have passed the opening one just above,
7154 so we reduce length before testing it. The test is for > 1 because we do
7155 not want to copy the final delimiter. This also ensures that pp[1] is
7156 accessible. */
7157
7158 while (--length > 1)
7159 {
7160 if (*pp == delimiter && pp[1] == delimiter)
7161 {
7162 *callout_string++ = delimiter;
7163 pp += 2;
7164 length--;
7165 }
7166 else *callout_string++ = *pp++;
7167 }
7168 *callout_string++ = CHAR_NUL;
7169
7170 /* Set the length of the entire item, then advance to its end. */
7171
7172 PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code));
7173 code = callout_string;
7174 }
7175 break;
7176
7177
7178 /* ===================================================================*/
7179 /* Handle repetition. The different types are all sorted out in the parsing
7180 pass. */
7181
7182 case META_MINMAX_PLUS:
7183 case META_MINMAX_QUERY:
7184 case META_MINMAX:
7185 repeat_min = *(++pptr);
7186 repeat_max = *(++pptr);
7187 goto REPEAT;
7188
7189 case META_ASTERISK:
7190 case META_ASTERISK_PLUS:
7191 case META_ASTERISK_QUERY:
7192 repeat_min = 0;
7193 repeat_max = REPEAT_UNLIMITED;
7194 goto REPEAT;
7195
7196 case META_PLUS:
7197 case META_PLUS_PLUS:
7198 case META_PLUS_QUERY:
7199 repeat_min = 1;
7200 repeat_max = REPEAT_UNLIMITED;
7201 goto REPEAT;
7202
7203 case META_QUERY:
7204 case META_QUERY_PLUS:
7205 case META_QUERY_QUERY:
7206 repeat_min = 0;
7207 repeat_max = 1;
7208
7209 REPEAT:
7210 if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
7211
7212 /* Remember whether this is a variable length repeat, and default to
7213 single-char opcodes. */
7214
7215 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
7216 op_type = 0;
7217
7218 /* Adjust first and required code units for a zero repeat. */
7219
7220 if (repeat_min == 0)
7221 {
7222 firstcu = zerofirstcu;
7223 firstcuflags = zerofirstcuflags;
7224 reqcu = zeroreqcu;
7225 reqcuflags = zeroreqcuflags;
7226 }
7227
7228 /* Note the greediness and possessiveness. */
7229
7230 switch (meta)
7231 {
7232 case META_MINMAX_PLUS:
7233 case META_ASTERISK_PLUS:
7234 case META_PLUS_PLUS:
7235 case META_QUERY_PLUS:
7236 repeat_type = 0; /* Force greedy */
7237 possessive_quantifier = TRUE;
7238 break;
7239
7240 case META_MINMAX_QUERY:
7241 case META_ASTERISK_QUERY:
7242 case META_PLUS_QUERY:
7243 case META_QUERY_QUERY:
7244 repeat_type = greedy_non_default;
7245 possessive_quantifier = FALSE;
7246 break;
7247
7248 default:
7249 repeat_type = greedy_default;
7250 possessive_quantifier = FALSE;
7251 break;
7252 }
7253
7254 /* Save start of previous item, in case we have to move it up in order to
7255 insert something before it, and remember what it was. */
7256
7257 tempcode = previous;
7258 op_previous = *previous;
7259
7260 /* Now handle repetition for the different types of item. If the repeat
7261 minimum and the repeat maximum are both 1, we can ignore the quantifier for
7262 non-parenthesized items, as they have only one alternative. For anything in
7263 parentheses, we must not ignore if {1} is possessive. */
7264
7265 switch (op_previous)
7266 {
7267 /* If previous was a character or negated character match, abolish the
7268 item and generate a repeat item instead. If a char item has a minimum of
7269 more than one, ensure that it is set in reqcu - it might not be if a
7270 sequence such as x{3} is the first thing in a branch because the x will
7271 have gone into firstcu instead. */
7272
7273 case OP_CHAR:
7274 case OP_CHARI:
7275 case OP_NOT:
7276 case OP_NOTI:
7277 if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7278 op_type = chartypeoffset[op_previous - OP_CHAR];
7279
7280 /* Deal with UTF characters that take up more than one code unit. */
7281
7282 #ifdef MAYBE_UTF_MULTI
7283 if (utf && NOT_FIRSTCU(code[-1]))
7284 {
7285 PCRE2_UCHAR *lastchar = code - 1;
7286 BACKCHAR(lastchar);
7287 mclength = (uint32_t)(code - lastchar); /* Length of UTF character */
7288 memcpy(mcbuffer, lastchar, CU2BYTES(mclength)); /* Save the char */
7289 }
7290 else
7291 #endif /* MAYBE_UTF_MULTI */
7292
7293 /* Handle the case of a single code unit - either with no UTF support, or
7294 with UTF disabled, or for a single-code-unit UTF character. In the latter
7295 case, for a repeated positive match, get the caseless flag for the
7296 required code unit from the previous character, because a class like [Aa]
7297 sets a caseless A but by now the req_caseopt flag has been reset. */
7298
7299 {
7300 mcbuffer[0] = code[-1];
7301 mclength = 1;
7302 if (op_previous <= OP_CHARI && repeat_min > 1)
7303 {
7304 reqcu = mcbuffer[0];
7305 reqcuflags = cb->req_varyopt;
7306 if (op_previous == OP_CHARI) reqcuflags |= REQ_CASELESS;
7307 }
7308 }
7309 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
7310
7311 /* If previous was a character class or a back reference, we put the
7312 repeat stuff after it, but just skip the item if the repeat was {0,0}. */
7313
7314 #ifdef SUPPORT_WIDE_CHARS
7315 case OP_XCLASS:
7316 #endif
7317 case OP_CLASS:
7318 case OP_NCLASS:
7319 case OP_REF:
7320 case OP_REFI:
7321 case OP_DNREF:
7322 case OP_DNREFI:
7323
7324 if (repeat_max == 0)
7325 {
7326 code = previous;
7327 goto END_REPEAT;
7328 }
7329 if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7330
7331 if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)
7332 *code++ = OP_CRSTAR + repeat_type;
7333 else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED)
7334 *code++ = OP_CRPLUS + repeat_type;
7335 else if (repeat_min == 0 && repeat_max == 1)
7336 *code++ = OP_CRQUERY + repeat_type;
7337 else
7338 {
7339 *code++ = OP_CRRANGE + repeat_type;
7340 PUT2INC(code, 0, repeat_min);
7341 if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0; /* 2-byte encoding for max */
7342 PUT2INC(code, 0, repeat_max);
7343 }
7344 break;
7345
7346 /* If previous is OP_FAIL, it was generated by an empty class []
7347 (PCRE2_ALLOW_EMPTY_CLASS is set). The other ways in which OP_FAIL can be
7348 generated, that is by (*FAIL) or (?!), disallow a quantifier at parse
7349 time. We can just ignore this repeat. */
7350
7351 case OP_FAIL:
7352 goto END_REPEAT;
7353
7354 /* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets
7355 because pcre2_match() could not handle backtracking into recursively
7356 called groups. Now that this backtracking is available, we no longer need
7357 to do this. However, we still need to replicate recursions as we do for
7358 groups so as to have independent backtracking points. We can replicate
7359 for the minimum number of repeats directly. For optional repeats we now
7360 wrap the recursion in OP_BRA brackets and make use of the bracket
7361 repetition. */
7362
7363 case OP_RECURSE:
7364 if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7365 goto END_REPEAT;
7366
7367 /* Generate unwrapped repeats for a non-zero minimum, except when the
7368 minimum is 1 and the maximum unlimited, because that can be handled with
7369 OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the
7370 minimum, we just need to generate the appropriate additional copies.
7371 Otherwise we need to generate one more, to simulate the situation when
7372 the minimum is zero. */
7373
7374 if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED))
7375 {
7376 int replicate = repeat_min;
7377 if (repeat_min == repeat_max) replicate--;
7378
7379 /* In the pre-compile phase, we don't actually do the replication. We
7380 just adjust the length as if we had. Do some paranoid checks for
7381 potential integer overflow. */
7382
7383 if (lengthptr != NULL)
7384 {
7385 PCRE2_SIZE delta;
7386 if (PRIV(ckd_smul)(&delta, replicate, 1 + LINK_SIZE) ||
7387 OFLOW_MAX - *lengthptr < delta)
7388 {
7389 *errorcodeptr = ERR20;
7390 return 0;
7391 }
7392 *lengthptr += delta;
7393 }
7394
7395 else for (int i = 0; i < replicate; i++)
7396 {
7397 memcpy(code, previous, CU2BYTES(1 + LINK_SIZE));
7398 previous = code;
7399 code += 1 + LINK_SIZE;
7400 }
7401
7402 /* If the number of repeats is fixed, we are done. Otherwise, adjust
7403 the counts and fall through. */
7404
7405 if (repeat_min == repeat_max) break;
7406 if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7407 repeat_min = 0;
7408 }
7409
7410 /* Wrap the recursion call in OP_BRA brackets. */
7411
7412 (void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE));
7413 op_previous = *previous = OP_BRA;
7414 PUT(previous, 1, 2 + 2*LINK_SIZE);
7415 previous[2 + 2*LINK_SIZE] = OP_KET;
7416 PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
7417 code += 2 + 2 * LINK_SIZE;
7418 length_prevgroup = 3 + 3*LINK_SIZE;
7419 group_return = -1; /* Set "may match empty string" */
7420
7421 /* Now treat as a repeated OP_BRA. */
7422 /* Fall through */
7423
7424 /* If previous was a bracket group, we may have to replicate it in
7425 certain cases. Note that at this point we can encounter only the "basic"
7426 bracket opcodes such as BRA and CBRA, as this is the place where they get
7427 converted into the more special varieties such as BRAPOS and SBRA.
7428 Originally, PCRE did not allow repetition of assertions, but now it does,
7429 for Perl compatibility. */
7430
7431 case OP_ASSERT:
7432 case OP_ASSERT_NOT:
7433 case OP_ASSERT_NA:
7434 case OP_ASSERTBACK:
7435 case OP_ASSERTBACK_NOT:
7436 case OP_ASSERTBACK_NA:
7437 case OP_ONCE:
7438 case OP_SCRIPT_RUN:
7439 case OP_BRA:
7440 case OP_CBRA:
7441 case OP_COND:
7442 {
7443 int len = (int)(code - previous);
7444 PCRE2_UCHAR *bralink = NULL;
7445 PCRE2_UCHAR *brazeroptr = NULL;
7446
7447 if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7448 goto END_REPEAT;
7449
7450 /* Repeating a DEFINE group (or any group where the condition is always
7451 FALSE and there is only one branch) is pointless, but Perl allows the
7452 syntax, so we just ignore the repeat. */
7453
7454 if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
7455 previous[GET(previous, 1)] != OP_ALT)
7456 goto END_REPEAT;
7457
7458 /* Perl allows all assertions to be quantified, and when they contain
7459 capturing parentheses and/or are optional there are potential uses for
7460 this feature. PCRE2 used to force the maximum quantifier to 1 on the
7461 invalid grounds that further repetition was never useful. This was
7462 always a bit pointless, since an assertion could be wrapped with a
7463 repeated group to achieve the effect. General repetition is now
7464 permitted, but if the maximum is unlimited it is set to one more than
7465 the minimum. */
7466
7467 if (op_previous < OP_ONCE) /* Assertion */
7468 {
7469 if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1;
7470 }
7471
7472 /* The case of a zero minimum is special because of the need to stick
7473 OP_BRAZERO in front of it, and because the group appears once in the
7474 data, whereas in other cases it appears the minimum number of times. For
7475 this reason, it is simplest to treat this case separately, as otherwise
7476 the code gets far too messy. There are several special subcases when the
7477 minimum is zero. */
7478
7479 if (repeat_min == 0)
7480 {
7481 /* If the maximum is also zero, we used to just omit the group from
7482 the output altogether, like this:
7483
7484 ** if (repeat_max == 0)
7485 ** {
7486 ** code = previous;
7487 ** goto END_REPEAT;
7488 ** }
7489
7490 However, that fails when a group or a subgroup within it is
7491 referenced as a subroutine from elsewhere in the pattern, so now we
7492 stick in OP_SKIPZERO in front of it so that it is skipped on
7493 execution. As we don't have a list of which groups are referenced, we
7494 cannot do this selectively.
7495
7496 If the maximum is 1 or unlimited, we just have to stick in the
7497 BRAZERO and do no more at this point. */
7498
7499 if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED)
7500 {
7501 (void)memmove(previous + 1, previous, CU2BYTES(len));
7502 code++;
7503 if (repeat_max == 0)
7504 {
7505 *previous++ = OP_SKIPZERO;
7506 goto END_REPEAT;
7507 }
7508 brazeroptr = previous; /* Save for possessive optimizing */
7509 *previous++ = OP_BRAZERO + repeat_type;
7510 }
7511
7512 /* If the maximum is greater than 1 and limited, we have to replicate
7513 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
7514 The first one has to be handled carefully because it's the original
7515 copy, which has to be moved up. The remainder can be handled by code
7516 that is common with the non-zero minimum case below. We have to
7517 adjust the value of repeat_max, since one less copy is required. */
7518
7519 else
7520 {
7521 int linkoffset;
7522 (void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
7523 code += 2 + LINK_SIZE;
7524 *previous++ = OP_BRAZERO + repeat_type;
7525 *previous++ = OP_BRA;
7526
7527 /* We chain together the bracket link offset fields that have to be
7528 filled in later when the ends of the brackets are reached. */
7529
7530 linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink);
7531 bralink = previous;
7532 PUTINC(previous, 0, linkoffset);
7533 }
7534
7535 if (repeat_max != REPEAT_UNLIMITED) repeat_max--;
7536 }
7537
7538 /* If the minimum is greater than zero, replicate the group as many
7539 times as necessary, and adjust the maximum to the number of subsequent
7540 copies that we need. */
7541
7542 else
7543 {
7544 if (repeat_min > 1)
7545 {
7546 /* In the pre-compile phase, we don't actually do the replication.
7547 We just adjust the length as if we had. Do some paranoid checks for
7548 potential integer overflow. */
7549
7550 if (lengthptr != NULL)
7551 {
7552 PCRE2_SIZE delta;
7553 if (PRIV(ckd_smul)(&delta, repeat_min - 1,
7554 (int)length_prevgroup) ||
7555 OFLOW_MAX - *lengthptr < delta)
7556 {
7557 *errorcodeptr = ERR20;
7558 return 0;
7559 }
7560 *lengthptr += delta;
7561 }
7562
7563 /* This is compiling for real. If there is a set first code unit
7564 for the group, and we have not yet set a "required code unit", set
7565 it. */
7566
7567 else
7568 {
7569 if (groupsetfirstcu && reqcuflags >= REQ_NONE)
7570 {
7571 reqcu = firstcu;
7572 reqcuflags = firstcuflags;
7573 }
7574 for (uint32_t i = 1; i < repeat_min; i++)
7575 {
7576 memcpy(code, previous, CU2BYTES(len));
7577 code += len;
7578 }
7579 }
7580 }
7581
7582 if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7583 }
7584
7585 /* This code is common to both the zero and non-zero minimum cases. If
7586 the maximum is limited, it replicates the group in a nested fashion,
7587 remembering the bracket starts on a stack. In the case of a zero
7588 minimum, the first one was set up above. In all cases the repeat_max
7589 now specifies the number of additional copies needed. Again, we must
7590 remember to replicate entries on the forward reference list. */
7591
7592 if (repeat_max != REPEAT_UNLIMITED)
7593 {
7594 /* In the pre-compile phase, we don't actually do the replication. We
7595 just adjust the length as if we had. For each repetition we must add
7596 1 to the length for BRAZERO and for all but the last repetition we
7597 must add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
7598 paranoid checks to avoid integer overflow. */
7599
7600 if (lengthptr != NULL && repeat_max > 0)
7601 {
7602 PCRE2_SIZE delta;
7603 if (PRIV(ckd_smul)(&delta, repeat_max,
7604 (int)length_prevgroup + 1 + 2 + 2*LINK_SIZE) ||
7605 OFLOW_MAX + (2 + 2*LINK_SIZE) - *lengthptr < delta)
7606 {
7607 *errorcodeptr = ERR20;
7608 return 0;
7609 }
7610 delta -= (2 + 2*LINK_SIZE); /* Last one doesn't nest */
7611 *lengthptr += delta;
7612 }
7613
7614 /* This is compiling for real */
7615
7616 else for (uint32_t i = repeat_max; i >= 1; i--)
7617 {
7618 *code++ = OP_BRAZERO + repeat_type;
7619
7620 /* All but the final copy start a new nesting, maintaining the
7621 chain of brackets outstanding. */
7622
7623 if (i != 1)
7624 {
7625 int linkoffset;
7626 *code++ = OP_BRA;
7627 linkoffset = (bralink == NULL)? 0 : (int)(code - bralink);
7628 bralink = code;
7629 PUTINC(code, 0, linkoffset);
7630 }
7631
7632 memcpy(code, previous, CU2BYTES(len));
7633 code += len;
7634 }
7635
7636 /* Now chain through the pending brackets, and fill in their length
7637 fields (which are holding the chain links pro tem). */
7638
7639 while (bralink != NULL)
7640 {
7641 int oldlinkoffset;
7642 int linkoffset = (int)(code - bralink + 1);
7643 PCRE2_UCHAR *bra = code - linkoffset;
7644 oldlinkoffset = GET(bra, 1);
7645 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
7646 *code++ = OP_KET;
7647 PUTINC(code, 0, linkoffset);
7648 PUT(bra, 1, linkoffset);
7649 }
7650 }
7651
7652 /* If the maximum is unlimited, set a repeater in the final copy. For
7653 SCRIPT_RUN and ONCE brackets, that's all we need to do. However,
7654 possessively repeated ONCE brackets can be converted into non-capturing
7655 brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this
7656 saves having to deal with possessive ONCEs specially.
7657
7658 Otherwise, when we are doing the actual compile phase, check to see
7659 whether this group is one that could match an empty string. If so,
7660 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
7661 that runtime checking can be done. [This check is also applied to ONCE
7662 and SCRIPT_RUN groups at runtime, but in a different way.]
7663
7664 Then, if the quantifier was possessive and the bracket is not a
7665 conditional, we convert the BRA code to the POS form, and the KET code
7666 to KETRPOS. (It turns out to be convenient at runtime to detect this
7667 kind of subpattern at both the start and at the end.) The use of
7668 special opcodes makes it possible to reduce greatly the stack usage in
7669 pcre2_match(). If the group is preceded by OP_BRAZERO, convert this to
7670 OP_BRAPOSZERO.
7671
7672 Then, if the minimum number of matches is 1 or 0, cancel the possessive
7673 flag so that the default action below, of wrapping everything inside
7674 atomic brackets, does not happen. When the minimum is greater than 1,
7675 there will be earlier copies of the group, and so we still have to wrap
7676 the whole thing. */
7677
7678 else
7679 {
7680 PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
7681 PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
7682
7683 /* Convert possessive ONCE brackets to non-capturing */
7684
7685 if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
7686
7687 /* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need
7688 to do is to set the KET. */
7689
7690 if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN)
7691 *ketcode = OP_KETRMAX + repeat_type;
7692
7693 /* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs
7694 (which have been converted to non-capturing above). */
7695
7696 else
7697 {
7698 /* In the compile phase, adjust the opcode if the group can match
7699 an empty string. For a conditional group with only one branch, the
7700 value of group_return will not show "could be empty", so we must
7701 check that separately. */
7702
7703 if (lengthptr == NULL)
7704 {
7705 if (group_return < 0) *bracode += OP_SBRA - OP_BRA;
7706 if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
7707 *bracode = OP_SCOND;
7708 }
7709
7710 /* Handle possessive quantifiers. */
7711
7712 if (possessive_quantifier)
7713 {
7714 /* For COND brackets, we wrap the whole thing in a possessively
7715 repeated non-capturing bracket, because we have not invented POS
7716 versions of the COND opcodes. */
7717
7718 if (*bracode == OP_COND || *bracode == OP_SCOND)
7719 {
7720 int nlen = (int)(code - bracode);
7721 (void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
7722 code += 1 + LINK_SIZE;
7723 nlen += 1 + LINK_SIZE;
7724 *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
7725 *code++ = OP_KETRPOS;
7726 PUTINC(code, 0, nlen);
7727 PUT(bracode, 1, nlen);
7728 }
7729
7730 /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
7731
7732 else
7733 {
7734 *bracode += 1; /* Switch to xxxPOS opcodes */
7735 *ketcode = OP_KETRPOS;
7736 }
7737
7738 /* If the minimum is zero, mark it as possessive, then unset the
7739 possessive flag when the minimum is 0 or 1. */
7740
7741 if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
7742 if (repeat_min < 2) possessive_quantifier = FALSE;
7743 }
7744
7745 /* Non-possessive quantifier */
7746
7747 else *ketcode = OP_KETRMAX + repeat_type;
7748 }
7749 }
7750 }
7751 break;
7752
7753 /* If previous was a character type match (\d or similar), abolish it and
7754 create a suitable repeat item. The code is shared with single-character
7755 repeats by setting op_type to add a suitable offset into repeat_type.
7756 Note that the Unicode property types will be present only when
7757 SUPPORT_UNICODE is defined, but we don't wrap the little bits of code
7758 here because it just makes it horribly messy. */
7759
7760 default:
7761 if (op_previous >= OP_EODN) /* Not a character type - internal error */
7762 {
7763 *errorcodeptr = ERR10;
7764 return 0;
7765 }
7766 else
7767 {
7768 int prop_type, prop_value;
7769 PCRE2_UCHAR *oldcode;
7770
7771 if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7772
7773 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
7774 mclength = 0; /* Not a character */
7775
7776 if (op_previous == OP_PROP || op_previous == OP_NOTPROP)
7777 {
7778 prop_type = previous[1];
7779 prop_value = previous[2];
7780 }
7781 else
7782 {
7783 /* Come here from just above with a character in mcbuffer/mclength. */
7784 OUTPUT_SINGLE_REPEAT:
7785 prop_type = prop_value = -1;
7786 }
7787
7788 /* At this point, if prop_type == prop_value == -1 we either have a
7789 character in mcbuffer when mclength is greater than zero, or we have
7790 mclength zero, in which case there is a non-property character type in
7791 op_previous. If prop_type/value are not negative, we have a property
7792 character type in op_previous. */
7793
7794 oldcode = code; /* Save where we were */
7795 code = previous; /* Usually overwrite previous item */
7796
7797 /* If the maximum is zero then the minimum must also be zero; Perl allows
7798 this case, so we do too - by simply omitting the item altogether. */
7799
7800 if (repeat_max == 0) goto END_REPEAT;
7801
7802 /* Combine the op_type with the repeat_type */
7803
7804 repeat_type += op_type;
7805
7806 /* A minimum of zero is handled either as the special case * or ?, or as
7807 an UPTO, with the maximum given. */
7808
7809 if (repeat_min == 0)
7810 {
7811 if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;
7812 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
7813 else
7814 {
7815 *code++ = OP_UPTO + repeat_type;
7816 PUT2INC(code, 0, repeat_max);
7817 }
7818 }
7819
7820 /* A repeat minimum of 1 is optimized into some special cases. If the
7821 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
7822 left in place and, if the maximum is greater than 1, we use OP_UPTO with
7823 one less than the maximum. */
7824
7825 else if (repeat_min == 1)
7826 {
7827 if (repeat_max == REPEAT_UNLIMITED)
7828 *code++ = OP_PLUS + repeat_type;
7829 else
7830 {
7831 code = oldcode; /* Leave previous item in place */
7832 if (repeat_max == 1) goto END_REPEAT;
7833 *code++ = OP_UPTO + repeat_type;
7834 PUT2INC(code, 0, repeat_max - 1);
7835 }
7836 }
7837
7838 /* The case {n,n} is just an EXACT, while the general case {n,m} is
7839 handled as an EXACT followed by an UPTO or STAR or QUERY. */
7840
7841 else
7842 {
7843 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
7844 PUT2INC(code, 0, repeat_min);
7845
7846 /* Unless repeat_max equals repeat_min, fill in the data for EXACT,
7847 and then generate the second opcode. For a repeated Unicode property
7848 match, there are two extra values that define the required property,
7849 and mclength is set zero to indicate this. */
7850
7851 if (repeat_max != repeat_min)
7852 {
7853 if (mclength > 0)
7854 {
7855 memcpy(code, mcbuffer, CU2BYTES(mclength));
7856 code += mclength;
7857 }
7858 else
7859 {
7860 *code++ = op_previous;
7861 if (prop_type >= 0)
7862 {
7863 *code++ = prop_type;
7864 *code++ = prop_value;
7865 }
7866 }
7867
7868 /* Now set up the following opcode */
7869
7870 if (repeat_max == REPEAT_UNLIMITED)
7871 *code++ = OP_STAR + repeat_type;
7872 else
7873 {
7874 repeat_max -= repeat_min;
7875 if (repeat_max == 1)
7876 {
7877 *code++ = OP_QUERY + repeat_type;
7878 }
7879 else
7880 {
7881 *code++ = OP_UPTO + repeat_type;
7882 PUT2INC(code, 0, repeat_max);
7883 }
7884 }
7885 }
7886 }
7887
7888 /* Fill in the character or character type for the final opcode. */
7889
7890 if (mclength > 0)
7891 {
7892 memcpy(code, mcbuffer, CU2BYTES(mclength));
7893 code += mclength;
7894 }
7895 else
7896 {
7897 *code++ = op_previous;
7898 if (prop_type >= 0)
7899 {
7900 *code++ = prop_type;
7901 *code++ = prop_value;
7902 }
7903 }
7904 }
7905 break;
7906 } /* End of switch on different op_previous values */
7907
7908
7909 /* If the character following a repeat is '+', possessive_quantifier is
7910 TRUE. For some opcodes, there are special alternative opcodes for this
7911 case. For anything else, we wrap the entire repeated item inside OP_ONCE
7912 brackets. Logically, the '+' notation is just syntactic sugar, taken from
7913 Sun's Java package, but the special opcodes can optimize it.
7914
7915 Some (but not all) possessively repeated subpatterns have already been
7916 completely handled in the code just above. For them, possessive_quantifier
7917 is always FALSE at this stage. Note that the repeated item starts at
7918 tempcode, not at previous, which might be the first part of a string whose
7919 (former) last char we repeated. */
7920
7921 if (possessive_quantifier)
7922 {
7923 int len;
7924
7925 /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
7926 However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
7927 {5,}, or {5,10}). We skip over an EXACT item; if the length of what
7928 remains is greater than zero, there's a further opcode that can be
7929 handled. If not, do nothing, leaving the EXACT alone. */
7930
7931 switch(*tempcode)
7932 {
7933 case OP_TYPEEXACT:
7934 tempcode += PRIV(OP_lengths)[*tempcode] +
7935 ((tempcode[1 + IMM2_SIZE] == OP_PROP
7936 || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
7937 break;
7938
7939 /* CHAR opcodes are used for exacts whose count is 1. */
7940
7941 case OP_CHAR:
7942 case OP_CHARI:
7943 case OP_NOT:
7944 case OP_NOTI:
7945 case OP_EXACT:
7946 case OP_EXACTI:
7947 case OP_NOTEXACT:
7948 case OP_NOTEXACTI:
7949 tempcode += PRIV(OP_lengths)[*tempcode];
7950 #ifdef SUPPORT_UNICODE
7951 if (utf && HAS_EXTRALEN(tempcode[-1]))
7952 tempcode += GET_EXTRALEN(tempcode[-1]);
7953 #endif
7954 break;
7955
7956 /* For the class opcodes, the repeat operator appears at the end;
7957 adjust tempcode to point to it. */
7958
7959 case OP_CLASS:
7960 case OP_NCLASS:
7961 tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
7962 break;
7963
7964 #ifdef SUPPORT_WIDE_CHARS
7965 case OP_XCLASS:
7966 tempcode += GET(tempcode, 1);
7967 break;
7968 #endif
7969 }
7970
7971 /* If tempcode is equal to code (which points to the end of the repeated
7972 item), it means we have skipped an EXACT item but there is no following
7973 QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
7974 all other cases, tempcode will be pointing to the repeat opcode, and will
7975 be less than code, so the value of len will be greater than 0. */
7976
7977 len = (int)(code - tempcode);
7978 if (len > 0)
7979 {
7980 unsigned int repcode = *tempcode;
7981
7982 /* There is a table for possessifying opcodes, all of which are less
7983 than OP_CALLOUT. A zero entry means there is no possessified version.
7984 */
7985
7986 if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
7987 *tempcode = opcode_possessify[repcode];
7988
7989 /* For opcodes without a special possessified version, wrap the item in
7990 ONCE brackets. */
7991
7992 else
7993 {
7994 (void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
7995 code += 1 + LINK_SIZE;
7996 len += 1 + LINK_SIZE;
7997 tempcode[0] = OP_ONCE;
7998 *code++ = OP_KET;
7999 PUTINC(code, 0, len);
8000 PUT(tempcode, 1, len);
8001 }
8002 }
8003 }
8004
8005 /* We set the "follows varying string" flag for subsequently encountered
8006 reqcus if it isn't already set and we have just passed a varying length
8007 item. */
8008
8009 END_REPEAT:
8010 cb->req_varyopt |= reqvary;
8011 break;
8012
8013
8014 /* ===================================================================*/
8015 /* Handle a 32-bit data character with a value greater than META_END. */
8016
8017 case META_BIGVALUE:
8018 pptr++;
8019 goto NORMAL_CHAR;
8020
8021
8022 /* ===============================================================*/
8023 /* Handle a back reference by number, which is the meta argument. The
8024 pattern offsets for back references to group numbers less than 10 are held
8025 in a special vector, to avoid using more than two parsed pattern elements
8026 in 64-bit environments. We only need the offset to the first occurrence,
8027 because if that doesn't fail, subsequent ones will also be OK. */
8028
8029 case META_BACKREF:
8030 if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg];
8031 else GETPLUSOFFSET(offset, pptr);
8032
8033 if (meta_arg > cb->bracount)
8034 {
8035 cb->erroroffset = offset;
8036 *errorcodeptr = ERR15; /* Non-existent subpattern */
8037 return 0;
8038 }
8039
8040 /* Come here from named backref handling when the reference is to a
8041 single group (that is, not to a duplicated name). The back reference
8042 data will have already been updated. We must disable firstcu if not
8043 set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'
8044 later. */
8045
8046 HANDLE_SINGLE_REFERENCE:
8047 if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
8048 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
8049 PUT2INC(code, 0, meta_arg);
8050
8051 /* Update the map of back references, and keep the highest one. We
8052 could do this in parse_regex() for numerical back references, but not
8053 for named back references, because we don't know the numbers to which
8054 named back references refer. So we do it all in this function. */
8055
8056 cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;
8057 if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
8058 break;
8059
8060
8061 /* ===============================================================*/
8062 /* Handle recursion by inserting the number of the called group (which is
8063 the meta argument) after OP_RECURSE. At the end of compiling, the pattern is
8064 scanned and these numbers are replaced by offsets within the pattern. It is
8065 done like this to avoid problems with forward references and adjusting
8066 offsets when groups are duplicated and moved (as discovered in previous
8067 implementations). Note that a recursion does not have a set first
8068 character. */
8069
8070 case META_RECURSE:
8071 GETPLUSOFFSET(offset, pptr);
8072 if (meta_arg > cb->bracount)
8073 {
8074 cb->erroroffset = offset;
8075 *errorcodeptr = ERR15; /* Non-existent subpattern */
8076 return 0;
8077 }
8078 HANDLE_NUMERICAL_RECURSION:
8079 *code = OP_RECURSE;
8080 PUT(code, 1, meta_arg);
8081 code += 1 + LINK_SIZE;
8082 groupsetfirstcu = FALSE;
8083 cb->had_recurse = TRUE;
8084 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
8085 zerofirstcu = firstcu;
8086 zerofirstcuflags = firstcuflags;
8087 break;
8088
8089
8090 /* ===============================================================*/
8091 /* Handle capturing parentheses; the number is the meta argument. */
8092
8093 case META_CAPTURE:
8094 bravalue = OP_CBRA;
8095 skipunits = IMM2_SIZE;
8096 PUT2(code, 1+LINK_SIZE, meta_arg);
8097 cb->lastcapture = meta_arg;
8098 goto GROUP_PROCESS_NOTE_EMPTY;
8099
8100
8101 /* ===============================================================*/
8102 /* Handle escape sequence items. For ones like \d, the ESC_values are
8103 arranged to be the same as the corresponding OP_values in the default case
8104 when PCRE2_UCP is not set (which is the only case in which they will appear
8105 here).
8106
8107 Note: \Q and \E are never seen here, as they were dealt with in
8108 parse_regex(). Neither are numerical back references or recursions, which
8109 were turned into META_BACKREF or META_RECURSE items, respectively. \k and
8110 \g, when followed by names, are turned into META_BACKREF_BYNAME or
8111 META_RECURSE_BYNAME. */
8112
8113 case META_ESCAPE:
8114
8115 /* We can test for escape sequences that consume a character because their
8116 values lie between ESC_b and ESC_Z; this may have to change if any new ones
8117 are ever created. For these sequences, we disable the setting of a first
8118 character if it hasn't already been set. */
8119
8120 if (meta_arg > ESC_b && meta_arg < ESC_Z)
8121 {
8122 matched_char = TRUE;
8123 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
8124 }
8125
8126 /* Set values to reset to if this is followed by a zero repeat. */
8127
8128 zerofirstcu = firstcu;
8129 zerofirstcuflags = firstcuflags;
8130 zeroreqcu = reqcu;
8131 zeroreqcuflags = reqcuflags;
8132
8133 /* If Unicode is not supported, \P and \p are not allowed and are
8134 faulted at parse time, so will never appear here. */
8135
8136 #ifdef SUPPORT_UNICODE
8137 if (meta_arg == ESC_P || meta_arg == ESC_p)
8138 {
8139 uint32_t ptype = *(++pptr) >> 16;
8140 uint32_t pdata = *pptr & 0xffff;
8141
8142 /* The special case of \p{Any} is compiled to OP_ALLANY so as to benefit
8143 from the auto-anchoring code. */
8144
8145 if (meta_arg == ESC_p && ptype == PT_ANY)
8146 {
8147 *code++ = OP_ALLANY;
8148 }
8149 else
8150 {
8151 *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
8152 *code++ = ptype;
8153 *code++ = pdata;
8154 }
8155 break; /* End META_ESCAPE */
8156 }
8157 #endif
8158
8159 /* \K is forbidden in lookarounds since 10.38 because that's what Perl has
8160 done. However, there's an option, in case anyone was relying on it. */
8161
8162 if (cb->assert_depth > 0 && meta_arg == ESC_K &&
8163 (xoptions & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) == 0)
8164 {
8165 *errorcodeptr = ERR99;
8166 return 0;
8167 }
8168
8169 /* For the rest (including \X when Unicode is supported - if not it's
8170 faulted at parse time), the OP value is the escape value when PCRE2_UCP is
8171 not set; if it is set, most of them do not show up here because they are
8172 converted into Unicode property tests in parse_regex().
8173
8174 In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY
8175 instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds.
8176 There are special UCP codes for \B and \b which are used in UCP mode unless
8177 "word" matching is being forced to ASCII.
8178
8179 Note that \b and \B do a one-character lookbehind, and \A also behaves as
8180 if it does. */
8181
8182 switch(meta_arg)
8183 {
8184 case ESC_C:
8185 cb->external_flags |= PCRE2_HASBKC; /* Record */
8186 #if PCRE2_CODE_UNIT_WIDTH == 32
8187 meta_arg = OP_ALLANY;
8188 #else
8189 if (!utf) meta_arg = OP_ALLANY;
8190 #endif
8191 break;
8192
8193 case ESC_B:
8194 case ESC_b:
8195 if ((options & PCRE2_UCP) != 0 && (xoptions & PCRE2_EXTRA_ASCII_BSW) == 0)
8196 meta_arg = (meta_arg == ESC_B)? OP_NOT_UCP_WORD_BOUNDARY :
8197 OP_UCP_WORD_BOUNDARY;
8198 /* Fall through */
8199
8200 case ESC_A:
8201 if (cb->max_lookbehind == 0) cb->max_lookbehind = 1;
8202 break;
8203 }
8204
8205 *code++ = meta_arg;
8206 break; /* End META_ESCAPE */
8207
8208
8209 /* ===================================================================*/
8210 /* Handle an unrecognized meta value. A parsed pattern value less than
8211 META_END is a literal. Otherwise we have a problem. */
8212
8213 default:
8214 if (meta >= META_END)
8215 {
8216 #ifdef DEBUG_SHOW_PARSED
8217 fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x\n", *pptr);
8218 #endif
8219 *errorcodeptr = ERR89; /* Internal error - unrecognized. */
8220 return 0;
8221 }
8222
8223 /* Handle a literal character. We come here by goto in the case of a
8224 32-bit, non-UTF character whose value is greater than META_END. */
8225
8226 NORMAL_CHAR:
8227 meta = *pptr; /* Get the full 32 bits */
8228 NORMAL_CHAR_SET: /* Character is already in meta */
8229 matched_char = TRUE;
8230
8231 /* For caseless UTF or UCP mode, check whether this character has more than
8232 one other case. If so, generate a special OP_PROP item instead of OP_CHARI.
8233 When casing restrictions apply, ignore caseless sets that start with an
8234 ASCII character. */
8235
8236 #ifdef SUPPORT_UNICODE
8237 if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
8238 {
8239 uint32_t caseset = UCD_CASESET(meta);
8240 if (caseset != 0 &&
8241 ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) == 0 ||
8242 PRIV(ucd_caseless_sets)[caseset] > 127))
8243 {
8244 *code++ = OP_PROP;
8245 *code++ = PT_CLIST;
8246 *code++ = caseset;
8247 if (firstcuflags == REQ_UNSET)
8248 firstcuflags = zerofirstcuflags = REQ_NONE;
8249 break; /* End handling this meta item */
8250 }
8251 }
8252 #endif
8253
8254 /* Caseful matches, or caseless and not one of the multicase characters. We
8255 come here by goto in the case of a positive class that contains only
8256 case-partners of a character with just two cases; matched_char has already
8257 been set TRUE and options fudged if necessary. */
8258
8259 CLASS_CASELESS_CHAR:
8260
8261 /* Get the character's code units into mcbuffer, with the length in
8262 mclength. When not in UTF mode, the length is always 1. */
8263
8264 #ifdef SUPPORT_UNICODE
8265 if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
8266 #endif
8267 {
8268 mclength = 1;
8269 mcbuffer[0] = meta;
8270 }
8271
8272 /* Generate the appropriate code */
8273
8274 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
8275 memcpy(code, mcbuffer, CU2BYTES(mclength));
8276 code += mclength;
8277
8278 /* Remember if \r or \n were seen */
8279
8280 if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
8281 cb->external_flags |= PCRE2_HASCRORLF;
8282
8283 /* Set the first and required code units appropriately. If no previous
8284 first code unit, set it from this character, but revert to none on a zero
8285 repeat. Otherwise, leave the firstcu value alone, and don't change it on
8286 a zero repeat. */
8287
8288 if (firstcuflags == REQ_UNSET)
8289 {
8290 zerofirstcuflags = REQ_NONE;
8291 zeroreqcu = reqcu;
8292 zeroreqcuflags = reqcuflags;
8293
8294 /* If the character is more than one code unit long, we can set a single
8295 firstcu only if it is not to be matched caselessly. Multiple possible
8296 starting code units may be picked up later in the studying code. */
8297
8298 if (mclength == 1 || req_caseopt == 0)
8299 {
8300 firstcu = mcbuffer[0];
8301 firstcuflags = req_caseopt;
8302 if (mclength != 1)
8303 {
8304 reqcu = code[-1];
8305 reqcuflags = cb->req_varyopt;
8306 }
8307 }
8308 else firstcuflags = reqcuflags = REQ_NONE;
8309 }
8310
8311 /* firstcu was previously set; we can set reqcu only if the length is
8312 1 or the matching is caseful. */
8313
8314 else
8315 {
8316 zerofirstcu = firstcu;
8317 zerofirstcuflags = firstcuflags;
8318 zeroreqcu = reqcu;
8319 zeroreqcuflags = reqcuflags;
8320 if (mclength == 1 || req_caseopt == 0)
8321 {
8322 reqcu = code[-1];
8323 reqcuflags = req_caseopt | cb->req_varyopt;
8324 }
8325 }
8326
8327 /* If caselessness was temporarily instated, reset it. */
8328
8329 if (reset_caseful)
8330 {
8331 options &= ~PCRE2_CASELESS;
8332 req_caseopt = 0;
8333 reset_caseful = FALSE;
8334 }
8335
8336 break; /* End literal character handling */
8337 } /* End of big switch */
8338 } /* End of big loop */
8339
8340 /* Control never reaches here. */
8341 }
8342
8343
8344
8345 /*************************************************
8346 * Compile regex: a sequence of alternatives *
8347 *************************************************/
8348
8349 /* On entry, pptr is pointing past the bracket meta, but on return it points to
8350 the closing bracket or META_END. The code variable is pointing at the code unit
8351 into which the BRA operator has been stored. This function is used during the
8352 pre-compile phase when we are trying to find out the amount of memory needed,
8353 as well as during the real compile phase. The value of lengthptr distinguishes
8354 the two phases.
8355
8356 Arguments:
8357 options option bits, including any changes for this subpattern
8358 xoptions extra option bits, ditto
8359 codeptr -> the address of the current code pointer
8360 pptrptr -> the address of the current parsed pattern pointer
8361 errorcodeptr -> pointer to error code variable
8362 skipunits skip this many code units at start (for brackets and OP_COND)
8363 firstcuptr place to put the first required code unit
8364 firstcuflagsptr place to put the first code unit flags
8365 reqcuptr place to put the last required code unit
8366 reqcuflagsptr place to put the last required code unit flags
8367 bcptr pointer to the chain of currently open branches
8368 cb points to the data block with tables pointers etc.
8369 lengthptr NULL during the real compile phase
8370 points to length accumulator during pre-compile phase
8371
8372 Returns: 0 There has been an error
8373 +1 Success, this group must match at least one character
8374 -1 Success, this group may match an empty string
8375 */
8376
8377 static int
compile_regex(uint32_t options,uint32_t xoptions,PCRE2_UCHAR ** codeptr,uint32_t ** pptrptr,int * errorcodeptr,uint32_t skipunits,uint32_t * firstcuptr,uint32_t * firstcuflagsptr,uint32_t * reqcuptr,uint32_t * reqcuflagsptr,branch_chain * bcptr,open_capitem * open_caps,compile_block * cb,PCRE2_SIZE * lengthptr)8378 compile_regex(uint32_t options, uint32_t xoptions, PCRE2_UCHAR **codeptr,
8379 uint32_t **pptrptr, int *errorcodeptr, uint32_t skipunits,
8380 uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
8381 uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
8382 compile_block *cb, PCRE2_SIZE *lengthptr)
8383 {
8384 PCRE2_UCHAR *code = *codeptr;
8385 PCRE2_UCHAR *last_branch = code;
8386 PCRE2_UCHAR *start_bracket = code;
8387 BOOL lookbehind;
8388 open_capitem capitem;
8389 int capnumber = 0;
8390 int okreturn = 1;
8391 uint32_t *pptr = *pptrptr;
8392 uint32_t firstcu, reqcu;
8393 uint32_t lookbehindlength;
8394 uint32_t lookbehindminlength;
8395 uint32_t firstcuflags, reqcuflags;
8396 uint32_t branchfirstcu, branchreqcu;
8397 uint32_t branchfirstcuflags, branchreqcuflags;
8398 PCRE2_SIZE length;
8399 branch_chain bc;
8400
8401 /* If set, call the external function that checks for stack availability. */
8402
8403 if (cb->cx->stack_guard != NULL &&
8404 cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
8405 {
8406 *errorcodeptr= ERR33;
8407 return 0;
8408 }
8409
8410 /* Miscellaneous initialization */
8411
8412 bc.outer = bcptr;
8413 bc.current_branch = code;
8414
8415 firstcu = reqcu = 0;
8416 firstcuflags = reqcuflags = REQ_UNSET;
8417
8418 /* Accumulate the length for use in the pre-compile phase. Start with the
8419 length of the BRA and KET and any extra code units that are required at the
8420 beginning. We accumulate in a local variable to save frequent testing of
8421 lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
8422 start and end of each alternative, because compiled items are discarded during
8423 the pre-compile phase so that the workspace is not exceeded. */
8424
8425 length = 2 + 2*LINK_SIZE + skipunits;
8426
8427 /* Remember if this is a lookbehind assertion, and if it is, save its length
8428 and skip over the pattern offset. */
8429
8430 lookbehind = *code == OP_ASSERTBACK ||
8431 *code == OP_ASSERTBACK_NOT ||
8432 *code == OP_ASSERTBACK_NA;
8433
8434 if (lookbehind)
8435 {
8436 lookbehindlength = META_DATA(pptr[-1]);
8437 lookbehindminlength = *pptr;
8438 pptr += SIZEOFFSET;
8439 }
8440 else lookbehindlength = lookbehindminlength = 0;
8441
8442 /* If this is a capturing subpattern, add to the chain of open capturing items
8443 so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
8444 need be tested here; changing this opcode to one of its variants, e.g.
8445 OP_SCBRAPOS, happens later, after the group has been compiled. */
8446
8447 if (*code == OP_CBRA)
8448 {
8449 capnumber = GET2(code, 1 + LINK_SIZE);
8450 capitem.number = capnumber;
8451 capitem.next = open_caps;
8452 capitem.assert_depth = cb->assert_depth;
8453 open_caps = &capitem;
8454 }
8455
8456 /* Offset is set zero to mark that this bracket is still open */
8457
8458 PUT(code, 1, 0);
8459 code += 1 + LINK_SIZE + skipunits;
8460
8461 /* Loop for each alternative branch */
8462
8463 for (;;)
8464 {
8465 int branch_return;
8466
8467 /* Insert OP_REVERSE or OP_VREVERSE if this is a lookbehind assertion. There
8468 is only a single minimum length for the whole assertion. When the minimum
8469 length is LOOKBEHIND_MAX it means that all branches are of fixed length,
8470 though not necessarily the same length. In this case, the original OP_REVERSE
8471 can be used. It can also be used if a branch in a variable length lookbehind
8472 has the same maximum and minimum. Otherwise, use OP_VREVERSE, which has both
8473 maximum and minimum values. */
8474
8475 if (lookbehind && lookbehindlength > 0)
8476 {
8477 if (lookbehindminlength == LOOKBEHIND_MAX ||
8478 lookbehindminlength == lookbehindlength)
8479 {
8480 *code++ = OP_REVERSE;
8481 PUT2INC(code, 0, lookbehindlength);
8482 length += 1 + IMM2_SIZE;
8483 }
8484 else
8485 {
8486 *code++ = OP_VREVERSE;
8487 PUT2INC(code, 0, lookbehindminlength);
8488 PUT2INC(code, 0, lookbehindlength);
8489 length += 1 + 2*IMM2_SIZE;
8490 }
8491 }
8492
8493 /* Now compile the branch; in the pre-compile phase its length gets added
8494 into the length. */
8495
8496 if ((branch_return =
8497 compile_branch(&options, &xoptions, &code, &pptr, errorcodeptr,
8498 &branchfirstcu, &branchfirstcuflags, &branchreqcu, &branchreqcuflags,
8499 &bc, open_caps, cb, (lengthptr == NULL)? NULL : &length)) == 0)
8500 return 0;
8501
8502 /* If a branch can match an empty string, so can the whole group. */
8503
8504 if (branch_return < 0) okreturn = -1;
8505
8506 /* In the real compile phase, there is some post-processing to be done. */
8507
8508 if (lengthptr == NULL)
8509 {
8510 /* If this is the first branch, the firstcu and reqcu values for the
8511 branch become the values for the regex. */
8512
8513 if (*last_branch != OP_ALT)
8514 {
8515 firstcu = branchfirstcu;
8516 firstcuflags = branchfirstcuflags;
8517 reqcu = branchreqcu;
8518 reqcuflags = branchreqcuflags;
8519 }
8520
8521 /* If this is not the first branch, the first char and reqcu have to
8522 match the values from all the previous branches, except that if the
8523 previous value for reqcu didn't have REQ_VARY set, it can still match,
8524 and we set REQ_VARY for the group from this branch's value. */
8525
8526 else
8527 {
8528 /* If we previously had a firstcu, but it doesn't match the new branch,
8529 we have to abandon the firstcu for the regex, but if there was
8530 previously no reqcu, it takes on the value of the old firstcu. */
8531
8532 if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
8533 {
8534 if (firstcuflags < REQ_NONE)
8535 {
8536 if (reqcuflags >= REQ_NONE)
8537 {
8538 reqcu = firstcu;
8539 reqcuflags = firstcuflags;
8540 }
8541 }
8542 firstcuflags = REQ_NONE;
8543 }
8544
8545 /* If we (now or from before) have no firstcu, a firstcu from the
8546 branch becomes a reqcu if there isn't a branch reqcu. */
8547
8548 if (firstcuflags >= REQ_NONE && branchfirstcuflags < REQ_NONE &&
8549 branchreqcuflags >= REQ_NONE)
8550 {
8551 branchreqcu = branchfirstcu;
8552 branchreqcuflags = branchfirstcuflags;
8553 }
8554
8555 /* Now ensure that the reqcus match */
8556
8557 if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
8558 reqcu != branchreqcu)
8559 reqcuflags = REQ_NONE;
8560 else
8561 {
8562 reqcu = branchreqcu;
8563 reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY if present */
8564 }
8565 }
8566 }
8567
8568 /* Handle reaching the end of the expression, either ')' or end of pattern.
8569 In the real compile phase, go back through the alternative branches and
8570 reverse the chain of offsets, with the field in the BRA item now becoming an
8571 offset to the first alternative. If there are no alternatives, it points to
8572 the end of the group. The length in the terminating ket is always the length
8573 of the whole bracketed item. Return leaving the pointer at the terminating
8574 char. */
8575
8576 if (META_CODE(*pptr) != META_ALT)
8577 {
8578 if (lengthptr == NULL)
8579 {
8580 PCRE2_SIZE branch_length = code - last_branch;
8581 do
8582 {
8583 PCRE2_SIZE prev_length = GET(last_branch, 1);
8584 PUT(last_branch, 1, branch_length);
8585 branch_length = prev_length;
8586 last_branch -= branch_length;
8587 }
8588 while (branch_length > 0);
8589 }
8590
8591 /* Fill in the ket */
8592
8593 *code = OP_KET;
8594 PUT(code, 1, (int)(code - start_bracket));
8595 code += 1 + LINK_SIZE;
8596
8597 /* Set values to pass back */
8598
8599 *codeptr = code;
8600 *pptrptr = pptr;
8601 *firstcuptr = firstcu;
8602 *firstcuflagsptr = firstcuflags;
8603 *reqcuptr = reqcu;
8604 *reqcuflagsptr = reqcuflags;
8605 if (lengthptr != NULL)
8606 {
8607 if (OFLOW_MAX - *lengthptr < length)
8608 {
8609 *errorcodeptr = ERR20;
8610 return 0;
8611 }
8612 *lengthptr += length;
8613 }
8614 return okreturn;
8615 }
8616
8617 /* Another branch follows. In the pre-compile phase, we can move the code
8618 pointer back to where it was for the start of the first branch. (That is,
8619 pretend that each branch is the only one.)
8620
8621 In the real compile phase, insert an ALT node. Its length field points back
8622 to the previous branch while the bracket remains open. At the end the chain
8623 is reversed. It's done like this so that the start of the bracket has a
8624 zero offset until it is closed, making it possible to detect recursion. */
8625
8626 if (lengthptr != NULL)
8627 {
8628 code = *codeptr + 1 + LINK_SIZE + skipunits;
8629 length += 1 + LINK_SIZE;
8630 }
8631 else
8632 {
8633 *code = OP_ALT;
8634 PUT(code, 1, (int)(code - last_branch));
8635 bc.current_branch = last_branch = code;
8636 code += 1 + LINK_SIZE;
8637 }
8638
8639 /* Set the maximum lookbehind length for the next branch (if not in a
8640 lookbehind the value will be zero) and then advance past the vertical bar. */
8641
8642 lookbehindlength = META_DATA(*pptr);
8643 pptr++;
8644 }
8645 /* Control never reaches here */
8646 }
8647
8648
8649
8650 /*************************************************
8651 * Check for anchored pattern *
8652 *************************************************/
8653
8654 /* Try to find out if this is an anchored regular expression. Consider each
8655 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8656 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8657 it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8658 be found, because ^ generates OP_CIRCM in that mode.
8659
8660 We can also consider a regex to be anchored if OP_SOM starts all its branches.
8661 This is the code for \G, which means "match at start of match position, taking
8662 into account the match offset".
8663
8664 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8665 because that will try the rest of the pattern at all possible matching points,
8666 so there is no point trying again.... er ....
8667
8668 .... except when the .* appears inside capturing parentheses, and there is a
8669 subsequent back reference to those parentheses. We haven't enough information
8670 to catch that case precisely.
8671
8672 At first, the best we could do was to detect when .* was in capturing brackets
8673 and the highest back reference was greater than or equal to that level.
8674 However, by keeping a bitmap of the first 31 back references, we can catch some
8675 of the more common cases more precisely.
8676
8677 ... A second exception is when the .* appears inside an atomic group, because
8678 this prevents the number of characters it matches from being adjusted.
8679
8680 Arguments:
8681 code points to start of the compiled pattern
8682 bracket_map a bitmap of which brackets we are inside while testing; this
8683 handles up to substring 31; after that we just have to take
8684 the less precise approach
8685 cb points to the compile data block
8686 atomcount atomic group level
8687 inassert TRUE if in an assertion
8688
8689 Returns: TRUE or FALSE
8690 */
8691
8692 static BOOL
is_anchored(PCRE2_SPTR code,uint32_t bracket_map,compile_block * cb,int atomcount,BOOL inassert)8693 is_anchored(PCRE2_SPTR code, uint32_t bracket_map, compile_block *cb,
8694 int atomcount, BOOL inassert)
8695 {
8696 do {
8697 PCRE2_SPTR scode = first_significant_code(
8698 code + PRIV(OP_lengths)[*code], FALSE);
8699 int op = *scode;
8700
8701 /* Non-capturing brackets */
8702
8703 if (op == OP_BRA || op == OP_BRAPOS ||
8704 op == OP_SBRA || op == OP_SBRAPOS)
8705 {
8706 if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8707 return FALSE;
8708 }
8709
8710 /* Capturing brackets */
8711
8712 else if (op == OP_CBRA || op == OP_CBRAPOS ||
8713 op == OP_SCBRA || op == OP_SCBRAPOS)
8714 {
8715 int n = GET2(scode, 1+LINK_SIZE);
8716 uint32_t new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8717 if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE;
8718 }
8719
8720 /* Positive forward assertion */
8721
8722 else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8723 {
8724 if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8725 }
8726
8727 /* Condition. If there is no second branch, it can't be anchored. */
8728
8729 else if (op == OP_COND || op == OP_SCOND)
8730 {
8731 if (scode[GET(scode,1)] != OP_ALT) return FALSE;
8732 if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8733 return FALSE;
8734 }
8735
8736 /* Atomic groups */
8737
8738 else if (op == OP_ONCE)
8739 {
8740 if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert))
8741 return FALSE;
8742 }
8743
8744 /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8745 it isn't in brackets that are or may be referenced or inside an atomic
8746 group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
8747 because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
8748 with the subject "aab", which matches "b", i.e. not at the start of a line.
8749 There is also an option that disables auto-anchoring. */
8750
8751 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8752 op == OP_TYPEPOSSTAR))
8753 {
8754 if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
8755 atomcount > 0 || cb->had_pruneorskip || inassert ||
8756 (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8757 return FALSE;
8758 }
8759
8760 /* Check for explicit anchoring */
8761
8762 else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8763
8764 code += GET(code, 1);
8765 }
8766 while (*code == OP_ALT); /* Loop for each alternative */
8767 return TRUE;
8768 }
8769
8770
8771
8772 /*************************************************
8773 * Check for starting with ^ or .* *
8774 *************************************************/
8775
8776 /* This is called to find out if every branch starts with ^ or .* so that
8777 "first char" processing can be done to speed things up in multiline
8778 matching and for non-DOTALL patterns that start with .* (which must start at
8779 the beginning or after \n). As in the case of is_anchored() (see above), we
8780 have to take account of back references to capturing brackets that contain .*
8781 because in that case we can't make the assumption. Also, the appearance of .*
8782 inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
8783 or *SKIP does not count, because once again the assumption no longer holds.
8784
8785 Arguments:
8786 code points to start of the compiled pattern or a group
8787 bracket_map a bitmap of which brackets we are inside while testing; this
8788 handles up to substring 31; after that we just have to take
8789 the less precise approach
8790 cb points to the compile data
8791 atomcount atomic group level
8792 inassert TRUE if in an assertion
8793
8794 Returns: TRUE or FALSE
8795 */
8796
8797 static BOOL
is_startline(PCRE2_SPTR code,unsigned int bracket_map,compile_block * cb,int atomcount,BOOL inassert)8798 is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8799 int atomcount, BOOL inassert)
8800 {
8801 do {
8802 PCRE2_SPTR scode = first_significant_code(
8803 code + PRIV(OP_lengths)[*code], FALSE);
8804 int op = *scode;
8805
8806 /* If we are at the start of a conditional assertion group, *both* the
8807 conditional assertion *and* what follows the condition must satisfy the test
8808 for start of line. Other kinds of condition fail. Note that there may be an
8809 auto-callout at the start of a condition. */
8810
8811 if (op == OP_COND)
8812 {
8813 scode += 1 + LINK_SIZE;
8814
8815 if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8816 else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
8817
8818 switch (*scode)
8819 {
8820 case OP_CREF:
8821 case OP_DNCREF:
8822 case OP_RREF:
8823 case OP_DNRREF:
8824 case OP_FAIL:
8825 case OP_FALSE:
8826 case OP_TRUE:
8827 return FALSE;
8828
8829 default: /* Assertion */
8830 if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8831 do scode += GET(scode, 1); while (*scode == OP_ALT);
8832 scode += 1 + LINK_SIZE;
8833 break;
8834 }
8835 scode = first_significant_code(scode, FALSE);
8836 op = *scode;
8837 }
8838
8839 /* Non-capturing brackets */
8840
8841 if (op == OP_BRA || op == OP_BRAPOS ||
8842 op == OP_SBRA || op == OP_SBRAPOS)
8843 {
8844 if (!is_startline(scode, bracket_map, cb, atomcount, inassert))
8845 return FALSE;
8846 }
8847
8848 /* Capturing brackets */
8849
8850 else if (op == OP_CBRA || op == OP_CBRAPOS ||
8851 op == OP_SCBRA || op == OP_SCBRAPOS)
8852 {
8853 int n = GET2(scode, 1+LINK_SIZE);
8854 unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8855 if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
8856 }
8857
8858 /* Positive forward assertions */
8859
8860 else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8861 {
8862 if (!is_startline(scode, bracket_map, cb, atomcount, TRUE))
8863 return FALSE;
8864 }
8865
8866 /* Atomic brackets */
8867
8868 else if (op == OP_ONCE)
8869 {
8870 if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert))
8871 return FALSE;
8872 }
8873
8874 /* .* means "start at start or after \n" if it isn't in atomic brackets or
8875 brackets that may be referenced or an assertion, and as long as the pattern
8876 does not contain *PRUNE or *SKIP, because these break the feature. Consider,
8877 for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
8878 i.e. not at the start of a line. There is also an option that disables this
8879 optimization. */
8880
8881 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8882 {
8883 if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
8884 atomcount > 0 || cb->had_pruneorskip || inassert ||
8885 (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8886 return FALSE;
8887 }
8888
8889 /* Check for explicit circumflex; anything else gives a FALSE result. Note
8890 in particular that this includes atomic brackets OP_ONCE because the number
8891 of characters matched by .* cannot be adjusted inside them. */
8892
8893 else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8894
8895 /* Move on to the next alternative */
8896
8897 code += GET(code, 1);
8898 }
8899 while (*code == OP_ALT); /* Loop for each alternative */
8900 return TRUE;
8901 }
8902
8903
8904
8905 /*************************************************
8906 * Scan compiled regex for recursion reference *
8907 *************************************************/
8908
8909 /* This function scans through a compiled pattern until it finds an instance of
8910 OP_RECURSE.
8911
8912 Arguments:
8913 code points to start of expression
8914 utf TRUE in UTF mode
8915
8916 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
8917 */
8918
8919 static PCRE2_SPTR
find_recurse(PCRE2_SPTR code,BOOL utf)8920 find_recurse(PCRE2_SPTR code, BOOL utf)
8921 {
8922 for (;;)
8923 {
8924 PCRE2_UCHAR c = *code;
8925 if (c == OP_END) return NULL;
8926 if (c == OP_RECURSE) return code;
8927
8928 /* XCLASS is used for classes that cannot be represented just by a bit map.
8929 This includes negated single high-valued characters. CALLOUT_STR is used for
8930 callouts with string arguments. In both cases the length in the table is
8931 zero; the actual length is stored in the compiled code. */
8932
8933 if (c == OP_XCLASS) code += GET(code, 1);
8934 else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
8935
8936 /* Otherwise, we can get the item's length from the table, except that for
8937 repeated character types, we have to test for \p and \P, which have an extra
8938 two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,
8939 we must add in its length. */
8940
8941 else
8942 {
8943 switch(c)
8944 {
8945 case OP_TYPESTAR:
8946 case OP_TYPEMINSTAR:
8947 case OP_TYPEPLUS:
8948 case OP_TYPEMINPLUS:
8949 case OP_TYPEQUERY:
8950 case OP_TYPEMINQUERY:
8951 case OP_TYPEPOSSTAR:
8952 case OP_TYPEPOSPLUS:
8953 case OP_TYPEPOSQUERY:
8954 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
8955 break;
8956
8957 case OP_TYPEPOSUPTO:
8958 case OP_TYPEUPTO:
8959 case OP_TYPEMINUPTO:
8960 case OP_TYPEEXACT:
8961 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
8962 code += 2;
8963 break;
8964
8965 case OP_MARK:
8966 case OP_COMMIT_ARG:
8967 case OP_PRUNE_ARG:
8968 case OP_SKIP_ARG:
8969 case OP_THEN_ARG:
8970 code += code[1];
8971 break;
8972 }
8973
8974 /* Add in the fixed length from the table */
8975
8976 code += PRIV(OP_lengths)[c];
8977
8978 /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
8979 be followed by a multi-unit character. The length in the table is a
8980 minimum, so we have to arrange to skip the extra units. */
8981
8982 #ifdef MAYBE_UTF_MULTI
8983 if (utf) switch(c)
8984 {
8985 case OP_CHAR:
8986 case OP_CHARI:
8987 case OP_NOT:
8988 case OP_NOTI:
8989 case OP_EXACT:
8990 case OP_EXACTI:
8991 case OP_NOTEXACT:
8992 case OP_NOTEXACTI:
8993 case OP_UPTO:
8994 case OP_UPTOI:
8995 case OP_NOTUPTO:
8996 case OP_NOTUPTOI:
8997 case OP_MINUPTO:
8998 case OP_MINUPTOI:
8999 case OP_NOTMINUPTO:
9000 case OP_NOTMINUPTOI:
9001 case OP_POSUPTO:
9002 case OP_POSUPTOI:
9003 case OP_NOTPOSUPTO:
9004 case OP_NOTPOSUPTOI:
9005 case OP_STAR:
9006 case OP_STARI:
9007 case OP_NOTSTAR:
9008 case OP_NOTSTARI:
9009 case OP_MINSTAR:
9010 case OP_MINSTARI:
9011 case OP_NOTMINSTAR:
9012 case OP_NOTMINSTARI:
9013 case OP_POSSTAR:
9014 case OP_POSSTARI:
9015 case OP_NOTPOSSTAR:
9016 case OP_NOTPOSSTARI:
9017 case OP_PLUS:
9018 case OP_PLUSI:
9019 case OP_NOTPLUS:
9020 case OP_NOTPLUSI:
9021 case OP_MINPLUS:
9022 case OP_MINPLUSI:
9023 case OP_NOTMINPLUS:
9024 case OP_NOTMINPLUSI:
9025 case OP_POSPLUS:
9026 case OP_POSPLUSI:
9027 case OP_NOTPOSPLUS:
9028 case OP_NOTPOSPLUSI:
9029 case OP_QUERY:
9030 case OP_QUERYI:
9031 case OP_NOTQUERY:
9032 case OP_NOTQUERYI:
9033 case OP_MINQUERY:
9034 case OP_MINQUERYI:
9035 case OP_NOTMINQUERY:
9036 case OP_NOTMINQUERYI:
9037 case OP_POSQUERY:
9038 case OP_POSQUERYI:
9039 case OP_NOTPOSQUERY:
9040 case OP_NOTPOSQUERYI:
9041 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
9042 break;
9043 }
9044 #else
9045 (void)(utf); /* Keep compiler happy by referencing function argument */
9046 #endif /* MAYBE_UTF_MULTI */
9047 }
9048 }
9049 }
9050
9051
9052
9053 /*************************************************
9054 * Check for asserted fixed first code unit *
9055 *************************************************/
9056
9057 /* During compilation, the "first code unit" settings from forward assertions
9058 are discarded, because they can cause conflicts with actual literals that
9059 follow. However, if we end up without a first code unit setting for an
9060 unanchored pattern, it is worth scanning the regex to see if there is an
9061 initial asserted first code unit. If all branches start with the same asserted
9062 code unit, or with a non-conditional bracket all of whose alternatives start
9063 with the same asserted code unit (recurse ad lib), then we return that code
9064 unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
9065 REQ_NONE in the flags.
9066
9067 Arguments:
9068 code points to start of compiled pattern
9069 flags points to the first code unit flags
9070 inassert non-zero if in an assertion
9071
9072 Returns: the fixed first code unit, or 0 with REQ_NONE in flags
9073 */
9074
9075 static uint32_t
find_firstassertedcu(PCRE2_SPTR code,uint32_t * flags,uint32_t inassert)9076 find_firstassertedcu(PCRE2_SPTR code, uint32_t *flags, uint32_t inassert)
9077 {
9078 uint32_t c = 0;
9079 uint32_t cflags = REQ_NONE;
9080
9081 *flags = REQ_NONE;
9082 do {
9083 uint32_t d;
9084 uint32_t dflags;
9085 int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
9086 *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
9087 PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
9088 PCRE2_UCHAR op = *scode;
9089
9090 switch(op)
9091 {
9092 default:
9093 return 0;
9094
9095 case OP_BRA:
9096 case OP_BRAPOS:
9097 case OP_CBRA:
9098 case OP_SCBRA:
9099 case OP_CBRAPOS:
9100 case OP_SCBRAPOS:
9101 case OP_ASSERT:
9102 case OP_ASSERT_NA:
9103 case OP_ONCE:
9104 case OP_SCRIPT_RUN:
9105 d = find_firstassertedcu(scode, &dflags, inassert +
9106 ((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0));
9107 if (dflags >= REQ_NONE) return 0;
9108 if (cflags >= REQ_NONE) { c = d; cflags = dflags; }
9109 else if (c != d || cflags != dflags) return 0;
9110 break;
9111
9112 case OP_EXACT:
9113 scode += IMM2_SIZE;
9114 /* Fall through */
9115
9116 case OP_CHAR:
9117 case OP_PLUS:
9118 case OP_MINPLUS:
9119 case OP_POSPLUS:
9120 if (inassert == 0) return 0;
9121 if (cflags >= REQ_NONE) { c = scode[1]; cflags = 0; }
9122 else if (c != scode[1]) return 0;
9123 break;
9124
9125 case OP_EXACTI:
9126 scode += IMM2_SIZE;
9127 /* Fall through */
9128
9129 case OP_CHARI:
9130 case OP_PLUSI:
9131 case OP_MINPLUSI:
9132 case OP_POSPLUSI:
9133 if (inassert == 0) return 0;
9134
9135 /* If the character is more than one code unit long, we cannot set its
9136 first code unit when matching caselessly. Later scanning may pick up
9137 multiple code units. */
9138
9139 #ifdef SUPPORT_UNICODE
9140 #if PCRE2_CODE_UNIT_WIDTH == 8
9141 if (scode[1] >= 0x80) return 0;
9142 #elif PCRE2_CODE_UNIT_WIDTH == 16
9143 if (scode[1] >= 0xd800 && scode[1] <= 0xdfff) return 0;
9144 #endif
9145 #endif
9146
9147 if (cflags >= REQ_NONE) { c = scode[1]; cflags = REQ_CASELESS; }
9148 else if (c != scode[1]) return 0;
9149 break;
9150 }
9151
9152 code += GET(code, 1);
9153 }
9154 while (*code == OP_ALT);
9155
9156 *flags = cflags;
9157 return c;
9158 }
9159
9160
9161
9162 /*************************************************
9163 * Add an entry to the name/number table *
9164 *************************************************/
9165
9166 /* This function is called between compiling passes to add an entry to the
9167 name/number table, maintaining alphabetical order. Checking for permitted
9168 and forbidden duplicates has already been done.
9169
9170 Arguments:
9171 cb the compile data block
9172 name the name to add
9173 length the length of the name
9174 groupno the group number
9175 tablecount the count of names in the table so far
9176
9177 Returns: nothing
9178 */
9179
9180 static void
add_name_to_table(compile_block * cb,PCRE2_SPTR name,int length,unsigned int groupno,uint32_t tablecount)9181 add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length,
9182 unsigned int groupno, uint32_t tablecount)
9183 {
9184 uint32_t i;
9185 PCRE2_UCHAR *slot = cb->name_table;
9186
9187 for (i = 0; i < tablecount; i++)
9188 {
9189 int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length));
9190 if (crc == 0 && slot[IMM2_SIZE+length] != 0)
9191 crc = -1; /* Current name is a substring */
9192
9193 /* Make space in the table and break the loop for an earlier name. For a
9194 duplicate or later name, carry on. We do this for duplicates so that in the
9195 simple case (when ?(| is not used) they are in order of their numbers. In all
9196 cases they are in the order in which they appear in the pattern. */
9197
9198 if (crc < 0)
9199 {
9200 (void)memmove(slot + cb->name_entry_size, slot,
9201 CU2BYTES((tablecount - i) * cb->name_entry_size));
9202 break;
9203 }
9204
9205 /* Continue the loop for a later or duplicate name */
9206
9207 slot += cb->name_entry_size;
9208 }
9209
9210 PUT2(slot, 0, groupno);
9211 memcpy(slot + IMM2_SIZE, name, CU2BYTES(length));
9212
9213 /* Add a terminating zero and fill the rest of the slot with zeroes so that
9214 the memory is all initialized. Otherwise valgrind moans about uninitialized
9215 memory when saving serialized compiled patterns. */
9216
9217 memset(slot + IMM2_SIZE + length, 0,
9218 CU2BYTES(cb->name_entry_size - length - IMM2_SIZE));
9219 }
9220
9221
9222
9223 /*************************************************
9224 * Skip in parsed pattern *
9225 *************************************************/
9226
9227 /* This function is called to skip parts of the parsed pattern when finding the
9228 length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
9229 the end of the branch, it is called to skip over an internal lookaround or
9230 (DEFINE) group, and it is also called to skip to the end of a class, during
9231 which it will never encounter nested groups (but there's no need to have
9232 special code for that).
9233
9234 When called to find the end of a branch or group, pptr must point to the first
9235 meta code inside the branch, not the branch-starting code. In other cases it
9236 can point to the item that causes the function to be called.
9237
9238 Arguments:
9239 pptr current pointer to skip from
9240 skiptype PSKIP_CLASS when skipping to end of class
9241 PSKIP_ALT when META_ALT ends the skip
9242 PSKIP_KET when only META_KET ends the skip
9243
9244 Returns: new value of pptr
9245 NULL if META_END is reached - should never occur
9246 or for an unknown meta value - likewise
9247 */
9248
9249 static uint32_t *
parsed_skip(uint32_t * pptr,uint32_t skiptype)9250 parsed_skip(uint32_t *pptr, uint32_t skiptype)
9251 {
9252 uint32_t nestlevel = 0;
9253
9254 for (;; pptr++)
9255 {
9256 uint32_t meta = META_CODE(*pptr);
9257
9258 switch(meta)
9259 {
9260 default: /* Just skip over most items */
9261 if (meta < META_END) continue; /* Literal */
9262 break;
9263
9264 /* This should never occur. */
9265
9266 case META_END:
9267 return NULL;
9268
9269 /* The data for these items is variable in length. */
9270
9271 case META_BACKREF: /* Offset is present only if group >= 10 */
9272 if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;
9273 break;
9274
9275 case META_ESCAPE: /* A few escapes are followed by data items. */
9276 switch (META_DATA(*pptr))
9277 {
9278 case ESC_P:
9279 case ESC_p:
9280 pptr += 1;
9281 break;
9282
9283 case ESC_g:
9284 case ESC_k:
9285 pptr += 1 + SIZEOFFSET;
9286 break;
9287 }
9288 break;
9289
9290 case META_MARK: /* Add the length of the name. */
9291 case META_COMMIT_ARG:
9292 case META_PRUNE_ARG:
9293 case META_SKIP_ARG:
9294 case META_THEN_ARG:
9295 pptr += pptr[1];
9296 break;
9297
9298 /* These are the "active" items in this loop. */
9299
9300 case META_CLASS_END:
9301 if (skiptype == PSKIP_CLASS) return pptr;
9302 break;
9303
9304 case META_ATOMIC:
9305 case META_CAPTURE:
9306 case META_COND_ASSERT:
9307 case META_COND_DEFINE:
9308 case META_COND_NAME:
9309 case META_COND_NUMBER:
9310 case META_COND_RNAME:
9311 case META_COND_RNUMBER:
9312 case META_COND_VERSION:
9313 case META_LOOKAHEAD:
9314 case META_LOOKAHEADNOT:
9315 case META_LOOKAHEAD_NA:
9316 case META_LOOKBEHIND:
9317 case META_LOOKBEHINDNOT:
9318 case META_LOOKBEHIND_NA:
9319 case META_NOCAPTURE:
9320 case META_SCRIPT_RUN:
9321 nestlevel++;
9322 break;
9323
9324 case META_ALT:
9325 if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;
9326 break;
9327
9328 case META_KET:
9329 if (nestlevel == 0) return pptr;
9330 nestlevel--;
9331 break;
9332 }
9333
9334 /* The extra data item length for each meta is in a table. */
9335
9336 meta = (meta >> 16) & 0x7fff;
9337 if (meta >= sizeof(meta_extra_lengths)) return NULL;
9338 pptr += meta_extra_lengths[meta];
9339 }
9340 /* Control never reaches here */
9341 return pptr;
9342 }
9343
9344
9345
9346 /*************************************************
9347 * Find length of a parsed group *
9348 *************************************************/
9349
9350 /* This is called for nested groups within a branch of a lookbehind whose
9351 length is being computed. On entry, the pointer must be at the first element
9352 after the group initializing code. On exit it points to OP_KET. Caching is used
9353 to improve processing speed when the same capturing group occurs many times.
9354
9355 Arguments:
9356 pptrptr pointer to pointer in the parsed pattern
9357 minptr where to return the minimum length
9358 isinline FALSE if a reference or recursion; TRUE for inline group
9359 errcodeptr pointer to the errorcode
9360 lcptr pointer to the loop counter
9361 group number of captured group or -1 for a non-capturing group
9362 recurses chain of recurse_check to catch mutual recursion
9363 cb pointer to the compile data
9364
9365 Returns: the maximum group length or a negative number
9366 */
9367
9368 static int
get_grouplength(uint32_t ** pptrptr,int * minptr,BOOL isinline,int * errcodeptr,int * lcptr,int group,parsed_recurse_check * recurses,compile_block * cb)9369 get_grouplength(uint32_t **pptrptr, int *minptr, BOOL isinline, int *errcodeptr,
9370 int *lcptr, int group, parsed_recurse_check *recurses, compile_block *cb)
9371 {
9372 uint32_t *gi = cb->groupinfo + 2 * group;
9373 int branchlength, branchminlength;
9374 int grouplength = -1;
9375 int groupminlength = INT_MAX;
9376
9377 /* The cache can be used only if there is no possibility of there being two
9378 groups with the same number. We do not need to set the end pointer for a group
9379 that is being processed as a back reference or recursion, but we must do so for
9380 an inline group. */
9381
9382 if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
9383 {
9384 uint32_t groupinfo = gi[0];
9385 if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1;
9386 if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
9387 {
9388 if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
9389 *minptr = gi[1];
9390 return groupinfo & GI_FIXED_LENGTH_MASK;
9391 }
9392 }
9393
9394 /* Scan the group. In this case we find the end pointer of necessity. */
9395
9396 for(;;)
9397 {
9398 branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
9399 recurses, cb);
9400 if (branchlength < 0) goto ISNOTFIXED;
9401 if (branchlength > grouplength) grouplength = branchlength;
9402 if (branchminlength < groupminlength) groupminlength = branchminlength;
9403 if (**pptrptr == META_KET) break;
9404 *pptrptr += 1; /* Skip META_ALT */
9405 }
9406
9407 if (group > 0)
9408 {
9409 gi[0] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
9410 gi[1] = groupminlength;
9411 }
9412
9413 *minptr = groupminlength;
9414 return grouplength;
9415
9416 ISNOTFIXED:
9417 if (group > 0) gi[0] |= GI_NOT_FIXED_LENGTH;
9418 return -1;
9419 }
9420
9421
9422
9423 /*************************************************
9424 * Find length of a parsed branch *
9425 *************************************************/
9426
9427 /* Return fixed maximum and minimum lengths for a branch in a lookbehind,
9428 giving an error if the length is not limited. On entry, *pptrptr points to the
9429 first element inside the branch. On exit it is set to point to the ALT or KET.
9430
9431 Arguments:
9432 pptrptr pointer to pointer in the parsed pattern
9433 minptr where to return the minimum length
9434 errcodeptr pointer to error code
9435 lcptr pointer to loop counter
9436 recurses chain of recurse_check to catch mutual recursion
9437 cb pointer to compile block
9438
9439 Returns: the maximum length, or a negative value on error
9440 */
9441
9442 static int
get_branchlength(uint32_t ** pptrptr,int * minptr,int * errcodeptr,int * lcptr,parsed_recurse_check * recurses,compile_block * cb)9443 get_branchlength(uint32_t **pptrptr, int *minptr, int *errcodeptr, int *lcptr,
9444 parsed_recurse_check *recurses, compile_block *cb)
9445 {
9446 int branchlength = 0;
9447 int branchminlength = 0;
9448 int grouplength, groupminlength;
9449 uint32_t lastitemlength = 0;
9450 uint32_t lastitemminlength = 0;
9451 uint32_t *pptr = *pptrptr;
9452 PCRE2_SIZE offset;
9453 parsed_recurse_check this_recurse;
9454
9455 /* A large and/or complex regex can take too long to process. This can happen
9456 more often when (?| groups are present in the pattern because their length
9457 cannot be cached. */
9458
9459 if ((*lcptr)++ > 2000)
9460 {
9461 *errcodeptr = ERR35; /* Lookbehind is too complicated */
9462 return -1;
9463 }
9464
9465 /* Scan the branch, accumulating the length. */
9466
9467 for (;; pptr++)
9468 {
9469 parsed_recurse_check *r;
9470 uint32_t *gptr, *gptrend;
9471 uint32_t escape;
9472 uint32_t group = 0;
9473 uint32_t itemlength = 0;
9474 uint32_t itemminlength = 0;
9475 uint32_t min, max;
9476
9477 if (*pptr < META_END)
9478 {
9479 itemlength = itemminlength = 1;
9480 }
9481
9482 else switch (META_CODE(*pptr))
9483 {
9484 case META_KET:
9485 case META_ALT:
9486 goto EXIT;
9487
9488 /* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the
9489 actual termination. */
9490
9491 case META_ACCEPT:
9492 case META_FAIL:
9493 pptr = parsed_skip(pptr, PSKIP_ALT);
9494 if (pptr == NULL) goto PARSED_SKIP_FAILED;
9495 goto EXIT;
9496
9497 case META_MARK:
9498 case META_COMMIT_ARG:
9499 case META_PRUNE_ARG:
9500 case META_SKIP_ARG:
9501 case META_THEN_ARG:
9502 pptr += pptr[1] + 1;
9503 break;
9504
9505 case META_CIRCUMFLEX:
9506 case META_COMMIT:
9507 case META_DOLLAR:
9508 case META_PRUNE:
9509 case META_SKIP:
9510 case META_THEN:
9511 break;
9512
9513 case META_OPTIONS:
9514 pptr += 2;
9515 break;
9516
9517 case META_BIGVALUE:
9518 itemlength = itemminlength = 1;
9519 pptr += 1;
9520 break;
9521
9522 case META_CLASS:
9523 case META_CLASS_NOT:
9524 itemlength = itemminlength = 1;
9525 pptr = parsed_skip(pptr, PSKIP_CLASS);
9526 if (pptr == NULL) goto PARSED_SKIP_FAILED;
9527 break;
9528
9529 case META_CLASS_EMPTY_NOT:
9530 case META_DOT:
9531 itemlength = itemminlength = 1;
9532 break;
9533
9534 case META_CALLOUT_NUMBER:
9535 pptr += 3;
9536 break;
9537
9538 case META_CALLOUT_STRING:
9539 pptr += 3 + SIZEOFFSET;
9540 break;
9541
9542 /* Only some escapes consume a character. Of those, \R can match one or two
9543 characters, but \X is never allowed because it matches an unknown number of
9544 characters. \C is allowed only in 32-bit and non-UTF 8/16-bit modes. */
9545
9546 case META_ESCAPE:
9547 escape = META_DATA(*pptr);
9548 if (escape == ESC_X) return -1;
9549 if (escape == ESC_R)
9550 {
9551 itemminlength = 1;
9552 itemlength = 2;
9553 }
9554 else if (escape > ESC_b && escape < ESC_Z)
9555 {
9556 #if PCRE2_CODE_UNIT_WIDTH != 32
9557 if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C)
9558 {
9559 *errcodeptr = ERR36;
9560 return -1;
9561 }
9562 #endif
9563 itemlength = itemminlength = 1;
9564 if (escape == ESC_p || escape == ESC_P) pptr++; /* Skip prop data */
9565 }
9566 break;
9567
9568 /* Lookaheads do not contribute to the length of this branch, but they may
9569 contain lookbehinds within them whose lengths need to be set. */
9570
9571 case META_LOOKAHEAD:
9572 case META_LOOKAHEADNOT:
9573 case META_LOOKAHEAD_NA:
9574 *errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb, lcptr);
9575 if (*errcodeptr != 0) return -1;
9576
9577 /* Ignore any qualifiers that follow a lookahead assertion. */
9578
9579 switch (pptr[1])
9580 {
9581 case META_ASTERISK:
9582 case META_ASTERISK_PLUS:
9583 case META_ASTERISK_QUERY:
9584 case META_PLUS:
9585 case META_PLUS_PLUS:
9586 case META_PLUS_QUERY:
9587 case META_QUERY:
9588 case META_QUERY_PLUS:
9589 case META_QUERY_QUERY:
9590 pptr++;
9591 break;
9592
9593 case META_MINMAX:
9594 case META_MINMAX_PLUS:
9595 case META_MINMAX_QUERY:
9596 pptr += 3;
9597 break;
9598
9599 default:
9600 break;
9601 }
9602 break;
9603
9604 /* A nested lookbehind does not contribute any length to this lookbehind,
9605 but must itself be checked and have its lengths set. */
9606
9607 case META_LOOKBEHIND:
9608 case META_LOOKBEHINDNOT:
9609 case META_LOOKBEHIND_NA:
9610 if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))
9611 return -1;
9612 break;
9613
9614 /* Back references and recursions are handled by very similar code. At this
9615 stage, the names generated in the parsing pass are available, but the main
9616 name table has not yet been created. So for the named varieties, scan the
9617 list of names in order to get the number of the first one in the pattern,
9618 and whether or not this name is duplicated. */
9619
9620 case META_BACKREF_BYNAME:
9621 if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)
9622 goto ISNOTFIXED;
9623 /* Fall through */
9624
9625 case META_RECURSE_BYNAME:
9626 {
9627 int i;
9628 PCRE2_SPTR name;
9629 BOOL is_dupname = FALSE;
9630 named_group *ng = cb->named_groups;
9631 uint32_t meta_code = META_CODE(*pptr);
9632 uint32_t length = *(++pptr);
9633
9634 GETPLUSOFFSET(offset, pptr);
9635 name = cb->start_pattern + offset;
9636 for (i = 0; i < cb->names_found; i++, ng++)
9637 {
9638 if (length == ng->length && PRIV(strncmp)(name, ng->name, length) == 0)
9639 {
9640 group = ng->number;
9641 is_dupname = ng->isdup;
9642 break;
9643 }
9644 }
9645
9646 if (group == 0)
9647 {
9648 *errcodeptr = ERR15; /* Non-existent subpattern */
9649 cb->erroroffset = offset;
9650 return -1;
9651 }
9652
9653 /* A numerical back reference can be fixed length if duplicate capturing
9654 groups are not being used. A non-duplicate named back reference can also
9655 be handled. */
9656
9657 if (meta_code == META_RECURSE_BYNAME ||
9658 (!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0))
9659 goto RECURSE_OR_BACKREF_LENGTH; /* Handle as a numbered version. */
9660 }
9661 goto ISNOTFIXED; /* Duplicate name or number */
9662
9663 /* The offset values for back references < 10 are in a separate vector
9664 because otherwise they would use more than two parsed pattern elements on
9665 64-bit systems. */
9666
9667 case META_BACKREF:
9668 if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 ||
9669 (cb->external_flags & PCRE2_DUPCAPUSED) != 0)
9670 goto ISNOTFIXED;
9671 group = META_DATA(*pptr);
9672 if (group < 10)
9673 {
9674 offset = cb->small_ref_offset[group];
9675 goto RECURSE_OR_BACKREF_LENGTH;
9676 }
9677
9678 /* Fall through */
9679 /* For groups >= 10 - picking up group twice does no harm. */
9680
9681 /* A true recursion implies not fixed length, but a subroutine call may
9682 be OK. Back reference "recursions" are also failed. */
9683
9684 case META_RECURSE:
9685 group = META_DATA(*pptr);
9686 GETPLUSOFFSET(offset, pptr);
9687
9688 RECURSE_OR_BACKREF_LENGTH:
9689 if (group > cb->bracount)
9690 {
9691 cb->erroroffset = offset;
9692 *errcodeptr = ERR15; /* Non-existent subpattern */
9693 return -1;
9694 }
9695 if (group == 0) goto ISNOTFIXED; /* Local recursion */
9696 for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)
9697 {
9698 if (META_CODE(*gptr) == META_BIGVALUE) gptr++;
9699 else if (*gptr == (META_CAPTURE | group)) break;
9700 }
9701
9702 /* We must start the search for the end of the group at the first meta code
9703 inside the group. Otherwise it will be treated as an enclosed group. */
9704
9705 gptrend = parsed_skip(gptr + 1, PSKIP_KET);
9706 if (gptrend == NULL) goto PARSED_SKIP_FAILED;
9707 if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED; /* Local recursion */
9708 for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;
9709 if (r != NULL) goto ISNOTFIXED; /* Mutual recursion */
9710 this_recurse.prev = recurses;
9711 this_recurse.groupptr = gptr;
9712
9713 /* We do not need to know the position of the end of the group, that is,
9714 gptr is not used after the call to get_grouplength(). Setting the second
9715 argument FALSE stops it scanning for the end when the length can be found
9716 in the cache. */
9717
9718 gptr++;
9719 grouplength = get_grouplength(&gptr, &groupminlength, FALSE, errcodeptr,
9720 lcptr, group, &this_recurse, cb);
9721 if (grouplength < 0)
9722 {
9723 if (*errcodeptr == 0) goto ISNOTFIXED;
9724 return -1; /* Error already set */
9725 }
9726 itemlength = grouplength;
9727 itemminlength = groupminlength;
9728 break;
9729
9730 /* A (DEFINE) group is never obeyed inline and so it does not contribute to
9731 the length of this branch. Skip from the following item to the next
9732 unpaired ket. */
9733
9734 case META_COND_DEFINE:
9735 pptr = parsed_skip(pptr + 1, PSKIP_KET);
9736 break;
9737
9738 /* Check other nested groups - advance past the initial data for each type
9739 and then seek a fixed length with get_grouplength(). */
9740
9741 case META_COND_NAME:
9742 case META_COND_NUMBER:
9743 case META_COND_RNAME:
9744 case META_COND_RNUMBER:
9745 pptr += 2 + SIZEOFFSET;
9746 goto CHECK_GROUP;
9747
9748 case META_COND_ASSERT:
9749 pptr += 1;
9750 goto CHECK_GROUP;
9751
9752 case META_COND_VERSION:
9753 pptr += 4;
9754 goto CHECK_GROUP;
9755
9756 case META_CAPTURE:
9757 group = META_DATA(*pptr);
9758 /* Fall through */
9759
9760 case META_ATOMIC:
9761 case META_NOCAPTURE:
9762 case META_SCRIPT_RUN:
9763 pptr++;
9764 CHECK_GROUP:
9765 grouplength = get_grouplength(&pptr, &groupminlength, TRUE, errcodeptr,
9766 lcptr, group, recurses, cb);
9767 if (grouplength < 0) return -1;
9768 itemlength = grouplength;
9769 itemminlength = groupminlength;
9770 break;
9771
9772 case META_QUERY:
9773 case META_QUERY_PLUS:
9774 case META_QUERY_QUERY:
9775 min = 0;
9776 max = 1;
9777 goto REPETITION;
9778
9779 /* Exact repetition is OK; variable repetition is not. A repetition of zero
9780 must subtract the length that has already been added. */
9781
9782 case META_MINMAX:
9783 case META_MINMAX_PLUS:
9784 case META_MINMAX_QUERY:
9785 min = pptr[1];
9786 max = pptr[2];
9787 pptr += 2;
9788
9789 REPETITION:
9790 if (max != REPEAT_UNLIMITED)
9791 {
9792 if (lastitemlength != 0 && /* Should not occur, but just in case */
9793 max != 0 &&
9794 (INT_MAX - branchlength)/lastitemlength < max - 1)
9795 {
9796 *errcodeptr = ERR87; /* Integer overflow; lookbehind too big */
9797 return -1;
9798 }
9799 if (min == 0) branchminlength -= lastitemminlength;
9800 else itemminlength = (min - 1) * lastitemminlength;
9801 if (max == 0) branchlength -= lastitemlength;
9802 else itemlength = (max - 1) * lastitemlength;
9803 break;
9804 }
9805 /* Fall through */
9806
9807 /* Any other item means this branch does not have a fixed length. */
9808
9809 default:
9810 ISNOTFIXED:
9811 *errcodeptr = ERR25; /* Not fixed length */
9812 return -1;
9813 }
9814
9815 /* Add the item length to the branchlength, checking for integer overflow and
9816 for the branch length exceeding the overall limit. Later, if there is at
9817 least one variable-length branch in the group, there is a test for the
9818 (smaller) variable-length branch length limit. */
9819
9820 if (INT_MAX - branchlength < (int)itemlength ||
9821 (branchlength += itemlength) > LOOKBEHIND_MAX)
9822 {
9823 *errcodeptr = ERR87;
9824 return -1;
9825 }
9826
9827 branchminlength += itemminlength;
9828
9829 /* Save this item length for use if the next item is a quantifier. */
9830
9831 lastitemlength = itemlength;
9832 lastitemminlength = itemminlength;
9833 }
9834
9835 EXIT:
9836 *pptrptr = pptr;
9837 *minptr = branchminlength;
9838 return branchlength;
9839
9840 PARSED_SKIP_FAILED:
9841 *errcodeptr = ERR90;
9842 return -1;
9843 }
9844
9845
9846
9847 /*************************************************
9848 * Set lengths in a lookbehind *
9849 *************************************************/
9850
9851 /* This function is called for each lookbehind, to set the lengths in its
9852 branches. An error occurs if any branch does not have a limited maximum length
9853 that is less than the limit (65535). On exit, the pointer must be left on the
9854 final ket.
9855
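The function also maintains the max_lookbehind value, raising it whenever the
maximum length of a branch is greater than the current value. Any lookbehind
branch that contains a nested lookbehind may actually look further back than
the length of the branch. A nested lookbehind is processed by its own call of
this function (made from get_branchlength()), which updates max_lookbehind
from its own branch lengths; no separate "extra" value is passed back.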
9860
9861 Arguments:
9862 pptrptr pointer to pointer in the parsed pattern
9863 errcodeptr pointer to error code
9864 lcptr pointer to loop counter
9865 recurses chain of recurse_check to catch mutual recursion
9866 cb pointer to compile block
9867
9868 Returns: TRUE if all is well
9869 FALSE otherwise, with error code and offset set
9870 */
9871
9872 static BOOL
set_lookbehind_lengths(uint32_t ** pptrptr,int * errcodeptr,int * lcptr,parsed_recurse_check * recurses,compile_block * cb)9873 set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
9874 parsed_recurse_check *recurses, compile_block *cb)
9875 {
9876 PCRE2_SIZE offset;
9877 uint32_t *bptr = *pptrptr;
9878 uint32_t *gbptr = bptr;
9879 int maxlength = 0;
9880 int minlength = INT_MAX;
9881 BOOL variable = FALSE;
9882
9883 READPLUSOFFSET(offset, bptr); /* Offset for error messages */
9884 *pptrptr += SIZEOFFSET;
9885
9886 /* Each branch can have a different maximum length, but we can keep only a
9887 single minimum for the whole group, because there's nowhere to save individual
9888 values in the META_ALT item. */
9889
9890 do
9891 {
9892 int branchlength, branchminlength;
9893
9894 *pptrptr += 1;
9895 branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
9896 recurses, cb);
9897
9898 if (branchlength < 0)
9899 {
9900 /* The errorcode and offset may already be set from a nested lookbehind. */
9901 if (*errcodeptr == 0) *errcodeptr = ERR25;
9902 if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
9903 return FALSE;
9904 }
9905
9906 if (branchlength != branchminlength) variable = TRUE;
9907 if (branchminlength < minlength) minlength = branchminlength;
9908 if (branchlength > maxlength) maxlength = branchlength;
9909 if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;
9910 *bptr |= branchlength; /* branchlength never more than 65535 */
9911 bptr = *pptrptr;
9912 }
9913 while (META_CODE(*bptr) == META_ALT);
9914
9915 /* If any branch is of variable length, the whole lookbehind is of variable
9916 length. If the maximum length of any branch exceeds the maximum for variable
9917 lookbehinds, give an error. Otherwise, the minimum length is set in the word
9918 that follows the original group META value. For a fixed-length lookbehind, this
9919 is set to LOOKBEHIND_MAX, to indicate that each branch is of a fixed (but
9920 possibly different) length. */
9921
9922 if (variable)
9923 {
9924 gbptr[1] = minlength;
9925 if ((uint32_t)maxlength > cb->max_varlookbehind)
9926 {
9927 *errcodeptr = ERR100;
9928 cb->erroroffset = offset;
9929 return FALSE;
9930 }
9931 }
9932 else gbptr[1] = LOOKBEHIND_MAX;
9933
9934
9935 gbptr[1] = variable? minlength : LOOKBEHIND_MAX;
9936 return TRUE;
9937 }
9938
9939
9940
9941 /*************************************************
9942 * Check parsed pattern lookbehinds *
9943 *************************************************/
9944
9945 /* This function is called at the end of parsing a pattern if any lookbehinds
9946 were encountered. It scans the parsed pattern for them, calling
9947 set_lookbehind_lengths() for each one. At the start, the errorcode is zero and
the error offset is marked unset. This enables the functions above not to
9949 override settings from deeper nestings.
9950
9951 This function is called recursively from get_branchlength() for lookaheads in
9952 order to process any lookbehinds that they may contain. It stops when it hits a
9953 non-nested closing parenthesis in this case, returning a pointer to it.
9954
9955 Arguments
9956 pptr points to where to start (start of pattern or start of lookahead)
9957 retptr if not NULL, return the ket pointer here
9958 recurses chain of recurse_check to catch mutual recursion
9959 cb points to the compile block
9960 lcptr points to loop counter
9961
9962 Returns: 0 on success, or an errorcode (cb->erroroffset will be set)
9963 */
9964
9965 static int
check_lookbehinds(uint32_t * pptr,uint32_t ** retptr,parsed_recurse_check * recurses,compile_block * cb,int * lcptr)9966 check_lookbehinds(uint32_t *pptr, uint32_t **retptr,
9967 parsed_recurse_check *recurses, compile_block *cb, int *lcptr)
9968 {
9969 int errorcode = 0;
9970 int nestlevel = 0;
9971
9972 cb->erroroffset = PCRE2_UNSET;
9973
9974 for (; *pptr != META_END; pptr++)
9975 {
9976 if (*pptr < META_END) continue; /* Literal */
9977
9978 switch (META_CODE(*pptr))
9979 {
9980 default:
9981 return ERR70; /* Unrecognized meta code */
9982
9983 case META_ESCAPE:
9984 if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
9985 pptr += 1;
9986 break;
9987
9988 case META_KET:
9989 if (--nestlevel < 0)
9990 {
9991 if (retptr != NULL) *retptr = pptr;
9992 return 0;
9993 }
9994 break;
9995
9996 case META_ATOMIC:
9997 case META_CAPTURE:
9998 case META_COND_ASSERT:
9999 case META_LOOKAHEAD:
10000 case META_LOOKAHEADNOT:
10001 case META_LOOKAHEAD_NA:
10002 case META_NOCAPTURE:
10003 case META_SCRIPT_RUN:
10004 nestlevel++;
10005 break;
10006
10007 case META_ACCEPT:
10008 case META_ALT:
10009 case META_ASTERISK:
10010 case META_ASTERISK_PLUS:
10011 case META_ASTERISK_QUERY:
10012 case META_BACKREF:
10013 case META_CIRCUMFLEX:
10014 case META_CLASS:
10015 case META_CLASS_EMPTY:
10016 case META_CLASS_EMPTY_NOT:
10017 case META_CLASS_END:
10018 case META_CLASS_NOT:
10019 case META_COMMIT:
10020 case META_DOLLAR:
10021 case META_DOT:
10022 case META_FAIL:
10023 case META_PLUS:
10024 case META_PLUS_PLUS:
10025 case META_PLUS_QUERY:
10026 case META_PRUNE:
10027 case META_QUERY:
10028 case META_QUERY_PLUS:
10029 case META_QUERY_QUERY:
10030 case META_RANGE_ESCAPED:
10031 case META_RANGE_LITERAL:
10032 case META_SKIP:
10033 case META_THEN:
10034 break;
10035
10036 case META_RECURSE:
10037 pptr += SIZEOFFSET;
10038 break;
10039
10040 case META_BACKREF_BYNAME:
10041 case META_RECURSE_BYNAME:
10042 pptr += 1 + SIZEOFFSET;
10043 break;
10044
10045 case META_COND_DEFINE:
10046 pptr += SIZEOFFSET;
10047 nestlevel++;
10048 break;
10049
10050 case META_COND_NAME:
10051 case META_COND_NUMBER:
10052 case META_COND_RNAME:
10053 case META_COND_RNUMBER:
10054 pptr += 1 + SIZEOFFSET;
10055 nestlevel++;
10056 break;
10057
10058 case META_COND_VERSION:
10059 pptr += 3;
10060 nestlevel++;
10061 break;
10062
10063 case META_CALLOUT_STRING:
10064 pptr += 3 + SIZEOFFSET;
10065 break;
10066
10067 case META_BIGVALUE:
10068 case META_POSIX:
10069 case META_POSIX_NEG:
10070 pptr += 1;
10071 break;
10072
10073 case META_MINMAX:
10074 case META_MINMAX_QUERY:
10075 case META_MINMAX_PLUS:
10076 case META_OPTIONS:
10077 pptr += 2;
10078 break;
10079
10080 case META_CALLOUT_NUMBER:
10081 pptr += 3;
10082 break;
10083
10084 case META_MARK:
10085 case META_COMMIT_ARG:
10086 case META_PRUNE_ARG:
10087 case META_SKIP_ARG:
10088 case META_THEN_ARG:
10089 pptr += 1 + pptr[1];
10090 break;
10091
10092 case META_LOOKBEHIND:
10093 case META_LOOKBEHINDNOT:
10094 case META_LOOKBEHIND_NA:
10095 if (!set_lookbehind_lengths(&pptr, &errorcode, lcptr, recurses, cb))
10096 return errorcode;
10097 break;
10098 }
10099 }
10100
10101 return 0;
10102 }
10103
10104
10105
10106 /*************************************************
10107 * External function to compile a pattern *
10108 *************************************************/
10109
10110 /* This function reads a regular expression in the form of a string and returns
10111 a pointer to a block of store holding a compiled version of the expression.
10112
10113 Arguments:
10114 pattern the regular expression
10115 patlen the length of the pattern, or PCRE2_ZERO_TERMINATED
10116 options option bits
10117 errorptr pointer to errorcode
10118 erroroffset pointer to error offset
10119 ccontext points to a compile context or is NULL
10120
10121 Returns: pointer to compiled data block, or NULL on error,
10122 with errorcode and erroroffset set
10123 */
10124
10125 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_compile(PCRE2_SPTR pattern,PCRE2_SIZE patlen,uint32_t options,int * errorptr,PCRE2_SIZE * erroroffset,pcre2_compile_context * ccontext)10126 pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
10127 int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
10128 {
10129 BOOL utf; /* Set TRUE for UTF mode */
10130 BOOL ucp; /* Set TRUE for UCP mode */
10131 BOOL has_lookbehind = FALSE; /* Set TRUE if a lookbehind is found */
10132 BOOL zero_terminated; /* Set TRUE for zero-terminated pattern */
10133 pcre2_real_code *re = NULL; /* What we will return */
10134 compile_block cb; /* "Static" compile-time data */
10135 const uint8_t *tables; /* Char tables base pointer */
10136
10137 PCRE2_UCHAR *code; /* Current pointer in compiled code */
10138 PCRE2_SPTR codestart; /* Start of compiled code */
10139 PCRE2_SPTR ptr; /* Current pointer in pattern */
10140 uint32_t *pptr; /* Current pointer in parsed pattern */
10141
10142 PCRE2_SIZE length = 1; /* Allow for final END opcode */
10143 PCRE2_SIZE usedlength; /* Actual length used */
10144 PCRE2_SIZE re_blocksize; /* Size of memory block */
10145 PCRE2_SIZE big32count = 0; /* 32-bit literals >= 0x80000000 */
10146 PCRE2_SIZE parsed_size_needed; /* Needed for parsed pattern */
10147
10148 uint32_t firstcuflags, reqcuflags; /* Type of first/req code unit */
10149 uint32_t firstcu, reqcu; /* Value of first/req code unit */
10150 uint32_t setflags = 0; /* NL and BSR set flags */
10151
10152 uint32_t skipatstart; /* When checking (*UTF) etc */
10153 uint32_t limit_heap = UINT32_MAX;
10154 uint32_t limit_match = UINT32_MAX; /* Unset match limits */
10155 uint32_t limit_depth = UINT32_MAX;
10156
10157 int newline = 0; /* Unset; can be set by the pattern */
10158 int bsr = 0; /* Unset; can be set by the pattern */
10159 int errorcode = 0; /* Initialize to avoid compiler warn */
10160 int regexrc; /* Return from compile */
10161
10162 uint32_t i; /* Local loop counter */
10163
10164 /* Comments at the head of this file explain about these variables. */
10165
10166 uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];
10167 uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];
10168 named_group named_groups[NAMED_GROUP_LIST_SIZE];
10169
10170 /* The workspace is used in different ways in the different compiling phases.
10171 It needs to be 16-bit aligned for the preliminary parsing scan. */
10172
10173 uint32_t c16workspace[C16_WORK_SIZE];
10174 PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace;
10175
10176
10177 /* -------------- Check arguments and set up the pattern ----------------- */
10178
10179 /* There must be error code and offset pointers. */
10180
10181 if (errorptr == NULL || erroroffset == NULL) return NULL;
10182 *errorptr = ERR0;
10183 *erroroffset = 0;
10184
10185 /* There must be a pattern, but NULL is allowed with zero length. */
10186
10187 if (pattern == NULL)
10188 {
10189 if (patlen == 0) pattern = (PCRE2_SPTR)""; else
10190 {
10191 *errorptr = ERR16;
10192 return NULL;
10193 }
10194 }
10195
10196 /* A NULL compile context means "use a default context" */
10197
10198 if (ccontext == NULL)
10199 ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
10200
10201 /* PCRE2_MATCH_INVALID_UTF implies UTF */
10202
10203 if ((options & PCRE2_MATCH_INVALID_UTF) != 0) options |= PCRE2_UTF;
10204
10205 /* Check that all undefined public option bits are zero. */
10206
10207 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 ||
10208 (ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)
10209 {
10210 *errorptr = ERR17;
10211 return NULL;
10212 }
10213
10214 if ((options & PCRE2_LITERAL) != 0 &&
10215 ((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 ||
10216 (ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))
10217 {
10218 *errorptr = ERR92;
10219 return NULL;
10220 }
10221
10222 /* A zero-terminated pattern is indicated by the special length value
10223 PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */
10224
10225 if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))
10226 patlen = PRIV(strlen)(pattern);
10227
10228 if (patlen > ccontext->max_pattern_length)
10229 {
10230 *errorptr = ERR88;
10231 return NULL;
10232 }
10233
10234 /* From here on, all returns from this function should end up going via the
10235 EXIT label. */
10236
10237
10238 /* ------------ Initialize the "static" compile data -------------- */
10239
10240 tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
10241
10242 cb.lcc = tables + lcc_offset; /* Individual */
10243 cb.fcc = tables + fcc_offset; /* character */
10244 cb.cbits = tables + cbits_offset; /* tables */
10245 cb.ctypes = tables + ctypes_offset;
10246
10247 cb.assert_depth = 0;
10248 cb.bracount = 0;
10249 cb.cx = ccontext;
10250 cb.dupnames = FALSE;
10251 cb.end_pattern = pattern + patlen;
10252 cb.erroroffset = 0;
10253 cb.external_flags = 0;
10254 cb.external_options = options;
10255 cb.groupinfo = stack_groupinfo;
10256 cb.had_recurse = FALSE;
10257 cb.lastcapture = 0;
10258 cb.max_lookbehind = 0; /* Max encountered */
10259 cb.max_varlookbehind = ccontext->max_varlookbehind; /* Limit */
10260 cb.name_entry_size = 0;
10261 cb.name_table = NULL;
10262 cb.named_groups = named_groups;
10263 cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
10264 cb.names_found = 0;
10265 cb.parens_depth = 0;
10266 cb.parsed_pattern = stack_parsed_pattern;
10267 cb.req_varyopt = 0;
10268 cb.start_code = cworkspace;
10269 cb.start_pattern = pattern;
10270 cb.start_workspace = cworkspace;
10271 cb.workspace_size = COMPILE_WORK_SIZE;
10272
10273 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
10274 references to help in deciding whether (.*) can be treated as anchored or not.
10275 */
10276
10277 cb.top_backref = 0;
10278 cb.backref_map = 0;
10279
10280 /* Escape sequences \1 to \9 are always back references, but as they are only
10281 two characters long, only two elements can be used in the parsed_pattern
10282 vector. The first contains the reference, and we'd like to use the second to
10283 record the offset in the pattern, so that forward references to non-existent
10284 groups can be diagnosed later with an offset. However, on 64-bit systems,
10285 PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first
10286 occurrence of \1 to \9, indexed by the second parsed_pattern value. All other
10287 references have enough space for the offset to be put into the parsed pattern.
10288 */
10289
10290 for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET;
10291
10292
10293 /* --------------- Start looking at the pattern --------------- */
10294
10295 /* Unless PCRE2_LITERAL is set, check for global one-time option settings at
10296 the start of the pattern, and remember the offset to the actual regex. With
10297 valgrind support, make the terminator of a zero-terminated pattern
10298 inaccessible. This catches bugs that would otherwise only show up for
10299 non-zero-terminated patterns. */
10300
10301 #ifdef SUPPORT_VALGRIND
10302 if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1));
10303 #endif
10304
10305 ptr = pattern;
10306 skipatstart = 0;
10307
10308 if ((options & PCRE2_LITERAL) == 0)
10309 {
10310 while (patlen - skipatstart >= 2 &&
10311 ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
10312 ptr[skipatstart+1] == CHAR_ASTERISK)
10313 {
10314 for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
10315 {
10316 uint32_t c, pp;
10317 const pso *p = pso_list + i;
10318
10319 if (patlen - skipatstart - 2 >= p->length &&
10320 PRIV(strncmp_c8)(ptr + skipatstart + 2, (char *)(p->name),
10321 p->length) == 0)
10322 {
10323 skipatstart += p->length + 2;
10324 switch(p->type)
10325 {
10326 case PSO_OPT:
10327 cb.external_options |= p->value;
10328 break;
10329
10330 case PSO_FLG:
10331 setflags |= p->value;
10332 break;
10333
10334 case PSO_NL:
10335 newline = p->value;
10336 setflags |= PCRE2_NL_SET;
10337 break;
10338
10339 case PSO_BSR:
10340 bsr = p->value;
10341 setflags |= PCRE2_BSR_SET;
10342 break;
10343
10344 case PSO_LIMM:
10345 case PSO_LIMD:
10346 case PSO_LIMH:
10347 c = 0;
10348 pp = skipatstart;
10349 if (!IS_DIGIT(ptr[pp]))
10350 {
10351 errorcode = ERR60;
10352 ptr += pp;
10353 goto HAD_EARLY_ERROR;
10354 }
10355 while (IS_DIGIT(ptr[pp]))
10356 {
10357 if (c > UINT32_MAX / 10 - 1) break; /* Integer overflow */
10358 c = c*10 + (ptr[pp++] - CHAR_0);
10359 }
10360 if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS)
10361 {
10362 errorcode = ERR60;
10363 ptr += pp;
10364 goto HAD_EARLY_ERROR;
10365 }
10366 if (p->type == PSO_LIMH) limit_heap = c;
10367 else if (p->type == PSO_LIMM) limit_match = c;
10368 else limit_depth = c;
10369 skipatstart += pp - skipatstart;
10370 break;
10371 }
10372 break; /* Out of the table scan loop */
10373 }
10374 }
10375 if (i >= sizeof(pso_list)/sizeof(pso)) break; /* Out of pso loop */
10376 }
10377 }
10378
10379 /* End of pattern-start options; advance to start of real regex. */
10380
10381 ptr += skipatstart;
10382
10383 /* Can't support UTF or UCP if PCRE2 was built without Unicode support. */
10384
10385 #ifndef SUPPORT_UNICODE
10386 if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
10387 {
10388 errorcode = ERR32;
10389 goto HAD_EARLY_ERROR;
10390 }
10391 #endif
10392
10393 /* Check UTF. We have the original options in 'options', with that value as
10394 modified by (*UTF) etc in cb->external_options. The extra option
10395 PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
10396 surrogate code points cannot be represented in UTF-16. */
10397
10398 utf = (cb.external_options & PCRE2_UTF) != 0;
10399 if (utf)
10400 {
10401 if ((options & PCRE2_NEVER_UTF) != 0)
10402 {
10403 errorcode = ERR74;
10404 goto HAD_EARLY_ERROR;
10405 }
10406 if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
10407 (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
10408 goto HAD_ERROR; /* Offset was set by valid_utf() */
10409
10410 #if PCRE2_CODE_UNIT_WIDTH == 16
10411 if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
10412 {
10413 errorcode = ERR91;
10414 goto HAD_EARLY_ERROR;
10415 }
10416 #endif
10417 }
10418
10419 /* Check UCP lockout. */
10420
10421 ucp = (cb.external_options & PCRE2_UCP) != 0;
10422 if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
10423 {
10424 errorcode = ERR75;
10425 goto HAD_EARLY_ERROR;
10426 }
10427
10428 /* Process the BSR setting. */
10429
10430 if (bsr == 0) bsr = ccontext->bsr_convention;
10431
10432 /* Process the newline setting. */
10433
10434 if (newline == 0) newline = ccontext->newline_convention;
10435 cb.nltype = NLTYPE_FIXED;
10436 switch(newline)
10437 {
10438 case PCRE2_NEWLINE_CR:
10439 cb.nllen = 1;
10440 cb.nl[0] = CHAR_CR;
10441 break;
10442
10443 case PCRE2_NEWLINE_LF:
10444 cb.nllen = 1;
10445 cb.nl[0] = CHAR_NL;
10446 break;
10447
10448 case PCRE2_NEWLINE_NUL:
10449 cb.nllen = 1;
10450 cb.nl[0] = CHAR_NUL;
10451 break;
10452
10453 case PCRE2_NEWLINE_CRLF:
10454 cb.nllen = 2;
10455 cb.nl[0] = CHAR_CR;
10456 cb.nl[1] = CHAR_NL;
10457 break;
10458
10459 case PCRE2_NEWLINE_ANY:
10460 cb.nltype = NLTYPE_ANY;
10461 break;
10462
10463 case PCRE2_NEWLINE_ANYCRLF:
10464 cb.nltype = NLTYPE_ANYCRLF;
10465 break;
10466
10467 default:
10468 errorcode = ERR56;
10469 goto HAD_EARLY_ERROR;
10470 }
10471
10472 /* Pre-scan the pattern to do two things: (1) Discover the named groups and
10473 their numerical equivalents, so that this information is always available for
10474 the remaining processing. (2) At the same time, parse the pattern and put a
10475 processed version into the parsed_pattern vector. This has escapes interpreted
10476 and comments removed (amongst other things).
10477
10478 In all but one case, when PCRE2_AUTO_CALLOUT is not set, the number of unsigned
10479 32-bit ints in the parsed pattern is bounded by the length of the pattern plus
10480 one (for the terminator) plus four if PCRE2_EXTRA_MATCH_WORD or
10481 PCRE2_EXTRA_MATCH_LINE is set. The exception is 32-bit, non-UTF mode, where
10482 literal characters >= META_END (0x80000000) have to be coded as two units, so
10483 in that case we scan the pattern to count such values. */
10484
10485 #if PCRE2_CODE_UNIT_WIDTH == 32
10486 if (!utf)
10487 {
10488 PCRE2_SPTR p;
10489 for (p = ptr; p < cb.end_pattern; p++) if (*p >= META_END) big32count++;
10490 }
10491 #endif
10492
10493 /* Ensure that the parsed pattern buffer is big enough. When PCRE2_AUTO_CALLOUT
10494 is set we have to assume a numerical callout (4 elements) for each character
10495 plus one at the end. This is overkill, but memory is plentiful these days. For
10496 many smaller patterns the vector on the stack (which was set up above) can be
10497 used. */
10498
10499 parsed_size_needed = patlen - skipatstart + big32count;
10500
10501 if ((ccontext->extra_options &
10502 (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0)
10503 parsed_size_needed += 4;
10504
10505 if ((options & PCRE2_AUTO_CALLOUT) != 0)
10506 parsed_size_needed = (parsed_size_needed + 1) * 5;
10507
10508 if (parsed_size_needed >= PARSED_PATTERN_DEFAULT_SIZE)
10509 {
10510 uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(
10511 (parsed_size_needed + 1) * sizeof(uint32_t), ccontext->memctl.memory_data);
10512 if (heap_parsed_pattern == NULL)
10513 {
10514 *errorptr = ERR21;
10515 goto EXIT;
10516 }
10517 cb.parsed_pattern = heap_parsed_pattern;
10518 }
10519 cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed + 1;
10520
10521 /* Do the parsing scan. */
10522
10523 errorcode = parse_regex(ptr, cb.external_options, &has_lookbehind, &cb);
10524 if (errorcode != 0) goto HAD_CB_ERROR;
10525
10526 /* If there are any lookbehinds, scan the parsed pattern to figure out their
10527 lengths. Workspace is needed to remember whether numbered groups are or are not
10528 of limited length, and if limited, what the minimum and maximum lengths are.
10529 This caching saves re-computing the length of any group that is referenced more
10530 than once, which is particularly relevant when recursion is involved.
10531 Unnumbered groups do not have this exposure because they cannot be referenced.
10532 If there are sufficiently few groups, the default index vector on the stack, as
10533 set up above, can be used. Otherwise we have to get/free some heap memory. The
10534 vector must be initialized to zero. */
10535
10536 if (has_lookbehind)
10537 {
10538 int loopcount = 0;
10539 if (cb.bracount >= GROUPINFO_DEFAULT_SIZE/2)
10540 {
10541 cb.groupinfo = ccontext->memctl.malloc(
10542 (2 * (cb.bracount + 1))*sizeof(uint32_t), ccontext->memctl.memory_data);
10543 if (cb.groupinfo == NULL)
10544 {
10545 errorcode = ERR21;
10546 cb.erroroffset = 0;
10547 goto HAD_CB_ERROR;
10548 }
10549 }
10550 memset(cb.groupinfo, 0, (2 * cb.bracount + 1) * sizeof(uint32_t));
10551 errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb, &loopcount);
10552 if (errorcode != 0) goto HAD_CB_ERROR;
10553 }
10554
10555 /* For debugging, there is a function that shows the parsed pattern vector. */
10556
10557 #ifdef DEBUG_SHOW_PARSED
10558 fprintf(stderr, "+++ Pre-scan complete:\n");
10559 show_parsed(&cb);
10560 #endif
10561
10562 /* For debugging capturing information this code can be enabled. */
10563
10564 #ifdef DEBUG_SHOW_CAPTURES
10565 {
10566 named_group *ng = cb.named_groups;
10567 fprintf(stderr, "+++Captures: %d\n", cb.bracount);
10568 for (i = 0; i < cb.names_found; i++, ng++)
10569 {
10570 fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
10571 }
10572 }
10573 #endif
10574
10575 /* Pretend to compile the pattern while actually just accumulating the amount
10576 of memory required in the 'length' variable. This behaviour is triggered by
10577 passing a non-NULL final argument to compile_regex(). We pass a block of
10578 workspace (cworkspace) for it to compile parts of the pattern into; the
10579 compiled code is discarded when it is no longer needed, so hopefully this
10580 workspace will never overflow, though there is a test for its doing so.
10581
10582 On error, errorcode will be set non-zero, so we don't need to look at the
10583 result of the function. The initial options have been put into the cb block,
10584 but we still have to pass a separate options variable (the first argument)
10585 because the options may change as the pattern is processed. */
10586
10587 cb.erroroffset = patlen; /* For any subsequent errors that do not set it */
10588 pptr = cb.parsed_pattern;
10589 code = cworkspace;
10590 *code = OP_BRA;
10591
10592 (void)compile_regex(cb.external_options, ccontext->extra_options, &code, &pptr,
10593 &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, NULL,
10594 &cb, &length);
10595
10596 if (errorcode != 0) goto HAD_CB_ERROR; /* Offset is in cb.erroroffset */
10597
10598 /* This should be caught in compile_regex(), but just in case... */
10599
10600 if (length > MAX_PATTERN_SIZE)
10601 {
10602 errorcode = ERR20;
10603 goto HAD_CB_ERROR;
10604 }
10605
10606 /* Compute the size of, then, if not too large, get and initialize the data
10607 block for storing the compiled pattern and names table. Integer overflow should
10608 no longer be possible because nowadays we limit the maximum value of
10609 cb.names_found and cb.name_entry_size. */
10610
10611 re_blocksize = sizeof(pcre2_real_code) +
10612 CU2BYTES(length +
10613 (PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
10614
10615 if (re_blocksize > ccontext->max_pattern_compiled_length)
10616 {
10617 errorcode = ERR101;
10618 goto HAD_CB_ERROR;
10619 }
10620
10621 re = (pcre2_real_code *)
10622 ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
10623 if (re == NULL)
10624 {
10625 errorcode = ERR21;
10626 goto HAD_CB_ERROR;
10627 }
10628
10629 /* The compiler may put padding at the end of the pcre2_real_code structure in
10630 order to round it up to a multiple of 4 or 8 bytes. This means that when a
10631 compiled pattern is copied (for example, when serialized) undefined bytes are
10632 read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
10633 write to the last 8 bytes of the structure before setting the fields. */
10634
10635 memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
10636 re->memctl = ccontext->memctl;
10637 re->tables = tables;
10638 re->executable_jit = NULL;
10639 memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
10640 re->blocksize = re_blocksize;
10641 re->magic_number = MAGIC_NUMBER;
10642 re->compile_options = options;
10643 re->overall_options = cb.external_options;
10644 re->extra_options = ccontext->extra_options;
10645 re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
10646 re->limit_heap = limit_heap;
10647 re->limit_match = limit_match;
10648 re->limit_depth = limit_depth;
10649 re->first_codeunit = 0;
10650 re->last_codeunit = 0;
10651 re->bsr_convention = bsr;
10652 re->newline_convention = newline;
10653 re->max_lookbehind = 0;
10654 re->minlength = 0;
10655 re->top_bracket = 0;
10656 re->top_backref = 0;
10657 re->name_entry_size = cb.name_entry_size;
10658 re->name_count = cb.names_found;
10659
10660 /* The basic block is immediately followed by the name table, and the compiled
10661 code follows after that. */
10662
10663 codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) +
10664 re->name_entry_size * re->name_count;
10665
10666 /* Update the compile data block for the actual compile. The starting points of
10667 the name/number translation table and of the code are passed around in the
10668 compile data block. The start/end pattern and initial options are already set
10669 from the pre-compile phase, as is the name_entry_size field. */
10670
10671 cb.parens_depth = 0;
10672 cb.assert_depth = 0;
10673 cb.lastcapture = 0;
10674 cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
10675 cb.start_code = codestart;
10676 cb.req_varyopt = 0;
10677 cb.had_accept = FALSE;
10678 cb.had_pruneorskip = FALSE;
10679
10680 /* If any named groups were found, create the name/number table from the list
10681 created in the pre-pass. */
10682
10683 if (cb.names_found > 0)
10684 {
10685 named_group *ng = cb.named_groups;
10686 for (i = 0; i < cb.names_found; i++, ng++)
10687 add_name_to_table(&cb, ng->name, ng->length, ng->number, i);
10688 }
10689
10690 /* Set up a starting, non-extracting bracket, then compile the expression. On
10691 error, errorcode will be set non-zero, so we don't need to look at the result
10692 of the function here. */
10693
10694 pptr = cb.parsed_pattern;
10695 code = (PCRE2_UCHAR *)codestart;
10696 *code = OP_BRA;
10697 regexrc = compile_regex(re->overall_options, ccontext->extra_options, &code,
10698 &pptr, &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL,
10699 NULL, &cb, NULL);
10700 if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
10701 re->top_bracket = cb.bracount;
10702 re->top_backref = cb.top_backref;
10703 re->max_lookbehind = cb.max_lookbehind;
10704
10705 if (cb.had_accept)
10706 {
10707 reqcu = 0; /* Must disable after (*ACCEPT) */
10708 reqcuflags = REQ_NONE;
10709 re->flags |= PCRE2_HASACCEPT; /* Disables minimum length */
10710 }
10711
10712 /* Fill in the final opcode and check for disastrous overflow. If no overflow,
10713 but the estimated length exceeds the really used length, adjust the value of
10714 re->blocksize, and if valgrind support is configured, mark the extra allocated
10715 memory as unaddressable, so that any out-of-bound reads can be detected. */
10716
10717 *code++ = OP_END;
10718 usedlength = code - codestart;
10719 if (usedlength > length) errorcode = ERR23; else
10720 {
10721 re->blocksize -= CU2BYTES(length - usedlength);
10722 #ifdef SUPPORT_VALGRIND
10723 VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
10724 #endif
10725 }
10726
10727 /* Scan the pattern for recursion/subroutine calls and convert the group
10728 numbers into offsets. Maintain a small cache so that repeated groups containing
10729 recursions are efficiently handled. */
10730
10731 #define RSCAN_CACHE_SIZE 8
10732
10733 if (errorcode == 0 && cb.had_recurse)
10734 {
10735 PCRE2_UCHAR *rcode;
10736 PCRE2_SPTR rgroup;
10737 unsigned int ccount = 0;
10738 int start = RSCAN_CACHE_SIZE;
10739 recurse_cache rc[RSCAN_CACHE_SIZE];
10740
10741 for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf);
10742 rcode != NULL;
10743 rcode = (PCRE2_UCHAR *)find_recurse(rcode + 1 + LINK_SIZE, utf))
10744 {
10745 int p, groupnumber;
10746
10747 groupnumber = (int)GET(rcode, 1);
10748 if (groupnumber == 0) rgroup = codestart; else
10749 {
10750 PCRE2_SPTR search_from = codestart;
10751 rgroup = NULL;
10752 for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
10753 {
10754 if (groupnumber == rc[p].groupnumber)
10755 {
10756 rgroup = rc[p].group;
10757 break;
10758 }
10759
10760 /* Group n+1 must always start to the right of group n, so we can save
10761 search time below when the new group number is greater than any of the
10762 previously found groups. */
10763
10764 if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;
10765 }
10766
10767 if (rgroup == NULL)
10768 {
10769 rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);
10770 if (rgroup == NULL)
10771 {
10772 errorcode = ERR53;
10773 break;
10774 }
10775 if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
10776 rc[start].groupnumber = groupnumber;
10777 rc[start].group = rgroup;
10778 if (ccount < RSCAN_CACHE_SIZE) ccount++;
10779 }
10780 }
10781
10782 PUT(rcode, 1, rgroup - codestart);
10783 }
10784 }
10785
10786 /* In rare debugging situations we sometimes need to look at the compiled code
10787 at this stage. */
10788
10789 #ifdef DEBUG_CALL_PRINTINT
10790 pcre2_printint(re, stderr, TRUE);
10791 fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
10792 #endif
10793
10794 /* Unless disabled, check whether any single character iterators can be
10795 auto-possessified. The function overwrites the appropriate opcode values, so
10796 the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
10797 used in this code because at least one compiler gives a warning about loss of
10798 "const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the
10799 function call. */
10800
10801 if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
10802 {
10803 PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
10804 if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80;
10805 }
10806
10807 /* Failed to compile, or error while post-processing. */
10808
10809 if (errorcode != 0) goto HAD_CB_ERROR;
10810
10811 /* Successful compile. If the anchored option was not passed, set it if
10812 we can determine that the pattern is anchored by virtue of ^ characters or \A
10813 or anything else, such as starting with non-atomic .* when DOTALL is set and
10814 there are no occurrences of *PRUNE or *SKIP (though there is an option to
10815 disable this case). */
10816
10817 if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10818 is_anchored(codestart, 0, &cb, 0, FALSE))
10819 re->overall_options |= PCRE2_ANCHORED;
10820
10821 /* Set up the first code unit or startline flag, the required code unit, and
10822 then study the pattern. This code need not be obeyed if PCRE2_NO_START_OPTIMIZE
10823 is set, as the data it would create will not be used. Note that a first code
10824 unit (but not the startline flag) is useful for anchored patterns because it
10825 can still give a quick "no match" and also avoid searching for a last code
10826 unit. */
10827
10828 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
10829 {
10830 int minminlength = 0; /* For minimal minlength from first/required CU */
10831
10832 /* If we do not have a first code unit, see if there is one that is asserted
10833 (these are not saved during the compile because they can cause conflicts with
10834 actual literals that follow). */
10835
10836 if (firstcuflags >= REQ_NONE)
10837 firstcu = find_firstassertedcu(codestart, &firstcuflags, 0);
10838
10839 /* Save the data for a first code unit. The existence of one means the
10840 minimum length must be at least 1. */
10841
10842 if (firstcuflags < REQ_NONE)
10843 {
10844 re->first_codeunit = firstcu;
10845 re->flags |= PCRE2_FIRSTSET;
10846 minminlength++;
10847
10848 /* Handle caseless first code units. */
10849
10850 if ((firstcuflags & REQ_CASELESS) != 0)
10851 {
10852 if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
10853 {
10854 if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
10855 }
10856
10857 /* The first code unit is >= 128 in UTF or UCP mode, or >= 255 otherwise.
10858 In 8-bit UTF mode, codepoints in the range 128-255 are introductory code
10859 points and cannot have another case, but if UCP is set they may do. */
10860
10861 #ifdef SUPPORT_UNICODE
10862 #if PCRE2_CODE_UNIT_WIDTH == 8
10863 else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
10864 re->flags |= PCRE2_FIRSTCASELESS;
10865 #else
10866 else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
10867 UCD_OTHERCASE(firstcu) != firstcu)
10868 re->flags |= PCRE2_FIRSTCASELESS;
10869 #endif
10870 #endif /* SUPPORT_UNICODE */
10871 }
10872 }
10873
10874 /* When there is no first code unit, for non-anchored patterns, see if we can
10875 set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
10876 branches start with ^ and also when all branches start with non-atomic .* for
10877 non-DOTALL matches when *PRUNE and *SKIP are not present. (There is an option
10878 that disables this case.) */
10879
10880 else if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10881 is_startline(codestart, 0, &cb, 0, FALSE))
10882 re->flags |= PCRE2_STARTLINE;
10883
10884 /* Handle the "required code unit", if one is set. In the UTF case we can
10885 increment the minimum minimum length only if we are sure this really is a
10886 different character and not a non-starting code unit of the first character,
10887 because the minimum length count is in characters, not code units. */
10888
10889 if (reqcuflags < REQ_NONE)
10890 {
10891 #if PCRE2_CODE_UNIT_WIDTH == 16
10892 if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */
10893 firstcuflags >= REQ_NONE || /* First not set */
10894 (firstcu & 0xf800) != 0xd800 || /* First not surrogate */
10895 (reqcu & 0xfc00) != 0xdc00) /* Req not low surrogate */
10896 #elif PCRE2_CODE_UNIT_WIDTH == 8
10897 if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */
10898 firstcuflags >= REQ_NONE || /* First not set */
10899 (firstcu & 0x80) == 0 || /* First is ASCII */
10900 (reqcu & 0x80) == 0) /* Req is ASCII */
10901 #endif
10902 {
10903 minminlength++;
10904 }
10905
10906 /* In the case of an anchored pattern, set up the value only if it follows
10907 a variable length item in the pattern. */
10908
10909 if ((re->overall_options & PCRE2_ANCHORED) == 0 ||
10910 (reqcuflags & REQ_VARY) != 0)
10911 {
10912 re->last_codeunit = reqcu;
10913 re->flags |= PCRE2_LASTSET;
10914
10915 /* Handle caseless required code units as for first code units (above). */
10916
10917 if ((reqcuflags & REQ_CASELESS) != 0)
10918 {
10919 if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
10920 {
10921 if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
10922 }
10923 #ifdef SUPPORT_UNICODE
10924 #if PCRE2_CODE_UNIT_WIDTH == 8
10925 else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
10926 re->flags |= PCRE2_LASTCASELESS;
10927 #else
10928 else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
10929 UCD_OTHERCASE(reqcu) != reqcu)
10930 re->flags |= PCRE2_LASTCASELESS;
10931 #endif
10932 #endif /* SUPPORT_UNICODE */
10933 }
10934 }
10935 }
10936
10937 /* Study the compiled pattern to set up information such as a bitmap of
10938 starting code units and a minimum matching length. */
10939
10940 if (PRIV(study)(re) != 0)
10941 {
10942 errorcode = ERR31;
10943 goto HAD_CB_ERROR;
10944 }
10945
10946 /* If study() set a bitmap of starting code units, it implies a minimum
10947 length of at least one. */
10948
10949 if ((re->flags & PCRE2_FIRSTMAPSET) != 0 && minminlength == 0)
10950 minminlength = 1;
10951
10952 /* If the minimum length set (or not set) by study() is less than the minimum
10953 implied by required code units, override it. */
10954
10955 if (re->minlength < minminlength) re->minlength = minminlength;
10956 } /* End of start-of-match optimizations. */
10957
10958 /* Control ends up here in all cases. When running under valgrind, make a
10959 pattern's terminating zero defined again. If memory was obtained for the parsed
10960 version of the pattern, free it before returning. Also free the list of named
10961 groups if a larger one had to be obtained, and likewise the group information
10962 vector. */
10963
10964 EXIT:
10965 #ifdef SUPPORT_VALGRIND
10966 if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1));
10967 #endif
10968 if (cb.parsed_pattern != stack_parsed_pattern)
10969 ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);
10970 if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
10971 ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
10972 if (cb.groupinfo != stack_groupinfo)
10973 ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
10974 return re; /* Will be NULL after an error */
10975
10976 /* Errors discovered in parse_regex() set the offset value in the compile
10977 block. Errors discovered before it is called must compute it from the ptr
10978 value. After parse_regex() is called, the offset in the compile block is set to
10979 the end of the pattern, but certain errors in compile_regex() may reset it if
10980 an offset is available in the parsed pattern. */
10981
10982 HAD_CB_ERROR:
10983 ptr = pattern + cb.erroroffset;
10984
10985 HAD_EARLY_ERROR:
10986 *erroroffset = ptr - pattern;
10987
10988 HAD_ERROR:
10989 *errorptr = errorcode;
10990 pcre2_code_free(re);
10991 re = NULL;
10992 goto EXIT;
10993 }
10994
10995 /* These #undefs are here to enable unity builds with CMake. */
10996
10997 #undef NLBLOCK /* Block containing newline information */
10998 #undef PSSTART /* Field containing processed string start */
10999 #undef PSEND /* Field containing processed string end */
11000
11001 /* End of pcre2_compile.c */
11002