1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016-2023 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45
46 #define NLBLOCK cb /* Block containing newline information */
47 #define PSSTART start_pattern /* Field containing processed string start */
48 #define PSEND end_pattern /* Field containing processed string end */
49
50 #include "pcre2_internal.h"
51
52 /* In rare error cases debugging might require calling pcre2_printint(). */
53
54 #if 0
55 #ifdef EBCDIC
56 #define PRINTABLE(c) ((c) >= 64 && (c) < 255)
57 #else
58 #define PRINTABLE(c) ((c) >= 32 && (c) < 127)
59 #endif
60 #include "pcre2_printint.c"
61 #define DEBUG_CALL_PRINTINT
62 #endif
63
64 /* Other debugging code can be enabled by these defines. */
65
66 /* #define DEBUG_SHOW_CAPTURES */
67 /* #define DEBUG_SHOW_PARSED */
68
/* A handful of definitions depend on the code unit width. They are collected
here as macros so that the rest of the file needs fewer #if blocks.

  XDIGIT(c)             looks c up in xdigitab. In the 16-bit and 32-bit
                        cases the lookup only happens when MAX_255(c) is true;
                        otherwise the result is 0xff.
  STRING_UTFn_RIGHTPAR  expands to two comma-separated items: the
                        width-specific STRING_UTFxx_RIGHTPAR macro followed by
                        a number (5 for 8-bit, 6 for 16-bit and 32-bit). */

#if PCRE2_CODE_UNIT_WIDTH == 8
#define XDIGIT(c) xdigitab[c]
#define STRING_UTFn_RIGHTPAR STRING_UTF8_RIGHTPAR, 5

#elif PCRE2_CODE_UNIT_WIDTH == 16
#define XDIGIT(c) (MAX_255(c)? xdigitab[c] : 0xff)
#define STRING_UTFn_RIGHTPAR STRING_UTF16_RIGHTPAR, 6

#else  /* 32-bit */
#define XDIGIT(c) (MAX_255(c)? xdigitab[c] : 0xff)
#define STRING_UTFn_RIGHTPAR STRING_UTF32_RIGHTPAR, 6
#endif
86
/* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which
consists of uint32_t elements. Assume that if uint32_t can't hold it, two of
them will be able to (i.e. assume a 64-bit world). When two elements are used,
the more significant half is stored first.

  PUTOFFSET(s,p)       store s at p and advance p past it
  GETOFFSET(s,p)       load s from p and advance p past it
  GETPLUSOFFSET(s,p)   load s from the element(s) that follow p, advancing p
                       by SIZEOFFSET
  READPLUSOFFSET(s,p)  load s from the element(s) that follow p; p is unchanged
  SKIPOFFSET(p)        advance p by SIZEOFFSET
  SIZEOFFSET           number of uint32_t elements used by one offset

Every use of a macro argument is parenthesized so that an expression can be
passed without the surrounding operators regrouping it. In the two-element
PUTOFFSET the value is first converted to PCRE2_SIZE so that the shift by 32 is
always carried out on the wide type. Note that PUTOFFSET evaluates s more than
once in that variant, so s must not have side effects. */

#if PCRE2_SIZE_MAX <= UINT32_MAX
#define PUTOFFSET(s,p) *(p)++ = (s)
#define GETOFFSET(s,p) (s) = *(p)++
#define GETPLUSOFFSET(s,p) (s) = *(++(p))
#define READPLUSOFFSET(s,p) (s) = (p)[1]
#define SKIPOFFSET(p) (p)++
#define SIZEOFFSET 1
#else
#define PUTOFFSET(s,p) \
  { *(p)++ = (uint32_t)((PCRE2_SIZE)(s) >> 32); \
    *(p)++ = (uint32_t)((PCRE2_SIZE)(s) & 0xffffffffu); }
#define GETOFFSET(s,p) \
  { (s) = ((PCRE2_SIZE)(p)[0] << 32) | (PCRE2_SIZE)(p)[1]; (p) += 2; }
#define GETPLUSOFFSET(s,p) \
  { (s) = ((PCRE2_SIZE)(p)[1] << 32) | (PCRE2_SIZE)(p)[2]; (p) += 2; }
#define READPLUSOFFSET(s,p) \
  { (s) = ((PCRE2_SIZE)(p)[1] << 32) | (PCRE2_SIZE)(p)[2]; }
#define SKIPOFFSET(p) (p) += 2
#define SIZEOFFSET 2
#endif
110
/* Macros for manipulating elements of the parsed pattern vector.

  META_CODE(x)    the top 16 bits of x, left in place (the item's code)
  META_DATA(x)    the bottom 16 bits of x (the item's data)
  META_DIFF(x,y)  the difference x-y shifted down by 16 bits

The arguments are parenthesized so that expressions such as (a | b) can be
passed without the macro's own operators taking precedence over them. */

#define META_CODE(x)   ((x) & 0xffff0000u)
#define META_DATA(x)   ((x) & 0x0000ffffu)
#define META_DIFF(x,y) (((x)-(y))>>16)
116
117 /* Function definitions to allow mutual recursion */
118
119 #ifdef SUPPORT_UNICODE
120 static unsigned int
121 add_list_to_class_internal(uint8_t *, PCRE2_UCHAR **, uint32_t, uint32_t,
122 compile_block *, const uint32_t *, unsigned int);
123 #endif
124
125 static int
126 compile_regex(uint32_t, uint32_t, PCRE2_UCHAR **, uint32_t **, int *,
127 uint32_t, uint32_t *, uint32_t *, uint32_t *, uint32_t *, branch_chain *,
128 open_capitem *, compile_block *, PCRE2_SIZE *);
129
130 static int
131 get_branchlength(uint32_t **, int *, int *, int *, parsed_recurse_check *,
132 compile_block *);
133
134 static BOOL
135 set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,
136 compile_block *);
137
138 static int
139 check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *,
140 compile_block *, int *);
141
142
/*************************************************
*      Code parameters and static tables         *
*************************************************/

/* Largest group number and largest explicit repeat count. REPEAT_UNLIMITED is
one more than the largest repeat count. */

#define MAX_GROUP_NUMBER   65535u
#define MAX_REPEAT_COUNT   65535u
#define REPEAT_UNLIMITED   (MAX_REPEAT_COUNT+1)

/* COMPILE_WORK_SIZE is the size, in code units, of a workspace that lives on
the stack and is put to different uses by the different scans of the pattern.

The parsing and group-identifying pre-scan uses it to handle nesting, and for
that it must be 16-bit aligned. C16_WORK_SIZE is the same workspace expressed
as a number of 16-bit elements.

In the first compiling phase, which works out how much memory is needed, the
regex is partly compiled into this space. Compiled parts are thrown away as
soon as possible, so an overrun is not normally expected, but the code checks
for one anyway because pathological patterns can cause it. The size is scaled
by LINK_SIZE because the length of compiled items varies with it.

The real compile phase does not currently use this workspace. */

#define COMPILE_WORK_SIZE (3000*LINK_SIZE)   /* Size in code units */

#define C16_WORK_SIZE \
  ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t))
170
/* Information about the size of capturing groups is cached in a uint32_t
vector to improve performance. This is the size of the default vector, which
is created on the stack. */

#define GROUPINFO_DEFAULT_SIZE 256

/* Workspace overrun tests compare against a size reduced by this margin, so
that an overrun is detected before anything is written past the end of the
data block. */

#define WORK_SIZE_SAFETY_MARGIN (100)

/* Number of slots in the initial list used for remembering named groups
during the pre-compile. The initial list is on the stack; if it turns out to
be too small it is expanded, in a similar way to the workspace. */

#define NAMED_GROUP_LIST_SIZE  20

/* The pre-compiling pass turns the pattern into a parsed pattern held in a
vector of uint32_t. A vector of this size on the stack is used for short
patterns; longer patterns use heap memory. */

#define PARSED_PATTERN_DEFAULT_SIZE 1024

/* Upper limit used when checking that the variable holding the compiled
pattern length does not overflow. It is a little below INT_MAX so that group
terminating code units can be added without checking each time. */

#define OFLOW_MAX (INT_MAX - 20)
201
/* Code values for parsed patterns. A parsed pattern is a vector of 32-bit
unsigned ints in which any value below META_END is literal data. For the other
values, the top 16 bits identify the item and the bottom 16 bits are available
for additional data that some items need. Use the META_CODE, META_DATA, and
META_DIFF macros to take parsed pattern elements apart.

NOTE: The table of extra lengths (meta_extra_lengths, just below) has one entry
per code, in the same order. Any change here must be mirrored there. */

#define META_END              0x80000000u  /* End of pattern */

#define META_ALT              0x80010000u  /* Alternation */
#define META_ATOMIC           0x80020000u  /* Atomic group */
#define META_BACKREF          0x80030000u  /* Back reference */
#define META_BACKREF_BYNAME   0x80040000u  /* \k'name' */
#define META_BIGVALUE         0x80050000u  /* Next is a literal > META_END */
#define META_CALLOUT_NUMBER   0x80060000u  /* (?C with numerical argument */
#define META_CALLOUT_STRING   0x80070000u  /* (?C with string argument */
#define META_CAPTURE          0x80080000u  /* Capturing parenthesis */
#define META_CIRCUMFLEX       0x80090000u  /* ^ metacharacter */
#define META_CLASS            0x800a0000u  /* Start of non-empty class */
#define META_CLASS_EMPTY      0x800b0000u  /* Empty class */
#define META_CLASS_EMPTY_NOT  0x800c0000u  /* Negative empty class */
#define META_CLASS_END        0x800d0000u  /* End of non-empty class */
#define META_CLASS_NOT        0x800e0000u  /* Start of non-empty negative class */
#define META_COND_ASSERT      0x800f0000u  /* (?(?assertion)... */
#define META_COND_DEFINE      0x80100000u  /* (?(DEFINE)... */
#define META_COND_NAME        0x80110000u  /* (?(<name>)... */
#define META_COND_NUMBER      0x80120000u  /* (?(digits)... */
#define META_COND_RNAME       0x80130000u  /* (?(R&name)... */
#define META_COND_RNUMBER     0x80140000u  /* (?(Rdigits)... */
#define META_COND_VERSION     0x80150000u  /* (?(VERSION<op>x.y)... */
#define META_DOLLAR           0x80160000u  /* $ metacharacter */
#define META_DOT              0x80170000u  /* . metacharacter */
#define META_ESCAPE           0x80180000u  /* \d and friends */
#define META_KET              0x80190000u  /* Closing parenthesis */
#define META_NOCAPTURE        0x801a0000u  /* Non-capturing parentheses */
#define META_OPTIONS          0x801b0000u  /* (?i) and friends */
#define META_POSIX            0x801c0000u  /* POSIX class item */
#define META_POSIX_NEG        0x801d0000u  /* Negative POSIX class item */
#define META_RANGE_ESCAPED    0x801e0000u  /* Range with at least one escape */
#define META_RANGE_LITERAL    0x801f0000u  /* Range defined literally */
#define META_RECURSE          0x80200000u  /* Recursion */
#define META_RECURSE_BYNAME   0x80210000u  /* (?&name) */
#define META_SCRIPT_RUN       0x80220000u  /* (*script_run:...) */

/* The next four must stay adjacent: this makes it easy to check that an
assertion is present where one is expected in a conditional group. */

#define META_LOOKAHEAD        0x80230000u  /* (?= */
#define META_LOOKAHEADNOT     0x80240000u  /* (?! */
#define META_LOOKBEHIND       0x80250000u  /* (?<= */
#define META_LOOKBEHINDNOT    0x80260000u  /* (?<! */

/* These two cannot be conditions. */

#define META_LOOKAHEAD_NA     0x80270000u  /* (*napla: */
#define META_LOOKBEHIND_NA    0x80280000u  /* (*naplb: */

/* The verb codes must keep this order and have consecutive values. Each _ARG
version of COMMIT, PRUNE, SKIP, and THEN must come immediately after its
non-argument version. */

#define META_MARK             0x80290000u  /* (*MARK) */
#define META_ACCEPT           0x802a0000u  /* (*ACCEPT) */
#define META_FAIL             0x802b0000u  /* (*FAIL) */
#define META_COMMIT           0x802c0000u  /* (*COMMIT) */
#define META_COMMIT_ARG       0x802d0000u  /* (*COMMIT) with argument */
#define META_PRUNE            0x802e0000u  /* (*PRUNE) */
#define META_PRUNE_ARG        0x802f0000u  /* (*PRUNE) with argument */
#define META_SKIP             0x80300000u  /* (*SKIP) */
#define META_SKIP_ARG         0x80310000u  /* (*SKIP) with argument */
#define META_THEN             0x80320000u  /* (*THEN) */
#define META_THEN_ARG         0x80330000u  /* (*THEN) with argument */

/* The quantifier codes must all stay together, in adjacent groups of three. */

#define META_ASTERISK         0x80340000u  /* *  */
#define META_ASTERISK_PLUS    0x80350000u  /* *+ */
#define META_ASTERISK_QUERY   0x80360000u  /* *? */
#define META_PLUS             0x80370000u  /* +  */
#define META_PLUS_PLUS        0x80380000u  /* ++ */
#define META_PLUS_QUERY       0x80390000u  /* +? */
#define META_QUERY            0x803a0000u  /* ?  */
#define META_QUERY_PLUS       0x803b0000u  /* ?+ */
#define META_QUERY_QUERY      0x803c0000u  /* ?? */
#define META_MINMAX           0x803d0000u  /* {n,m}  repeat */
#define META_MINMAX_PLUS      0x803e0000u  /* {n,m}+ repeat */
#define META_MINMAX_QUERY     0x803f0000u  /* {n,m}? repeat */

#define META_FIRST_QUANTIFIER META_ASTERISK
#define META_LAST_QUANTIFIER  META_MINMAX_QUERY

/* This is a special "meta code" whose only purpose is to tell (*asr: apart
from (*sr: in the table of alphabetic assertions. It never appears in a parsed
pattern, because (*asr: has become (*sr:(*atomic: by that stage. It therefore
needs no length entry, which is why a high value is used. */

#define META_ATOMIC_SCRIPT_RUN 0x8fff0000u
301
302 /* Table of extra lengths for each of the meta codes. Must be kept in step with
303 the definitions above. For some items these values are a basic length to which
304 a variable amount has to be added. */
305
306 static unsigned char meta_extra_lengths[] = {
307 0, /* META_END */
308 0, /* META_ALT */
309 0, /* META_ATOMIC */
310 0, /* META_BACKREF - more if group is >= 10 */
311 1+SIZEOFFSET, /* META_BACKREF_BYNAME */
312 1, /* META_BIGVALUE */
313 3, /* META_CALLOUT_NUMBER */
314 3+SIZEOFFSET, /* META_CALLOUT_STRING */
315 0, /* META_CAPTURE */
316 0, /* META_CIRCUMFLEX */
317 0, /* META_CLASS */
318 0, /* META_CLASS_EMPTY */
319 0, /* META_CLASS_EMPTY_NOT */
320 0, /* META_CLASS_END */
321 0, /* META_CLASS_NOT */
322 0, /* META_COND_ASSERT */
323 SIZEOFFSET, /* META_COND_DEFINE */
324 1+SIZEOFFSET, /* META_COND_NAME */
325 1+SIZEOFFSET, /* META_COND_NUMBER */
326 1+SIZEOFFSET, /* META_COND_RNAME */
327 1+SIZEOFFSET, /* META_COND_RNUMBER */
328 3, /* META_COND_VERSION */
329 0, /* META_DOLLAR */
330 0, /* META_DOT */
331 0, /* META_ESCAPE - more for ESC_P, ESC_p, ESC_g, ESC_k */
332 0, /* META_KET */
333 0, /* META_NOCAPTURE */
334 1, /* META_OPTIONS */
335 1, /* META_POSIX */
336 1, /* META_POSIX_NEG */
337 0, /* META_RANGE_ESCAPED */
338 0, /* META_RANGE_LITERAL */
339 SIZEOFFSET, /* META_RECURSE */
340 1+SIZEOFFSET, /* META_RECURSE_BYNAME */
341 0, /* META_SCRIPT_RUN */
342 0, /* META_LOOKAHEAD */
343 0, /* META_LOOKAHEADNOT */
344 SIZEOFFSET, /* META_LOOKBEHIND */
345 SIZEOFFSET, /* META_LOOKBEHINDNOT */
346 0, /* META_LOOKAHEAD_NA */
347 SIZEOFFSET, /* META_LOOKBEHIND_NA */
348 1, /* META_MARK - plus the string length */
349 0, /* META_ACCEPT */
350 0, /* META_FAIL */
351 0, /* META_COMMIT */
352 1, /* META_COMMIT_ARG - plus the string length */
353 0, /* META_PRUNE */
354 1, /* META_PRUNE_ARG - plus the string length */
355 0, /* META_SKIP */
356 1, /* META_SKIP_ARG - plus the string length */
357 0, /* META_THEN */
358 1, /* META_THEN_ARG - plus the string length */
359 0, /* META_ASTERISK */
360 0, /* META_ASTERISK_PLUS */
361 0, /* META_ASTERISK_QUERY */
362 0, /* META_PLUS */
363 0, /* META_PLUS_PLUS */
364 0, /* META_PLUS_QUERY */
365 0, /* META_QUERY */
366 0, /* META_QUERY_PLUS */
367 0, /* META_QUERY_QUERY */
368 2, /* META_MINMAX */
369 2, /* META_MINMAX_PLUS */
370 2 /* META_MINMAX_QUERY */
371 };
372
/* The kinds of skip that can be asked for when stepping over part of a parsed
pattern. */

enum {
  PSKIP_ALT,
  PSKIP_CLASS,
  PSKIP_KET
};
376
/* Macro for setting individual bits in class bitmaps: SETBIT(a,b) sets bit
number b in the byte vector a, where bit b lives in byte b/8 at position b&7.

It took some experimenting to figure out how to stop gcc 5.3.0 from warning
with -Wconversion. This version gets a warning:

  #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1u << ((b)&7))

Let's hope the apparently less efficient version isn't actually so bad if the
compiler is clever with identical subexpressions.

Both arguments are parenthesized so that an expression such as (map + 1) can
be passed as the vector. Each argument is evaluated more than once, so neither
may have side effects. */

#define SETBIT(a,b) (a)[(b)/8] = (uint8_t)((a)[(b)/8] | (1u << ((b)&7)))
387
/* Values and flags for the unsigned xxcuflags variables. Each of these
accompanies an xxcu variable, and together they describe a first or required
code unit. If the flags value is REQ_NONE or greater, no code unit is set.
Otherwise the matching xxcu variable is set and the low-valued flag bits
apply. */

#define REQ_UNSET     0xffffffffu  /* Not yet found anything */
#define REQ_NONE      0xfffffffeu  /* Found not fixed character */
#define REQ_CASELESS  0x00000001u  /* Code unit in xxcu is caseless */
#define REQ_VARY      0x00000002u  /* Code unit is followed by non-literal */

/* Flags and mask for entries in the groupinfo vector. */

#define GI_SET_FIXED_LENGTH   0x80000000u
#define GI_NOT_FIXED_LENGTH   0x40000000u
#define GI_FIXED_LENGTH_MASK  0x0000ffffu
403
/* A plain range test for a decimal digit. It gives the right answer for
ASCII/Unicode and for EBCDIC alike, and it is fast: a good compiler can reduce
it to a subtraction followed by one unsigned comparison. The argument is
evaluated more than once. */

#define IS_DIGIT(x) (CHAR_0 <= (x) && (x) <= CHAR_9)
409
/* Table to identify hex digits. The tables in chartables are dependent on the
locale, and may mark arbitrary characters as digits. Only 0-9, a-f, and A-F
are wanted as hex digits here, which is the reason for this private table. It
costs 256 bytes, but it is a lot faster than doing character value tests (at
least in some simple cases that were timed), and in some applications one
wants PCRE2 to compile efficiently as well as match efficiently. Each entry is
the binary value of the hex digit, or 0xff if the character is not a hex
digit. The comment at the start of each row is the index of its first entry. */

#ifndef EBCDIC

/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
UTF-8 mode. */

static const uint8_t xdigitab[] =
  {
  /* 0x00 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x08 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x10 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x18 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x20 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x28 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x30 */ 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,  /* '0' - '7' */
  /* 0x38 */ 0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff,  /* '8' - '9' */
  /* 0x40 */ 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,  /* 'A' - 'F' */
  /* 0x48 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x50 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x58 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x60 */ 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,  /* 'a' - 'f' */
  /* 0x68 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x70 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x78 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x80 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x88 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x90 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x98 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xa0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xa8 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xb0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xb8 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xc0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xc8 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xd0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xd8 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xe0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xe8 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xf0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xf8 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
  };

#else

/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */

static const uint8_t xdigitab[] =
  {
  /* 0x00 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x08 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x10 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x18 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x20 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x28 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x30 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x38 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x40 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x48 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x50 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x58 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x60 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x68 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x70 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x78 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x80 */ 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,  /* 'a' - 'f' */
  /* 0x88 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x90 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x98 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xa0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xa8 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xb0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xb8 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xc0 */ 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,  /* 'A' - 'F' */
  /* 0xc8 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xd0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xd8 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xe0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xe8 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xf0 */ 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,  /* '0' - '7' */
  /* 0xf8 */ 0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff   /* '8' - '9' */
  };

#endif  /* EBCDIC */
496
497
498 /* Table for handling alphanumeric escaped characters. Positive returns are
499 simple data values; negative values are for special things like \d and so on.
500 Zero means further processing is needed (for things like \x), or the escape is
501 invalid. */
502
503 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
504 in UTF-8 mode. It runs from '0' to 'z'. */
505
506 #ifndef EBCDIC
507 #define ESCAPES_FIRST CHAR_0
508 #define ESCAPES_LAST CHAR_z
509 #define UPPER_CASE(c) (c-32)
510
511 static const short int escapes[] = {
512 0, 0,
513 0, 0,
514 0, 0,
515 0, 0,
516 0, 0,
517 CHAR_COLON, CHAR_SEMICOLON,
518 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
519 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
520 CHAR_COMMERCIAL_AT, -ESC_A,
521 -ESC_B, -ESC_C,
522 -ESC_D, -ESC_E,
523 0, -ESC_G,
524 -ESC_H, 0,
525 0, -ESC_K,
526 0, 0,
527 -ESC_N, 0,
528 -ESC_P, -ESC_Q,
529 -ESC_R, -ESC_S,
530 0, 0,
531 -ESC_V, -ESC_W,
532 -ESC_X, 0,
533 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
534 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
535 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
536 CHAR_GRAVE_ACCENT, CHAR_BEL,
537 -ESC_b, 0,
538 -ESC_d, CHAR_ESC,
539 CHAR_FF, 0,
540 -ESC_h, 0,
541 0, -ESC_k,
542 0, 0,
543 CHAR_LF, 0,
544 -ESC_p, 0,
545 CHAR_CR, -ESC_s,
546 CHAR_HT, 0,
547 -ESC_v, -ESC_w,
548 0, 0,
549 -ESC_z
550 };
551
552 #else
553
554 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
555 It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code
556 is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a
557 because it is defined as 'a', which of course picks up the ASCII value. */
558
559 #if 'a' == 0x81 /* Check for a real EBCDIC environment */
560 #define ESCAPES_FIRST CHAR_a
561 #define ESCAPES_LAST CHAR_9
562 #define UPPER_CASE(c) (c+64)
563 #else /* Testing in an ASCII environment */
564 #define ESCAPES_FIRST ((unsigned char)'\x81') /* EBCDIC 'a' */
565 #define ESCAPES_LAST ((unsigned char)'\xf9') /* EBCDIC '9' */
566 #define UPPER_CASE(c) (c-32)
567 #endif
568
569 static const short int escapes[] = {
570 /* 80 */ CHAR_BEL, -ESC_b, 0, -ESC_d, CHAR_ESC, CHAR_FF, 0,
571 /* 88 */ -ESC_h, 0, 0, '{', 0, 0, 0, 0,
572 /* 90 */ 0, 0, -ESC_k, 0, 0, CHAR_LF, 0, -ESC_p,
573 /* 98 */ 0, CHAR_CR, 0, '}', 0, 0, 0, 0,
574 /* A0 */ 0, '~', -ESC_s, CHAR_HT, 0, -ESC_v, -ESC_w, 0,
575 /* A8 */ 0, -ESC_z, 0, 0, 0, '[', 0, 0,
576 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
577 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
578 /* C0 */ '{', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G,
579 /* C8 */ -ESC_H, 0, 0, 0, 0, 0, 0, 0,
580 /* D0 */ '}', 0, -ESC_K, 0, 0, -ESC_N, 0, -ESC_P,
581 /* D8 */ -ESC_Q, -ESC_R, 0, 0, 0, 0, 0, 0,
582 /* E0 */ '\\', 0, -ESC_S, 0, 0, -ESC_V, -ESC_W, -ESC_X,
583 /* E8 */ 0, -ESC_Z, 0, 0, 0, 0, 0, 0,
584 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
585 /* F8 */ 0, 0
586 };
587
588 /* We also need a table of characters that may follow \c in an EBCDIC
589 environment for characters 0-31. */
590
591 static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
592
593 #endif /* EBCDIC */
594
595
596 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
597 searched linearly. Put all the names into a single string, in order to reduce
598 the number of relocations when a shared library is dynamically linked. The
599 string is built from string macros so that it works in UTF-8 mode on EBCDIC
600 platforms. */
601
602 typedef struct verbitem {
603 unsigned int len; /* Length of verb name */
604 uint32_t meta; /* Base META_ code */
605 int has_arg; /* Argument requirement */
606 } verbitem;
607
608 static const char verbnames[] =
609 "\0" /* Empty name is a shorthand for MARK */
610 STRING_MARK0
611 STRING_ACCEPT0
612 STRING_F0
613 STRING_FAIL0
614 STRING_COMMIT0
615 STRING_PRUNE0
616 STRING_SKIP0
617 STRING_THEN;
618
619 static const verbitem verbs[] = {
620 { 0, META_MARK, +1 }, /* > 0 => must have an argument */
621 { 4, META_MARK, +1 },
622 { 6, META_ACCEPT, -1 }, /* < 0 => Optional argument, convert to pre-MARK */
623 { 1, META_FAIL, -1 },
624 { 4, META_FAIL, -1 },
625 { 6, META_COMMIT, 0 },
626 { 5, META_PRUNE, 0 }, /* Optional argument; bump META code if found */
627 { 4, META_SKIP, 0 },
628 { 4, META_THEN, 0 }
629 };
630
631 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
632
633 /* Verb opcodes, indexed by their META code offset from META_MARK. */
634
635 static const uint32_t verbops[] = {
636 OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,
637 OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };
638
639 /* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. */
640
641 typedef struct alasitem {
642 unsigned int len; /* Length of name */
643 uint32_t meta; /* Base META_ code */
644 } alasitem;
645
646 static const char alasnames[] =
647 STRING_pla0
648 STRING_plb0
649 STRING_napla0
650 STRING_naplb0
651 STRING_nla0
652 STRING_nlb0
653 STRING_positive_lookahead0
654 STRING_positive_lookbehind0
655 STRING_non_atomic_positive_lookahead0
656 STRING_non_atomic_positive_lookbehind0
657 STRING_negative_lookahead0
658 STRING_negative_lookbehind0
659 STRING_atomic0
660 STRING_sr0
661 STRING_asr0
662 STRING_script_run0
663 STRING_atomic_script_run;
664
665 static const alasitem alasmeta[] = {
666 { 3, META_LOOKAHEAD },
667 { 3, META_LOOKBEHIND },
668 { 5, META_LOOKAHEAD_NA },
669 { 5, META_LOOKBEHIND_NA },
670 { 3, META_LOOKAHEADNOT },
671 { 3, META_LOOKBEHINDNOT },
672 { 18, META_LOOKAHEAD },
673 { 19, META_LOOKBEHIND },
674 { 29, META_LOOKAHEAD_NA },
675 { 30, META_LOOKBEHIND_NA },
676 { 18, META_LOOKAHEADNOT },
677 { 19, META_LOOKBEHINDNOT },
678 { 6, META_ATOMIC },
679 { 2, META_SCRIPT_RUN }, /* sr = script run */
680 { 3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */
681 { 10, META_SCRIPT_RUN }, /* script run */
682 { 17, META_ATOMIC_SCRIPT_RUN } /* atomic script run */
683 };
684
685 static const int alascount = sizeof(alasmeta)/sizeof(alasitem);
686
687 /* Offsets from OP_STAR for case-independent and negative repeat opcodes. */
688
689 static uint32_t chartypeoffset[] = {
690 OP_STAR - OP_STAR, OP_STARI - OP_STAR,
691 OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };
692
693 /* Tables of names of POSIX character classes and their lengths. The names are
694 now all in a single string, to reduce the number of relocations when a shared
695 library is dynamically loaded. The list of lengths is terminated by a zero
696 length entry. The first three must be alpha, lower, upper, as this is assumed
697 for handling case independence. The indices for several classes are needed, so
698 identify them. */
699
700 static const char posix_names[] =
701 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
702 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
703 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
704 STRING_word0 STRING_xdigit;
705
706 static const uint8_t posix_name_lengths[] = {
707 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
708
709 #define PC_DIGIT 7
710 #define PC_GRAPH 8
711 #define PC_PRINT 9
712 #define PC_PUNCT 10
713 #define PC_XDIGIT 13
714
715 /* Table of class bit maps for each POSIX class. Each class is formed from a
716 base map, with an optional addition or removal of another map. Then, for some
717 classes, there is some additional tweaking: for [:blank:] the vertical space
718 characters are removed, and for [:alpha:] and [:alnum:] the underscore
719 character is removed. The triples in the table consist of the base map offset,
720 second map offset or -1 if no second map, and a non-negative value for map
721 addition or a negative value for map subtraction (if there are two maps). The
722 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
723 remove vertical space characters, 2 => remove underscore. */
724
725 static const int posix_class_maps[] = {
726 cbit_word, cbit_digit, -2, /* alpha */
727 cbit_lower, -1, 0, /* lower */
728 cbit_upper, -1, 0, /* upper */
729 cbit_word, -1, 2, /* alnum - word without underscore */
730 cbit_print, cbit_cntrl, 0, /* ascii */
731 cbit_space, -1, 1, /* blank - a GNU extension */
732 cbit_cntrl, -1, 0, /* cntrl */
733 cbit_digit, -1, 0, /* digit */
734 cbit_graph, -1, 0, /* graph */
735 cbit_print, -1, 0, /* print */
736 cbit_punct, -1, 0, /* punct */
737 cbit_space, -1, 0, /* space */
738 cbit_word, -1, 0, /* word - a Perl extension */
739 cbit_xdigit, -1, 0 /* xdigit */
740 };
741
#ifdef SUPPORT_UNICODE

/* The POSIX class Unicode property substitutes that are used in UCP mode must
be in the order of the POSIX class names, defined above. Each class has a pair
of values. The table is a pure lookup, so it is declared const.

Notes on individual entries:
  space   Xps is POSIX space, but from 8.34 Perl and POSIX space are the same
  xdigit  Perl has additional hex digits */

static const int posix_substitutes[] = {
  PT_GC, ucp_L,     /* alpha */
  PT_PC, ucp_Ll,    /* lower */
  PT_PC, ucp_Lu,    /* upper */
  PT_ALNUM, 0,      /* alnum */
  -1, 0,            /* ascii, treat as non-UCP */
  -1, 1,            /* blank, treat as \h */
  PT_PC, ucp_Cc,    /* cntrl */
  PT_PC, ucp_Nd,    /* digit */
  PT_PXGRAPH, 0,    /* graph */
  PT_PXPRINT, 0,    /* print */
  PT_PXPUNCT, 0,    /* punct */
  PT_PXSPACE, 0,    /* space */
  PT_WORD, 0,       /* word */
  PT_PXXDIGIT, 0    /* xdigit */
};

/* Number of pairs in the table. The divisor uses the table's own element type
so that the count stays right whatever the size of that type. */

#define POSIX_SUBSIZE \
  (sizeof(posix_substitutes) / (2*sizeof(posix_substitutes[0])))

#endif  /* SUPPORT_UNICODE */
765
/* Masks for checking option settings. The LITERAL masks list the subset of
options that is allowed when PCRE2_LITERAL is set; the full masks add the
remaining options to that subset. There is one pair of masks for the main
options and one pair for the extra options. */

#define PUBLIC_LITERAL_COMPILE_OPTIONS \
  (PCRE2_ANCHORED| \
   PCRE2_AUTO_CALLOUT| \
   PCRE2_CASELESS| \
   PCRE2_ENDANCHORED| \
   PCRE2_FIRSTLINE| \
   PCRE2_LITERAL| \
   PCRE2_MATCH_INVALID_UTF| \
   PCRE2_NO_START_OPTIMIZE| \
   PCRE2_NO_UTF_CHECK| \
   PCRE2_USE_OFFSET_LIMIT| \
   PCRE2_UTF)

#define PUBLIC_COMPILE_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_OPTIONS| \
   PCRE2_ALLOW_EMPTY_CLASS| \
   PCRE2_ALT_BSUX| \
   PCRE2_ALT_CIRCUMFLEX| \
   PCRE2_ALT_VERBNAMES| \
   PCRE2_DOLLAR_ENDONLY| \
   PCRE2_DOTALL| \
   PCRE2_DUPNAMES| \
   PCRE2_EXTENDED| \
   PCRE2_EXTENDED_MORE| \
   PCRE2_MATCH_UNSET_BACKREF| \
   PCRE2_MULTILINE| \
   PCRE2_NEVER_BACKSLASH_C| \
   PCRE2_NEVER_UCP| \
   PCRE2_NEVER_UTF| \
   PCRE2_NO_AUTO_CAPTURE| \
   PCRE2_NO_AUTO_POSSESS| \
   PCRE2_NO_DOTSTAR_ANCHOR| \
   PCRE2_UCP| \
   PCRE2_UNGREEDY)

#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
  (PCRE2_EXTRA_MATCH_LINE| \
   PCRE2_EXTRA_MATCH_WORD| \
   PCRE2_EXTRA_CASELESS_RESTRICT)

#define PUBLIC_COMPILE_EXTRA_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \
   PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES| \
   PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \
   PCRE2_EXTRA_ESCAPED_CR_IS_LF| \
   PCRE2_EXTRA_ALT_BSUX| \
   PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK| \
   PCRE2_EXTRA_ASCII_BSD| \
   PCRE2_EXTRA_ASCII_BSS| \
   PCRE2_EXTRA_ASCII_BSW| \
   PCRE2_EXTRA_ASCII_POSIX| \
   PCRE2_EXTRA_ASCII_DIGIT)
793
/* Compile time error code numbers. They are given names so that they can more
easily be tracked. When a new number is added, the tables called eint1 and
eint2 in pcre2posix.c may need to be updated, and a new error text must be
added to compile_error_texts in pcre2_error.c. Also, the error codes in
pcre2.h.in must be updated - their values are exactly 100 greater than these
values.

Only ERR0 has an explicit value (COMPILE_ERROR_BASE); every later name is one
more than the name before it, so ERRn is COMPILE_ERROR_BASE + n. New names must
therefore be appended at the end, never inserted. */

enum { ERR0 = COMPILE_ERROR_BASE,
       ERR1,  ERR2,  ERR3,  ERR4,  ERR5,  ERR6,  ERR7,  ERR8,  ERR9,  ERR10,
       ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
       ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30,
       ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
       ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
       ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
       ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
       ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
       ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90,
       ERR91, ERR92, ERR93, ERR94, ERR95, ERR96, ERR97, ERR98, ERR99, ERR100 };
812
/* This is a table of start-of-pattern options such as (*UTF) and settings such
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
generic and always supported.

The enumeration below classifies each table entry: it is the value stored in
the "type" field of a pso structure and says how the entry's "value" field (or
the digits that follow the name in the pattern) should be interpreted. */

enum { PSO_OPT,     /* Value is an option bit */
       PSO_FLG,     /* Value is a flag bit */
       PSO_NL,      /* Value is a newline type */
       PSO_BSR,     /* Value is a \R type */
       PSO_LIMH,    /* Read integer value for heap limit */
       PSO_LIMM,    /* Read integer value for match limit */
       PSO_LIMD     /* Read integer value for depth limit */
     };
826
/* One entry in the table of start-of-pattern options (see pso_list). */

typedef struct pso {
  const uint8_t *name;   /* Text of the item as written in pso_list */
  uint16_t length;       /* Length that goes with name */
  uint16_t type;         /* One of the PSO_xxx values */
  uint32_t value;        /* Bit or setting for the item; 0 for the LIMIT types */
} pso;
833
/* The table of recognized start-of-pattern items. Each row gives the name, its
length, the PSO_xxx type, and the associated value. The names whose macro ends
in _RIGHTPAR are complete items; those ending in _EQ are the LIMIT settings,
which have type PSO_LIMx and a zero value here.

NB: STRING_UTFn_RIGHTPAR contains the length as well, which is why the first
row has only three visible initializers. */

static const pso pso_list[] = {
  { (uint8_t *)STRING_UTFn_RIGHTPAR,                  PSO_OPT, PCRE2_UTF },
  { (uint8_t *)STRING_UTF_RIGHTPAR,                4, PSO_OPT, PCRE2_UTF },
  { (uint8_t *)STRING_UCP_RIGHTPAR,                4, PSO_OPT, PCRE2_UCP },
  { (uint8_t *)STRING_NOTEMPTY_RIGHTPAR,           9, PSO_FLG, PCRE2_NOTEMPTY_SET },
  { (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR,  17, PSO_FLG, PCRE2_NE_ATST_SET },
  { (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR,   16, PSO_OPT, PCRE2_NO_AUTO_POSSESS },
  { (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR },
  { (uint8_t *)STRING_NO_JIT_RIGHTPAR,             7, PSO_FLG, PCRE2_NOJIT },
  { (uint8_t *)STRING_NO_START_OPT_RIGHTPAR,      13, PSO_OPT, PCRE2_NO_START_OPTIMIZE },
  { (uint8_t *)STRING_LIMIT_HEAP_EQ,              11, PSO_LIMH, 0 },
  { (uint8_t *)STRING_LIMIT_MATCH_EQ,             12, PSO_LIMM, 0 },
  { (uint8_t *)STRING_LIMIT_DEPTH_EQ,             12, PSO_LIMD, 0 },
  { (uint8_t *)STRING_LIMIT_RECURSION_EQ,         16, PSO_LIMD, 0 },
  { (uint8_t *)STRING_CR_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_CR },
  { (uint8_t *)STRING_LF_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_LF },
  { (uint8_t *)STRING_CRLF_RIGHTPAR,               5, PSO_NL,  PCRE2_NEWLINE_CRLF },
  { (uint8_t *)STRING_ANY_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_ANY },
  { (uint8_t *)STRING_NUL_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_NUL },
  { (uint8_t *)STRING_ANYCRLF_RIGHTPAR,            8, PSO_NL,  PCRE2_NEWLINE_ANYCRLF },
  { (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_ANYCRLF },
  { (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_UNICODE }
};
859
/* This table is used when converting repeating opcodes into possessified
versions as a result of an explicit possessive quantifier such as ++. A zero
value means there is no possessified version - in those cases the item in
question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
because all relevant opcodes are less than that.

The comments on each row name the opcodes whose positions that row fills, so
the rows must be kept in step with the opcode numbering. Only the greedy form
of each repeat has a non-zero entry; the minimizing form, the EXACT form and
the already-possessive forms are all zero. */

static const uint8_t opcode_possessify[] = {
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */

  0,                       /* NOTI */
  OP_POSSTAR, 0,           /* STAR, MINSTAR */
  OP_POSPLUS, 0,           /* PLUS, MINPLUS */
  OP_POSQUERY, 0,          /* QUERY, MINQUERY */
  OP_POSUPTO, 0,           /* UPTO, MINUPTO */
  0,                       /* EXACT */
  0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */

  OP_POSSTARI, 0,          /* STARI, MINSTARI */
  OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
  OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
  OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
  0,                       /* EXACTI */
  0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */

  OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
  OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
  OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
  OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
  0,                       /* NOTEXACT */
  0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */

  OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
  OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
  OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
  OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
  0,                       /* NOTEXACTI */
  0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */

  OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
  OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
  OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
  OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
  0,                       /* TYPEEXACT */
  0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */

  OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
  OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
  OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
  OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
  0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */

  0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
  0, 0,                    /* REF, REFI */
  0, 0,                    /* DNREF, DNREFI */
  0, 0                     /* RECURSE, CALLOUT */
};
917
918
#ifdef DEBUG_SHOW_PARSED
/*************************************************
*     Show the parsed pattern for debugging      *
*************************************************/

/* For debugging the pre-scan, this code, which outputs the parsed data vector,
can be enabled. One line is written to stderr for each element, starting with
"+++", the element's index in the vector, and its raw value in hexadecimal.
Literal code points (values below META_END) are echoed when they are printable
ASCII; META items are decoded together with any data words that follow them.
The walk stops after META_END has been shown, or when a META code that is not
known to this function is found.

The conversion specifiers are chosen to match the argument types: %u for the
uint32_t items taken from the vector, %zu for PCRE2_SIZE offsets, and an
explicit (int) cast where a code point is printed with %c.

Argument:
  cb      the compile block; cb->parsed_pattern is the vector that is shown,
          and cb->small_ref_offset supplies the offsets for back references
          to groups below 10

Returns:  nothing
*/

static void show_parsed(compile_block *cb)
{
uint32_t *pptr = cb->parsed_pattern;

for (;;)
  {
  int max, min;
  PCRE2_SIZE offset;
  uint32_t i;
  uint32_t length;
  uint32_t meta_arg = META_DATA(*pptr);

  fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr);

  if (*pptr < META_END)
    {
    if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", (int)*pptr);
    pptr++;
    }

  else switch (META_CODE(*pptr++))
    {
    default:
    fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n");
    return;

    case META_END:
    fprintf(stderr, "META_END\n");
    return;

    case META_CAPTURE:
    fprintf(stderr, "META_CAPTURE %u", meta_arg);
    break;

    case META_RECURSE:
    GETOFFSET(offset, pptr);
    fprintf(stderr, "META_RECURSE %u %zu", meta_arg, offset);
    break;

    /* Small group numbers do not carry an offset in the vector; it is kept in
    the compile block instead. */

    case META_BACKREF:
    if (meta_arg < 10)
      offset = cb->small_ref_offset[meta_arg];
    else
      GETOFFSET(offset, pptr);
    fprintf(stderr, "META_BACKREF %u %zu", meta_arg, offset);
    break;

    case META_ESCAPE:
    if (meta_arg == ESC_P || meta_arg == ESC_p)
      {
      uint32_t ptype = *pptr >> 16;
      uint32_t pvalue = *pptr++ & 0xffff;
      fprintf(stderr, "META \\%c %u %u", (meta_arg == ESC_P)? 'P':'p',
        ptype, pvalue);
      }
    else
      {
      uint32_t cc;
      /* There's just one escape we might have here that isn't negated in the
      escapes table. */
      if (meta_arg == ESC_g) cc = CHAR_g;
      else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++)
        {
        if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break;
        }
      if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;
      fprintf(stderr, "META \\%c", (int)cc);
      }
    break;

    case META_MINMAX:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}", min, max);
    else
      fprintf(stderr, "META {%d,}", min);
    break;

    case META_MINMAX_QUERY:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}?", min, max);
    else
      fprintf(stderr, "META {%d,}?", min);
    break;

    case META_MINMAX_PLUS:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}+", min, max);
    else
      fprintf(stderr, "META {%d,}+", min);
    break;

    case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break;
    case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break;
    case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break;
    case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break;
    case META_DOT: fprintf(stderr, "META_DOT"); break;
    case META_ASTERISK: fprintf(stderr, "META *"); break;
    case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break;
    case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break;
    case META_PLUS: fprintf(stderr, "META +"); break;
    case META_PLUS_QUERY: fprintf(stderr, "META +?"); break;
    case META_PLUS_PLUS: fprintf(stderr, "META ++"); break;
    case META_QUERY: fprintf(stderr, "META ?"); break;
    case META_QUERY_QUERY: fprintf(stderr, "META ??"); break;
    case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break;

    case META_ATOMIC: fprintf(stderr, "META (?>"); break;
    case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
    case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
    case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
    case META_LOOKAHEAD_NA: fprintf(stderr, "META (*napla:"); break;
    case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;
    case META_KET: fprintf(stderr, "META )"); break;
    case META_ALT: fprintf(stderr, "META | %u", meta_arg); break;

    case META_CLASS: fprintf(stderr, "META ["); break;
    case META_CLASS_NOT: fprintf(stderr, "META [^"); break;
    case META_CLASS_END: fprintf(stderr, "META ]"); break;
    case META_CLASS_EMPTY: fprintf(stderr, "META []"); break;
    case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break;

    case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;
    case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;

    case META_POSIX: fprintf(stderr, "META_POSIX %u", *pptr++); break;
    case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %u", *pptr++); break;

    case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;
    case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;
    case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break;
    case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break;
    case META_SKIP: fprintf(stderr, "META (*SKIP)"); break;
    case META_THEN: fprintf(stderr, "META (*THEN)"); break;

    case META_OPTIONS:
    fprintf(stderr, "META_OPTIONS 0x%08x 0x%08x", pptr[0], pptr[1]);
    pptr += 2;
    break;

    /* The lookbehind items are followed by two data words; only the first is
    shown. */

    case META_LOOKBEHIND:
    fprintf(stderr, "META (?<= %u %u", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_LOOKBEHIND_NA:
    fprintf(stderr, "META (*naplb: %u %u", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_LOOKBEHINDNOT:
    fprintf(stderr, "META (?<! %u %u", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_CALLOUT_NUMBER:
    fprintf(stderr, "META (?C%u) next=%u/%u", pptr[2], pptr[0],
       pptr[1]);
    pptr += 3;
    break;

    case META_CALLOUT_STRING:
      {
      uint32_t patoffset = *pptr++;    /* Offset of next pattern item */
      uint32_t patlength = *pptr++;    /* Length of next pattern item */
      fprintf(stderr, "META (?Cstring) length=%u offset=", *pptr++);
      GETOFFSET(offset, pptr);
      fprintf(stderr, "%zu next=%u/%u", offset, patoffset, patlength);
      }
    break;

    case META_RECURSE_BYNAME:
    fprintf(stderr, "META (?(&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_BACKREF_BYNAME:
    fprintf(stderr, "META_BACKREF_BYNAME length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* Here the offset comes first in the vector and the number follows it,
    so the number is fetched by looking ahead SIZEOFFSET words. */

    case META_COND_NUMBER:
    fprintf(stderr, "META_COND_NUMBER %u offset=", pptr[SIZEOFFSET]);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    pptr++;
    break;

    case META_COND_DEFINE:
    fprintf(stderr, "META (?(DEFINE) offset=");
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_COND_VERSION:
    fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
    fprintf(stderr, "%u.", *pptr++);
    fprintf(stderr, "%u)", *pptr++);
    break;

    case META_COND_NAME:
    fprintf(stderr, "META (?(<name>) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_COND_RNAME:
    fprintf(stderr, "META (?(R&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* This is kept as a name, because it might be. */

    case META_COND_RNUMBER:
    fprintf(stderr, "META (?(Rnumber) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* The verbs that take an argument share the code that prints it: a length
    word followed by that many code points. */

    case META_MARK:
    fprintf(stderr, "META (*MARK:");
    goto SHOWARG;

    case META_COMMIT_ARG:
    fprintf(stderr, "META (*COMMIT:");
    goto SHOWARG;

    case META_PRUNE_ARG:
    fprintf(stderr, "META (*PRUNE:");
    goto SHOWARG;

    case META_SKIP_ARG:
    fprintf(stderr, "META (*SKIP:");
    goto SHOWARG;

    case META_THEN_ARG:
    fprintf(stderr, "META (*THEN:");
    SHOWARG:
    length = *pptr++;
    for (i = 0; i < length; i++)
      {
      uint32_t cc = *pptr++;
      if (cc > 32 && cc < 128) fprintf(stderr, "%c", (int)cc);
        else fprintf(stderr, "\\x{%x}", cc);
      }
    fprintf(stderr, ") length=%u", length);
    break;
    }
  fprintf(stderr, "\n");
  }
}
#endif  /* DEBUG_SHOW_PARSED */
1188
1189
1190
1191 /*************************************************
1192 * Copy compiled code *
1193 *************************************************/
1194
1195 /* Compiled JIT code cannot be copied, so the new compiled block has no
1196 associated JIT data. */
1197
1198 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy(const pcre2_code * code)1199 pcre2_code_copy(const pcre2_code *code)
1200 {
1201 PCRE2_SIZE* ref_count;
1202 pcre2_code *newcode;
1203
1204 if (code == NULL) return NULL;
1205 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1206 if (newcode == NULL) return NULL;
1207 memcpy(newcode, code, code->blocksize);
1208 newcode->executable_jit = NULL;
1209
1210 /* If the code is one that has been deserialized, increment the reference count
1211 in the decoded tables. */
1212
1213 if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1214 {
1215 ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1216 (*ref_count)++;
1217 }
1218
1219 return newcode;
1220 }
1221
1222
1223
1224 /*************************************************
1225 * Copy compiled code and character tables *
1226 *************************************************/
1227
1228 /* Compiled JIT code cannot be copied, so the new compiled block has no
1229 associated JIT data. This version of code_copy also makes a separate copy of
1230 the character tables. */
1231
1232 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy_with_tables(const pcre2_code * code)1233 pcre2_code_copy_with_tables(const pcre2_code *code)
1234 {
1235 PCRE2_SIZE* ref_count;
1236 pcre2_code *newcode;
1237 uint8_t *newtables;
1238
1239 if (code == NULL) return NULL;
1240 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1241 if (newcode == NULL) return NULL;
1242 memcpy(newcode, code, code->blocksize);
1243 newcode->executable_jit = NULL;
1244
1245 newtables = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE),
1246 code->memctl.memory_data);
1247 if (newtables == NULL)
1248 {
1249 code->memctl.free((void *)newcode, code->memctl.memory_data);
1250 return NULL;
1251 }
1252 memcpy(newtables, code->tables, TABLES_LENGTH);
1253 ref_count = (PCRE2_SIZE *)(newtables + TABLES_LENGTH);
1254 *ref_count = 1;
1255
1256 newcode->tables = newtables;
1257 newcode->flags |= PCRE2_DEREF_TABLES;
1258 return newcode;
1259 }
1260
1261
1262
1263 /*************************************************
1264 * Free compiled code *
1265 *************************************************/
1266
1267 PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_code_free(pcre2_code * code)1268 pcre2_code_free(pcre2_code *code)
1269 {
1270 PCRE2_SIZE* ref_count;
1271
1272 if (code != NULL)
1273 {
1274 #ifdef SUPPORT_JIT
1275 if (code->executable_jit != NULL)
1276 PRIV(jit_free)(code->executable_jit, &code->memctl);
1277 #endif
1278
1279 if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1280 {
1281 /* Decoded tables belong to the codes after deserialization, and they must
1282 be freed when there are no more references to them. The *ref_count should
1283 always be > 0. */
1284
1285 ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1286 if (*ref_count > 0)
1287 {
1288 (*ref_count)--;
1289 if (*ref_count == 0)
1290 code->memctl.free((void *)code->tables, code->memctl.memory_data);
1291 }
1292 }
1293
1294 code->memctl.free(code, code->memctl.memory_data);
1295 }
1296 }
1297
1298
1299
1300 /*************************************************
1301 * Read a number, possibly signed *
1302 *************************************************/
1303
1304 /* This function is used to read numbers in the pattern. The initial pointer
1305 must be at the sign or first digit of the number. When relative values
1306 (introduced by + or -) are allowed, they are relative group numbers, and the
1307 result must be greater than zero.
1308
1309 Arguments:
1310 ptrptr points to the character pointer variable
1311 ptrend points to the end of the input string
1312 allow_sign if < 0, sign not allowed; if >= 0, sign is relative to this
1313 max_value the largest number allowed
1314 max_error the error to give for an over-large number
1315 intptr where to put the result
1316 errcodeptr where to put an error code
1317
1318 Returns: TRUE - a number was read
1319 FALSE - errorcode == 0 => no number was found
1320 errorcode != 0 => an error occurred
1321 */
1322
1323 static BOOL
read_number(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,int32_t allow_sign,uint32_t max_value,uint32_t max_error,int * intptr,int * errorcodeptr)1324 read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,
1325 uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr)
1326 {
1327 int sign = 0;
1328 uint32_t n = 0;
1329 PCRE2_SPTR ptr = *ptrptr;
1330 BOOL yield = FALSE;
1331
1332 *errorcodeptr = 0;
1333
1334 if (allow_sign >= 0 && ptr < ptrend)
1335 {
1336 if (*ptr == CHAR_PLUS)
1337 {
1338 sign = +1;
1339 max_value -= allow_sign;
1340 ptr++;
1341 }
1342 else if (*ptr == CHAR_MINUS)
1343 {
1344 sign = -1;
1345 ptr++;
1346 }
1347 }
1348
1349 if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE;
1350 while (ptr < ptrend && IS_DIGIT(*ptr))
1351 {
1352 n = n * 10 + *ptr++ - CHAR_0;
1353 if (n > max_value)
1354 {
1355 *errorcodeptr = max_error;
1356 goto EXIT;
1357 }
1358 }
1359
1360 if (allow_sign >= 0 && sign != 0)
1361 {
1362 if (n == 0)
1363 {
1364 *errorcodeptr = ERR26; /* +0 and -0 are not allowed */
1365 goto EXIT;
1366 }
1367
1368 if (sign > 0) n += allow_sign;
1369 else if ((int)n > allow_sign)
1370 {
1371 *errorcodeptr = ERR15; /* Non-existent subpattern */
1372 goto EXIT;
1373 }
1374 else n = allow_sign + 1 - n;
1375 }
1376
1377 yield = TRUE;
1378
1379 EXIT:
1380 *intptr = n;
1381 *ptrptr = ptr;
1382 return yield;
1383 }
1384
1385
1386
1387 /*************************************************
1388 * Read repeat counts *
1389 *************************************************/
1390
1391 /* Read an item of the form {n,m} and return the values when non-NULL pointers
1392 are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a
1393 larger value is used for "unlimited". We have to use signed arguments for
1394 read_number() because it is capable of returning a signed value. As of Perl
1395 5.34.0 either n or m may be absent, but not both. Perl also allows spaces and
1396 tabs after { and before } and between the numbers and the comma, so we do too.
1397
1398 Arguments:
1399 ptrptr points to pointer to character after '{'
1400 ptrend pointer to end of input
1401 minp if not NULL, pointer to int for min
1402 maxp if not NULL, pointer to int for max
1403 errorcodeptr points to error code variable
1404
1405 Returns: FALSE if not a repeat quantifier, errorcode set zero
1406 FALSE on error, with errorcode set non-zero
1407 TRUE on success, with pointer updated to point after '}'
1408 */
1409
1410 static BOOL
read_repeat_counts(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,uint32_t * minp,uint32_t * maxp,int * errorcodeptr)1411 read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp,
1412 uint32_t *maxp, int *errorcodeptr)
1413 {
1414 PCRE2_SPTR p = *ptrptr;
1415 PCRE2_SPTR pp;
1416 BOOL yield = FALSE;
1417 BOOL had_minimum = FALSE;
1418 int32_t min = 0;
1419 int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */
1420
1421 *errorcodeptr = 0;
1422 while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1423
1424 /* Check the syntax before interpreting. Otherwise, a non-quantifier sequence
1425 such as "X{123456ABC" would incorrectly give a "number too big in quantifier"
1426 error. */
1427
1428 pp = p;
1429 if (pp < ptrend && IS_DIGIT(*pp))
1430 {
1431 had_minimum = TRUE;
1432 while (++pp < ptrend && IS_DIGIT(*pp)) {}
1433 }
1434
1435 while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1436 if (pp >= ptrend) return FALSE;
1437
1438 if (*pp == CHAR_RIGHT_CURLY_BRACKET)
1439 {
1440 if (!had_minimum) return FALSE;
1441 }
1442 else
1443 {
1444 if (*pp++ != CHAR_COMMA) return FALSE;
1445 while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1446 if (pp >= ptrend) return FALSE;
1447 if (IS_DIGIT(*pp))
1448 {
1449 while (++pp < ptrend && IS_DIGIT(*pp)) {}
1450 }
1451 else if (!had_minimum) return FALSE;
1452 while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1453 if (pp >= ptrend || *pp != CHAR_RIGHT_CURLY_BRACKET) return FALSE;
1454 }
1455
1456 /* Now process the quantifier for real. We know it must be {n} or (n,} or {,m}
1457 or {n,m}. The only error that read_number() can return is for a number that is
1458 too big. If *errorcodeptr is returned as zero it means no number was found. */
1459
1460 /* Deal with {,m} or n too big. If we successfully read m there is no need to
1461 check m >= n because n defaults to zero. */
1462
1463 if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))
1464 {
1465 if (*errorcodeptr != 0) goto EXIT; /* n too big */
1466 p++; /* Skip comma and subsequent spaces */
1467 while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1468 if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1469 {
1470 if (*errorcodeptr != 0) goto EXIT; /* m too big */
1471 }
1472 }
1473
1474 /* Have read one number. Deal with {n} or {n,} or {n,m} */
1475
1476 else
1477 {
1478 while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1479 if (*p == CHAR_RIGHT_CURLY_BRACKET)
1480 {
1481 max = min;
1482 }
1483 else /* Handle {n,} or {n,m} */
1484 {
1485 p++; /* Skip comma and subsequent spaces */
1486 while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1487 if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1488 {
1489 if (*errorcodeptr != 0) goto EXIT; /* m too big */
1490 }
1491
1492 if (max < min)
1493 {
1494 *errorcodeptr = ERR4;
1495 goto EXIT;
1496 }
1497 }
1498 }
1499
1500 /* Valid quantifier exists */
1501
1502 while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1503 p++;
1504 yield = TRUE;
1505 if (minp != NULL) *minp = (uint32_t)min;
1506 if (maxp != NULL) *maxp = (uint32_t)max;
1507
1508 /* Update the pattern pointer */
1509
1510 EXIT:
1511 *ptrptr = p;
1512 return yield;
1513 }
1514
1515
1516
1517 /*************************************************
1518 * Handle escapes *
1519 *************************************************/
1520
1521 /* This function is called when a \ has been encountered. It either returns a
1522 positive value for a simple escape such as \d, or 0 for a data character, which
1523 is placed in chptr. A backreference to group n is returned as negative n. On
1524 entry, ptr is pointing at the character after \. On exit, it points after the
1525 final code unit of the escape sequence.
1526
1527 This function is also called from pcre2_substitute() to handle escape sequences
1528 in replacement strings. In this case, the cb argument is NULL, and in the case
1529 of escapes that have further processing, only sequences that define a data
1530 character are recognised. The isclass argument is not relevant; the options
1531 argument is the final value of the compiled pattern's options.
1532
1533 Arguments:
1534 ptrptr points to the input position pointer
1535 ptrend points to the end of the input
1536 chptr points to a returned data character
1537 errorcodeptr points to the errorcode variable (containing zero)
1538 options the current options bits
1539 xoptions the current extra options bits
1540 isclass TRUE if inside a character class
1541 cb compile data block or NULL when called from pcre2_substitute()
1542
1543 Returns: zero => a data character
1544 positive => a special escape sequence
1545 negative => a numerical back reference
1546 on error, errorcodeptr is set non-zero
1547 */
1548
1549 int
PRIV(check_escape)1550 PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
1551 int *errorcodeptr, uint32_t options, uint32_t xoptions, BOOL isclass,
1552 compile_block *cb)
1553 {
1554 BOOL utf = (options & PCRE2_UTF) != 0;
1555 BOOL alt_bsux =
1556 ((options & PCRE2_ALT_BSUX) | (xoptions & PCRE2_EXTRA_ALT_BSUX)) != 0;
1557 PCRE2_SPTR ptr = *ptrptr;
1558 uint32_t c, cc;
1559 int escape = 0;
1560 int i;
1561
1562 /* If backslash is at the end of the string, it's an error. */
1563
1564 if (ptr >= ptrend)
1565 {
1566 *errorcodeptr = ERR1;
1567 return 0;
1568 }
1569
1570 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
1571 *errorcodeptr = 0; /* Be optimistic */
1572
1573 /* Non-alphanumerics are literals, so we just leave the value in c. An initial
1574 value test saves a memory lookup for code points outside the alphanumeric
1575 range. */
1576
1577 if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {} /* Definitely literal */
1578
1579 /* Otherwise, do a table lookup. Non-zero values need little processing here. A
1580 positive value is a literal value for something like \n. A negative value is
1581 the negation of one of the ESC_ macros that is passed back for handling by the
1582 calling function. Some extra checking is needed for \N because only \N{U+dddd}
1583 is supported. If the value is zero, further processing is handled below. */
1584
1585 else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
1586 {
1587 if (i > 0)
1588 {
1589 c = (uint32_t)i;
1590 if (c == CHAR_CR && (xoptions & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
1591 c = CHAR_LF;
1592 }
1593 else /* Negative table entry */
1594 {
1595 escape = -i; /* Else return a special escape */
1596 if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))
1597 cb->external_flags |= PCRE2_HASBKPORX; /* Note \P, \p, or \X */
1598
1599 /* Perl supports \N{name} for character names and \N{U+dddd} for numerical
1600 Unicode code points, as well as plain \N for "not newline". PCRE does not
1601 support \N{name}. However, it does support quantification such as \N{2,3},
1602 so if \N{ is not followed by U+dddd we check for a quantifier. */
1603
1604 if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1605 {
1606 PCRE2_SPTR p = ptr + 1;
1607
1608 /* Perl ignores spaces and tabs after { */
1609
1610 while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1611
1612 /* \N{U+ can be handled by the \x{ code. However, this construction is
1613 not valid in EBCDIC environments because it specifies a Unicode
1614 character, not a codepoint in the local code. For example \N{U+0041}
1615 must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode
1616 casing semantics for the entire pattern, so allow it only in UTF (i.e.
1617 Unicode) mode. */
1618
1619 if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)
1620 {
1621 #ifdef EBCDIC
1622 *errorcodeptr = ERR93;
1623 #else
1624 if (utf)
1625 {
1626 ptr = p + 2;
1627 escape = 0; /* Not a fancy escape after all */
1628 goto COME_FROM_NU;
1629 }
1630 else *errorcodeptr = ERR93;
1631 #endif
1632 }
1633
1634 /* Give an error if what follows is not a quantifier, but don't override
1635 an error set by the quantifier reader (e.g. number overflow). */
1636
1637 else
1638 {
1639 if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&
1640 *errorcodeptr == 0)
1641 *errorcodeptr = ERR37;
1642 }
1643 }
1644 }
1645 }
1646
1647 /* Escapes that need further processing, including those that are unknown, have
1648 a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
1649 \o, and \x are recognized (\u and \U can never appear as they are used for case
1650 forcing). */
1651
1652 else
1653 {
1654 int s;
1655 PCRE2_SPTR oldptr;
1656 BOOL overflow;
1657
1658 /* Filter calls from pcre2_substitute(). */
1659
1660 if (cb == NULL)
1661 {
1662 if (c != CHAR_c && c != CHAR_o && c != CHAR_x)
1663 {
1664 *errorcodeptr = ERR3;
1665 return 0;
1666 }
1667 alt_bsux = FALSE; /* Do not modify \x handling */
1668 }
1669
1670 switch (c)
1671 {
1672 /* A number of Perl escapes are not handled by PCRE. We give an explicit
1673 error. */
1674
1675 case CHAR_F:
1676 case CHAR_l:
1677 case CHAR_L:
1678 *errorcodeptr = ERR37;
1679 break;
1680
1681 /* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX
1682 is set. Otherwise, \u must be followed by exactly four hex digits or, if
1683 PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.
1684 Otherwise it is a lowercase u letter. This gives some compatibility with
1685 ECMAScript (aka JavaScript). Unlike other braced items, white space is NOT
1686 allowed. When \u{ is not followed by hex digits, a special return is given
1687 because otherwise \u{ 12} (for example) would be treated as u{12}. */
1688
1689 case CHAR_u:
1690 if (!alt_bsux) *errorcodeptr = ERR37; else
1691 {
1692 uint32_t xc;
1693
1694 if (ptr >= ptrend) break;
1695 if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
1696 (xoptions & PCRE2_EXTRA_ALT_BSUX) != 0)
1697 {
1698 PCRE2_SPTR hptr = ptr + 1;
1699
1700 cc = 0;
1701 while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
1702 {
1703 if ((cc & 0xf0000000) != 0) /* Test for 32-bit overflow */
1704 {
1705 *errorcodeptr = ERR77;
1706 ptr = hptr; /* Show where */
1707 break; /* *hptr != } will cause another break below */
1708 }
1709 cc = (cc << 4) | xc;
1710 hptr++;
1711 }
1712
1713 if (hptr == ptr + 1 || /* No hex digits */
1714 hptr >= ptrend || /* Hit end of input */
1715 *hptr != CHAR_RIGHT_CURLY_BRACKET) /* No } terminator */
1716 {
1717 escape = ESC_ub; /* Special return */
1718 ptr++; /* Skip { */
1719 break; /* Hex escape not recognized */
1720 }
1721
1722 c = cc; /* Accept the code point */
1723 ptr = hptr + 1;
1724 }
1725
1726 else /* Must be exactly 4 hex digits */
1727 {
1728 if (ptrend - ptr < 4) break; /* Less than 4 chars */
1729 if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */
1730 if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
1731 cc = (cc << 4) | xc;
1732 if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */
1733 cc = (cc << 4) | xc;
1734 if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */
1735 c = (cc << 4) | xc;
1736 ptr += 4;
1737 }
1738
1739 if (utf)
1740 {
1741 if (c > 0x10ffffU) *errorcodeptr = ERR77;
1742 else
1743 if (c >= 0xd800 && c <= 0xdfff &&
1744 (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1745 *errorcodeptr = ERR73;
1746 }
1747 else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
1748 }
1749 break;
1750
1751 /* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set,
1752 in which case it is an upper case letter. */
1753
1754 case CHAR_U:
1755 if (!alt_bsux) *errorcodeptr = ERR37;
1756 break;
1757
1758 /* In a character class, \g is just a literal "g". Outside a character
1759 class, \g must be followed by one of a number of specific things:
1760
1761 (1) A number, either plain or braced. If positive, it is an absolute
1762 backreference. If negative, it is a relative backreference. This is a Perl
1763 5.10 feature.
1764
1765 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1766 is part of Perl's movement towards a unified syntax for back references. As
1767 this is synonymous with \k{name}, we fudge it up by pretending it really
1768 was \k{name}.
1769
1770 (3) For Oniguruma compatibility we also support \g followed by a name or a
1771 number either in angle brackets or in single quotes. However, these are
1772 (possibly recursive) subroutine calls, _not_ backreferences. We return
1773 the ESC_g code.
1774
1775 Summary: Return a negative number for a numerical back reference, ESC_k for
1776 a named back reference, and ESC_g for a named or numbered subroutine call.
1777 */
1778
1779 case CHAR_g:
1780 if (isclass) break;
1781
1782 if (ptr >= ptrend)
1783 {
1784 *errorcodeptr = ERR57;
1785 break;
1786 }
1787
1788 if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE)
1789 {
1790 escape = ESC_g;
1791 break;
1792 }
1793
1794 /* If there is a brace delimiter, try to read a numerical reference. If
1795 there isn't one, assume we have a name and treat it as \k. */
1796
1797 if (*ptr == CHAR_LEFT_CURLY_BRACKET)
1798 {
1799 PCRE2_SPTR p = ptr + 1;
1800
1801 while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1802 if (!read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1803 errorcodeptr))
1804 {
1805 if (*errorcodeptr == 0) escape = ESC_k; /* No number found */
1806 break;
1807 }
1808 while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1809
1810 if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1811 {
1812 *errorcodeptr = ERR57;
1813 break;
1814 }
1815 ptr = p + 1;
1816 }
1817
1818 /* Read an undelimited number */
1819
1820 else
1821 {
1822 if (!read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1823 errorcodeptr))
1824 {
1825 if (*errorcodeptr == 0) *errorcodeptr = ERR57; /* No number found */
1826 break;
1827 }
1828 }
1829
1830 if (s <= 0)
1831 {
1832 *errorcodeptr = ERR15;
1833 break;
1834 }
1835
1836 escape = -s;
1837 break;
1838
1839 /* The handling of escape sequences consisting of a string of digits
1840 starting with one that is not zero is not straightforward. Perl has changed
1841 over the years. Nowadays \g{} for backreferences and \o{} for octal are
1842 recommended to avoid the ambiguities in the old syntax.
1843
1844 Outside a character class, the digits are read as a decimal number. If the
1845 number is less than 10, or if there are that many previous extracting left
1846 brackets, it is a back reference. Otherwise, up to three octal digits are
1847 read to form an escaped character code. Thus \123 is likely to be octal 123
1848 (cf \0123, which is octal 012 followed by the literal 3).
1849
1850 Inside a character class, \ followed by a digit is always either a literal
1851 8 or 9 or an octal number. */
1852
1853 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1854 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1855
1856 if (!isclass)
1857 {
1858 oldptr = ptr;
1859 ptr--; /* Back to the digit */
1860
1861 /* As we know we are at a digit, the only possible error from
1862 read_number() is a number that is too large to be a group number. In this
1863 case we fall through handle this as not a group reference. If we have
1864 read a small enough number, check for a back reference.
1865
1866 \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
1867 are octal escapes if there are not that many previous captures. */
1868
1869 if (read_number(&ptr, ptrend, -1, INT_MAX/10 - 1, 0, &s, errorcodeptr) &&
1870 (s < 10 || oldptr[-1] >= CHAR_8 || s <= (int)cb->bracount))
1871 {
1872 if (s > (int)MAX_GROUP_NUMBER) *errorcodeptr = ERR61;
1873 else escape = -s; /* Indicates a back reference */
1874 break;
1875 }
1876
1877 ptr = oldptr; /* Put the pointer back and fall through */
1878 }
1879
1880 /* Handle a digit following \ when the number is not a back reference, or
1881 we are within a character class. If the first digit is 8 or 9, Perl used to
1882 generate a binary zero and then treat the digit as a following literal. At
1883 least by Perl 5.18 this changed so as not to insert the binary zero. */
1884
1885 if (c >= CHAR_8) break;
1886
1887 /* Fall through */
1888
1889 /* \0 always starts an octal number, but we may drop through to here with a
1890 larger first octal digit. The original code used just to take the least
1891 significant 8 bits of octal numbers (I think this is what early Perls used
1892 to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1893 but no more than 3 octal digits. */
1894
1895 case CHAR_0:
1896 c -= CHAR_0;
1897 while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1898 c = c * 8 + *ptr++ - CHAR_0;
1899 #if PCRE2_CODE_UNIT_WIDTH == 8
1900 if (!utf && c > 0xff) *errorcodeptr = ERR51;
1901 #endif
1902 break;
1903
1904 /* \o is a relatively new Perl feature, supporting a more general way of
1905 specifying character codes in octal. The only supported form is \o{ddd},
1906 with optional spaces or tabs after { and before }. */
1907
1908 case CHAR_o:
1909 if (ptr >= ptrend || *ptr++ != CHAR_LEFT_CURLY_BRACKET)
1910 {
1911 ptr--;
1912 *errorcodeptr = ERR55;
1913 break;
1914 }
1915
1916 while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1917 if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1918 {
1919 *errorcodeptr = ERR78;
1920 break;
1921 }
1922
1923 c = 0;
1924 overflow = FALSE;
1925 while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1926 {
1927 cc = *ptr++;
1928 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1929 #if PCRE2_CODE_UNIT_WIDTH == 32
1930 if (c >= 0x20000000l) { overflow = TRUE; break; }
1931 #endif
1932 c = (c << 3) + (cc - CHAR_0);
1933 #if PCRE2_CODE_UNIT_WIDTH == 8
1934 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1935 #elif PCRE2_CODE_UNIT_WIDTH == 16
1936 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1937 #elif PCRE2_CODE_UNIT_WIDTH == 32
1938 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1939 #endif
1940 }
1941
1942 while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1943
1944 if (overflow)
1945 {
1946 while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1947 *errorcodeptr = ERR34;
1948 }
1949 else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1950 {
1951 if (utf && c >= 0xd800 && c <= 0xdfff &&
1952 (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1953 {
1954 ptr--;
1955 *errorcodeptr = ERR73;
1956 }
1957 }
1958 else
1959 {
1960 ptr--;
1961 *errorcodeptr = ERR64;
1962 }
1963 break;
1964
1965 /* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed
1966 by two hexadecimal digits. Otherwise it is a lowercase x letter. */
1967
1968 case CHAR_x:
1969 if (alt_bsux)
1970 {
1971 uint32_t xc;
1972 if (ptrend - ptr < 2) break; /* Less than 2 characters */
1973 if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */
1974 if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
1975 c = (cc << 4) | xc;
1976 ptr += 2;
1977 }
1978
1979 /* Handle \x in Perl's style. \x{ddd} is a character code which can be
1980 greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
1981 digits. If not, { used to be treated as a data character. However, Perl
1982 seems to read hex digits up to the first non-such, and ignore the rest, so
1983 that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1984 now gives an error. */
1985
1986 else
1987 {
1988 if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1989 {
1990 ptr++;
1991 while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1992
1993 #ifndef EBCDIC
1994 COME_FROM_NU:
1995 #endif
1996 if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1997 {
1998 *errorcodeptr = ERR78;
1999 break;
2000 }
2001 c = 0;
2002 overflow = FALSE;
2003
2004 while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff)
2005 {
2006 ptr++;
2007 if (c == 0 && cc == 0) continue; /* Leading zeroes */
2008 #if PCRE2_CODE_UNIT_WIDTH == 32
2009 if (c >= 0x10000000l) { overflow = TRUE; break; }
2010 #endif
2011 c = (c << 4) | cc;
2012 if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
2013 {
2014 overflow = TRUE;
2015 break;
2016 }
2017 }
2018
2019 /* Perl ignores spaces and tabs before } */
2020
2021 while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2022
2023 /* On overflow, skip remaining hex digits */
2024
2025 if (overflow)
2026 {
2027 while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
2028 *errorcodeptr = ERR34;
2029 }
2030 else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
2031 {
2032 if (utf && c >= 0xd800 && c <= 0xdfff &&
2033 (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
2034 {
2035 ptr--;
2036 *errorcodeptr = ERR73;
2037 }
2038 }
2039
2040 /* If the sequence of hex digits (followed by optional space) does not
2041 end with '}', give an error. We used just to recognize this construct
2042 and fall through to the normal \x handling, but nowadays Perl gives an
2043 error, which seems much more sensible, so we do too. */
2044
2045 else
2046 {
2047 ptr--;
2048 *errorcodeptr = ERR67;
2049 }
2050 } /* End of \x{} processing */
2051
2052 /* Read a up to two hex digits after \x */
2053
2054 else
2055 {
2056 c = 0;
2057 if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */
2058 ptr++;
2059 c = cc;
2060 if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */
2061 ptr++;
2062 c = (c << 4) | cc;
2063 } /* End of \xdd handling */
2064 } /* End of Perl-style \x handling */
2065 break;
2066
2067 /* The handling of \c is different in ASCII and EBCDIC environments. In an
2068 ASCII (or Unicode) environment, an error is given if the character
2069 following \c is not a printable ASCII character. Otherwise, the following
2070 character is upper-cased if it is a letter, and after that the 0x40 bit is
2071 flipped. The result is the value of the escape.
2072
2073 In an EBCDIC environment the handling of \c is compatible with the
2074 specification in the perlebcdic document. The following character must be
2075 a letter or one of small number of special characters. These provide a
2076 means of defining the character values 0-31.
2077
2078 For testing the EBCDIC handling of \c in an ASCII environment, recognize
2079 the EBCDIC value of 'c' explicitly. */
2080
2081 #if defined EBCDIC && 'a' != 0x81
2082 case 0x83:
2083 #else
2084 case CHAR_c:
2085 #endif
2086 if (ptr >= ptrend)
2087 {
2088 *errorcodeptr = ERR2;
2089 break;
2090 }
2091 c = *ptr;
2092 if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
2093
2094 /* Handle \c in an ASCII/Unicode environment. */
2095
2096 #ifndef EBCDIC /* ASCII/UTF-8 coding */
2097 if (c < 32 || c > 126) /* Excludes all non-printable ASCII */
2098 {
2099 *errorcodeptr = ERR68;
2100 break;
2101 }
2102 c ^= 0x40;
2103
2104 /* Handle \c in an EBCDIC environment. The special case \c? is converted to
2105 255 (0xff) or 95 (0x5f) if other characters suggest we are using the
2106 POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.)
2107 The other valid sequences correspond to a list of specific characters. */
2108
2109 #else
2110 if (c == CHAR_QUESTION_MARK)
2111 c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
2112 else
2113 {
2114 for (i = 0; i < 32; i++)
2115 {
2116 if (c == ebcdic_escape_c[i]) break;
2117 }
2118 if (i < 32) c = i; else *errorcodeptr = ERR68;
2119 }
2120 #endif /* EBCDIC */
2121
2122 ptr++;
2123 break;
2124
2125 /* Any other alphanumeric following \ is an error. Perl gives an error only
2126 if in warning mode, but PCRE doesn't have a warning mode. */
2127
2128 default:
2129 *errorcodeptr = ERR3;
2130 *ptrptr = ptr - 1; /* Point to the character at fault */
2131 return 0;
2132 }
2133 }
2134
2135 /* Set the pointer to the next character before returning. */
2136
2137 *ptrptr = ptr;
2138 *chptr = c;
2139 return escape;
2140 }
2141
2142
2143
2144 #ifdef SUPPORT_UNICODE
2145 /*************************************************
2146 * Handle \P and \p *
2147 *************************************************/
2148
2149 /* This function is called after \P or \p has been encountered, provided that
2150 PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
2151 contents of ptrptr are pointing after the P or p. On exit, it is left pointing
2152 after the final code unit of the escape sequence.
2153
2154 Arguments:
2155 ptrptr the pattern position pointer
2156 negptr a boolean that is set TRUE for negation else FALSE
2157 ptypeptr an unsigned int that is set to the type value
2158 pdataptr an unsigned int that is set to the detailed property value
2159 errorcodeptr the error code variable
2160 cb the compile data
2161
2162 Returns: TRUE if the type value was found, or FALSE for an invalid type
2163 */
2164
2165 static BOOL
get_ucp(PCRE2_SPTR * ptrptr,BOOL * negptr,uint16_t * ptypeptr,uint16_t * pdataptr,int * errorcodeptr,compile_block * cb)2166 get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr,
2167 uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)
2168 {
2169 PCRE2_UCHAR c;
2170 PCRE2_SIZE i, bot, top;
2171 PCRE2_SPTR ptr = *ptrptr;
2172 PCRE2_UCHAR name[50];
2173 PCRE2_UCHAR *vptr = NULL;
2174 uint16_t ptscript = PT_NOTSCRIPT;
2175
2176 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2177 c = *ptr++;
2178 *negptr = FALSE;
2179
2180 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
2181 negation. */
2182
2183 if (c == CHAR_LEFT_CURLY_BRACKET)
2184 {
2185 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2186
2187 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
2188 {
2189 *negptr = TRUE;
2190 ptr++;
2191 }
2192
2193 for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
2194 {
2195 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2196 c = *ptr++;
2197 #if PCRE2_CODE_UNIT_WIDTH != 8
2198 while (c == '_' || c == '-' || (c <= 0xff && isspace(c)))
2199 #else
2200 while (c == '_' || c == '-' || isspace(c))
2201 #endif
2202 {
2203 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2204 c = *ptr++;
2205 }
2206 if (c == CHAR_NUL) goto ERROR_RETURN;
2207 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
2208 name[i] = tolower(c);
2209 if ((c == ':' || c == '=') && vptr == NULL) vptr = name + i;
2210 }
2211
2212 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
2213 name[i] = 0;
2214 }
2215
2216 /* If { doesn't follow \p or \P there is just one following character, which
2217 must be an ASCII letter. */
2218
2219 else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0)
2220 {
2221 name[0] = tolower(c);
2222 name[1] = 0;
2223 }
2224 else goto ERROR_RETURN;
2225
2226 *ptrptr = ptr;
2227
2228 /* If the property contains ':' or '=' we have class name and value separately
2229 specified. The following are supported:
2230
2231 . Bidi_Class (synonym bc), for which the property names are "bidi<name>".
2232 . Script (synonym sc) for which the property name is the script name
2233 . Script_Extensions (synonym scx), ditto
2234
2235 As this is a small number, we currently just check the names directly. If this
2236 grows, a sorted table and a switch will be neater.
2237
2238 For both the script properties, set a PT_xxx value so that (1) they can be
2239 distinguished and (2) invalid script names that happen to be the name of
2240 another property can be diagnosed. */
2241
2242 if (vptr != NULL)
2243 {
2244 int offset = 0;
2245 PCRE2_UCHAR sname[8];
2246
2247 *vptr = 0; /* Terminate property name */
2248 if (PRIV(strcmp_c8)(name, STRING_bidiclass) == 0 ||
2249 PRIV(strcmp_c8)(name, STRING_bc) == 0)
2250 {
2251 offset = 4;
2252 sname[0] = CHAR_b;
2253 sname[1] = CHAR_i; /* There is no strcpy_c8 function */
2254 sname[2] = CHAR_d;
2255 sname[3] = CHAR_i;
2256 }
2257
2258 else if (PRIV(strcmp_c8)(name, STRING_script) == 0 ||
2259 PRIV(strcmp_c8)(name, STRING_sc) == 0)
2260 ptscript = PT_SC;
2261
2262 else if (PRIV(strcmp_c8)(name, STRING_scriptextensions) == 0 ||
2263 PRIV(strcmp_c8)(name, STRING_scx) == 0)
2264 ptscript = PT_SCX;
2265
2266 else
2267 {
2268 *errorcodeptr = ERR47;
2269 return FALSE;
2270 }
2271
2272 /* Adjust the string in name[] as needed */
2273
2274 memmove(name + offset, vptr + 1, (name + i - vptr)*sizeof(PCRE2_UCHAR));
2275 if (offset != 0) memmove(name, sname, offset*sizeof(PCRE2_UCHAR));
2276 }
2277
2278 /* Search for a recognized property using binary chop. */
2279
2280 bot = 0;
2281 top = PRIV(utt_size);
2282
2283 while (bot < top)
2284 {
2285 int r;
2286 i = (bot + top) >> 1;
2287 r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
2288
2289 /* When a matching property is found, some extra checking is needed when the
2290 \p{xx:yy} syntax is used and xx is either sc or scx. */
2291
2292 if (r == 0)
2293 {
2294 *pdataptr = PRIV(utt)[i].value;
2295 if (vptr == NULL || ptscript == PT_NOTSCRIPT)
2296 {
2297 *ptypeptr = PRIV(utt)[i].type;
2298 return TRUE;
2299 }
2300
2301 switch (PRIV(utt)[i].type)
2302 {
2303 case PT_SC:
2304 *ptypeptr = PT_SC;
2305 return TRUE;
2306
2307 case PT_SCX:
2308 *ptypeptr = ptscript;
2309 return TRUE;
2310 }
2311
2312 break; /* Non-script found */
2313 }
2314
2315 if (r > 0) bot = i + 1; else top = i;
2316 }
2317
2318 *errorcodeptr = ERR47; /* Unrecognized property */
2319 return FALSE;
2320
2321 ERROR_RETURN: /* Malformed \P or \p */
2322 *errorcodeptr = ERR46;
2323 *ptrptr = ptr;
2324 return FALSE;
2325 }
2326 #endif
2327
2328
2329
2330 /*************************************************
2331 * Check for POSIX class syntax *
2332 *************************************************/
2333
2334 /* This function is called when the sequence "[:" or "[." or "[=" is
2335 encountered in a character class. It checks whether this is followed by a
2336 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2337 reach an unescaped ']' without the special preceding character, return FALSE.
2338
2339 Originally, this function only recognized a sequence of letters between the
2340 terminators, but it seems that Perl recognizes any sequence of characters,
2341 though of course unknown POSIX names are subsequently rejected. Perl gives an
2342 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2343 didn't consider this to be a POSIX class. Likewise for [:1234:].
2344
2345 The problem in trying to be exactly like Perl is in the handling of escapes. We
2346 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2347 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2348 below handles the special cases \\ and \], but does not try to do any other
2349 escape processing. This makes it different from Perl for cases such as
2350 [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
2351 not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
2352 when Perl does, I think.
2353
2354 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2355 It seems that the appearance of a nested POSIX class supersedes an apparent
2356 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2357 a digit. This is handled by returning FALSE if the start of a new group with
2358 the same terminator is encountered, since the next closing sequence must close
2359 the nested group, not the outer one.
2360
2361 In Perl, unescaped square brackets may also appear as part of class names. For
2362 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2363 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2364 seem right at all. PCRE does not allow closing square brackets in POSIX class
2365 names.
2366
2367 Arguments:
2368 ptr pointer to the character after the initial [ (colon, dot, equals)
2369 ptrend pointer to the end of the pattern
2370 endptr where to return a pointer to the terminating ':', '.', or '='
2371
2372 Returns: TRUE or FALSE
2373 */
2374
2375 static BOOL
check_posix_syntax(PCRE2_SPTR ptr,PCRE2_SPTR ptrend,PCRE2_SPTR * endptr)2376 check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)
2377 {
2378 PCRE2_UCHAR terminator; /* Don't combine these lines; the Solaris cc */
2379 terminator = *ptr++; /* compiler warns about "non-constant" initializer. */
2380
2381 for (; ptrend - ptr >= 2; ptr++)
2382 {
2383 if (*ptr == CHAR_BACKSLASH &&
2384 (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
2385 ptr++;
2386
2387 else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
2388 *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2389
2390 else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2391 {
2392 *endptr = ptr;
2393 return TRUE;
2394 }
2395 }
2396
2397 return FALSE;
2398 }
2399
2400
2401
2402 /*************************************************
2403 * Check POSIX class name *
2404 *************************************************/
2405
2406 /* This function is called to check the name given in a POSIX-style class entry
2407 such as [:alnum:].
2408
2409 Arguments:
2410 ptr points to the first letter
2411 len the length of the name
2412
2413 Returns: a value representing the name, or -1 if unknown
2414 */
2415
2416 static int
check_posix_name(PCRE2_SPTR ptr,int len)2417 check_posix_name(PCRE2_SPTR ptr, int len)
2418 {
2419 const char *pn = posix_names;
2420 int yield = 0;
2421 while (posix_name_lengths[yield] != 0)
2422 {
2423 if (len == posix_name_lengths[yield] &&
2424 PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
2425 pn += posix_name_lengths[yield] + 1;
2426 yield++;
2427 }
2428 return -1;
2429 }
2430
2431
2432
2433 /*************************************************
2434 * Read a subpattern or VERB name *
2435 *************************************************/
2436
2437 /* This function is called from parse_regex() below whenever it needs to read
2438 the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial
2439 pointer must be to the preceding character. If that character is '*' we are
2440 reading a verb or alpha assertion name. The pointer is updated to point after
2441 the name, for a VERB or alpha assertion name, or after the name's terminator
2442 for a subpattern name. Returning both the offset and the name pointer is
2443 redundant information, but some callers use one and some the other, so it is
2444 simplest just to return both. When the name is in braces, spaces and tabs are
2445 allowed (and ignored) at either end.
2446
2447 Arguments:
2448 ptrptr points to the character pointer variable
2449 ptrend points to the end of the input string
2450 utf true if the input is UTF-encoded
2451 terminator the terminator of a subpattern name must be this
2452 offsetptr where to put the offset from the start of the pattern
2453 nameptr where to put a pointer to the name in the input
2454 namelenptr where to put the length of the name
2455 errorcodeptr where to put an error code
2456 cb pointer to the compile data block
2457
2458 Returns: TRUE if a name was read
2459 FALSE otherwise, with error code set
2460 */
2461
2462 static BOOL
read_name(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,BOOL utf,uint32_t terminator,PCRE2_SIZE * offsetptr,PCRE2_SPTR * nameptr,uint32_t * namelenptr,int * errorcodeptr,compile_block * cb)2463 read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator,
2464 PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,
2465 int *errorcodeptr, compile_block *cb)
2466 {
2467 PCRE2_SPTR ptr = *ptrptr;
2468 BOOL is_group = (*ptr++ != CHAR_ASTERISK);
2469 BOOL is_braced = terminator == CHAR_RIGHT_CURLY_BRACKET;
2470
2471 if (is_braced)
2472 while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2473
2474 if (ptr >= ptrend) /* No characters in name */
2475 {
2476 *errorcodeptr = is_group? ERR62: /* Subpattern name expected */
2477 ERR60; /* Verb not recognized or malformed */
2478 goto FAILED;
2479 }
2480
2481 *nameptr = ptr;
2482 *offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
2483
2484 /* In UTF mode, a group name may contain letters and decimal digits as defined
2485 by Unicode properties, and underscores, but must not start with a digit. */
2486
2487 #ifdef SUPPORT_UNICODE
2488 if (utf && is_group)
2489 {
2490 uint32_t c, type;
2491
2492 GETCHAR(c, ptr);
2493 type = UCD_CHARTYPE(c);
2494
2495 if (type == ucp_Nd)
2496 {
2497 *errorcodeptr = ERR44;
2498 goto FAILED;
2499 }
2500
2501 for(;;)
2502 {
2503 if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
2504 c != CHAR_UNDERSCORE) break;
2505 ptr++;
2506 FORWARDCHARTEST(ptr, ptrend);
2507 if (ptr >= ptrend) break;
2508 GETCHAR(c, ptr);
2509 type = UCD_CHARTYPE(c);
2510 }
2511 }
2512 else
2513 #else
2514 (void)utf; /* Avoid compiler warning */
2515 #endif /* SUPPORT_UNICODE */
2516
2517 /* Handle non-group names and group names in non-UTF modes. A group name must
2518 not start with a digit. If either of the others start with a digit it just
2519 won't be recognized. */
2520
2521 {
2522 if (is_group && IS_DIGIT(*ptr))
2523 {
2524 *errorcodeptr = ERR44;
2525 goto FAILED;
2526 }
2527
2528 while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0)
2529 {
2530 ptr++;
2531 }
2532 }
2533
2534 /* Check name length */
2535
2536 if (ptr > *nameptr + MAX_NAME_SIZE)
2537 {
2538 *errorcodeptr = ERR48;
2539 goto FAILED;
2540 }
2541 *namelenptr = (uint32_t)(ptr - *nameptr);
2542
2543 /* Subpattern names must not be empty, and their terminator is checked here.
2544 (What follows a verb or alpha assertion name is checked separately.) */
2545
2546 if (is_group)
2547 {
2548 if (ptr == *nameptr)
2549 {
2550 *errorcodeptr = ERR62; /* Subpattern name expected */
2551 goto FAILED;
2552 }
2553 if (is_braced)
2554 while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2555 if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator)
2556 {
2557 *errorcodeptr = ERR42;
2558 goto FAILED;
2559 }
2560 ptr++;
2561 }
2562
2563 *ptrptr = ptr;
2564 return TRUE;
2565
2566 FAILED:
2567 *ptrptr = ptr;
2568 return FALSE;
2569 }
2570
2571
2572
2573 /*************************************************
2574 * Manage callouts at start of cycle *
2575 *************************************************/
2576
2577 /* At the start of a new item in parse_regex() we are able to record the
2578 details of the previous item in a prior callout, and also to set up an
2579 automatic callout if enabled. Avoid having two adjacent automatic callouts,
2580 which would otherwise happen for items such as \Q that contribute nothing to
2581 the parsed pattern.
2582
2583 Arguments:
2584 ptr current pattern pointer
2585 pcalloutptr points to a pointer to previous callout, or NULL
2586 auto_callout TRUE if auto_callouts are enabled
2587 parsed_pattern the parsed pattern pointer
2588 cb compile block
2589
2590 Returns: possibly updated parsed_pattern pointer.
2591 */
2592
2593 static uint32_t *
manage_callouts(PCRE2_SPTR ptr,uint32_t ** pcalloutptr,BOOL auto_callout,uint32_t * parsed_pattern,compile_block * cb)2594 manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,
2595 uint32_t *parsed_pattern, compile_block *cb)
2596 {
2597 uint32_t *previous_callout = *pcalloutptr;
2598
2599 if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr -
2600 cb->start_pattern - (PCRE2_SIZE)previous_callout[1]);
2601
2602 if (!auto_callout) previous_callout = NULL; else
2603 {
2604 if (previous_callout == NULL ||
2605 previous_callout != parsed_pattern - 4 ||
2606 previous_callout[3] != 255)
2607 {
2608 previous_callout = parsed_pattern; /* Set up new automatic callout */
2609 parsed_pattern += 4;
2610 previous_callout[0] = META_CALLOUT_NUMBER;
2611 previous_callout[2] = 0;
2612 previous_callout[3] = 255;
2613 }
2614 previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
2615 }
2616
2617 *pcalloutptr = previous_callout;
2618 return parsed_pattern;
2619 }
2620
2621
2622
2623 /*************************************************
2624 * Handle \d, \D, \s, \S, \w, \W *
2625 *************************************************/
2626
2627 /* This function is called from parse_regex() below, both for freestanding
2628 escapes, and those within classes, to handle those escapes that may change when
2629 Unicode property support is requested. Note that PCRE2_UCP will never be set
2630 without Unicode support because that is checked when pcre2_compile() is called.
2631
2632 Arguments:
2633 escape the ESC_... value
2634 parsed_pattern where to add the code
2635 options options bits
2636 xoptions extra options bits
2637
2638 Returns: updated value of parsed_pattern
2639 */
2640 static uint32_t *
handle_escdsw(int escape,uint32_t * parsed_pattern,uint32_t options,uint32_t xoptions)2641 handle_escdsw(int escape, uint32_t *parsed_pattern, uint32_t options,
2642 uint32_t xoptions)
2643 {
2644 uint32_t ascii_option = 0;
2645 uint32_t prop = ESC_p;
2646
2647 switch(escape)
2648 {
2649 case ESC_D:
2650 prop = ESC_P;
2651 /* Fall through */
2652 case ESC_d:
2653 ascii_option = PCRE2_EXTRA_ASCII_BSD;
2654 break;
2655
2656 case ESC_S:
2657 prop = ESC_P;
2658 /* Fall through */
2659 case ESC_s:
2660 ascii_option = PCRE2_EXTRA_ASCII_BSS;
2661 break;
2662
2663 case ESC_W:
2664 prop = ESC_P;
2665 /* Fall through */
2666 case ESC_w:
2667 ascii_option = PCRE2_EXTRA_ASCII_BSW;
2668 break;
2669 }
2670
2671 if ((options & PCRE2_UCP) == 0 || (xoptions & ascii_option) != 0)
2672 {
2673 *parsed_pattern++ = META_ESCAPE + escape;
2674 }
2675 else
2676 {
2677 *parsed_pattern++ = META_ESCAPE + prop;
2678 switch(escape)
2679 {
2680 case ESC_d:
2681 case ESC_D:
2682 *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
2683 break;
2684
2685 case ESC_s:
2686 case ESC_S:
2687 *parsed_pattern++ = PT_SPACE << 16;
2688 break;
2689
2690 case ESC_w:
2691 case ESC_W:
2692 *parsed_pattern++ = PT_WORD << 16;
2693 break;
2694 }
2695 }
2696
2697 return parsed_pattern;
2698 }
2699
2700
2701
2702 /*************************************************
2703 * Parse regex and identify named groups *
2704 *************************************************/
2705
2706 /* This function is called first of all. It scans the pattern and does two
2707 things: (1) It identifies capturing groups and makes a table of named capturing
2708 groups so that information about them is fully available to both the compiling
2709 scans. (2) It writes a parsed version of the pattern with comments omitted and
2710 escapes processed into the parsed_pattern vector.
2711
2712 Arguments:
2713 ptr points to the start of the pattern
2714 options compiling dynamic options (may change during the scan)
2715 has_lookbehind points to a boolean, set TRUE if a lookbehind is found
2716 cb pointer to the compile data block
2717
2718 Returns: zero on success or a non-zero error code, with the
2719 error offset placed in the cb field
2720 */
2721
2722 /* A structure and some flags for dealing with nested groups. */
/* parse_regex() addresses records of this type through its top_nest and
end_nests pointers; end_nests is derived from the end of the compile workspace.
The per-field notes below are inferred from the field names only, because the
code that fills the records in is not next to this definition.
NOTE(review): confirm them against the group-handling code in parse_regex(). */
2723
2724 typedef struct nest_save {
2725 uint16_t nest_depth; /* presumably the nesting depth this record is for */
2726 uint16_t reset_group; /* presumably a group number to reset to - confirm */
2727 uint16_t max_group; /* presumably the highest group number seen - confirm */
2728 uint16_t flags; /* presumably a combination of the NSF_ bits below */
2729 uint32_t options; /* presumably saved option bits - confirm */
2730 uint32_t xoptions; /* presumably saved extra option bits - confirm */
2731 } nest_save;
2732
/* Single-bit flag values. NOTE(review): the meanings are inferred from the
names only - NSF_RESET presumably marks a group whose capture numbering is
reset, NSF_CONDASSERT an assertion that is the condition of a conditional
group, and NSF_ATOMICSR an atomic script run. Confirm against the code that
sets and tests them. */
2733 #define NSF_RESET 0x0001u
2734 #define NSF_CONDASSERT 0x0002u
2735 #define NSF_ATOMICSR 0x0004u
2736
2737 /* Options that are changeable within the pattern must be tracked during
2738 parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing,
2739 but all must be tracked so that META_OPTIONS items set the correct values for
2740 the main compiling phase.

Two masks are defined. PARSE_TRACKED_OPTIONS collects the tracked bits of the
main options word (the bits that parse_regex() tests in its "options"
variable). PARSE_TRACKED_EXTRA_OPTIONS collects the tracked PCRE2_EXTRA_ bits,
which belong to the extra options word (the "xoptions" variable in
parse_regex()). Each mask is a single parenthesized expression. */
2741
2742 #define PARSE_TRACKED_OPTIONS (PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_DUPNAMES| \
2743 PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| \
2744 PCRE2_UNGREEDY)
2745
2746 #define PARSE_TRACKED_EXTRA_OPTIONS (PCRE2_EXTRA_CASELESS_RESTRICT| \
2747 PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW| \
2748 PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_ASCII_POSIX)
2749
2750 /* States used for analyzing ranges in character classes. The two OK values
2751 must be last, because the class-parsing code in parse_regex() decides whether
a hyphen may start a range by testing class_range_state >= RANGE_OK_ESCAPED.

RANGE_NO          a following hyphen does not start a range
RANGE_STARTED     the hyphen of a range has been accepted and a META_RANGE_
                  item has been written
RANGE_OK_ESCAPED  a following hyphen starts a range that is written as
                  META_RANGE_ESCAPED
RANGE_OK_LITERAL  a following hyphen starts a range that is written as
                  META_RANGE_LITERAL
*/
2752
2753 enum { RANGE_NO, RANGE_STARTED, RANGE_OK_ESCAPED, RANGE_OK_LITERAL };
2754
2755 /* Only in 32-bit mode can there be literals > META_END. A macro encapsulates
2756 the storing of literal values in the main parsed pattern, where they can always
2757 be quantified.

PARSED_LITERAL(c, p) stores the value c at *p and advances p. In 32-bit mode a
value that is >= META_END is preceded by a META_BIGVALUE item. In both forms
the macro also sets okquantifier to TRUE, so a variable of that name must be in
scope wherever the macro is used; code that must not change okquantifier has to
store the value by hand instead.

Points to watch when using the macro:

  . The arguments are not parenthesized in the expansion, and in 32-bit mode
    both c and p appear twice, so pass only simple expressions without side
    effects.

  . In the non-32-bit form the expansion is two separate statements that are
    not enclosed in braces. Do not use the macro as the unbraced body of an
    if, else, or loop, because only the first statement would be controlled.
*/
2758
2759 #if PCRE2_CODE_UNIT_WIDTH == 32
2760 #define PARSED_LITERAL(c, p) \
2761 { \
2762 if (c >= META_END) *p++ = META_BIGVALUE; \
2763 *p++ = c; \
2764 okquantifier = TRUE; \
2765 }
2766 #else
2767 #define PARSED_LITERAL(c, p) *p++ = c; okquantifier = TRUE;
2768 #endif
2769
2770 /* Here's the actual function. */
2771
parse_regex(PCRE2_SPTR ptr,uint32_t options,BOOL * has_lookbehind,compile_block * cb)2772 static int parse_regex(PCRE2_SPTR ptr, uint32_t options, BOOL *has_lookbehind,
2773 compile_block *cb)
2774 {
2775 uint32_t c;
2776 uint32_t delimiter;
2777 uint32_t namelen;
2778 uint32_t class_range_state;
2779 uint32_t *verblengthptr = NULL; /* Value avoids compiler warning */
2780 uint32_t *verbstartptr = NULL;
2781 uint32_t *previous_callout = NULL;
2782 uint32_t *parsed_pattern = cb->parsed_pattern;
2783 uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
2784 uint32_t *this_parsed_item = NULL;
2785 uint32_t *prev_parsed_item = NULL;
2786 uint32_t meta_quantifier = 0;
2787 uint32_t add_after_mark = 0;
2788 uint32_t xoptions = cb->cx->extra_options;
2789 uint16_t nest_depth = 0;
2790 int after_manual_callout = 0;
2791 int expect_cond_assert = 0;
2792 int errorcode = 0;
2793 int escape;
2794 int i;
2795 BOOL inescq = FALSE;
2796 BOOL inverbname = FALSE;
2797 BOOL utf = (options & PCRE2_UTF) != 0;
2798 BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;
2799 BOOL isdupname;
2800 BOOL negate_class;
2801 BOOL okquantifier = FALSE;
2802 PCRE2_SPTR thisptr;
2803 PCRE2_SPTR name;
2804 PCRE2_SPTR ptrend = cb->end_pattern;
2805 PCRE2_SPTR verbnamestart = NULL; /* Value avoids compiler warning */
2806 named_group *ng;
2807 nest_save *top_nest, *end_nests;
2808
2809 /* Insert leading items for word and line matching (features provided for the
2810 benefit of pcre2grep). */
2811
2812 if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
2813 {
2814 *parsed_pattern++ = META_CIRCUMFLEX;
2815 *parsed_pattern++ = META_NOCAPTURE;
2816 }
2817 else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
2818 {
2819 *parsed_pattern++ = META_ESCAPE + ESC_b;
2820 *parsed_pattern++ = META_NOCAPTURE;
2821 }
2822
2823 /* If the pattern is actually a literal string, process it separately to avoid
2824 cluttering up the main loop. */
2825
2826 if ((options & PCRE2_LITERAL) != 0)
2827 {
2828 while (ptr < ptrend)
2829 {
2830 if (parsed_pattern >= parsed_pattern_end)
2831 {
2832 errorcode = ERR63; /* Internal error (parsed pattern overflow) */
2833 goto FAILED;
2834 }
2835 thisptr = ptr;
2836 GETCHARINCTEST(c, ptr);
2837 if (auto_callout)
2838 parsed_pattern = manage_callouts(thisptr, &previous_callout,
2839 auto_callout, parsed_pattern, cb);
2840 PARSED_LITERAL(c, parsed_pattern);
2841 }
2842 goto PARSED_END;
2843 }
2844
2845 /* Process a real regex which may contain meta-characters. */
2846
2847 top_nest = NULL;
2848 end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
2849
2850 /* The size of the nest_save structure might not be a factor of the size of the
2851 workspace. Therefore we must round down end_nests so as to correctly avoid
2852 creating a nest_save that spans the end of the workspace. */
2853
2854 end_nests = (nest_save *)((char *)end_nests -
2855 ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
2856
2857 /* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */
2858
2859 if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;
2860
2861 /* Now scan the pattern */
2862
2863 while (ptr < ptrend)
2864 {
2865 int prev_expect_cond_assert;
2866 uint32_t min_repeat = 0, max_repeat = 0;
2867 uint32_t set, unset, *optset;
2868 uint32_t xset, xunset, *xoptset;
2869 uint32_t terminator;
2870 uint32_t prev_meta_quantifier;
2871 BOOL prev_okquantifier;
2872 PCRE2_SPTR tempptr;
2873 PCRE2_SIZE offset;
2874
2875 if (parsed_pattern >= parsed_pattern_end)
2876 {
2877 errorcode = ERR63; /* Internal error (parsed pattern overflow) */
2878 goto FAILED;
2879 }
2880
2881 if (nest_depth > cb->cx->parens_nest_limit)
2882 {
2883 errorcode = ERR19;
2884 goto FAILED; /* Parentheses too deeply nested */
2885 }
2886
2887 /* If the last time round this loop something was added, parsed_pattern will
2888 no longer be equal to this_parsed_item. Remember where the previous item
2889 started and reset for the next item. Note that sometimes round the loop,
2890 nothing gets added (e.g. for ignored white space). */
2891
2892 if (this_parsed_item != parsed_pattern)
2893 {
2894 prev_parsed_item = this_parsed_item;
2895 this_parsed_item = parsed_pattern;
2896 }
2897
2898 /* Get next input character, save its position for callout handling. */
2899
2900 thisptr = ptr;
2901 GETCHARINCTEST(c, ptr);
2902
2903 /* Copy quoted literals until \E, allowing for the possibility of automatic
2904 callouts, except when processing a (*VERB) "name". */
2905
2906 if (inescq)
2907 {
2908 if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
2909 {
2910 inescq = FALSE;
2911 ptr++; /* Skip E */
2912 }
2913 else
2914 {
2915 if (expect_cond_assert > 0) /* A literal is not allowed if we are */
2916 { /* expecting a conditional assertion, */
2917 ptr--; /* but an empty \Q\E sequence is OK. */
2918 errorcode = ERR28;
2919 goto FAILED;
2920 }
2921 if (inverbname)
2922 { /* Don't use PARSED_LITERAL() because it */
2923 #if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */
2924 if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2925 #endif
2926 *parsed_pattern++ = c;
2927 }
2928 else
2929 {
2930 if (after_manual_callout-- <= 0)
2931 parsed_pattern = manage_callouts(thisptr, &previous_callout,
2932 auto_callout, parsed_pattern, cb);
2933 PARSED_LITERAL(c, parsed_pattern);
2934 }
2935 meta_quantifier = 0;
2936 }
2937 continue; /* Next character */
2938 }
2939
2940 /* If we are processing the "name" part of a (*VERB:NAME) item, all
2941 characters up to the closing parenthesis are literals except when
2942 PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q
2943 and \E and escaped characters are allowed (no character types such as \d). If
2944 PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do
2945 this by not entering the special (*VERB:NAME) processing - they are then
2946 picked up below. Note that c is a character, not a code unit, so we must not
2947 use MAX_255 to test its size because MAX_255 tests code units and is assumed
2948 TRUE in 8-bit mode. */
2949
2950 if (inverbname &&
2951 (
2952 /* EITHER: not both options set */
2953 ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) !=
2954 (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) ||
2955 #ifdef SUPPORT_UNICODE
2956 /* OR: character > 255 AND not Unicode Pattern White Space */
2957 (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) ||
2958 #endif
2959 /* OR: not a # comment or isspace() white space */
2960 (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0
2961 #ifdef SUPPORT_UNICODE
2962 /* and not CHAR_NEL when Unicode is supported */
2963 && c != CHAR_NEL
2964 #endif
2965 )))
2966 {
2967 PCRE2_SIZE verbnamelength;
2968
2969 switch(c)
2970 {
2971 default: /* Don't use PARSED_LITERAL() because it */
2972 #if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */
2973 if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2974 #endif
2975 *parsed_pattern++ = c;
2976 break;
2977
2978 case CHAR_RIGHT_PARENTHESIS:
2979 inverbname = FALSE;
2980 /* This is the length in characters */
2981 verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);
2982 /* But the limit on the length is in code units */
2983 if (ptr - verbnamestart - 1 > (int)MAX_MARK)
2984 {
2985 ptr--;
2986 errorcode = ERR76;
2987 goto FAILED;
2988 }
2989 *verblengthptr = (uint32_t)verbnamelength;
2990
2991 /* If this name was on a verb such as (*ACCEPT) which does not continue,
2992 a (*MARK) was generated for the name. We now add the original verb as the
2993 next item. */
2994
2995 if (add_after_mark != 0)
2996 {
2997 *parsed_pattern++ = add_after_mark;
2998 add_after_mark = 0;
2999 }
3000 break;
3001
3002 case CHAR_BACKSLASH:
3003 if ((options & PCRE2_ALT_VERBNAMES) != 0)
3004 {
3005 escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3006 xoptions, FALSE, cb);
3007 if (errorcode != 0) goto FAILED;
3008 }
3009 else escape = 0; /* Treat all as literal */
3010
3011 switch(escape)
3012 {
3013 case 0: /* Don't use PARSED_LITERAL() because it */
3014 #if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */
3015 if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3016 #endif
3017 *parsed_pattern++ = c;
3018 break;
3019
3020 case ESC_ub:
3021 *parsed_pattern++ = CHAR_u;
3022 PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3023 break;
3024
3025 case ESC_Q:
3026 inescq = TRUE;
3027 break;
3028
3029 case ESC_E: /* Ignore */
3030 break;
3031
3032 default:
3033 errorcode = ERR40; /* Invalid in verb name */
3034 goto FAILED;
3035 }
3036 }
3037 continue; /* Next character in pattern */
3038 }
3039
3040 /* Not a verb name character. At this point we must process everything that
3041 must not change the quantification state. This is mainly comments, but we
3042 handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as
3043 A+, as in Perl. An isolated \E is ignored. */
3044
3045 if (c == CHAR_BACKSLASH && ptr < ptrend)
3046 {
3047 if (*ptr == CHAR_Q || *ptr == CHAR_E)
3048 {
3049 inescq = *ptr == CHAR_Q;
3050 ptr++;
3051 continue;
3052 }
3053 }
3054
3055 /* Skip over whitespace and # comments in extended mode. Note that c is a
3056 character, not a code unit, so we must not use MAX_255 to test its size
3057 because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The
3058 whitespace characters are those designated as "Pattern White Space" by
3059 Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is
3060 U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a
3061 subset of space characters that match \h and \v. */
3062
3063 if ((options & PCRE2_EXTENDED) != 0)
3064 {
3065 if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;
3066 #ifdef SUPPORT_UNICODE
3067 if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue;
3068 #endif
3069 if (c == CHAR_NUMBER_SIGN)
3070 {
3071 while (ptr < ptrend)
3072 {
3073 if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */
3074 { /* IS_NEWLINE sets cb->nllen. */
3075 ptr += cb->nllen;
3076 break;
3077 }
3078 ptr++;
3079 #ifdef SUPPORT_UNICODE
3080 if (utf) FORWARDCHARTEST(ptr, ptrend);
3081 #endif
3082 }
3083 continue; /* Next character in pattern */
3084 }
3085 }
3086
3087 /* Skip over bracketed comments */
3088
3089 if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 &&
3090 ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
3091 {
3092 while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);
3093 if (ptr >= ptrend)
3094 {
3095 errorcode = ERR18; /* A special error for missing ) in a comment */
3096 goto FAILED; /* to make it easier to debug. */
3097 }
3098 ptr++;
3099 continue; /* Next character in pattern */
3100 }
3101
3102 /* If the next item is not a quantifier, fill in length of any previous
3103 callout and create an auto callout if required. */
3104
3105 if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&
3106 (c != CHAR_LEFT_CURLY_BRACKET ||
3107 (tempptr = ptr,
3108 !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))
3109 {
3110 if (after_manual_callout-- <= 0)
3111 {
3112 parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,
3113 parsed_pattern, cb);
3114 this_parsed_item = parsed_pattern; /* New start for current item */
3115 }
3116 }
3117
3118 /* If expect_cond_assert is 2, we have just passed (?( and are expecting an
3119 assertion, possibly preceded by a callout. If the value is 1, we have just
3120 had the callout and expect an assertion. There must be at least 3 more
3121 characters in all cases. When expect_cond_assert is 2, we know that the
3122 current character is an opening parenthesis, as otherwise we wouldn't be
3123 here. However, when it is 1, we need to check, and it's easiest just to check
3124 always. Note that expect_cond_assert may be negative, since all callouts just
3125 decrement it. */
3126
3127 if (expect_cond_assert > 0)
3128 {
3129 BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
3130 (ptr[0] == CHAR_QUESTION_MARK || ptr[0] == CHAR_ASTERISK);
3131 if (ok)
3132 {
3133 if (ptr[0] == CHAR_ASTERISK) /* New alpha assertion format, possibly */
3134 {
3135 ok = MAX_255(ptr[1]) && (cb->ctypes[ptr[1]] & ctype_lcletter) != 0;
3136 }
3137 else switch(ptr[1]) /* Traditional symbolic format */
3138 {
3139 case CHAR_C:
3140 ok = expect_cond_assert == 2;
3141 break;
3142
3143 case CHAR_EQUALS_SIGN:
3144 case CHAR_EXCLAMATION_MARK:
3145 break;
3146
3147 case CHAR_LESS_THAN_SIGN:
3148 ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;
3149 break;
3150
3151 default:
3152 ok = FALSE;
3153 }
3154 }
3155
3156 if (!ok)
3157 {
3158 ptr--; /* Adjust error offset */
3159 errorcode = ERR28;
3160 goto FAILED;
3161 }
3162 }
3163
3164 /* Remember whether we are expecting a conditional assertion, and set the
3165 default for this item. */
3166
3167 prev_expect_cond_assert = expect_cond_assert;
3168 expect_cond_assert = 0;
3169
3170 /* Remember quantification status for the previous significant item, then set
3171 default for this item. */
3172
3173 prev_okquantifier = okquantifier;
3174 prev_meta_quantifier = meta_quantifier;
3175 okquantifier = FALSE;
3176 meta_quantifier = 0;
3177
3178 /* If the previous significant item was a quantifier, adjust the parsed code
3179 if there is a following modifier. The base meta value is always followed by
3180 the PLUS and QUERY values, in that order. We do this here rather than after
3181 reading a quantifier so that intervening comments and /x whitespace can be
3182 ignored without having to replicate code. */
3183
3184 if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS))
3185 {
3186 parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] =
3187 prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?
3188 0x00020000u : 0x00010000u);
3189 continue; /* Next character in pattern */
3190 }
3191
3192 /* Process the next item in the main part of a pattern. */
3193
3194 switch(c)
3195 {
3196 default: /* Non-special character */
3197 PARSED_LITERAL(c, parsed_pattern);
3198 break;
3199
3200
3201 /* ---- Escape sequence ---- */
3202
3203 case CHAR_BACKSLASH:
3204 tempptr = ptr;
3205 escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3206 xoptions, FALSE, cb);
3207 if (errorcode != 0)
3208 {
3209 ESCAPE_FAILED:
3210 if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3211 goto FAILED;
3212 ptr = tempptr;
3213 if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3214 {
3215 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
3216 }
3217 escape = 0; /* Treat as literal character */
3218 }
3219
3220 /* The escape was a data escape or literal character. */
3221
3222 if (escape == 0)
3223 {
3224 PARSED_LITERAL(c, parsed_pattern);
3225 }
3226
3227 /* The escape was a back (or forward) reference. We keep the offset in
3228 order to give a more useful diagnostic for a bad forward reference. For
3229 references to groups numbered less than 10 we can't use more than two items
3230 in parsed_pattern because they may be just two characters in the input (and
3231 in a 64-bit world an offset may need two elements). So for them, the offset
3232 of the first occurrence is held in a special vector. */
3233
3234 else if (escape < 0)
3235 {
3236 offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 1);
3237 escape = -escape;
3238 *parsed_pattern++ = META_BACKREF | (uint32_t)escape;
3239 if (escape < 10)
3240 {
3241 if (cb->small_ref_offset[escape] == PCRE2_UNSET)
3242 cb->small_ref_offset[escape] = offset;
3243 }
3244 else
3245 {
3246 PUTOFFSET(offset, parsed_pattern);
3247 }
3248 okquantifier = TRUE;
3249 }
3250
3251 /* The escape was a character class such as \d etc. or other special
3252 escape indicator such as \A or \X. Most of them generate just a single
3253 parsed item, but \P and \p are followed by a 16-bit type and a 16-bit
3254 value. They are supported only when Unicode is available. The type and
3255 value are packed into a single 32-bit value so that the whole sequence
3256 uses only two elements in the parsed_vector. This is because the same
3257 coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is
3258 set.
3259
3260 There are also some cases where the escape sequence is followed by a name:
3261 \k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>
3262 and \g'name' are subroutine calls by name; \g{name} is a synonym for
3263 \k{name}. Note that \g<number> and \g'number' are handled by check_escape()
3264 and returned as a negative value (handled above). A name is coded as an
3265 offset into the pattern and a length. */
3266
3267 else switch (escape)
3268 {
3269 case ESC_C:
3270 #ifdef NEVER_BACKSLASH_C
3271 errorcode = ERR85;
3272 goto ESCAPE_FAILED;
3273 #else
3274 if ((options & PCRE2_NEVER_BACKSLASH_C) != 0)
3275 {
3276 errorcode = ERR83;
3277 goto ESCAPE_FAILED;
3278 }
3279 #endif
3280 okquantifier = TRUE;
3281 *parsed_pattern++ = META_ESCAPE + escape;
3282 break;
3283
3284 /* This is a special return that happens only in EXTRA_ALT_BSUX mode,
3285 when \u{ is not followed by hex digits and }. It requests two literal
3286 characters, u and { and we need this, as otherwise \u{ 12} (for example)
3287 would be treated as u{12} now that spaces are allowed in quantifiers. */
3288
3289 case ESC_ub:
3290 *parsed_pattern++ = CHAR_u;
3291 PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3292 break;
3293
3294 case ESC_X:
3295 #ifndef SUPPORT_UNICODE
3296 errorcode = ERR45; /* Supported only with Unicode support */
3297 goto ESCAPE_FAILED;
3298 #endif
3299 case ESC_H:
3300 case ESC_h:
3301 case ESC_N:
3302 case ESC_R:
3303 case ESC_V:
3304 case ESC_v:
3305 okquantifier = TRUE;
3306 *parsed_pattern++ = META_ESCAPE + escape;
3307 break;
3308
3309 default: /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */
3310 *parsed_pattern++ = META_ESCAPE + escape;
3311 break;
3312
3313 /* Escapes that may change in UCP mode. */
3314
3315 case ESC_d:
3316 case ESC_D:
3317 case ESC_s:
3318 case ESC_S:
3319 case ESC_w:
3320 case ESC_W:
3321 okquantifier = TRUE;
3322 parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
3323 xoptions);
3324 break;
3325
3326 /* Unicode property matching */
3327
3328 case ESC_P:
3329 case ESC_p:
3330 #ifdef SUPPORT_UNICODE
3331 {
3332 BOOL negated;
3333 uint16_t ptype = 0, pdata = 0;
3334 if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3335 goto ESCAPE_FAILED;
3336 if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3337 *parsed_pattern++ = META_ESCAPE + escape;
3338 *parsed_pattern++ = (ptype << 16) | pdata;
3339 okquantifier = TRUE;
3340 }
3341 #else
3342 errorcode = ERR45;
3343 goto ESCAPE_FAILED;
3344 #endif
3345 break; /* End \P and \p */
3346
3347 /* When \g is used with quotes or angle brackets as delimiters, it is a
3348 numerical or named subroutine call, and control comes here. When used
3349 with brace delimiters it is a numerical back reference and does not come
3350 here because check_escape() returns it directly as a reference. \k is
3351 always a named back reference. */
3352
3353 case ESC_g:
3354 case ESC_k:
3355 if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET &&
3356 *ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE))
3357 {
3358 errorcode = (escape == ESC_g)? ERR57 : ERR69;
3359 goto ESCAPE_FAILED;
3360 }
3361 terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
3362 CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
3363 CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
3364
3365 /* For a non-braced \g, check for a numerical recursion. */
3366
3367 if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)
3368 {
3369 PCRE2_SPTR p = ptr + 1;
3370
3371 if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
3372 &errorcode))
3373 {
3374 if (p >= ptrend || *p != terminator)
3375 {
3376 errorcode = ERR57;
3377 goto ESCAPE_FAILED;
3378 }
3379 ptr = p;
3380 goto SET_RECURSION;
3381 }
3382 if (errorcode != 0) goto ESCAPE_FAILED;
3383 }
3384
3385 /* Not a numerical recursion. Perl allows spaces and tabs after { and
3386 before } but not for other delimiters. */
3387
3388 if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
3389 &errorcode, cb)) goto ESCAPE_FAILED;
3390
3391 /* \k and \g when used with braces are back references, whereas \g used
3392 with quotes or angle brackets is a recursion */
3393
3394 *parsed_pattern++ =
3395 (escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)?
3396 META_BACKREF_BYNAME : META_RECURSE_BYNAME;
3397 *parsed_pattern++ = namelen;
3398
3399 PUTOFFSET(offset, parsed_pattern);
3400 okquantifier = TRUE;
3401 break; /* End special escape processing */
3402 }
3403 break; /* End escape sequence processing */
3404
3405
3406 /* ---- Single-character special items ---- */
3407
3408 case CHAR_CIRCUMFLEX_ACCENT:
3409 *parsed_pattern++ = META_CIRCUMFLEX;
3410 break;
3411
3412 case CHAR_DOLLAR_SIGN:
3413 *parsed_pattern++ = META_DOLLAR;
3414 break;
3415
3416 case CHAR_DOT:
3417 *parsed_pattern++ = META_DOT;
3418 okquantifier = TRUE;
3419 break;
3420
3421
3422 /* ---- Single-character quantifiers ---- */
3423
3424 case CHAR_ASTERISK:
3425 meta_quantifier = META_ASTERISK;
3426 goto CHECK_QUANTIFIER;
3427
3428 case CHAR_PLUS:
3429 meta_quantifier = META_PLUS;
3430 goto CHECK_QUANTIFIER;
3431
3432 case CHAR_QUESTION_MARK:
3433 meta_quantifier = META_QUERY;
3434 goto CHECK_QUANTIFIER;
3435
3436
3437 /* ---- Potential {n,m} quantifier ---- */
3438
3439 case CHAR_LEFT_CURLY_BRACKET:
3440 if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,
3441 &errorcode))
3442 {
3443 if (errorcode != 0) goto FAILED; /* Error in quantifier. */
3444 PARSED_LITERAL(c, parsed_pattern); /* Not a quantifier */
3445 break; /* No more quantifier processing */
3446 }
3447 meta_quantifier = META_MINMAX;
3448 /* Fall through */
3449
3450
3451 /* ---- Quantifier post-processing ---- */
3452
3453 /* Check that a quantifier is allowed after the previous item. This
3454 guarantees that there is a previous item. */
3455
3456 CHECK_QUANTIFIER:
3457 if (!prev_okquantifier)
3458 {
3459 errorcode = ERR9;
3460 goto FAILED_BACK;
3461 }
3462
3463 /* Most (*VERB)s are not allowed to be quantified, but an ungreedy
3464 quantifier can be useful for (*ACCEPT) - meaning "succeed on backtrack", a
3465 sort of negated (*COMMIT). We therefore allow (*ACCEPT) to be quantified by
3466 wrapping it in non-capturing brackets, but we have to allow for a preceding
3467 (*MARK) for when (*ACCEPT) has an argument. */
3468
3469 if (*prev_parsed_item == META_ACCEPT)
3470 {
3471 uint32_t *p;
3472 for (p = parsed_pattern - 1; p >= verbstartptr; p--) p[1] = p[0];
3473 *verbstartptr = META_NOCAPTURE;
3474 parsed_pattern[1] = META_KET;
3475 parsed_pattern += 2;
3476 }
3477
3478 /* Now we can put the quantifier into the parsed pattern vector. At this
3479 stage, we have only the basic quantifier. The check for a following + or ?
3480 modifier happens at the top of the loop, after any intervening comments
3481 have been removed. */
3482
3483 *parsed_pattern++ = meta_quantifier;
3484 if (c == CHAR_LEFT_CURLY_BRACKET)
3485 {
3486 *parsed_pattern++ = min_repeat;
3487 *parsed_pattern++ = max_repeat;
3488 }
3489 break;
3490
3491
3492 /* ---- Character class ---- */
3493
3494 case CHAR_LEFT_SQUARE_BRACKET:
3495 okquantifier = TRUE;
3496
3497 /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
3498 used for "start of word" and "end of word". As these are otherwise illegal
3499 sequences, we don't break anything by recognizing them. They are replaced
3500 by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
3501 erroneous and are handled by the normal code below. */
3502
3503 if (ptrend - ptr >= 6 &&
3504 (PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 ||
3505 PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0))
3506 {
3507 *parsed_pattern++ = META_ESCAPE + ESC_b;
3508
3509 if (ptr[2] == CHAR_LESS_THAN_SIGN)
3510 {
3511 *parsed_pattern++ = META_LOOKAHEAD;
3512 }
3513 else
3514 {
3515 *parsed_pattern++ = META_LOOKBEHIND;
3516 *has_lookbehind = TRUE;
3517
3518 /* The offset is used only for the "non-fixed length" error; this won't
3519 occur here, so just store zero. */
3520
3521 PUTOFFSET((PCRE2_SIZE)0, parsed_pattern);
3522 }
3523
3524 if ((options & PCRE2_UCP) == 0)
3525 *parsed_pattern++ = META_ESCAPE + ESC_w;
3526 else
3527 {
3528 *parsed_pattern++ = META_ESCAPE + ESC_p;
3529 *parsed_pattern++ = PT_WORD << 16;
3530 }
3531 *parsed_pattern++ = META_KET;
3532 ptr += 6;
3533 break;
3534 }
3535
3536 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3537 they are encountered at the top level, so we'll do that too. */
3538
3539 if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3540 *ptr == CHAR_EQUALS_SIGN) &&
3541 check_posix_syntax(ptr, ptrend, &tempptr))
3542 {
3543 errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;
3544 goto FAILED;
3545 }
3546
3547 /* Process a regular character class. If the first character is '^', set
3548 the negation flag. If the first few characters (either before or after ^)
3549 are \Q\E or \E or space or tab in extended-more mode, we skip them too.
3550 This makes for compatibility with Perl. */
3551
3552 negate_class = FALSE;
3553 while (ptr < ptrend)
3554 {
3555 GETCHARINCTEST(c, ptr);
3556 if (c == CHAR_BACKSLASH)
3557 {
3558 if (ptr < ptrend && *ptr == CHAR_E) ptr++;
3559 else if (ptrend - ptr >= 3 &&
3560 PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3561 ptr += 3;
3562 else
3563 break;
3564 }
3565 else if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3566 (c == CHAR_SPACE || c == CHAR_HT)) /* Note: just these two */
3567 continue;
3568 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3569 negate_class = TRUE;
3570 else break;
3571 }
3572
3573 /* Now the real contents of the class; c has the first "real" character.
3574 Empty classes are permitted only if the option is set. */
3575
3576 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3577 (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)
3578 {
3579 *parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;
3580 break; /* End of class processing */
3581 }
3582
3583 /* Process a non-empty class. */
3584
3585 *parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;
3586 class_range_state = RANGE_NO;
3587
3588 /* In an EBCDIC environment, Perl treats alphabetic ranges specially
3589 because there are holes in the encoding, and simply using the range A-Z
3590 (for example) would include the characters in the holes. This applies only
3591 to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]
3592 in this respect. In order to accommodate this, we keep track of whether
3593 character values are literal or not, and a state variable for handling
3594 ranges. */
3595
3596 /* Loop for the contents of the class */
3597
3598 for (;;)
3599 {
3600 BOOL char_is_literal = TRUE;
3601
3602 /* Inside \Q...\E everything is literal except \E */
3603
3604 if (inescq)
3605 {
3606 if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3607 {
3608 inescq = FALSE; /* Reset literal state */
3609 ptr++; /* Skip the 'E' */
3610 goto CLASS_CONTINUE;
3611 }
3612 goto CLASS_LITERAL;
3613 }
3614
3615 /* Skip over space and tab (only) in extended-more mode. */
3616
3617 if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3618 (c == CHAR_SPACE || c == CHAR_HT))
3619 goto CLASS_CONTINUE;
3620
3621 /* Handle POSIX class names. Perl allows a negation extension of the
3622 form [:^name:]. A square bracket that doesn't match the syntax is
3623 treated as a literal. We also recognize the POSIX constructions
3624 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3625 5.6 and 5.8 do. */
3626
3627 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3628 ptrend - ptr >= 3 &&
3629 (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3630 *ptr == CHAR_EQUALS_SIGN) &&
3631 check_posix_syntax(ptr, ptrend, &tempptr))
3632 {
3633 BOOL posix_negate = FALSE;
3634 int posix_class;
3635
3636 /* Perl treats a hyphen before a POSIX class as a literal, not the
3637 start of a range. However, it gives a warning in its warning mode. PCRE
3638 does not have a warning mode, so we give an error, because this is
3639 likely an error on the user's part. */
3640
3641 if (class_range_state == RANGE_STARTED)
3642 {
3643 errorcode = ERR50;
3644 goto FAILED;
3645 }
3646
3647 if (*ptr != CHAR_COLON)
3648 {
3649 errorcode = ERR13;
3650 goto FAILED_BACK;
3651 }
3652
3653 if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
3654 {
3655 posix_negate = TRUE;
3656 ptr++;
3657 }
3658
3659 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3660 if (posix_class < 0)
3661 {
3662 errorcode = ERR30;
3663 goto FAILED;
3664 }
3665 ptr = tempptr + 2;
3666
3667 /* Perl treats a hyphen after a POSIX class as a literal, not the
3668 start of a range. However, it gives a warning in its warning mode
3669 unless the hyphen is the last character in the class. PCRE does not
3670 have a warning mode, so we give an error, because this is likely an
3671 error on the user's part. */
3672
3673 if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3674 ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3675 {
3676 errorcode = ERR50;
3677 goto FAILED;
3678 }
3679
3680 /* Set "a hyphen is not the start of a range" for the -] case, and also
3681 in case the POSIX class is followed by \E or \Q\E (possibly repeated -
3682 fuzzers do that kind of thing) and *then* a hyphen. This causes that
3683 hyphen to be treated as a literal. I don't think it's worth setting up
3684 special apparatus to do otherwise. */
3685
3686 class_range_state = RANGE_NO;
3687
3688 /* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some
3689 of the POSIX classes are converted to use Unicode properties \p or \P
3690 or, in one case, \h or \H. The substitutes table has two values per
3691 class, containing the type and value of a \p or \P item. The special
3692 cases are specified with a negative type: a non-zero value causes \h or
3693 \H to be used, and a zero value falls through to behave like a non-UCP
3694 POSIX class. There are now also some extra options that force ASCII for
3695 some classes. */
3696
3697 #ifdef SUPPORT_UNICODE
3698 if ((options & PCRE2_UCP) != 0 &&
3699 (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0 &&
3700 !((xoptions & PCRE2_EXTRA_ASCII_DIGIT) != 0 &&
3701 (posix_class == PC_DIGIT || posix_class == PC_XDIGIT)))
3702 {
3703 int ptype = posix_substitutes[2*posix_class];
3704 int pvalue = posix_substitutes[2*posix_class + 1];
3705
3706 if (ptype >= 0)
3707 {
3708 *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);
3709 *parsed_pattern++ = (ptype << 16) | pvalue;
3710 goto CLASS_CONTINUE;
3711 }
3712
3713 if (pvalue != 0)
3714 {
3715 *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h);
3716 goto CLASS_CONTINUE;
3717 }
3718
3719 /* Fall through */
3720 }
3721 #endif /* SUPPORT_UNICODE */
3722
3723 /* Non-UCP POSIX class */
3724
3725 *parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;
3726 *parsed_pattern++ = posix_class;
3727 }
3728
3729 /* Handle potential start of range */
3730
3731 else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)
3732 {
3733 *parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?
3734 META_RANGE_LITERAL : META_RANGE_ESCAPED;
3735 class_range_state = RANGE_STARTED;
3736 }
3737
3738 /* Handle a literal character */
3739
3740 else if (c != CHAR_BACKSLASH)
3741 {
3742 CLASS_LITERAL:
3743 if (class_range_state == RANGE_STARTED)
3744 {
3745 if (c == parsed_pattern[-2]) /* Optimize one-char range */
3746 parsed_pattern--;
3747 else if (parsed_pattern[-2] > c) /* Check range is in order */
3748 {
3749 errorcode = ERR8;
3750 goto FAILED_BACK;
3751 }
3752 else
3753 {
3754 if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL)
3755 parsed_pattern[-1] = META_RANGE_ESCAPED;
3756 PARSED_LITERAL(c, parsed_pattern);
3757 }
3758 class_range_state = RANGE_NO;
3759 }
3760 else /* Potential start of range */
3761 {
3762 class_range_state = char_is_literal?
3763 RANGE_OK_LITERAL : RANGE_OK_ESCAPED;
3764 PARSED_LITERAL(c, parsed_pattern);
3765 }
3766 }
3767
3768 /* Handle escapes in a class */
3769
3770 else
3771 {
3772 tempptr = ptr;
3773 escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3774 xoptions, TRUE, cb);
3775
3776 if (errorcode != 0)
3777 {
3778 if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3779 goto FAILED;
3780 ptr = tempptr;
3781 if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3782 {
3783 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
3784 }
3785 escape = 0; /* Treat as literal character */
3786 }
3787
3788 switch(escape)
3789 {
3790 case 0: /* Escaped character code point is in c */
3791 char_is_literal = FALSE;
3792 goto CLASS_LITERAL; /* (a few lines above) */
3793
3794 case ESC_b:
3795 c = CHAR_BS; /* \b is backspace in a class */
3796 char_is_literal = FALSE;
3797 goto CLASS_LITERAL;
3798
3799 case ESC_Q:
3800 inescq = TRUE; /* Enter literal mode */
3801 goto CLASS_CONTINUE;
3802
3803 case ESC_E: /* Ignore orphan \E */
3804 goto CLASS_CONTINUE;
3805
3806 case ESC_B: /* Always an error in a class */
3807 case ESC_R:
3808 case ESC_X:
3809 errorcode = ERR7;
3810 ptr--;
3811 goto FAILED;
3812 }
3813
3814 /* The second part of a range can be a single-character escape
3815 sequence (detected above), but not any of the other escapes. Perl
3816 treats a hyphen as a literal in such circumstances. However, in Perl's
3817 warning mode, a warning is given, so PCRE now faults it, as it is
3818 almost certainly a mistake on the user's part. */
3819
3820 if (class_range_state == RANGE_STARTED)
3821 {
3822 errorcode = ERR50;
3823 goto FAILED; /* Not CLASS_ESCAPE_FAILED; always an error */
3824 }
3825
3826 /* Of the remaining escapes, only those that define characters are
3827 allowed in a class. None may start a range. */
3828
3829 class_range_state = RANGE_NO;
3830 switch(escape)
3831 {
3832 case ESC_N:
3833 errorcode = ERR71;
3834 goto FAILED;
3835
3836 case ESC_H:
3837 case ESC_h:
3838 case ESC_V:
3839 case ESC_v:
3840 *parsed_pattern++ = META_ESCAPE + escape;
3841 break;
3842
3843 /* These escapes may be converted to Unicode property tests when
3844 PCRE2_UCP is set. */
3845
3846 case ESC_d:
3847 case ESC_D:
3848 case ESC_s:
3849 case ESC_S:
3850 case ESC_w:
3851 case ESC_W:
3852 parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
3853 xoptions);
3854 break;
3855
3856 /* Explicit Unicode property matching */
3857
3858 case ESC_P:
3859 case ESC_p:
3860 #ifdef SUPPORT_UNICODE
3861 {
3862 BOOL negated;
3863 uint16_t ptype = 0, pdata = 0;
3864 if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3865 goto FAILED;
3866 if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3867 *parsed_pattern++ = META_ESCAPE + escape;
3868 *parsed_pattern++ = (ptype << 16) | pdata;
3869 }
3870 #else
3871 errorcode = ERR45;
3872 goto FAILED;
3873 #endif
3874 break; /* End \P and \p */
3875
3876 default: /* All others are not allowed in a class */
3877 errorcode = ERR7;
3878 ptr--;
3879 goto FAILED;
3880 }
3881
3882 /* Perl gives a warning unless a following hyphen is the last character
3883 in the class. PCRE throws an error. */
3884
3885 if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3886 ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3887 {
3888 errorcode = ERR50;
3889 goto FAILED;
3890 }
3891 }
3892
3893 /* Proceed to next thing in the class. */
3894
3895 CLASS_CONTINUE:
3896 if (ptr >= ptrend)
3897 {
3898 errorcode = ERR6; /* Missing terminating ']' */
3899 goto FAILED;
3900 }
3901 GETCHARINCTEST(c, ptr);
3902 if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
3903 } /* End of class-processing loop */
3904
3905 /* -] at the end of a class is a literal '-' */
3906
3907 if (class_range_state == RANGE_STARTED)
3908 {
3909 parsed_pattern[-1] = CHAR_MINUS;
3910 class_range_state = RANGE_NO;
3911 }
3912
3913 *parsed_pattern++ = META_CLASS_END;
3914 break; /* End of character class */
3915
3916
3917 /* ---- Opening parenthesis ---- */
3918
3919 case CHAR_LEFT_PARENTHESIS:
3920 if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3921
3922 /* If ( is not followed by ? it is either a capture or a special verb or an
3923 alpha assertion or a positive non-atomic lookahead. */
3924
3925 if (*ptr != CHAR_QUESTION_MARK)
3926 {
3927 const char *vn;
3928
3929 /* Handle capturing brackets (or non-capturing if auto-capture is turned
3930 off). */
3931
3932 if (*ptr != CHAR_ASTERISK)
3933 {
3934 nest_depth++;
3935 if ((options & PCRE2_NO_AUTO_CAPTURE) == 0)
3936 {
3937 if (cb->bracount >= MAX_GROUP_NUMBER)
3938 {
3939 errorcode = ERR97;
3940 goto FAILED;
3941 }
3942 cb->bracount++;
3943 *parsed_pattern++ = META_CAPTURE | cb->bracount;
3944 }
3945 else *parsed_pattern++ = META_NOCAPTURE;
3946 }
3947
3948 /* Do nothing for (* followed by end of pattern or ) so it gives a "bad
3949 quantifier" error rather than "(*MARK) must have an argument". */
3950
3951 else if (ptrend - ptr <= 1 || (c = ptr[1]) == CHAR_RIGHT_PARENTHESIS)
3952 break;
3953
3954 /* Handle "alpha assertions" such as (*pla:...). Most of these are
3955 synonyms for the historical symbolic assertions, but the script run and
3956 non-atomic lookaround ones are new. They are distinguished by starting
3957 with a lower case letter, which is tested below via the character-type
3958 table (ctype_lcletter) for code points that CHMAX_255() accepts. */
3959
3960 else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
3961 {
3962 uint32_t meta;
3963
3964 vn = alasnames;
3965 if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
3966 &errorcode, cb)) goto FAILED;
3967 if (ptr >= ptrend || *ptr != CHAR_COLON)
3968 {
3969 errorcode = ERR95; /* Malformed */
3970 goto FAILED;
3971 }
3972
3973 /* Scan the table of alpha assertion names */
3974
3975 for (i = 0; i < alascount; i++)
3976 {
3977 if (namelen == alasmeta[i].len &&
3978 PRIV(strncmp_c8)(name, vn, namelen) == 0)
3979 break;
3980 vn += alasmeta[i].len + 1;
3981 }
3982
3983 if (i >= alascount)
3984 {
3985 errorcode = ERR95; /* Alpha assertion not recognized */
3986 goto FAILED;
3987 }
3988
3989 /* Check for expecting an assertion condition. If so, only atomic
3990 lookaround assertions are valid. */
3991
3992 meta = alasmeta[i].meta;
3993 if (prev_expect_cond_assert > 0 &&
3994 (meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT))
3995 {
3996 errorcode = (meta == META_LOOKAHEAD_NA || meta == META_LOOKBEHIND_NA)?
3997 ERR98 : ERR28; /* (Atomic) assertion expected */
3998 goto FAILED;
3999 }
4000
4001 /* The lookaround alphabetic synonyms can mostly be handled by jumping
4002 to the code that handles the traditional symbolic forms. */
4003
4004 switch(meta)
4005 {
4006 default:
4007 errorcode = ERR89; /* Unknown code; should never occur because */
4008 goto FAILED; /* the meta values come from a table above. */
4009
4010 case META_ATOMIC:
4011 goto ATOMIC_GROUP;
4012
4013 case META_LOOKAHEAD:
4014 goto POSITIVE_LOOK_AHEAD;
4015
4016 case META_LOOKAHEAD_NA:
4017 goto POSITIVE_NONATOMIC_LOOK_AHEAD;
4018
4019 case META_LOOKAHEADNOT:
4020 goto NEGATIVE_LOOK_AHEAD;
4021
4022 case META_LOOKBEHIND:
4023 case META_LOOKBEHINDNOT:
4024 case META_LOOKBEHIND_NA:
4025 *parsed_pattern++ = meta;
4026 ptr--;
4027 goto POST_LOOKBEHIND;
4028
4029 /* The script run facilities are handled here. Unicode support is
4030 required (give an error if not, as this is a security issue). Always
4031 record a META_SCRIPT_RUN item. Then, for the atomic version, insert
4032 META_ATOMIC and remember that we need two META_KETs at the end. */
4033
4034 case META_SCRIPT_RUN:
4035 case META_ATOMIC_SCRIPT_RUN:
4036 #ifdef SUPPORT_UNICODE
4037 *parsed_pattern++ = META_SCRIPT_RUN;
4038 nest_depth++;
4039 ptr++;
4040 if (meta == META_ATOMIC_SCRIPT_RUN)
4041 {
4042 *parsed_pattern++ = META_ATOMIC;
4043 if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4044 else if (++top_nest >= end_nests)
4045 {
4046 errorcode = ERR84;
4047 goto FAILED;
4048 }
4049 top_nest->nest_depth = nest_depth;
4050 top_nest->flags = NSF_ATOMICSR;
4051 top_nest->options = options & PARSE_TRACKED_OPTIONS;
4052 top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4053 }
4054 break;
4055 #else /* SUPPORT_UNICODE */
4056 errorcode = ERR96;
4057 goto FAILED;
4058 #endif
4059 }
4060 }
4061
4062
4063 /* ---- Handle (*VERB) and (*VERB:NAME) ---- */
4064
4065 else
4066 {
4067 vn = verbnames;
4068 if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
4069 &errorcode, cb)) goto FAILED;
4070 if (ptr >= ptrend || (*ptr != CHAR_COLON &&
4071 *ptr != CHAR_RIGHT_PARENTHESIS))
4072 {
4073 errorcode = ERR60; /* Malformed */
4074 goto FAILED;
4075 }
4076
4077 /* Scan the table of verb names */
4078
4079 for (i = 0; i < verbcount; i++)
4080 {
4081 if (namelen == verbs[i].len &&
4082 PRIV(strncmp_c8)(name, vn, namelen) == 0)
4083 break;
4084 vn += verbs[i].len + 1;
4085 }
4086
4087 if (i >= verbcount)
4088 {
4089 errorcode = ERR60; /* Verb not recognized */
4090 goto FAILED;
4091 }
4092
4093 /* An empty argument is treated as no argument. */
4094
4095 if (*ptr == CHAR_COLON && ptr + 1 < ptrend &&
4096 ptr[1] == CHAR_RIGHT_PARENTHESIS)
4097 ptr++; /* Advance to the closing parens */
4098
4099 /* Check for mandatory non-empty argument; this is (*MARK) */
4100
4101 if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON)
4102 {
4103 errorcode = ERR66;
4104 goto FAILED;
4105 }
4106
4107 /* Remember where this verb, possibly with a preceding (*MARK), starts,
4108 for handling quantified (*ACCEPT). */
4109
4110 verbstartptr = parsed_pattern;
4111 okquantifier = (verbs[i].meta == META_ACCEPT);
4112
4113 /* It appears that Perl allows any characters whatsoever, other than a
4114 closing parenthesis, to appear in arguments ("names"), so we no longer
4115 insist on letters, digits, and underscores. Perl does not, however, do
4116 any interpretation within arguments, and has no means of including a
4117 closing parenthesis. PCRE supports escape processing but only when it
4118 is requested by an option. We set inverbname TRUE here, and let the
4119 main loop take care of this so that escape and \x processing is done by
4120 the main code above. */
4121
4122 if (*ptr++ == CHAR_COLON) /* Skip past : or ) */
4123 {
4124 /* Some optional arguments can be treated as a preceding (*MARK) */
4125
4126 if (verbs[i].has_arg < 0)
4127 {
4128 add_after_mark = verbs[i].meta;
4129 *parsed_pattern++ = META_MARK;
4130 }
4131
4132 /* The remaining verbs with arguments (except *MARK) need a different
4133 opcode. */
4134
4135 else
4136 {
4137 *parsed_pattern++ = verbs[i].meta +
4138 ((verbs[i].meta != META_MARK)? 0x00010000u:0);
4139 }
4140
4141 /* Set up for reading the name in the main loop. */
4142
4143 verblengthptr = parsed_pattern++;
4144 verbnamestart = ptr;
4145 inverbname = TRUE;
4146 }
4147 else /* No verb "name" argument */
4148 {
4149 *parsed_pattern++ = verbs[i].meta;
4150 }
4151 } /* End of (*VERB) handling */
4152 break; /* Done with this parenthesis */
4153 } /* End of groups that don't start with (? */
4154
4155
4156 /* ---- Items starting (? ---- */
4157
4158 /* The type of item is determined by what follows (?. Handle (?| and option
4159 changes under "default" because both need a new block on the nest stack.
4160 Comments starting with (?# are handled above. Note that there is some
4161 ambiguity about the sequence (?- because if a digit follows it's a relative
4162 recursion or subroutine call whereas otherwise it's an option unsetting. */
4163
4164 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4165
4166 switch(*ptr)
4167 {
4168 default:
4169 if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1]))
4170 goto RECURSION_BYNUMBER; /* The + case is handled by CHAR_PLUS */
4171
4172 /* We now have either (?| or a (possibly empty) option setting,
4173 optionally followed by a non-capturing group. */
4174
4175 nest_depth++;
4176 if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4177 else if (++top_nest >= end_nests)
4178 {
4179 errorcode = ERR84;
4180 goto FAILED;
4181 }
4182 top_nest->nest_depth = nest_depth;
4183 top_nest->flags = 0;
4184 top_nest->options = options & PARSE_TRACKED_OPTIONS;
4185 top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4186
4187 /* Start of non-capturing group that resets the capture count for each
4188 branch. */
4189
4190 if (*ptr == CHAR_VERTICAL_LINE)
4191 {
4192 top_nest->reset_group = (uint16_t)cb->bracount;
4193 top_nest->max_group = (uint16_t)cb->bracount;
4194 top_nest->flags |= NSF_RESET;
4195 cb->external_flags |= PCRE2_DUPCAPUSED;
4196 *parsed_pattern++ = META_NOCAPTURE;
4197 ptr++;
4198 }
4199
4200 /* Scan for options aimnrsxJU (plus aD, aP, aS, aT, aW and xx) to be set or unset. */
4201
4202 else
4203 {
4204 BOOL hyphenok = TRUE;
4205 uint32_t oldoptions = options;
4206 uint32_t oldxoptions = xoptions;
4207
4208 top_nest->reset_group = 0;
4209 top_nest->max_group = 0;
4210 set = unset = 0;
4211 optset = &set;
4212 xset = xunset = 0;
4213 xoptset = &xset;
4214
4215 /* ^ at the start unsets irmnsx and disables the subsequent use of - */
4216
4217 if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)
4218 {
4219 options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE|
4220 PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE);
4221 xoptions &= ~(PCRE2_EXTRA_CASELESS_RESTRICT);
4222 hyphenok = FALSE;
4223 ptr++;
4224 }
4225
4226 while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&
4227 *ptr != CHAR_COLON)
4228 {
4229 switch (*ptr++)
4230 {
4231 case CHAR_MINUS:
4232 if (!hyphenok)
4233 {
4234 errorcode = ERR94;
4235 ptr--; /* Correct the offset */
4236 goto FAILED;
4237 }
4238 optset = &unset;
4239 xoptset = &xunset;
4240 hyphenok = FALSE;
4241 break;
4242
4243 /* There are some two-character sequences that start with 'a'. */
4244
4245 case CHAR_a:
4246 if (ptr < ptrend)
4247 {
4248 if (*ptr == CHAR_D)
4249 {
4250 *xoptset |= PCRE2_EXTRA_ASCII_BSD;
4251 ptr++;
4252 break;
4253 }
4254 if (*ptr == CHAR_P)
4255 {
4256 *xoptset |= (PCRE2_EXTRA_ASCII_POSIX|PCRE2_EXTRA_ASCII_DIGIT);
4257 ptr++;
4258 break;
4259 }
4260 if (*ptr == CHAR_S)
4261 {
4262 *xoptset |= PCRE2_EXTRA_ASCII_BSS;
4263 ptr++;
4264 break;
4265 }
4266 if (*ptr == CHAR_T)
4267 {
4268 *xoptset |= PCRE2_EXTRA_ASCII_DIGIT;
4269 ptr++;
4270 break;
4271 }
4272 if (*ptr == CHAR_W)
4273 {
4274 *xoptset |= PCRE2_EXTRA_ASCII_BSW;
4275 ptr++;
4276 break;
4277 }
4278 }
4279 *xoptset |= PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|
4280 PCRE2_EXTRA_ASCII_BSW|
4281 PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_ASCII_POSIX;
4282 break;
4283
4284 case CHAR_J: /* Record that it changed in the external options */
4285 *optset |= PCRE2_DUPNAMES;
4286 cb->external_flags |= PCRE2_JCHANGED;
4287 break;
4288
4289 case CHAR_i: *optset |= PCRE2_CASELESS; break;
4290 case CHAR_m: *optset |= PCRE2_MULTILINE; break;
4291 case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break;
4292 case CHAR_r: *xoptset|= PCRE2_EXTRA_CASELESS_RESTRICT; break;
4293 case CHAR_s: *optset |= PCRE2_DOTALL; break;
4294 case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
4295
4296 /* If x appears twice it sets the extended extended option. */
4297
4298 case CHAR_x:
4299 *optset |= PCRE2_EXTENDED;
4300 if (ptr < ptrend && *ptr == CHAR_x)
4301 {
4302 *optset |= PCRE2_EXTENDED_MORE;
4303 ptr++;
4304 }
4305 break;
4306
4307 default:
4308 errorcode = ERR11;
4309 ptr--; /* Correct the offset */
4310 goto FAILED;
4311 }
4312 }
4313
4314 /* If we are setting extended without extended-more, ensure that any
4315 existing extended-more gets unset. Also, unsetting extended must also
4316 unset extended-more. */
4317
4318 if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED ||
4319 (unset & PCRE2_EXTENDED) != 0)
4320 unset |= PCRE2_EXTENDED_MORE;
4321
4322 options = (options | set) & (~unset);
4323 xoptions = (xoptions | xset) & (~xunset);
4324
4325 /* If the options ended with ')' this is not the start of a nested
4326 group with option changes, so the options change at this level.
4327 In this case, if the previous level set up a nest block, discard the
4328 one we have just created. Otherwise adjust it for the previous level.
4329 If the options ended with ':' we are starting a non-capturing group,
4330 possibly with an options setting. */
4331
4332 if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4333 if (*ptr++ == CHAR_RIGHT_PARENTHESIS)
4334 {
4335 nest_depth--; /* This is not a nested group after all. */
4336 if (top_nest > (nest_save *)(cb->start_workspace) &&
4337 (top_nest-1)->nest_depth == nest_depth) top_nest--;
4338 else top_nest->nest_depth = nest_depth;
4339 }
4340 else *parsed_pattern++ = META_NOCAPTURE;
4341
4342 /* If nothing changed, no need to record. */
4343
4344 if (options != oldoptions || xoptions != oldxoptions)
4345 {
4346 *parsed_pattern++ = META_OPTIONS;
4347 *parsed_pattern++ = options;
4348 *parsed_pattern++ = xoptions;
4349 }
4350 } /* End options processing */
4351 break; /* End default case after (? */
4352
4353
4354 /* ---- Python syntax support ---- */
4355
4356 case CHAR_P:
4357 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4358
4359 /* (?P<name> is the same as (?<name>, which defines a named group. */
4360
4361 if (*ptr == CHAR_LESS_THAN_SIGN)
4362 {
4363 terminator = CHAR_GREATER_THAN_SIGN;
4364 goto DEFINE_NAME;
4365 }
4366
4367 /* (?P>name) is the same as (?&name), which is a recursion or subroutine
4368 call. */
4369
4370 if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME;
4371
4372 /* (?P=name) is the same as \k<name>, a back reference by name. Anything
4373 else after (?P is an error. */
4374
4375 if (*ptr != CHAR_EQUALS_SIGN)
4376 {
4377 errorcode = ERR41;
4378 goto FAILED;
4379 }
4380 if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4381 &namelen, &errorcode, cb)) goto FAILED;
4382 *parsed_pattern++ = META_BACKREF_BYNAME;
4383 *parsed_pattern++ = namelen;
4384 PUTOFFSET(offset, parsed_pattern);
4385 okquantifier = TRUE;
4386 break; /* End of (?P processing */
4387
4388
4389 /* ---- Recursion/subroutine calls by number ---- */
4390
4391 case CHAR_R:
4392 i = 0; /* (?R) == (?R0) */
4393 ptr++;
4394 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4395 {
4396 errorcode = ERR58;
4397 goto FAILED;
4398 }
4399 goto SET_RECURSION;
4400
4401 /* An item starting (?- followed by a digit comes here via the "default"
4402 case because (?- followed by a non-digit is an options setting. */
4403
4404 case CHAR_PLUS:
4405 if (ptrend - ptr < 2 || !IS_DIGIT(ptr[1]))
4406 {
4407 errorcode = ERR29; /* Missing number */
4408 goto FAILED;
4409 }
4410 /* Fall through */
4411
4412 case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
4413 case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
4414 RECURSION_BYNUMBER:
4415 if (!read_number(&ptr, ptrend,
4416 (IS_DIGIT(*ptr))? -1:(int)(cb->bracount), /* + and - are relative */
4417 MAX_GROUP_NUMBER, ERR61,
4418 &i, &errorcode)) goto FAILED;
4419 if (i < 0) /* NB (?0) is permitted */
4420 {
4421 errorcode = ERR15; /* Unknown group */
4422 goto FAILED_BACK;
4423 }
4424 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4425 goto UNCLOSED_PARENTHESIS;
4426
4427 SET_RECURSION:
4428 *parsed_pattern++ = META_RECURSE | (uint32_t)i;
4429 offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4430 ptr++;
4431 PUTOFFSET(offset, parsed_pattern);
4432 okquantifier = TRUE;
4433 break; /* End of recursive call by number handling */
4434
4435
4436 /* ---- Recursion/subroutine calls by name ---- */
4437
4438 case CHAR_AMPERSAND:
4439 RECURSE_BY_NAME:
4440 if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4441 &namelen, &errorcode, cb)) goto FAILED;
4442 *parsed_pattern++ = META_RECURSE_BYNAME;
4443 *parsed_pattern++ = namelen;
4444 PUTOFFSET(offset, parsed_pattern);
4445 okquantifier = TRUE;
4446 break;
4447
4448 /* ---- Callout with numerical or string argument ---- */
4449
4450 case CHAR_C:
4451 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4452
4453 /* If the previous item was a condition starting (?(? an assertion,
4454 optionally preceded by a callout, is expected. This is checked later on,
4455 during actual compilation. However we need to identify this kind of
4456 assertion in this pass because it must not be quantified. The value of
4457 expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4458 for a callout - still leaving a positive value that identifies the
4459 assertion. Multiple callouts or any other items will make it zero or
4460 less, which doesn't matter because they will cause an error later. */
4461
4462 expect_cond_assert = prev_expect_cond_assert - 1;
4463
4464 /* If previous_callout is not NULL, it means this follows a previous
4465 callout. If it was a manual callout, do nothing; this means its "length
4466 of next pattern item" field will remain zero. If it was an automatic
4467 callout, abolish it. */
4468
4469 if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 &&
4470 previous_callout == parsed_pattern - 4 &&
4471 parsed_pattern[-1] == 255)
4472 parsed_pattern = previous_callout;
4473
4474 /* Save for updating next pattern item length, and skip one item before
4475 completing. */
4476
4477 previous_callout = parsed_pattern;
4478 after_manual_callout = 1;
4479
4480 /* Handle a string argument; specific delimiter is required. */
4481
4482 if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
4483 {
4484 PCRE2_SIZE calloutlength;
4485 PCRE2_SPTR startptr = ptr;
4486
4487 delimiter = 0;
4488 for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
4489 {
4490 if (*ptr == PRIV(callout_start_delims)[i])
4491 {
4492 delimiter = PRIV(callout_end_delims)[i];
4493 break;
4494 }
4495 }
4496 if (delimiter == 0)
4497 {
4498 errorcode = ERR82;
4499 goto FAILED;
4500 }
4501
4502 *parsed_pattern = META_CALLOUT_STRING;
4503 parsed_pattern += 3; /* Skip pattern info */
4504
4505 for (;;)
4506 {
4507 if (++ptr >= ptrend)
4508 {
4509 errorcode = ERR81;
4510 ptr = startptr; /* To give a more useful message */
4511 goto FAILED;
4512 }
4513 if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter))
4514 break;
4515 }
4516
4517 calloutlength = (PCRE2_SIZE)(ptr - startptr);
4518 if (calloutlength > UINT32_MAX)
4519 {
4520 errorcode = ERR72;
4521 goto FAILED;
4522 }
4523 *parsed_pattern++ = (uint32_t)calloutlength;
4524 offset = (PCRE2_SIZE)(startptr - cb->start_pattern);
4525 PUTOFFSET(offset, parsed_pattern);
4526 }
4527
4528 /* Handle a callout with an optional numerical argument, which must be
4529 less than or equal to 255. A missing argument gives 0. */
4530
4531 else
4532 {
4533 int n = 0;
4534 *parsed_pattern = META_CALLOUT_NUMBER; /* Numerical callout */
4535 parsed_pattern += 3; /* Skip pattern info */
4536 while (ptr < ptrend && IS_DIGIT(*ptr))
4537 {
4538 n = n * 10 + *ptr++ - CHAR_0;
4539 if (n > 255)
4540 {
4541 errorcode = ERR38;
4542 goto FAILED;
4543 }
4544 }
4545 *parsed_pattern++ = n;
4546 }
4547
4548 /* Both formats must have a closing parenthesis */
4549
4550 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4551 {
4552 errorcode = ERR39;
4553 goto FAILED;
4554 }
4555 ptr++;
4556
4557 /* Remember the offset to the next item in the pattern, and set a default
4558 length. This should get updated after the next item is read. */
4559
4560 previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
4561 previous_callout[2] = 0;
4562 break; /* End callout */
4563
4564
4565 /* ---- Conditional group ---- */
4566
4567 /* A condition can be an assertion, a number (referring to a numbered
4568 group's having been set), a name (referring to a named group), or 'R',
4569 referring to overall recursion. R<digits> and R&name are also permitted
4570 for recursion state tests. Numbers may be preceded by + or - to specify a
4571 relative group number.
4572
4573 There are several syntaxes for testing a named group: (?(name)) is used
4574 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4575
4576 There are two unfortunate ambiguities. 'R' can be the recursive thing or
4577 the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be
4578 the Perl DEFINE feature or the Python named test. We look for a name
4579 first; if not found, we try the other case.
4580
4581 For compatibility with auto-callouts, we allow a callout to be specified
4582 before a condition that is an assertion. */
4583
4584 case CHAR_LEFT_PARENTHESIS:
4585 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4586 nest_depth++;
4587
4588 /* If the next character is ? or * there must be an assertion next
4589 (optionally preceded by a callout). We do not check this here, but
4590 instead we set expect_cond_assert to 2. If this is still greater than
4591 zero (callouts decrement it) when the next assertion is read, it will be
4592 marked as a condition that must not be repeated. A value greater than
4593 zero also causes checking that an assertion (possibly with callout)
4594 follows. */
4595
4596 if (*ptr == CHAR_QUESTION_MARK || *ptr == CHAR_ASTERISK)
4597 {
4598 *parsed_pattern++ = META_COND_ASSERT;
4599 ptr--; /* Pull pointer back to the opening parenthesis. */
4600 expect_cond_assert = 2;
4601 break; /* End of conditional */
4602 }
4603
4604 /* Handle (?([+-]number)... */
4605
4606 if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
4607 &errorcode))
4608 {
4609 if (i <= 0)
4610 {
4611 errorcode = ERR15;
4612 goto FAILED;
4613 }
4614 *parsed_pattern++ = META_COND_NUMBER;
4615 offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4616 PUTOFFSET(offset, parsed_pattern);
4617 *parsed_pattern++ = i;
4618 }
4619 else if (errorcode != 0) goto FAILED; /* Number too big */
4620
4621 /* No number found. Handle the special case (?(VERSION[>]=n.m)... */
4622
4623 else if (ptrend - ptr >= 10 &&
4624 PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
4625 ptr[7] != CHAR_RIGHT_PARENTHESIS)
4626 {
4627 uint32_t ge = 0;
4628 int major = 0;
4629 int minor = 0;
4630
4631 ptr += 7;
4632 if (*ptr == CHAR_GREATER_THAN_SIGN)
4633 {
4634 ge = 1;
4635 ptr++;
4636 }
4637
4638 /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
4639 references its argument twice. */
4640
4641 if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
4642 goto BAD_VERSION_CONDITION;
4643
4644 if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode))
4645 goto FAILED;
4646
4647 if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4648 if (*ptr == CHAR_DOT)
4649 {
4650 if (++ptr >= ptrend || !IS_DIGIT(*ptr)) goto BAD_VERSION_CONDITION;
4651 minor = (*ptr++ - CHAR_0) * 10;
4652 if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4653 if (IS_DIGIT(*ptr)) minor += *ptr++ - CHAR_0;
4654 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4655 goto BAD_VERSION_CONDITION;
4656 }
4657
4658 *parsed_pattern++ = META_COND_VERSION;
4659 *parsed_pattern++ = ge;
4660 *parsed_pattern++ = major;
4661 *parsed_pattern++ = minor;
4662 }
4663
4664 /* All the remaining cases now require us to read a name. We cannot at
4665 this stage distinguish ambiguous cases such as (?(R12) which might be a
4666 recursion test by number or a name, because the named groups have not yet
4667 all been identified. Those cases are treated as names, but given a
4668 different META code. */
4669
4670 else
4671 {
4672 BOOL was_r_ampersand = FALSE;
4673
4674 if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND)
4675 {
4676 terminator = CHAR_RIGHT_PARENTHESIS;
4677 was_r_ampersand = TRUE;
4678 ptr++;
4679 }
4680 else if (*ptr == CHAR_LESS_THAN_SIGN)
4681 terminator = CHAR_GREATER_THAN_SIGN;
4682 else if (*ptr == CHAR_APOSTROPHE)
4683 terminator = CHAR_APOSTROPHE;
4684 else
4685 {
4686 terminator = CHAR_RIGHT_PARENTHESIS;
4687 ptr--; /* Point to char before name */
4688 }
4689 if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4690 &errorcode, cb)) goto FAILED;
4691
4692 /* Handle (?(R&name) */
4693
4694 if (was_r_ampersand)
4695 {
4696 *parsed_pattern = META_COND_RNAME;
4697 ptr--; /* Back to closing parens */
4698 }
4699
4700 /* Handle (?(name). If the name is "DEFINE" we identify it with a
4701 special code. Likewise if the name consists of R followed only by
4702 digits. Otherwise, handle it like a quoted name. */
4703
4704 else if (terminator == CHAR_RIGHT_PARENTHESIS)
4705 {
4706 if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
4707 *parsed_pattern = META_COND_DEFINE;
4708 else
4709 {
4710 for (i = 1; i < (int)namelen; i++)
4711 if (!IS_DIGIT(name[i])) break;
4712 *parsed_pattern = (*name == CHAR_R && i >= (int)namelen)?
4713 META_COND_RNUMBER : META_COND_NAME;
4714 }
4715 ptr--; /* Back to closing parens */
4716 }
4717
4718 /* Handle (?('name') or (?(<name>) */
4719
4720 else *parsed_pattern = META_COND_NAME;
4721
4722 /* All these cases except DEFINE end with the name length and offset;
4723 DEFINE just has an offset (for the "too many branches" error). */
4724
4725 if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen;
4726 PUTOFFSET(offset, parsed_pattern);
4727 } /* End cases that read a name */
4728
4729 /* Check the closing parenthesis of the condition */
4730
4731 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4732 {
4733 errorcode = ERR24;
4734 goto FAILED;
4735 }
4736 ptr++;
4737 break; /* End of condition processing */
4738
4739
4740 /* ---- Atomic group ---- */
4741
4742 case CHAR_GREATER_THAN_SIGN:
4743 ATOMIC_GROUP: /* Come from (*atomic: */
4744 *parsed_pattern++ = META_ATOMIC;
4745 nest_depth++;
4746 ptr++;
4747 break;
4748
4749
4750 /* ---- Lookahead assertions ---- */
4751
4752 case CHAR_EQUALS_SIGN:
4753 POSITIVE_LOOK_AHEAD: /* Come from (*pla: */
4754 *parsed_pattern++ = META_LOOKAHEAD;
4755 ptr++;
4756 goto POST_ASSERTION;
4757
4758 case CHAR_ASTERISK:
4759 POSITIVE_NONATOMIC_LOOK_AHEAD: /* Come from (?* */
4760 *parsed_pattern++ = META_LOOKAHEAD_NA;
4761 ptr++;
4762 goto POST_ASSERTION;
4763
4764 case CHAR_EXCLAMATION_MARK:
4765 NEGATIVE_LOOK_AHEAD: /* Come from (*nla: */
4766 *parsed_pattern++ = META_LOOKAHEADNOT;
4767 ptr++;
4768 goto POST_ASSERTION;
4769
4770
4771 /* ---- Lookbehind assertions ---- */
4772
4773 /* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?<
4774 is the start of the name of a capturing group. */
4775
4776 case CHAR_LESS_THAN_SIGN:
4777 if (ptrend - ptr <= 1 ||
4778 (ptr[1] != CHAR_EQUALS_SIGN &&
4779 ptr[1] != CHAR_EXCLAMATION_MARK &&
4780 ptr[1] != CHAR_ASTERISK))
4781 {
4782 terminator = CHAR_GREATER_THAN_SIGN;
4783 goto DEFINE_NAME;
4784 }
4785 *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
4786 META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)?
4787 META_LOOKBEHINDNOT : META_LOOKBEHIND_NA;
4788
4789 POST_LOOKBEHIND: /* Come from (*plb: (*naplb: and (*nlb: */
4790 *has_lookbehind = TRUE;
4791 offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4792 PUTOFFSET(offset, parsed_pattern);
4793 ptr += 2;
4794 /* Fall through */
4795
4796 /* If the previous item was a condition starting (?(? an assertion,
4797 optionally preceded by a callout, is expected. This is checked later on,
4798 during actual compilation. However we need to identify this kind of
4799 assertion in this pass because it must not be qualified. The value of
4800 expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4801 for a callout - still leaving a positive value that identifies the
4802 assertion. Multiple callouts or any other items will make it zero or
4803 less, which doesn't matter because they will cause an error later. */
4804
4805 POST_ASSERTION:
4806 nest_depth++;
4807 if (prev_expect_cond_assert > 0)
4808 {
4809 if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4810 else if (++top_nest >= end_nests)
4811 {
4812 errorcode = ERR84;
4813 goto FAILED;
4814 }
4815 top_nest->nest_depth = nest_depth;
4816 top_nest->flags = NSF_CONDASSERT;
4817 top_nest->options = options & PARSE_TRACKED_OPTIONS;
4818 top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4819 }
4820 break;
4821
4822
4823 /* ---- Define a named group ---- */
4824
4825 /* A named group may be defined as (?'name') or (?<name>). In the latter
4826 case we jump to DEFINE_NAME from the disambiguation of (?< above with the
4827 terminator set to '>'. */
4828
4829 case CHAR_APOSTROPHE:
4830 terminator = CHAR_APOSTROPHE; /* Terminator */
4831
4832 DEFINE_NAME:
4833 if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4834 &errorcode, cb)) goto FAILED;
4835
4836 /* We have a name for this capturing group. It is also assigned a number,
4837 which is its primary means of identification. */
4838
4839 if (cb->bracount >= MAX_GROUP_NUMBER)
4840 {
4841 errorcode = ERR97;
4842 goto FAILED;
4843 }
4844 cb->bracount++;
4845 *parsed_pattern++ = META_CAPTURE | cb->bracount;
4846 nest_depth++;
4847
4848 /* Check not too many names */
4849
4850 if (cb->names_found >= MAX_NAME_COUNT)
4851 {
4852 errorcode = ERR49;
4853 goto FAILED;
4854 }
4855
4856 /* Adjust the entry size to accommodate the longest name found. */
4857
4858 if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
4859 cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
4860
4861 /* Scan the list to check for duplicates. For duplicate names, if the
4862 number is the same, break the loop, which causes the name to be
4863 discarded; otherwise, if DUPNAMES is not set, give an error.
4864 If it is set, allow the name with a different number, but continue
4865 scanning in case this is a duplicate with the same number. For
4866 non-duplicate names, give an error if the number is duplicated. */
4867
4868 isdupname = FALSE;
4869 ng = cb->named_groups;
4870 for (i = 0; i < cb->names_found; i++, ng++)
4871 {
4872 if (namelen == ng->length &&
4873 PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0)
4874 {
4875 if (ng->number == cb->bracount) break;
4876 if ((options & PCRE2_DUPNAMES) == 0)
4877 {
4878 errorcode = ERR43;
4879 goto FAILED;
4880 }
4881 isdupname = ng->isdup = TRUE; /* Mark as a duplicate */
4882 cb->dupnames = TRUE; /* Duplicate names exist */
4883 }
4884 else if (ng->number == cb->bracount)
4885 {
4886 errorcode = ERR65;
4887 goto FAILED;
4888 }
4889 }
4890
4891 if (i < cb->names_found) break; /* Ignore duplicate with same number */
4892
4893 /* Increase the list size if necessary */
4894
4895 if (cb->names_found >= cb->named_group_list_size)
4896 {
4897 uint32_t newsize = cb->named_group_list_size * 2;
4898 named_group *newspace =
4899 cb->cx->memctl.malloc(newsize * sizeof(named_group),
4900 cb->cx->memctl.memory_data);
4901 if (newspace == NULL)
4902 {
4903 errorcode = ERR21;
4904 goto FAILED;
4905 }
4906
4907 memcpy(newspace, cb->named_groups,
4908 cb->named_group_list_size * sizeof(named_group));
4909 if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
4910 cb->cx->memctl.free((void *)cb->named_groups,
4911 cb->cx->memctl.memory_data);
4912 cb->named_groups = newspace;
4913 cb->named_group_list_size = newsize;
4914 }
4915
4916 /* Add this name to the list */
4917
4918 cb->named_groups[cb->names_found].name = name;
4919 cb->named_groups[cb->names_found].length = (uint16_t)namelen;
4920 cb->named_groups[cb->names_found].number = cb->bracount;
4921 cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname;
4922 cb->names_found++;
4923 break;
4924 } /* End of (? switch */
4925 break; /* End of ( handling */
4926
4927
4928 /* ---- Branch terminators ---- */
4929
4930 /* Alternation: reset the capture count if we are in a (?| group. */
4931
4932 case CHAR_VERTICAL_LINE:
4933 if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
4934 (top_nest->flags & NSF_RESET) != 0)
4935 {
4936 if (cb->bracount > top_nest->max_group)
4937 top_nest->max_group = (uint16_t)cb->bracount;
4938 cb->bracount = top_nest->reset_group;
4939 }
4940 *parsed_pattern++ = META_ALT;
4941 break;
4942
4943 /* End of group; reset the capture count to the maximum if we are in a (?|
4944 group and/or reset the options that are tracked during parsing. Disallow
4945 quantifier for a condition that is an assertion. */
4946
4947 case CHAR_RIGHT_PARENTHESIS:
4948 okquantifier = TRUE;
4949 if (top_nest != NULL && top_nest->nest_depth == nest_depth)
4950 {
4951 options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options;
4952 xoptions = (xoptions & ~PARSE_TRACKED_EXTRA_OPTIONS) | top_nest->xoptions;
4953 if ((top_nest->flags & NSF_RESET) != 0 &&
4954 top_nest->max_group > cb->bracount)
4955 cb->bracount = top_nest->max_group;
4956 if ((top_nest->flags & NSF_CONDASSERT) != 0)
4957 okquantifier = FALSE;
4958
4959 if ((top_nest->flags & NSF_ATOMICSR) != 0)
4960 {
4961 *parsed_pattern++ = META_KET;
4962 }
4963
4964 if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
4965 else top_nest--;
4966 }
4967 if (nest_depth == 0) /* Unmatched closing parenthesis */
4968 {
4969 errorcode = ERR22;
4970 goto FAILED_BACK;
4971 }
4972 nest_depth--;
4973 *parsed_pattern++ = META_KET;
4974 break;
4975 } /* End of switch on pattern character */
4976 } /* End of main character scan loop */
4977
4978 /* End of pattern reached. Check for missing ) at the end of a verb name. */
4979
4980 if (inverbname && ptr >= ptrend)
4981 {
4982 errorcode = ERR60;
4983 goto FAILED;
4984 }
4985
4986 /* Manage callout for the final item */
4987
4988 PARSED_END:
4989 parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
4990 parsed_pattern, cb);
4991
4992 /* Insert trailing items for word and line matching (features provided for the
4993 benefit of pcre2grep). */
4994
4995 if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
4996 {
4997 *parsed_pattern++ = META_KET;
4998 *parsed_pattern++ = META_DOLLAR;
4999 }
5000 else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
5001 {
5002 *parsed_pattern++ = META_KET;
5003 *parsed_pattern++ = META_ESCAPE + ESC_b;
5004 }
5005
5006 /* Terminate the parsed pattern, then return success if all groups are closed.
5007 Otherwise we have unclosed parentheses. */
5008
5009 if (parsed_pattern >= parsed_pattern_end)
5010 {
5011 errorcode = ERR63; /* Internal error (parsed pattern overflow) */
5012 goto FAILED;
5013 }
5014
5015 *parsed_pattern = META_END;
5016 if (nest_depth == 0) return 0;
5017
5018 UNCLOSED_PARENTHESIS:
5019 errorcode = ERR14;
5020
5021 /* Come here for all failures. */
5022
5023 FAILED:
5024 cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);
5025 return errorcode;
5026
5027 /* Some errors need to indicate the previous character. */
5028
5029 FAILED_BACK:
5030 ptr--;
5031 goto FAILED;
5032
5033 /* This failure happens several times. */
5034
5035 BAD_VERSION_CONDITION:
5036 errorcode = ERR79;
5037 goto FAILED;
5038 }
5039
5040
5041
5042 /*************************************************
5043 * Find first significant opcode *
5044 *************************************************/
5045
5046 /* This is called by several functions that scan a compiled expression looking
5047 for a fixed first character, or an anchoring opcode etc. It skips over things
5048 that do not influence this. For some calls, it makes sense to skip negative
5049 forward and all backward assertions, and also the \b assertion; for others it
5050 does not.
5051
5052 Arguments:
5053 code pointer to the start of the group
5054 skipassert TRUE if certain assertions are to be skipped
5055
5056 Returns: pointer to the first significant opcode
5057 */
5058
5059 static const PCRE2_UCHAR*
first_significant_code(PCRE2_SPTR code,BOOL skipassert)5060 first_significant_code(PCRE2_SPTR code, BOOL skipassert)
5061 {
5062 for (;;)
5063 {
5064 switch ((int)*code)
5065 {
5066 case OP_ASSERT_NOT:
5067 case OP_ASSERTBACK:
5068 case OP_ASSERTBACK_NOT:
5069 case OP_ASSERTBACK_NA:
5070 if (!skipassert) return code;
5071 do code += GET(code, 1); while (*code == OP_ALT);
5072 code += PRIV(OP_lengths)[*code];
5073 break;
5074
5075 case OP_WORD_BOUNDARY:
5076 case OP_NOT_WORD_BOUNDARY:
5077 case OP_UCP_WORD_BOUNDARY:
5078 case OP_NOT_UCP_WORD_BOUNDARY:
5079 if (!skipassert) return code;
5080 /* Fall through */
5081
5082 case OP_CALLOUT:
5083 case OP_CREF:
5084 case OP_DNCREF:
5085 case OP_RREF:
5086 case OP_DNRREF:
5087 case OP_FALSE:
5088 case OP_TRUE:
5089 code += PRIV(OP_lengths)[*code];
5090 break;
5091
5092 case OP_CALLOUT_STR:
5093 code += GET(code, 1 + 2*LINK_SIZE);
5094 break;
5095
5096 case OP_SKIPZERO:
5097 code += 2 + GET(code, 2) + LINK_SIZE;
5098 break;
5099
5100 case OP_COND:
5101 case OP_SCOND:
5102 if (code[1+LINK_SIZE] != OP_FALSE || /* Not DEFINE */
5103 code[GET(code, 1)] != OP_KET) /* More than one branch */
5104 return code;
5105 code += GET(code, 1) + 1 + LINK_SIZE;
5106 break;
5107
5108 case OP_MARK:
5109 case OP_COMMIT_ARG:
5110 case OP_PRUNE_ARG:
5111 case OP_SKIP_ARG:
5112 case OP_THEN_ARG:
5113 code += code[1] + PRIV(OP_lengths)[*code];
5114 break;
5115
5116 default:
5117 return code;
5118 }
5119 }
5120 /* Control never reaches here */
5121 }
5122
5123
5124
5125 #ifdef SUPPORT_UNICODE
5126 /*************************************************
5127 * Get othercase range *
5128 *************************************************/
5129
5130 /* This function is passed the start and end of a class range in UCP mode. For
5131 single characters the range may be just one character long. The function
5132 searches up the characters, looking for ranges of characters in the "other"
5133 case. Each call returns the next one, updating the start address. A character
5134 with multiple other cases is returned on its own with a special return value.
5135
5136 Arguments:
5137 cptr points to starting character value; updated
5138 d end value
5139 ocptr where to put start of othercase range
5140 odptr where to put end of othercase range
5141 restricted TRUE if caseless restriction applies
5142
5143 Yield: -1 when no more
5144 0 when a range is returned
5145 >0 the CASESET offset for char with multiple other cases;
5146 for this return, *ocptr contains the original
5147 */
5148
5149 static int
get_othercase_range(uint32_t * cptr,uint32_t d,uint32_t * ocptr,uint32_t * odptr,BOOL restricted)5150 get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr,
5151 uint32_t *odptr, BOOL restricted)
5152 {
5153 uint32_t c, othercase, next;
5154 unsigned int co;
5155
5156 /* Find the first character that has an other case. If it has multiple other
5157 cases, return its case offset value. When CASELESS_RESTRICT is set, ignore the
5158 multi-case entries that begin with ASCII values. In 32-bit mode, a value
5159 greater than the Unicode maximum ends the range. */
5160
5161 for (c = *cptr; c <= d; c++)
5162 {
5163 #if PCRE2_CODE_UNIT_WIDTH == 32
5164 if (c > MAX_UTF_CODE_POINT) return -1;
5165 #endif
5166 if ((co = UCD_CASESET(c)) != 0 &&
5167 (!restricted || PRIV(ucd_caseless_sets)[co] > 127))
5168 {
5169 *ocptr = c++; /* Character that has the set */
5170 *cptr = c; /* Rest of input range */
5171 return (int)co;
5172 }
5173
5174 /* This is not a valid multiple-case character. Check that the single other
5175 case is different to the original. We don't need to check "restricted" here
5176 because the non-ASCII characters with multiple cases that include an ASCII
5177 character don't have a different "othercase". */
5178
5179 if ((othercase = UCD_OTHERCASE(c)) != c) break;
5180 }
5181
5182 if (c > d) return -1; /* Reached end of range */
5183
5184 /* Found a character that has a single other case. Search for the end of the
5185 range, which is either the end of the input range, or a character that has zero
5186 or more than one other cases. */
5187
5188 *ocptr = othercase;
5189 next = othercase + 1;
5190
5191 for (++c; c <= d; c++)
5192 {
5193 if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
5194 next++;
5195 }
5196
5197 *odptr = next - 1; /* End of othercase range */
5198 *cptr = c; /* Rest of input range */
5199 return 0;
5200 }
5201 #endif /* SUPPORT_UNICODE */
5202
5203
5204
5205 /*************************************************
5206 * Add a character or range to a class (internal) *
5207 *************************************************/
5208
5209 /* This function packages up the logic of adding a character or range of
5210 characters to a class. The character values in the arguments will be within the
5211 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
5212 called only from within the "add to class" group of functions, some of which
5213 are recursive and mutually recursive. The external entry point is
5214 add_to_class().
5215
5216 Arguments:
5217 classbits the bit map for characters < 256
5218 uchardptr points to the pointer for extra data
5219 options the options bits
5220 xoptions the extra options bits
5221 cb compile data
5222 start start of range character
5223 end end of range character
5224
5225 Returns: the number of < 256 characters added
5226 the pointer to extra data is updated
5227 */
5228
static unsigned int
add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
  uint32_t options, uint32_t xoptions, compile_block *cb, uint32_t start,
  uint32_t end)
{
/* Add the range start..end to a class. Values up to 0xff are recorded in the
classbits bitmap; in wide-character builds, values above 0xff are appended to
the extra data at *uchardptr, which is advanced. When PCRE2_CASELESS is set,
the other-case characters are added first. The return value counts every bit
set in the bitmap by this call and its recursive calls; a bit that is set more
than once is counted each time. */

uint32_t c;
uint32_t classbits_end = (end <= 0xff ? end : 0xff);  /* Last bitmap value */
unsigned int n8 = 0;

/* If caseless matching is required, scan the range and process alternate
cases. In Unicode, there are 8-bit characters that have alternate cases that
are greater than 255 and vice-versa (though these may be ignored if caseless
restriction is in force). Sometimes we can just extend the original range. */

if ((options & PCRE2_CASELESS) != 0)
  {
#ifdef SUPPORT_UNICODE
  if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
    {
    int rc;
    uint32_t oc, od;

    options &= ~PCRE2_CASELESS;   /* Remove for recursive calls */
    c = start;

    /* Each call yields either one character with a caseless set (rc > 0, the
    character in oc) or an other-case range oc..od (rc == 0), and advances c.
    A negative value ends the scan. */

    while ((rc = get_othercase_range(&c, end, &oc, &od,
             (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)) >= 0)
      {
      /* Handle a single character that has more than one other case. The
      list of equivalents is added, omitting oc itself. */

      if (rc > 0) n8 += add_list_to_class_internal(classbits, uchardptr,
        options, xoptions, cb, PRIV(ucd_caseless_sets) + rc, oc);

      /* Do nothing if the other case range is within the original range. */

      else if (oc >= cb->class_range_start && od <= cb->class_range_end)
        continue;

      /* Extend the original range if there is overlap, noting that if oc < c,
      we can't have od > end because a subrange is always shorter than the
      basic range. Otherwise, use a recursive call to add the additional range.
      */

      else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
      else if (od > end && oc <= end + 1)
        {
        end = od;       /* Extend upwards */
        if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
        }
      else n8 += add_to_class_internal(classbits, uchardptr, options, xoptions,
        cb, oc, od);
      }
    }
  else
#else
  (void)xoptions;   /* Avoid compiler warning */
#endif  /* SUPPORT_UNICODE */

  /* Neither UTF nor UCP is set, or there is no Unicode support. With
  SUPPORT_UNICODE defined, this loop is the body of the "else" above; without
  it, the loop runs for every caseless call. For each value that goes in the
  bitmap, the bit selected by cb->fcc[c] is set (presumably fcc is the
  case-flipping table; confirm against the compile_block definition). */

  for (c = start; c <= classbits_end; c++)
    {
    SETBIT(classbits, cb->fcc[c]);
    n8++;
    }
  }

/* Now handle the originally supplied range. Adjust the final value according
to the bit length - this means that the same lists of (e.g.) horizontal spaces
can be used in all cases. */

if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR)
  end = MAX_NON_UTF_CHAR;

/* A range lying strictly inside the overall range recorded in cb is not added
again here. */

if (start > cb->class_range_start && end < cb->class_range_end) return n8;

/* Use the bitmap for characters < 256. Otherwise use extra data. */

for (c = start; c <= classbits_end; c++)
  {
  /* Regardless of start, c will always be <= 255. */
  SETBIT(classbits, c);
  n8++;
  }

#ifdef SUPPORT_WIDE_CHARS
/* Values up to 0xff were dealt with by the bitmap loop; only what lies above
is considered for the extra data. */

if (start <= 0xff) start = 0xff + 1;

if (end >= start)
  {
  PCRE2_UCHAR *uchardata = *uchardptr;

#ifdef SUPPORT_UNICODE
  /* In UTF mode each value is written in its UTF encoding; a single character
  is stored as XCL_SINGLE rather than as a one-character range. */

  if ((options & PCRE2_UTF) != 0)
    {
    if (start < end)
      {
      *uchardata++ = XCL_RANGE;
      uchardata += PRIV(ord2utf)(start, uchardata);
      uchardata += PRIV(ord2utf)(end, uchardata);
      }
    else if (start == end)
      {
      *uchardata++ = XCL_SINGLE;
      uchardata += PRIV(ord2utf)(start, uchardata);
      }
    }
  else
#endif  /* SUPPORT_UNICODE */

  /* Without UTF support, character values are constrained by the bit length,
  and can only be > 256 for 16-bit and 32-bit libraries. In the 8-bit library
  the empty block below is all that remains, so nothing is written. */

#if PCRE2_CODE_UNIT_WIDTH == 8
    {}
#else
  if (start < end)
    {
    *uchardata++ = XCL_RANGE;
    *uchardata++ = start;
    *uchardata++ = end;
    }
  else if (start == end)
    {
    *uchardata++ = XCL_SINGLE;
    *uchardata++ = start;
    }
#endif  /* PCRE2_CODE_UNIT_WIDTH == 8 */
  *uchardptr = uchardata;   /* Update extra data pointer */
  }
#else  /* SUPPORT_WIDE_CHARS */
  (void)uchardptr;    /* Avoid compiler warning */
#endif /* SUPPORT_WIDE_CHARS */

return n8;    /* Number of 8-bit characters */
}
5365
5366
5367
5368 #ifdef SUPPORT_UNICODE
5369 /*************************************************
5370 * Add a list of characters to a class (internal) *
5371 *************************************************/
5372
5373 /* This function is used for adding a list of case-equivalent characters to a
5374 class when in UTF mode. This function is called only from within
5375 add_to_class_internal(), with which it is mutually recursive.
5376
5377 Arguments:
5378 classbits the bit map for characters < 256
5379 uchardptr points to the pointer for extra data
5380 options the options bits
5381 xoptions the extra options bits
5382 cb contains pointers to tables etc.
5383 p points to row of 32-bit values, terminated by NOTACHAR
5384 except character to omit; this is used when adding lists of
5385 case-equivalent characters to avoid including the one we
5386 already know about
5387
5388 Returns: the number of < 256 characters added
5389 the pointer to extra data is updated
5390 */
5391
5392 static unsigned int
add_list_to_class_internal(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,uint32_t xoptions,compile_block * cb,const uint32_t * p,unsigned int except)5393 add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5394 uint32_t options, uint32_t xoptions, compile_block *cb, const uint32_t *p,
5395 unsigned int except)
5396 {
5397 unsigned int n8 = 0;
5398 while (p[0] < NOTACHAR)
5399 {
5400 unsigned int n = 0;
5401 if (p[0] != except)
5402 {
5403 while(p[n+1] == p[0] + n + 1) n++;
5404 n8 += add_to_class_internal(classbits, uchardptr, options, xoptions, cb,
5405 p[0], p[n]);
5406 }
5407 p += n + 1;
5408 }
5409 return n8;
5410 }
5411 #endif
5412
5413
5414
5415 /*************************************************
5416 * External entry point for add range to class *
5417 *************************************************/
5418
5419 /* This function sets the overall range so that the internal functions can try
5420 to avoid duplication when handling case-independence.
5421
5422 Arguments:
5423 classbits the bit map for characters < 256
5424 uchardptr points to the pointer for extra data
5425 options the options bits
5426 xoptions the extra options bits
5427 cb compile data
5428 start start of range character
5429 end end of range character
5430
5431 Returns: the number of < 256 characters added
5432 the pointer to extra data is updated
5433 */
5434
5435 static unsigned int
add_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,uint32_t xoptions,compile_block * cb,uint32_t start,uint32_t end)5436 add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5437 uint32_t xoptions, compile_block *cb, uint32_t start, uint32_t end)
5438 {
5439 cb->class_range_start = start;
5440 cb->class_range_end = end;
5441 return add_to_class_internal(classbits, uchardptr, options, xoptions, cb,
5442 start, end);
5443 }
5444
5445
5446 /*************************************************
5447 * External entry point for add list to class *
5448 *************************************************/
5449
5450 /* This function is used for adding a list of horizontal or vertical whitespace
5451 characters to a class. The list must be in order so that ranges of characters
5452 can be detected and handled appropriately. This function sets the overall range
5453 so that the internal functions can try to avoid duplication when handling
5454 case-independence.
5455
5456 Arguments:
5457 classbits the bit map for characters < 256
5458 uchardptr points to the pointer for extra data
5459 options the options bits
5460 xoptions the extra options bits
5461 cb contains pointers to tables etc.
5462 p points to row of 32-bit values, terminated by NOTACHAR
5463 except character to omit; this is used when adding lists of
5464 case-equivalent characters to avoid including the one we
5465 already know about
5466
5467 Returns: the number of < 256 characters added
5468 the pointer to extra data is updated
5469 */
5470
5471 static unsigned int
add_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,uint32_t xoptions,compile_block * cb,const uint32_t * p,unsigned int except)5472 add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5473 uint32_t xoptions, compile_block *cb, const uint32_t *p, unsigned int except)
5474 {
5475 unsigned int n8 = 0;
5476 while (p[0] < NOTACHAR)
5477 {
5478 unsigned int n = 0;
5479 if (p[0] != except)
5480 {
5481 while(p[n+1] == p[0] + n + 1) n++;
5482 cb->class_range_start = p[0];
5483 cb->class_range_end = p[n];
5484 n8 += add_to_class_internal(classbits, uchardptr, options, xoptions, cb,
5485 p[0], p[n]);
5486 }
5487 p += n + 1;
5488 }
5489 return n8;
5490 }
5491
5492
5493
5494 /*************************************************
5495 * Add characters not in a list to a class *
5496 *************************************************/
5497
5498 /* This function is used for adding the complement of a list of horizontal or
5499 vertical whitespace to a class. The list must be in order.
5500
5501 Arguments:
5502 classbits the bit map for characters < 256
5503 uchardptr points to the pointer for extra data
5504 options the options bits
5505 xoptions the extra options bits
5506 cb contains pointers to tables etc.
5507 p points to row of 32-bit values, terminated by NOTACHAR
5508
5509 Returns: the number of < 256 characters added
5510 the pointer to extra data is updated
5511 */
5512
5513 static unsigned int
add_not_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,uint32_t xoptions,compile_block * cb,const uint32_t * p)5514 add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5515 uint32_t options, uint32_t xoptions, compile_block *cb, const uint32_t *p)
5516 {
5517 BOOL utf = (options & PCRE2_UTF) != 0;
5518 unsigned int n8 = 0;
5519 if (p[0] > 0)
5520 n8 += add_to_class(classbits, uchardptr, options, xoptions, cb, 0, p[0] - 1);
5521 while (p[0] < NOTACHAR)
5522 {
5523 while (p[1] == p[0] + 1) p++;
5524 n8 += add_to_class(classbits, uchardptr, options, xoptions, cb, p[0] + 1,
5525 (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
5526 p++;
5527 }
5528 return n8;
5529 }
5530
5531
5532
5533 /*************************************************
5534 * Find details of duplicate group names *
5535 *************************************************/
5536
5537 /* This is called from compile_branch() when it needs to know the index and
5538 count of duplicates in the names table when processing named backreferences,
5539 either directly, or as conditions.
5540
5541 Arguments:
5542 name points to the name
5543 length the length of the name
5544 indexptr where to put the index
5545 countptr where to put the count of duplicates
5546 errorcodeptr where to put an error code
5547 cb the compile block
5548
5549 Returns: TRUE if OK, FALSE if not, error code set
5550 */
5551
5552 static BOOL
find_dupname_details(PCRE2_SPTR name,uint32_t length,int * indexptr,int * countptr,int * errorcodeptr,compile_block * cb)5553 find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr,
5554 int *countptr, int *errorcodeptr, compile_block *cb)
5555 {
5556 uint32_t i, groupnumber;
5557 int count;
5558 PCRE2_UCHAR *slot = cb->name_table;
5559
5560 /* Find the first entry in the table */
5561
5562 for (i = 0; i < cb->names_found; i++)
5563 {
5564 if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) == 0 &&
5565 slot[IMM2_SIZE+length] == 0) break;
5566 slot += cb->name_entry_size;
5567 }
5568
5569 /* This should not occur, because this function is called only when we know we
5570 have duplicate names. Give an internal error. */
5571
5572 if (i >= cb->names_found)
5573 {
5574 *errorcodeptr = ERR53;
5575 cb->erroroffset = name - cb->start_pattern;
5576 return FALSE;
5577 }
5578
5579 /* Record the index and then see how many duplicates there are, updating the
5580 backref map and maximum back reference as we do. */
5581
5582 *indexptr = i;
5583 count = 0;
5584
5585 for (;;)
5586 {
5587 count++;
5588 groupnumber = GET2(slot,0);
5589 cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
5590 if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
5591 if (++i >= cb->names_found) break;
5592 slot += cb->name_entry_size;
5593 if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) != 0 ||
5594 (slot+IMM2_SIZE)[length] != 0) break;
5595 }
5596
5597 *countptr = count;
5598 return TRUE;
5599 }
5600
5601
5602
5603 /*************************************************
5604 * Compile one branch *
5605 *************************************************/
5606
5607 /* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If
5608 the options are changed during the branch, the pointer is used to change the
5609 external options bits. This function is used during the pre-compile phase when
5610 we are trying to find out the amount of memory needed, as well as during the
5611 real compile phase. The value of lengthptr distinguishes the two phases.
5612
5613 Arguments:
5614 optionsptr pointer to the option bits
5615 xoptionsptr pointer to the extra option bits
5616 codeptr points to the pointer to the current code point
5617 pptrptr points to the current parsed pattern pointer
5618 errorcodeptr points to error code variable
5619 firstcuptr place to put the first required code unit
5620 firstcuflagsptr place to put the first code unit flags
5621 reqcuptr place to put the last required code unit
5622 reqcuflagsptr place to put the last required code unit flags
5623 bcptr points to current branch chain
5624 open_caps points to current capitem
5625 cb contains pointers to tables etc.
5626 lengthptr NULL during the real compile phase
5627 points to length accumulator during pre-compile phase
5628
5629 Returns: 0 There's been an error, *errorcodeptr is non-zero
5630 +1 Success, this branch must match at least one character
5631 -1 Success, this branch may match an empty string
5632 */
5633
5634 static int
compile_branch(uint32_t * optionsptr,uint32_t * xoptionsptr,PCRE2_UCHAR ** codeptr,uint32_t ** pptrptr,int * errorcodeptr,uint32_t * firstcuptr,uint32_t * firstcuflagsptr,uint32_t * reqcuptr,uint32_t * reqcuflagsptr,branch_chain * bcptr,open_capitem * open_caps,compile_block * cb,PCRE2_SIZE * lengthptr)5635 compile_branch(uint32_t *optionsptr, uint32_t *xoptionsptr,
5636 PCRE2_UCHAR **codeptr, uint32_t **pptrptr, int *errorcodeptr,
5637 uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
5638 uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
5639 compile_block *cb, PCRE2_SIZE *lengthptr)
5640 {
5641 int bravalue = 0;
5642 int okreturn = -1;
5643 int group_return = 0;
5644 uint32_t repeat_min = 0, repeat_max = 0; /* To please picky compilers */
5645 uint32_t greedy_default, greedy_non_default;
5646 uint32_t repeat_type, op_type;
5647 uint32_t options = *optionsptr; /* May change dynamically */
5648 uint32_t xoptions = *xoptionsptr; /* May change dynamically */
5649 uint32_t firstcu, reqcu;
5650 uint32_t zeroreqcu, zerofirstcu;
5651 uint32_t escape;
5652 uint32_t *pptr = *pptrptr;
5653 uint32_t meta, meta_arg;
5654 uint32_t firstcuflags, reqcuflags;
5655 uint32_t zeroreqcuflags, zerofirstcuflags;
5656 uint32_t req_caseopt, reqvary, tempreqvary;
5657 PCRE2_SIZE offset = 0;
5658 PCRE2_SIZE length_prevgroup = 0;
5659 PCRE2_UCHAR *code = *codeptr;
5660 PCRE2_UCHAR *last_code = code;
5661 PCRE2_UCHAR *orig_code = code;
5662 PCRE2_UCHAR *tempcode;
5663 PCRE2_UCHAR *previous = NULL;
5664 PCRE2_UCHAR op_previous;
5665 BOOL groupsetfirstcu = FALSE;
5666 BOOL had_accept = FALSE;
5667 BOOL matched_char = FALSE;
5668 BOOL previous_matched_char = FALSE;
5669 BOOL reset_caseful = FALSE;
5670 const uint8_t *cbits = cb->cbits;
5671 uint8_t classbits[32];
5672
5673 /* We can fish out the UTF setting once and for all into a BOOL, but we must
5674 not do this for other options (e.g. PCRE2_EXTENDED) that may change dynamically
5675 as we process the pattern. */
5676
5677 #ifdef SUPPORT_UNICODE
5678 BOOL utf = (options & PCRE2_UTF) != 0;
5679 BOOL ucp = (options & PCRE2_UCP) != 0;
5680 #else /* No Unicode support */
5681 BOOL utf = FALSE;
5682 #endif
5683
5684 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
5685 class_uchardata always so that it can be passed to add_to_class() always,
5686 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
5687 alternative calls for the different cases. */
5688
5689 PCRE2_UCHAR *class_uchardata;
5690 #ifdef SUPPORT_WIDE_CHARS
5691 BOOL xclass;
5692 PCRE2_UCHAR *class_uchardata_base;
5693 #endif
5694
5695 /* Set up the default and non-default settings for greediness */
5696
5697 greedy_default = ((options & PCRE2_UNGREEDY) != 0);
5698 greedy_non_default = greedy_default ^ 1;
5699
5700 /* Initialize no first unit, no required unit. REQ_UNSET means "no char
5701 matching encountered yet". It gets changed to REQ_NONE if we hit something that
5702 matches a non-fixed first unit; reqcu just remains unset if we never find one.
5703
5704 When we hit a repeat whose minimum is zero, we may have to adjust these values
5705 to take the zero repeat into account. This is implemented by setting them to
5706 zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
5707 item types that can be repeated set these backoff variables appropriately. */
5708
5709 firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
5710 firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
5711
5712 /* The variable req_caseopt contains either the REQ_CASELESS bit or zero,
5713 according to the current setting of the caseless flag. The REQ_CASELESS value
5714 leaves the lower 28 bits empty. It is added into the firstcu or reqcu variables
5715 to record the case status of the value. This is used only for ASCII characters.
5716 */
5717
5718 req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
5719
5720 /* Switch on next META item until the end of the branch */
5721
5722 for (;; pptr++)
5723 {
5724 #ifdef SUPPORT_WIDE_CHARS
5725 BOOL xclass_has_prop;
5726 #endif
5727 BOOL negate_class;
5728 BOOL should_flip_negation;
5729 BOOL match_all_or_no_wide_chars;
5730 BOOL possessive_quantifier;
5731 BOOL note_group_empty;
5732 int class_has_8bitchar;
5733 uint32_t mclength;
5734 uint32_t skipunits;
5735 uint32_t subreqcu, subfirstcu;
5736 uint32_t groupnumber;
5737 uint32_t verbarglen, verbculen;
5738 uint32_t subreqcuflags, subfirstcuflags;
5739 open_capitem *oc;
5740 PCRE2_UCHAR mcbuffer[8];
5741
5742 /* Get next META item in the pattern and its potential argument. */
5743
5744 meta = META_CODE(*pptr);
5745 meta_arg = META_DATA(*pptr);
5746
5747 /* If we are in the pre-compile phase, accumulate the length used for the
5748 previous cycle of this loop, unless the next item is a quantifier. */
5749
5750 if (lengthptr != NULL)
5751 {
5752 if (code > cb->start_workspace + cb->workspace_size -
5753 WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
5754 {
5755 *errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)?
5756 ERR52 : ERR86;
5757 return 0;
5758 }
5759
5760 /* There is at least one situation where code goes backwards: this is the
5761 case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier
5762 is processed, the whole class is eliminated. However, it is created first,
5763 so we have to allow memory for it. Therefore, don't ever reduce the length
5764 at this point. */
5765
5766 if (code < last_code) code = last_code;
5767
5768 /* If the next thing is not a quantifier, we add the length of the previous
5769 item into the total, and reset the code pointer to the start of the
5770 workspace. Otherwise leave the previous item available to be quantified. */
5771
5772 if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5773 {
5774 if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))
5775 {
5776 *errorcodeptr = ERR20; /* Integer overflow */
5777 return 0;
5778 }
5779 *lengthptr += (PCRE2_SIZE)(code - orig_code);
5780 if (*lengthptr > MAX_PATTERN_SIZE)
5781 {
5782 *errorcodeptr = ERR20; /* Pattern is too large */
5783 return 0;
5784 }
5785 code = orig_code;
5786 }
5787
5788 /* Remember where this code item starts so we can catch the "backwards"
5789 case above next time round. */
5790
5791 last_code = code;
5792 }
5793
5794 /* Process the next parsed pattern item. If it is not a quantifier, remember
5795 where it starts so that it can be quantified when a quantifier follows.
5796 Checking for the legality of quantifiers happens in parse_regex(), except for
5797 a quantifier after an assertion that is a condition. */
5798
5799 if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5800 {
5801 previous = code;
5802 if (matched_char && !had_accept) okreturn = 1;
5803 }
5804
5805 previous_matched_char = matched_char;
5806 matched_char = FALSE;
5807 note_group_empty = FALSE;
5808 skipunits = 0; /* Default value for most subgroups */
5809
5810 switch(meta)
5811 {
5812 /* ===================================================================*/
5813 /* The branch terminates at pattern end or | or ) */
5814
5815 case META_END:
5816 case META_ALT:
5817 case META_KET:
5818 *firstcuptr = firstcu;
5819 *firstcuflagsptr = firstcuflags;
5820 *reqcuptr = reqcu;
5821 *reqcuflagsptr = reqcuflags;
5822 *codeptr = code;
5823 *pptrptr = pptr;
5824 return okreturn;
5825
5826
5827 /* ===================================================================*/
5828 /* Handle single-character metacharacters. In multiline mode, ^ disables
5829 the setting of any following char as a first character. */
5830
5831 case META_CIRCUMFLEX:
5832 if ((options & PCRE2_MULTILINE) != 0)
5833 {
5834 if (firstcuflags == REQ_UNSET)
5835 zerofirstcuflags = firstcuflags = REQ_NONE;
5836 *code++ = OP_CIRCM;
5837 }
5838 else *code++ = OP_CIRC;
5839 break;
5840
5841 case META_DOLLAR:
5842 *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
5843 break;
5844
5845 /* There can never be a first char if '.' is first, whatever happens about
5846 repeats. The value of reqcu doesn't change either. */
5847
5848 case META_DOT:
5849 matched_char = TRUE;
5850 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5851 zerofirstcu = firstcu;
5852 zerofirstcuflags = firstcuflags;
5853 zeroreqcu = reqcu;
5854 zeroreqcuflags = reqcuflags;
5855 *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
5856 break;
5857
5858
5859 /* ===================================================================*/
5860 /* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.
5861 Otherwise, an initial ']' is taken as a data character. When empty classes
5862 are allowed, [] must always fail, so generate OP_FAIL, whereas [^] must
5863 match any character, so generate OP_ALLANY. */
5864
5865 case META_CLASS_EMPTY:
5866 case META_CLASS_EMPTY_NOT:
5867 matched_char = TRUE;
5868 *code++ = (meta == META_CLASS_EMPTY_NOT)? OP_ALLANY : OP_FAIL;
5869 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5870 zerofirstcu = firstcu;
5871 zerofirstcuflags = firstcuflags;
5872 break;
5873
5874
5875 /* ===================================================================*/
5876 /* Non-empty character class. If the included characters are all < 256, we
5877 build a 32-byte bitmap of the permitted characters, except in the special
5878 case where there is only one such character. For negated classes, we build
5879 the map as usual, then invert it at the end. However, we use a different
5880 opcode so that data characters > 255 can be handled correctly.
5881
5882 If the class contains characters outside the 0-255 range, a different
5883 opcode is compiled. It may optionally have a bit map for characters < 256,
5884 but those above are explicitly listed afterwards. A flag code unit tells
5885 whether the bitmap is present, and whether this is a negated class or
5886 not. */
5887
5888 case META_CLASS_NOT:
5889 case META_CLASS:
5890 matched_char = TRUE;
5891 negate_class = meta == META_CLASS_NOT;
5892
5893 /* We can optimize the case of a single character in a class by generating
5894 OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's
5895 negative. In the negative case there can be no first char if this item is
5896 first, whatever repeat count may follow. In the case of reqcu, save the
5897 previous value for reinstating. */
5898
5899 /* NOTE: at present this optimization is not effective if the only
5900 character in a class in 32-bit, non-UCP mode has its top bit set. */
5901
5902 if (pptr[1] < META_END && pptr[2] == META_CLASS_END)
5903 {
5904 #ifdef SUPPORT_UNICODE
5905 uint32_t d;
5906 #endif
5907 uint32_t c = pptr[1];
5908
5909 pptr += 2; /* Move on to class end */
5910 if (meta == META_CLASS) /* A positive one-char class can be */
5911 { /* handled as a normal literal character. */
5912 meta = c; /* Set up the character */
5913 goto NORMAL_CHAR_SET;
5914 }
5915
5916 /* Handle a negative one-character class */
5917
5918 zeroreqcu = reqcu;
5919 zeroreqcuflags = reqcuflags;
5920 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5921 zerofirstcu = firstcu;
5922 zerofirstcuflags = firstcuflags;
5923
5924 /* For caseless UTF or UCP mode, check whether this character has more
5925 than one other case. If so, generate a special OP_NOTPROP item instead of
5926 OP_NOTI. When restricted by PCRE2_EXTRA_CASELESS_RESTRICT, ignore any
5927 caseless set that starts with an ASCII character. */
5928
5929 #ifdef SUPPORT_UNICODE
5930 if ((utf||ucp) && (options & PCRE2_CASELESS) != 0 &&
5931 (d = UCD_CASESET(c)) != 0 &&
5932 ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) == 0 ||
5933 PRIV(ucd_caseless_sets)[d] > 127))
5934 {
5935 *code++ = OP_NOTPROP;
5936 *code++ = PT_CLIST;
5937 *code++ = d;
5938 break; /* We are finished with this class */
5939 }
5940 #endif
5941 /* Char has only one other (usable) case, or UCP not available */
5942
5943 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
5944 code += PUTCHAR(c, code);
5945 break; /* We are finished with this class */
5946 } /* End of 1-char optimization */
5947
5948 /* Handle character classes that contain more than just one literal
5949 character. If there are exactly two characters in a positive class, see if
5950 they are case partners. This can be optimized to generate a caseless single
5951 character match (which also sets first/required code units if relevant).
5952 When casing restrictions apply, ignore a caseless set if both characters
5953 are ASCII. */
5954
5955 if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END &&
5956 pptr[3] == META_CLASS_END)
5957 {
5958 uint32_t c = pptr[1];
5959
5960 #ifdef SUPPORT_UNICODE
5961 if (UCD_CASESET(c) == 0 ||
5962 ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
5963 c < 128 && pptr[2] < 128))
5964 #endif
5965 {
5966 uint32_t d;
5967
5968 #ifdef SUPPORT_UNICODE
5969 if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
5970 #endif
5971 {
5972 #if PCRE2_CODE_UNIT_WIDTH != 8
5973 if (c > 255) d = c; else
5974 #endif
5975 d = TABLE_GET(c, cb->fcc, c);
5976 }
5977
5978 if (c != d && pptr[2] == d)
5979 {
5980 pptr += 3; /* Move on to class end */
5981 meta = c;
5982 if ((options & PCRE2_CASELESS) == 0)
5983 {
5984 reset_caseful = TRUE;
5985 options |= PCRE2_CASELESS;
5986 req_caseopt = REQ_CASELESS;
5987 }
5988 goto CLASS_CASELESS_CHAR;
5989 }
5990 }
5991 }
5992
5993 /* If a non-extended class contains a negative special such as \S, we need
5994 to flip the negation flag at the end, so that support for characters > 255
5995 works correctly (they are all included in the class). An extended class may
5996 need to insert specific matching or non-matching code for wide characters.
5997 */
5998
5999 should_flip_negation = match_all_or_no_wide_chars = FALSE;
6000
6001 /* Extended class (xclass) will be used when characters > 255
6002 might match. */
6003
6004 #ifdef SUPPORT_WIDE_CHARS
6005 xclass = FALSE;
6006 class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
6007 class_uchardata_base = class_uchardata; /* Save the start */
6008 #endif
6009
6010 /* For optimization purposes, we track some properties of the class:
6011 class_has_8bitchar will be non-zero if the class contains at least one
6012 character with a code point less than 256; xclass_has_prop will be TRUE if
6013 Unicode property checks are present in the class. */
6014
6015 class_has_8bitchar = 0;
6016 #ifdef SUPPORT_WIDE_CHARS
6017 xclass_has_prop = FALSE;
6018 #endif
6019
6020 /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
6021 in a temporary bit of memory, in case the class contains fewer than two
6022 8-bit characters because in that case the compiled code doesn't use the bit
6023 map. */
6024
6025 memset(classbits, 0, 32 * sizeof(uint8_t));
6026
6027 /* Process items until META_CLASS_END is reached. */
6028
6029 while ((meta = *(++pptr)) != META_CLASS_END)
6030 {
6031 /* Handle POSIX classes such as [:alpha:] etc. */
6032
6033 if (meta == META_POSIX || meta == META_POSIX_NEG)
6034 {
6035 BOOL local_negate = (meta == META_POSIX_NEG);
6036 int posix_class = *(++pptr);
6037 int taboffset, tabopt;
6038 uint8_t pbits[32];
6039
6040 should_flip_negation = local_negate; /* Note negative special */
6041
6042 /* If matching is caseless, upper and lower are converted to alpha.
6043 This relies on the fact that the class table starts with alpha,
6044 lower, upper as the first 3 entries. */
6045
6046 if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)
6047 posix_class = 0;
6048
6049 /* When PCRE2_UCP is set, some of the POSIX classes are converted to
6050 different escape sequences that use Unicode properties \p or \P.
6051 Others that are not available via \p or \P have to generate
6052 XCL_PROP/XCL_NOTPROP directly, which is done here. */
6053
6054 #ifdef SUPPORT_UNICODE
6055 if ((options & PCRE2_UCP) != 0 &&
6056 (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0)
6057 {
6058 switch(posix_class)
6059 {
6060 case PC_GRAPH:
6061 case PC_PRINT:
6062 case PC_PUNCT:
6063 *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
6064 *class_uchardata++ = (PCRE2_UCHAR)
6065 ((posix_class == PC_GRAPH)? PT_PXGRAPH :
6066 (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT);
6067 *class_uchardata++ = 0;
6068 xclass_has_prop = TRUE;
6069 goto CONTINUE_CLASS;
6070
6071 /* For the other POSIX classes (ex: ascii) we are going to
6072 fall through to the non-UCP case and build a bit map for
6073 characters with code points less than 256. However, if we are in
6074 a negated POSIX class, characters with code points greater than
6075 255 must either all match or all not match, depending on whether
6076 the whole class is not or is negated. For example, for
6077 [[:^ascii:]... they must all match, whereas for [^[:^ascii:]...
6078 they must not.
6079
6080 In the special case where there are no xclass items, this is
6081 automatically handled by the use of OP_CLASS or OP_NCLASS, but an
6082 explicit range is needed for OP_XCLASS. Setting a flag here
6083 causes the range to be generated later when it is known that
6084 OP_XCLASS is required. In the 8-bit library this is relevant only in
6085 utf mode, since no wide characters can exist otherwise. */
6086
6087 default:
6088 #if PCRE2_CODE_UNIT_WIDTH == 8
6089 if (utf)
6090 #endif
6091 match_all_or_no_wide_chars |= local_negate;
6092 break;
6093 }
6094 }
6095 #endif /* SUPPORT_UNICODE */
6096
6097 /* In the non-UCP case, or when UCP makes no difference, we build the
6098 bit map for the POSIX class in a chunk of local store because we may
6099 be adding and subtracting from it, and we don't want to subtract bits
6100 that may be in the main map already. At the end we or the result into
6101 the bit map that is being built. */
6102
6103 posix_class *= 3;
6104
6105 /* Copy in the first table (always present) */
6106
6107 memcpy(pbits, cbits + posix_class_maps[posix_class],
6108 32 * sizeof(uint8_t));
6109
6110 /* If there is a second table, add or remove it as required. */
6111
6112 taboffset = posix_class_maps[posix_class + 1];
6113 tabopt = posix_class_maps[posix_class + 2];
6114
6115 if (taboffset >= 0)
6116 {
6117 if (tabopt >= 0)
6118 for (int i = 0; i < 32; i++) pbits[i] |= cbits[(int)i + taboffset];
6119 else
6120 for (int i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset];
6121 }
6122
6123 /* Now see if we need to remove any special characters. An option
6124 value of 1 removes vertical space and 2 removes underscore. */
6125
6126 if (tabopt < 0) tabopt = -tabopt;
6127 if (tabopt == 1) pbits[1] &= ~0x3c;
6128 else if (tabopt == 2) pbits[11] &= 0x7f;
6129
6130 /* Add the POSIX table or its complement into the main table that is
6131 being built and we are done. */
6132
6133 if (local_negate)
6134 for (int i = 0; i < 32; i++) classbits[i] |= (uint8_t)(~pbits[i]);
6135 else
6136 for (int i = 0; i < 32; i++) classbits[i] |= pbits[i];
6137
6138 /* Every class contains at least one < 256 character. */
6139
6140 class_has_8bitchar = 1;
6141 goto CONTINUE_CLASS; /* End of POSIX handling */
6142 }
6143
6144 /* Other than POSIX classes, the only items we should encounter are
6145 \d-type escapes and literal characters (possibly as ranges). */
6146
6147 if (meta == META_BIGVALUE)
6148 {
6149 meta = *(++pptr);
6150 goto CLASS_LITERAL;
6151 }
6152
6153 /* Any other non-literal must be an escape */
6154
6155 if (meta >= META_END)
6156 {
6157 if (META_CODE(meta) != META_ESCAPE)
6158 {
6159 #ifdef DEBUG_SHOW_PARSED
6160 fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x "
6161 "in character class\n", meta);
6162 #endif
6163 *errorcodeptr = ERR89; /* Internal error - unrecognized. */
6164 return 0;
6165 }
6166 escape = META_DATA(meta);
6167
6168 /* Every class contains at least one < 256 character. */
6169
6170 class_has_8bitchar++;
6171
6172 switch(escape)
6173 {
6174 case ESC_d:
6175 for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
6176 break;
6177
6178 case ESC_D:
6179 should_flip_negation = TRUE;
6180 for (int i = 0; i < 32; i++)
6181 classbits[i] |= (uint8_t)(~cbits[i+cbit_digit]);
6182 break;
6183
6184 case ESC_w:
6185 for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
6186 break;
6187
6188 case ESC_W:
6189 should_flip_negation = TRUE;
6190 for (int i = 0; i < 32; i++)
6191 classbits[i] |= (uint8_t)(~cbits[i+cbit_word]);
6192 break;
6193
6194 /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
6195 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
6196 previously set by something earlier in the character class.
6197 Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
6198 we could just adjust the appropriate bit. From PCRE 8.34 we no
6199 longer treat \s and \S specially. */
6200
6201 case ESC_s:
6202 for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
6203 break;
6204
6205 case ESC_S:
6206 should_flip_negation = TRUE;
6207 for (int i = 0; i < 32; i++)
6208 classbits[i] |= (uint8_t)(~cbits[i+cbit_space]);
6209 break;
6210
6211 /* When adding the horizontal or vertical space lists to a class, or
6212 their complements, disable PCRE2_CASELESS, because it just wastes
6213 time, and in the "not-x" UTF cases can create unwanted duplicates in
6214 the XCLASS list (provoked by characters that have more than one other
6215 case and by both cases being in the same "not-x" sublist). */
6216
6217 case ESC_h:
6218 (void)add_list_to_class(classbits, &class_uchardata,
6219 options & ~PCRE2_CASELESS, xoptions, cb, PRIV(hspace_list),
6220 NOTACHAR);
6221 break;
6222
6223 case ESC_H:
6224 (void)add_not_list_to_class(classbits, &class_uchardata,
6225 options & ~PCRE2_CASELESS, xoptions, cb, PRIV(hspace_list));
6226 break;
6227
6228 case ESC_v:
6229 (void)add_list_to_class(classbits, &class_uchardata,
6230 options & ~PCRE2_CASELESS, xoptions, cb, PRIV(vspace_list),
6231 NOTACHAR);
6232 break;
6233
6234 case ESC_V:
6235 (void)add_not_list_to_class(classbits, &class_uchardata,
6236 options & ~PCRE2_CASELESS, xoptions, cb, PRIV(vspace_list));
6237 break;
6238
6239 /* If Unicode is not supported, \P and \p are not allowed and are
6240 faulted at parse time, so will never appear here. */
6241
6242 #ifdef SUPPORT_UNICODE
6243 case ESC_p:
6244 case ESC_P:
6245 {
6246 uint32_t ptype = *(++pptr) >> 16;
6247 uint32_t pdata = *pptr & 0xffff;
6248 *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP;
6249 *class_uchardata++ = ptype;
6250 *class_uchardata++ = pdata;
6251 xclass_has_prop = TRUE;
6252 class_has_8bitchar--; /* Undo! */
6253 }
6254 break;
6255 #endif
6256 }
6257
6258 goto CONTINUE_CLASS;
6259 } /* End handling \d-type escapes */
6260
6261 /* A literal character may be followed by a range meta. At parse time
6262 there are checks for out-of-order characters, for ranges where the two
6263 characters are equal, and for hyphens that cannot indicate a range. At
6264 this point, therefore, no checking is needed. */
6265
6266 else
6267 {
6268 uint32_t c, d;
6269
6270 CLASS_LITERAL:
6271 c = d = meta;
6272
6273 /* Remember if \r or \n were explicitly used */
6274
6275 if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
6276
6277 /* Process a character range */
6278
6279 if (pptr[1] == META_RANGE_LITERAL || pptr[1] == META_RANGE_ESCAPED)
6280 {
6281 #ifdef EBCDIC
6282 BOOL range_is_literal = (pptr[1] == META_RANGE_LITERAL);
6283 #endif
6284 pptr += 2;
6285 d = *pptr;
6286 if (d == META_BIGVALUE) d = *(++pptr);
6287
6288 /* Remember an explicit \r or \n, and add the range to the class. */
6289
6290 if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
6291
6292 /* In an EBCDIC environment, Perl treats alphabetic ranges specially
6293 because there are holes in the encoding, and simply using the range
6294 A-Z (for example) would include the characters in the holes. This
6295 applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
6296
6297 #ifdef EBCDIC
6298 if (range_is_literal &&
6299 (cb->ctypes[c] & ctype_letter) != 0 &&
6300 (cb->ctypes[d] & ctype_letter) != 0 &&
6301 (c <= CHAR_z) == (d <= CHAR_z))
6302 {
6303 uint32_t uc = (d <= CHAR_z)? 0 : 64;
6304 uint32_t C = c - uc;
6305 uint32_t D = d - uc;
6306
6307 if (C <= CHAR_i)
6308 {
6309 class_has_8bitchar +=
6310 add_to_class(classbits, &class_uchardata, options, xoptions,
6311 cb, C + uc, ((D < CHAR_i)? D : CHAR_i) + uc);
6312 C = CHAR_j;
6313 }
6314
6315 if (C <= D && C <= CHAR_r)
6316 {
6317 class_has_8bitchar +=
6318 add_to_class(classbits, &class_uchardata, options, xoptions,
6319 cb, C + uc, ((D < CHAR_r)? D : CHAR_r) + uc);
6320 C = CHAR_s;
6321 }
6322
6323 if (C <= D)
6324 {
6325 class_has_8bitchar +=
6326 add_to_class(classbits, &class_uchardata, options, xoptions,
6327 cb, C + uc, D + uc);
6328 }
6329 }
6330 else
6331 #endif
6332 /* Not an EBCDIC special range */
6333
6334 class_has_8bitchar += add_to_class(classbits, &class_uchardata,
6335 options, xoptions, cb, c, d);
6336 goto CONTINUE_CLASS; /* Go get the next char in the class */
6337 } /* End of range handling */
6338
6339
6340 /* Handle a single character. */
6341
6342 class_has_8bitchar +=
6343 add_to_class(classbits, &class_uchardata, options, xoptions, cb,
6344 meta, meta);
6345 }
6346
6347 /* Continue to the next item in the class. */
6348
6349 CONTINUE_CLASS:
6350
6351 #ifdef SUPPORT_WIDE_CHARS
6352 /* If any wide characters or Unicode properties have been encountered,
6353 set xclass = TRUE. Then, in the pre-compile phase, accumulate the length
6354 of the extra data and reset the pointer. This is so that very large
6355 classes that contain a zillion wide characters or Unicode property tests
6356 do not overwrite the workspace (which is on the stack). */
6357
6358 if (class_uchardata > class_uchardata_base)
6359 {
6360 xclass = TRUE;
6361 if (lengthptr != NULL)
6362 {
6363 *lengthptr += class_uchardata - class_uchardata_base;
6364 class_uchardata = class_uchardata_base;
6365 }
6366 }
6367 #endif
6368
6369 continue; /* Needed to avoid error when not supporting wide chars */
6370 } /* End of main class-processing loop */
6371
6372 /* If this class is the first thing in the branch, there can be no first
6373 char setting, whatever the repeat count. Any reqcu setting must remain
6374 unchanged after any kind of repeat. */
6375
6376 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6377 zerofirstcu = firstcu;
6378 zerofirstcuflags = firstcuflags;
6379 zeroreqcu = reqcu;
6380 zeroreqcuflags = reqcuflags;
6381
6382 /* If there are characters with values > 255, or Unicode property settings
6383 (\p or \P), we have to compile an extended class, with its own opcode,
6384 unless there were no property settings and there was a negated special such
6385 as \S in the class, and PCRE2_UCP is not set, because in that case all
6386 characters > 255 are in or not in the class, so any that were explicitly
6387 given as well can be ignored.
6388
6389 In the UCP case, if certain negated POSIX classes (ex: [:^ascii:]) were
6390 present in a class, we either have to match or not match all wide
6391 characters (depending on whether the whole class is or is not negated).
6392 This requirement is indicated by match_all_or_no_wide_chars being true.
6393 We do this by including an explicit range, which works in both cases.
6394 This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there
6395 cannot be any wide characters in 8-bit non-UTF mode.
6396
6397 When there *are* properties in a positive UTF-8 or any 16-bit or 32-bit
6398 class where \S etc is present without PCRE2_UCP, causing an extended class
6399 to be compiled, we make sure that all characters > 255 are included by
6400 forcing match_all_or_no_wide_chars to be true.
6401
6402 If, when generating an xclass, there are no characters < 256, we can omit
6403 the bitmap in the actual compiled code. */
6404
6405 #ifdef SUPPORT_WIDE_CHARS /* Defined for 16/32 bits, or 8-bit with Unicode */
6406 if (xclass && (
6407 #ifdef SUPPORT_UNICODE
6408 (options & PCRE2_UCP) != 0 ||
6409 #endif
6410 xclass_has_prop || !should_flip_negation))
6411 {
6412 if (match_all_or_no_wide_chars || (
6413 #if PCRE2_CODE_UNIT_WIDTH == 8
6414 utf &&
6415 #endif
6416 should_flip_negation && !negate_class && (options & PCRE2_UCP) == 0))
6417 {
6418 *class_uchardata++ = XCL_RANGE;
6419 if (utf) /* Will always be utf in the 8-bit library */
6420 {
6421 class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
6422 class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata);
6423 }
6424 else /* Can only happen for the 16-bit & 32-bit libraries */
6425 {
6426 #if PCRE2_CODE_UNIT_WIDTH == 16
6427 *class_uchardata++ = 0x100;
6428 *class_uchardata++ = 0xffffu;
6429 #elif PCRE2_CODE_UNIT_WIDTH == 32
6430 *class_uchardata++ = 0x100;
6431 *class_uchardata++ = 0xffffffffu;
6432 #endif
6433 }
6434 }
6435 *class_uchardata++ = XCL_END; /* Marks the end of extra data */
6436 *code++ = OP_XCLASS;
6437 code += LINK_SIZE;
6438 *code = negate_class? XCL_NOT:0;
6439 if (xclass_has_prop) *code |= XCL_HASPROP;
6440
6441 /* If the map is required, move up the extra data to make room for it;
6442 otherwise just move the code pointer to the end of the extra data. */
6443
6444 if (class_has_8bitchar > 0)
6445 {
6446 *code++ |= XCL_MAP;
6447 (void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,
6448 CU2BYTES(class_uchardata - code));
6449 if (negate_class && !xclass_has_prop)
6450 {
6451 /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
6452 for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
6453 }
6454 memcpy(code, classbits, 32);
6455 code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
6456 }
6457 else code = class_uchardata;
6458
6459 /* Now fill in the complete length of the item */
6460
6461 PUT(previous, 1, (int)(code - previous));
6462 break; /* End of class handling */
6463 }
6464 #endif /* SUPPORT_WIDE_CHARS */
6465
6466 /* If there are no characters > 255, or they are all to be included or
6467 excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
6468 whole class was negated and whether there were negative specials such as \S
6469 (non-UCP) in the class. Then copy the 32-byte map into the code vector,
6470 negating it if necessary. */
6471
6472 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
6473 if (lengthptr == NULL) /* Save time in the pre-compile phase */
6474 {
6475 if (negate_class)
6476 {
6477 /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
6478 for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
6479 }
6480 memcpy(code, classbits, 32);
6481 }
6482 code += 32 / sizeof(PCRE2_UCHAR);
6483 break; /* End of class processing */
6484
6485
6486 /* ===================================================================*/
6487 /* Deal with (*VERB)s. */
6488
6489 /* Check for open captures before ACCEPT and close those that are within
6490 the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an
6491 assertion. In the first pass, just accumulate the length required;
6492 otherwise hitting (*ACCEPT) inside many nested parentheses can cause
6493 workspace overflow. Do not set firstcu after *ACCEPT. */
6494
6495 case META_ACCEPT:
6496 cb->had_accept = had_accept = TRUE;
6497 for (oc = open_caps;
6498 oc != NULL && oc->assert_depth >= cb->assert_depth;
6499 oc = oc->next)
6500 {
6501 if (lengthptr != NULL)
6502 {
6503 *lengthptr += CU2BYTES(1) + IMM2_SIZE;
6504 }
6505 else
6506 {
6507 *code++ = OP_CLOSE;
6508 PUT2INC(code, 0, oc->number);
6509 }
6510 }
6511 *code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6512 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6513 break;
6514
6515 case META_PRUNE:
6516 case META_SKIP:
6517 cb->had_pruneorskip = TRUE;
6518 /* Fall through */
6519 case META_COMMIT:
6520 case META_FAIL:
6521 *code++ = verbops[(meta - META_MARK) >> 16];
6522 break;
6523
6524 case META_THEN:
6525 cb->external_flags |= PCRE2_HASTHEN;
6526 *code++ = OP_THEN;
6527 break;
6528
6529 /* Handle verbs with arguments. Arguments can be very long, especially in
6530 16- and 32-bit modes, and can overflow the workspace in the first pass.
6531 However, the argument length is constrained to be small enough to fit in
6532 one code unit. This check happens in parse_regex(). In the first pass,
6533 instead of putting the argument into memory, we just update the length
6534 counter and set up an empty argument. */
6535
6536 case META_THEN_ARG:
6537 cb->external_flags |= PCRE2_HASTHEN;
6538 goto VERB_ARG;
6539
6540 case META_PRUNE_ARG:
6541 case META_SKIP_ARG:
6542 cb->had_pruneorskip = TRUE;
6543 /* Fall through */
6544 case META_MARK:
6545 case META_COMMIT_ARG:
6546 VERB_ARG:
6547 *code++ = verbops[(meta - META_MARK) >> 16];
6548 /* The length is in characters. */
6549 verbarglen = *(++pptr);
6550 verbculen = 0;
6551 tempcode = code++;
6552 for (int i = 0; i < (int)verbarglen; i++)
6553 {
6554 meta = *(++pptr);
6555 #ifdef SUPPORT_UNICODE
6556 if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
6557 #endif
6558 {
6559 mclength = 1;
6560 mcbuffer[0] = meta;
6561 }
6562 if (lengthptr != NULL) *lengthptr += mclength; else
6563 {
6564 memcpy(code, mcbuffer, CU2BYTES(mclength));
6565 code += mclength;
6566 verbculen += mclength;
6567 }
6568 }
6569
6570 *tempcode = verbculen; /* Fill in the code unit length */
6571 *code++ = 0; /* Terminating zero */
6572 break;
6573
6574
6575 /* ===================================================================*/
6576 /* Handle options change. The new setting must be passed back for use in
6577 subsequent branches. Reset the greedy defaults and the case value for
6578 firstcu and reqcu. */
6579
6580 case META_OPTIONS:
6581 *optionsptr = options = *(++pptr);
6582 *xoptionsptr = xoptions = *(++pptr);
6583 greedy_default = ((options & PCRE2_UNGREEDY) != 0);
6584 greedy_non_default = greedy_default ^ 1;
6585 req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6586 break;
6587
6588
6589 /* ===================================================================*/
6590 /* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous
6591 because it could be a numerical check on recursion, or a name check on a
6592 group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that
6593 we can handle it either way. We first try for a name; if not found, process
6594 the number. */
6595
6596 case META_COND_RNUMBER: /* (?(Rdigits) */
6597 case META_COND_NAME: /* (?(name) or (?'name') or ?(<name>) */
6598 case META_COND_RNAME: /* (?(R&name) - test for recursion */
6599 bravalue = OP_COND;
6600 {
6601 int count, index;
6602 unsigned int i;
6603 PCRE2_SPTR name;
6604 named_group *ng = cb->named_groups;
6605 uint32_t length = *(++pptr);
6606
6607 GETPLUSOFFSET(offset, pptr);
6608 name = cb->start_pattern + offset;
6609
6610 /* In the first pass, the names generated in the pre-pass are available,
6611 but the main name table has not yet been created. Scan the list of names
6612 generated in the pre-pass in order to get a number and whether or not
6613 this name is duplicated. If it is not duplicated, we can handle it as a
6614 numerical group. */
6615
6616 for (i = 0; i < cb->names_found; i++, ng++)
6617 {
6618 if (length == ng->length &&
6619 PRIV(strncmp)(name, ng->name, length) == 0)
6620 {
6621 if (!ng->isdup)
6622 {
6623 code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6624 PUT2(code, 2+LINK_SIZE, ng->number);
6625 if (ng->number > cb->top_backref) cb->top_backref = ng->number;
6626 skipunits = 1+IMM2_SIZE;
6627 goto GROUP_PROCESS_NOTE_EMPTY;
6628 }
6629 break; /* Found a duplicated name */
6630 }
6631 }
6632
6633 /* If the name was not found we have a bad reference, unless we are
6634 dealing with R<digits>, which is treated as a recursion test by number.
6635 */
6636
6637 if (i >= cb->names_found)
6638 {
6639 groupnumber = 0;
6640 if (meta == META_COND_RNUMBER)
6641 {
6642 for (i = 1; i < length; i++)
6643 {
6644 groupnumber = groupnumber * 10 + name[i] - CHAR_0;
6645 if (groupnumber > MAX_GROUP_NUMBER)
6646 {
6647 *errorcodeptr = ERR61;
6648 cb->erroroffset = offset + i;
6649 return 0;
6650 }
6651 }
6652 }
6653
6654 if (meta != META_COND_RNUMBER || groupnumber > cb->bracount)
6655 {
6656 *errorcodeptr = ERR15;
6657 cb->erroroffset = offset;
6658 return 0;
6659 }
6660
6661 /* (?(Rdigits) is treated as a recursion test by number. A value of
6662 zero (which is the result of both (?(R) and (?(R0)) means "any", and is
6663 translated into RREF_ANY (which is 0xffff). */
6664
6665 if (groupnumber == 0) groupnumber = RREF_ANY;
6666 code[1+LINK_SIZE] = OP_RREF;
6667 PUT2(code, 2+LINK_SIZE, groupnumber);
6668 skipunits = 1+IMM2_SIZE;
6669 goto GROUP_PROCESS_NOTE_EMPTY;
6670 }
6671
6672 /* A duplicated name was found. Note that if an R<digits> name is found
6673 (META_COND_RNUMBER), it is a reference test, not a recursion test. */
6674
6675 code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6676
6677 /* We have a duplicated name. In the compile pass we have to search the
6678 main table in order to get the index and count values. */
6679
6680 count = 0; /* Values for first pass (avoids compiler warning) */
6681 index = 0;
6682 if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6683 &count, errorcodeptr, cb)) return 0;
6684
6685 /* Add one to the opcode to change CREF/RREF into DNCREF/DNRREF and
6686 insert appropriate data values. */
6687
6688 code[1+LINK_SIZE]++;
6689 skipunits = 1+2*IMM2_SIZE;
6690 PUT2(code, 2+LINK_SIZE, index);
6691 PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6692 }
6693 goto GROUP_PROCESS_NOTE_EMPTY;
6694
6695 /* The DEFINE condition is always false. Its internal groups may never
6696 be called, so matched_char must remain false, hence the jump to
6697 GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
6698
6699 case META_COND_DEFINE:
6700 bravalue = OP_COND;
6701 GETPLUSOFFSET(offset, pptr);
6702 code[1+LINK_SIZE] = OP_DEFINE;
6703 skipunits = 1;
6704 goto GROUP_PROCESS;
6705
6706 /* Conditional test of a group's being set. */
6707
6708 case META_COND_NUMBER:
6709 bravalue = OP_COND;
6710 GETPLUSOFFSET(offset, pptr);
6711 groupnumber = *(++pptr);
6712 if (groupnumber > cb->bracount)
6713 {
6714 *errorcodeptr = ERR15;
6715 cb->erroroffset = offset;
6716 return 0;
6717 }
6718 if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
6719 offset -= 2; /* Point at initial ( for too many branches error */
6720 code[1+LINK_SIZE] = OP_CREF;
6721 skipunits = 1+IMM2_SIZE;
6722 PUT2(code, 2+LINK_SIZE, groupnumber);
6723 goto GROUP_PROCESS_NOTE_EMPTY;
6724
6725 /* Test for the PCRE2 version. */
6726
6727 case META_COND_VERSION:
6728 bravalue = OP_COND;
6729 if (pptr[1] > 0)
6730 code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) ||
6731 (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))?
6732 OP_TRUE : OP_FALSE;
6733 else
6734 code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])?
6735 OP_TRUE : OP_FALSE;
6736 skipunits = 1;
6737 pptr += 3;
6738 goto GROUP_PROCESS_NOTE_EMPTY;
6739
6740 /* The condition is an assertion, possibly preceded by a callout. */
6741
6742 case META_COND_ASSERT:
6743 bravalue = OP_COND;
6744 goto GROUP_PROCESS_NOTE_EMPTY;
6745
6746
6747 /* ===================================================================*/
6748 /* Handle all kinds of nested bracketed groups. The non-capturing,
6749 non-conditional cases are here; others come to GROUP_PROCESS via goto. */
6750
6751 case META_LOOKAHEAD:
6752 bravalue = OP_ASSERT;
6753 cb->assert_depth += 1;
6754 goto GROUP_PROCESS;
6755
6756 case META_LOOKAHEAD_NA:
6757 bravalue = OP_ASSERT_NA;
6758 cb->assert_depth += 1;
6759 goto GROUP_PROCESS;
6760
6761 /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6762 thing to do, but Perl allows all assertions to be quantified, and when
6763 they contain capturing parentheses there may be a potential use for
6764 this feature. Not that that applies to a quantified (?!) but we allow
6765 it for uniformity. */
6766
6767 case META_LOOKAHEADNOT:
6768 if (pptr[1] == META_KET &&
6769 (pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY))
6770 {
6771 *code++ = OP_FAIL;
6772 pptr++;
6773 }
6774 else
6775 {
6776 bravalue = OP_ASSERT_NOT;
6777 cb->assert_depth += 1;
6778 goto GROUP_PROCESS;
6779 }
6780 break;
6781
6782 case META_LOOKBEHIND:
6783 bravalue = OP_ASSERTBACK;
6784 cb->assert_depth += 1;
6785 goto GROUP_PROCESS;
6786
6787 case META_LOOKBEHINDNOT:
6788 bravalue = OP_ASSERTBACK_NOT;
6789 cb->assert_depth += 1;
6790 goto GROUP_PROCESS;
6791
6792 case META_LOOKBEHIND_NA:
6793 bravalue = OP_ASSERTBACK_NA;
6794 cb->assert_depth += 1;
6795 goto GROUP_PROCESS;
6796
6797 case META_ATOMIC:
6798 bravalue = OP_ONCE;
6799 goto GROUP_PROCESS_NOTE_EMPTY;
6800
6801 case META_SCRIPT_RUN:
6802 bravalue = OP_SCRIPT_RUN;
6803 goto GROUP_PROCESS_NOTE_EMPTY;
6804
6805 case META_NOCAPTURE:
6806 bravalue = OP_BRA;
6807 /* Fall through */
6808
6809 /* Process nested bracketed regex. The nesting depth is maintained for the
6810 benefit of the stackguard function. The test for too deep nesting is now
6811 done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
6812 others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
6813 note of whether or not they may match an empty string. */
6814
6815 GROUP_PROCESS_NOTE_EMPTY:
6816 note_group_empty = TRUE;
6817
6818 GROUP_PROCESS:
6819 cb->parens_depth += 1;
6820 *code = bravalue;
6821 pptr++;
6822 tempcode = code;
6823 tempreqvary = cb->req_varyopt; /* Save value before group */
6824 length_prevgroup = 0; /* Initialize for pre-compile phase */
6825
6826 if ((group_return =
6827 compile_regex(
6828 options, /* The options state */
6829 xoptions, /* The extra options state */
6830 &tempcode, /* Where to put code (updated) */
6831 &pptr, /* Input pointer (updated) */
6832 errorcodeptr, /* Where to put an error message */
6833 skipunits, /* Skip over bracket number */
6834 &subfirstcu, /* For possible first char */
6835 &subfirstcuflags,
6836 &subreqcu, /* For possible last char */
6837 &subreqcuflags,
6838 bcptr, /* Current branch chain */
6839 open_caps, /* Pointer to capture stack */
6840 cb, /* Compile data block */
6841 (lengthptr == NULL)? NULL : /* Actual compile phase */
6842 &length_prevgroup /* Pre-compile phase */
6843 )) == 0)
6844 return 0; /* Error */
6845
6846 cb->parens_depth -= 1;
6847
6848 /* If that was a non-conditional significant group (not an assertion, not a
6849 DEFINE) that matches at least one character, then the current item matches
6850 a character. Conditionals are handled below. */
6851
6852 if (note_group_empty && bravalue != OP_COND && group_return > 0)
6853 matched_char = TRUE;
6854
6855 /* If we've just compiled an assertion, pop the assert depth. */
6856
6857 if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NA)
6858 cb->assert_depth -= 1;
6859
6860 /* At the end of compiling, code is still pointing to the start of the
6861 group, while tempcode has been updated to point past the end of the group.
6862 The parsed pattern pointer (pptr) is on the closing META_KET.
6863
6864 If this is a conditional bracket, check that there are no more than
6865 two branches in the group, or just one if it's a DEFINE group. We do this
6866 in the real compile phase, not in the pre-pass, where the whole group may
6867 not be available. */
6868
6869 if (bravalue == OP_COND && lengthptr == NULL)
6870 {
6871 PCRE2_UCHAR *tc = code;
6872 int condcount = 0;
6873
6874 do {
6875 condcount++;
6876 tc += GET(tc,1);
6877 }
6878 while (*tc != OP_KET);
6879
6880 /* A DEFINE group is never obeyed inline (the "condition" is always
6881 false). It must have only one branch. Having checked this, change the
6882 opcode to OP_FALSE. */
6883
6884 if (code[LINK_SIZE+1] == OP_DEFINE)
6885 {
6886 if (condcount > 1)
6887 {
6888 cb->erroroffset = offset;
6889 *errorcodeptr = ERR54;
6890 return 0;
6891 }
6892 code[LINK_SIZE+1] = OP_FALSE;
6893 bravalue = OP_DEFINE; /* A flag to suppress char handling below */
6894 }
6895
6896 /* A "normal" conditional group. If there is just one branch, we must not
6897 make use of its firstcu or reqcu, because this is equivalent to an
6898 empty second branch. Also, it may match an empty string. If there are two
6899 branches, this item must match a character if the group must. */
6900
6901 else
6902 {
6903 if (condcount > 2)
6904 {
6905 cb->erroroffset = offset;
6906 *errorcodeptr = ERR27;
6907 return 0;
6908 }
6909 if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
6910 else if (group_return > 0) matched_char = TRUE;
6911 }
6912 }
6913
6914 /* In the pre-compile phase, update the length by the length of the group,
6915 less the brackets at either end. Then reduce the compiled code to just a
6916 set of non-capturing brackets so that it doesn't use much memory if it is
6917 duplicated by a quantifier.*/
6918
6919 if (lengthptr != NULL)
6920 {
6921 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
6922 {
6923 *errorcodeptr = ERR20;
6924 return 0;
6925 }
6926 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
6927 code++; /* This already contains bravalue */
6928 PUTINC(code, 0, 1 + LINK_SIZE);
6929 *code++ = OP_KET;
6930 PUTINC(code, 0, 1 + LINK_SIZE);
6931 break; /* No need to waste time with special character handling */
6932 }
6933
6934 /* Otherwise update the main code pointer to the end of the group. */
6935
6936 code = tempcode;
6937
6938 /* For a DEFINE group, required and first character settings are not
6939 relevant. */
6940
6941 if (bravalue == OP_DEFINE) break;
6942
6943 /* Handle updating of the required and first code units for other types of
6944 group. Update for normal brackets of all kinds, and conditions with two
6945 branches (see code above). If the bracket is followed by a quantifier with
6946 zero repeat, we have to back off. Hence the definition of zeroreqcu and
6947 zerofirstcu outside the main loop so that they can be accessed for the back
6948 off. */
6949
6950 zeroreqcu = reqcu;
6951 zeroreqcuflags = reqcuflags;
6952 zerofirstcu = firstcu;
6953 zerofirstcuflags = firstcuflags;
6954 groupsetfirstcu = FALSE;
6955
6956 if (bravalue >= OP_ONCE) /* Not an assertion */
6957 {
6958 /* If we have not yet set a firstcu in this branch, take it from the
6959 subpattern, remembering that it was set here so that a repeat of more
6960 than one can replicate it as reqcu if necessary. If the subpattern has
6961 no firstcu, set "none" for the whole branch. In both cases, a zero
6962 repeat forces firstcu to "none". */
6963
6964 if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
6965 {
6966 if (subfirstcuflags < REQ_NONE)
6967 {
6968 firstcu = subfirstcu;
6969 firstcuflags = subfirstcuflags;
6970 groupsetfirstcu = TRUE;
6971 }
6972 else firstcuflags = REQ_NONE;
6973 zerofirstcuflags = REQ_NONE;
6974 }
6975
6976 /* If firstcu was previously set, convert the subpattern's firstcu
6977 into reqcu if there wasn't one, using the vary flag that was in
6978 existence beforehand. */
6979
6980 else if (subfirstcuflags < REQ_NONE && subreqcuflags >= REQ_NONE)
6981 {
6982 subreqcu = subfirstcu;
6983 subreqcuflags = subfirstcuflags | tempreqvary;
6984 }
6985
6986 /* If the subpattern set a required code unit (or set a first code unit
6987 that isn't really the first code unit - see above), set it. */
6988
6989 if (subreqcuflags < REQ_NONE)
6990 {
6991 reqcu = subreqcu;
6992 reqcuflags = subreqcuflags;
6993 }
6994 }
6995
6996 /* For a forward assertion, we take the reqcu, if set, provided that the
6997 group has also set a firstcu. This can be helpful if the pattern that
6998 follows the assertion doesn't set a different char. For example, it's
6999 useful for /(?=abcde).+/. We can't set firstcu for an assertion, however
7000 because it leads to incorrect effect for patterns such as /(?=a)a.+/ when
7001 the "real" "a" would then become a reqcu instead of a firstcu. This is
7002 overcome by a scan at the end if there's no firstcu, looking for an
7003 asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
7004 we must only take the reqcu when the group also set a firstcu. Otherwise,
7005 in that example, 'X' ends up set for both. */
7006
7007 else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) &&
7008 subreqcuflags < REQ_NONE && subfirstcuflags < REQ_NONE)
7009 {
7010 reqcu = subreqcu;
7011 reqcuflags = subreqcuflags;
7012 }
7013
7014 break; /* End of nested group handling */
7015
7016
7017 /* ===================================================================*/
7018 /* Handle named backreferences and recursions. */
7019
7020 case META_BACKREF_BYNAME:
7021 case META_RECURSE_BYNAME:
7022 {
7023 int count, index;
7024 PCRE2_SPTR name;
7025 BOOL is_dupname = FALSE;
7026 named_group *ng = cb->named_groups;
7027 uint32_t length = *(++pptr);
7028
7029 GETPLUSOFFSET(offset, pptr);
7030 name = cb->start_pattern + offset;
7031
7032 /* In the first pass, the names generated in the pre-pass are available,
7033 but the main name table has not yet been created. Scan the list of names
7034 generated in the pre-pass in order to get a number and whether or not
7035 this name is duplicated. */
7036
7037 groupnumber = 0;
7038 for (unsigned int i = 0; i < cb->names_found; i++, ng++)
7039 {
7040 if (length == ng->length &&
7041 PRIV(strncmp)(name, ng->name, length) == 0)
7042 {
7043 is_dupname = ng->isdup;
7044 groupnumber = ng->number;
7045
7046 /* For a recursion, that's all that is needed. We can now go to
7047 the code that handles numerical recursion, applying it to the first
7048 group with the given name. */
7049
7050 if (meta == META_RECURSE_BYNAME)
7051 {
7052 meta_arg = groupnumber;
7053 goto HANDLE_NUMERICAL_RECURSION;
7054 }
7055
7056 /* For a back reference, update the back reference map and the
7057 maximum back reference. */
7058
7059 cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
7060 if (groupnumber > cb->top_backref)
7061 cb->top_backref = groupnumber;
7062 }
7063 }
7064
7065 /* If the name was not found we have a bad reference. */
7066
7067 if (groupnumber == 0)
7068 {
7069 *errorcodeptr = ERR15;
7070 cb->erroroffset = offset;
7071 return 0;
7072 }
7073
7074 /* If a back reference name is not duplicated, we can handle it as
7075 a numerical reference. */
7076
7077 if (!is_dupname)
7078 {
7079 meta_arg = groupnumber;
7080 goto HANDLE_SINGLE_REFERENCE;
7081 }
7082
7083 /* If a back reference name is duplicated, we generate a different
7084 opcode to a numerical back reference. In the second pass we must
7085 search for the index and count in the final name table. */
7086
7087 count = 0; /* Values for first pass (avoids compiler warning) */
7088 index = 0;
7089 if (lengthptr == NULL && !find_dupname_details(name, length, &index,
7090 &count, errorcodeptr, cb)) return 0;
7091
7092 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7093 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
7094 PUT2INC(code, 0, index);
7095 PUT2INC(code, 0, count);
7096 }
7097 break;
7098
7099
7100 /* ===================================================================*/
7101 /* Handle a numerical callout. */
7102
7103 case META_CALLOUT_NUMBER:
7104 code[0] = OP_CALLOUT;
7105 PUT(code, 1, pptr[1]); /* Offset to next pattern item */
7106 PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */
7107 code[1 + 2*LINK_SIZE] = pptr[3];
7108 pptr += 3;
7109 code += PRIV(OP_lengths)[OP_CALLOUT];
7110 break;
7111
7112
7113 /* ===================================================================*/
7114 /* Handle a callout with a string argument. In the pre-pass we just compute
7115 the length without generating anything. The length in pptr[3] includes both
7116 delimiters; in the actual compile only the first one is copied, but a
7117 terminating zero is added. Any doubled delimiters within the string make
7118 this an overestimate, but it is not worth bothering about. */
7119
7120 case META_CALLOUT_STRING:
7121 if (lengthptr != NULL)
7122 {
7123 *lengthptr += pptr[3] + (1 + 4*LINK_SIZE);
7124 pptr += 3;
7125 SKIPOFFSET(pptr);
7126 }
7127
7128 /* In the real compile we can copy the string. The starting delimiter is
7129 included so that the client can discover it if they want. We also pass the
7130 start offset to help a script language give better error messages. */
7131
7132 else
7133 {
7134 PCRE2_SPTR pp;
7135 uint32_t delimiter;
7136 uint32_t length = pptr[3];
7137 PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
7138
7139 code[0] = OP_CALLOUT_STR;
7140 PUT(code, 1, pptr[1]); /* Offset to next pattern item */
7141 PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */
7142
7143 pptr += 3;
7144 GETPLUSOFFSET(offset, pptr); /* Offset to string in pattern */
7145 pp = cb->start_pattern + offset;
7146 delimiter = *callout_string++ = *pp++;
7147 if (delimiter == CHAR_LEFT_CURLY_BRACKET)
7148 delimiter = CHAR_RIGHT_CURLY_BRACKET;
7149 PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1)); /* One after delimiter */
7150
7151 /* The syntax of the pattern was checked in the parsing scan. The length
7152 includes both delimiters, but we have passed the opening one just above,
7153 so we reduce length before testing it. The test is for > 1 because we do
7154 not want to copy the final delimiter. This also ensures that pp[1] is
7155 accessible. */
7156
7157 while (--length > 1)
7158 {
7159 if (*pp == delimiter && pp[1] == delimiter)
7160 {
7161 *callout_string++ = delimiter;
7162 pp += 2;
7163 length--;
7164 }
7165 else *callout_string++ = *pp++;
7166 }
7167 *callout_string++ = CHAR_NUL;
7168
7169 /* Set the length of the entire item, then advance to its end. */
7170
7171 PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code));
7172 code = callout_string;
7173 }
7174 break;
7175
7176
7177 /* ===================================================================*/
7178 /* Handle repetition. The different types are all sorted out in the parsing
7179 pass. */
7180
7181 case META_MINMAX_PLUS:
7182 case META_MINMAX_QUERY:
7183 case META_MINMAX:
7184 repeat_min = *(++pptr);
7185 repeat_max = *(++pptr);
7186 goto REPEAT;
7187
7188 case META_ASTERISK:
7189 case META_ASTERISK_PLUS:
7190 case META_ASTERISK_QUERY:
7191 repeat_min = 0;
7192 repeat_max = REPEAT_UNLIMITED;
7193 goto REPEAT;
7194
7195 case META_PLUS:
7196 case META_PLUS_PLUS:
7197 case META_PLUS_QUERY:
7198 repeat_min = 1;
7199 repeat_max = REPEAT_UNLIMITED;
7200 goto REPEAT;
7201
7202 case META_QUERY:
7203 case META_QUERY_PLUS:
7204 case META_QUERY_QUERY:
7205 repeat_min = 0;
7206 repeat_max = 1;
7207
7208 REPEAT:
7209 if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
7210
7211 /* Remember whether this is a variable length repeat, and default to
7212 single-char opcodes. */
7213
7214 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
7215 op_type = 0;
7216
7217 /* Adjust first and required code units for a zero repeat. */
7218
7219 if (repeat_min == 0)
7220 {
7221 firstcu = zerofirstcu;
7222 firstcuflags = zerofirstcuflags;
7223 reqcu = zeroreqcu;
7224 reqcuflags = zeroreqcuflags;
7225 }
7226
7227 /* Note the greediness and possessiveness. */
7228
7229 switch (meta)
7230 {
7231 case META_MINMAX_PLUS:
7232 case META_ASTERISK_PLUS:
7233 case META_PLUS_PLUS:
7234 case META_QUERY_PLUS:
7235 repeat_type = 0; /* Force greedy */
7236 possessive_quantifier = TRUE;
7237 break;
7238
7239 case META_MINMAX_QUERY:
7240 case META_ASTERISK_QUERY:
7241 case META_PLUS_QUERY:
7242 case META_QUERY_QUERY:
7243 repeat_type = greedy_non_default;
7244 possessive_quantifier = FALSE;
7245 break;
7246
7247 default:
7248 repeat_type = greedy_default;
7249 possessive_quantifier = FALSE;
7250 break;
7251 }
7252
7253 /* Save start of previous item, in case we have to move it up in order to
7254 insert something before it, and remember what it was. */
7255
7256 tempcode = previous;
7257 op_previous = *previous;
7258
7259 /* Now handle repetition for the different types of item. If the repeat
7260 minimum and the repeat maximum are both 1, we can ignore the quantifier for
7261 non-parenthesized items, as they have only one alternative. For anything in
7262 parentheses, we must not ignore if {1} is possessive. */
7263
7264 switch (op_previous)
7265 {
7266 /* If previous was a character or negated character match, abolish the
7267 item and generate a repeat item instead. If a char item has a minimum of
7268 more than one, ensure that it is set in reqcu - it might not be if a
7269 sequence such as x{3} is the first thing in a branch because the x will
7270 have gone into firstcu instead. */
7271
7272 case OP_CHAR:
7273 case OP_CHARI:
7274 case OP_NOT:
7275 case OP_NOTI:
7276 if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7277 op_type = chartypeoffset[op_previous - OP_CHAR];
7278
7279 /* Deal with UTF characters that take up more than one code unit. */
7280
7281 #ifdef MAYBE_UTF_MULTI
7282 if (utf && NOT_FIRSTCU(code[-1]))
7283 {
7284 PCRE2_UCHAR *lastchar = code - 1;
7285 BACKCHAR(lastchar);
7286 mclength = (uint32_t)(code - lastchar); /* Length of UTF character */
7287 memcpy(mcbuffer, lastchar, CU2BYTES(mclength)); /* Save the char */
7288 }
7289 else
7290 #endif /* MAYBE_UTF_MULTI */
7291
7292 /* Handle the case of a single code unit - either with no UTF support, or
7293 with UTF disabled, or for a single-code-unit UTF character. In the latter
7294 case, for a repeated positive match, get the caseless flag for the
7295 required code unit from the previous character, because a class like [Aa]
7296 sets a caseless A but by now the req_caseopt flag has been reset. */
7297
7298 {
7299 mcbuffer[0] = code[-1];
7300 mclength = 1;
7301 if (op_previous <= OP_CHARI && repeat_min > 1)
7302 {
7303 reqcu = mcbuffer[0];
7304 reqcuflags = cb->req_varyopt;
7305 if (op_previous == OP_CHARI) reqcuflags |= REQ_CASELESS;
7306 }
7307 }
7308 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
7309
7310 /* If previous was a character class or a back reference, we put the
7311 repeat stuff after it, but just skip the item if the repeat was {0,0}. */
7312
7313 #ifdef SUPPORT_WIDE_CHARS
7314 case OP_XCLASS:
7315 #endif
7316 case OP_CLASS:
7317 case OP_NCLASS:
7318 case OP_REF:
7319 case OP_REFI:
7320 case OP_DNREF:
7321 case OP_DNREFI:
7322
7323 if (repeat_max == 0)
7324 {
7325 code = previous;
7326 goto END_REPEAT;
7327 }
7328 if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7329
7330 if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)
7331 *code++ = OP_CRSTAR + repeat_type;
7332 else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED)
7333 *code++ = OP_CRPLUS + repeat_type;
7334 else if (repeat_min == 0 && repeat_max == 1)
7335 *code++ = OP_CRQUERY + repeat_type;
7336 else
7337 {
7338 *code++ = OP_CRRANGE + repeat_type;
7339 PUT2INC(code, 0, repeat_min);
7340 if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0; /* 2-byte encoding for max */
7341 PUT2INC(code, 0, repeat_max);
7342 }
7343 break;
7344
7345 /* If previous is OP_FAIL, it was generated by an empty class []
7346 (PCRE2_ALLOW_EMPTY_CLASS is set). The other ways in which OP_FAIL can be
7347 generated, that is by (*FAIL) or (?!), disallow a quantifier at parse
7348 time. We can just ignore this repeat. */
7349
7350 case OP_FAIL:
7351 goto END_REPEAT;
7352
7353 /* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets
7354 because pcre2_match() could not handle backtracking into recursively
7355 called groups. Now that this backtracking is available, we no longer need
7356 to do this. However, we still need to replicate recursions as we do for
7357 groups so as to have independent backtracking points. We can replicate
7358 for the minimum number of repeats directly. For optional repeats we now
7359 wrap the recursion in OP_BRA brackets and make use of the bracket
7360 repetition. */
7361
7362 case OP_RECURSE:
7363 if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7364 goto END_REPEAT;
7365
7366 /* Generate unwrapped repeats for a non-zero minimum, except when the
7367 minimum is 1 and the maximum unlimited, because that can be handled with
7368 OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the
7369 minimum, we just need to generate the appropriate additional copies.
7370 Otherwise we need to generate one more, to simulate the situation when
7371 the minimum is zero. */
7372
7373 if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED))
7374 {
7375 int replicate = repeat_min;
7376 if (repeat_min == repeat_max) replicate--;
7377
7378 /* In the pre-compile phase, we don't actually do the replication. We
7379 just adjust the length as if we had. Do some paranoid checks for
7380 potential integer overflow. */
7381
7382 if (lengthptr != NULL)
7383 {
7384 PCRE2_SIZE delta;
7385 if (PRIV(ckd_smul)(&delta, replicate, 1 + LINK_SIZE) ||
7386 OFLOW_MAX - *lengthptr < delta)
7387 {
7388 *errorcodeptr = ERR20;
7389 return 0;
7390 }
7391 *lengthptr += delta;
7392 }
7393
7394 else for (int i = 0; i < replicate; i++)
7395 {
7396 memcpy(code, previous, CU2BYTES(1 + LINK_SIZE));
7397 previous = code;
7398 code += 1 + LINK_SIZE;
7399 }
7400
7401 /* If the number of repeats is fixed, we are done. Otherwise, adjust
7402 the counts and fall through. */
7403
7404 if (repeat_min == repeat_max) break;
7405 if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7406 repeat_min = 0;
7407 }
7408
7409 /* Wrap the recursion call in OP_BRA brackets. */
7410
7411 (void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE));
7412 op_previous = *previous = OP_BRA;
7413 PUT(previous, 1, 2 + 2*LINK_SIZE);
7414 previous[2 + 2*LINK_SIZE] = OP_KET;
7415 PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
7416 code += 2 + 2 * LINK_SIZE;
7417 length_prevgroup = 3 + 3*LINK_SIZE;
7418 group_return = -1; /* Set "may match empty string" */
7419
7420 /* Now treat as a repeated OP_BRA. */
7421 /* Fall through */
7422
7423 /* If previous was a bracket group, we may have to replicate it in
7424 certain cases. Note that at this point we can encounter only the "basic"
7425 bracket opcodes such as BRA and CBRA, as this is the place where they get
7426 converted into the more special varieties such as BRAPOS and SBRA.
7427 Originally, PCRE did not allow repetition of assertions, but now it does,
7428 for Perl compatibility. */
7429
7430 case OP_ASSERT:
7431 case OP_ASSERT_NOT:
7432 case OP_ASSERT_NA:
7433 case OP_ASSERTBACK:
7434 case OP_ASSERTBACK_NOT:
7435 case OP_ASSERTBACK_NA:
7436 case OP_ONCE:
7437 case OP_SCRIPT_RUN:
7438 case OP_BRA:
7439 case OP_CBRA:
7440 case OP_COND:
7441 {
7442 int len = (int)(code - previous);
7443 PCRE2_UCHAR *bralink = NULL;
7444 PCRE2_UCHAR *brazeroptr = NULL;
7445
7446 if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7447 goto END_REPEAT;
7448
7449 /* Repeating a DEFINE group (or any group where the condition is always
7450 FALSE and there is only one branch) is pointless, but Perl allows the
7451 syntax, so we just ignore the repeat. */
7452
7453 if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
7454 previous[GET(previous, 1)] != OP_ALT)
7455 goto END_REPEAT;
7456
7457 /* Perl allows all assertions to be quantified, and when they contain
7458 capturing parentheses and/or are optional there are potential uses for
7459 this feature. PCRE2 used to force the maximum quantifier to 1 on the
7460 invalid grounds that further repetition was never useful. This was
7461 always a bit pointless, since an assertion could be wrapped with a
7462 repeated group to achieve the effect. General repetition is now
7463 permitted, but if the maximum is unlimited it is set to one more than
7464 the minimum. */
7465
7466 if (op_previous < OP_ONCE) /* Assertion */
7467 {
7468 if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1;
7469 }
7470
7471 /* The case of a zero minimum is special because of the need to stick
7472 OP_BRAZERO in front of it, and because the group appears once in the
7473 data, whereas in other cases it appears the minimum number of times. For
7474 this reason, it is simplest to treat this case separately, as otherwise
7475 the code gets far too messy. There are several special subcases when the
7476 minimum is zero. */
7477
7478 if (repeat_min == 0)
7479 {
7480 /* If the maximum is also zero, we used to just omit the group from
7481 the output altogether, like this:
7482
7483 ** if (repeat_max == 0)
7484 ** {
7485 ** code = previous;
7486 ** goto END_REPEAT;
7487 ** }
7488
7489 However, that fails when a group or a subgroup within it is
7490 referenced as a subroutine from elsewhere in the pattern, so now we
7491 stick in OP_SKIPZERO in front of it so that it is skipped on
7492 execution. As we don't have a list of which groups are referenced, we
7493 cannot do this selectively.
7494
7495 If the maximum is 1 or unlimited, we just have to stick in the
7496 BRAZERO and do no more at this point. */
7497
7498 if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED)
7499 {
7500 (void)memmove(previous + 1, previous, CU2BYTES(len));
7501 code++;
7502 if (repeat_max == 0)
7503 {
7504 *previous++ = OP_SKIPZERO;
7505 goto END_REPEAT;
7506 }
7507 brazeroptr = previous; /* Save for possessive optimizing */
7508 *previous++ = OP_BRAZERO + repeat_type;
7509 }
7510
7511 /* If the maximum is greater than 1 and limited, we have to replicate
7512 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
7513 The first one has to be handled carefully because it's the original
7514 copy, which has to be moved up. The remainder can be handled by code
7515 that is common with the non-zero minimum case below. We have to
7516 adjust the value of repeat_max, since one less copy is required. */
7517
7518 else
7519 {
7520 int linkoffset;
7521 (void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
7522 code += 2 + LINK_SIZE;
7523 *previous++ = OP_BRAZERO + repeat_type;
7524 *previous++ = OP_BRA;
7525
7526 /* We chain together the bracket link offset fields that have to be
7527 filled in later when the ends of the brackets are reached. */
7528
7529 linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink);
7530 bralink = previous;
7531 PUTINC(previous, 0, linkoffset);
7532 }
7533
7534 if (repeat_max != REPEAT_UNLIMITED) repeat_max--;
7535 }
7536
7537 /* If the minimum is greater than zero, replicate the group as many
7538 times as necessary, and adjust the maximum to the number of subsequent
7539 copies that we need. */
7540
7541 else
7542 {
7543 if (repeat_min > 1)
7544 {
7545 /* In the pre-compile phase, we don't actually do the replication.
7546 We just adjust the length as if we had. Do some paranoid checks for
7547 potential integer overflow. */
7548
7549 if (lengthptr != NULL)
7550 {
7551 PCRE2_SIZE delta;
7552 if (PRIV(ckd_smul)(&delta, repeat_min - 1, length_prevgroup) ||
7553 OFLOW_MAX - *lengthptr < delta)
7554 {
7555 *errorcodeptr = ERR20;
7556 return 0;
7557 }
7558 *lengthptr += delta;
7559 }
7560
7561 /* This is compiling for real. If there is a set first code unit
7562 for the group, and we have not yet set a "required code unit", set
7563 it. */
7564
7565 else
7566 {
7567 if (groupsetfirstcu && reqcuflags >= REQ_NONE)
7568 {
7569 reqcu = firstcu;
7570 reqcuflags = firstcuflags;
7571 }
7572 for (uint32_t i = 1; i < repeat_min; i++)
7573 {
7574 memcpy(code, previous, CU2BYTES(len));
7575 code += len;
7576 }
7577 }
7578 }
7579
7580 if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7581 }
7582
7583 /* This code is common to both the zero and non-zero minimum cases. If
7584 the maximum is limited, it replicates the group in a nested fashion,
7585 remembering the bracket starts on a stack. In the case of a zero
7586 minimum, the first one was set up above. In all cases the repeat_max
7587 now specifies the number of additional copies needed. Again, we must
7588 remember to replicate entries on the forward reference list. */
7589
7590 if (repeat_max != REPEAT_UNLIMITED)
7591 {
7592 /* In the pre-compile phase, we don't actually do the replication. We
7593 just adjust the length as if we had. For each repetition we must add
7594 1 to the length for BRAZERO and for all but the last repetition we
7595 must add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
7596 paranoid checks to avoid integer overflow. */
7597
7598 if (lengthptr != NULL && repeat_max > 0)
7599 {
7600 PCRE2_SIZE delta;
7601 if (PRIV(ckd_smul)(&delta, repeat_max,
7602 length_prevgroup + 1 + 2 + 2*LINK_SIZE) ||
7603 OFLOW_MAX + (2 + 2*LINK_SIZE) - *lengthptr < delta)
7604 {
7605 *errorcodeptr = ERR20;
7606 return 0;
7607 }
7608 delta -= (2 + 2*LINK_SIZE); /* Last one doesn't nest */
7609 *lengthptr += delta;
7610 }
7611
7612 /* This is compiling for real */
7613
7614 else for (uint32_t i = repeat_max; i >= 1; i--)
7615 {
7616 *code++ = OP_BRAZERO + repeat_type;
7617
7618 /* All but the final copy start a new nesting, maintaining the
7619 chain of brackets outstanding. */
7620
7621 if (i != 1)
7622 {
7623 int linkoffset;
7624 *code++ = OP_BRA;
7625 linkoffset = (bralink == NULL)? 0 : (int)(code - bralink);
7626 bralink = code;
7627 PUTINC(code, 0, linkoffset);
7628 }
7629
7630 memcpy(code, previous, CU2BYTES(len));
7631 code += len;
7632 }
7633
7634 /* Now chain through the pending brackets, and fill in their length
7635 fields (which are holding the chain links pro tem). */
7636
7637 while (bralink != NULL)
7638 {
7639 int oldlinkoffset;
7640 int linkoffset = (int)(code - bralink + 1);
7641 PCRE2_UCHAR *bra = code - linkoffset;
7642 oldlinkoffset = GET(bra, 1);
7643 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
7644 *code++ = OP_KET;
7645 PUTINC(code, 0, linkoffset);
7646 PUT(bra, 1, linkoffset);
7647 }
7648 }
7649
7650 /* If the maximum is unlimited, set a repeater in the final copy. For
7651 SCRIPT_RUN and ONCE brackets, that's all we need to do. However,
7652 possessively repeated ONCE brackets can be converted into non-capturing
7653 brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this
7654 saves having to deal with possessive ONCEs specially.
7655
7656 Otherwise, when we are doing the actual compile phase, check to see
7657 whether this group is one that could match an empty string. If so,
7658 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
7659 that runtime checking can be done. [This check is also applied to ONCE
7660 and SCRIPT_RUN groups at runtime, but in a different way.]
7661
7662 Then, if the quantifier was possessive and the bracket is not a
7663 conditional, we convert the BRA code to the POS form, and the KET code
7664 to KETRPOS. (It turns out to be convenient at runtime to detect this
7665 kind of subpattern at both the start and at the end.) The use of
7666 special opcodes makes it possible to reduce greatly the stack usage in
7667 pcre2_match(). If the group is preceded by OP_BRAZERO, convert this to
7668 OP_BRAPOSZERO.
7669
7670 Then, if the minimum number of matches is 1 or 0, cancel the possessive
7671 flag so that the default action below, of wrapping everything inside
7672 atomic brackets, does not happen. When the minimum is greater than 1,
7673 there will be earlier copies of the group, and so we still have to wrap
7674 the whole thing. */
7675
7676 else
7677 {
7678 PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
7679 PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
7680
7681 /* Convert possessive ONCE brackets to non-capturing */
7682
7683 if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
7684
7685 /* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need
7686 to do is to set the KET. */
7687
7688 if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN)
7689 *ketcode = OP_KETRMAX + repeat_type;
7690
7691 /* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs
7692 (which have been converted to non-capturing above). */
7693
7694 else
7695 {
7696 /* In the compile phase, adjust the opcode if the group can match
7697 an empty string. For a conditional group with only one branch, the
7698 value of group_return will not show "could be empty", so we must
7699 check that separately. */
7700
7701 if (lengthptr == NULL)
7702 {
7703 if (group_return < 0) *bracode += OP_SBRA - OP_BRA;
7704 if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
7705 *bracode = OP_SCOND;
7706 }
7707
7708 /* Handle possessive quantifiers. */
7709
7710 if (possessive_quantifier)
7711 {
7712 /* For COND brackets, we wrap the whole thing in a possessively
7713 repeated non-capturing bracket, because we have not invented POS
7714 versions of the COND opcodes. */
7715
7716 if (*bracode == OP_COND || *bracode == OP_SCOND)
7717 {
7718 int nlen = (int)(code - bracode);
7719 (void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
7720 code += 1 + LINK_SIZE;
7721 nlen += 1 + LINK_SIZE;
7722 *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
7723 *code++ = OP_KETRPOS;
7724 PUTINC(code, 0, nlen);
7725 PUT(bracode, 1, nlen);
7726 }
7727
7728 /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
7729
7730 else
7731 {
7732 *bracode += 1; /* Switch to xxxPOS opcodes */
7733 *ketcode = OP_KETRPOS;
7734 }
7735
7736 /* If the minimum is zero, mark it as possessive, then unset the
7737 possessive flag when the minimum is 0 or 1. */
7738
7739 if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
7740 if (repeat_min < 2) possessive_quantifier = FALSE;
7741 }
7742
7743 /* Non-possessive quantifier */
7744
7745 else *ketcode = OP_KETRMAX + repeat_type;
7746 }
7747 }
7748 }
7749 break;
7750
7751 /* If previous was a character type match (\d or similar), abolish it and
7752 create a suitable repeat item. The code is shared with single-character
7753 repeats by setting op_type to add a suitable offset into repeat_type.
7754 Note that the Unicode property types will be present only when
7755 SUPPORT_UNICODE is defined, but we don't wrap the little bits of code
7756 here because it just makes it horribly messy. */
7757
7758 default:
7759 if (op_previous >= OP_EODN) /* Not a character type - internal error */
7760 {
7761 *errorcodeptr = ERR10;
7762 return 0;
7763 }
7764 else
7765 {
7766 int prop_type, prop_value;
7767 PCRE2_UCHAR *oldcode;
7768
7769 if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7770
7771 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
7772 mclength = 0; /* Not a character */
7773
7774 if (op_previous == OP_PROP || op_previous == OP_NOTPROP)
7775 {
7776 prop_type = previous[1];
7777 prop_value = previous[2];
7778 }
7779 else
7780 {
7781 /* Come here from just above with a character in mcbuffer/mclength. */
7782 OUTPUT_SINGLE_REPEAT:
7783 prop_type = prop_value = -1;
7784 }
7785
7786 /* At this point, if prop_type == prop_value == -1 we either have a
7787 character in mcbuffer when mclength is greater than zero, or we have
7788 mclength zero, in which case there is a non-property character type in
7789 op_previous. If prop_type/value are not negative, we have a property
7790 character type in op_previous. */
7791
7792 oldcode = code; /* Save where we were */
7793 code = previous; /* Usually overwrite previous item */
7794
7795 /* If the maximum is zero then the minimum must also be zero; Perl allows
7796 this case, so we do too - by simply omitting the item altogether. */
7797
7798 if (repeat_max == 0) goto END_REPEAT;
7799
7800 /* Combine the op_type with the repeat_type */
7801
7802 repeat_type += op_type;
7803
7804 /* A minimum of zero is handled either as the special case * or ?, or as
7805 an UPTO, with the maximum given. */
7806
7807 if (repeat_min == 0)
7808 {
7809 if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;
7810 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
7811 else
7812 {
7813 *code++ = OP_UPTO + repeat_type;
7814 PUT2INC(code, 0, repeat_max);
7815 }
7816 }
7817
7818 /* A repeat minimum of 1 is optimized into some special cases. If the
7819 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
7820 left in place and, if the maximum is greater than 1, we use OP_UPTO with
7821 one less than the maximum. */
7822
7823 else if (repeat_min == 1)
7824 {
7825 if (repeat_max == REPEAT_UNLIMITED)
7826 *code++ = OP_PLUS + repeat_type;
7827 else
7828 {
7829 code = oldcode; /* Leave previous item in place */
7830 if (repeat_max == 1) goto END_REPEAT;
7831 *code++ = OP_UPTO + repeat_type;
7832 PUT2INC(code, 0, repeat_max - 1);
7833 }
7834 }
7835
7836 /* The case {n,n} is just an EXACT, while the general case {n,m} is
7837 handled as an EXACT followed by an UPTO or STAR or QUERY. */
7838
7839 else
7840 {
7841 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
7842 PUT2INC(code, 0, repeat_min);
7843
7844 /* Unless repeat_max equals repeat_min, fill in the data for EXACT,
7845 and then generate the second opcode. For a repeated Unicode property
7846 match, there are two extra values that define the required property,
7847 and mclength is set zero to indicate this. */
7848
7849 if (repeat_max != repeat_min)
7850 {
7851 if (mclength > 0)
7852 {
7853 memcpy(code, mcbuffer, CU2BYTES(mclength));
7854 code += mclength;
7855 }
7856 else
7857 {
7858 *code++ = op_previous;
7859 if (prop_type >= 0)
7860 {
7861 *code++ = prop_type;
7862 *code++ = prop_value;
7863 }
7864 }
7865
7866 /* Now set up the following opcode */
7867
7868 if (repeat_max == REPEAT_UNLIMITED)
7869 *code++ = OP_STAR + repeat_type;
7870 else
7871 {
7872 repeat_max -= repeat_min;
7873 if (repeat_max == 1)
7874 {
7875 *code++ = OP_QUERY + repeat_type;
7876 }
7877 else
7878 {
7879 *code++ = OP_UPTO + repeat_type;
7880 PUT2INC(code, 0, repeat_max);
7881 }
7882 }
7883 }
7884 }
7885
7886 /* Fill in the character or character type for the final opcode. */
7887
7888 if (mclength > 0)
7889 {
7890 memcpy(code, mcbuffer, CU2BYTES(mclength));
7891 code += mclength;
7892 }
7893 else
7894 {
7895 *code++ = op_previous;
7896 if (prop_type >= 0)
7897 {
7898 *code++ = prop_type;
7899 *code++ = prop_value;
7900 }
7901 }
7902 }
7903 break;
7904 } /* End of switch on different op_previous values */
7905
7906
7907 /* If the character following a repeat is '+', possessive_quantifier is
7908 TRUE. For some opcodes, there are special alternative opcodes for this
7909 case. For anything else, we wrap the entire repeated item inside OP_ONCE
7910 brackets. Logically, the '+' notation is just syntactic sugar, taken from
7911 Sun's Java package, but the special opcodes can optimize it.
7912
7913 Some (but not all) possessively repeated subpatterns have already been
7914 completely handled in the code just above. For them, possessive_quantifier
7915 is always FALSE at this stage. Note that the repeated item starts at
7916 tempcode, not at previous, which might be the first part of a string whose
7917 (former) last char we repeated. */
7918
7919 if (possessive_quantifier)
7920 {
7921 int len;
7922
7923 /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
7924 However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
7925 {5,}, or {5,10}). We skip over an EXACT item; if the length of what
7926 remains is greater than zero, there's a further opcode that can be
7927 handled. If not, do nothing, leaving the EXACT alone. */
7928
7929 switch(*tempcode)
7930 {
7931 case OP_TYPEEXACT:
7932 tempcode += PRIV(OP_lengths)[*tempcode] +
7933 ((tempcode[1 + IMM2_SIZE] == OP_PROP
7934 || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
7935 break;
7936
7937 /* CHAR opcodes are used for exacts whose count is 1. */
7938
7939 case OP_CHAR:
7940 case OP_CHARI:
7941 case OP_NOT:
7942 case OP_NOTI:
7943 case OP_EXACT:
7944 case OP_EXACTI:
7945 case OP_NOTEXACT:
7946 case OP_NOTEXACTI:
7947 tempcode += PRIV(OP_lengths)[*tempcode];
7948 #ifdef SUPPORT_UNICODE
7949 if (utf && HAS_EXTRALEN(tempcode[-1]))
7950 tempcode += GET_EXTRALEN(tempcode[-1]);
7951 #endif
7952 break;
7953
7954 /* For the class opcodes, the repeat operator appears at the end;
7955 adjust tempcode to point to it. */
7956
7957 case OP_CLASS:
7958 case OP_NCLASS:
7959 tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
7960 break;
7961
7962 #ifdef SUPPORT_WIDE_CHARS
7963 case OP_XCLASS:
7964 tempcode += GET(tempcode, 1);
7965 break;
7966 #endif
7967 }
7968
7969 /* If tempcode is equal to code (which points to the end of the repeated
7970 item), it means we have skipped an EXACT item but there is no following
7971 QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
7972 all other cases, tempcode will be pointing to the repeat opcode, and will
7973 be less than code, so the value of len will be greater than 0. */
7974
7975 len = (int)(code - tempcode);
7976 if (len > 0)
7977 {
7978 unsigned int repcode = *tempcode;
7979
7980 /* There is a table for possessifying opcodes, all of which are less
7981 than OP_CALLOUT. A zero entry means there is no possessified version.
7982 */
7983
7984 if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
7985 *tempcode = opcode_possessify[repcode];
7986
7987 /* For opcode without a special possessified version, wrap the item in
7988 ONCE brackets. */
7989
7990 else
7991 {
7992 (void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
7993 code += 1 + LINK_SIZE;
7994 len += 1 + LINK_SIZE;
7995 tempcode[0] = OP_ONCE;
7996 *code++ = OP_KET;
7997 PUTINC(code, 0, len);
7998 PUT(tempcode, 1, len);
7999 }
8000 }
8001 }
8002
8003 /* We set the "follows varying string" flag for subsequently encountered
8004 reqcus if it isn't already set and we have just passed a varying length
8005 item. */
8006
8007 END_REPEAT:
8008 cb->req_varyopt |= reqvary;
8009 break;
8010
8011
8012 /* ===================================================================*/
8013 /* Handle a 32-bit data character with a value greater than META_END. */
8014
8015 case META_BIGVALUE:
8016 pptr++;
8017 goto NORMAL_CHAR;
8018
8019
8020 /* ===============================================================*/
8021 /* Handle a back reference by number, which is the meta argument. The
8022 pattern offsets for back references to group numbers less than 10 are held
8023 in a special vector, to avoid using more than two parsed pattern elements
8024 in 64-bit environments. We only need the offset to the first occurrence,
8025 because if that doesn't fail, subsequent ones will also be OK. */
8026
8027 case META_BACKREF:
8028 if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg];
8029 else GETPLUSOFFSET(offset, pptr);
8030
8031 if (meta_arg > cb->bracount)
8032 {
8033 cb->erroroffset = offset;
8034 *errorcodeptr = ERR15; /* Non-existent subpattern */
8035 return 0;
8036 }
8037
8038 /* Come here from named backref handling when the reference is to a
8039 single group (that is, not to a duplicated name). The back reference
8040 data will have already been updated. We must disable firstcu if not
8041 set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'
8042 later. */
8043
8044 HANDLE_SINGLE_REFERENCE:
8045 if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
8046 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
8047 PUT2INC(code, 0, meta_arg);
8048
8049 /* Update the map of back references, and keep the highest one. We
8050 could do this in parse_regex() for numerical back references, but not
8051 for named back references, because we don't know the numbers to which
8052 named back references refer. So we do it all in this function. */
8053
8054 cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;
8055 if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
8056 break;
8057
8058
8059 /* ===============================================================*/
8060 /* Handle recursion by inserting the number of the called group (which is
8061 the meta argument) after OP_RECURSE. At the end of compiling the pattern is
8062 scanned and these numbers are replaced by offsets within the pattern. It is
8063 done like this to avoid problems with forward references and adjusting
8064 offsets when groups are duplicated and moved (as discovered in previous
8065 implementations). Note that a recursion does not have a set first
8066 character. */
8067
8068 case META_RECURSE:
8069 GETPLUSOFFSET(offset, pptr);
8070 if (meta_arg > cb->bracount)
8071 {
8072 cb->erroroffset = offset;
8073 *errorcodeptr = ERR15; /* Non-existent subpattern */
8074 return 0;
8075 }
8076 HANDLE_NUMERICAL_RECURSION:
8077 *code = OP_RECURSE;
8078 PUT(code, 1, meta_arg);
8079 code += 1 + LINK_SIZE;
8080 groupsetfirstcu = FALSE;
8081 cb->had_recurse = TRUE;
8082 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
8083 zerofirstcu = firstcu;
8084 zerofirstcuflags = firstcuflags;
8085 break;
8086
8087
8088 /* ===============================================================*/
8089 /* Handle capturing parentheses; the number is the meta argument. */
8090
8091 case META_CAPTURE:
8092 bravalue = OP_CBRA;
8093 skipunits = IMM2_SIZE;
8094 PUT2(code, 1+LINK_SIZE, meta_arg);
8095 cb->lastcapture = meta_arg;
8096 goto GROUP_PROCESS_NOTE_EMPTY;
8097
8098
8099 /* ===============================================================*/
8100 /* Handle escape sequence items. For ones like \d, the ESC_values are
8101 arranged to be the same as the corresponding OP_values in the default case
8102 when PCRE2_UCP is not set (which is the only case in which they will appear
8103 here).
8104
8105 Note: \Q and \E are never seen here, as they were dealt with in
8106 parse_regex(). Neither are numerical back references or recursions, which
8107 were turned into META_BACKREF or META_RECURSE items, respectively. \k and
8108 \g, when followed by names, are turned into META_BACKREF_BYNAME or
8109 META_RECURSE_BYNAME. */
8110
8111 case META_ESCAPE:
8112
8113 /* We can test for escape sequences that consume a character because their
8114 values lie between ESC_b and ESC_Z; this may have to change if any new ones
8115 are ever created. For these sequences, we disable the setting of a first
8116 character if it hasn't already been set. */
8117
8118 if (meta_arg > ESC_b && meta_arg < ESC_Z)
8119 {
8120 matched_char = TRUE;
8121 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
8122 }
8123
8124 /* Set values to reset to if this is followed by a zero repeat. */
8125
8126 zerofirstcu = firstcu;
8127 zerofirstcuflags = firstcuflags;
8128 zeroreqcu = reqcu;
8129 zeroreqcuflags = reqcuflags;
8130
8131 /* If Unicode is not supported, \P and \p are not allowed and are
8132 faulted at parse time, so will never appear here. */
8133
8134 #ifdef SUPPORT_UNICODE
8135 if (meta_arg == ESC_P || meta_arg == ESC_p)
8136 {
8137 uint32_t ptype = *(++pptr) >> 16;
8138 uint32_t pdata = *pptr & 0xffff;
8139
8140 /* The special case of \p{Any} is compiled to OP_ALLANY so as to benefit
8141 from the auto-anchoring code. */
8142
8143 if (meta_arg == ESC_p && ptype == PT_ANY)
8144 {
8145 *code++ = OP_ALLANY;
8146 }
8147 else
8148 {
8149 *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
8150 *code++ = ptype;
8151 *code++ = pdata;
8152 }
8153 break; /* End META_ESCAPE */
8154 }
8155 #endif
8156
8157 /* \K is forbidden in lookarounds since 10.38 because that's what Perl has
8158 done. However, there's an option, in case anyone was relying on it. */
8159
8160 if (cb->assert_depth > 0 && meta_arg == ESC_K &&
8161 (xoptions & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) == 0)
8162 {
8163 *errorcodeptr = ERR99;
8164 return 0;
8165 }
8166
8167 /* For the rest (including \X when Unicode is supported - if not it's
8168 faulted at parse time), the OP value is the escape value when PCRE2_UCP is
8169 not set; if it is set, most of them do not show up here because they are
8170 converted into Unicode property tests in parse_regex().
8171
8172 In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY
8173 instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds.
8174 There are special UCP codes for \B and \b which are used in UCP mode unless
8175 "word" matching is being forced to ASCII.
8176
8177 Note that \b and \B do a one-character lookbehind, and \A also behaves as
8178 if it does. */
8179
8180 switch(meta_arg)
8181 {
8182 case ESC_C:
8183 cb->external_flags |= PCRE2_HASBKC; /* Record */
8184 #if PCRE2_CODE_UNIT_WIDTH == 32
8185 meta_arg = OP_ALLANY;
8186 #else
8187 if (!utf) meta_arg = OP_ALLANY;
8188 #endif
8189 break;
8190
8191 case ESC_B:
8192 case ESC_b:
8193 if ((options & PCRE2_UCP) != 0 && (xoptions & PCRE2_EXTRA_ASCII_BSW) == 0)
8194 meta_arg = (meta_arg == ESC_B)? OP_NOT_UCP_WORD_BOUNDARY :
8195 OP_UCP_WORD_BOUNDARY;
8196 /* Fall through */
8197
8198 case ESC_A:
8199 if (cb->max_lookbehind == 0) cb->max_lookbehind = 1;
8200 break;
8201 }
8202
8203 *code++ = meta_arg;
8204 break; /* End META_ESCAPE */
8205
8206
8207 /* ===================================================================*/
8208 /* Handle an unrecognized meta value. A parsed pattern value less than
8209 META_END is a literal. Otherwise we have a problem. */
8210
8211 default:
8212 if (meta >= META_END)
8213 {
8214 #ifdef DEBUG_SHOW_PARSED
8215 fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x\n", *pptr);
8216 #endif
8217 *errorcodeptr = ERR89; /* Internal error - unrecognized. */
8218 return 0;
8219 }
8220
8221 /* Handle a literal character. We come here by goto in the case of a
8222 32-bit, non-UTF character whose value is greater than META_END. */
8223
8224 NORMAL_CHAR:
8225 meta = *pptr; /* Get the full 32 bits */
8226 NORMAL_CHAR_SET: /* Character is already in meta */
8227 matched_char = TRUE;
8228
8229 /* For caseless UTF or UCP mode, check whether this character has more than
8230 one other case. If so, generate a special OP_PROP item instead of OP_CHARI.
8231 When casing restrictions apply, ignore caseless sets that start with an
8232 ASCII character. */
8233
8234 #ifdef SUPPORT_UNICODE
8235 if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
8236 {
8237 uint32_t caseset = UCD_CASESET(meta);
8238 if (caseset != 0 &&
8239 ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) == 0 ||
8240 PRIV(ucd_caseless_sets)[caseset] > 127))
8241 {
8242 *code++ = OP_PROP;
8243 *code++ = PT_CLIST;
8244 *code++ = caseset;
8245 if (firstcuflags == REQ_UNSET)
8246 firstcuflags = zerofirstcuflags = REQ_NONE;
8247 break; /* End handling this meta item */
8248 }
8249 }
8250 #endif
8251
8252 /* Caseful matches, or caseless and not one of the multicase characters. We
8253 come here by goto in the case of a positive class that contains only
8254 case-partners of a character with just two cases; matched_char has already
8255 been set TRUE and options fudged if necessary. */
8256
8257 CLASS_CASELESS_CHAR:
8258
8259 /* Get the character's code units into mcbuffer, with the length in
8260 mclength. When not in UTF mode, the length is always 1. */
8261
8262 #ifdef SUPPORT_UNICODE
8263 if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
8264 #endif
8265 {
8266 mclength = 1;
8267 mcbuffer[0] = meta;
8268 }
8269
8270 /* Generate the appropriate code */
8271
8272 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
8273 memcpy(code, mcbuffer, CU2BYTES(mclength));
8274 code += mclength;
8275
8276 /* Remember if \r or \n were seen */
8277
8278 if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
8279 cb->external_flags |= PCRE2_HASCRORLF;
8280
8281 /* Set the first and required code units appropriately. If no previous
8282 first code unit, set it from this character, but revert to none on a zero
8283 repeat. Otherwise, leave the firstcu value alone, and don't change it on
8284 a zero repeat. */
8285
8286 if (firstcuflags == REQ_UNSET)
8287 {
8288 zerofirstcuflags = REQ_NONE;
8289 zeroreqcu = reqcu;
8290 zeroreqcuflags = reqcuflags;
8291
8292 /* If the character is more than one code unit long, we can set a single
8293 firstcu only if it is not to be matched caselessly. Multiple possible
8294 starting code units may be picked up later in the studying code. */
8295
8296 if (mclength == 1 || req_caseopt == 0)
8297 {
8298 firstcu = mcbuffer[0];
8299 firstcuflags = req_caseopt;
8300 if (mclength != 1)
8301 {
8302 reqcu = code[-1];
8303 reqcuflags = cb->req_varyopt;
8304 }
8305 }
8306 else firstcuflags = reqcuflags = REQ_NONE;
8307 }
8308
8309 /* firstcu was previously set; we can set reqcu only if the length is
8310 1 or the matching is caseful. */
8311
8312 else
8313 {
8314 zerofirstcu = firstcu;
8315 zerofirstcuflags = firstcuflags;
8316 zeroreqcu = reqcu;
8317 zeroreqcuflags = reqcuflags;
8318 if (mclength == 1 || req_caseopt == 0)
8319 {
8320 reqcu = code[-1];
8321 reqcuflags = req_caseopt | cb->req_varyopt;
8322 }
8323 }
8324
8325 /* If caselessness was temporarily instated, reset it. */
8326
8327 if (reset_caseful)
8328 {
8329 options &= ~PCRE2_CASELESS;
8330 req_caseopt = 0;
8331 reset_caseful = FALSE;
8332 }
8333
8334 break; /* End literal character handling */
8335 } /* End of big switch */
8336 } /* End of big loop */
8337
8338 /* Control never reaches here. */
8339 }
8340
8341
8342
8343 /*************************************************
8344 * Compile regex: a sequence of alternatives *
8345 *************************************************/
8346
8347 /* On entry, pptr is pointing past the bracket meta, but on return it points to
8348 the closing bracket or META_END. The code variable is pointing at the code unit
8349 into which the BRA operator has been stored. This function is used during the
8350 pre-compile phase when we are trying to find out the amount of memory needed,
8351 as well as during the real compile phase. The value of lengthptr distinguishes
8352 the two phases.
8353
8354 Arguments:
8355 options option bits, including any changes for this subpattern
8356 xoptions extra option bits, ditto
8357 codeptr -> the address of the current code pointer
8358 pptrptr -> the address of the current parsed pattern pointer
8359 errorcodeptr -> pointer to error code variable
8360 skipunits skip this many code units at start (for brackets and OP_COND)
8361 firstcuptr place to put the first required code unit
8362 firstcuflagsptr place to put the first code unit flags
8363 reqcuptr place to put the last required code unit
8364 reqcuflagsptr place to put the last required code unit flags
8365 bcptr pointer to the chain of currently open branches
8366 cb points to the data block with tables pointers etc.
8367 lengthptr NULL during the real compile phase
8368 points to length accumulator during pre-compile phase
8369
8370 Returns: 0 There has been an error
8371 +1 Success, this group must match at least one character
8372 -1 Success, this group may match an empty string
8373 */
8374
8375 static int
compile_regex(uint32_t options,uint32_t xoptions,PCRE2_UCHAR ** codeptr,uint32_t ** pptrptr,int * errorcodeptr,uint32_t skipunits,uint32_t * firstcuptr,uint32_t * firstcuflagsptr,uint32_t * reqcuptr,uint32_t * reqcuflagsptr,branch_chain * bcptr,open_capitem * open_caps,compile_block * cb,PCRE2_SIZE * lengthptr)8376 compile_regex(uint32_t options, uint32_t xoptions, PCRE2_UCHAR **codeptr,
8377 uint32_t **pptrptr, int *errorcodeptr, uint32_t skipunits,
8378 uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
8379 uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
8380 compile_block *cb, PCRE2_SIZE *lengthptr)
8381 {
8382 PCRE2_UCHAR *code = *codeptr;
8383 PCRE2_UCHAR *last_branch = code;
8384 PCRE2_UCHAR *start_bracket = code;
8385 BOOL lookbehind;
8386 open_capitem capitem;
8387 int capnumber = 0;
8388 int okreturn = 1;
8389 uint32_t *pptr = *pptrptr;
8390 uint32_t firstcu, reqcu;
8391 uint32_t lookbehindlength;
8392 uint32_t lookbehindminlength;
8393 uint32_t firstcuflags, reqcuflags;
8394 uint32_t branchfirstcu, branchreqcu;
8395 uint32_t branchfirstcuflags, branchreqcuflags;
8396 PCRE2_SIZE length;
8397 branch_chain bc;
8398
8399 /* If set, call the external function that checks for stack availability. */
8400
8401 if (cb->cx->stack_guard != NULL &&
8402 cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
8403 {
8404 *errorcodeptr= ERR33;
8405 return 0;
8406 }
8407
8408 /* Miscellaneous initialization */
8409
8410 bc.outer = bcptr;
8411 bc.current_branch = code;
8412
8413 firstcu = reqcu = 0;
8414 firstcuflags = reqcuflags = REQ_UNSET;
8415
8416 /* Accumulate the length for use in the pre-compile phase. Start with the
8417 length of the BRA and KET and any extra code units that are required at the
8418 beginning. We accumulate in a local variable to save frequent testing of
8419 lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
8420 start and end of each alternative, because compiled items are discarded during
8421 the pre-compile phase so that the workspace is not exceeded. */
8422
8423 length = 2 + 2*LINK_SIZE + skipunits;
8424
8425 /* Remember if this is a lookbehind assertion, and if it is, save its length
8426 and skip over the pattern offset. */
8427
8428 lookbehind = *code == OP_ASSERTBACK ||
8429 *code == OP_ASSERTBACK_NOT ||
8430 *code == OP_ASSERTBACK_NA;
8431
8432 if (lookbehind)
8433 {
8434 lookbehindlength = META_DATA(pptr[-1]);
8435 lookbehindminlength = *pptr;
8436 pptr += SIZEOFFSET;
8437 }
8438 else lookbehindlength = lookbehindminlength = 0;
8439
8440 /* If this is a capturing subpattern, add to the chain of open capturing items
8441 so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
8442 need be tested here; changing this opcode to one of its variants, e.g.
8443 OP_SCBRAPOS, happens later, after the group has been compiled. */
8444
8445 if (*code == OP_CBRA)
8446 {
8447 capnumber = GET2(code, 1 + LINK_SIZE);
8448 capitem.number = capnumber;
8449 capitem.next = open_caps;
8450 capitem.assert_depth = cb->assert_depth;
8451 open_caps = &capitem;
8452 }
8453
8454 /* Offset is set zero to mark that this bracket is still open */
8455
8456 PUT(code, 1, 0);
8457 code += 1 + LINK_SIZE + skipunits;
8458
8459 /* Loop for each alternative branch */
8460
8461 for (;;)
8462 {
8463 int branch_return;
8464
8465 /* Insert OP_REVERSE or OP_VREVERSE if this is a lookbehind assertion. There
8466 is only a single minimum length for the whole assertion. When the minimum
8467 length is LOOKBEHIND_MAX it means that all branches are of fixed length,
8468 though not necessarily the same length. In this case, the original OP_REVERSE
8469 can be used. It can also be used if a branch in a variable length lookbehind
8470 has the same maximum and minimum. Otherwise, use OP_VREVERSE, which has both
8471 maximum and minimum values. */
8472
8473 if (lookbehind && lookbehindlength > 0)
8474 {
8475 if (lookbehindminlength == LOOKBEHIND_MAX ||
8476 lookbehindminlength == lookbehindlength)
8477 {
8478 *code++ = OP_REVERSE;
8479 PUT2INC(code, 0, lookbehindlength);
8480 length += 1 + IMM2_SIZE;
8481 }
8482 else
8483 {
8484 *code++ = OP_VREVERSE;
8485 PUT2INC(code, 0, lookbehindminlength);
8486 PUT2INC(code, 0, lookbehindlength);
8487 length += 1 + 2*IMM2_SIZE;
8488 }
8489 }
8490
8491 /* Now compile the branch; in the pre-compile phase its length gets added
8492 into the length. */
8493
8494 if ((branch_return =
8495 compile_branch(&options, &xoptions, &code, &pptr, errorcodeptr,
8496 &branchfirstcu, &branchfirstcuflags, &branchreqcu, &branchreqcuflags,
8497 &bc, open_caps, cb, (lengthptr == NULL)? NULL : &length)) == 0)
8498 return 0;
8499
8500 /* If a branch can match an empty string, so can the whole group. */
8501
8502 if (branch_return < 0) okreturn = -1;
8503
8504 /* In the real compile phase, there is some post-processing to be done. */
8505
8506 if (lengthptr == NULL)
8507 {
8508 /* If this is the first branch, the firstcu and reqcu values for the
8509 branch become the values for the regex. */
8510
8511 if (*last_branch != OP_ALT)
8512 {
8513 firstcu = branchfirstcu;
8514 firstcuflags = branchfirstcuflags;
8515 reqcu = branchreqcu;
8516 reqcuflags = branchreqcuflags;
8517 }
8518
8519 /* If this is not the first branch, the first char and reqcu have to
8520 match the values from all the previous branches, except that if the
8521 previous value for reqcu didn't have REQ_VARY set, it can still match,
8522 and we set REQ_VARY for the group from this branch's value. */
8523
8524 else
8525 {
8526 /* If we previously had a firstcu, but it doesn't match the new branch,
8527 we have to abandon the firstcu for the regex, but if there was
8528 previously no reqcu, it takes on the value of the old firstcu. */
8529
8530 if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
8531 {
8532 if (firstcuflags < REQ_NONE)
8533 {
8534 if (reqcuflags >= REQ_NONE)
8535 {
8536 reqcu = firstcu;
8537 reqcuflags = firstcuflags;
8538 }
8539 }
8540 firstcuflags = REQ_NONE;
8541 }
8542
8543 /* If we (now or from before) have no firstcu, a firstcu from the
8544 branch becomes a reqcu if there isn't a branch reqcu. */
8545
8546 if (firstcuflags >= REQ_NONE && branchfirstcuflags < REQ_NONE &&
8547 branchreqcuflags >= REQ_NONE)
8548 {
8549 branchreqcu = branchfirstcu;
8550 branchreqcuflags = branchfirstcuflags;
8551 }
8552
8553 /* Now ensure that the reqcus match */
8554
8555 if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
8556 reqcu != branchreqcu)
8557 reqcuflags = REQ_NONE;
8558 else
8559 {
8560 reqcu = branchreqcu;
8561 reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY if present */
8562 }
8563 }
8564 }
8565
8566 /* Handle reaching the end of the expression, either ')' or end of pattern.
8567 In the real compile phase, go back through the alternative branches and
8568 reverse the chain of offsets, with the field in the BRA item now becoming an
8569 offset to the first alternative. If there are no alternatives, it points to
8570 the end of the group. The length in the terminating ket is always the length
8571 of the whole bracketed item. Return leaving the pointer at the terminating
8572 char. */
8573
8574 if (META_CODE(*pptr) != META_ALT)
8575 {
8576 if (lengthptr == NULL)
8577 {
8578 PCRE2_SIZE branch_length = code - last_branch;
8579 do
8580 {
8581 PCRE2_SIZE prev_length = GET(last_branch, 1);
8582 PUT(last_branch, 1, branch_length);
8583 branch_length = prev_length;
8584 last_branch -= branch_length;
8585 }
8586 while (branch_length > 0);
8587 }
8588
8589 /* Fill in the ket */
8590
8591 *code = OP_KET;
8592 PUT(code, 1, (int)(code - start_bracket));
8593 code += 1 + LINK_SIZE;
8594
8595 /* Set values to pass back */
8596
8597 *codeptr = code;
8598 *pptrptr = pptr;
8599 *firstcuptr = firstcu;
8600 *firstcuflagsptr = firstcuflags;
8601 *reqcuptr = reqcu;
8602 *reqcuflagsptr = reqcuflags;
8603 if (lengthptr != NULL)
8604 {
8605 if (OFLOW_MAX - *lengthptr < length)
8606 {
8607 *errorcodeptr = ERR20;
8608 return 0;
8609 }
8610 *lengthptr += length;
8611 }
8612 return okreturn;
8613 }
8614
8615 /* Another branch follows. In the pre-compile phase, we can move the code
8616 pointer back to where it was for the start of the first branch. (That is,
8617 pretend that each branch is the only one.)
8618
8619 In the real compile phase, insert an ALT node. Its length field points back
8620 to the previous branch while the bracket remains open. At the end the chain
8621 is reversed. It's done like this so that the start of the bracket has a
8622 zero offset until it is closed, making it possible to detect recursion. */
8623
8624 if (lengthptr != NULL)
8625 {
8626 code = *codeptr + 1 + LINK_SIZE + skipunits;
8627 length += 1 + LINK_SIZE;
8628 }
8629 else
8630 {
8631 *code = OP_ALT;
8632 PUT(code, 1, (int)(code - last_branch));
8633 bc.current_branch = last_branch = code;
8634 code += 1 + LINK_SIZE;
8635 }
8636
8637 /* Set the maximum lookbehind length for the next branch (if not in a
8638 lookbehind the value will be zero) and then advance past the vertical bar. */
8639
8640 lookbehindlength = META_DATA(*pptr);
8641 pptr++;
8642 }
8643 /* Control never reaches here */
8644 }
8645
8646
8647
8648 /*************************************************
8649 * Check for anchored pattern *
8650 *************************************************/
8651
8652 /* Try to find out if this is an anchored regular expression. Consider each
8653 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8654 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8655 it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8656 be found, because ^ generates OP_CIRCM in that mode.
8657
8658 We can also consider a regex to be anchored if OP_SOM starts all its branches.
8659 This is the code for \G, which means "match at start of match position, taking
8660 into account the match offset".
8661
8662 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8663 because that will try the rest of the pattern at all possible matching points,
8664 so there is no point trying again.... er ....
8665
8666 .... except when the .* appears inside capturing parentheses, and there is a
8667 subsequent back reference to those parentheses. We haven't enough information
8668 to catch that case precisely.
8669
8670 At first, the best we could do was to detect when .* was in capturing brackets
8671 and the highest back reference was greater than or equal to that level.
8672 However, by keeping a bitmap of the first 31 back references, we can catch some
8673 of the more common cases more precisely.
8674
8675 ... A second exception is when the .* appears inside an atomic group, because
8676 this prevents the number of characters it matches from being adjusted.
8677
8678 Arguments:
8679 code points to start of the compiled pattern
8680 bracket_map a bitmap of which brackets we are inside while testing; this
8681 handles up to substring 31; after that we just have to take
8682 the less precise approach
8683 cb points to the compile data block
8684 atomcount atomic group level
8685 inassert TRUE if in an assertion
8686
8687 Returns: TRUE or FALSE
8688 */
8689
8690 static BOOL
is_anchored(PCRE2_SPTR code,uint32_t bracket_map,compile_block * cb,int atomcount,BOOL inassert)8691 is_anchored(PCRE2_SPTR code, uint32_t bracket_map, compile_block *cb,
8692 int atomcount, BOOL inassert)
8693 {
8694 do {
8695 PCRE2_SPTR scode = first_significant_code(
8696 code + PRIV(OP_lengths)[*code], FALSE);
8697 int op = *scode;
8698
8699 /* Non-capturing brackets */
8700
8701 if (op == OP_BRA || op == OP_BRAPOS ||
8702 op == OP_SBRA || op == OP_SBRAPOS)
8703 {
8704 if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8705 return FALSE;
8706 }
8707
8708 /* Capturing brackets */
8709
8710 else if (op == OP_CBRA || op == OP_CBRAPOS ||
8711 op == OP_SCBRA || op == OP_SCBRAPOS)
8712 {
8713 int n = GET2(scode, 1+LINK_SIZE);
8714 uint32_t new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8715 if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE;
8716 }
8717
8718 /* Positive forward assertion */
8719
8720 else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8721 {
8722 if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8723 }
8724
8725 /* Condition. If there is no second branch, it can't be anchored. */
8726
8727 else if (op == OP_COND || op == OP_SCOND)
8728 {
8729 if (scode[GET(scode,1)] != OP_ALT) return FALSE;
8730 if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8731 return FALSE;
8732 }
8733
8734 /* Atomic groups */
8735
8736 else if (op == OP_ONCE)
8737 {
8738 if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert))
8739 return FALSE;
8740 }
8741
8742 /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8743 it isn't in brackets that are or may be referenced or inside an atomic
8744 group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
8745 because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
8746 with the subject "aab", which matches "b", i.e. not at the start of a line.
8747 There is also an option that disables auto-anchoring. */
8748
8749 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8750 op == OP_TYPEPOSSTAR))
8751 {
8752 if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
8753 atomcount > 0 || cb->had_pruneorskip || inassert ||
8754 (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8755 return FALSE;
8756 }
8757
8758 /* Check for explicit anchoring */
8759
8760 else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8761
8762 code += GET(code, 1);
8763 }
8764 while (*code == OP_ALT); /* Loop for each alternative */
8765 return TRUE;
8766 }
8767
8768
8769
8770 /*************************************************
8771 * Check for starting with ^ or .* *
8772 *************************************************/
8773
8774 /* This is called to find out if every branch starts with ^ or .* so that
8775 "first char" processing can be done to speed things up in multiline
8776 matching and for non-DOTALL patterns that start with .* (which must start at
8777 the beginning or after \n). As in the case of is_anchored() (see above), we
8778 have to take account of back references to capturing brackets that contain .*
8779 because in that case we can't make the assumption. Also, the appearance of .*
8780 inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
8781 or *SKIP does not count, because once again the assumption no longer holds.
8782
8783 Arguments:
8784 code points to start of the compiled pattern or a group
8785 bracket_map a bitmap of which brackets we are inside while testing; this
8786 handles up to substring 31; after that we just have to take
8787 the less precise approach
8788 cb points to the compile data
8789 atomcount atomic group level
8790 inassert TRUE if in an assertion
8791
8792 Returns: TRUE or FALSE
8793 */
8794
8795 static BOOL
is_startline(PCRE2_SPTR code,unsigned int bracket_map,compile_block * cb,int atomcount,BOOL inassert)8796 is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8797 int atomcount, BOOL inassert)
8798 {
8799 do {
8800 PCRE2_SPTR scode = first_significant_code(
8801 code + PRIV(OP_lengths)[*code], FALSE);
8802 int op = *scode;
8803
8804 /* If we are at the start of a conditional assertion group, *both* the
8805 conditional assertion *and* what follows the condition must satisfy the test
8806 for start of line. Other kinds of condition fail. Note that there may be an
8807 auto-callout at the start of a condition. */
8808
8809 if (op == OP_COND)
8810 {
8811 scode += 1 + LINK_SIZE;
8812
8813 if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8814 else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
8815
8816 switch (*scode)
8817 {
8818 case OP_CREF:
8819 case OP_DNCREF:
8820 case OP_RREF:
8821 case OP_DNRREF:
8822 case OP_FAIL:
8823 case OP_FALSE:
8824 case OP_TRUE:
8825 return FALSE;
8826
8827 default: /* Assertion */
8828 if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8829 do scode += GET(scode, 1); while (*scode == OP_ALT);
8830 scode += 1 + LINK_SIZE;
8831 break;
8832 }
8833 scode = first_significant_code(scode, FALSE);
8834 op = *scode;
8835 }
8836
8837 /* Non-capturing brackets */
8838
8839 if (op == OP_BRA || op == OP_BRAPOS ||
8840 op == OP_SBRA || op == OP_SBRAPOS)
8841 {
8842 if (!is_startline(scode, bracket_map, cb, atomcount, inassert))
8843 return FALSE;
8844 }
8845
8846 /* Capturing brackets */
8847
8848 else if (op == OP_CBRA || op == OP_CBRAPOS ||
8849 op == OP_SCBRA || op == OP_SCBRAPOS)
8850 {
8851 int n = GET2(scode, 1+LINK_SIZE);
8852 unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8853 if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
8854 }
8855
8856 /* Positive forward assertions */
8857
8858 else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8859 {
8860 if (!is_startline(scode, bracket_map, cb, atomcount, TRUE))
8861 return FALSE;
8862 }
8863
8864 /* Atomic brackets */
8865
8866 else if (op == OP_ONCE)
8867 {
8868 if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert))
8869 return FALSE;
8870 }
8871
8872 /* .* means "start at start or after \n" if it isn't in atomic brackets or
8873 brackets that may be referenced or an assertion, and as long as the pattern
8874 does not contain *PRUNE or *SKIP, because these break the feature. Consider,
8875 for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
8876 i.e. not at the start of a line. There is also an option that disables this
8877 optimization. */
8878
8879 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8880 {
8881 if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
8882 atomcount > 0 || cb->had_pruneorskip || inassert ||
8883 (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8884 return FALSE;
8885 }
8886
8887 /* Check for explicit circumflex; anything else gives a FALSE result. Note
8888 in particular that this includes atomic brackets OP_ONCE because the number
8889 of characters matched by .* cannot be adjusted inside them. */
8890
8891 else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8892
8893 /* Move on to the next alternative */
8894
8895 code += GET(code, 1);
8896 }
8897 while (*code == OP_ALT); /* Loop for each alternative */
8898 return TRUE;
8899 }
8900
8901
8902
8903 /*************************************************
8904 * Scan compiled regex for recursion reference *
8905 *************************************************/
8906
8907 /* This function scans through a compiled pattern until it finds an instance of
8908 OP_RECURSE.
8909
8910 Arguments:
8911 code points to start of expression
8912 utf TRUE in UTF mode
8913
8914 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
8915 */
8916
8917 static PCRE2_SPTR
find_recurse(PCRE2_SPTR code,BOOL utf)8918 find_recurse(PCRE2_SPTR code, BOOL utf)
8919 {
8920 for (;;)
8921 {
8922 PCRE2_UCHAR c = *code;
8923 if (c == OP_END) return NULL;
8924 if (c == OP_RECURSE) return code;
8925
8926 /* XCLASS is used for classes that cannot be represented just by a bit map.
8927 This includes negated single high-valued characters. CALLOUT_STR is used for
8928 callouts with string arguments. In both cases the length in the table is
8929 zero; the actual length is stored in the compiled code. */
8930
8931 if (c == OP_XCLASS) code += GET(code, 1);
8932 else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
8933
8934 /* Otherwise, we can get the item's length from the table, except that for
8935 repeated character types, we have to test for \p and \P, which have an extra
8936 two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,
8937 we must add in its length. */
8938
8939 else
8940 {
8941 switch(c)
8942 {
8943 case OP_TYPESTAR:
8944 case OP_TYPEMINSTAR:
8945 case OP_TYPEPLUS:
8946 case OP_TYPEMINPLUS:
8947 case OP_TYPEQUERY:
8948 case OP_TYPEMINQUERY:
8949 case OP_TYPEPOSSTAR:
8950 case OP_TYPEPOSPLUS:
8951 case OP_TYPEPOSQUERY:
8952 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
8953 break;
8954
8955 case OP_TYPEPOSUPTO:
8956 case OP_TYPEUPTO:
8957 case OP_TYPEMINUPTO:
8958 case OP_TYPEEXACT:
8959 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
8960 code += 2;
8961 break;
8962
8963 case OP_MARK:
8964 case OP_COMMIT_ARG:
8965 case OP_PRUNE_ARG:
8966 case OP_SKIP_ARG:
8967 case OP_THEN_ARG:
8968 code += code[1];
8969 break;
8970 }
8971
8972 /* Add in the fixed length from the table */
8973
8974 code += PRIV(OP_lengths)[c];
8975
8976 /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
8977 be followed by a multi-unit character. The length in the table is a
8978 minimum, so we have to arrange to skip the extra units. */
8979
8980 #ifdef MAYBE_UTF_MULTI
8981 if (utf) switch(c)
8982 {
8983 case OP_CHAR:
8984 case OP_CHARI:
8985 case OP_NOT:
8986 case OP_NOTI:
8987 case OP_EXACT:
8988 case OP_EXACTI:
8989 case OP_NOTEXACT:
8990 case OP_NOTEXACTI:
8991 case OP_UPTO:
8992 case OP_UPTOI:
8993 case OP_NOTUPTO:
8994 case OP_NOTUPTOI:
8995 case OP_MINUPTO:
8996 case OP_MINUPTOI:
8997 case OP_NOTMINUPTO:
8998 case OP_NOTMINUPTOI:
8999 case OP_POSUPTO:
9000 case OP_POSUPTOI:
9001 case OP_NOTPOSUPTO:
9002 case OP_NOTPOSUPTOI:
9003 case OP_STAR:
9004 case OP_STARI:
9005 case OP_NOTSTAR:
9006 case OP_NOTSTARI:
9007 case OP_MINSTAR:
9008 case OP_MINSTARI:
9009 case OP_NOTMINSTAR:
9010 case OP_NOTMINSTARI:
9011 case OP_POSSTAR:
9012 case OP_POSSTARI:
9013 case OP_NOTPOSSTAR:
9014 case OP_NOTPOSSTARI:
9015 case OP_PLUS:
9016 case OP_PLUSI:
9017 case OP_NOTPLUS:
9018 case OP_NOTPLUSI:
9019 case OP_MINPLUS:
9020 case OP_MINPLUSI:
9021 case OP_NOTMINPLUS:
9022 case OP_NOTMINPLUSI:
9023 case OP_POSPLUS:
9024 case OP_POSPLUSI:
9025 case OP_NOTPOSPLUS:
9026 case OP_NOTPOSPLUSI:
9027 case OP_QUERY:
9028 case OP_QUERYI:
9029 case OP_NOTQUERY:
9030 case OP_NOTQUERYI:
9031 case OP_MINQUERY:
9032 case OP_MINQUERYI:
9033 case OP_NOTMINQUERY:
9034 case OP_NOTMINQUERYI:
9035 case OP_POSQUERY:
9036 case OP_POSQUERYI:
9037 case OP_NOTPOSQUERY:
9038 case OP_NOTPOSQUERYI:
9039 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
9040 break;
9041 }
9042 #else
9043 (void)(utf); /* Keep compiler happy by referencing function argument */
9044 #endif /* MAYBE_UTF_MULTI */
9045 }
9046 }
9047 }
9048
9049
9050
9051 /*************************************************
9052 * Check for asserted fixed first code unit *
9053 *************************************************/
9054
9055 /* During compilation, the "first code unit" settings from forward assertions
9056 are discarded, because they can cause conflicts with actual literals that
9057 follow. However, if we end up without a first code unit setting for an
9058 unanchored pattern, it is worth scanning the regex to see if there is an
9059 initial asserted first code unit. If all branches start with the same asserted
9060 code unit, or with a non-conditional bracket all of whose alternatives start
9061 with the same asserted code unit (recurse ad lib), then we return that code
9062 unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
9063 REQ_NONE in the flags.
9064
9065 Arguments:
9066 code points to start of compiled pattern
9067 flags points to the first code unit flags
9068 inassert non-zero if in an assertion
9069
9070 Returns: the fixed first code unit, or 0 with REQ_NONE in flags
9071 */
9072
9073 static uint32_t
find_firstassertedcu(PCRE2_SPTR code,uint32_t * flags,uint32_t inassert)9074 find_firstassertedcu(PCRE2_SPTR code, uint32_t *flags, uint32_t inassert)
9075 {
9076 uint32_t c = 0;
9077 uint32_t cflags = REQ_NONE;
9078
9079 *flags = REQ_NONE;
9080 do {
9081 uint32_t d;
9082 uint32_t dflags;
9083 int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
9084 *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
9085 PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
9086 PCRE2_UCHAR op = *scode;
9087
9088 switch(op)
9089 {
9090 default:
9091 return 0;
9092
9093 case OP_BRA:
9094 case OP_BRAPOS:
9095 case OP_CBRA:
9096 case OP_SCBRA:
9097 case OP_CBRAPOS:
9098 case OP_SCBRAPOS:
9099 case OP_ASSERT:
9100 case OP_ASSERT_NA:
9101 case OP_ONCE:
9102 case OP_SCRIPT_RUN:
9103 d = find_firstassertedcu(scode, &dflags, inassert +
9104 ((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0));
9105 if (dflags >= REQ_NONE) return 0;
9106 if (cflags >= REQ_NONE) { c = d; cflags = dflags; }
9107 else if (c != d || cflags != dflags) return 0;
9108 break;
9109
9110 case OP_EXACT:
9111 scode += IMM2_SIZE;
9112 /* Fall through */
9113
9114 case OP_CHAR:
9115 case OP_PLUS:
9116 case OP_MINPLUS:
9117 case OP_POSPLUS:
9118 if (inassert == 0) return 0;
9119 if (cflags >= REQ_NONE) { c = scode[1]; cflags = 0; }
9120 else if (c != scode[1]) return 0;
9121 break;
9122
9123 case OP_EXACTI:
9124 scode += IMM2_SIZE;
9125 /* Fall through */
9126
9127 case OP_CHARI:
9128 case OP_PLUSI:
9129 case OP_MINPLUSI:
9130 case OP_POSPLUSI:
9131 if (inassert == 0) return 0;
9132
9133 /* If the character is more than one code unit long, we cannot set its
9134 first code unit when matching caselessly. Later scanning may pick up
9135 multiple code units. */
9136
9137 #ifdef SUPPORT_UNICODE
9138 #if PCRE2_CODE_UNIT_WIDTH == 8
9139 if (scode[1] >= 0x80) return 0;
9140 #elif PCRE2_CODE_UNIT_WIDTH == 16
9141 if (scode[1] >= 0xd800 && scode[1] <= 0xdfff) return 0;
9142 #endif
9143 #endif
9144
9145 if (cflags >= REQ_NONE) { c = scode[1]; cflags = REQ_CASELESS; }
9146 else if (c != scode[1]) return 0;
9147 break;
9148 }
9149
9150 code += GET(code, 1);
9151 }
9152 while (*code == OP_ALT);
9153
9154 *flags = cflags;
9155 return c;
9156 }
9157
9158
9159
9160 /*************************************************
9161 * Add an entry to the name/number table *
9162 *************************************************/
9163
9164 /* This function is called between compiling passes to add an entry to the
9165 name/number table, maintaining alphabetical order. Checking for permitted
9166 and forbidden duplicates has already been done.
9167
9168 Arguments:
9169 cb the compile data block
9170 name the name to add
9171 length the length of the name
9172 groupno the group number
9173 tablecount the count of names in the table so far
9174
9175 Returns: nothing
9176 */
9177
9178 static void
add_name_to_table(compile_block * cb,PCRE2_SPTR name,int length,unsigned int groupno,uint32_t tablecount)9179 add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length,
9180 unsigned int groupno, uint32_t tablecount)
9181 {
9182 uint32_t i;
9183 PCRE2_UCHAR *slot = cb->name_table;
9184
9185 for (i = 0; i < tablecount; i++)
9186 {
9187 int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length));
9188 if (crc == 0 && slot[IMM2_SIZE+length] != 0)
9189 crc = -1; /* Current name is a substring */
9190
9191 /* Make space in the table and break the loop for an earlier name. For a
9192 duplicate or later name, carry on. We do this for duplicates so that in the
9193 simple case (when ?(| is not used) they are in order of their numbers. In all
9194 cases they are in the order in which they appear in the pattern. */
9195
9196 if (crc < 0)
9197 {
9198 (void)memmove(slot + cb->name_entry_size, slot,
9199 CU2BYTES((tablecount - i) * cb->name_entry_size));
9200 break;
9201 }
9202
9203 /* Continue the loop for a later or duplicate name */
9204
9205 slot += cb->name_entry_size;
9206 }
9207
9208 PUT2(slot, 0, groupno);
9209 memcpy(slot + IMM2_SIZE, name, CU2BYTES(length));
9210
9211 /* Add a terminating zero and fill the rest of the slot with zeroes so that
9212 the memory is all initialized. Otherwise valgrind moans about uninitialized
9213 memory when saving serialized compiled patterns. */
9214
9215 memset(slot + IMM2_SIZE + length, 0,
9216 CU2BYTES(cb->name_entry_size - length - IMM2_SIZE));
9217 }
9218
9219
9220
9221 /*************************************************
9222 * Skip in parsed pattern *
9223 *************************************************/
9224
9225 /* This function is called to skip parts of the parsed pattern when finding the
9226 length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
9227 the end of the branch, it is called to skip over an internal lookaround or
9228 (DEFINE) group, and it is also called to skip to the end of a class, during
9229 which it will never encounter nested groups (but there's no need to have
9230 special code for that).
9231
9232 When called to find the end of a branch or group, pptr must point to the first
9233 meta code inside the branch, not the branch-starting code. In other cases it
9234 can point to the item that causes the function to be called.
9235
9236 Arguments:
9237 pptr current pointer to skip from
9238 skiptype PSKIP_CLASS when skipping to end of class
9239 PSKIP_ALT when META_ALT ends the skip
9240 PSKIP_KET when only META_KET ends the skip
9241
9242 Returns: new value of pptr
9243 NULL if META_END is reached - should never occur
9244 or for an unknown meta value - likewise
9245 */
9246
9247 static uint32_t *
parsed_skip(uint32_t * pptr,uint32_t skiptype)9248 parsed_skip(uint32_t *pptr, uint32_t skiptype)
9249 {
9250 uint32_t nestlevel = 0;
9251
9252 for (;; pptr++)
9253 {
9254 uint32_t meta = META_CODE(*pptr);
9255
9256 switch(meta)
9257 {
9258 default: /* Just skip over most items */
9259 if (meta < META_END) continue; /* Literal */
9260 break;
9261
9262 /* This should never occur. */
9263
9264 case META_END:
9265 return NULL;
9266
9267 /* The data for these items is variable in length. */
9268
9269 case META_BACKREF: /* Offset is present only if group >= 10 */
9270 if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;
9271 break;
9272
9273 case META_ESCAPE: /* A few escapes are followed by data items. */
9274 switch (META_DATA(*pptr))
9275 {
9276 case ESC_P:
9277 case ESC_p:
9278 pptr += 1;
9279 break;
9280
9281 case ESC_g:
9282 case ESC_k:
9283 pptr += 1 + SIZEOFFSET;
9284 break;
9285 }
9286 break;
9287
9288 case META_MARK: /* Add the length of the name. */
9289 case META_COMMIT_ARG:
9290 case META_PRUNE_ARG:
9291 case META_SKIP_ARG:
9292 case META_THEN_ARG:
9293 pptr += pptr[1];
9294 break;
9295
9296 /* These are the "active" items in this loop. */
9297
9298 case META_CLASS_END:
9299 if (skiptype == PSKIP_CLASS) return pptr;
9300 break;
9301
9302 case META_ATOMIC:
9303 case META_CAPTURE:
9304 case META_COND_ASSERT:
9305 case META_COND_DEFINE:
9306 case META_COND_NAME:
9307 case META_COND_NUMBER:
9308 case META_COND_RNAME:
9309 case META_COND_RNUMBER:
9310 case META_COND_VERSION:
9311 case META_LOOKAHEAD:
9312 case META_LOOKAHEADNOT:
9313 case META_LOOKAHEAD_NA:
9314 case META_LOOKBEHIND:
9315 case META_LOOKBEHINDNOT:
9316 case META_LOOKBEHIND_NA:
9317 case META_NOCAPTURE:
9318 case META_SCRIPT_RUN:
9319 nestlevel++;
9320 break;
9321
9322 case META_ALT:
9323 if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;
9324 break;
9325
9326 case META_KET:
9327 if (nestlevel == 0) return pptr;
9328 nestlevel--;
9329 break;
9330 }
9331
9332 /* The extra data item length for each meta is in a table. */
9333
9334 meta = (meta >> 16) & 0x7fff;
9335 if (meta >= sizeof(meta_extra_lengths)) return NULL;
9336 pptr += meta_extra_lengths[meta];
9337 }
9338 /* Control never reaches here */
9339 return pptr;
9340 }
9341
9342
9343
9344 /*************************************************
9345 * Find length of a parsed group *
9346 *************************************************/
9347
9348 /* This is called for nested groups within a branch of a lookbehind whose
9349 length is being computed. On entry, the pointer must be at the first element
9350 after the group initializing code. On exit it points to OP_KET. Caching is used
9351 to improve processing speed when the same capturing group occurs many times.
9352
9353 Arguments:
9354 pptrptr pointer to pointer in the parsed pattern
9355 minptr where to return the minimum length
9356 isinline FALSE if a reference or recursion; TRUE for inline group
9357 errcodeptr pointer to the errorcode
9358 lcptr pointer to the loop counter
9359 group number of captured group or -1 for a non-capturing group
9360 recurses chain of recurse_check to catch mutual recursion
9361 cb pointer to the compile data
9362
9363 Returns: the maximum group length or a negative number
9364 */
9365
9366 static int
get_grouplength(uint32_t ** pptrptr,int * minptr,BOOL isinline,int * errcodeptr,int * lcptr,int group,parsed_recurse_check * recurses,compile_block * cb)9367 get_grouplength(uint32_t **pptrptr, int *minptr, BOOL isinline, int *errcodeptr,
9368 int *lcptr, int group, parsed_recurse_check *recurses, compile_block *cb)
9369 {
9370 uint32_t *gi = cb->groupinfo + 2 * group;
9371 int branchlength, branchminlength;
9372 int grouplength = -1;
9373 int groupminlength = INT_MAX;
9374
9375 /* The cache can be used only if there is no possibility of there being two
9376 groups with the same number. We do not need to set the end pointer for a group
9377 that is being processed as a back reference or recursion, but we must do so for
9378 an inline group. */
9379
9380 if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
9381 {
9382 uint32_t groupinfo = gi[0];
9383 if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1;
9384 if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
9385 {
9386 if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
9387 *minptr = gi[1];
9388 return groupinfo & GI_FIXED_LENGTH_MASK;
9389 }
9390 }
9391
9392 /* Scan the group. In this case we find the end pointer of necessity. */
9393
9394 for(;;)
9395 {
9396 branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
9397 recurses, cb);
9398 if (branchlength < 0) goto ISNOTFIXED;
9399 if (branchlength > grouplength) grouplength = branchlength;
9400 if (branchminlength < groupminlength) groupminlength = branchminlength;
9401 if (**pptrptr == META_KET) break;
9402 *pptrptr += 1; /* Skip META_ALT */
9403 }
9404
9405 if (group > 0)
9406 {
9407 gi[0] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
9408 gi[1] = groupminlength;
9409 }
9410
9411 *minptr = groupminlength;
9412 return grouplength;
9413
9414 ISNOTFIXED:
9415 if (group > 0) gi[0] |= GI_NOT_FIXED_LENGTH;
9416 return -1;
9417 }
9418
9419
9420
9421 /*************************************************
9422 * Find length of a parsed branch *
9423 *************************************************/
9424
9425 /* Return fixed maximum and minimum lengths for a branch in a lookbehind,
9426 giving an error if the length is not limited. On entry, *pptrptr points to the
9427 first element inside the branch. On exit it is set to point to the ALT or KET.
9428
9429 Arguments:
9430 pptrptr pointer to pointer in the parsed pattern
9431 minptr where to return the minimum length
9432 errcodeptr pointer to error code
9433 lcptr pointer to loop counter
9434 recurses chain of recurse_check to catch mutual recursion
9435 cb pointer to compile block
9436
9437 Returns: the maximum length, or a negative value on error
9438 */
9439
9440 static int
get_branchlength(uint32_t ** pptrptr,int * minptr,int * errcodeptr,int * lcptr,parsed_recurse_check * recurses,compile_block * cb)9441 get_branchlength(uint32_t **pptrptr, int *minptr, int *errcodeptr, int *lcptr,
9442 parsed_recurse_check *recurses, compile_block *cb)
9443 {
9444 int branchlength = 0;
9445 int branchminlength = 0;
9446 int grouplength, groupminlength;
9447 uint32_t lastitemlength = 0;
9448 uint32_t lastitemminlength = 0;
9449 uint32_t *pptr = *pptrptr;
9450 PCRE2_SIZE offset;
9451 parsed_recurse_check this_recurse;
9452
9453 /* A large and/or complex regex can take too long to process. This can happen
9454 more often when (?| groups are present in the pattern because their length
9455 cannot be cached. */
9456
9457 if ((*lcptr)++ > 2000)
9458 {
9459 *errcodeptr = ERR35; /* Lookbehind is too complicated */
9460 return -1;
9461 }
9462
9463 /* Scan the branch, accumulating the length. */
9464
9465 for (;; pptr++)
9466 {
9467 parsed_recurse_check *r;
9468 uint32_t *gptr, *gptrend;
9469 uint32_t escape;
9470 uint32_t group = 0;
9471 uint32_t itemlength = 0;
9472 uint32_t itemminlength = 0;
9473 uint32_t min, max;
9474
9475 if (*pptr < META_END)
9476 {
9477 itemlength = itemminlength = 1;
9478 }
9479
9480 else switch (META_CODE(*pptr))
9481 {
9482 case META_KET:
9483 case META_ALT:
9484 goto EXIT;
9485
9486 /* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the
9487 actual termination. */
9488
9489 case META_ACCEPT:
9490 case META_FAIL:
9491 pptr = parsed_skip(pptr, PSKIP_ALT);
9492 if (pptr == NULL) goto PARSED_SKIP_FAILED;
9493 goto EXIT;
9494
9495 case META_MARK:
9496 case META_COMMIT_ARG:
9497 case META_PRUNE_ARG:
9498 case META_SKIP_ARG:
9499 case META_THEN_ARG:
9500 pptr += pptr[1] + 1;
9501 break;
9502
9503 case META_CIRCUMFLEX:
9504 case META_COMMIT:
9505 case META_DOLLAR:
9506 case META_PRUNE:
9507 case META_SKIP:
9508 case META_THEN:
9509 break;
9510
9511 case META_OPTIONS:
9512 pptr += 2;
9513 break;
9514
9515 case META_BIGVALUE:
9516 itemlength = itemminlength = 1;
9517 pptr += 1;
9518 break;
9519
9520 case META_CLASS:
9521 case META_CLASS_NOT:
9522 itemlength = itemminlength = 1;
9523 pptr = parsed_skip(pptr, PSKIP_CLASS);
9524 if (pptr == NULL) goto PARSED_SKIP_FAILED;
9525 break;
9526
9527 case META_CLASS_EMPTY_NOT:
9528 case META_DOT:
9529 itemlength = itemminlength = 1;
9530 break;
9531
9532 case META_CALLOUT_NUMBER:
9533 pptr += 3;
9534 break;
9535
9536 case META_CALLOUT_STRING:
9537 pptr += 3 + SIZEOFFSET;
9538 break;
9539
9540 /* Only some escapes consume a character. Of those, \R can match one or two
9541 characters, but \X is never allowed because it matches an unknown number of
9542 characters. \C is allowed only in 32-bit and non-UTF 8/16-bit modes. */
9543
9544 case META_ESCAPE:
9545 escape = META_DATA(*pptr);
9546 if (escape == ESC_X) return -1;
9547 if (escape == ESC_R)
9548 {
9549 itemminlength = 1;
9550 itemlength = 2;
9551 }
9552 else if (escape > ESC_b && escape < ESC_Z)
9553 {
9554 #if PCRE2_CODE_UNIT_WIDTH != 32
9555 if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C)
9556 {
9557 *errcodeptr = ERR36;
9558 return -1;
9559 }
9560 #endif
9561 itemlength = itemminlength = 1;
9562 if (escape == ESC_p || escape == ESC_P) pptr++; /* Skip prop data */
9563 }
9564 break;
9565
9566 /* Lookaheads do not contribute to the length of this branch, but they may
9567 contain lookbehinds within them whose lengths need to be set. */
9568
9569 case META_LOOKAHEAD:
9570 case META_LOOKAHEADNOT:
9571 case META_LOOKAHEAD_NA:
9572 *errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb, lcptr);
9573 if (*errcodeptr != 0) return -1;
9574
9575 /* Ignore any qualifiers that follow a lookahead assertion. */
9576
9577 switch (pptr[1])
9578 {
9579 case META_ASTERISK:
9580 case META_ASTERISK_PLUS:
9581 case META_ASTERISK_QUERY:
9582 case META_PLUS:
9583 case META_PLUS_PLUS:
9584 case META_PLUS_QUERY:
9585 case META_QUERY:
9586 case META_QUERY_PLUS:
9587 case META_QUERY_QUERY:
9588 pptr++;
9589 break;
9590
9591 case META_MINMAX:
9592 case META_MINMAX_PLUS:
9593 case META_MINMAX_QUERY:
9594 pptr += 3;
9595 break;
9596
9597 default:
9598 break;
9599 }
9600 break;
9601
9602 /* A nested lookbehind does not contribute any length to this lookbehind,
9603 but must itself be checked and have its lengths set. */
9604
9605 case META_LOOKBEHIND:
9606 case META_LOOKBEHINDNOT:
9607 case META_LOOKBEHIND_NA:
9608 if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))
9609 return -1;
9610 break;
9611
9612 /* Back references and recursions are handled by very similar code. At this
9613 stage, the names generated in the parsing pass are available, but the main
9614 name table has not yet been created. So for the named varieties, scan the
9615 list of names in order to get the number of the first one in the pattern,
9616 and whether or not this name is duplicated. */
9617
9618 case META_BACKREF_BYNAME:
9619 if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)
9620 goto ISNOTFIXED;
9621 /* Fall through */
9622
9623 case META_RECURSE_BYNAME:
9624 {
9625 int i;
9626 PCRE2_SPTR name;
9627 BOOL is_dupname = FALSE;
9628 named_group *ng = cb->named_groups;
9629 uint32_t meta_code = META_CODE(*pptr);
9630 uint32_t length = *(++pptr);
9631
9632 GETPLUSOFFSET(offset, pptr);
9633 name = cb->start_pattern + offset;
9634 for (i = 0; i < cb->names_found; i++, ng++)
9635 {
9636 if (length == ng->length && PRIV(strncmp)(name, ng->name, length) == 0)
9637 {
9638 group = ng->number;
9639 is_dupname = ng->isdup;
9640 break;
9641 }
9642 }
9643
9644 if (group == 0)
9645 {
9646 *errcodeptr = ERR15; /* Non-existent subpattern */
9647 cb->erroroffset = offset;
9648 return -1;
9649 }
9650
9651 /* A numerical back reference can be fixed length if duplicate capturing
9652 groups are not being used. A non-duplicate named back reference can also
9653 be handled. */
9654
9655 if (meta_code == META_RECURSE_BYNAME ||
9656 (!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0))
9657 goto RECURSE_OR_BACKREF_LENGTH; /* Handle as a numbered version. */
9658 }
9659 goto ISNOTFIXED; /* Duplicate name or number */
9660
9661 /* The offset values for back references < 10 are in a separate vector
9662 because otherwise they would use more than two parsed pattern elements on
9663 64-bit systems. */
9664
9665 case META_BACKREF:
9666 if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 ||
9667 (cb->external_flags & PCRE2_DUPCAPUSED) != 0)
9668 goto ISNOTFIXED;
9669 group = META_DATA(*pptr);
9670 if (group < 10)
9671 {
9672 offset = cb->small_ref_offset[group];
9673 goto RECURSE_OR_BACKREF_LENGTH;
9674 }
9675
9676 /* Fall through */
9677 /* For groups >= 10 - picking up group twice does no harm. */
9678
9679 /* A true recursion implies not fixed length, but a subroutine call may
9680 be OK. Back reference "recursions" are also failed. */
9681
9682 case META_RECURSE:
9683 group = META_DATA(*pptr);
9684 GETPLUSOFFSET(offset, pptr);
9685
9686 RECURSE_OR_BACKREF_LENGTH:
9687 if (group > cb->bracount)
9688 {
9689 cb->erroroffset = offset;
9690 *errcodeptr = ERR15; /* Non-existent subpattern */
9691 return -1;
9692 }
9693 if (group == 0) goto ISNOTFIXED; /* Local recursion */
9694 for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)
9695 {
9696 if (META_CODE(*gptr) == META_BIGVALUE) gptr++;
9697 else if (*gptr == (META_CAPTURE | group)) break;
9698 }
9699
9700 /* We must start the search for the end of the group at the first meta code
9701 inside the group. Otherwise it will be treated as an enclosed group. */
9702
9703 gptrend = parsed_skip(gptr + 1, PSKIP_KET);
9704 if (gptrend == NULL) goto PARSED_SKIP_FAILED;
9705 if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED; /* Local recursion */
9706 for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;
9707 if (r != NULL) goto ISNOTFIXED; /* Mutual recursion */
9708 this_recurse.prev = recurses;
9709 this_recurse.groupptr = gptr;
9710
9711 /* We do not need to know the position of the end of the group, that is,
9712 gptr is not used after the call to get_grouplength(). Setting the second
9713 argument FALSE stops it scanning for the end when the length can be found
9714 in the cache. */
9715
9716 gptr++;
9717 grouplength = get_grouplength(&gptr, &groupminlength, FALSE, errcodeptr,
9718 lcptr, group, &this_recurse, cb);
9719 if (grouplength < 0)
9720 {
9721 if (*errcodeptr == 0) goto ISNOTFIXED;
9722 return -1; /* Error already set */
9723 }
9724 itemlength = grouplength;
9725 itemminlength = groupminlength;
9726 break;
9727
9728 /* A (DEFINE) group is never obeyed inline and so it does not contribute to
9729 the length of this branch. Skip from the following item to the next
9730 unpaired ket. */
9731
9732 case META_COND_DEFINE:
9733 pptr = parsed_skip(pptr + 1, PSKIP_KET);
9734 break;
9735
9736 /* Check other nested groups - advance past the initial data for each type
9737 and then seek a fixed length with get_grouplength(). */
9738
9739 case META_COND_NAME:
9740 case META_COND_NUMBER:
9741 case META_COND_RNAME:
9742 case META_COND_RNUMBER:
9743 pptr += 2 + SIZEOFFSET;
9744 goto CHECK_GROUP;
9745
9746 case META_COND_ASSERT:
9747 pptr += 1;
9748 goto CHECK_GROUP;
9749
9750 case META_COND_VERSION:
9751 pptr += 4;
9752 goto CHECK_GROUP;
9753
9754 case META_CAPTURE:
9755 group = META_DATA(*pptr);
9756 /* Fall through */
9757
9758 case META_ATOMIC:
9759 case META_NOCAPTURE:
9760 case META_SCRIPT_RUN:
9761 pptr++;
9762 CHECK_GROUP:
9763 grouplength = get_grouplength(&pptr, &groupminlength, TRUE, errcodeptr,
9764 lcptr, group, recurses, cb);
9765 if (grouplength < 0) return -1;
9766 itemlength = grouplength;
9767 itemminlength = groupminlength;
9768 break;
9769
9770 case META_QUERY:
9771 case META_QUERY_PLUS:
9772 case META_QUERY_QUERY:
9773 min = 0;
9774 max = 1;
9775 goto REPETITION;
9776
9777 /* Exact repetition is OK; variable repetition is not. A repetition of zero
9778 must subtract the length that has already been added. */
9779
9780 case META_MINMAX:
9781 case META_MINMAX_PLUS:
9782 case META_MINMAX_QUERY:
9783 min = pptr[1];
9784 max = pptr[2];
9785 pptr += 2;
9786
9787 REPETITION:
9788 if (max != REPEAT_UNLIMITED)
9789 {
9790 if (lastitemlength != 0 && /* Should not occur, but just in case */
9791 max != 0 &&
9792 (INT_MAX - branchlength)/lastitemlength < max - 1)
9793 {
9794 *errcodeptr = ERR87; /* Integer overflow; lookbehind too big */
9795 return -1;
9796 }
9797 if (min == 0) branchminlength -= lastitemminlength;
9798 else itemminlength = (min - 1) * lastitemminlength;
9799 if (max == 0) branchlength -= lastitemlength;
9800 else itemlength = (max - 1) * lastitemlength;
9801 break;
9802 }
9803 /* Fall through */
9804
9805 /* Any other item means this branch does not have a fixed length. */
9806
9807 default:
9808 ISNOTFIXED:
9809 *errcodeptr = ERR25; /* Not fixed length */
9810 return -1;
9811 }
9812
9813 /* Add the item length to the branchlength, checking for integer overflow and
9814 for the branch length exceeding the overall limit. Later, if there is at
9815 least one variable-length branch in the group, there is a test for the
9816 (smaller) variable-length branch length limit. */
9817
9818 if (INT_MAX - branchlength < (int)itemlength ||
9819 (branchlength += itemlength) > LOOKBEHIND_MAX)
9820 {
9821 *errcodeptr = ERR87;
9822 return -1;
9823 }
9824
9825 branchminlength += itemminlength;
9826
9827 /* Save this item length for use if the next item is a quantifier. */
9828
9829 lastitemlength = itemlength;
9830 lastitemminlength = itemminlength;
9831 }
9832
9833 EXIT:
9834 *pptrptr = pptr;
9835 *minptr = branchminlength;
9836 return branchlength;
9837
9838 PARSED_SKIP_FAILED:
9839 *errcodeptr = ERR90;
9840 return -1;
9841 }
9842
9843
9844
9845 /*************************************************
9846 * Set lengths in a lookbehind *
9847 *************************************************/
9848
9849 /* This function is called for each lookbehind, to set the lengths in its
9850 branches. An error occurs if any branch does not have a limited maximum length
9851 that is less than the limit (65535). On exit, the pointer must be left on the
9852 final ket.
9853
9854 The function also maintains the max_lookbehind value. Any lookbehind branch
9855 that contains a nested lookbehind may actually look further back than the
9856 length of the branch. The additional amount is passed back from
9857 get_branchlength() as an "extra" value.
9858
9859 Arguments:
9860 pptrptr pointer to pointer in the parsed pattern
9861 errcodeptr pointer to error code
9862 lcptr pointer to loop counter
9863 recurses chain of recurse_check to catch mutual recursion
9864 cb pointer to compile block
9865
9866 Returns: TRUE if all is well
9867 FALSE otherwise, with error code and offset set
9868 */
9869
9870 static BOOL
set_lookbehind_lengths(uint32_t ** pptrptr,int * errcodeptr,int * lcptr,parsed_recurse_check * recurses,compile_block * cb)9871 set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
9872 parsed_recurse_check *recurses, compile_block *cb)
9873 {
9874 PCRE2_SIZE offset;
9875 uint32_t *bptr = *pptrptr;
9876 uint32_t *gbptr = bptr;
9877 int maxlength = 0;
9878 int minlength = INT_MAX;
9879 BOOL variable = FALSE;
9880
9881 READPLUSOFFSET(offset, bptr); /* Offset for error messages */
9882 *pptrptr += SIZEOFFSET;
9883
9884 /* Each branch can have a different maximum length, but we can keep only a
9885 single minimum for the whole group, because there's nowhere to save individual
9886 values in the META_ALT item. */
9887
9888 do
9889 {
9890 int branchlength, branchminlength;
9891
9892 *pptrptr += 1;
9893 branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
9894 recurses, cb);
9895
9896 if (branchlength < 0)
9897 {
9898 /* The errorcode and offset may already be set from a nested lookbehind. */
9899 if (*errcodeptr == 0) *errcodeptr = ERR25;
9900 if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
9901 return FALSE;
9902 }
9903
9904 if (branchlength != branchminlength) variable = TRUE;
9905 if (branchminlength < minlength) minlength = branchminlength;
9906 if (branchlength > maxlength) maxlength = branchlength;
9907 if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;
9908 *bptr |= branchlength; /* branchlength never more than 65535 */
9909 bptr = *pptrptr;
9910 }
9911 while (*bptr == META_ALT);
9912
9913 /* If any branch is of variable length, the whole lookbehind is of variable
9914 length. If the maximum length of any branch exceeds the maximum for variable
9915 lookbehinds, give an error. Otherwise, the minimum length is set in the word
9916 that follows the original group META value. For a fixed-length lookbehind, this
9917 is set to LOOKBEHIND_MAX, to indicate that each branch is of a fixed (but
9918 possibly different) length. */
9919
9920 if (variable)
9921 {
9922 gbptr[1] = minlength;
9923 if ((uint32_t)maxlength > cb->max_varlookbehind)
9924 {
9925 *errcodeptr = ERR100;
9926 cb->erroroffset = offset;
9927 return FALSE;
9928 }
9929 }
9930 else gbptr[1] = LOOKBEHIND_MAX;
9931
9932
9933 gbptr[1] = variable? minlength : LOOKBEHIND_MAX;
9934 return TRUE;
9935 }
9936
9937
9938
9939 /*************************************************
9940 * Check parsed pattern lookbehinds *
9941 *************************************************/
9942
9943 /* This function is called at the end of parsing a pattern if any lookbehinds
9944 were encountered. It scans the parsed pattern for them, calling
9945 set_lookbehind_lengths() for each one. At the start, the errorcode is zero and
9946 the error offset is marked unset. The enables the functions above not to
9947 override settings from deeper nestings.
9948
9949 This function is called recursively from get_branchlength() for lookaheads in
9950 order to process any lookbehinds that they may contain. It stops when it hits a
9951 non-nested closing parenthesis in this case, returning a pointer to it.
9952
9953 Arguments
9954 pptr points to where to start (start of pattern or start of lookahead)
9955 retptr if not NULL, return the ket pointer here
9956 recurses chain of recurse_check to catch mutual recursion
9957 cb points to the compile block
9958 lcptr points to loop counter
9959
9960 Returns: 0 on success, or an errorcode (cb->erroroffset will be set)
9961 */
9962
9963 static int
check_lookbehinds(uint32_t * pptr,uint32_t ** retptr,parsed_recurse_check * recurses,compile_block * cb,int * lcptr)9964 check_lookbehinds(uint32_t *pptr, uint32_t **retptr,
9965 parsed_recurse_check *recurses, compile_block *cb, int *lcptr)
9966 {
9967 int errorcode = 0;
9968 int nestlevel = 0;
9969
9970 cb->erroroffset = PCRE2_UNSET;
9971
9972 for (; *pptr != META_END; pptr++)
9973 {
9974 if (*pptr < META_END) continue; /* Literal */
9975
9976 switch (META_CODE(*pptr))
9977 {
9978 default:
9979 return ERR70; /* Unrecognized meta code */
9980
9981 case META_ESCAPE:
9982 if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
9983 pptr += 1;
9984 break;
9985
9986 case META_KET:
9987 if (--nestlevel < 0)
9988 {
9989 if (retptr != NULL) *retptr = pptr;
9990 return 0;
9991 }
9992 break;
9993
9994 case META_ATOMIC:
9995 case META_CAPTURE:
9996 case META_COND_ASSERT:
9997 case META_LOOKAHEAD:
9998 case META_LOOKAHEADNOT:
9999 case META_LOOKAHEAD_NA:
10000 case META_NOCAPTURE:
10001 case META_SCRIPT_RUN:
10002 nestlevel++;
10003 break;
10004
10005 case META_ACCEPT:
10006 case META_ALT:
10007 case META_ASTERISK:
10008 case META_ASTERISK_PLUS:
10009 case META_ASTERISK_QUERY:
10010 case META_BACKREF:
10011 case META_CIRCUMFLEX:
10012 case META_CLASS:
10013 case META_CLASS_EMPTY:
10014 case META_CLASS_EMPTY_NOT:
10015 case META_CLASS_END:
10016 case META_CLASS_NOT:
10017 case META_COMMIT:
10018 case META_DOLLAR:
10019 case META_DOT:
10020 case META_FAIL:
10021 case META_PLUS:
10022 case META_PLUS_PLUS:
10023 case META_PLUS_QUERY:
10024 case META_PRUNE:
10025 case META_QUERY:
10026 case META_QUERY_PLUS:
10027 case META_QUERY_QUERY:
10028 case META_RANGE_ESCAPED:
10029 case META_RANGE_LITERAL:
10030 case META_SKIP:
10031 case META_THEN:
10032 break;
10033
10034 case META_RECURSE:
10035 pptr += SIZEOFFSET;
10036 break;
10037
10038 case META_BACKREF_BYNAME:
10039 case META_RECURSE_BYNAME:
10040 pptr += 1 + SIZEOFFSET;
10041 break;
10042
10043 case META_COND_DEFINE:
10044 pptr += SIZEOFFSET;
10045 nestlevel++;
10046 break;
10047
10048 case META_COND_NAME:
10049 case META_COND_NUMBER:
10050 case META_COND_RNAME:
10051 case META_COND_RNUMBER:
10052 pptr += 1 + SIZEOFFSET;
10053 nestlevel++;
10054 break;
10055
10056 case META_COND_VERSION:
10057 pptr += 3;
10058 nestlevel++;
10059 break;
10060
10061 case META_CALLOUT_STRING:
10062 pptr += 3 + SIZEOFFSET;
10063 break;
10064
10065 case META_BIGVALUE:
10066 case META_POSIX:
10067 case META_POSIX_NEG:
10068 pptr += 1;
10069 break;
10070
10071 case META_MINMAX:
10072 case META_MINMAX_QUERY:
10073 case META_MINMAX_PLUS:
10074 case META_OPTIONS:
10075 pptr += 2;
10076 break;
10077
10078 case META_CALLOUT_NUMBER:
10079 pptr += 3;
10080 break;
10081
10082 case META_MARK:
10083 case META_COMMIT_ARG:
10084 case META_PRUNE_ARG:
10085 case META_SKIP_ARG:
10086 case META_THEN_ARG:
10087 pptr += 1 + pptr[1];
10088 break;
10089
10090 case META_LOOKBEHIND:
10091 case META_LOOKBEHINDNOT:
10092 case META_LOOKBEHIND_NA:
10093 if (!set_lookbehind_lengths(&pptr, &errorcode, lcptr, recurses, cb))
10094 return errorcode;
10095 break;
10096 }
10097 }
10098
10099 return 0;
10100 }
10101
10102
10103
10104 /*************************************************
10105 * External function to compile a pattern *
10106 *************************************************/
10107
10108 /* This function reads a regular expression in the form of a string and returns
10109 a pointer to a block of store holding a compiled version of the expression.
10110
10111 Arguments:
10112 pattern the regular expression
10113 patlen the length of the pattern, or PCRE2_ZERO_TERMINATED
10114 options option bits
10115 errorptr pointer to errorcode
10116 erroroffset pointer to error offset
10117 ccontext points to a compile context or is NULL
10118
10119 Returns: pointer to compiled data block, or NULL on error,
10120 with errorcode and erroroffset set
10121 */
10122
10123 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_compile(PCRE2_SPTR pattern,PCRE2_SIZE patlen,uint32_t options,int * errorptr,PCRE2_SIZE * erroroffset,pcre2_compile_context * ccontext)10124 pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
10125 int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
10126 {
10127 BOOL utf; /* Set TRUE for UTF mode */
10128 BOOL ucp; /* Set TRUE for UCP mode */
10129 BOOL has_lookbehind = FALSE; /* Set TRUE if a lookbehind is found */
10130 BOOL zero_terminated; /* Set TRUE for zero-terminated pattern */
10131 pcre2_real_code *re = NULL; /* What we will return */
10132 compile_block cb; /* "Static" compile-time data */
10133 const uint8_t *tables; /* Char tables base pointer */
10134
10135 PCRE2_UCHAR *code; /* Current pointer in compiled code */
10136 PCRE2_SPTR codestart; /* Start of compiled code */
10137 PCRE2_SPTR ptr; /* Current pointer in pattern */
10138 uint32_t *pptr; /* Current pointer in parsed pattern */
10139
10140 PCRE2_SIZE length = 1; /* Allow for final END opcode */
10141 PCRE2_SIZE usedlength; /* Actual length used */
10142 PCRE2_SIZE re_blocksize; /* Size of memory block */
10143 PCRE2_SIZE big32count = 0; /* 32-bit literals >= 0x80000000 */
10144 PCRE2_SIZE parsed_size_needed; /* Needed for parsed pattern */
10145
10146 uint32_t firstcuflags, reqcuflags; /* Type of first/req code unit */
10147 uint32_t firstcu, reqcu; /* Value of first/req code unit */
10148 uint32_t setflags = 0; /* NL and BSR set flags */
10149
10150 uint32_t skipatstart; /* When checking (*UTF) etc */
10151 uint32_t limit_heap = UINT32_MAX;
10152 uint32_t limit_match = UINT32_MAX; /* Unset match limits */
10153 uint32_t limit_depth = UINT32_MAX;
10154
10155 int newline = 0; /* Unset; can be set by the pattern */
10156 int bsr = 0; /* Unset; can be set by the pattern */
10157 int errorcode = 0; /* Initialize to avoid compiler warn */
10158 int regexrc; /* Return from compile */
10159
10160 uint32_t i; /* Local loop counter */
10161
10162 /* Comments at the head of this file explain about these variables. */
10163
10164 uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];
10165 uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];
10166 named_group named_groups[NAMED_GROUP_LIST_SIZE];
10167
10168 /* The workspace is used in different ways in the different compiling phases.
10169 It needs to be 16-bit aligned for the preliminary parsing scan. */
10170
10171 uint32_t c16workspace[C16_WORK_SIZE];
10172 PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace;
10173
10174
10175 /* -------------- Check arguments and set up the pattern ----------------- */
10176
10177 /* There must be error code and offset pointers. */
10178
10179 if (errorptr == NULL || erroroffset == NULL) return NULL;
10180 *errorptr = ERR0;
10181 *erroroffset = 0;
10182
10183 /* There must be a pattern, but NULL is allowed with zero length. */
10184
10185 if (pattern == NULL)
10186 {
10187 if (patlen == 0) pattern = (PCRE2_SPTR)""; else
10188 {
10189 *errorptr = ERR16;
10190 return NULL;
10191 }
10192 }
10193
10194 /* A NULL compile context means "use a default context" */
10195
10196 if (ccontext == NULL)
10197 ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
10198
10199 /* PCRE2_MATCH_INVALID_UTF implies UTF */
10200
10201 if ((options & PCRE2_MATCH_INVALID_UTF) != 0) options |= PCRE2_UTF;
10202
10203 /* Check that all undefined public option bits are zero. */
10204
10205 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 ||
10206 (ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)
10207 {
10208 *errorptr = ERR17;
10209 return NULL;
10210 }
10211
10212 if ((options & PCRE2_LITERAL) != 0 &&
10213 ((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 ||
10214 (ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))
10215 {
10216 *errorptr = ERR92;
10217 return NULL;
10218 }
10219
10220 /* A zero-terminated pattern is indicated by the special length value
10221 PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */
10222
10223 if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))
10224 patlen = PRIV(strlen)(pattern);
10225
10226 if (patlen > ccontext->max_pattern_length)
10227 {
10228 *errorptr = ERR88;
10229 return NULL;
10230 }
10231
10232 /* From here on, all returns from this function should end up going via the
10233 EXIT label. */
10234
10235
10236 /* ------------ Initialize the "static" compile data -------------- */
10237
10238 tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
10239
10240 cb.lcc = tables + lcc_offset; /* Individual */
10241 cb.fcc = tables + fcc_offset; /* character */
10242 cb.cbits = tables + cbits_offset; /* tables */
10243 cb.ctypes = tables + ctypes_offset;
10244
10245 cb.assert_depth = 0;
10246 cb.bracount = 0;
10247 cb.cx = ccontext;
10248 cb.dupnames = FALSE;
10249 cb.end_pattern = pattern + patlen;
10250 cb.erroroffset = 0;
10251 cb.external_flags = 0;
10252 cb.external_options = options;
10253 cb.groupinfo = stack_groupinfo;
10254 cb.had_recurse = FALSE;
10255 cb.lastcapture = 0;
10256 cb.max_lookbehind = 0; /* Max encountered */
10257 cb.max_varlookbehind = ccontext->max_varlookbehind; /* Limit */
10258 cb.name_entry_size = 0;
10259 cb.name_table = NULL;
10260 cb.named_groups = named_groups;
10261 cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
10262 cb.names_found = 0;
10263 cb.parens_depth = 0;
10264 cb.parsed_pattern = stack_parsed_pattern;
10265 cb.req_varyopt = 0;
10266 cb.start_code = cworkspace;
10267 cb.start_pattern = pattern;
10268 cb.start_workspace = cworkspace;
10269 cb.workspace_size = COMPILE_WORK_SIZE;
10270
10271 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
10272 references to help in deciding whether (.*) can be treated as anchored or not.
10273 */
10274
10275 cb.top_backref = 0;
10276 cb.backref_map = 0;
10277
10278 /* Escape sequences \1 to \9 are always back references, but as they are only
10279 two characters long, only two elements can be used in the parsed_pattern
10280 vector. The first contains the reference, and we'd like to use the second to
10281 record the offset in the pattern, so that forward references to non-existent
10282 groups can be diagnosed later with an offset. However, on 64-bit systems,
10283 PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first
10284 occurrence of \1 to \9, indexed by the second parsed_pattern value. All other
10285 references have enough space for the offset to be put into the parsed pattern.
10286 */
10287
10288 for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET;
10289
10290
10291 /* --------------- Start looking at the pattern --------------- */
10292
10293 /* Unless PCRE2_LITERAL is set, check for global one-time option settings at
10294 the start of the pattern, and remember the offset to the actual regex. With
10295 valgrind support, make the terminator of a zero-terminated pattern
10296 inaccessible. This catches bugs that would otherwise only show up for
10297 non-zero-terminated patterns. */
10298
10299 #ifdef SUPPORT_VALGRIND
10300 if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1));
10301 #endif
10302
10303 ptr = pattern;
10304 skipatstart = 0;
10305
10306 if ((options & PCRE2_LITERAL) == 0)
10307 {
10308 while (patlen - skipatstart >= 2 &&
10309 ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
10310 ptr[skipatstart+1] == CHAR_ASTERISK)
10311 {
10312 for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
10313 {
10314 uint32_t c, pp;
10315 const pso *p = pso_list + i;
10316
10317 if (patlen - skipatstart - 2 >= p->length &&
10318 PRIV(strncmp_c8)(ptr + skipatstart + 2, (char *)(p->name),
10319 p->length) == 0)
10320 {
10321 skipatstart += p->length + 2;
10322 switch(p->type)
10323 {
10324 case PSO_OPT:
10325 cb.external_options |= p->value;
10326 break;
10327
10328 case PSO_FLG:
10329 setflags |= p->value;
10330 break;
10331
10332 case PSO_NL:
10333 newline = p->value;
10334 setflags |= PCRE2_NL_SET;
10335 break;
10336
10337 case PSO_BSR:
10338 bsr = p->value;
10339 setflags |= PCRE2_BSR_SET;
10340 break;
10341
10342 case PSO_LIMM:
10343 case PSO_LIMD:
10344 case PSO_LIMH:
10345 c = 0;
10346 pp = skipatstart;
10347 if (!IS_DIGIT(ptr[pp]))
10348 {
10349 errorcode = ERR60;
10350 ptr += pp;
10351 goto HAD_EARLY_ERROR;
10352 }
10353 while (IS_DIGIT(ptr[pp]))
10354 {
10355 if (c > UINT32_MAX / 10 - 1) break; /* Integer overflow */
10356 c = c*10 + (ptr[pp++] - CHAR_0);
10357 }
10358 if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS)
10359 {
10360 errorcode = ERR60;
10361 ptr += pp;
10362 goto HAD_EARLY_ERROR;
10363 }
10364 if (p->type == PSO_LIMH) limit_heap = c;
10365 else if (p->type == PSO_LIMM) limit_match = c;
10366 else limit_depth = c;
10367 skipatstart += pp - skipatstart;
10368 break;
10369 }
10370 break; /* Out of the table scan loop */
10371 }
10372 }
10373 if (i >= sizeof(pso_list)/sizeof(pso)) break; /* Out of pso loop */
10374 }
10375 }
10376
10377 /* End of pattern-start options; advance to start of real regex. */
10378
10379 ptr += skipatstart;
10380
10381 /* Can't support UTF or UCP if PCRE2 was built without Unicode support. */
10382
10383 #ifndef SUPPORT_UNICODE
10384 if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
10385 {
10386 errorcode = ERR32;
10387 goto HAD_EARLY_ERROR;
10388 }
10389 #endif
10390
10391 /* Check UTF. We have the original options in 'options', with that value as
10392 modified by (*UTF) etc in cb->external_options. The extra option
10393 PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
10394 surrogate code points cannot be represented in UTF-16. */
10395
10396 utf = (cb.external_options & PCRE2_UTF) != 0;
10397 if (utf)
10398 {
10399 if ((options & PCRE2_NEVER_UTF) != 0)
10400 {
10401 errorcode = ERR74;
10402 goto HAD_EARLY_ERROR;
10403 }
10404 if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
10405 (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
10406 goto HAD_ERROR; /* Offset was set by valid_utf() */
10407
10408 #if PCRE2_CODE_UNIT_WIDTH == 16
10409 if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
10410 {
10411 errorcode = ERR91;
10412 goto HAD_EARLY_ERROR;
10413 }
10414 #endif
10415 }
10416
10417 /* Check UCP lockout. */
10418
10419 ucp = (cb.external_options & PCRE2_UCP) != 0;
10420 if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
10421 {
10422 errorcode = ERR75;
10423 goto HAD_EARLY_ERROR;
10424 }
10425
10426 /* Process the BSR setting. */
10427
10428 if (bsr == 0) bsr = ccontext->bsr_convention;
10429
10430 /* Process the newline setting. */
10431
10432 if (newline == 0) newline = ccontext->newline_convention;
10433 cb.nltype = NLTYPE_FIXED;
10434 switch(newline)
10435 {
10436 case PCRE2_NEWLINE_CR:
10437 cb.nllen = 1;
10438 cb.nl[0] = CHAR_CR;
10439 break;
10440
10441 case PCRE2_NEWLINE_LF:
10442 cb.nllen = 1;
10443 cb.nl[0] = CHAR_NL;
10444 break;
10445
10446 case PCRE2_NEWLINE_NUL:
10447 cb.nllen = 1;
10448 cb.nl[0] = CHAR_NUL;
10449 break;
10450
10451 case PCRE2_NEWLINE_CRLF:
10452 cb.nllen = 2;
10453 cb.nl[0] = CHAR_CR;
10454 cb.nl[1] = CHAR_NL;
10455 break;
10456
10457 case PCRE2_NEWLINE_ANY:
10458 cb.nltype = NLTYPE_ANY;
10459 break;
10460
10461 case PCRE2_NEWLINE_ANYCRLF:
10462 cb.nltype = NLTYPE_ANYCRLF;
10463 break;
10464
10465 default:
10466 errorcode = ERR56;
10467 goto HAD_EARLY_ERROR;
10468 }
10469
10470 /* Pre-scan the pattern to do two things: (1) Discover the named groups and
10471 their numerical equivalents, so that this information is always available for
10472 the remaining processing. (2) At the same time, parse the pattern and put a
10473 processed version into the parsed_pattern vector. This has escapes interpreted
10474 and comments removed (amongst other things).
10475
10476 In all but one case, when PCRE2_AUTO_CALLOUT is not set, the number of unsigned
10477 32-bit ints in the parsed pattern is bounded by the length of the pattern plus
10478 one (for the terminator) plus four if PCRE2_EXTRA_MATCH_WORD or
10479 PCRE2_EXTRA_MATCH_LINE is set. The exceptional case is when running in 32-bit,
10480 non-UTF mode, when literal characters >= META_END (0x80000000) have to be coded
10481 as two units. In this case, therefore, we scan the pattern for such values. */
10482
10483 #if PCRE2_CODE_UNIT_WIDTH == 32
10484 if (!utf)
10485 {
10486 PCRE2_SPTR p;
10487 for (p = ptr; p < cb.end_pattern; p++) if (*p >= META_END) big32count++;
10488 }
10489 #endif
10490
10491 /* Ensure that the parsed pattern buffer is big enough. When PCRE2_AUTO_CALLOUT
10492 is set we have to assume a numerical callout (4 elements) for each character
10493 plus one at the end. This is overkill, but memory is plentiful these days. For
10494 many smaller patterns the vector on the stack (which was set up above) can be
10495 used. */
10496
10497 parsed_size_needed = patlen - skipatstart + big32count;
10498
10499 if ((ccontext->extra_options &
10500 (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0)
10501 parsed_size_needed += 4;
10502
10503 if ((options & PCRE2_AUTO_CALLOUT) != 0)
10504 parsed_size_needed = (parsed_size_needed + 1) * 5;
10505
10506 if (parsed_size_needed >= PARSED_PATTERN_DEFAULT_SIZE)
10507 {
10508 uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(
10509 (parsed_size_needed + 1) * sizeof(uint32_t), ccontext->memctl.memory_data);
10510 if (heap_parsed_pattern == NULL)
10511 {
10512 *errorptr = ERR21;
10513 goto EXIT;
10514 }
10515 cb.parsed_pattern = heap_parsed_pattern;
10516 }
10517 cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed + 1;
10518
10519 /* Do the parsing scan. */
10520
10521 errorcode = parse_regex(ptr, cb.external_options, &has_lookbehind, &cb);
10522 if (errorcode != 0) goto HAD_CB_ERROR;
10523
10524 /* If there are any lookbehinds, scan the parsed pattern to figure out their
10525 lengths. Workspace is needed to remember whether numbered groups are or are not
10526 of limited length, and if limited, what the minimum and maximum lengths are.
10527 This caching saves re-computing the length of any group that is referenced more
10528 than once, which is particularly relevant when recursion is involved.
10529 Unnumbered groups do not have this exposure because they cannot be referenced.
10530 If there are sufficiently few groups, the default index vector on the stack, as
10531 set up above, can be used. Otherwise we have to get/free some heap memory. The
10532 vector must be initialized to zero. */
10533
10534 if (has_lookbehind)
10535 {
10536 int loopcount = 0;
10537 if (cb.bracount >= GROUPINFO_DEFAULT_SIZE/2)
10538 {
10539 cb.groupinfo = ccontext->memctl.malloc(
10540 (2 * (cb.bracount + 1))*sizeof(uint32_t), ccontext->memctl.memory_data);
10541 if (cb.groupinfo == NULL)
10542 {
10543 errorcode = ERR21;
10544 cb.erroroffset = 0;
10545 goto HAD_CB_ERROR;
10546 }
10547 }
10548 memset(cb.groupinfo, 0, (2 * cb.bracount + 1) * sizeof(uint32_t));
10549 errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb, &loopcount);
10550 if (errorcode != 0) goto HAD_CB_ERROR;
10551 }
10552
10553 /* For debugging, there is a function that shows the parsed pattern vector. */
10554
10555 #ifdef DEBUG_SHOW_PARSED
10556 fprintf(stderr, "+++ Pre-scan complete:\n");
10557 show_parsed(&cb);
10558 #endif
10559
10560 /* For debugging capturing information this code can be enabled. */
10561
10562 #ifdef DEBUG_SHOW_CAPTURES
10563 {
10564 named_group *ng = cb.named_groups;
10565 fprintf(stderr, "+++Captures: %d\n", cb.bracount);
10566 for (i = 0; i < cb.names_found; i++, ng++)
10567 {
10568 fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
10569 }
10570 }
10571 #endif
10572
10573 /* Pretend to compile the pattern while actually just accumulating the amount
10574 of memory required in the 'length' variable. This behaviour is triggered by
10575 passing a non-NULL final argument to compile_regex(). We pass a block of
10576 workspace (cworkspace) for it to compile parts of the pattern into; the
10577 compiled code is discarded when it is no longer needed, so hopefully this
10578 workspace will never overflow, though there is a test for its doing so.
10579
10580 On error, errorcode will be set non-zero, so we don't need to look at the
10581 result of the function. The initial options have been put into the cb block,
10582 but we still have to pass a separate options variable (the first argument)
10583 because the options may change as the pattern is processed. */
10584
10585 cb.erroroffset = patlen; /* For any subsequent errors that do not set it */
10586 pptr = cb.parsed_pattern;
10587 code = cworkspace;
10588 *code = OP_BRA;
10589
10590 (void)compile_regex(cb.external_options, ccontext->extra_options, &code, &pptr,
10591 &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, NULL,
10592 &cb, &length);
10593
10594 if (errorcode != 0) goto HAD_CB_ERROR; /* Offset is in cb.erroroffset */
10595
10596 /* This should be caught in compile_regex(), but just in case... */
10597
10598 if (length > MAX_PATTERN_SIZE)
10599 {
10600 errorcode = ERR20;
10601 goto HAD_CB_ERROR;
10602 }
10603
10604 /* Compute the size of, and then get and initialize, the data block for storing
10605 the compiled pattern and names table. Integer overflow should no longer be
10606 possible because nowadays we limit the maximum value of cb.names_found and
10607 cb.name_entry_size. */
10608
10609 re_blocksize = sizeof(pcre2_real_code) +
10610 CU2BYTES(length +
10611 (PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
10612 re = (pcre2_real_code *)
10613 ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
10614 if (re == NULL)
10615 {
10616 errorcode = ERR21;
10617 goto HAD_CB_ERROR;
10618 }
10619
10620 /* The compiler may put padding at the end of the pcre2_real_code structure in
10621 order to round it up to a multiple of 4 or 8 bytes. This means that when a
10622 compiled pattern is copied (for example, when serialized) undefined bytes are
10623 read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
10624 write to the last 8 bytes of the structure before setting the fields. */
10625
10626 memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
10627 re->memctl = ccontext->memctl;
10628 re->tables = tables;
10629 re->executable_jit = NULL;
10630 memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
10631 re->blocksize = re_blocksize;
10632 re->magic_number = MAGIC_NUMBER;
10633 re->compile_options = options;
10634 re->overall_options = cb.external_options;
10635 re->extra_options = ccontext->extra_options;
10636 re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
10637 re->limit_heap = limit_heap;
10638 re->limit_match = limit_match;
10639 re->limit_depth = limit_depth;
10640 re->first_codeunit = 0;
10641 re->last_codeunit = 0;
10642 re->bsr_convention = bsr;
10643 re->newline_convention = newline;
10644 re->max_lookbehind = 0;
10645 re->minlength = 0;
10646 re->top_bracket = 0;
10647 re->top_backref = 0;
10648 re->name_entry_size = cb.name_entry_size;
10649 re->name_count = cb.names_found;
10650
10651 /* The basic block is immediately followed by the name table, and the compiled
10652 code follows after that. */
10653
10654 codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) +
10655 re->name_entry_size * re->name_count;
10656
10657 /* Update the compile data block for the actual compile. The starting points of
10658 the name/number translation table and of the code are passed around in the
10659 compile data block. The start/end pattern and initial options are already set
10660 from the pre-compile phase, as is the name_entry_size field. */
10661
10662 cb.parens_depth = 0;
10663 cb.assert_depth = 0;
10664 cb.lastcapture = 0;
10665 cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
10666 cb.start_code = codestart;
10667 cb.req_varyopt = 0;
10668 cb.had_accept = FALSE;
10669 cb.had_pruneorskip = FALSE;
10670
10671 /* If any named groups were found, create the name/number table from the list
10672 created in the pre-pass. */
10673
10674 if (cb.names_found > 0)
10675 {
10676 named_group *ng = cb.named_groups;
10677 for (i = 0; i < cb.names_found; i++, ng++)
10678 add_name_to_table(&cb, ng->name, ng->length, ng->number, i);
10679 }
10680
10681 /* Set up a starting, non-extracting bracket, then compile the expression. On
10682 error, errorcode will be set non-zero, so we don't need to look at the result
10683 of the function here. */
10684
10685 pptr = cb.parsed_pattern;
10686 code = (PCRE2_UCHAR *)codestart;
10687 *code = OP_BRA;
10688 regexrc = compile_regex(re->overall_options, ccontext->extra_options, &code,
10689 &pptr, &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL,
10690 NULL, &cb, NULL);
10691 if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
10692 re->top_bracket = cb.bracount;
10693 re->top_backref = cb.top_backref;
10694 re->max_lookbehind = cb.max_lookbehind;
10695
10696 if (cb.had_accept)
10697 {
10698 reqcu = 0; /* Must disable after (*ACCEPT) */
10699 reqcuflags = REQ_NONE;
10700 re->flags |= PCRE2_HASACCEPT; /* Disables minimum length */
10701 }
10702
10703 /* Fill in the final opcode and check for disastrous overflow. If no overflow,
10704 but the estimated length exceeds the really used length, adjust the value of
10705 re->blocksize, and if valgrind support is configured, mark the extra allocated
10706 memory as unaddressable, so that any out-of-bound reads can be detected. */
10707
10708 *code++ = OP_END;
10709 usedlength = code - codestart;
10710 if (usedlength > length) errorcode = ERR23; else
10711 {
10712 re->blocksize -= CU2BYTES(length - usedlength);
10713 #ifdef SUPPORT_VALGRIND
10714 VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
10715 #endif
10716 }
10717
10718 /* Scan the pattern for recursion/subroutine calls and convert the group
10719 numbers into offsets. Maintain a small cache so that repeated groups containing
10720 recursions are efficiently handled. */
10721
10722 #define RSCAN_CACHE_SIZE 8
10723
10724 if (errorcode == 0 && cb.had_recurse)
10725 {
10726 PCRE2_UCHAR *rcode;
10727 PCRE2_SPTR rgroup;
10728 unsigned int ccount = 0;
10729 int start = RSCAN_CACHE_SIZE;
10730 recurse_cache rc[RSCAN_CACHE_SIZE];
10731
10732 for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf);
10733 rcode != NULL;
10734 rcode = (PCRE2_UCHAR *)find_recurse(rcode + 1 + LINK_SIZE, utf))
10735 {
10736 int p, groupnumber;
10737
10738 groupnumber = (int)GET(rcode, 1);
10739 if (groupnumber == 0) rgroup = codestart; else
10740 {
10741 PCRE2_SPTR search_from = codestart;
10742 rgroup = NULL;
10743 for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
10744 {
10745 if (groupnumber == rc[p].groupnumber)
10746 {
10747 rgroup = rc[p].group;
10748 break;
10749 }
10750
10751 /* Group n+1 must always start to the right of group n, so we can save
10752 search time below when the new group number is greater than any of the
10753 previously found groups. */
10754
10755 if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;
10756 }
10757
10758 if (rgroup == NULL)
10759 {
10760 rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);
10761 if (rgroup == NULL)
10762 {
10763 errorcode = ERR53;
10764 break;
10765 }
10766 if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
10767 rc[start].groupnumber = groupnumber;
10768 rc[start].group = rgroup;
10769 if (ccount < RSCAN_CACHE_SIZE) ccount++;
10770 }
10771 }
10772
10773 PUT(rcode, 1, rgroup - codestart);
10774 }
10775 }
10776
10777 /* In rare debugging situations we sometimes need to look at the compiled code
10778 at this stage. */
10779
10780 #ifdef DEBUG_CALL_PRINTINT
10781 pcre2_printint(re, stderr, TRUE);
10782 fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
10783 #endif
10784
10785 /* Unless disabled, check whether any single character iterators can be
10786 auto-possessified. The function overwrites the appropriate opcode values, so
10787 the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
10788 used in this code because at least one compiler gives a warning about loss of
10789 "const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the
10790 function call. */
10791
10792 if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
10793 {
10794 PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
10795 if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80;
10796 }
10797
10798 /* Failed to compile, or error while post-processing. */
10799
10800 if (errorcode != 0) goto HAD_CB_ERROR;
10801
10802 /* Successful compile. If the anchored option was not passed, set it if
10803 we can determine that the pattern is anchored by virtue of ^ characters or \A
10804 or anything else, such as starting with non-atomic .* when DOTALL is set and
10805 there are no occurrences of *PRUNE or *SKIP (though there is an option to
10806 disable this case). */
10807
10808 if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10809 is_anchored(codestart, 0, &cb, 0, FALSE))
10810 re->overall_options |= PCRE2_ANCHORED;
10811
10812 /* Set up the first code unit or startline flag, the required code unit, and
10813 then study the pattern. This code need not be obeyed if PCRE2_NO_START_OPTIMIZE
10814 is set, as the data it would create will not be used. Note that a first code
10815 unit (but not the startline flag) is useful for anchored patterns because it
10816 can still give a quick "no match" and also avoid searching for a last code
10817 unit. */
10818
10819 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
10820 {
10821 int minminlength = 0; /* For minimal minlength from first/required CU */
10822
10823 /* If we do not have a first code unit, see if there is one that is asserted
10824 (these are not saved during the compile because they can cause conflicts with
10825 actual literals that follow). */
10826
10827 if (firstcuflags >= REQ_NONE)
10828 firstcu = find_firstassertedcu(codestart, &firstcuflags, 0);
10829
10830 /* Save the data for a first code unit. The existence of one means the
10831 minimum length must be at least 1. */
10832
10833 if (firstcuflags < REQ_NONE)
10834 {
10835 re->first_codeunit = firstcu;
10836 re->flags |= PCRE2_FIRSTSET;
10837 minminlength++;
10838
10839 /* Handle caseless first code units. */
10840
10841 if ((firstcuflags & REQ_CASELESS) != 0)
10842 {
10843 if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
10844 {
10845 if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
10846 }
10847
10848 /* The first code unit is >= 128 in UTF or UCP mode, or >= 255 otherwise.
10849 In 8-bit UTF mode, codepoints in the range 128-255 are introductory code
10850 points and cannot have another case, but if UCP is set they may do. */
10851
10852 #ifdef SUPPORT_UNICODE
10853 #if PCRE2_CODE_UNIT_WIDTH == 8
10854 else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
10855 re->flags |= PCRE2_FIRSTCASELESS;
10856 #else
10857 else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
10858 UCD_OTHERCASE(firstcu) != firstcu)
10859 re->flags |= PCRE2_FIRSTCASELESS;
10860 #endif
10861 #endif /* SUPPORT_UNICODE */
10862 }
10863 }
10864
10865 /* When there is no first code unit, for non-anchored patterns, see if we can
10866 set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
10867 branches start with ^ and also when all branches start with non-atomic .* for
10868 non-DOTALL matches when *PRUNE and *SKIP are not present. (There is an option
10869 that disables this case.) */
10870
10871 else if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10872 is_startline(codestart, 0, &cb, 0, FALSE))
10873 re->flags |= PCRE2_STARTLINE;
10874
10875 /* Handle the "required code unit", if one is set. In the UTF case we can
10876 increment the minimum minimum length only if we are sure this really is a
10877 different character and not a non-starting code unit of the first character,
10878 because the minimum length count is in characters, not code units. */
10879
10880 if (reqcuflags < REQ_NONE)
10881 {
10882 #if PCRE2_CODE_UNIT_WIDTH == 16
10883 if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */
10884 firstcuflags >= REQ_NONE || /* First not set */
10885 (firstcu & 0xf800) != 0xd800 || /* First not surrogate */
10886 (reqcu & 0xfc00) != 0xdc00) /* Req not low surrogate */
10887 #elif PCRE2_CODE_UNIT_WIDTH == 8
10888 if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */
10889 firstcuflags >= REQ_NONE || /* First not set */
10890 (firstcu & 0x80) == 0 || /* First is ASCII */
10891 (reqcu & 0x80) == 0) /* Req is ASCII */
10892 #endif
10893 {
10894 minminlength++;
10895 }
10896
10897 /* In the case of an anchored pattern, set up the value only if it follows
10898 a variable length item in the pattern. */
10899
10900 if ((re->overall_options & PCRE2_ANCHORED) == 0 ||
10901 (reqcuflags & REQ_VARY) != 0)
10902 {
10903 re->last_codeunit = reqcu;
10904 re->flags |= PCRE2_LASTSET;
10905
10906 /* Handle caseless required code units as for first code units (above). */
10907
10908 if ((reqcuflags & REQ_CASELESS) != 0)
10909 {
10910 if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
10911 {
10912 if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
10913 }
10914 #ifdef SUPPORT_UNICODE
10915 #if PCRE2_CODE_UNIT_WIDTH == 8
10916 else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
10917 re->flags |= PCRE2_LASTCASELESS;
10918 #else
10919 else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
10920 UCD_OTHERCASE(reqcu) != reqcu)
10921 re->flags |= PCRE2_LASTCASELESS;
10922 #endif
10923 #endif /* SUPPORT_UNICODE */
10924 }
10925 }
10926 }
10927
10928 /* Study the compiled pattern to set up information such as a bitmap of
10929 starting code units and a minimum matching length. */
10930
10931 if (PRIV(study)(re) != 0)
10932 {
10933 errorcode = ERR31;
10934 goto HAD_CB_ERROR;
10935 }
10936
10937 /* If study() set a bitmap of starting code units, it implies a minimum
10938 length of at least one. */
10939
10940 if ((re->flags & PCRE2_FIRSTMAPSET) != 0 && minminlength == 0)
10941 minminlength = 1;
10942
10943 /* If the minimum length set (or not set) by study() is less than the minimum
10944 implied by required code units, override it. */
10945
10946 if (re->minlength < minminlength) re->minlength = minminlength;
10947 } /* End of start-of-match optimizations. */
10948
10949 /* Control ends up here in all cases. When running under valgrind, make a
10950 pattern's terminating zero defined again. If memory was obtained for the parsed
10951 version of the pattern, free it before returning. Also free the list of named
10952 groups if a larger one had to be obtained, and likewise the group information
10953 vector. */
10954
10955 EXIT:
10956 #ifdef SUPPORT_VALGRIND
10957 if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1));
10958 #endif
10959 if (cb.parsed_pattern != stack_parsed_pattern)
10960 ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);
10961 if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
10962 ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
10963 if (cb.groupinfo != stack_groupinfo)
10964 ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
10965 return re; /* Will be NULL after an error */
10966
10967 /* Errors discovered in parse_regex() set the offset value in the compile
10968 block. Errors discovered before it is called must compute it from the ptr
10969 value. After parse_regex() is called, the offset in the compile block is set to
10970 the end of the pattern, but certain errors in compile_regex() may reset it if
10971 an offset is available in the parsed pattern. */
10972
10973 HAD_CB_ERROR:
10974 ptr = pattern + cb.erroroffset;
10975
10976 HAD_EARLY_ERROR:
10977 *erroroffset = ptr - pattern;
10978
10979 HAD_ERROR:
10980 *errorptr = errorcode;
10981 pcre2_code_free(re);
10982 re = NULL;
10983 goto EXIT;
10984 }
10985
10986 /* These #undefs are here to enable unity builds with CMake. They remove the macros so that code following in the same translation unit does not see them. */
10987
10988 #undef NLBLOCK /* Block containing newline information */
10989 #undef PSSTART /* Field containing processed string start */
10990 #undef PSEND /* Field containing processed string end */
10991
10992 /* End of pcre2_compile.c */
10993