1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016-2021 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45
46 #define NLBLOCK cb /* Block containing newline information */
47 #define PSSTART start_pattern /* Field containing processed string start */
48 #define PSEND end_pattern /* Field containing processed string end */
49
50 #include "pcre2_internal.h"
51
52 /* In rare error cases debugging might require calling pcre2_printint(). */
53
54 #if 0
55 #ifdef EBCDIC
56 #define PRINTABLE(c) ((c) >= 64 && (c) < 255)
57 #else
58 #define PRINTABLE(c) ((c) >= 32 && (c) < 127)
59 #endif
60 #include "pcre2_printint.c"
61 #define DEBUG_CALL_PRINTINT
62 #endif
63
64 /* Other debugging code can be enabled by these defines. */
65
66 /* #define DEBUG_SHOW_CAPTURES */
67 /* #define DEBUG_SHOW_PARSED */
68
69 /* There are a few things that vary with different code unit sizes. Handle them
70 by defining macros in order to minimize #if usage. */
71
/* XDIGIT(c) looks a character up in xdigitab. With 8-bit code units the table
is indexed directly; with wider code units the index is first guarded by
MAX_255() and anything that fails the guard yields 0xff, the table's own
"not a hex digit" value. */

#if PCRE2_CODE_UNIT_WIDTH == 8
#define XDIGIT(c) xdigitab[c]
#else  /* 16-bit or 32-bit */
#define XDIGIT(c) (MAX_255(c)? xdigitab[c] : 0xff)
#endif

/* STRING_UTFn_RIGHTPAR expands to two comma-separated items: the width-specific
string macro followed by a number (presumably that string's length - confirm
against the string definitions). */

#if PCRE2_CODE_UNIT_WIDTH == 8
#define STRING_UTFn_RIGHTPAR STRING_UTF8_RIGHTPAR, 5
#elif PCRE2_CODE_UNIT_WIDTH == 16
#define STRING_UTFn_RIGHTPAR STRING_UTF16_RIGHTPAR, 6
#else  /* 32-bit */
#define STRING_UTFn_RIGHTPAR STRING_UTF32_RIGHTPAR, 6
#endif
86
87 /* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which
88 consists of uint32_t elements. Assume that if uint32_t can't hold it, two of
89 them will be able to (i.e. assume a 64-bit world). */
90
#if PCRE2_SIZE_MAX <= UINT32_MAX

/* One uint32_t element holds the whole value. These forms are plain
expressions. Arguments are parenthesized so that an expression such as
"ptr + 1" can be passed where the pointer is only read (READPLUSOFFSET); the
other macros modify p, which must therefore be a pointer lvalue. */

#define PUTOFFSET(s,p)      (*(p)++ = (s))
#define GETOFFSET(s,p)      ((s) = *(p)++)
#define GETPLUSOFFSET(s,p)  ((s) = *(++(p)))
#define READPLUSOFFSET(s,p) ((s) = (p)[1])
#define SKIPOFFSET(p)       ((p)++)
#define SIZEOFFSET 1

#else

/* Two uint32_t elements hold the value, most significant half first. The
multi-statement forms are wrapped in do { } while (0) so that each use behaves
as a single statement when followed by a semicolon, including as the body of
an unbraced if/else. Note that s is evaluated twice by PUTOFFSET. */

#define PUTOFFSET(s,p) \
  do { *(p)++ = (uint32_t)((s) >> 32); \
       *(p)++ = (uint32_t)((s) & 0xffffffff); } while (0)
#define GETOFFSET(s,p) \
  do { (s) = ((PCRE2_SIZE)(p)[0] << 32) | (PCRE2_SIZE)(p)[1]; \
       (p) += 2; } while (0)
#define GETPLUSOFFSET(s,p) \
  do { (s) = ((PCRE2_SIZE)(p)[1] << 32) | (PCRE2_SIZE)(p)[2]; \
       (p) += 2; } while (0)
#define READPLUSOFFSET(s,p) \
  do { (s) = ((PCRE2_SIZE)(p)[1] << 32) | (PCRE2_SIZE)(p)[2]; } while (0)
#define SKIPOFFSET(p)       ((p) += 2)
#define SIZEOFFSET 2

#endif
110
/* Macros for manipulating elements of the parsed pattern vector.

META_CODE(x)    keeps the top 16 bits of x (the item identifier).
META_DATA(x)    keeps the bottom 16 bits of x (the item's data).
META_DIFF(x,y)  is the difference x-y shifted down by 16 bits.

Every argument is parenthesized so that compound expressions such as
"code | data" or "base + step" are handled as a unit. */

#define META_CODE(x)   ((x) & 0xffff0000u)
#define META_DATA(x)   ((x) & 0x0000ffffu)
#define META_DIFF(x,y) (((x)-(y))>>16)
116
117 /* Forward declarations (prototypes only) of static functions, given here to allow mutual recursion */
118
119 #ifdef SUPPORT_UNICODE /* this prototype exists only when SUPPORT_UNICODE is defined */
120 static unsigned int
121 add_list_to_class_internal(uint8_t *, PCRE2_UCHAR **, uint32_t,
122 compile_block *, const uint32_t *, unsigned int);
123 #endif
124
125 static int
126 compile_regex(uint32_t, PCRE2_UCHAR **, uint32_t **, int *, uint32_t,
127 uint32_t *, int32_t *, uint32_t *, int32_t *, branch_chain *,
128 compile_block *, PCRE2_SIZE *);
129
130 static int
131 get_branchlength(uint32_t **, int *, int *, parsed_recurse_check *,
132 compile_block *);
133
134 static BOOL
135 set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,
136 compile_block *);
137
138 static int
139 check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *,
140 compile_block *, int *);
141
142
143 /*************************************************
144 * Code parameters and static tables *
145 *************************************************/
146
/* Largest group number and largest explicit repeat count (both 65535), and
the value one above the largest repeat count that stands for "unlimited". */

#define MAX_GROUP_NUMBER  0xffffu
#define MAX_REPEAT_COUNT  0xffffu
#define REPEAT_UNLIMITED  (MAX_REPEAT_COUNT + 1u)
150
151 /* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in
152 different ways in the different pattern scans. The parsing and group-
153 identifying pre-scan uses it to handle nesting, and needs it to be 16-bit
154 aligned for this. Having defined the size in code units, we set up
155 C16_WORK_SIZE as the number of elements in the 16-bit vector.
156
157 During the first compiling phase, when determining how much memory is required,
158 the regex is partly compiled into this space, but the compiled parts are
159 discarded as soon as they can be, so that hopefully there will never be an
160 overrun. The code does, however, check for an overrun, which can occur for
161 pathological patterns. The size of the workspace depends on LINK_SIZE because
162 the length of compiled items varies with this.
163
164 In the real compile phase, this workspace is not currently used. */
165
166 #define COMPILE_WORK_SIZE (3000*LINK_SIZE) /* Size in code units */
167
168 #define C16_WORK_SIZE \
169 ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t))
170
171 /* A uint32_t vector is used for caching information about the size of
172 capturing groups, to improve performance. A default is created on the stack of
173 this size. */
174
175 #define GROUPINFO_DEFAULT_SIZE 256
176
177 /* The overrun tests check for a slightly smaller size so that they detect the
178 overrun before it actually does run off the end of the data block. */
179
180 #define WORK_SIZE_SAFETY_MARGIN (100)
181
182 /* This value determines the size of the initial vector that is used for
183 remembering named groups during the pre-compile. It is allocated on the stack,
184 but if it is too small, it is expanded, in a similar way to the workspace. The
185 value is the number of slots in the list. */
186
187 #define NAMED_GROUP_LIST_SIZE 20
188
189 /* The pre-compiling pass over the pattern creates a parsed pattern in a vector
190 of uint32_t. For short patterns this lives on the stack, with this size. Heap
191 memory is used for longer patterns. */
192
193 #define PARSED_PATTERN_DEFAULT_SIZE 1024
194
195 /* Maximum length value to check against when making sure that the variable
196 that holds the compiled pattern length does not overflow. We make it a bit less
197 than INT_MAX to allow for adding in group terminating code units, so that we
198 don't have to check them every time. */
199
200 #define OFLOW_MAX (INT_MAX - 20)
201
202 /* Code values for parsed patterns, which are stored in a vector of 32-bit
203 unsigned ints. Values less than META_END are literal data values. The coding
204 for identifying the item is in the top 16-bits, leaving 16 bits for the
205 additional data that some of them need. The META_CODE, META_DATA, and META_DIFF
206 macros are used to manipulate parsed pattern elements.
207
208 NOTE: When these definitions are changed, the table of extra lengths for each
209 code (meta_extra_lengths, just below) must be updated to remain in step. */
210
211 #define META_END 0x80000000u /* End of pattern */
212
213 #define META_ALT 0x80010000u /* alternation */
214 #define META_ATOMIC 0x80020000u /* atomic group */
215 #define META_BACKREF 0x80030000u /* Back ref */
216 #define META_BACKREF_BYNAME 0x80040000u /* \k'name' */
217 #define META_BIGVALUE 0x80050000u /* Next is a literal > META_END */
218 #define META_CALLOUT_NUMBER 0x80060000u /* (?C with numerical argument */
219 #define META_CALLOUT_STRING 0x80070000u /* (?C with string argument */
220 #define META_CAPTURE 0x80080000u /* Capturing parenthesis */
221 #define META_CIRCUMFLEX 0x80090000u /* ^ metacharacter */
222 #define META_CLASS 0x800a0000u /* start non-empty class */
223 #define META_CLASS_EMPTY 0x800b0000u /* empty class */
224 #define META_CLASS_EMPTY_NOT 0x800c0000u /* negative empty class */
225 #define META_CLASS_END 0x800d0000u /* end of non-empty class */
226 #define META_CLASS_NOT 0x800e0000u /* start non-empty negative class */
227 #define META_COND_ASSERT 0x800f0000u /* (?(?assertion)... */
228 #define META_COND_DEFINE 0x80100000u /* (?(DEFINE)... */
229 #define META_COND_NAME 0x80110000u /* (?(<name>)... */
230 #define META_COND_NUMBER 0x80120000u /* (?(digits)... */
231 #define META_COND_RNAME 0x80130000u /* (?(R&name)... */
232 #define META_COND_RNUMBER 0x80140000u /* (?(Rdigits)... */
233 #define META_COND_VERSION 0x80150000u /* (?(VERSION<op>x.y)... */
234 #define META_DOLLAR 0x80160000u /* $ metacharacter */
235 #define META_DOT 0x80170000u /* . metacharacter */
236 #define META_ESCAPE 0x80180000u /* \d and friends */
237 #define META_KET 0x80190000u /* closing parenthesis */
238 #define META_NOCAPTURE 0x801a0000u /* no capture parens */
239 #define META_OPTIONS 0x801b0000u /* (?i) and friends */
240 #define META_POSIX 0x801c0000u /* POSIX class item */
241 #define META_POSIX_NEG 0x801d0000u /* negative POSIX class item */
242 #define META_RANGE_ESCAPED 0x801e0000u /* range with at least one escape */
243 #define META_RANGE_LITERAL 0x801f0000u /* range defined literally */
244 #define META_RECURSE 0x80200000u /* Recursion */
245 #define META_RECURSE_BYNAME 0x80210000u /* (?&name) */
246 #define META_SCRIPT_RUN 0x80220000u /* (*script_run:...) */
247
248 /* These must be kept together to make it easy to check that an assertion
249 is present where expected in a conditional group. */
250
251 #define META_LOOKAHEAD 0x80230000u /* (?= */
252 #define META_LOOKAHEADNOT 0x80240000u /* (?! */
253 #define META_LOOKBEHIND 0x80250000u /* (?<= */
254 #define META_LOOKBEHINDNOT 0x80260000u /* (?<! */
255
256 /* These cannot be conditions */
257
258 #define META_LOOKAHEAD_NA 0x80270000u /* (*napla: */
259 #define META_LOOKBEHIND_NA 0x80280000u /* (*naplb: */
260
261 /* These must be kept in this order, with consecutive values, and the _ARG
262 versions of COMMIT, PRUNE, SKIP, and THEN immediately after their non-argument
263 versions. */
264
265 #define META_MARK 0x80290000u /* (*MARK) */
266 #define META_ACCEPT 0x802a0000u /* (*ACCEPT) */
267 #define META_FAIL 0x802b0000u /* (*FAIL) */
268 #define META_COMMIT 0x802c0000u /* These */
269 #define META_COMMIT_ARG 0x802d0000u /* pairs */
270 #define META_PRUNE 0x802e0000u /* must */
271 #define META_PRUNE_ARG 0x802f0000u /* be */
272 #define META_SKIP 0x80300000u /* kept */
273 #define META_SKIP_ARG 0x80310000u /* in */
274 #define META_THEN 0x80320000u /* this */
275 #define META_THEN_ARG 0x80330000u /* order */
276
277 /* These must be kept in groups of adjacent 3 values, and all together. */
278
279 #define META_ASTERISK 0x80340000u /* * */
280 #define META_ASTERISK_PLUS 0x80350000u /* *+ */
281 #define META_ASTERISK_QUERY 0x80360000u /* *? */
282 #define META_PLUS 0x80370000u /* + */
283 #define META_PLUS_PLUS 0x80380000u /* ++ */
284 #define META_PLUS_QUERY 0x80390000u /* +? */
285 #define META_QUERY 0x803a0000u /* ? */
286 #define META_QUERY_PLUS 0x803b0000u /* ?+ */
287 #define META_QUERY_QUERY 0x803c0000u /* ?? */
288 #define META_MINMAX 0x803d0000u /* {n,m} repeat */
289 #define META_MINMAX_PLUS 0x803e0000u /* {n,m}+ repeat */
290 #define META_MINMAX_QUERY 0x803f0000u /* {n,m}? repeat */
291
292 #define META_FIRST_QUANTIFIER META_ASTERISK
293 #define META_LAST_QUANTIFIER META_MINMAX_QUERY
294
295 /* This is a special "meta code" that is used only to distinguish (*asr: from
296 (*sr: in the table of alphabetic assertions. It is never stored in the parsed
297 pattern because (*asr: is turned into (*sr:(*atomic: at that stage. There is
298 therefore no need for it to have a length entry, so use a high value. */
299
300 #define META_ATOMIC_SCRIPT_RUN 0x8fff0000u
301
302 /* Table of extra lengths for each of the meta codes. Must be kept in step with
303 the definitions above. For some items these values are a basic length to which
304 a variable amount has to be added. */
305
306 static unsigned char meta_extra_lengths[] = {
307 0, /* META_END */
308 0, /* META_ALT */
309 0, /* META_ATOMIC */
310 0, /* META_BACKREF - more if group is >= 10 */
311 1+SIZEOFFSET, /* META_BACKREF_BYNAME */
312 1, /* META_BIGVALUE */
313 3, /* META_CALLOUT_NUMBER */
314 3+SIZEOFFSET, /* META_CALLOUT_STRING */
315 0, /* META_CAPTURE */
316 0, /* META_CIRCUMFLEX */
317 0, /* META_CLASS */
318 0, /* META_CLASS_EMPTY */
319 0, /* META_CLASS_EMPTY_NOT */
320 0, /* META_CLASS_END */
321 0, /* META_CLASS_NOT */
322 0, /* META_COND_ASSERT */
323 SIZEOFFSET, /* META_COND_DEFINE */
324 1+SIZEOFFSET, /* META_COND_NAME */
325 1+SIZEOFFSET, /* META_COND_NUMBER */
326 1+SIZEOFFSET, /* META_COND_RNAME */
327 1+SIZEOFFSET, /* META_COND_RNUMBER */
328 3, /* META_COND_VERSION */
329 0, /* META_DOLLAR */
330 0, /* META_DOT */
331 0, /* META_ESCAPE - more for ESC_P, ESC_p, ESC_g, ESC_k */
332 0, /* META_KET */
333 0, /* META_NOCAPTURE */
334 1, /* META_OPTIONS */
335 1, /* META_POSIX */
336 1, /* META_POSIX_NEG */
337 0, /* META_RANGE_ESCAPED */
338 0, /* META_RANGE_LITERAL */
339 SIZEOFFSET, /* META_RECURSE */
340 1+SIZEOFFSET, /* META_RECURSE_BYNAME */
341 0, /* META_SCRIPT_RUN */
342 0, /* META_LOOKAHEAD */
343 0, /* META_LOOKAHEADNOT */
344 SIZEOFFSET, /* META_LOOKBEHIND */
345 SIZEOFFSET, /* META_LOOKBEHINDNOT */
346 0, /* META_LOOKAHEAD_NA */
347 SIZEOFFSET, /* META_LOOKBEHIND_NA */
348 1, /* META_MARK - plus the string length */
349 0, /* META_ACCEPT */
350 0, /* META_FAIL */
351 0, /* META_COMMIT */
352 1, /* META_COMMIT_ARG - plus the string length */
353 0, /* META_PRUNE */
354 1, /* META_PRUNE_ARG - plus the string length */
355 0, /* META_SKIP */
356 1, /* META_SKIP_ARG - plus the string length */
357 0, /* META_THEN */
358 1, /* META_THEN_ARG - plus the string length */
359 0, /* META_ASTERISK */
360 0, /* META_ASTERISK_PLUS */
361 0, /* META_ASTERISK_QUERY */
362 0, /* META_PLUS */
363 0, /* META_PLUS_PLUS */
364 0, /* META_PLUS_QUERY */
365 0, /* META_QUERY */
366 0, /* META_QUERY_PLUS */
367 0, /* META_QUERY_QUERY */
368 2, /* META_MINMAX */
369 2, /* META_MINMAX_PLUS */
370 2 /* META_MINMAX_QUERY */
371 };
372
/* The kinds of skip that can be requested when passing over part of a parsed
pattern. */

enum {
  PSKIP_ALT = 0,
  PSKIP_CLASS,
  PSKIP_KET
};
376
/* Macro for setting individual bits in class bitmaps: SETBIT(a,b) sets bit
number b (counting from 0) in the byte array a. It took some experimenting to
figure out how to stop gcc 5.3.0 from warning with -Wconversion. This version
gets a warning:

  #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1u << ((b)&7))

Let's hope the apparently less efficient version isn't actually so bad if the
compiler is clever with identical subexpressions.

Both arguments are parenthesized so that expressions such as "map + 32" may be
passed. Each argument is evaluated more than once, so it must not have side
effects. */

#define SETBIT(a,b) ((a)[(b)/8] = (uint8_t)((a)[(b)/8] | (1u << ((b)&7))))
387
/* Private flag bits that are added to firstcu and reqcu. */

#define REQ_CASELESS  0x00000001u  /* Indicates caselessness */
#define REQ_VARY      0x00000002u  /* reqcu followed non-literal item */

/* Negative values for the firstcu and reqcu flags. */

#define REQ_NONE      (-1)         /* Found not fixed char */
#define REQ_UNSET     (-2)         /* Not yet found anything */
395
396 /* These flags are used in the groupinfo vector. */
397
398 #define GI_SET_FIXED_LENGTH 0x80000000u
399 #define GI_NOT_FIXED_LENGTH 0x40000000u
400 #define GI_FIXED_LENGTH_MASK 0x0000ffffu
401
402 /* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
403 and is fast (a good compiler can turn it into a subtraction and unsigned
404 comparison). */
405
406 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
407
408 /* Table to identify hex digits. The tables in chartables are dependent on the
409 locale, and may mark arbitrary characters as digits. We want to recognize only
410 0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
411 costs 256 bytes, but it is a lot faster than doing character value tests (at
412 least in some simple cases I timed), and in some applications one wants PCRE2
413 to compile efficiently as well as match efficiently. The value in the table is
414 the binary hex digit value, or 0xff for non-hex digits. */
415
416 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
417 UTF-8 mode. */
418
419 #ifndef EBCDIC
420 static const uint8_t xdigitab[] = /* 256 entries: hex digit value (0x00-0x0f) for '0'-'9', 'A'-'F', 'a'-'f'; 0xff otherwise */
421 {
422 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0- 7 */
423 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 8- 15 */
424 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 16- 23 */
425 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 24- 31 */
426 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - ' */
427 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* ( - / */
428 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /* 0 - 7 */
429 0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /* 8 - ? */
430 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* @ - G */
431 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* H - O */
432 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* P - W */
433 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* X - _ */
434 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* ` - g */
435 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* h - o */
436 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* p - w */
437 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* x -127 */
438 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */
439 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */
440 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */
441 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */
442 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */
443 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */
444 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */
445 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
446 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */
447 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 200-207 */
448 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */
449 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */
450 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */
451 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */
452 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */
453 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */
454
455 #else
456
457 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
458
459 static const uint8_t xdigitab[] =
460 {
461 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0- 7 0 */
462 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 8- 15 */
463 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 16- 23 10 */
464 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 24- 31 */
465 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 32- 39 20 */
466 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 40- 47 */
467 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 48- 55 30 */
468 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 56- 63 */
469 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - 71 40 */
470 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 72- | */
471 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* & - 87 50 */
472 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 88- 95 */
473 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - -103 60 */
474 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ? */
475 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */
476 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- " */
477 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g 80 */
478 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* h -143 */
479 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p 90 */
480 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* q -159 */
481 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x A0 */
482 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* y -175 */
483 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* ^ -183 B0 */
484 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
485 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* { - G C0 */
486 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* H -207 */
487 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* } - P D0 */
488 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* Q -223 */
489 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* \ - X E0 */
490 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* Y -239 */
491 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /* 0 - 7 F0 */
492 0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/* 8 -255 */
493 #endif /* EBCDIC */
494
495
496 /* Table for handling alphanumeric escaped characters. Positive returns are
497 simple data values; negative values are for special things like \d and so on.
498 Zero means further processing is needed (for things like \x), or the escape is
499 invalid. */
500
501 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
502 in UTF-8 mode. It runs from '0' to 'z'. */
503
504 #ifndef EBCDIC
/* First and last characters covered by the escapes table in this (non-EBCDIC)
configuration, and the conversion from a lower case letter to its upper case
equivalent. The argument of UPPER_CASE is parenthesized so that a compound
expression is converted as a whole. */

#define ESCAPES_FIRST       CHAR_0
#define ESCAPES_LAST        CHAR_z
#define UPPER_CASE(c)       ((c)-32)
508
509 static const short int escapes[] = {
510 0, 0,
511 0, 0,
512 0, 0,
513 0, 0,
514 0, 0,
515 CHAR_COLON, CHAR_SEMICOLON,
516 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
517 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
518 CHAR_COMMERCIAL_AT, -ESC_A,
519 -ESC_B, -ESC_C,
520 -ESC_D, -ESC_E,
521 0, -ESC_G,
522 -ESC_H, 0,
523 0, -ESC_K,
524 0, 0,
525 -ESC_N, 0,
526 -ESC_P, -ESC_Q,
527 -ESC_R, -ESC_S,
528 0, 0,
529 -ESC_V, -ESC_W,
530 -ESC_X, 0,
531 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
532 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
533 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
534 CHAR_GRAVE_ACCENT, CHAR_BEL,
535 -ESC_b, 0,
536 -ESC_d, CHAR_ESC,
537 CHAR_FF, 0,
538 -ESC_h, 0,
539 0, -ESC_k,
540 0, 0,
541 CHAR_LF, 0,
542 -ESC_p, 0,
543 CHAR_CR, -ESC_s,
544 CHAR_HT, 0,
545 -ESC_v, -ESC_w,
546 0, 0,
547 -ESC_z
548 };
549
550 #else
551
552 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
553 It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code
554 is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a
555 because it is defined as 'a', which of course picks up the ASCII value. */
556
/* Table bounds and lower-to-upper case conversion for the EBCDIC escapes
table. The argument of UPPER_CASE is parenthesized in both branches so that a
compound expression is converted as a whole. */

#if 'a' == 0x81                    /* Check for a real EBCDIC environment */
#define ESCAPES_FIRST       CHAR_a
#define ESCAPES_LAST        CHAR_9
#define UPPER_CASE(c)       ((c)+64)
#else                              /* Testing in an ASCII environment */
#define ESCAPES_FIRST       ((unsigned char)'\x81')   /* EBCDIC 'a' */
#define ESCAPES_LAST        ((unsigned char)'\xf9')   /* EBCDIC '9' */
#define UPPER_CASE(c)       ((c)-32)
#endif
566
567 static const short int escapes[] = {
568 /* 80 */ CHAR_BEL, -ESC_b, 0, -ESC_d, CHAR_ESC, CHAR_FF, 0,
569 /* 88 */ -ESC_h, 0, 0, '{', 0, 0, 0, 0,
570 /* 90 */ 0, 0, -ESC_k, 0, 0, CHAR_LF, 0, -ESC_p,
571 /* 98 */ 0, CHAR_CR, 0, '}', 0, 0, 0, 0,
572 /* A0 */ 0, '~', -ESC_s, CHAR_HT, 0, -ESC_v, -ESC_w, 0,
573 /* A8 */ 0, -ESC_z, 0, 0, 0, '[', 0, 0,
574 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
575 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
576 /* C0 */ '{', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G,
577 /* C8 */ -ESC_H, 0, 0, 0, 0, 0, 0, 0,
578 /* D0 */ '}', 0, -ESC_K, 0, 0, -ESC_N, 0, -ESC_P,
579 /* D8 */ -ESC_Q, -ESC_R, 0, 0, 0, 0, 0, 0,
580 /* E0 */ '\\', 0, -ESC_S, 0, 0, -ESC_V, -ESC_W, -ESC_X,
581 /* E8 */ 0, -ESC_Z, 0, 0, 0, 0, 0, 0,
582 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
583 /* F8 */ 0, 0
584 };
585
586 /* We also need a table of characters that may follow \c in an EBCDIC
587 environment for characters 0-31. */
588
589 static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
590
591 #endif /* EBCDIC */
592
593
594 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
595 searched linearly. Put all the names into a single string, in order to reduce
596 the number of relocations when a shared library is dynamically linked. The
597 string is built from string macros so that it works in UTF-8 mode on EBCDIC
598 platforms. */
599
600 typedef struct verbitem {
601 unsigned int len; /* Length of verb name */
602 uint32_t meta; /* Base META_ code */
603 int has_arg; /* Argument requirement */
604 } verbitem;
605
606 static const char verbnames[] =
607 "\0" /* Empty name is a shorthand for MARK */
608 STRING_MARK0
609 STRING_ACCEPT0
610 STRING_F0
611 STRING_FAIL0
612 STRING_COMMIT0
613 STRING_PRUNE0
614 STRING_SKIP0
615 STRING_THEN;
616
617 static const verbitem verbs[] = {
618 { 0, META_MARK, +1 }, /* > 0 => must have an argument */
619 { 4, META_MARK, +1 },
620 { 6, META_ACCEPT, -1 }, /* < 0 => Optional argument, convert to pre-MARK */
621 { 1, META_FAIL, -1 },
622 { 4, META_FAIL, -1 },
623 { 6, META_COMMIT, 0 },
624 { 5, META_PRUNE, 0 }, /* Optional argument; bump META code if found */
625 { 4, META_SKIP, 0 },
626 { 4, META_THEN, 0 }
627 };
628
629 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
630
631 /* Verb opcodes, indexed by their META code offset from META_MARK. */
632
633 static const uint32_t verbops[] = {
634 OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,
635 OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };
636
637 /* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. */
638
639 typedef struct alasitem {
640 unsigned int len; /* Length of name */
641 uint32_t meta; /* Base META_ code */
642 } alasitem;
643
644 static const char alasnames[] =
645 STRING_pla0
646 STRING_plb0
647 STRING_napla0
648 STRING_naplb0
649 STRING_nla0
650 STRING_nlb0
651 STRING_positive_lookahead0
652 STRING_positive_lookbehind0
653 STRING_non_atomic_positive_lookahead0
654 STRING_non_atomic_positive_lookbehind0
655 STRING_negative_lookahead0
656 STRING_negative_lookbehind0
657 STRING_atomic0
658 STRING_sr0
659 STRING_asr0
660 STRING_script_run0
661 STRING_atomic_script_run;
662
663 static const alasitem alasmeta[] = {
664 { 3, META_LOOKAHEAD },
665 { 3, META_LOOKBEHIND },
666 { 5, META_LOOKAHEAD_NA },
667 { 5, META_LOOKBEHIND_NA },
668 { 3, META_LOOKAHEADNOT },
669 { 3, META_LOOKBEHINDNOT },
670 { 18, META_LOOKAHEAD },
671 { 19, META_LOOKBEHIND },
672 { 29, META_LOOKAHEAD_NA },
673 { 30, META_LOOKBEHIND_NA },
674 { 18, META_LOOKAHEADNOT },
675 { 19, META_LOOKBEHINDNOT },
676 { 6, META_ATOMIC },
677 { 2, META_SCRIPT_RUN }, /* sr = script run */
678 { 3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */
679 { 10, META_SCRIPT_RUN }, /* script run */
680 { 17, META_ATOMIC_SCRIPT_RUN } /* atomic script run */
681 };
682
683 static const int alascount = sizeof(alasmeta)/sizeof(alasitem);
684
685 /* Offsets from OP_STAR for case-independent and negative repeat opcodes. */
686
687 static uint32_t chartypeoffset[] = {
688 OP_STAR - OP_STAR, OP_STARI - OP_STAR,
689 OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };
690
691 /* Tables of names of POSIX character classes and their lengths. The names are
692 now all in a single string, to reduce the number of relocations when a shared
693 library is dynamically loaded. The list of lengths is terminated by a zero
694 length entry. The first three must be alpha, lower, upper, as this is assumed
695 for handling case independence. The indices for graph, print, and punct are
696 needed, so identify them. */
697
698 static const char posix_names[] =
699 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
700 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
701 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
702 STRING_word0 STRING_xdigit;
703
704 static const uint8_t posix_name_lengths[] = {
705 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
706
707 #define PC_GRAPH 8
708 #define PC_PRINT 9
709 #define PC_PUNCT 10
710
711 /* Table of class bit maps for each POSIX class. Each class is formed from a
712 base map, with an optional addition or removal of another map. Then, for some
713 classes, there is some additional tweaking: for [:blank:] the vertical space
714 characters are removed, and for [:alpha:] and [:alnum:] the underscore
715 character is removed. The triples in the table consist of the base map offset,
716 second map offset or -1 if no second map, and a non-negative value for map
717 addition or a negative value for map subtraction (if there are two maps). The
718 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
719 remove vertical space characters, 2 => remove underscore. */
720
721 static const int posix_class_maps[] = {
722 cbit_word, cbit_digit, -2, /* alpha */
723 cbit_lower, -1, 0, /* lower */
724 cbit_upper, -1, 0, /* upper */
725 cbit_word, -1, 2, /* alnum - word without underscore */
726 cbit_print, cbit_cntrl, 0, /* ascii */
727 cbit_space, -1, 1, /* blank - a GNU extension */
728 cbit_cntrl, -1, 0, /* cntrl */
729 cbit_digit, -1, 0, /* digit */
730 cbit_graph, -1, 0, /* graph */
731 cbit_print, -1, 0, /* print */
732 cbit_punct, -1, 0, /* punct */
733 cbit_space, -1, 0, /* space */
734 cbit_word, -1, 0, /* word - a Perl extension */
735 cbit_xdigit,-1, 0 /* xdigit */
736 };
737
738 #ifdef SUPPORT_UNICODE
739
740 /* The POSIX class Unicode property substitutes that are used in UCP mode must
741 be in the order of the POSIX class names, defined above. */
742
743 static int posix_substitutes[] = {
744 PT_GC, ucp_L, /* alpha */
745 PT_PC, ucp_Ll, /* lower */
746 PT_PC, ucp_Lu, /* upper */
747 PT_ALNUM, 0, /* alnum */
748 -1, 0, /* ascii, treat as non-UCP */
749 -1, 1, /* blank, treat as \h */
750 PT_PC, ucp_Cc, /* cntrl */
751 PT_PC, ucp_Nd, /* digit */
752 PT_PXGRAPH, 0, /* graph */
753 PT_PXPRINT, 0, /* print */
754 PT_PXPUNCT, 0, /* punct */
755 PT_PXSPACE, 0, /* space */ /* Xps is POSIX space, but from 8.34 */
756 PT_WORD, 0, /* word */ /* Perl and POSIX space are the same */
757 -1, 0 /* xdigit, treat as non-UCP */
758 };
759 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / (2*sizeof(uint32_t)))
760 #endif /* SUPPORT_UNICODE */
761
/* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset
are allowed. Each "LITERAL" mask below is that subset; the corresponding full
mask is built by OR-ing further bits onto it. */

/* Main compile options that are accepted together with PCRE2_LITERAL. */

#define PUBLIC_LITERAL_COMPILE_OPTIONS \
  (PCRE2_ANCHORED|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_ENDANCHORED| \
   PCRE2_FIRSTLINE|PCRE2_LITERAL|PCRE2_MATCH_INVALID_UTF| \
   PCRE2_NO_START_OPTIMIZE|PCRE2_NO_UTF_CHECK|PCRE2_USE_OFFSET_LIMIT|PCRE2_UTF)

/* All main compile options: the literal subset plus the remaining bits. */

#define PUBLIC_COMPILE_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_OPTIONS| \
   PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \
   PCRE2_ALT_VERBNAMES|PCRE2_DOLLAR_ENDONLY|PCRE2_DOTALL|PCRE2_DUPNAMES| \
   PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MATCH_UNSET_BACKREF| \
   PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \
   PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \
   PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_UCP|PCRE2_UNGREEDY)

/* The same two-level arrangement for the PCRE2_EXTRA_xxx option bits.
NOTE(review): presumably the first mask is the set of extra options accepted
with PCRE2_LITERAL - confirm against the code that tests these masks. */

#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
   (PCRE2_EXTRA_MATCH_LINE|PCRE2_EXTRA_MATCH_WORD)

#define PUBLIC_COMPILE_EXTRA_OPTIONS \
   (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \
    PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \
    PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX| \
    PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK)
787
/* Compile time error code numbers. They are given names so that they can more
easily be tracked. When a new number is added, the tables called eint1 and
eint2 in pcre2posix.c may need to be updated, and a new error text must be
added to compile_error_texts in pcre2_error.c. Also, the error codes in
pcre2.h.in must be updated - their values are exactly 100 greater than these
values.

ERR0 is set to COMPILE_ERROR_BASE; the remaining names take consecutive values
after it, so ERRn has the value COMPILE_ERROR_BASE + n. */

enum { ERR0 = COMPILE_ERROR_BASE,
       ERR1,  ERR2,  ERR3,  ERR4,  ERR5,  ERR6,  ERR7,  ERR8,  ERR9,  ERR10,
       ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
       ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30,
       ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
       ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
       ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
       ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
       ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
       ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90,
       ERR91, ERR92, ERR93, ERR94, ERR95, ERR96, ERR97, ERR98, ERR99 };
806
807 /* This is a table of start-of-pattern options such as (*UTF) and settings such
808 as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
809 compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
810 generic and always supported. */
811
/* Kinds of start-of-pattern item. One of these is stored in the "type" field
of each pso_list entry and says how the entry's "value" field is to be
interpreted. */

enum { PSO_OPT,     /* Value is an option bit */
       PSO_FLG,     /* Value is a flag bit */
       PSO_NL,      /* Value is a newline type */
       PSO_BSR,     /* Value is a \R type */
       PSO_LIMH,    /* Read integer value for heap limit */
       PSO_LIMM,    /* Read integer value for match limit */
       PSO_LIMD };  /* Read integer value for depth limit */
819
/* One entry in the table of recognized start-of-pattern items (pso_list). */

typedef struct pso {
  const uint8_t *name;    /* Text of the item, e.g. STRING_UCP_RIGHTPAR */
  uint16_t length;        /* Length of the name text */
  uint16_t type;          /* One of the PSO_xxx values */
  uint32_t value;         /* Bit or type selected by the item; given as zero
                             in the entries whose type is PSO_LIMx */
} pso;
826
/* The table of start-of-pattern items. Each row supplies name, length, type
and value, in that order.

NB: STRING_UTFn_RIGHTPAR contains the length as well, which is why the first
row has only three visible initializers.

NOTE(review): the lengths appear to count the item name together with its
terminating ')' or '=' - confirm against the STRING_xxx definitions. */

static pso pso_list[] = {
  { (uint8_t *)STRING_UTFn_RIGHTPAR,                  PSO_OPT, PCRE2_UTF },
  { (uint8_t *)STRING_UTF_RIGHTPAR,                4, PSO_OPT, PCRE2_UTF },
  { (uint8_t *)STRING_UCP_RIGHTPAR,                4, PSO_OPT, PCRE2_UCP },
  { (uint8_t *)STRING_NOTEMPTY_RIGHTPAR,           9, PSO_FLG, PCRE2_NOTEMPTY_SET },
  { (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR,  17, PSO_FLG, PCRE2_NE_ATST_SET },
  { (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR,   16, PSO_OPT, PCRE2_NO_AUTO_POSSESS },
  { (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR },
  { (uint8_t *)STRING_NO_JIT_RIGHTPAR,             7, PSO_FLG, PCRE2_NOJIT },
  { (uint8_t *)STRING_NO_START_OPT_RIGHTPAR,      13, PSO_OPT, PCRE2_NO_START_OPTIMIZE },
  /* Limit settings: an integer value is read from the pattern */
  { (uint8_t *)STRING_LIMIT_HEAP_EQ,              11, PSO_LIMH, 0 },
  { (uint8_t *)STRING_LIMIT_MATCH_EQ,             12, PSO_LIMM, 0 },
  { (uint8_t *)STRING_LIMIT_DEPTH_EQ,             12, PSO_LIMD, 0 },
  /* LIMIT_RECURSION has the same type as LIMIT_DEPTH */
  { (uint8_t *)STRING_LIMIT_RECURSION_EQ,         16, PSO_LIMD, 0 },
  /* Newline conventions */
  { (uint8_t *)STRING_CR_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_CR },
  { (uint8_t *)STRING_LF_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_LF },
  { (uint8_t *)STRING_CRLF_RIGHTPAR,               5, PSO_NL,  PCRE2_NEWLINE_CRLF },
  { (uint8_t *)STRING_ANY_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_ANY },
  { (uint8_t *)STRING_NUL_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_NUL },
  { (uint8_t *)STRING_ANYCRLF_RIGHTPAR,            8, PSO_NL,  PCRE2_NEWLINE_ANYCRLF },
  /* \R conventions */
  { (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_ANYCRLF },
  { (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_UNICODE }
};
852
/* This table is used when converting repeating opcodes into possessified
versions as a result of an explicit possessive quantifier such as ++. A zero
value means there is no possessified version - in those cases the item in
question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
because all relevant opcodes are less than that.

Each entry corresponds to the opcode named in the comment beside it. Within
each group of repeat opcodes only the greedy form has a non-zero entry; the
minimizing, exact, and already-possessive forms are all zero. */

static const uint8_t opcode_possessify[] = {
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */

  0,                       /* NOTI */
  OP_POSSTAR, 0,           /* STAR, MINSTAR */
  OP_POSPLUS, 0,           /* PLUS, MINPLUS */
  OP_POSQUERY, 0,          /* QUERY, MINQUERY */
  OP_POSUPTO, 0,           /* UPTO, MINUPTO */
  0,                       /* EXACT */
  0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */

  OP_POSSTARI, 0,          /* STARI, MINSTARI */
  OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
  OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
  OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
  0,                       /* EXACTI */
  0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */

  OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
  OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
  OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
  OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
  0,                       /* NOTEXACT */
  0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */

  OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
  OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
  OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
  OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
  0,                       /* NOTEXACTI */
  0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */

  OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
  OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
  OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
  OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
  0,                       /* TYPEEXACT */
  0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */

  OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
  OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
  OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
  OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
  0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */

  0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
  0, 0,                    /* REF, REFI */
  0, 0,                    /* DNREF, DNREFI */
  0, 0                     /* RECURSE, CALLOUT */
};
910
911
912 #ifdef DEBUG_SHOW_PARSED
913 /*************************************************
914 * Show the parsed pattern for debugging *
915 *************************************************/
916
917 /* For debugging the pre-scan, this code, which outputs the parsed data vector,
918 can be enabled. */
919
show_parsed(compile_block * cb)920 static void show_parsed(compile_block *cb)
921 {
922 uint32_t *pptr = cb->parsed_pattern;
923
924 for (;;)
925 {
926 int max, min;
927 PCRE2_SIZE offset;
928 uint32_t i;
929 uint32_t length;
930 uint32_t meta_arg = META_DATA(*pptr);
931
932 fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr);
933
934 if (*pptr < META_END)
935 {
936 if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", *pptr);
937 pptr++;
938 }
939
940 else switch (META_CODE(*pptr++))
941 {
942 default:
943 fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n");
944 return;
945
946 case META_END:
947 fprintf(stderr, "META_END\n");
948 return;
949
950 case META_CAPTURE:
951 fprintf(stderr, "META_CAPTURE %d", meta_arg);
952 break;
953
954 case META_RECURSE:
955 GETOFFSET(offset, pptr);
956 fprintf(stderr, "META_RECURSE %d %zd", meta_arg, offset);
957 break;
958
959 case META_BACKREF:
960 if (meta_arg < 10)
961 offset = cb->small_ref_offset[meta_arg];
962 else
963 GETOFFSET(offset, pptr);
964 fprintf(stderr, "META_BACKREF %d %zd", meta_arg, offset);
965 break;
966
967 case META_ESCAPE:
968 if (meta_arg == ESC_P || meta_arg == ESC_p)
969 {
970 uint32_t ptype = *pptr >> 16;
971 uint32_t pvalue = *pptr++ & 0xffff;
972 fprintf(stderr, "META \\%c %d %d", (meta_arg == ESC_P)? 'P':'p',
973 ptype, pvalue);
974 }
975 else
976 {
977 uint32_t cc;
978 /* There's just one escape we might have here that isn't negated in the
979 escapes table. */
980 if (meta_arg == ESC_g) cc = CHAR_g;
981 else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++)
982 {
983 if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break;
984 }
985 if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;
986 fprintf(stderr, "META \\%c", cc);
987 }
988 break;
989
990 case META_MINMAX:
991 min = *pptr++;
992 max = *pptr++;
993 if (max != REPEAT_UNLIMITED)
994 fprintf(stderr, "META {%d,%d}", min, max);
995 else
996 fprintf(stderr, "META {%d,}", min);
997 break;
998
999 case META_MINMAX_QUERY:
1000 min = *pptr++;
1001 max = *pptr++;
1002 if (max != REPEAT_UNLIMITED)
1003 fprintf(stderr, "META {%d,%d}?", min, max);
1004 else
1005 fprintf(stderr, "META {%d,}?", min);
1006 break;
1007
1008 case META_MINMAX_PLUS:
1009 min = *pptr++;
1010 max = *pptr++;
1011 if (max != REPEAT_UNLIMITED)
1012 fprintf(stderr, "META {%d,%d}+", min, max);
1013 else
1014 fprintf(stderr, "META {%d,}+", min);
1015 break;
1016
1017 case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break;
1018 case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break;
1019 case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break;
1020 case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break;
1021 case META_DOT: fprintf(stderr, "META_DOT"); break;
1022 case META_ASTERISK: fprintf(stderr, "META *"); break;
1023 case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break;
1024 case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break;
1025 case META_PLUS: fprintf(stderr, "META +"); break;
1026 case META_PLUS_QUERY: fprintf(stderr, "META +?"); break;
1027 case META_PLUS_PLUS: fprintf(stderr, "META ++"); break;
1028 case META_QUERY: fprintf(stderr, "META ?"); break;
1029 case META_QUERY_QUERY: fprintf(stderr, "META ??"); break;
1030 case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break;
1031
1032 case META_ATOMIC: fprintf(stderr, "META (?>"); break;
1033 case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
1034 case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
1035 case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
1036 case META_LOOKAHEAD_NA: fprintf(stderr, "META (*napla:"); break;
1037 case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;
1038 case META_KET: fprintf(stderr, "META )"); break;
1039 case META_ALT: fprintf(stderr, "META | %d", meta_arg); break;
1040
1041 case META_CLASS: fprintf(stderr, "META ["); break;
1042 case META_CLASS_NOT: fprintf(stderr, "META [^"); break;
1043 case META_CLASS_END: fprintf(stderr, "META ]"); break;
1044 case META_CLASS_EMPTY: fprintf(stderr, "META []"); break;
1045 case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break;
1046
1047 case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;
1048 case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;
1049
1050 case META_POSIX: fprintf(stderr, "META_POSIX %d", *pptr++); break;
1051 case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %d", *pptr++); break;
1052
1053 case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;
1054 case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;
1055 case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break;
1056 case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break;
1057 case META_SKIP: fprintf(stderr, "META (*SKIP)"); break;
1058 case META_THEN: fprintf(stderr, "META (*THEN)"); break;
1059
1060 case META_OPTIONS: fprintf(stderr, "META_OPTIONS 0x%02x", *pptr++); break;
1061
1062 case META_LOOKBEHIND:
1063 fprintf(stderr, "META (?<= %d offset=", meta_arg);
1064 GETOFFSET(offset, pptr);
1065 fprintf(stderr, "%zd", offset);
1066 break;
1067
1068 case META_LOOKBEHIND_NA:
1069 fprintf(stderr, "META (*naplb: %d offset=", meta_arg);
1070 GETOFFSET(offset, pptr);
1071 fprintf(stderr, "%zd", offset);
1072 break;
1073
1074 case META_LOOKBEHINDNOT:
1075 fprintf(stderr, "META (?<! %d offset=", meta_arg);
1076 GETOFFSET(offset, pptr);
1077 fprintf(stderr, "%zd", offset);
1078 break;
1079
1080 case META_CALLOUT_NUMBER:
1081 fprintf(stderr, "META (?C%d) next=%d/%d", pptr[2], pptr[0],
1082 pptr[1]);
1083 pptr += 3;
1084 break;
1085
1086 case META_CALLOUT_STRING:
1087 {
1088 uint32_t patoffset = *pptr++; /* Offset of next pattern item */
1089 uint32_t patlength = *pptr++; /* Length of next pattern item */
1090 fprintf(stderr, "META (?Cstring) length=%d offset=", *pptr++);
1091 GETOFFSET(offset, pptr);
1092 fprintf(stderr, "%zd next=%d/%d", offset, patoffset, patlength);
1093 }
1094 break;
1095
1096 case META_RECURSE_BYNAME:
1097 fprintf(stderr, "META (?(&name) length=%d offset=", *pptr++);
1098 GETOFFSET(offset, pptr);
1099 fprintf(stderr, "%zd", offset);
1100 break;
1101
1102 case META_BACKREF_BYNAME:
1103 fprintf(stderr, "META_BACKREF_BYNAME length=%d offset=", *pptr++);
1104 GETOFFSET(offset, pptr);
1105 fprintf(stderr, "%zd", offset);
1106 break;
1107
1108 case META_COND_NUMBER:
1109 fprintf(stderr, "META_COND_NUMBER %d offset=", pptr[SIZEOFFSET]);
1110 GETOFFSET(offset, pptr);
1111 fprintf(stderr, "%zd", offset);
1112 pptr++;
1113 break;
1114
1115 case META_COND_DEFINE:
1116 fprintf(stderr, "META (?(DEFINE) offset=");
1117 GETOFFSET(offset, pptr);
1118 fprintf(stderr, "%zd", offset);
1119 break;
1120
1121 case META_COND_VERSION:
1122 fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
1123 fprintf(stderr, "%d.", *pptr++);
1124 fprintf(stderr, "%d)", *pptr++);
1125 break;
1126
1127 case META_COND_NAME:
1128 fprintf(stderr, "META (?(<name>) length=%d offset=", *pptr++);
1129 GETOFFSET(offset, pptr);
1130 fprintf(stderr, "%zd", offset);
1131 break;
1132
1133 case META_COND_RNAME:
1134 fprintf(stderr, "META (?(R&name) length=%d offset=", *pptr++);
1135 GETOFFSET(offset, pptr);
1136 fprintf(stderr, "%zd", offset);
1137 break;
1138
1139 /* This is kept as a name, because it might be. */
1140
1141 case META_COND_RNUMBER:
1142 fprintf(stderr, "META (?(Rnumber) length=%d offset=", *pptr++);
1143 GETOFFSET(offset, pptr);
1144 fprintf(stderr, "%zd", offset);
1145 break;
1146
1147 case META_MARK:
1148 fprintf(stderr, "META (*MARK:");
1149 goto SHOWARG;
1150
1151 case META_COMMIT_ARG:
1152 fprintf(stderr, "META (*COMMIT:");
1153 goto SHOWARG;
1154
1155 case META_PRUNE_ARG:
1156 fprintf(stderr, "META (*PRUNE:");
1157 goto SHOWARG;
1158
1159 case META_SKIP_ARG:
1160 fprintf(stderr, "META (*SKIP:");
1161 goto SHOWARG;
1162
1163 case META_THEN_ARG:
1164 fprintf(stderr, "META (*THEN:");
1165 SHOWARG:
1166 length = *pptr++;
1167 for (i = 0; i < length; i++)
1168 {
1169 uint32_t cc = *pptr++;
1170 if (cc > 32 && cc < 128) fprintf(stderr, "%c", cc);
1171 else fprintf(stderr, "\\x{%x}", cc);
1172 }
1173 fprintf(stderr, ") length=%u", length);
1174 break;
1175 }
1176 fprintf(stderr, "\n");
1177 }
1178 return;
1179 }
1180 #endif /* DEBUG_SHOW_PARSED */
1181
1182
1183
1184 /*************************************************
1185 * Copy compiled code *
1186 *************************************************/
1187
1188 /* Compiled JIT code cannot be copied, so the new compiled block has no
1189 associated JIT data. */
1190
1191 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy(const pcre2_code * code)1192 pcre2_code_copy(const pcre2_code *code)
1193 {
1194 PCRE2_SIZE* ref_count;
1195 pcre2_code *newcode;
1196
1197 if (code == NULL) return NULL;
1198 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1199 if (newcode == NULL) return NULL;
1200 memcpy(newcode, code, code->blocksize);
1201 newcode->executable_jit = NULL;
1202
1203 /* If the code is one that has been deserialized, increment the reference count
1204 in the decoded tables. */
1205
1206 if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1207 {
1208 ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1209 (*ref_count)++;
1210 }
1211
1212 return newcode;
1213 }
1214
1215
1216
1217 /*************************************************
1218 * Copy compiled code and character tables *
1219 *************************************************/
1220
1221 /* Compiled JIT code cannot be copied, so the new compiled block has no
1222 associated JIT data. This version of code_copy also makes a separate copy of
1223 the character tables. */
1224
1225 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy_with_tables(const pcre2_code * code)1226 pcre2_code_copy_with_tables(const pcre2_code *code)
1227 {
1228 PCRE2_SIZE* ref_count;
1229 pcre2_code *newcode;
1230 uint8_t *newtables;
1231
1232 if (code == NULL) return NULL;
1233 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1234 if (newcode == NULL) return NULL;
1235 memcpy(newcode, code, code->blocksize);
1236 newcode->executable_jit = NULL;
1237
1238 newtables = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE),
1239 code->memctl.memory_data);
1240 if (newtables == NULL)
1241 {
1242 code->memctl.free((void *)newcode, code->memctl.memory_data);
1243 return NULL;
1244 }
1245 memcpy(newtables, code->tables, TABLES_LENGTH);
1246 ref_count = (PCRE2_SIZE *)(newtables + TABLES_LENGTH);
1247 *ref_count = 1;
1248
1249 newcode->tables = newtables;
1250 newcode->flags |= PCRE2_DEREF_TABLES;
1251 return newcode;
1252 }
1253
1254
1255
1256 /*************************************************
1257 * Free compiled code *
1258 *************************************************/
1259
1260 PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_code_free(pcre2_code * code)1261 pcre2_code_free(pcre2_code *code)
1262 {
1263 PCRE2_SIZE* ref_count;
1264
1265 if (code != NULL)
1266 {
1267 if (code->executable_jit != NULL)
1268 PRIV(jit_free)(code->executable_jit, &code->memctl);
1269
1270 if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1271 {
1272 /* Decoded tables belong to the codes after deserialization, and they must
1273 be freed when there are no more references to them. The *ref_count should
1274 always be > 0. */
1275
1276 ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1277 if (*ref_count > 0)
1278 {
1279 (*ref_count)--;
1280 if (*ref_count == 0)
1281 code->memctl.free((void *)code->tables, code->memctl.memory_data);
1282 }
1283 }
1284
1285 code->memctl.free(code, code->memctl.memory_data);
1286 }
1287 }
1288
1289
1290
1291 /*************************************************
1292 * Read a number, possibly signed *
1293 *************************************************/
1294
1295 /* This function is used to read numbers in the pattern. The initial pointer
1296 must be the sign or first digit of the number. When relative values (introduced
1297 by + or -) are allowed, they are relative group numbers, and the result must be
1298 greater than zero.
1299
1300 Arguments:
1301 ptrptr points to the character pointer variable
1302 ptrend points to the end of the input string
1303 allow_sign if < 0, sign not allowed; if >= 0, sign is relative to this
1304 max_value the largest number allowed
1305 max_error the error to give for an over-large number
1306 intptr where to put the result
1307 errcodeptr where to put an error code
1308
1309 Returns: TRUE - a number was read
1310 FALSE - errorcode == 0 => no number was found
1311 errorcode != 0 => an error occurred
1312 */
1313
1314 static BOOL
read_number(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,int32_t allow_sign,uint32_t max_value,uint32_t max_error,int * intptr,int * errorcodeptr)1315 read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,
1316 uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr)
1317 {
1318 int sign = 0;
1319 uint32_t n = 0;
1320 PCRE2_SPTR ptr = *ptrptr;
1321 BOOL yield = FALSE;
1322
1323 *errorcodeptr = 0;
1324
1325 if (allow_sign >= 0 && ptr < ptrend)
1326 {
1327 if (*ptr == CHAR_PLUS)
1328 {
1329 sign = +1;
1330 max_value -= allow_sign;
1331 ptr++;
1332 }
1333 else if (*ptr == CHAR_MINUS)
1334 {
1335 sign = -1;
1336 ptr++;
1337 }
1338 }
1339
1340 if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE;
1341 while (ptr < ptrend && IS_DIGIT(*ptr))
1342 {
1343 n = n * 10 + *ptr++ - CHAR_0;
1344 if (n > max_value)
1345 {
1346 *errorcodeptr = max_error;
1347 goto EXIT;
1348 }
1349 }
1350
1351 if (allow_sign >= 0 && sign != 0)
1352 {
1353 if (n == 0)
1354 {
1355 *errorcodeptr = ERR26; /* +0 and -0 are not allowed */
1356 goto EXIT;
1357 }
1358
1359 if (sign > 0) n += allow_sign;
1360 else if ((int)n > allow_sign)
1361 {
1362 *errorcodeptr = ERR15; /* Non-existent subpattern */
1363 goto EXIT;
1364 }
1365 else n = allow_sign + 1 - n;
1366 }
1367
1368 yield = TRUE;
1369
1370 EXIT:
1371 *intptr = n;
1372 *ptrptr = ptr;
1373 return yield;
1374 }
1375
1376
1377
1378 /*************************************************
1379 * Read repeat counts *
1380 *************************************************/
1381
1382 /* Read an item of the form {n,m} and return the values if non-NULL pointers
1383 are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a
1384 larger value is used for "unlimited". We have to use signed arguments for
1385 read_number() because it is capable of returning a signed value.
1386
1387 Arguments:
1388 ptrptr points to pointer to character after'{'
1389 ptrend pointer to end of input
1390 minp if not NULL, pointer to int for min
1391 maxp if not NULL, pointer to int for max (-1 if no max)
1392 returned as -1 if no max
1393 errorcodeptr points to error code variable
1394
1395 Returns: FALSE if not a repeat quantifier, errorcode set zero
1396 FALSE on error, with errorcode set non-zero
1397 TRUE on success, with pointer updated to point after '}'
1398 */
1399
1400 static BOOL
read_repeat_counts(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,uint32_t * minp,uint32_t * maxp,int * errorcodeptr)1401 read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp,
1402 uint32_t *maxp, int *errorcodeptr)
1403 {
1404 PCRE2_SPTR p;
1405 BOOL yield = FALSE;
1406 BOOL had_comma = FALSE;
1407 int32_t min = 0;
1408 int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */
1409
1410 /* Check the syntax */
1411
1412 *errorcodeptr = 0;
1413 for (p = *ptrptr;; p++)
1414 {
1415 uint32_t c;
1416 if (p >= ptrend) return FALSE;
1417 c = *p;
1418 if (IS_DIGIT(c)) continue;
1419 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1420 if (c == CHAR_COMMA)
1421 {
1422 if (had_comma) return FALSE;
1423 had_comma = TRUE;
1424 }
1425 else return FALSE;
1426 }
1427
1428 /* The only error from read_number() is for a number that is too big. */
1429
1430 p = *ptrptr;
1431 if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))
1432 goto EXIT;
1433
1434 if (*p == CHAR_RIGHT_CURLY_BRACKET)
1435 {
1436 p++;
1437 max = min;
1438 }
1439 else
1440 {
1441 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1442 {
1443 if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max,
1444 errorcodeptr))
1445 goto EXIT;
1446 if (max < min)
1447 {
1448 *errorcodeptr = ERR4;
1449 goto EXIT;
1450 }
1451 }
1452 p++;
1453 }
1454
1455 yield = TRUE;
1456 if (minp != NULL) *minp = (uint32_t)min;
1457 if (maxp != NULL) *maxp = (uint32_t)max;
1458
1459 /* Update the pattern pointer */
1460
1461 EXIT:
1462 *ptrptr = p;
1463 return yield;
1464 }
1465
1466
1467
1468 /*************************************************
1469 * Handle escapes *
1470 *************************************************/
1471
1472 /* This function is called when a \ has been encountered. It either returns a
1473 positive value for a simple escape such as \d, or 0 for a data character, which
1474 is placed in chptr. A backreference to group n is returned as negative n. On
1475 entry, ptr is pointing at the character after \. On exit, it points after the
1476 final code unit of the escape sequence.
1477
1478 This function is also called from pcre2_substitute() to handle escape sequences
1479 in replacement strings. In this case, the cb argument is NULL, and in the case
1480 of escapes that have further processing, only sequences that define a data
1481 character are recognised. The isclass argument is not relevant; the options
1482 argument is the final value of the compiled pattern's options.
1483
Arguments:
  ptrptr         points to the input position pointer
  ptrend         points to the end of the input
  chptr          points to a returned data character
  errorcodeptr   points to the errorcode variable (containing zero)
  options        the current options bits
  extra_options  the current extra options bits
  isclass        TRUE if inside a character class
  cb             compile data block or NULL when called from pcre2_substitute()
1492
1493 Returns: zero => a data character
1494 positive => a special escape sequence
1495 negative => a numerical back reference
1496 on error, errorcodeptr is set non-zero
1497 */
1498
1499 int
PRIV(check_escape)1500 PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
1501 int *errorcodeptr, uint32_t options, uint32_t extra_options, BOOL isclass,
1502 compile_block *cb)
1503 {
1504 BOOL utf = (options & PCRE2_UTF) != 0;
1505 PCRE2_SPTR ptr = *ptrptr;
1506 uint32_t c, cc;
1507 int escape = 0;
1508 int i;
1509
1510 /* If backslash is at the end of the string, it's an error. */
1511
1512 if (ptr >= ptrend)
1513 {
1514 *errorcodeptr = ERR1;
1515 return 0;
1516 }
1517
1518 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
1519 *errorcodeptr = 0; /* Be optimistic */
1520
1521 /* Non-alphanumerics are literals, so we just leave the value in c. An initial
1522 value test saves a memory lookup for code points outside the alphanumeric
1523 range. */
1524
1525 if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {} /* Definitely literal */
1526
1527 /* Otherwise, do a table lookup. Non-zero values need little processing here. A
1528 positive value is a literal value for something like \n. A negative value is
1529 the negation of one of the ESC_ macros that is passed back for handling by the
1530 calling function. Some extra checking is needed for \N because only \N{U+dddd}
1531 is supported. If the value is zero, further processing is handled below. */
1532
1533 else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
1534 {
1535 if (i > 0)
1536 {
1537 c = (uint32_t)i;
1538 if (c == CHAR_CR && (extra_options & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
1539 c = CHAR_LF;
1540 }
1541 else /* Negative table entry */
1542 {
1543 escape = -i; /* Else return a special escape */
1544 if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))
1545 cb->external_flags |= PCRE2_HASBKPORX; /* Note \P, \p, or \X */
1546
1547 /* Perl supports \N{name} for character names and \N{U+dddd} for numerical
1548 Unicode code points, as well as plain \N for "not newline". PCRE does not
1549 support \N{name}. However, it does support quantification such as \N{2,3},
1550 so if \N{ is not followed by U+dddd we check for a quantifier. */
1551
1552 if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1553 {
1554 PCRE2_SPTR p = ptr + 1;
1555
1556 /* \N{U+ can be handled by the \x{ code. However, this construction is
1557 not valid in EBCDIC environments because it specifies a Unicode
1558 character, not a codepoint in the local code. For example \N{U+0041}
1559 must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode
1560 casing semantics for the entire pattern, so allow it only in UTF (i.e.
1561 Unicode) mode. */
1562
1563 if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)
1564 {
1565 #ifdef EBCDIC
1566 *errorcodeptr = ERR93;
1567 #else
1568 if (utf)
1569 {
1570 ptr = p + 1;
1571 escape = 0; /* Not a fancy escape after all */
1572 goto COME_FROM_NU;
1573 }
1574 else *errorcodeptr = ERR93;
1575 #endif
1576 }
1577
1578 /* Give an error if what follows is not a quantifier, but don't override
1579 an error set by the quantifier reader (e.g. number overflow). */
1580
1581 else
1582 {
1583 if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&
1584 *errorcodeptr == 0)
1585 *errorcodeptr = ERR37;
1586 }
1587 }
1588 }
1589 }
1590
1591 /* Escapes that need further processing, including those that are unknown, have
1592 a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
1593 \o, and \x are recognized (\u and \U can never appear as they are used for case
1594 forcing). */
1595
1596 else
1597 {
1598 int s;
1599 PCRE2_SPTR oldptr;
1600 BOOL overflow;
1601 BOOL alt_bsux =
1602 ((options & PCRE2_ALT_BSUX) | (extra_options & PCRE2_EXTRA_ALT_BSUX)) != 0;
1603
1604 /* Filter calls from pcre2_substitute(). */
1605
1606 if (cb == NULL)
1607 {
1608 if (c != CHAR_c && c != CHAR_o && c != CHAR_x)
1609 {
1610 *errorcodeptr = ERR3;
1611 return 0;
1612 }
1613 alt_bsux = FALSE; /* Do not modify \x handling */
1614 }
1615
1616 switch (c)
1617 {
1618 /* A number of Perl escapes are not handled by PCRE. We give an explicit
1619 error. */
1620
1621 case CHAR_F:
1622 case CHAR_l:
1623 case CHAR_L:
1624 *errorcodeptr = ERR37;
1625 break;
1626
1627 /* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX
1628 is set. Otherwise, \u must be followed by exactly four hex digits or, if
1629 PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.
1630 Otherwise it is a lowercase u letter. This gives some compatibility with
1631 ECMAScript (aka JavaScript). */
1632
1633 case CHAR_u:
1634 if (!alt_bsux) *errorcodeptr = ERR37; else
1635 {
1636 uint32_t xc;
1637
1638 if (ptr >= ptrend) break;
1639 if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
1640 (extra_options & PCRE2_EXTRA_ALT_BSUX) != 0)
1641 {
1642 PCRE2_SPTR hptr = ptr + 1;
1643 cc = 0;
1644
1645 while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
1646 {
1647 if ((cc & 0xf0000000) != 0) /* Test for 32-bit overflow */
1648 {
1649 *errorcodeptr = ERR77;
1650 ptr = hptr; /* Show where */
1651 break; /* *hptr != } will cause another break below */
1652 }
1653 cc = (cc << 4) | xc;
1654 hptr++;
1655 }
1656
1657 if (hptr == ptr + 1 || /* No hex digits */
1658 hptr >= ptrend || /* Hit end of input */
1659 *hptr != CHAR_RIGHT_CURLY_BRACKET) /* No } terminator */
1660 break; /* Hex escape not recognized */
1661
1662 c = cc; /* Accept the code point */
1663 ptr = hptr + 1;
1664 }
1665
1666 else /* Must be exactly 4 hex digits */
1667 {
1668 if (ptrend - ptr < 4) break; /* Less than 4 chars */
1669 if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */
1670 if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
1671 cc = (cc << 4) | xc;
1672 if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */
1673 cc = (cc << 4) | xc;
1674 if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */
1675 c = (cc << 4) | xc;
1676 ptr += 4;
1677 }
1678
1679 if (utf)
1680 {
1681 if (c > 0x10ffffU) *errorcodeptr = ERR77;
1682 else
1683 if (c >= 0xd800 && c <= 0xdfff &&
1684 (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1685 *errorcodeptr = ERR73;
1686 }
1687 else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
1688 }
1689 break;
1690
1691 /* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set,
1692 in which case it is an upper case letter. */
1693
1694 case CHAR_U:
1695 if (!alt_bsux) *errorcodeptr = ERR37;
1696 break;
1697
1698 /* In a character class, \g is just a literal "g". Outside a character
1699 class, \g must be followed by one of a number of specific things:
1700
1701 (1) A number, either plain or braced. If positive, it is an absolute
1702 backreference. If negative, it is a relative backreference. This is a Perl
1703 5.10 feature.
1704
1705 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1706 is part of Perl's movement towards a unified syntax for back references. As
1707 this is synonymous with \k{name}, we fudge it up by pretending it really
1708 was \k{name}.
1709
1710 (3) For Oniguruma compatibility we also support \g followed by a name or a
1711 number either in angle brackets or in single quotes. However, these are
1712 (possibly recursive) subroutine calls, _not_ backreferences. We return
1713 the ESC_g code.
1714
1715 Summary: Return a negative number for a numerical back reference, ESC_k for
1716 a named back reference, and ESC_g for a named or numbered subroutine call.
1717 */
1718
1719 case CHAR_g:
1720 if (isclass) break;
1721
1722 if (ptr >= ptrend)
1723 {
1724 *errorcodeptr = ERR57;
1725 break;
1726 }
1727
1728 if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE)
1729 {
1730 escape = ESC_g;
1731 break;
1732 }
1733
1734 /* If there is a brace delimiter, try to read a numerical reference. If
1735 there isn't one, assume we have a name and treat it as \k. */
1736
1737 if (*ptr == CHAR_LEFT_CURLY_BRACKET)
1738 {
1739 PCRE2_SPTR p = ptr + 1;
1740 if (!read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1741 errorcodeptr))
1742 {
1743 if (*errorcodeptr == 0) escape = ESC_k; /* No number found */
1744 break;
1745 }
1746 if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1747 {
1748 *errorcodeptr = ERR57;
1749 break;
1750 }
1751 ptr = p + 1;
1752 }
1753
1754 /* Read an undelimited number */
1755
1756 else
1757 {
1758 if (!read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1759 errorcodeptr))
1760 {
1761 if (*errorcodeptr == 0) *errorcodeptr = ERR57; /* No number found */
1762 break;
1763 }
1764 }
1765
1766 if (s <= 0)
1767 {
1768 *errorcodeptr = ERR15;
1769 break;
1770 }
1771
1772 escape = -s;
1773 break;
1774
1775 /* The handling of escape sequences consisting of a string of digits
1776 starting with one that is not zero is not straightforward. Perl has changed
1777 over the years. Nowadays \g{} for backreferences and \o{} for octal are
1778 recommended to avoid the ambiguities in the old syntax.
1779
1780 Outside a character class, the digits are read as a decimal number. If the
1781 number is less than 10, or if there are that many previous extracting left
1782 brackets, it is a back reference. Otherwise, up to three octal digits are
1783 read to form an escaped character code. Thus \123 is likely to be octal 123
1784 (cf \0123, which is octal 012 followed by the literal 3).
1785
1786 Inside a character class, \ followed by a digit is always either a literal
1787 8 or 9 or an octal number. */
1788
1789 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1790 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1791
1792 if (!isclass)
1793 {
1794 oldptr = ptr;
1795 ptr--; /* Back to the digit */
1796
1797 /* As we know we are at a digit, the only possible error from
1798 read_number() is a number that is too large to be a group number. In this
1799 case we fall through and handle this as not a group reference. If we have
1800 read a small enough number, check for a back reference.
1801
1802 \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
1803 are octal escapes if there are not that many previous captures. */
1804
1805 if (read_number(&ptr, ptrend, -1, INT_MAX/10 - 1, 0, &s, errorcodeptr) &&
1806 (s < 10 || oldptr[-1] >= CHAR_8 || s <= (int)cb->bracount))
1807 {
1808 if (s > (int)MAX_GROUP_NUMBER) *errorcodeptr = ERR61;
1809 else escape = -s; /* Indicates a back reference */
1810 break;
1811 }
1812
1813 ptr = oldptr; /* Put the pointer back and fall through */
1814 }
1815
1816 /* Handle a digit following \ when the number is not a back reference, or
1817 we are within a character class. If the first digit is 8 or 9, Perl used to
1818 generate a binary zero and then treat the digit as a following literal. At
1819 least by Perl 5.18 this changed so as not to insert the binary zero. */
1820
1821 if (c >= CHAR_8) break;
1822
1823 /* Fall through */
1824
1825 /* \0 always starts an octal number, but we may drop through to here with a
1826 larger first octal digit. The original code used just to take the least
1827 significant 8 bits of octal numbers (I think this is what early Perls used
1828 to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1829 but no more than 3 octal digits. */
1830
1831 case CHAR_0:
1832 c -= CHAR_0;
1833 while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1834 c = c * 8 + *ptr++ - CHAR_0;
1835 #if PCRE2_CODE_UNIT_WIDTH == 8
1836 if (!utf && c > 0xff) *errorcodeptr = ERR51;
1837 #endif
1838 break;
1839
1840 /* \o is a relatively new Perl feature, supporting a more general way of
1841 specifying character codes in octal. The only supported form is \o{ddd}. */
1842
1843 case CHAR_o:
1844 if (ptr >= ptrend || *ptr++ != CHAR_LEFT_CURLY_BRACKET)
1845 {
1846 ptr--;
1847 *errorcodeptr = ERR55;
1848 }
1849 else if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1850 *errorcodeptr = ERR78;
1851 else
1852 {
1853 c = 0;
1854 overflow = FALSE;
1855 while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1856 {
1857 cc = *ptr++;
1858 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1859 #if PCRE2_CODE_UNIT_WIDTH == 32
1860 if (c >= 0x20000000l) { overflow = TRUE; break; }
1861 #endif
1862 c = (c << 3) + (cc - CHAR_0);
1863 #if PCRE2_CODE_UNIT_WIDTH == 8
1864 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1865 #elif PCRE2_CODE_UNIT_WIDTH == 16
1866 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1867 #elif PCRE2_CODE_UNIT_WIDTH == 32
1868 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1869 #endif
1870 }
1871 if (overflow)
1872 {
1873 while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1874 *errorcodeptr = ERR34;
1875 }
1876 else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1877 {
1878 if (utf && c >= 0xd800 && c <= 0xdfff &&
1879 (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1880 {
1881 ptr--;
1882 *errorcodeptr = ERR73;
1883 }
1884 }
1885 else
1886 {
1887 ptr--;
1888 *errorcodeptr = ERR64;
1889 }
1890 }
1891 break;
1892
1893 /* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed
1894 by two hexadecimal digits. Otherwise it is a lowercase x letter. */
1895
1896 case CHAR_x:
1897 if (alt_bsux)
1898 {
1899 uint32_t xc;
1900 if (ptrend - ptr < 2) break; /* Less than 2 characters */
1901 if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */
1902 if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
1903 c = (cc << 4) | xc;
1904 ptr += 2;
1905 }
1906
1907 /* Handle \x in Perl's style. \x{ddd} is a character code which can be
1908 greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
1909 digits. If not, { used to be treated as a data character. However, Perl
1910 seems to read hex digits up to the first non-such, and ignore the rest, so
1911 that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1912 now gives an error. */
1913
1914 else
1915 {
1916 if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1917 {
1918 #ifndef EBCDIC
1919 COME_FROM_NU:
1920 #endif
1921 if (++ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1922 {
1923 *errorcodeptr = ERR78;
1924 break;
1925 }
1926 c = 0;
1927 overflow = FALSE;
1928
1929 while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff)
1930 {
1931 ptr++;
1932 if (c == 0 && cc == 0) continue; /* Leading zeroes */
1933 #if PCRE2_CODE_UNIT_WIDTH == 32
1934 if (c >= 0x10000000l) { overflow = TRUE; break; }
1935 #endif
1936 c = (c << 4) | cc;
1937 if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
1938 {
1939 overflow = TRUE;
1940 break;
1941 }
1942 }
1943
1944 if (overflow)
1945 {
1946 while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
1947 *errorcodeptr = ERR34;
1948 }
1949 else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1950 {
1951 if (utf && c >= 0xd800 && c <= 0xdfff &&
1952 (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1953 {
1954 ptr--;
1955 *errorcodeptr = ERR73;
1956 }
1957 }
1958
1959 /* If the sequence of hex digits does not end with '}', give an error.
1960 We used just to recognize this construct and fall through to the normal
1961 \x handling, but nowadays Perl gives an error, which seems much more
1962 sensible, so we do too. */
1963
1964 else
1965 {
1966 ptr--;
1967 *errorcodeptr = ERR67;
1968 }
1969 } /* End of \x{} processing */
1970
1971 /* Read up to two hex digits after \x */
1972
1973 else
1974 {
1975 c = 0;
1976 if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */
1977 ptr++;
1978 c = cc;
1979 if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */
1980 ptr++;
1981 c = (c << 4) | cc;
1982 } /* End of \xdd handling */
1983 } /* End of Perl-style \x handling */
1984 break;
1985
1986 /* The handling of \c is different in ASCII and EBCDIC environments. In an
1987 ASCII (or Unicode) environment, an error is given if the character
1988 following \c is not a printable ASCII character. Otherwise, the following
1989 character is upper-cased if it is a letter, and after that the 0x40 bit is
1990 flipped. The result is the value of the escape.
1991
1992 In an EBCDIC environment the handling of \c is compatible with the
1993 specification in the perlebcdic document. The following character must be
1994 a letter or one of a small number of special characters. These provide a
1995 means of defining the character values 0-31.
1996
1997 For testing the EBCDIC handling of \c in an ASCII environment, recognize
1998 the EBCDIC value of 'c' explicitly. */
1999
2000 #if defined EBCDIC && 'a' != 0x81
2001 case 0x83:
2002 #else
2003 case CHAR_c:
2004 #endif
2005 if (ptr >= ptrend)
2006 {
2007 *errorcodeptr = ERR2;
2008 break;
2009 }
2010 c = *ptr;
2011 if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
2012
2013 /* Handle \c in an ASCII/Unicode environment. */
2014
2015 #ifndef EBCDIC /* ASCII/UTF-8 coding */
2016 if (c < 32 || c > 126) /* Excludes all non-printable ASCII */
2017 {
2018 *errorcodeptr = ERR68;
2019 break;
2020 }
2021 c ^= 0x40;
2022
2023 /* Handle \c in an EBCDIC environment. The special case \c? is converted to
2024 255 (0xff) or 95 (0x5f) if other characters suggest we are using the
2025 POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.)
2026 The other valid sequences correspond to a list of specific characters. */
2027
2028 #else
2029 if (c == CHAR_QUESTION_MARK)
2030 c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
2031 else
2032 {
2033 for (i = 0; i < 32; i++)
2034 {
2035 if (c == ebcdic_escape_c[i]) break;
2036 }
2037 if (i < 32) c = i; else *errorcodeptr = ERR68;
2038 }
2039 #endif /* EBCDIC */
2040
2041 ptr++;
2042 break;
2043
2044 /* Any other alphanumeric following \ is an error. Perl gives an error only
2045 if in warning mode, but PCRE doesn't have a warning mode. */
2046
2047 default:
2048 *errorcodeptr = ERR3;
2049 *ptrptr = ptr - 1; /* Point to the character at fault */
2050 return 0;
2051 }
2052 }
2053
2054 /* Set the pointer to the next character before returning. */
2055
2056 *ptrptr = ptr;
2057 *chptr = c;
2058 return escape;
2059 }
2060
2061
2062
2063 #ifdef SUPPORT_UNICODE
2064 /*************************************************
2065 * Handle \P and \p *
2066 *************************************************/
2067
2068 /* This function is called after \P or \p has been encountered, provided that
2069 PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
2070 contents of ptrptr are pointing after the P or p. On exit, it is left pointing
2071 after the final code unit of the escape sequence.
2072
2073 Arguments:
2074 ptrptr the pattern position pointer
2075 negptr a boolean that is set TRUE for negation else FALSE
2076 ptypeptr an unsigned int that is set to the type value
2077 pdataptr an unsigned int that is set to the detailed property value
2078 errorcodeptr the error code variable
2079 cb the compile data
2080
2081 Returns: TRUE if the type value was found, or FALSE for an invalid type
2082 */
2083
2084 static BOOL
get_ucp(PCRE2_SPTR * ptrptr,BOOL * negptr,uint16_t * ptypeptr,uint16_t * pdataptr,int * errorcodeptr,compile_block * cb)2085 get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr,
2086 uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)
2087 {
2088 PCRE2_UCHAR c;
2089 PCRE2_SIZE i, bot, top;
2090 PCRE2_SPTR ptr = *ptrptr;
2091 PCRE2_UCHAR name[32];
2092
2093 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2094 c = *ptr++;
2095 *negptr = FALSE;
2096
2097 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
2098 negation. */
2099
2100 if (c == CHAR_LEFT_CURLY_BRACKET)
2101 {
2102 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2103 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
2104 {
2105 *negptr = TRUE;
2106 ptr++;
2107 }
2108 for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
2109 {
2110 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2111 c = *ptr++;
2112 if (c == CHAR_NUL) goto ERROR_RETURN;
2113 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
2114 name[i] = c;
2115 }
2116 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
2117 name[i] = 0;
2118 }
2119
2120 /* Otherwise there is just one following character, which must be an ASCII
2121 letter. */
2122
2123 else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0)
2124 {
2125 name[0] = c;
2126 name[1] = 0;
2127 }
2128 else goto ERROR_RETURN;
2129
2130 *ptrptr = ptr;
2131
2132 /* Search for a recognized property name using binary chop. */
2133
2134 bot = 0;
2135 top = PRIV(utt_size);
2136
2137 while (bot < top)
2138 {
2139 int r;
2140 i = (bot + top) >> 1;
2141 r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
2142 if (r == 0)
2143 {
2144 *ptypeptr = PRIV(utt)[i].type;
2145 *pdataptr = PRIV(utt)[i].value;
2146 return TRUE;
2147 }
2148 if (r > 0) bot = i + 1; else top = i;
2149 }
2150 *errorcodeptr = ERR47; /* Unrecognized name */
2151 return FALSE;
2152
2153 ERROR_RETURN: /* Malformed \P or \p */
2154 *errorcodeptr = ERR46;
2155 *ptrptr = ptr;
2156 return FALSE;
2157 }
2158 #endif
2159
2160
2161
2162 /*************************************************
2163 * Check for POSIX class syntax *
2164 *************************************************/
2165
2166 /* This function is called when the sequence "[:" or "[." or "[=" is
2167 encountered in a character class. It checks whether this is followed by a
2168 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2169 reach an unescaped ']' without the special preceding character, return FALSE.
2170
2171 Originally, this function only recognized a sequence of letters between the
2172 terminators, but it seems that Perl recognizes any sequence of characters,
2173 though of course unknown POSIX names are subsequently rejected. Perl gives an
2174 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2175 didn't consider this to be a POSIX class. Likewise for [:1234:].
2176
2177 The problem in trying to be exactly like Perl is in the handling of escapes. We
2178 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2179 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2180 below handles the special cases \\ and \], but does not try to do any other
2181 escape processing. This makes it different from Perl for cases such as
2182 [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
2183 not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
2184 when Perl does, I think.
2185
2186 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2187 It seems that the appearance of a nested POSIX class supersedes an apparent
2188 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2189 a digit. This is handled by returning FALSE if the start of a new group with
2190 the same terminator is encountered, since the next closing sequence must close
2191 the nested group, not the outer one.
2192
2193 In Perl, unescaped square brackets may also appear as part of class names. For
2194 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2195 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2196 seem right at all. PCRE does not allow closing square brackets in POSIX class
2197 names.
2198
2199 Arguments:
2200 ptr pointer to the character after the initial [ (colon, dot, equals)
2201 ptrend pointer to the end of the pattern
2202 endptr where to return a pointer to the terminating ':', '.', or '='
2203
2204 Returns: TRUE or FALSE
2205 */
2206
2207 static BOOL
check_posix_syntax(PCRE2_SPTR ptr,PCRE2_SPTR ptrend,PCRE2_SPTR * endptr)2208 check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)
2209 {
2210 PCRE2_UCHAR terminator; /* Don't combine these lines; the Solaris cc */
2211 terminator = *ptr++; /* compiler warns about "non-constant" initializer. */
2212
2213 for (; ptrend - ptr >= 2; ptr++)
2214 {
2215 if (*ptr == CHAR_BACKSLASH &&
2216 (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
2217 ptr++;
2218
2219 else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
2220 *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2221
2222 else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2223 {
2224 *endptr = ptr;
2225 return TRUE;
2226 }
2227 }
2228
2229 return FALSE;
2230 }
2231
2232
2233
2234 /*************************************************
2235 * Check POSIX class name *
2236 *************************************************/
2237
2238 /* This function is called to check the name given in a POSIX-style class entry
2239 such as [:alnum:].
2240
2241 Arguments:
2242 ptr points to the first letter
2243 len the length of the name
2244
2245 Returns: a value representing the name, or -1 if unknown
2246 */
2247
2248 static int
check_posix_name(PCRE2_SPTR ptr,int len)2249 check_posix_name(PCRE2_SPTR ptr, int len)
2250 {
2251 const char *pn = posix_names;
2252 int yield = 0;
2253 while (posix_name_lengths[yield] != 0)
2254 {
2255 if (len == posix_name_lengths[yield] &&
2256 PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
2257 pn += posix_name_lengths[yield] + 1;
2258 yield++;
2259 }
2260 return -1;
2261 }
2262
2263
2264
2265 /*************************************************
2266 * Read a subpattern or VERB name *
2267 *************************************************/
2268
2269 /* This function is called from parse_regex() below whenever it needs to read
2270 the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial
2271 pointer must be to the character before the name. If that character is '*' we
2272 are reading a verb or alpha assertion name. The pointer is updated to point
2273 after the name, for a VERB or alpha assertion name, or after the name's
2274 terminator for a subpattern name. Returning both the offset and the name
2275 pointer is redundant information, but some callers use one and some the other,
2276 so it is simplest just to return both.
2277
2278 Arguments:
2279 ptrptr points to the character pointer variable
2280 ptrend points to the end of the input string
2281 utf true if the input is UTF-encoded
2282 terminator the terminator of a subpattern name must be this
2283 offsetptr where to put the offset from the start of the pattern
2284 nameptr where to put a pointer to the name in the input
2285 namelenptr where to put the length of the name
2286 errorcodeptr where to put an error code
2287 cb pointer to the compile data block
2288
2289 Returns: TRUE if a name was read
2290 FALSE otherwise, with error code set
2291 */
2292
2293 static BOOL
read_name(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,BOOL utf,uint32_t terminator,PCRE2_SIZE * offsetptr,PCRE2_SPTR * nameptr,uint32_t * namelenptr,int * errorcodeptr,compile_block * cb)2294 read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator,
2295 PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,
2296 int *errorcodeptr, compile_block *cb)
2297 {
2298 PCRE2_SPTR ptr = *ptrptr;
2299 BOOL is_group = (*ptr != CHAR_ASTERISK);
2300
2301 if (++ptr >= ptrend) /* No characters in name */
2302 {
2303 *errorcodeptr = is_group? ERR62: /* Subpattern name expected */
2304 ERR60; /* Verb not recognized or malformed */
2305 goto FAILED;
2306 }
2307
2308 *nameptr = ptr;
2309 *offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
2310
2311 /* In UTF mode, a group name may contain letters and decimal digits as defined
2312 by Unicode properties, and underscores, but must not start with a digit. */
2313
2314 #ifdef SUPPORT_UNICODE
2315 if (utf && is_group)
2316 {
2317 uint32_t c, type;
2318
2319 GETCHAR(c, ptr);
2320 type = UCD_CHARTYPE(c);
2321
2322 if (type == ucp_Nd)
2323 {
2324 *errorcodeptr = ERR44;
2325 goto FAILED;
2326 }
2327
2328 for(;;)
2329 {
2330 if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
2331 c != CHAR_UNDERSCORE) break;
2332 ptr++;
2333 FORWARDCHARTEST(ptr, ptrend);
2334 if (ptr >= ptrend) break;
2335 GETCHAR(c, ptr);
2336 type = UCD_CHARTYPE(c);
2337 }
2338 }
2339 else
2340 #else
2341 (void)utf; /* Avoid compiler warning */
2342 #endif /* SUPPORT_UNICODE */
2343
2344 /* Handle non-group names and group names in non-UTF modes. A group name must
2345 not start with a digit. If either of the others start with a digit it just
2346 won't be recognized. */
2347
2348 {
2349 if (is_group && IS_DIGIT(*ptr))
2350 {
2351 *errorcodeptr = ERR44;
2352 goto FAILED;
2353 }
2354
2355 while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0)
2356 {
2357 ptr++;
2358 }
2359 }
2360
2361 /* Check name length */
2362
2363 if (ptr > *nameptr + MAX_NAME_SIZE)
2364 {
2365 *errorcodeptr = ERR48;
2366 goto FAILED;
2367 }
2368 *namelenptr = (uint32_t)(ptr - *nameptr);
2369
2370 /* Subpattern names must not be empty, and their terminator is checked here.
2371 (What follows a verb or alpha assertion name is checked separately.) */
2372
2373 if (is_group)
2374 {
2375 if (ptr == *nameptr)
2376 {
2377 *errorcodeptr = ERR62; /* Subpattern name expected */
2378 goto FAILED;
2379 }
2380 if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator)
2381 {
2382 *errorcodeptr = ERR42;
2383 goto FAILED;
2384 }
2385 ptr++;
2386 }
2387
2388 *ptrptr = ptr;
2389 return TRUE;
2390
2391 FAILED:
2392 *ptrptr = ptr;
2393 return FALSE;
2394 }
2395
2396
2397
2398 /*************************************************
2399 * Manage callouts at start of cycle *
2400 *************************************************/
2401
2402 /* At the start of a new item in parse_regex() we are able to record the
2403 details of the previous item in a prior callout, and also to set up an
2404 automatic callout if enabled. Avoid having two adjacent automatic callouts,
2405 which would otherwise happen for items such as \Q that contribute nothing to
2406 the parsed pattern.
2407
2408 Arguments:
2409 ptr current pattern pointer
2410 pcalloutptr points to a pointer to previous callout, or NULL
2411 auto_callout TRUE if auto_callouts are enabled
2412 parsed_pattern the parsed pattern pointer
2413 cb compile block
2414
2415 Returns: possibly updated parsed_pattern pointer.
2416 */
2417
2418 static uint32_t *
manage_callouts(PCRE2_SPTR ptr,uint32_t ** pcalloutptr,BOOL auto_callout,uint32_t * parsed_pattern,compile_block * cb)2419 manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,
2420 uint32_t *parsed_pattern, compile_block *cb)
2421 {
2422 uint32_t *previous_callout = *pcalloutptr;
2423
2424 if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr -
2425 cb->start_pattern - (PCRE2_SIZE)previous_callout[1]);
2426
2427 if (!auto_callout) previous_callout = NULL; else
2428 {
2429 if (previous_callout == NULL ||
2430 previous_callout != parsed_pattern - 4 ||
2431 previous_callout[3] != 255)
2432 {
2433 previous_callout = parsed_pattern; /* Set up new automatic callout */
2434 parsed_pattern += 4;
2435 previous_callout[0] = META_CALLOUT_NUMBER;
2436 previous_callout[2] = 0;
2437 previous_callout[3] = 255;
2438 }
2439 previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
2440 }
2441
2442 *pcalloutptr = previous_callout;
2443 return parsed_pattern;
2444 }
2445
2446
2447
2448 /*************************************************
2449 * Parse regex and identify named groups *
2450 *************************************************/
2451
2452 /* This function is called first of all. It scans the pattern and does two
2453 things: (1) It identifies capturing groups and makes a table of named capturing
2454 groups so that information about them is fully available to both the compiling
2455 scans. (2) It writes a parsed version of the pattern with comments omitted and
2456 escapes processed into the parsed_pattern vector.
2457
2458 Arguments:
2459 ptr points to the start of the pattern
2460 options compiling dynamic options (may change during the scan)
2461 has_lookbehind points to a boolean, set TRUE if a lookbehind is found
2462 cb pointer to the compile data block
2463
2464 Returns: zero on success or a non-zero error code, with the
2465 error offset placed in the cb field
2466 */
2467
/* Bookkeeping structure, and its flag bits, used when dealing with nested
groups. */

struct nest_save {
  uint16_t nest_depth;
  uint16_t reset_group;
  uint16_t max_group;
  uint16_t flags;        /* NOTE(review): presumably NSF_ bits - confirm */
  uint32_t options;
};

typedef struct nest_save nest_save;

#define NSF_RESET          0x0001u
#define NSF_CONDASSERT     0x0002u
#define NSF_ATOMICSR       0x0004u

/* Every option that can be changed inside a pattern has to be tracked while
parsing, so that META_OPTIONS items carry the correct values into the main
compiling phase. Some of these options (e.g. PCRE2_EXTENDED) are implemented
entirely during parsing. */

#define PARSE_TRACKED_OPTIONS ( \
  PCRE2_CASELESS | \
  PCRE2_DOTALL | \
  PCRE2_DUPNAMES | \
  PCRE2_EXTENDED | \
  PCRE2_EXTENDED_MORE | \
  PCRE2_MULTILINE | \
  PCRE2_NO_AUTO_CAPTURE | \
  PCRE2_UNGREEDY)

/* States used while analyzing ranges in character classes. The two OK values
must remain the last (largest) ones. */

enum {
  RANGE_NO = 0,
  RANGE_STARTED = 1,
  RANGE_OK_ESCAPED = 2,
  RANGE_OK_LITERAL = 3
};
2495
/* Only in 32-bit mode can there be literals > META_END. A macro encapsulates
the storing of literal values in the main parsed pattern, where they can always
be quantified.

Both variants expand to a single brace-enclosed block, so the macro acts as one
statement at every code unit width. In particular, when it is the body of an
unbraced "if", the update of okquantifier is guarded along with the store. The
arguments are parenthesized in the expansion, but they may be evaluated more
than once, so they must not have side effects. */

#if PCRE2_CODE_UNIT_WIDTH == 32
#define PARSED_LITERAL(c, p) \
  { \
  if ((c) >= META_END) *(p)++ = META_BIGVALUE; \
  *(p)++ = (c); \
  okquantifier = TRUE; \
  }
#else
#define PARSED_LITERAL(c, p) \
  { \
  *(p)++ = (c); \
  okquantifier = TRUE; \
  }
#endif
2510
2511 /* Here's the actual function. */
2512
parse_regex(PCRE2_SPTR ptr,uint32_t options,BOOL * has_lookbehind,compile_block * cb)2513 static int parse_regex(PCRE2_SPTR ptr, uint32_t options, BOOL *has_lookbehind,
2514 compile_block *cb)
2515 {
2516 uint32_t c;
2517 uint32_t delimiter;
2518 uint32_t namelen;
2519 uint32_t class_range_state;
2520 uint32_t *verblengthptr = NULL; /* Value avoids compiler warning */
2521 uint32_t *verbstartptr = NULL;
2522 uint32_t *previous_callout = NULL;
2523 uint32_t *parsed_pattern = cb->parsed_pattern;
2524 uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
2525 uint32_t meta_quantifier = 0;
2526 uint32_t add_after_mark = 0;
2527 uint32_t extra_options = cb->cx->extra_options;
2528 uint16_t nest_depth = 0;
2529 int after_manual_callout = 0;
2530 int expect_cond_assert = 0;
2531 int errorcode = 0;
2532 int escape;
2533 int i;
2534 BOOL inescq = FALSE;
2535 BOOL inverbname = FALSE;
2536 BOOL utf = (options & PCRE2_UTF) != 0;
2537 BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;
2538 BOOL isdupname;
2539 BOOL negate_class;
2540 BOOL okquantifier = FALSE;
2541 PCRE2_SPTR thisptr;
2542 PCRE2_SPTR name;
2543 PCRE2_SPTR ptrend = cb->end_pattern;
2544 PCRE2_SPTR verbnamestart = NULL; /* Value avoids compiler warning */
2545 named_group *ng;
2546 nest_save *top_nest, *end_nests;
2547
2548 /* Insert leading items for word and line matching (features provided for the
2549 benefit of pcre2grep). */
2550
2551 if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
2552 {
2553 *parsed_pattern++ = META_CIRCUMFLEX;
2554 *parsed_pattern++ = META_NOCAPTURE;
2555 }
2556 else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
2557 {
2558 *parsed_pattern++ = META_ESCAPE + ESC_b;
2559 *parsed_pattern++ = META_NOCAPTURE;
2560 }
2561
2562 /* If the pattern is actually a literal string, process it separately to avoid
2563 cluttering up the main loop. */
2564
2565 if ((options & PCRE2_LITERAL) != 0)
2566 {
2567 while (ptr < ptrend)
2568 {
2569 if (parsed_pattern >= parsed_pattern_end)
2570 {
2571 errorcode = ERR63; /* Internal error (parsed pattern overflow) */
2572 goto FAILED;
2573 }
2574 thisptr = ptr;
2575 GETCHARINCTEST(c, ptr);
2576 if (auto_callout)
2577 parsed_pattern = manage_callouts(thisptr, &previous_callout,
2578 auto_callout, parsed_pattern, cb);
2579 PARSED_LITERAL(c, parsed_pattern);
2580 }
2581 goto PARSED_END;
2582 }
2583
2584 /* Process a real regex which may contain meta-characters. */
2585
2586 top_nest = NULL;
2587 end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
2588
2589 /* The size of the nest_save structure might not be a factor of the size of the
2590 workspace. Therefore we must round down end_nests so as to correctly avoid
2591 creating a nest_save that spans the end of the workspace. */
2592
2593 end_nests = (nest_save *)((char *)end_nests -
2594 ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
2595
2596 /* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */
2597
2598 if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;
2599
2600 /* Now scan the pattern */
2601
2602 while (ptr < ptrend)
2603 {
2604 int prev_expect_cond_assert;
2605 uint32_t min_repeat, max_repeat;
2606 uint32_t set, unset, *optset;
2607 uint32_t terminator;
2608 uint32_t prev_meta_quantifier;
2609 BOOL prev_okquantifier;
2610 PCRE2_SPTR tempptr;
2611 PCRE2_SIZE offset;
2612
2613 if (parsed_pattern >= parsed_pattern_end)
2614 {
2615 errorcode = ERR63; /* Internal error (parsed pattern overflow) */
2616 goto FAILED;
2617 }
2618
2619 if (nest_depth > cb->cx->parens_nest_limit)
2620 {
2621 errorcode = ERR19;
2622 goto FAILED; /* Parentheses too deeply nested */
2623 }
2624
2625 /* Get next input character, save its position for callout handling. */
2626
2627 thisptr = ptr;
2628 GETCHARINCTEST(c, ptr);
2629
2630 /* Copy quoted literals until \E, allowing for the possibility of automatic
2631 callouts, except when processing a (*VERB) "name". */
2632
2633 if (inescq)
2634 {
2635 if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
2636 {
2637 inescq = FALSE;
2638 ptr++; /* Skip E */
2639 }
2640 else
2641 {
2642 if (expect_cond_assert > 0) /* A literal is not allowed if we are */
2643 { /* expecting a conditional assertion, */
2644 ptr--; /* but an empty \Q\E sequence is OK. */
2645 errorcode = ERR28;
2646 goto FAILED;
2647 }
2648 if (inverbname)
2649 { /* Don't use PARSED_LITERAL() because it */
2650 #if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */
2651 if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2652 #endif
2653 *parsed_pattern++ = c;
2654 }
2655 else
2656 {
2657 if (after_manual_callout-- <= 0)
2658 parsed_pattern = manage_callouts(thisptr, &previous_callout,
2659 auto_callout, parsed_pattern, cb);
2660 PARSED_LITERAL(c, parsed_pattern);
2661 }
2662 meta_quantifier = 0;
2663 }
2664 continue; /* Next character */
2665 }
2666
2667 /* If we are processing the "name" part of a (*VERB:NAME) item, all
2668 characters up to the closing parenthesis are literals except when
2669 PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q
2670 and \E and escaped characters are allowed (no character types such as \d). If
2671 PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do
2672 this by not entering the special (*VERB:NAME) processing - they are then
2673 picked up below. Note that c is a character, not a code unit, so we must not
2674 use MAX_255 to test its size because MAX_255 tests code units and is assumed
2675 TRUE in 8-bit mode. */
2676
2677 if (inverbname &&
2678 (
2679 /* EITHER: not both options set */
2680 ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) !=
2681 (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) ||
2682 #ifdef SUPPORT_UNICODE
2683 /* OR: character > 255 AND not Unicode Pattern White Space */
2684 (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) ||
2685 #endif
2686 /* OR: not a # comment or isspace() white space */
2687 (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0
2688 #ifdef SUPPORT_UNICODE
2689 /* and not CHAR_NEL when Unicode is supported */
2690 && c != CHAR_NEL
2691 #endif
2692 )))
2693 {
2694 PCRE2_SIZE verbnamelength;
2695
2696 switch(c)
2697 {
2698 default: /* Don't use PARSED_LITERAL() because it */
2699 #if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */
2700 if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2701 #endif
2702 *parsed_pattern++ = c;
2703 break;
2704
2705 case CHAR_RIGHT_PARENTHESIS:
2706 inverbname = FALSE;
2707 /* This is the length in characters */
2708 verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);
2709 /* But the limit on the length is in code units */
2710 if (ptr - verbnamestart - 1 > (int)MAX_MARK)
2711 {
2712 ptr--;
2713 errorcode = ERR76;
2714 goto FAILED;
2715 }
2716 *verblengthptr = (uint32_t)verbnamelength;
2717
2718 /* If this name was on a verb such as (*ACCEPT) which does not continue,
2719 a (*MARK) was generated for the name. We now add the original verb as the
2720 next item. */
2721
2722 if (add_after_mark != 0)
2723 {
2724 *parsed_pattern++ = add_after_mark;
2725 add_after_mark = 0;
2726 }
2727 break;
2728
2729 case CHAR_BACKSLASH:
2730 if ((options & PCRE2_ALT_VERBNAMES) != 0)
2731 {
2732 escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
2733 cb->cx->extra_options, FALSE, cb);
2734 if (errorcode != 0) goto FAILED;
2735 }
2736 else escape = 0; /* Treat all as literal */
2737
2738 switch(escape)
2739 {
2740 case 0: /* Don't use PARSED_LITERAL() because it */
2741 #if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */
2742 if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2743 #endif
2744 *parsed_pattern++ = c;
2745 break;
2746
2747 case ESC_Q:
2748 inescq = TRUE;
2749 break;
2750
2751 case ESC_E: /* Ignore */
2752 break;
2753
2754 default:
2755 errorcode = ERR40; /* Invalid in verb name */
2756 goto FAILED;
2757 }
2758 }
2759 continue; /* Next character in pattern */
2760 }
2761
2762 /* Not a verb name character. At this point we must process everything that
2763 must not change the quantification state. This is mainly comments, but we
2764 handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as
2765 A+, as in Perl. An isolated \E is ignored. */
2766
2767 if (c == CHAR_BACKSLASH && ptr < ptrend)
2768 {
2769 if (*ptr == CHAR_Q || *ptr == CHAR_E)
2770 {
2771 inescq = *ptr == CHAR_Q;
2772 ptr++;
2773 continue;
2774 }
2775 }
2776
2777 /* Skip over whitespace and # comments in extended mode. Note that c is a
2778 character, not a code unit, so we must not use MAX_255 to test its size
2779 because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The
2780 whitespace characters are those designated as "Pattern White Space" by
2781 Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is
2782 U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a
2783 subset of space characters that match \h and \v. */
2784
2785 if ((options & PCRE2_EXTENDED) != 0)
2786 {
2787 if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;
2788 #ifdef SUPPORT_UNICODE
2789 if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue;
2790 #endif
2791 if (c == CHAR_NUMBER_SIGN)
2792 {
2793 while (ptr < ptrend)
2794 {
2795 if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */
2796 { /* IS_NEWLINE sets cb->nllen. */
2797 ptr += cb->nllen;
2798 break;
2799 }
2800 ptr++;
2801 #ifdef SUPPORT_UNICODE
2802 if (utf) FORWARDCHARTEST(ptr, ptrend);
2803 #endif
2804 }
2805 continue; /* Next character in pattern */
2806 }
2807 }
2808
2809 /* Skip over bracketed comments */
2810
2811 if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 &&
2812 ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
2813 {
2814 while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);
2815 if (ptr >= ptrend)
2816 {
2817 errorcode = ERR18; /* A special error for missing ) in a comment */
2818 goto FAILED; /* to make it easier to debug. */
2819 }
2820 ptr++;
2821 continue; /* Next character in pattern */
2822 }
2823
2824 /* If the next item is not a quantifier, fill in length of any previous
2825 callout and create an auto callout if required. */
2826
2827 if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&
2828 (c != CHAR_LEFT_CURLY_BRACKET ||
2829 (tempptr = ptr,
2830 !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))
2831 {
2832 if (after_manual_callout-- <= 0)
2833 parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,
2834 parsed_pattern, cb);
2835 }
2836
2837 /* If expect_cond_assert is 2, we have just passed (?( and are expecting an
2838 assertion, possibly preceded by a callout. If the value is 1, we have just
2839 had the callout and expect an assertion. There must be at least 3 more
2840 characters in all cases. When expect_cond_assert is 2, we know that the
2841 current character is an opening parenthesis, as otherwise we wouldn't be
2842 here. However, when it is 1, we need to check, and it's easiest just to check
2843 always. Note that expect_cond_assert may be negative, since all callouts just
2844 decrement it. */
2845
2846 if (expect_cond_assert > 0)
2847 {
2848 BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
2849 (ptr[0] == CHAR_QUESTION_MARK || ptr[0] == CHAR_ASTERISK);
2850 if (ok)
2851 {
2852 if (ptr[0] == CHAR_ASTERISK) /* New alpha assertion format, possibly */
2853 {
2854 ok = MAX_255(ptr[1]) && (cb->ctypes[ptr[1]] & ctype_lcletter) != 0;
2855 }
2856 else switch(ptr[1]) /* Traditional symbolic format */
2857 {
2858 case CHAR_C:
2859 ok = expect_cond_assert == 2;
2860 break;
2861
2862 case CHAR_EQUALS_SIGN:
2863 case CHAR_EXCLAMATION_MARK:
2864 break;
2865
2866 case CHAR_LESS_THAN_SIGN:
2867 ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;
2868 break;
2869
2870 default:
2871 ok = FALSE;
2872 }
2873 }
2874
2875 if (!ok)
2876 {
2877 ptr--; /* Adjust error offset */
2878 errorcode = ERR28;
2879 goto FAILED;
2880 }
2881 }
2882
2883 /* Remember whether we are expecting a conditional assertion, and set the
2884 default for this item. */
2885
2886 prev_expect_cond_assert = expect_cond_assert;
2887 expect_cond_assert = 0;
2888
2889 /* Remember quantification status for the previous significant item, then set
2890 default for this item. */
2891
2892 prev_okquantifier = okquantifier;
2893 prev_meta_quantifier = meta_quantifier;
2894 okquantifier = FALSE;
2895 meta_quantifier = 0;
2896
2897 /* If the previous significant item was a quantifier, adjust the parsed code
2898 if there is a following modifier. The base meta value is always followed by
2899 the PLUS and QUERY values, in that order. We do this here rather than after
2900 reading a quantifier so that intervening comments and /x whitespace can be
2901 ignored without having to replicate code. */
2902
2903 if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS))
2904 {
2905 parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] =
2906 prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?
2907 0x00020000u : 0x00010000u);
2908 continue; /* Next character in pattern */
2909 }
2910
2911
2912 /* Process the next item in the main part of a pattern. */
2913
2914 switch(c)
2915 {
2916 default: /* Non-special character */
2917 PARSED_LITERAL(c, parsed_pattern);
2918 break;
2919
2920
2921 /* ---- Escape sequence ---- */
2922
2923 case CHAR_BACKSLASH:
2924 tempptr = ptr;
2925 escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
2926 cb->cx->extra_options, FALSE, cb);
2927 if (errorcode != 0)
2928 {
2929 ESCAPE_FAILED:
2930 if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
2931 goto FAILED;
2932 ptr = tempptr;
2933 if (ptr >= ptrend) c = CHAR_BACKSLASH; else
2934 {
2935 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
2936 }
2937 escape = 0; /* Treat as literal character */
2938 }
2939
2940 /* The escape was a data escape or literal character. */
2941
2942 if (escape == 0)
2943 {
2944 PARSED_LITERAL(c, parsed_pattern);
2945 }
2946
2947 /* The escape was a back (or forward) reference. We keep the offset in
2948 order to give a more useful diagnostic for a bad forward reference. For
2949 references to groups numbered less than 10 we can't use more than two items
2950 in parsed_pattern because they may be just two characters in the input (and
2951 in a 64-bit world an offset may need two elements). So for them, the offset
2952 of the first occurrence is held in a special vector. */
2953
2954 else if (escape < 0)
2955 {
2956 offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 1);
2957 escape = -escape;
2958 *parsed_pattern++ = META_BACKREF | (uint32_t)escape;
2959 if (escape < 10)
2960 {
2961 if (cb->small_ref_offset[escape] == PCRE2_UNSET)
2962 cb->small_ref_offset[escape] = offset;
2963 }
2964 else
2965 {
2966 PUTOFFSET(offset, parsed_pattern);
2967 }
2968 okquantifier = TRUE;
2969 }
2970
2971 /* The escape was a character class such as \d etc. or other special
2972 escape indicator such as \A or \X. Most of them generate just a single
2973 parsed item, but \P and \p are followed by a 16-bit type and a 16-bit
2974 value. They are supported only when Unicode is available. The type and
2975 value are packed into a single 32-bit value so that the whole sequence
2976 uses only two elements in the parsed_vector. This is because the same
2977 coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is
2978 set.
2979
2980 There are also some cases where the escape sequence is followed by a name:
2981 \k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>
2982 and \g'name' are subroutine calls by name; \g{name} is a synonym for
2983 \k{name}. Note that \g<number> and \g'number' are handled by check_escape()
2984 and returned as a negative value (handled above). A name is coded as an
2985 offset into the pattern and a length. */
2986
2987 else switch (escape)
2988 {
2989 case ESC_C:
2990 #ifdef NEVER_BACKSLASH_C
2991 errorcode = ERR85;
2992 goto ESCAPE_FAILED;
2993 #else
2994 if ((options & PCRE2_NEVER_BACKSLASH_C) != 0)
2995 {
2996 errorcode = ERR83;
2997 goto ESCAPE_FAILED;
2998 }
2999 #endif
3000 okquantifier = TRUE;
3001 *parsed_pattern++ = META_ESCAPE + escape;
3002 break;
3003
3004 case ESC_X:
3005 #ifndef SUPPORT_UNICODE
3006 errorcode = ERR45; /* Supported only with Unicode support */
3007 goto ESCAPE_FAILED;
3008 #endif
3009 case ESC_H:
3010 case ESC_h:
3011 case ESC_N:
3012 case ESC_R:
3013 case ESC_V:
3014 case ESC_v:
3015 okquantifier = TRUE;
3016 *parsed_pattern++ = META_ESCAPE + escape;
3017 break;
3018
3019 default: /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */
3020 *parsed_pattern++ = META_ESCAPE + escape;
3021 break;
3022
3023 /* Escapes that change in UCP mode. Note that PCRE2_UCP will never be set
3024 without Unicode support because it is checked when pcre2_compile() is
3025 called. */
3026
3027 case ESC_d:
3028 case ESC_D:
3029 case ESC_s:
3030 case ESC_S:
3031 case ESC_w:
3032 case ESC_W:
3033 okquantifier = TRUE;
3034 if ((options & PCRE2_UCP) == 0)
3035 {
3036 *parsed_pattern++ = META_ESCAPE + escape;
3037 }
3038 else
3039 {
3040 *parsed_pattern++ = META_ESCAPE +
3041 ((escape == ESC_d || escape == ESC_s || escape == ESC_w)?
3042 ESC_p : ESC_P);
3043 switch(escape)
3044 {
3045 case ESC_d:
3046 case ESC_D:
3047 *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
3048 break;
3049
3050 case ESC_s:
3051 case ESC_S:
3052 *parsed_pattern++ = PT_SPACE << 16;
3053 break;
3054
3055 case ESC_w:
3056 case ESC_W:
3057 *parsed_pattern++ = PT_WORD << 16;
3058 break;
3059 }
3060 }
3061 break;
3062
3063 /* Unicode property matching */
3064
3065 case ESC_P:
3066 case ESC_p:
3067 #ifdef SUPPORT_UNICODE
3068 {
3069 BOOL negated;
3070 uint16_t ptype = 0, pdata = 0;
3071 if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3072 goto ESCAPE_FAILED;
3073 if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3074 *parsed_pattern++ = META_ESCAPE + escape;
3075 *parsed_pattern++ = (ptype << 16) | pdata;
3076 okquantifier = TRUE;
3077 }
3078 #else
3079 errorcode = ERR45;
3080 goto ESCAPE_FAILED;
3081 #endif
3082 break; /* End \P and \p */
3083
3084 /* When \g is used with quotes or angle brackets as delimiters, it is a
3085 numerical or named subroutine call, and control comes here. When used
3086 with brace delimiters it is a numerical back reference and does not come
3087 here because check_escape() returns it directly as a reference. \k is
3088 always a named back reference. */
3089
3090 case ESC_g:
3091 case ESC_k:
3092 if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET &&
3093 *ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE))
3094 {
3095 errorcode = (escape == ESC_g)? ERR57 : ERR69;
3096 goto ESCAPE_FAILED;
3097 }
3098 terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
3099 CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
3100 CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
3101
3102 /* For a non-braced \g, check for a numerical recursion. */
3103
3104 if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)
3105 {
3106 PCRE2_SPTR p = ptr + 1;
3107
3108 if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
3109 &errorcode))
3110 {
3111 if (p >= ptrend || *p != terminator)
3112 {
3113 errorcode = ERR57;
3114 goto ESCAPE_FAILED;
3115 }
3116 ptr = p;
3117 goto SET_RECURSION;
3118 }
3119 if (errorcode != 0) goto ESCAPE_FAILED;
3120 }
3121
3122 /* Not a numerical recursion */
3123
3124 if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
3125 &errorcode, cb)) goto ESCAPE_FAILED;
3126
3127 /* \k and \g when used with braces are back references, whereas \g used
3128 with quotes or angle brackets is a recursion */
3129
3130 *parsed_pattern++ =
3131 (escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)?
3132 META_BACKREF_BYNAME : META_RECURSE_BYNAME;
3133 *parsed_pattern++ = namelen;
3134
3135 PUTOFFSET(offset, parsed_pattern);
3136 okquantifier = TRUE;
3137 break; /* End special escape processing */
3138 }
3139 break; /* End escape sequence processing */
3140
3141
3142 /* ---- Single-character special items ---- */
3143
3144 case CHAR_CIRCUMFLEX_ACCENT:
3145 *parsed_pattern++ = META_CIRCUMFLEX;
3146 break;
3147
3148 case CHAR_DOLLAR_SIGN:
3149 *parsed_pattern++ = META_DOLLAR;
3150 break;
3151
3152 case CHAR_DOT:
3153 *parsed_pattern++ = META_DOT;
3154 okquantifier = TRUE;
3155 break;
3156
3157
3158 /* ---- Single-character quantifiers ---- */
3159
3160 case CHAR_ASTERISK:
3161 meta_quantifier = META_ASTERISK;
3162 goto CHECK_QUANTIFIER;
3163
3164 case CHAR_PLUS:
3165 meta_quantifier = META_PLUS;
3166 goto CHECK_QUANTIFIER;
3167
3168 case CHAR_QUESTION_MARK:
3169 meta_quantifier = META_QUERY;
3170 goto CHECK_QUANTIFIER;
3171
3172
3173 /* ---- Potential {n,m} quantifier ---- */
3174
3175 case CHAR_LEFT_CURLY_BRACKET:
3176 if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,
3177 &errorcode))
3178 {
3179 if (errorcode != 0) goto FAILED; /* Error in quantifier. */
3180 PARSED_LITERAL(c, parsed_pattern); /* Not a quantifier */
3181 break; /* No more quantifier processing */
3182 }
3183 meta_quantifier = META_MINMAX;
3184 /* Fall through */
3185
3186
3187 /* ---- Quantifier post-processing ---- */
3188
3189 /* Check that a quantifier is allowed after the previous item. */
3190
3191 CHECK_QUANTIFIER:
3192 if (!prev_okquantifier)
3193 {
3194 errorcode = ERR9;
3195 goto FAILED_BACK;
3196 }
3197
3198 /* Most (*VERB)s are not allowed to be quantified, but an ungreedy
3199 quantifier can be useful for (*ACCEPT) - meaning "succeed on backtrack", a
3200 sort of negated (*COMMIT). We therefore allow (*ACCEPT) to be quantified by
3201 wrapping it in non-capturing brackets, but we have to allow for a preceding
3202 (*MARK) for when (*ACCEPT) has an argument. */
3203
3204 if (parsed_pattern[-1] == META_ACCEPT)
3205 {
3206 uint32_t *p;
3207 for (p = parsed_pattern - 1; p >= verbstartptr; p--) p[1] = p[0];
3208 *verbstartptr = META_NOCAPTURE;
3209 parsed_pattern[1] = META_KET;
3210 parsed_pattern += 2;
3211 }
3212
3213 /* Now we can put the quantifier into the parsed pattern vector. At this
3214 stage, we have only the basic quantifier. The check for a following + or ?
3215 modifier happens at the top of the loop, after any intervening comments
3216 have been removed. */
3217
3218 *parsed_pattern++ = meta_quantifier;
3219 if (c == CHAR_LEFT_CURLY_BRACKET)
3220 {
3221 *parsed_pattern++ = min_repeat;
3222 *parsed_pattern++ = max_repeat;
3223 }
3224 break;
3225
3226
3227 /* ---- Character class ---- */
3228
3229 case CHAR_LEFT_SQUARE_BRACKET:
3230 okquantifier = TRUE;
3231
3232 /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
3233 used for "start of word" and "end of word". As these are otherwise illegal
3234 sequences, we don't break anything by recognizing them. They are replaced
3235 by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
3236 erroneous and are handled by the normal code below. */
3237
3238 if (ptrend - ptr >= 6 &&
3239 (PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 ||
3240 PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0))
3241 {
3242 *parsed_pattern++ = META_ESCAPE + ESC_b;
3243
3244 if (ptr[2] == CHAR_LESS_THAN_SIGN)
3245 {
3246 *parsed_pattern++ = META_LOOKAHEAD;
3247 }
3248 else
3249 {
3250 *parsed_pattern++ = META_LOOKBEHIND;
3251 *has_lookbehind = TRUE;
3252
3253 /* The offset is used only for the "non-fixed length" error; this won't
3254 occur here, so just store zero. */
3255
3256 PUTOFFSET((PCRE2_SIZE)0, parsed_pattern);
3257 }
3258
3259 if ((options & PCRE2_UCP) == 0)
3260 *parsed_pattern++ = META_ESCAPE + ESC_w;
3261 else
3262 {
3263 *parsed_pattern++ = META_ESCAPE + ESC_p;
3264 *parsed_pattern++ = PT_WORD << 16;
3265 }
3266 *parsed_pattern++ = META_KET;
3267 ptr += 6;
3268 break;
3269 }
3270
3271 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3272 they are encountered at the top level, so we'll do that too. */
3273
3274 if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3275 *ptr == CHAR_EQUALS_SIGN) &&
3276 check_posix_syntax(ptr, ptrend, &tempptr))
3277 {
3278 errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;
3279 goto FAILED;
3280 }
3281
3282 /* Process a regular character class. If the first character is '^', set
3283 the negation flag. If the first few characters (either before or after ^)
3284 are \Q\E or \E or space or tab in extended-more mode, we skip them too.
3285 This makes for compatibility with Perl. */
3286
3287 negate_class = FALSE;
3288 while (ptr < ptrend)
3289 {
3290 GETCHARINCTEST(c, ptr);
3291 if (c == CHAR_BACKSLASH)
3292 {
3293 if (ptr < ptrend && *ptr == CHAR_E) ptr++;
3294 else if (ptrend - ptr >= 3 &&
3295 PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3296 ptr += 3;
3297 else
3298 break;
3299 }
3300 else if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3301 (c == CHAR_SPACE || c == CHAR_HT)) /* Note: just these two */
3302 continue;
3303 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3304 negate_class = TRUE;
3305 else break;
3306 }
3307
3308 /* Now the real contents of the class; c has the first "real" character.
3309 Empty classes are permitted only if the option is set. */
3310
3311 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3312 (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)
3313 {
3314 *parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;
3315 break; /* End of class processing */
3316 }
3317
3318 /* Process a non-empty class. */
3319
3320 *parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;
3321 class_range_state = RANGE_NO;
3322
3323 /* In an EBCDIC environment, Perl treats alphabetic ranges specially
3324 because there are holes in the encoding, and simply using the range A-Z
3325 (for example) would include the characters in the holes. This applies only
3326 to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]
3327 in this respect. In order to accommodate this, we keep track of whether
3328 character values are literal or not, and a state variable for handling
3329 ranges. */
3330
3331 /* Loop for the contents of the class */
3332
3333 for (;;)
3334 {
3335 BOOL char_is_literal = TRUE;
3336
3337 /* Inside \Q...\E everything is literal except \E */
3338
3339 if (inescq)
3340 {
3341 if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3342 {
3343 inescq = FALSE; /* Reset literal state */
3344 ptr++; /* Skip the 'E' */
3345 goto CLASS_CONTINUE;
3346 }
3347 goto CLASS_LITERAL;
3348 }
3349
3350 /* Skip over space and tab (only) in extended-more mode. */
3351
3352 if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3353 (c == CHAR_SPACE || c == CHAR_HT))
3354 goto CLASS_CONTINUE;
3355
3356 /* Handle POSIX class names. Perl allows a negation extension of the
3357 form [:^name:]. A square bracket that doesn't match the syntax is
3358 treated as a literal. We also recognize the POSIX constructions
3359 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3360 5.6 and 5.8 do. */
3361
3362 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3363 ptrend - ptr >= 3 &&
3364 (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3365 *ptr == CHAR_EQUALS_SIGN) &&
3366 check_posix_syntax(ptr, ptrend, &tempptr))
3367 {
3368 BOOL posix_negate = FALSE;
3369 int posix_class;
3370
3371 /* Perl treats a hyphen before a POSIX class as a literal, not the
3372 start of a range. However, it gives a warning in its warning mode. PCRE
3373 does not have a warning mode, so we give an error, because this is
3374 likely an error on the user's part. */
3375
3376 if (class_range_state == RANGE_STARTED)
3377 {
3378 errorcode = ERR50;
3379 goto FAILED;
3380 }
3381
3382 if (*ptr != CHAR_COLON)
3383 {
3384 errorcode = ERR13;
3385 goto FAILED_BACK;
3386 }
3387
3388 if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
3389 {
3390 posix_negate = TRUE;
3391 ptr++;
3392 }
3393
3394 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3395 if (posix_class < 0)
3396 {
3397 errorcode = ERR30;
3398 goto FAILED;
3399 }
3400 ptr = tempptr + 2;
3401
3402 /* Perl treats a hyphen after a POSIX class as a literal, not the
3403 start of a range. However, it gives a warning in its warning mode
3404 unless the hyphen is the last character in the class. PCRE does not
3405 have a warning mode, so we give an error, because this is likely an
3406 error on the user's part. */
3407
3408 if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3409 ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3410 {
3411 errorcode = ERR50;
3412 goto FAILED;
3413 }
3414
3415 /* Set "a hyphen is not the start of a range" for the -] case, and also
3416 in case the POSIX class is followed by \E or \Q\E (possibly repeated -
3417 fuzzers do that kind of thing) and *then* a hyphen. This causes that
3418 hyphen to be treated as a literal. I don't think it's worth setting up
3419 special apparatus to do otherwise. */
3420
3421 class_range_state = RANGE_NO;
3422
3423 /* When PCRE2_UCP is set, some of the POSIX classes are converted to
3424 use Unicode properties \p or \P or, in one case, \h or \H. The
3425 substitutes table has two values per class, containing the type and
3426 value of a \p or \P item. The special cases are specified with a
3427 negative type: a non-zero value causes \h or \H to be used, and a zero
3428 value falls through to behave like a non-UCP POSIX class. */
3429
3430 #ifdef SUPPORT_UNICODE
3431 if ((options & PCRE2_UCP) != 0)
3432 {
3433 int ptype = posix_substitutes[2*posix_class];
3434 int pvalue = posix_substitutes[2*posix_class + 1];
3435 if (ptype >= 0)
3436 {
3437 *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);
3438 *parsed_pattern++ = (ptype << 16) | pvalue;
3439 goto CLASS_CONTINUE;
3440 }
3441
3442 if (pvalue != 0)
3443 {
3444 *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h);
3445 goto CLASS_CONTINUE;
3446 }
3447
3448 /* Fall through */
3449 }
3450 #endif /* SUPPORT_UNICODE */
3451
3452 /* Non-UCP POSIX class */
3453
3454 *parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;
3455 *parsed_pattern++ = posix_class;
3456 }
3457
3458 /* Handle potential start of range */
3459
3460 else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)
3461 {
3462 *parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?
3463 META_RANGE_LITERAL : META_RANGE_ESCAPED;
3464 class_range_state = RANGE_STARTED;
3465 }
3466
3467 /* Handle a literal character */
3468
3469 else if (c != CHAR_BACKSLASH)
3470 {
3471 CLASS_LITERAL:
3472 if (class_range_state == RANGE_STARTED)
3473 {
3474 if (c == parsed_pattern[-2]) /* Optimize one-char range */
3475 parsed_pattern--;
3476 else if (parsed_pattern[-2] > c) /* Check range is in order */
3477 {
3478 errorcode = ERR8;
3479 goto FAILED_BACK;
3480 }
3481 else
3482 {
3483 if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL)
3484 parsed_pattern[-1] = META_RANGE_ESCAPED;
3485 PARSED_LITERAL(c, parsed_pattern);
3486 }
3487 class_range_state = RANGE_NO;
3488 }
3489 else /* Potential start of range */
3490 {
3491 class_range_state = char_is_literal?
3492 RANGE_OK_LITERAL : RANGE_OK_ESCAPED;
3493 PARSED_LITERAL(c, parsed_pattern);
3494 }
3495 }
3496
3497 /* Handle escapes in a class */
3498
3499 else
3500 {
3501 tempptr = ptr;
3502 escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3503 cb->cx->extra_options, TRUE, cb);
3504
3505 if (errorcode != 0)
3506 {
3507 if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3508 goto FAILED;
3509 ptr = tempptr;
3510 if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3511 {
3512 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
3513 }
3514 escape = 0; /* Treat as literal character */
3515 }
3516
3517 switch(escape)
3518 {
3519 case 0: /* Escaped character code point is in c */
3520 char_is_literal = FALSE;
3521 goto CLASS_LITERAL;
3522
3523 case ESC_b:
3524 c = CHAR_BS; /* \b is backspace in a class */
3525 char_is_literal = FALSE;
3526 goto CLASS_LITERAL;
3527
3528 case ESC_Q:
3529 inescq = TRUE; /* Enter literal mode */
3530 goto CLASS_CONTINUE;
3531
3532 case ESC_E: /* Ignore orphan \E */
3533 goto CLASS_CONTINUE;
3534
3535 case ESC_B: /* Always an error in a class */
3536 case ESC_R:
3537 case ESC_X:
3538 errorcode = ERR7;
3539 ptr--;
3540 goto FAILED;
3541 }
3542
3543 /* The second part of a range can be a single-character escape
3544 sequence (detected above), but not any of the other escapes. Perl
3545 treats a hyphen as a literal in such circumstances. However, in Perl's
3546 warning mode, a warning is given, so PCRE now faults it, as it is
3547 almost certainly a mistake on the user's part. */
3548
3549 if (class_range_state == RANGE_STARTED)
3550 {
3551 errorcode = ERR50;
3552 goto FAILED; /* Not CLASS_ESCAPE_FAILED; always an error */
3553 }
3554
3555 /* Of the remaining escapes, only those that define characters are
3556 allowed in a class. None may start a range. */
3557
3558 class_range_state = RANGE_NO;
3559 switch(escape)
3560 {
3561 case ESC_N:
3562 errorcode = ERR71;
3563 goto FAILED;
3564
3565 case ESC_H:
3566 case ESC_h:
3567 case ESC_V:
3568 case ESC_v:
3569 *parsed_pattern++ = META_ESCAPE + escape;
3570 break;
3571
3572 /* These escapes are converted to Unicode property tests when
3573 PCRE2_UCP is set. */
3574
3575 case ESC_d:
3576 case ESC_D:
3577 case ESC_s:
3578 case ESC_S:
3579 case ESC_w:
3580 case ESC_W:
3581 if ((options & PCRE2_UCP) == 0)
3582 {
3583 *parsed_pattern++ = META_ESCAPE + escape;
3584 }
3585 else
3586 {
3587 *parsed_pattern++ = META_ESCAPE +
3588 ((escape == ESC_d || escape == ESC_s || escape == ESC_w)?
3589 ESC_p : ESC_P);
3590 switch(escape)
3591 {
3592 case ESC_d:
3593 case ESC_D:
3594 *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
3595 break;
3596
3597 case ESC_s:
3598 case ESC_S:
3599 *parsed_pattern++ = PT_SPACE << 16;
3600 break;
3601
3602 case ESC_w:
3603 case ESC_W:
3604 *parsed_pattern++ = PT_WORD << 16;
3605 break;
3606 }
3607 }
3608 break;
3609
3610 /* Explicit Unicode property matching */
3611
3612 case ESC_P:
3613 case ESC_p:
3614 #ifdef SUPPORT_UNICODE
3615 {
3616 BOOL negated;
3617 uint16_t ptype = 0, pdata = 0;
3618 if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3619 goto FAILED;
3620 if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3621 *parsed_pattern++ = META_ESCAPE + escape;
3622 *parsed_pattern++ = (ptype << 16) | pdata;
3623 }
3624 #else
3625 errorcode = ERR45;
3626 goto FAILED;
3627 #endif
3628 break; /* End \P and \p */
3629
3630 default: /* All others are not allowed in a class */
3631 errorcode = ERR7;
3632 ptr--;
3633 goto FAILED;
3634 }
3635
3636 /* Perl gives a warning unless a following hyphen is the last character
3637 in the class. PCRE throws an error. */
3638
3639 if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3640 ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3641 {
3642 errorcode = ERR50;
3643 goto FAILED;
3644 }
3645 }
3646
3647 /* Proceed to next thing in the class. */
3648
3649 CLASS_CONTINUE:
3650 if (ptr >= ptrend)
3651 {
3652 errorcode = ERR6; /* Missing terminating ']' */
3653 goto FAILED;
3654 }
3655 GETCHARINCTEST(c, ptr);
3656 if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
3657 } /* End of class-processing loop */
3658
3659 /* -] at the end of a class is a literal '-' */
3660
3661 if (class_range_state == RANGE_STARTED)
3662 {
3663 parsed_pattern[-1] = CHAR_MINUS;
3664 class_range_state = RANGE_NO;
3665 }
3666
3667 *parsed_pattern++ = META_CLASS_END;
3668 break; /* End of character class */
3669
3670
3671 /* ---- Opening parenthesis ---- */
3672
3673 case CHAR_LEFT_PARENTHESIS:
3674 if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3675
3676 /* If ( is not followed by ? it is either a capture or a special verb or an
3677 alpha assertion or a positive non-atomic lookahead. */
3678
3679 if (*ptr != CHAR_QUESTION_MARK)
3680 {
3681 const char *vn;
3682
3683 /* Handle capturing brackets (or non-capturing if auto-capture is turned
3684 off). */
3685
3686 if (*ptr != CHAR_ASTERISK)
3687 {
3688 nest_depth++;
3689 if ((options & PCRE2_NO_AUTO_CAPTURE) == 0)
3690 {
3691 if (cb->bracount >= MAX_GROUP_NUMBER)
3692 {
3693 errorcode = ERR97;
3694 goto FAILED;
3695 }
3696 cb->bracount++;
3697 *parsed_pattern++ = META_CAPTURE | cb->bracount;
3698 }
3699 else *parsed_pattern++ = META_NOCAPTURE;
3700 }
3701
3702 /* Do nothing for (* followed by end of pattern or ) so it gives a "bad
3703 quantifier" error rather than "(*MARK) must have an argument". */
3704
3705 else if (ptrend - ptr <= 1 || (c = ptr[1]) == CHAR_RIGHT_PARENTHESIS)
3706 break;
3707
3708 /* Handle "alpha assertions" such as (*pla:...). Most of these are
3709 synonyms for the historical symbolic assertions, but the script run and
3710 non-atomic lookaround ones are new. They are distinguished by starting
3711 with a lower case letter. Checking both ends of the alphabet makes this
3712 work in all character codes. */
3713
3714 else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
3715 {
3716 uint32_t meta;
3717
3718 vn = alasnames;
3719 if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
3720 &errorcode, cb)) goto FAILED;
3721 if (ptr >= ptrend || *ptr != CHAR_COLON)
3722 {
3723 errorcode = ERR95; /* Malformed */
3724 goto FAILED;
3725 }
3726
3727 /* Scan the table of alpha assertion names */
3728
3729 for (i = 0; i < alascount; i++)
3730 {
3731 if (namelen == alasmeta[i].len &&
3732 PRIV(strncmp_c8)(name, vn, namelen) == 0)
3733 break;
3734 vn += alasmeta[i].len + 1;
3735 }
3736
3737 if (i >= alascount)
3738 {
3739 errorcode = ERR95; /* Alpha assertion not recognized */
3740 goto FAILED;
3741 }
3742
3743 /* Check for expecting an assertion condition. If so, only atomic
3744 lookaround assertions are valid. */
3745
3746 meta = alasmeta[i].meta;
3747 if (prev_expect_cond_assert > 0 &&
3748 (meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT))
3749 {
3750 errorcode = (meta == META_LOOKAHEAD_NA || meta == META_LOOKBEHIND_NA)?
3751 ERR98 : ERR28; /* (Atomic) assertion expected */
3752 goto FAILED;
3753 }
3754
3755 /* The lookaround alphabetic synonyms can mostly be handled by jumping
3756 to the code that handles the traditional symbolic forms. */
3757
3758 switch(meta)
3759 {
3760 default:
3761 errorcode = ERR89; /* Unknown code; should never occur because */
3762 goto FAILED; /* the meta values come from a table above. */
3763
3764 case META_ATOMIC:
3765 goto ATOMIC_GROUP;
3766
3767 case META_LOOKAHEAD:
3768 goto POSITIVE_LOOK_AHEAD;
3769
3770 case META_LOOKAHEAD_NA:
3771 goto POSITIVE_NONATOMIC_LOOK_AHEAD;
3772
3773 case META_LOOKAHEADNOT:
3774 goto NEGATIVE_LOOK_AHEAD;
3775
3776 case META_LOOKBEHIND:
3777 case META_LOOKBEHINDNOT:
3778 case META_LOOKBEHIND_NA:
3779 *parsed_pattern++ = meta;
3780 ptr--;
3781 goto POST_LOOKBEHIND;
3782
3783 /* The script run facilities are handled here. Unicode support is
3784 required (give an error if not, as this is a security issue). Always
3785 record a META_SCRIPT_RUN item. Then, for the atomic version, insert
3786 META_ATOMIC and remember that we need two META_KETs at the end. */
3787
3788 case META_SCRIPT_RUN:
3789 case META_ATOMIC_SCRIPT_RUN:
3790 #ifdef SUPPORT_UNICODE
3791 *parsed_pattern++ = META_SCRIPT_RUN;
3792 nest_depth++;
3793 ptr++;
3794 if (meta == META_ATOMIC_SCRIPT_RUN)
3795 {
3796 *parsed_pattern++ = META_ATOMIC;
3797 if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
3798 else if (++top_nest >= end_nests)
3799 {
3800 errorcode = ERR84;
3801 goto FAILED;
3802 }
3803 top_nest->nest_depth = nest_depth;
3804 top_nest->flags = NSF_ATOMICSR;
3805 top_nest->options = options & PARSE_TRACKED_OPTIONS;
3806 }
3807 break;
3808 #else /* SUPPORT_UNICODE */
3809 errorcode = ERR96;
3810 goto FAILED;
3811 #endif
3812 }
3813 }
3814
3815
3816 /* ---- Handle (*VERB) and (*VERB:NAME) ---- */
3817
3818 else
3819 {
3820 vn = verbnames;
3821 if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
3822 &errorcode, cb)) goto FAILED;
3823 if (ptr >= ptrend || (*ptr != CHAR_COLON &&
3824 *ptr != CHAR_RIGHT_PARENTHESIS))
3825 {
3826 errorcode = ERR60; /* Malformed */
3827 goto FAILED;
3828 }
3829
3830 /* Scan the table of verb names */
3831
3832 for (i = 0; i < verbcount; i++)
3833 {
3834 if (namelen == verbs[i].len &&
3835 PRIV(strncmp_c8)(name, vn, namelen) == 0)
3836 break;
3837 vn += verbs[i].len + 1;
3838 }
3839
3840 if (i >= verbcount)
3841 {
3842 errorcode = ERR60; /* Verb not recognized */
3843 goto FAILED;
3844 }
3845
3846 /* An empty argument is treated as no argument. */
3847
3848 if (*ptr == CHAR_COLON && ptr + 1 < ptrend &&
3849 ptr[1] == CHAR_RIGHT_PARENTHESIS)
3850 ptr++; /* Advance to the closing parens */
3851
3852 /* Check for mandatory non-empty argument; this is (*MARK) */
3853
3854 if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON)
3855 {
3856 errorcode = ERR66;
3857 goto FAILED;
3858 }
3859
3860 /* Remember where this verb, possibly with a preceding (*MARK), starts,
3861 for handling quantified (*ACCEPT). */
3862
3863 verbstartptr = parsed_pattern;
3864 okquantifier = (verbs[i].meta == META_ACCEPT);
3865
3866 /* It appears that Perl allows any characters whatsoever, other than a
3867 closing parenthesis, to appear in arguments ("names"), so we no longer
3868 insist on letters, digits, and underscores. Perl does not, however, do
3869 any interpretation within arguments, and has no means of including a
3870 closing parenthesis. PCRE supports escape processing but only when it
3871 is requested by an option. We set inverbname TRUE here, and let the
3872 main loop take care of this so that escape and \x processing is done by
3873 the main code above. */
3874
3875 if (*ptr++ == CHAR_COLON) /* Skip past : or ) */
3876 {
3877 /* Some optional arguments can be treated as a preceding (*MARK) */
3878
3879 if (verbs[i].has_arg < 0)
3880 {
3881 add_after_mark = verbs[i].meta;
3882 *parsed_pattern++ = META_MARK;
3883 }
3884
3885 /* The remaining verbs with arguments (except *MARK) need a different
3886 opcode. */
3887
3888 else
3889 {
3890 *parsed_pattern++ = verbs[i].meta +
3891 ((verbs[i].meta != META_MARK)? 0x00010000u:0);
3892 }
3893
3894 /* Set up for reading the name in the main loop. */
3895
3896 verblengthptr = parsed_pattern++;
3897 verbnamestart = ptr;
3898 inverbname = TRUE;
3899 }
3900 else /* No verb "name" argument */
3901 {
3902 *parsed_pattern++ = verbs[i].meta;
3903 }
3904 } /* End of (*VERB) handling */
3905 break; /* Done with this parenthesis */
3906 } /* End of groups that don't start with (? */
3907
3908
3909 /* ---- Items starting (? ---- */
3910
3911 /* The type of item is determined by what follows (?. Handle (?| and option
3912 changes under "default" because both need a new block on the nest stack.
3913 Comments starting with (?# are handled above. Note that there is some
3914 ambiguity about the sequence (?- because if a digit follows it's a relative
3915 recursion or subroutine call whereas otherwise it's an option unsetting. */
3916
3917 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3918
3919 switch(*ptr)
3920 {
3921 default:
3922 if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1]))
3923 goto RECURSION_BYNUMBER; /* The + case is handled by CHAR_PLUS */
3924
3925 /* We now have either (?| or a (possibly empty) option setting,
3926 optionally followed by a non-capturing group. */
3927
3928 nest_depth++;
3929 if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
3930 else if (++top_nest >= end_nests)
3931 {
3932 errorcode = ERR84;
3933 goto FAILED;
3934 }
3935 top_nest->nest_depth = nest_depth;
3936 top_nest->flags = 0;
3937 top_nest->options = options & PARSE_TRACKED_OPTIONS;
3938
3939 /* Start of non-capturing group that resets the capture count for each
3940 branch. */
3941
3942 if (*ptr == CHAR_VERTICAL_LINE)
3943 {
3944 top_nest->reset_group = (uint16_t)cb->bracount;
3945 top_nest->max_group = (uint16_t)cb->bracount;
3946 top_nest->flags |= NSF_RESET;
3947 cb->external_flags |= PCRE2_DUPCAPUSED;
3948 *parsed_pattern++ = META_NOCAPTURE;
3949 ptr++;
3950 }
3951
3952 /* Scan for options imnsxJU to be set or unset. */
3953
3954 else
3955 {
3956 BOOL hyphenok = TRUE;
3957 uint32_t oldoptions = options;
3958
3959 top_nest->reset_group = 0;
3960 top_nest->max_group = 0;
3961 set = unset = 0;
3962 optset = &set;
3963
3964 /* ^ at the start unsets imnsx and disables the subsequent use of - */
3965
3966 if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)
3967 {
3968 options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE|
3969 PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE);
3970 hyphenok = FALSE;
3971 ptr++;
3972 }
3973
3974 while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&
3975 *ptr != CHAR_COLON)
3976 {
3977 switch (*ptr++)
3978 {
3979 case CHAR_MINUS:
3980 if (!hyphenok)
3981 {
3982 errorcode = ERR94;
3983 ptr--; /* Correct the offset */
3984 goto FAILED;
3985 }
3986 optset = &unset;
3987 hyphenok = FALSE;
3988 break;
3989
3990 case CHAR_J: /* Record that it changed in the external options */
3991 *optset |= PCRE2_DUPNAMES;
3992 cb->external_flags |= PCRE2_JCHANGED;
3993 break;
3994
3995 case CHAR_i: *optset |= PCRE2_CASELESS; break;
3996 case CHAR_m: *optset |= PCRE2_MULTILINE; break;
3997 case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break;
3998 case CHAR_s: *optset |= PCRE2_DOTALL; break;
3999 case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
4000
4001 /* If x appears twice it sets the extended extended option. */
4002
4003 case CHAR_x:
4004 *optset |= PCRE2_EXTENDED;
4005 if (ptr < ptrend && *ptr == CHAR_x)
4006 {
4007 *optset |= PCRE2_EXTENDED_MORE;
4008 ptr++;
4009 }
4010 break;
4011
4012 default:
4013 errorcode = ERR11;
4014 ptr--; /* Correct the offset */
4015 goto FAILED;
4016 }
4017 }
4018
4019 /* If we are setting extended without extended-more, ensure that any
4020 existing extended-more gets unset. Also, unsetting extended must also
4021 unset extended-more. */
4022
4023 if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED ||
4024 (unset & PCRE2_EXTENDED) != 0)
4025 unset |= PCRE2_EXTENDED_MORE;
4026
4027 options = (options | set) & (~unset);
4028
4029 /* If the options ended with ')' this is not the start of a nested
4030 group with option changes, so the options change at this level.
4031 In this case, if the previous level set up a nest block, discard the
4032 one we have just created. Otherwise adjust it for the previous level.
4033 If the options ended with ':' we are starting a non-capturing group,
4034 possibly with an options setting. */
4035
4036 if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4037 if (*ptr++ == CHAR_RIGHT_PARENTHESIS)
4038 {
4039 nest_depth--; /* This is not a nested group after all. */
4040 if (top_nest > (nest_save *)(cb->start_workspace) &&
4041 (top_nest-1)->nest_depth == nest_depth) top_nest--;
4042 else top_nest->nest_depth = nest_depth;
4043 }
4044 else *parsed_pattern++ = META_NOCAPTURE;
4045
4046 /* If nothing changed, no need to record. */
4047
4048 if (options != oldoptions)
4049 {
4050 *parsed_pattern++ = META_OPTIONS;
4051 *parsed_pattern++ = options;
4052 }
4053 } /* End options processing */
4054 break; /* End default case after (? */
4055
4056
4057 /* ---- Python syntax support ---- */
4058
4059 case CHAR_P:
4060 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4061
4062 /* (?P<name> is the same as (?<name>, which defines a named group. */
4063
4064 if (*ptr == CHAR_LESS_THAN_SIGN)
4065 {
4066 terminator = CHAR_GREATER_THAN_SIGN;
4067 goto DEFINE_NAME;
4068 }
4069
4070 /* (?P>name) is the same as (?&name), which is a recursion or subroutine
4071 call. */
4072
4073 if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME;
4074
4075 /* (?P=name) is the same as \k<name>, a back reference by name. Anything
4076 else after (?P is an error. */
4077
4078 if (*ptr != CHAR_EQUALS_SIGN)
4079 {
4080 errorcode = ERR41;
4081 goto FAILED;
4082 }
4083 if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4084 &namelen, &errorcode, cb)) goto FAILED;
4085 *parsed_pattern++ = META_BACKREF_BYNAME;
4086 *parsed_pattern++ = namelen;
4087 PUTOFFSET(offset, parsed_pattern);
4088 okquantifier = TRUE;
4089 break; /* End of (?P processing */
4090
4091
4092 /* ---- Recursion/subroutine calls by number ---- */
4093
4094 case CHAR_R:
4095 i = 0; /* (?R) == (?R0) */
4096 ptr++;
4097 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4098 {
4099 errorcode = ERR58;
4100 goto FAILED;
4101 }
4102 goto SET_RECURSION;
4103
4104 /* An item starting (?- followed by a digit comes here via the "default"
4105 case because (?- followed by a non-digit is an options setting. */
4106
4107 case CHAR_PLUS:
4108 if (ptrend - ptr < 2 || !IS_DIGIT(ptr[1]))
4109 {
4110 errorcode = ERR29; /* Missing number */
4111 goto FAILED;
4112 }
4113 /* Fall through */
4114
4115 case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
4116 case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
4117 RECURSION_BYNUMBER:
4118 if (!read_number(&ptr, ptrend,
4119 (IS_DIGIT(*ptr))? -1:(int)(cb->bracount), /* + and - are relative */
4120 MAX_GROUP_NUMBER, ERR61,
4121 &i, &errorcode)) goto FAILED;
4122 if (i < 0) /* NB (?0) is permitted */
4123 {
4124 errorcode = ERR15; /* Unknown group */
4125 goto FAILED_BACK;
4126 }
4127 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4128 goto UNCLOSED_PARENTHESIS;
4129
4130 SET_RECURSION:
4131 *parsed_pattern++ = META_RECURSE | (uint32_t)i;
4132 offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4133 ptr++;
4134 PUTOFFSET(offset, parsed_pattern);
4135 okquantifier = TRUE;
4136 break; /* End of recursive call by number handling */
4137
4138
4139 /* ---- Recursion/subroutine calls by name ---- */
4140
4141 case CHAR_AMPERSAND:
4142 RECURSE_BY_NAME:
4143 if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4144 &namelen, &errorcode, cb)) goto FAILED;
4145 *parsed_pattern++ = META_RECURSE_BYNAME;
4146 *parsed_pattern++ = namelen;
4147 PUTOFFSET(offset, parsed_pattern);
4148 okquantifier = TRUE;
4149 break;
4150
4151 /* ---- Callout with numerical or string argument ---- */
4152
4153 case CHAR_C:
4154 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4155
4156 /* If the previous item was a condition starting (?(? an assertion,
4157 optionally preceded by a callout, is expected. This is checked later on,
4158 during actual compilation. However we need to identify this kind of
4159 assertion in this pass because it must not be qualified. The value of
4160 expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4161 for a callout - still leaving a positive value that identifies the
4162 assertion. Multiple callouts or any other items will make it zero or
4163 less, which doesn't matter because they will cause an error later. */
4164
4165 expect_cond_assert = prev_expect_cond_assert - 1;
4166
4167 /* If previous_callout is not NULL, it means this follows a previous
4168 callout. If it was a manual callout, do nothing; this means its "length
4169 of next pattern item" field will remain zero. If it was an automatic
4170 callout, abolish it. */
4171
4172 if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 &&
4173 previous_callout == parsed_pattern - 4 &&
4174 parsed_pattern[-1] == 255)
4175 parsed_pattern = previous_callout;
4176
4177 /* Save for updating next pattern item length, and skip one item before
4178 completing. */
4179
4180 previous_callout = parsed_pattern;
4181 after_manual_callout = 1;
4182
4183 /* Handle a string argument; specific delimiter is required. */
4184
4185 if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
4186 {
4187 PCRE2_SIZE calloutlength;
4188 PCRE2_SPTR startptr = ptr;
4189
4190 delimiter = 0;
4191 for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
4192 {
4193 if (*ptr == PRIV(callout_start_delims)[i])
4194 {
4195 delimiter = PRIV(callout_end_delims)[i];
4196 break;
4197 }
4198 }
4199 if (delimiter == 0)
4200 {
4201 errorcode = ERR82;
4202 goto FAILED;
4203 }
4204
4205 *parsed_pattern = META_CALLOUT_STRING;
4206 parsed_pattern += 3; /* Skip pattern info */
4207
4208 for (;;)
4209 {
4210 if (++ptr >= ptrend)
4211 {
4212 errorcode = ERR81;
4213 ptr = startptr; /* To give a more useful message */
4214 goto FAILED;
4215 }
4216 if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter))
4217 break;
4218 }
4219
4220 calloutlength = (PCRE2_SIZE)(ptr - startptr);
4221 if (calloutlength > UINT32_MAX)
4222 {
4223 errorcode = ERR72;
4224 goto FAILED;
4225 }
4226 *parsed_pattern++ = (uint32_t)calloutlength;
4227 offset = (PCRE2_SIZE)(startptr - cb->start_pattern);
4228 PUTOFFSET(offset, parsed_pattern);
4229 }
4230
4231 /* Handle a callout with an optional numerical argument, which must be
4232 less than or equal to 255. A missing argument gives 0. */
4233
4234 else
4235 {
4236 int n = 0;
4237 *parsed_pattern = META_CALLOUT_NUMBER; /* Numerical callout */
4238 parsed_pattern += 3; /* Skip pattern info */
4239 while (ptr < ptrend && IS_DIGIT(*ptr))
4240 {
4241 n = n * 10 + *ptr++ - CHAR_0;
4242 if (n > 255)
4243 {
4244 errorcode = ERR38;
4245 goto FAILED;
4246 }
4247 }
4248 *parsed_pattern++ = n;
4249 }
4250
4251 /* Both formats must have a closing parenthesis */
4252
4253 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4254 {
4255 errorcode = ERR39;
4256 goto FAILED;
4257 }
4258 ptr++;
4259
4260 /* Remember the offset to the next item in the pattern, and set a default
4261 length. This should get updated after the next item is read. */
4262
4263 previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
4264 previous_callout[2] = 0;
4265 break; /* End callout */
4266
4267
4268 /* ---- Conditional group ---- */
4269
4270 /* A condition can be an assertion, a number (referring to a numbered
4271 group's having been set), a name (referring to a named group), or 'R',
4272 referring to overall recursion. R<digits> and R&name are also permitted
4273 for recursion state tests. Numbers may be preceded by + or - to specify a
4274 relative group number.
4275
4276 There are several syntaxes for testing a named group: (?(name)) is used
4277 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4278
4279 There are two unfortunate ambiguities. 'R' can be the recursive thing or
4280 the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be
4281 the Perl DEFINE feature or the Python named test. We look for a name
4282 first; if not found, we try the other case.
4283
4284 For compatibility with auto-callouts, we allow a callout to be specified
4285 before a condition that is an assertion. */
4286
4287 case CHAR_LEFT_PARENTHESIS:
4288 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4289 nest_depth++;
4290
4291 /* If the next character is ? or * there must be an assertion next
4292 (optionally preceded by a callout). We do not check this here, but
4293 instead we set expect_cond_assert to 2. If this is still greater than
4294 zero (callouts decrement it) when the next assertion is read, it will be
4295 marked as a condition that must not be repeated. A value greater than
4296 zero also causes checking that an assertion (possibly with callout)
4297 follows. */
4298
4299 if (*ptr == CHAR_QUESTION_MARK || *ptr == CHAR_ASTERISK)
4300 {
4301 *parsed_pattern++ = META_COND_ASSERT;
4302 ptr--; /* Pull pointer back to the opening parenthesis. */
4303 expect_cond_assert = 2;
4304 break; /* End of conditional */
4305 }
4306
4307 /* Handle (?([+-]number)... */
4308
4309 if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
4310 &errorcode))
4311 {
4312 if (i <= 0)
4313 {
4314 errorcode = ERR15;
4315 goto FAILED;
4316 }
4317 *parsed_pattern++ = META_COND_NUMBER;
4318 offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4319 PUTOFFSET(offset, parsed_pattern);
4320 *parsed_pattern++ = i;
4321 }
4322 else if (errorcode != 0) goto FAILED; /* Number too big */
4323
4324 /* No number found. Handle the special case (?(VERSION[>]=n.m)... */
4325
4326 else if (ptrend - ptr >= 10 &&
4327 PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
4328 ptr[7] != CHAR_RIGHT_PARENTHESIS)
4329 {
4330 uint32_t ge = 0;
4331 int major = 0;
4332 int minor = 0;
4333
4334 ptr += 7;
4335 if (*ptr == CHAR_GREATER_THAN_SIGN)
4336 {
4337 ge = 1;
4338 ptr++;
4339 }
4340
4341 /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
4342 references its argument twice. */
4343
4344 if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
4345 goto BAD_VERSION_CONDITION;
4346
4347 if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode))
4348 goto FAILED;
4349
4350 if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4351 if (*ptr == CHAR_DOT)
4352 {
4353 if (++ptr >= ptrend || !IS_DIGIT(*ptr)) goto BAD_VERSION_CONDITION;
4354 minor = (*ptr++ - CHAR_0) * 10;
4355 if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4356 if (IS_DIGIT(*ptr)) minor += *ptr++ - CHAR_0;
4357 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4358 goto BAD_VERSION_CONDITION;
4359 }
4360
4361 *parsed_pattern++ = META_COND_VERSION;
4362 *parsed_pattern++ = ge;
4363 *parsed_pattern++ = major;
4364 *parsed_pattern++ = minor;
4365 }
4366
4367 /* All the remaining cases now require us to read a name. We cannot at
4368 this stage distinguish ambiguous cases such as (?(R12) which might be a
4369 recursion test by number or a name, because the named groups have not yet
4370 all been identified. Those cases are treated as names, but given a
4371 different META code. */
4372
4373 else
4374 {
4375 BOOL was_r_ampersand = FALSE;
4376
4377 if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND)
4378 {
4379 terminator = CHAR_RIGHT_PARENTHESIS;
4380 was_r_ampersand = TRUE;
4381 ptr++;
4382 }
4383 else if (*ptr == CHAR_LESS_THAN_SIGN)
4384 terminator = CHAR_GREATER_THAN_SIGN;
4385 else if (*ptr == CHAR_APOSTROPHE)
4386 terminator = CHAR_APOSTROPHE;
4387 else
4388 {
4389 terminator = CHAR_RIGHT_PARENTHESIS;
4390 ptr--; /* Point to char before name */
4391 }
4392 if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4393 &errorcode, cb)) goto FAILED;
4394
4395 /* Handle (?(R&name) */
4396
4397 if (was_r_ampersand)
4398 {
4399 *parsed_pattern = META_COND_RNAME;
4400 ptr--; /* Back to closing parens */
4401 }
4402
4403 /* Handle (?(name). If the name is "DEFINE" we identify it with a
4404 special code. Likewise if the name consists of R followed only by
4405 digits. Otherwise, handle it like a quoted name. */
4406
4407 else if (terminator == CHAR_RIGHT_PARENTHESIS)
4408 {
4409 if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
4410 *parsed_pattern = META_COND_DEFINE;
4411 else
4412 {
4413 for (i = 1; i < (int)namelen; i++)
4414 if (!IS_DIGIT(name[i])) break;
4415 *parsed_pattern = (*name == CHAR_R && i >= (int)namelen)?
4416 META_COND_RNUMBER : META_COND_NAME;
4417 }
4418 ptr--; /* Back to closing parens */
4419 }
4420
4421 /* Handle (?('name') or (?(<name>) */
4422
4423 else *parsed_pattern = META_COND_NAME;
4424
4425 /* All these cases except DEFINE end with the name length and offset;
4426 DEFINE just has an offset (for the "too many branches" error). */
4427
4428 if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen;
4429 PUTOFFSET(offset, parsed_pattern);
4430 } /* End cases that read a name */
4431
4432 /* Check the closing parenthesis of the condition */
4433
4434 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4435 {
4436 errorcode = ERR24;
4437 goto FAILED;
4438 }
4439 ptr++;
4440 break; /* End of condition processing */
4441
4442
4443 /* ---- Atomic group ---- */
4444
4445 case CHAR_GREATER_THAN_SIGN:
4446 ATOMIC_GROUP: /* Come from (*atomic: */
4447 *parsed_pattern++ = META_ATOMIC;
4448 nest_depth++;
4449 ptr++;
4450 break;
4451
4452
4453 /* ---- Lookahead assertions ---- */
4454
4455 case CHAR_EQUALS_SIGN:
4456 POSITIVE_LOOK_AHEAD: /* Come from (*pla: */
4457 *parsed_pattern++ = META_LOOKAHEAD;
4458 ptr++;
4459 goto POST_ASSERTION;
4460
4461 case CHAR_ASTERISK:
4462 POSITIVE_NONATOMIC_LOOK_AHEAD: /* Come from (?* */
4463 *parsed_pattern++ = META_LOOKAHEAD_NA;
4464 ptr++;
4465 goto POST_ASSERTION;
4466
4467 case CHAR_EXCLAMATION_MARK:
4468 NEGATIVE_LOOK_AHEAD: /* Come from (*nla: */
4469 *parsed_pattern++ = META_LOOKAHEADNOT;
4470 ptr++;
4471 goto POST_ASSERTION;
4472
4473
4474 /* ---- Lookbehind assertions ---- */
4475
4476 /* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?<
4477 is the start of the name of a capturing group. */
4478
4479 case CHAR_LESS_THAN_SIGN:
4480 if (ptrend - ptr <= 1 ||
4481 (ptr[1] != CHAR_EQUALS_SIGN &&
4482 ptr[1] != CHAR_EXCLAMATION_MARK &&
4483 ptr[1] != CHAR_ASTERISK))
4484 {
4485 terminator = CHAR_GREATER_THAN_SIGN;
4486 goto DEFINE_NAME;
4487 }
4488 *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
4489 META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)?
4490 META_LOOKBEHINDNOT : META_LOOKBEHIND_NA;
4491
4492 POST_LOOKBEHIND: /* Come from (*plb: (*naplb: and (*nlb: */
4493 *has_lookbehind = TRUE;
4494 offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4495 PUTOFFSET(offset, parsed_pattern);
4496 ptr += 2;
4497 /* Fall through */
4498
4499 /* If the previous item was a condition starting (?(? an assertion,
4500 optionally preceded by a callout, is expected. This is checked later on,
4501 during actual compilation. However we need to identify this kind of
4502 assertion in this pass because it must not be qualified. The value of
4503 expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4504 for a callout - still leaving a positive value that identifies the
4505 assertion. Multiple callouts or any other items will make it zero or
4506 less, which doesn't matter because they will cause an error later. */
4507
4508 POST_ASSERTION:
4509 nest_depth++;
4510 if (prev_expect_cond_assert > 0)
4511 {
4512 if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4513 else if (++top_nest >= end_nests)
4514 {
4515 errorcode = ERR84;
4516 goto FAILED;
4517 }
4518 top_nest->nest_depth = nest_depth;
4519 top_nest->flags = NSF_CONDASSERT;
4520 top_nest->options = options & PARSE_TRACKED_OPTIONS;
4521 }
4522 break;
4523
4524
4525 /* ---- Define a named group ---- */
4526
4527 /* A named group may be defined as (?'name') or (?<name>). In the latter
4528 case we jump to DEFINE_NAME from the disambiguation of (?< above with the
4529 terminator set to '>'. */
4530
4531 case CHAR_APOSTROPHE:
4532 terminator = CHAR_APOSTROPHE; /* Terminator */
4533
4534 DEFINE_NAME:
4535 if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4536 &errorcode, cb)) goto FAILED;
4537
4538 /* We have a name for this capturing group. It is also assigned a number,
4539 which is its primary means of identification. */
4540
4541 if (cb->bracount >= MAX_GROUP_NUMBER)
4542 {
4543 errorcode = ERR97;
4544 goto FAILED;
4545 }
4546 cb->bracount++;
4547 *parsed_pattern++ = META_CAPTURE | cb->bracount;
4548 nest_depth++;
4549
4550 /* Check not too many names */
4551
4552 if (cb->names_found >= MAX_NAME_COUNT)
4553 {
4554 errorcode = ERR49;
4555 goto FAILED;
4556 }
4557
4558 /* Adjust the entry size to accommodate the longest name found. */
4559
4560 if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
4561 cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
4562
4563 /* Scan the list to check for duplicates. For duplicate names, if the
4564 number is the same, break the loop, which causes the name to be
4565 discarded; otherwise, if DUPNAMES is not set, give an error.
4566 If it is set, allow the name with a different number, but continue
4567 scanning in case this is a duplicate with the same number. For
4568 non-duplicate names, give an error if the number is duplicated. */
4569
4570 isdupname = FALSE;
4571 ng = cb->named_groups;
4572 for (i = 0; i < cb->names_found; i++, ng++)
4573 {
4574 if (namelen == ng->length &&
4575 PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0)
4576 {
4577 if (ng->number == cb->bracount) break;
4578 if ((options & PCRE2_DUPNAMES) == 0)
4579 {
4580 errorcode = ERR43;
4581 goto FAILED;
4582 }
4583 isdupname = ng->isdup = TRUE; /* Mark as a duplicate */
4584 cb->dupnames = TRUE; /* Duplicate names exist */
4585 }
4586 else if (ng->number == cb->bracount)
4587 {
4588 errorcode = ERR65;
4589 goto FAILED;
4590 }
4591 }
4592
4593 if (i < cb->names_found) break; /* Ignore duplicate with same number */
4594
4595 /* Increase the list size if necessary */
4596
4597 if (cb->names_found >= cb->named_group_list_size)
4598 {
4599 uint32_t newsize = cb->named_group_list_size * 2;
4600 named_group *newspace =
4601 cb->cx->memctl.malloc(newsize * sizeof(named_group),
4602 cb->cx->memctl.memory_data);
4603 if (newspace == NULL)
4604 {
4605 errorcode = ERR21;
4606 goto FAILED;
4607 }
4608
4609 memcpy(newspace, cb->named_groups,
4610 cb->named_group_list_size * sizeof(named_group));
4611 if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
4612 cb->cx->memctl.free((void *)cb->named_groups,
4613 cb->cx->memctl.memory_data);
4614 cb->named_groups = newspace;
4615 cb->named_group_list_size = newsize;
4616 }
4617
4618 /* Add this name to the list */
4619
4620 cb->named_groups[cb->names_found].name = name;
4621 cb->named_groups[cb->names_found].length = (uint16_t)namelen;
4622 cb->named_groups[cb->names_found].number = cb->bracount;
4623 cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname;
4624 cb->names_found++;
4625 break;
4626 } /* End of (? switch */
4627 break; /* End of ( handling */
4628
4629
4630 /* ---- Branch terminators ---- */
4631
4632 /* Alternation: reset the capture count if we are in a (?| group. */
4633
4634 case CHAR_VERTICAL_LINE:
4635 if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
4636 (top_nest->flags & NSF_RESET) != 0)
4637 {
4638 if (cb->bracount > top_nest->max_group)
4639 top_nest->max_group = (uint16_t)cb->bracount;
4640 cb->bracount = top_nest->reset_group;
4641 }
4642 *parsed_pattern++ = META_ALT;
4643 break;
4644
4645 /* End of group; reset the capture count to the maximum if we are in a (?|
4646 group and/or reset the options that are tracked during parsing. Disallow
4647 quantifier for a condition that is an assertion. */
4648
4649 case CHAR_RIGHT_PARENTHESIS:
4650 okquantifier = TRUE;
4651 if (top_nest != NULL && top_nest->nest_depth == nest_depth)
4652 {
4653 options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options;
4654 if ((top_nest->flags & NSF_RESET) != 0 &&
4655 top_nest->max_group > cb->bracount)
4656 cb->bracount = top_nest->max_group;
4657 if ((top_nest->flags & NSF_CONDASSERT) != 0)
4658 okquantifier = FALSE;
4659
4660 if ((top_nest->flags & NSF_ATOMICSR) != 0)
4661 {
4662 *parsed_pattern++ = META_KET;
4663 }
4664
4665 if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
4666 else top_nest--;
4667 }
4668 if (nest_depth == 0) /* Unmatched closing parenthesis */
4669 {
4670 errorcode = ERR22;
4671 goto FAILED_BACK;
4672 }
4673 nest_depth--;
4674 *parsed_pattern++ = META_KET;
4675 break;
4676 } /* End of switch on pattern character */
4677 } /* End of main character scan loop */
4678
4679 /* End of pattern reached. Check for missing ) at the end of a verb name. */
4680
4681 if (inverbname && ptr >= ptrend)
4682 {
4683 errorcode = ERR60;
4684 goto FAILED;
4685 }
4686
4687 /* Manage callout for the final item */
4688
4689 PARSED_END:
4690 parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
4691 parsed_pattern, cb);
4692
4693 /* Insert trailing items for word and line matching (features provided for the
4694 benefit of pcre2grep). */
4695
4696 if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
4697 {
4698 *parsed_pattern++ = META_KET;
4699 *parsed_pattern++ = META_DOLLAR;
4700 }
4701 else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
4702 {
4703 *parsed_pattern++ = META_KET;
4704 *parsed_pattern++ = META_ESCAPE + ESC_b;
4705 }
4706
4707 /* Terminate the parsed pattern, then return success if all groups are closed.
4708 Otherwise we have unclosed parentheses. */
4709
4710 if (parsed_pattern >= parsed_pattern_end)
4711 {
4712 errorcode = ERR63; /* Internal error (parsed pattern overflow) */
4713 goto FAILED;
4714 }
4715
4716 *parsed_pattern = META_END;
4717 if (nest_depth == 0) return 0;
4718
4719 UNCLOSED_PARENTHESIS:
4720 errorcode = ERR14;
4721
4722 /* Come here for all failures. */
4723
4724 FAILED:
4725 cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4726 return errorcode;
4727
4728 /* Some errors need to indicate the previous character. */
4729
4730 FAILED_BACK:
4731 ptr--;
4732 goto FAILED;
4733
4734 /* This failure happens several times. */
4735
4736 BAD_VERSION_CONDITION:
4737 errorcode = ERR79;
4738 goto FAILED;
4739 }
4740
4741
4742
4743 /*************************************************
4744 * Find first significant opcode *
4745 *************************************************/
4746
4747 /* This is called by several functions that scan a compiled expression looking
4748 for a fixed first character, or an anchoring opcode etc. It skips over things
4749 that do not influence this. For some calls, it makes sense to skip negative
4750 forward and all backward assertions, and also the \b assertion; for others it
4751 does not.
4752
4753 Arguments:
4754 code pointer to the start of the group
4755 skipassert TRUE if certain assertions are to be skipped
4756
4757 Returns: pointer to the first significant opcode
4758 */
4759
4760 static const PCRE2_UCHAR*
first_significant_code(PCRE2_SPTR code,BOOL skipassert)4761 first_significant_code(PCRE2_SPTR code, BOOL skipassert)
4762 {
4763 for (;;)
4764 {
4765 switch ((int)*code)
4766 {
4767 case OP_ASSERT_NOT:
4768 case OP_ASSERTBACK:
4769 case OP_ASSERTBACK_NOT:
4770 case OP_ASSERTBACK_NA:
4771 if (!skipassert) return code;
4772 do code += GET(code, 1); while (*code == OP_ALT);
4773 code += PRIV(OP_lengths)[*code];
4774 break;
4775
4776 case OP_WORD_BOUNDARY:
4777 case OP_NOT_WORD_BOUNDARY:
4778 if (!skipassert) return code;
4779 /* Fall through */
4780
4781 case OP_CALLOUT:
4782 case OP_CREF:
4783 case OP_DNCREF:
4784 case OP_RREF:
4785 case OP_DNRREF:
4786 case OP_FALSE:
4787 case OP_TRUE:
4788 code += PRIV(OP_lengths)[*code];
4789 break;
4790
4791 case OP_CALLOUT_STR:
4792 code += GET(code, 1 + 2*LINK_SIZE);
4793 break;
4794
4795 case OP_SKIPZERO:
4796 code += 2 + GET(code, 2) + LINK_SIZE;
4797 break;
4798
4799 case OP_COND:
4800 case OP_SCOND:
4801 if (code[1+LINK_SIZE] != OP_FALSE || /* Not DEFINE */
4802 code[GET(code, 1)] != OP_KET) /* More than one branch */
4803 return code;
4804 code += GET(code, 1) + 1 + LINK_SIZE;
4805 break;
4806
4807 case OP_MARK:
4808 case OP_COMMIT_ARG:
4809 case OP_PRUNE_ARG:
4810 case OP_SKIP_ARG:
4811 case OP_THEN_ARG:
4812 code += code[1] + PRIV(OP_lengths)[*code];
4813 break;
4814
4815 default:
4816 return code;
4817 }
4818 }
4819 /* Control never reaches here */
4820 }
4821
4822
4823
4824 #ifdef SUPPORT_UNICODE
/*************************************************
*               Get othercase range              *
*************************************************/

/* This function is passed the start and end of a class range in UCP mode. It
searches up the characters, looking for ranges of characters in the "other"
case. Each call returns the next one, updating the start address. A character
with multiple other cases is returned on its own with a special return value.

No code point above the Unicode maximum (0x10ffff) has an other case, so the
search stops as soon as it passes that value. Without this, a range whose end
is greater than 0x10ffff (possible for 32-bit code units when not in UTF mode)
would be scanned value by value, and when the end is 0xffffffff the scanning
variable would wrap round to zero instead of ever exceeding the end.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        -1 when no more
               0 when a range is returned
              >0 the CASESET offset for char with multiple other cases
                in this case, ocptr contains the original
*/

static int
get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr,
  uint32_t *odptr)
{
uint32_t c, othercase, next;
unsigned int co;

/* Find the first character that has an other case. If it has multiple other
cases, return its case offset value. A value greater than the Unicode maximum
ends the search, because nothing from there onwards can have an other case. */

for (c = *cptr; c <= d; c++)
  {
  if (c > 0x10ffffu) return -1;     /* Beyond all cased characters */
  if ((co = UCD_CASESET(c)) != 0)
    {
    *ocptr = c++;   /* Character that has the set */
    *cptr = c;      /* Rest of input range */
    return (int)co;
    }
  if ((othercase = UCD_OTHERCASE(c)) != c) break;
  }

if (c > d) return -1;  /* Reached end of range */

/* Found a character that has a single other case. Search for the end of the
range, which is either the end of the input range, or a character that has zero
or more than one other cases. This loop needs no check against the Unicode
maximum: it stops at the first character whose other case is not the expected
next value, and a character without an other case can never satisfy that. */

*ocptr = othercase;
next = othercase + 1;

for (++c; c <= d; c++)
  {
  if (UCD_CASESET(c) != 0 || UCD_OTHERCASE(c) != next) break;
  next++;
  }

*odptr = next - 1;     /* End of othercase range */
*cptr = c;             /* Rest of input range */
return 0;
}
4886 #endif /* SUPPORT_UNICODE */
4887
4888
4889
/*************************************************
* Add a character or range to a class (internal) *
*************************************************/

/* This function packages up the logic of adding a character or range of
characters to a class. The character values in the arguments will be within the
valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
called only from within the "add to class" group of functions, some of which
are recursive and mutually recursive. The external entry point is
add_to_class().

Characters below 256 are recorded by setting bits in the bit map. Characters
from 256 upwards are recorded, when wide character support is compiled in, by
appending XCL_SINGLE or XCL_RANGE items to the extra data. When caseless
matching is requested, the other cases of the characters are added as well,
either by widening the range being added or by recursive calls.

Arguments:
  classbits     the bit map for characters < 256
  uchardptr     points to the pointer for extra data
  options       the options word
  cb            compile data
  start         start of range character
  end           end of range character

Returns:        the number of < 256 characters added
                the pointer to extra data is updated
*/

static unsigned int
add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
  uint32_t options, compile_block *cb, uint32_t start, uint32_t end)
{
uint32_t c;
uint32_t classbits_end = (end <= 0xff ? end : 0xff);  /* Last bit map character */
unsigned int n8 = 0;

/* If caseless matching is required, scan the range and process alternate
cases. In Unicode, there are 8-bit characters that have alternate cases that
are greater than 255 and vice-versa. Sometimes we can just extend the original
range. */

if ((options & PCRE2_CASELESS) != 0)
  {
#ifdef SUPPORT_UNICODE
  if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
    {
    int rc;
    uint32_t oc, od;

    options &= ~PCRE2_CASELESS;   /* Remove for recursive calls */
    c = start;

    /* Each call yields the next other case item for the range and moves c on
    past the characters it has dealt with; a negative value means there are no
    more. Note that end may have been increased by an earlier iteration. */

    while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
      {
      /* Handle a single character that has more than one other case. The
      character itself (in oc) is passed as the one to omit from the list. */

      if (rc > 0) n8 += add_list_to_class_internal(classbits, uchardptr, options, cb,
        PRIV(ucd_caseless_sets) + rc, oc);

      /* Do nothing if the other case range is within the original range. */

      else if (oc >= cb->class_range_start && od <= cb->class_range_end) continue;

      /* Extend the original range if there is overlap, noting that if oc < c, we
      can't have od > end because a subrange is always shorter than the basic
      range. Otherwise, use a recursive call to add the additional range. */

      else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
      else if (od > end && oc <= end + 1)
        {
        end = od;       /* Extend upwards */
        if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
        }
      else n8 += add_to_class_internal(classbits, uchardptr, options, cb, oc, od);
      }
    }
  else
#endif  /* SUPPORT_UNICODE */

  /* Neither UTF nor UCP is set, or there is no Unicode support. For each
  character below 256, set the bit for the character that the cb->fcc table
  maps it to. This loop is the "else" part of the test above when Unicode
  support is compiled. */

  for (c = start; c <= classbits_end; c++)
    {
    SETBIT(classbits, cb->fcc[c]);
    n8++;
    }
  }

/* Now handle the originally supplied range. Adjust the final value according
to the bit length - this means that the same lists of (e.g.) horizontal spaces
can be used in all cases. */

if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR)
  end = MAX_NON_UTF_CHAR;

/* Nothing more is added for a range that lies strictly inside the overall
range that was recorded in the compile block by the external entry point. */

if (start > cb->class_range_start && end < cb->class_range_end) return n8;

/* Use the bitmap for characters < 256. Otherwise use extra data.*/

for (c = start; c <= classbits_end; c++)
  {
  /* Regardless of start, c will always be <= 255. */
  SETBIT(classbits, c);
  n8++;
  }

#ifdef SUPPORT_WIDE_CHARS
/* What is below 256 has been dealt with; only the part of the range from 256
upwards, if there is any, goes into the extra data. */

if (start <= 0xff) start = 0xff + 1;

if (end >= start)
  {
  PCRE2_UCHAR *uchardata = *uchardptr;

#ifdef SUPPORT_UNICODE
  if ((options & PCRE2_UTF) != 0)
    {
    if (start < end)
      {
      *uchardata++ = XCL_RANGE;
      uchardata += PRIV(ord2utf)(start, uchardata);
      uchardata += PRIV(ord2utf)(end, uchardata);
      }
    else if (start == end)
      {
      *uchardata++ = XCL_SINGLE;
      uchardata += PRIV(ord2utf)(start, uchardata);
      }
    }
  else
#endif  /* SUPPORT_UNICODE */

  /* Without UTF support, character values are constrained by the bit length,
  and can only be > 256 for 16-bit and 32-bit libraries. The empty block in the
  8-bit case is there to complete the "else" above. */

#if PCRE2_CODE_UNIT_WIDTH == 8
    {}
#else
  if (start < end)
    {
    *uchardata++ = XCL_RANGE;
    *uchardata++ = start;
    *uchardata++ = end;
    }
  else if (start == end)
    {
    *uchardata++ = XCL_SINGLE;
    *uchardata++ = start;
    }
#endif  /* PCRE2_CODE_UNIT_WIDTH == 8 */
  *uchardptr = uchardata;   /* Update extra data pointer */
  }
#else  /* SUPPORT_WIDE_CHARS */
  (void)uchardptr;    /* Avoid compiler warning */
#endif /* SUPPORT_WIDE_CHARS */

return n8;    /* Number of 8-bit characters */
}
5042
5043
5044
5045 #ifdef SUPPORT_UNICODE
5046 /*************************************************
5047 * Add a list of characters to a class (internal) *
5048 *************************************************/
5049
5050 /* This function is used for adding a list of case-equivalent characters to a
5051 class when in UTF mode. This function is called only from within
5052 add_to_class_internal(), with which it is mutually recursive.
5053
5054 Arguments:
5055 classbits the bit map for characters < 256
5056 uchardptr points to the pointer for extra data
5057 options the options word
5058 cb contains pointers to tables etc.
5059 p points to row of 32-bit values, terminated by NOTACHAR
5060 except character to omit; this is used when adding lists of
5061 case-equivalent characters to avoid including the one we
5062 already know about
5063
5064 Returns: the number of < 256 characters added
5065 the pointer to extra data is updated
5066 */
5067
5068 static unsigned int
add_list_to_class_internal(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,const uint32_t * p,unsigned int except)5069 add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5070 uint32_t options, compile_block *cb, const uint32_t *p, unsigned int except)
5071 {
5072 unsigned int n8 = 0;
5073 while (p[0] < NOTACHAR)
5074 {
5075 unsigned int n = 0;
5076 if (p[0] != except)
5077 {
5078 while(p[n+1] == p[0] + n + 1) n++;
5079 n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]);
5080 }
5081 p += n + 1;
5082 }
5083 return n8;
5084 }
5085 #endif
5086
5087
5088
5089 /*************************************************
5090 * External entry point for add range to class *
5091 *************************************************/
5092
5093 /* This function sets the overall range so that the internal functions can try
5094 to avoid duplication when handling case-independence.
5095
5096 Arguments:
5097 classbits the bit map for characters < 256
5098 uchardptr points to the pointer for extra data
5099 options the options word
5100 cb compile data
5101 start start of range character
5102 end end of range character
5103
5104 Returns: the number of < 256 characters added
5105 the pointer to extra data is updated
5106 */
5107
5108 static unsigned int
add_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,uint32_t start,uint32_t end)5109 add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5110 compile_block *cb, uint32_t start, uint32_t end)
5111 {
5112 cb->class_range_start = start;
5113 cb->class_range_end = end;
5114 return add_to_class_internal(classbits, uchardptr, options, cb, start, end);
5115 }
5116
5117
5118 /*************************************************
5119 * External entry point for add list to class *
5120 *************************************************/
5121
5122 /* This function is used for adding a list of horizontal or vertical whitespace
5123 characters to a class. The list must be in order so that ranges of characters
5124 can be detected and handled appropriately. This function sets the overall range
5125 so that the internal functions can try to avoid duplication when handling
5126 case-independence.
5127
5128 Arguments:
5129 classbits the bit map for characters < 256
5130 uchardptr points to the pointer for extra data
5131 options the options word
5132 cb contains pointers to tables etc.
5133 p points to row of 32-bit values, terminated by NOTACHAR
5134 except character to omit; this is used when adding lists of
5135 case-equivalent characters to avoid including the one we
5136 already know about
5137
5138 Returns: the number of < 256 characters added
5139 the pointer to extra data is updated
5140 */
5141
5142 static unsigned int
add_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,const uint32_t * p,unsigned int except)5143 add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5144 compile_block *cb, const uint32_t *p, unsigned int except)
5145 {
5146 unsigned int n8 = 0;
5147 while (p[0] < NOTACHAR)
5148 {
5149 unsigned int n = 0;
5150 if (p[0] != except)
5151 {
5152 while(p[n+1] == p[0] + n + 1) n++;
5153 cb->class_range_start = p[0];
5154 cb->class_range_end = p[n];
5155 n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]);
5156 }
5157 p += n + 1;
5158 }
5159 return n8;
5160 }
5161
5162
5163
5164 /*************************************************
5165 * Add characters not in a list to a class *
5166 *************************************************/
5167
5168 /* This function is used for adding the complement of a list of horizontal or
5169 vertical whitespace to a class. The list must be in order.
5170
5171 Arguments:
5172 classbits the bit map for characters < 256
5173 uchardptr points to the pointer for extra data
5174 options the options word
5175 cb contains pointers to tables etc.
5176 p points to row of 32-bit values, terminated by NOTACHAR
5177
5178 Returns: the number of < 256 characters added
5179 the pointer to extra data is updated
5180 */
5181
5182 static unsigned int
add_not_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,const uint32_t * p)5183 add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5184 uint32_t options, compile_block *cb, const uint32_t *p)
5185 {
5186 BOOL utf = (options & PCRE2_UTF) != 0;
5187 unsigned int n8 = 0;
5188 if (p[0] > 0)
5189 n8 += add_to_class(classbits, uchardptr, options, cb, 0, p[0] - 1);
5190 while (p[0] < NOTACHAR)
5191 {
5192 while (p[1] == p[0] + 1) p++;
5193 n8 += add_to_class(classbits, uchardptr, options, cb, p[0] + 1,
5194 (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
5195 p++;
5196 }
5197 return n8;
5198 }
5199
5200
5201
5202 /*************************************************
5203 * Find details of duplicate group names *
5204 *************************************************/
5205
5206 /* This is called from compile_branch() when it needs to know the index and
5207 count of duplicates in the names table when processing named backreferences,
5208 either directly, or as conditions.
5209
5210 Arguments:
5211 name points to the name
5212 length the length of the name
5213 indexptr where to put the index
5214 countptr where to put the count of duplicates
5215 errorcodeptr where to put an error code
5216 cb the compile block
5217
5218 Returns: TRUE if OK, FALSE if not, error code set
5219 */
5220
5221 static BOOL
find_dupname_details(PCRE2_SPTR name,uint32_t length,int * indexptr,int * countptr,int * errorcodeptr,compile_block * cb)5222 find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr,
5223 int *countptr, int *errorcodeptr, compile_block *cb)
5224 {
5225 uint32_t i, groupnumber;
5226 int count;
5227 PCRE2_UCHAR *slot = cb->name_table;
5228
5229 /* Find the first entry in the table */
5230
5231 for (i = 0; i < cb->names_found; i++)
5232 {
5233 if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) == 0 &&
5234 slot[IMM2_SIZE+length] == 0) break;
5235 slot += cb->name_entry_size;
5236 }
5237
5238 /* This should not occur, because this function is called only when we know we
5239 have duplicate names. Give an internal error. */
5240
5241 if (i >= cb->names_found)
5242 {
5243 *errorcodeptr = ERR53;
5244 cb->erroroffset = name - cb->start_pattern;
5245 return FALSE;
5246 }
5247
5248 /* Record the index and then see how many duplicates there are, updating the
5249 backref map and maximum back reference as we do. */
5250
5251 *indexptr = i;
5252 count = 0;
5253
5254 for (;;)
5255 {
5256 count++;
5257 groupnumber = GET2(slot,0);
5258 cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
5259 if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
5260 if (++i >= cb->names_found) break;
5261 slot += cb->name_entry_size;
5262 if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) != 0 ||
5263 (slot+IMM2_SIZE)[length] != 0) break;
5264 }
5265
5266 *countptr = count;
5267 return TRUE;
5268 }
5269
5270
5271
5272 /*************************************************
5273 * Compile one branch *
5274 *************************************************/
5275
/* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If
5277 the options are changed during the branch, the pointer is used to change the
5278 external options bits. This function is used during the pre-compile phase when
5279 we are trying to find out the amount of memory needed, as well as during the
5280 real compile phase. The value of lengthptr distinguishes the two phases.
5281
5282 Arguments:
5283 optionsptr pointer to the option bits
5284 codeptr points to the pointer to the current code point
5285 pptrptr points to the current parsed pattern pointer
5286 errorcodeptr points to error code variable
5287 firstcuptr place to put the first required code unit
5288 firstcuflagsptr place to put the first code unit flags, or a negative number
5289 reqcuptr place to put the last required code unit
5290 reqcuflagsptr place to put the last required code unit flags, or a negative number
5291 bcptr points to current branch chain
5292 cb contains pointers to tables etc.
5293 lengthptr NULL during the real compile phase
5294 points to length accumulator during pre-compile phase
5295
5296 Returns: 0 There's been an error, *errorcodeptr is non-zero
5297 +1 Success, this branch must match at least one character
5298 -1 Success, this branch may match an empty string
5299 */
5300
5301 static int
compile_branch(uint32_t * optionsptr,PCRE2_UCHAR ** codeptr,uint32_t ** pptrptr,int * errorcodeptr,uint32_t * firstcuptr,int32_t * firstcuflagsptr,uint32_t * reqcuptr,int32_t * reqcuflagsptr,branch_chain * bcptr,compile_block * cb,PCRE2_SIZE * lengthptr)5302 compile_branch(uint32_t *optionsptr, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
5303 int *errorcodeptr, uint32_t *firstcuptr, int32_t *firstcuflagsptr,
5304 uint32_t *reqcuptr, int32_t *reqcuflagsptr, branch_chain *bcptr,
5305 compile_block *cb, PCRE2_SIZE *lengthptr)
5306 {
5307 int bravalue = 0;
5308 int okreturn = -1;
5309 int group_return = 0;
5310 uint32_t repeat_min = 0, repeat_max = 0; /* To please picky compilers */
5311 uint32_t greedy_default, greedy_non_default;
5312 uint32_t repeat_type, op_type;
5313 uint32_t options = *optionsptr; /* May change dynamically */
5314 uint32_t firstcu, reqcu;
5315 uint32_t zeroreqcu, zerofirstcu;
5316 uint32_t escape;
5317 uint32_t *pptr = *pptrptr;
5318 uint32_t meta, meta_arg;
5319 int32_t firstcuflags, reqcuflags;
5320 int32_t zeroreqcuflags, zerofirstcuflags;
5321 int32_t req_caseopt, reqvary, tempreqvary;
5322 PCRE2_SIZE offset = 0;
5323 PCRE2_SIZE length_prevgroup = 0;
5324 PCRE2_UCHAR *code = *codeptr;
5325 PCRE2_UCHAR *last_code = code;
5326 PCRE2_UCHAR *orig_code = code;
5327 PCRE2_UCHAR *tempcode;
5328 PCRE2_UCHAR *previous = NULL;
5329 PCRE2_UCHAR op_previous;
5330 BOOL groupsetfirstcu = FALSE;
5331 BOOL had_accept = FALSE;
5332 BOOL matched_char = FALSE;
5333 BOOL previous_matched_char = FALSE;
5334 BOOL reset_caseful = FALSE;
5335 const uint8_t *cbits = cb->cbits;
5336 uint8_t classbits[32];
5337
5338 /* We can fish out the UTF setting once and for all into a BOOL, but we must
5339 not do this for other options (e.g. PCRE2_EXTENDED) because they may change
5340 dynamically as we process the pattern. */
5341
5342 #ifdef SUPPORT_UNICODE
5343 BOOL utf = (options & PCRE2_UTF) != 0;
5344 BOOL ucp = (options & PCRE2_UCP) != 0;
5345 #else /* No Unicode support */
5346 BOOL utf = FALSE;
5347 #endif
5348
5349 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
5350 class_uchardata always so that it can be passed to add_to_class() always,
5351 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
5352 alternative calls for the different cases. */
5353
5354 PCRE2_UCHAR *class_uchardata;
5355 #ifdef SUPPORT_WIDE_CHARS
5356 BOOL xclass;
5357 PCRE2_UCHAR *class_uchardata_base;
5358 #endif
5359
5360 /* Set up the default and non-default settings for greediness */
5361
5362 greedy_default = ((options & PCRE2_UNGREEDY) != 0);
5363 greedy_non_default = greedy_default ^ 1;
5364
5365 /* Initialize no first unit, no required unit. REQ_UNSET means "no char
5366 matching encountered yet". It gets changed to REQ_NONE if we hit something that
5367 matches a non-fixed first unit; reqcu just remains unset if we never find one.
5368
5369 When we hit a repeat whose minimum is zero, we may have to adjust these values
5370 to take the zero repeat into account. This is implemented by setting them to
5371 zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
5372 item types that can be repeated set these backoff variables appropriately. */
5373
5374 firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
5375 firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
5376
5377 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
5378 according to the current setting of the caseless flag. The REQ_CASELESS value
leaves the lower 28 bits empty. It is added into the firstcu or reqcu variables
5380 to record the case status of the value. This is used only for ASCII characters.
5381 */
5382
5383 req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS:0;
5384
5385 /* Switch on next META item until the end of the branch */
5386
5387 for (;; pptr++)
5388 {
5389 #ifdef SUPPORT_WIDE_CHARS
5390 BOOL xclass_has_prop;
5391 #endif
5392 BOOL negate_class;
5393 BOOL should_flip_negation;
5394 BOOL match_all_or_no_wide_chars;
5395 BOOL possessive_quantifier;
5396 BOOL note_group_empty;
5397 int class_has_8bitchar;
5398 int i;
5399 uint32_t mclength;
5400 uint32_t skipunits;
5401 uint32_t subreqcu, subfirstcu;
5402 uint32_t groupnumber;
5403 uint32_t verbarglen, verbculen;
5404 int32_t subreqcuflags, subfirstcuflags; /* Must be signed */
5405 open_capitem *oc;
5406 PCRE2_UCHAR mcbuffer[8];
5407
5408 /* Get next META item in the pattern and its potential argument. */
5409
5410 meta = META_CODE(*pptr);
5411 meta_arg = META_DATA(*pptr);
5412
5413 /* If we are in the pre-compile phase, accumulate the length used for the
5414 previous cycle of this loop, unless the next item is a quantifier. */
5415
5416 if (lengthptr != NULL)
5417 {
5418 if (code > cb->start_workspace + cb->workspace_size -
5419 WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
5420 {
5421 *errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)?
5422 ERR52 : ERR86;
5423 return 0;
5424 }
5425
5426 /* There is at least one situation where code goes backwards: this is the
5427 case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier
5428 is processed, the whole class is eliminated. However, it is created first,
5429 so we have to allow memory for it. Therefore, don't ever reduce the length
5430 at this point. */
5431
5432 if (code < last_code) code = last_code;
5433
5434 /* If the next thing is not a quantifier, we add the length of the previous
5435 item into the total, and reset the code pointer to the start of the
5436 workspace. Otherwise leave the previous item available to be quantified. */
5437
5438 if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5439 {
5440 if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))
5441 {
5442 *errorcodeptr = ERR20; /* Integer overflow */
5443 return 0;
5444 }
5445 *lengthptr += (PCRE2_SIZE)(code - orig_code);
5446 if (*lengthptr > MAX_PATTERN_SIZE)
5447 {
5448 *errorcodeptr = ERR20; /* Pattern is too large */
5449 return 0;
5450 }
5451 code = orig_code;
5452 }
5453
5454 /* Remember where this code item starts so we can catch the "backwards"
5455 case above next time round. */
5456
5457 last_code = code;
5458 }
5459
5460 /* Process the next parsed pattern item. If it is not a quantifier, remember
5461 where it starts so that it can be quantified when a quantifier follows.
5462 Checking for the legality of quantifiers happens in parse_regex(), except for
5463 a quantifier after an assertion that is a condition. */
5464
5465 if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5466 {
5467 previous = code;
5468 if (matched_char && !had_accept) okreturn = 1;
5469 }
5470
5471 previous_matched_char = matched_char;
5472 matched_char = FALSE;
5473 note_group_empty = FALSE;
5474 skipunits = 0; /* Default value for most subgroups */
5475
5476 switch(meta)
5477 {
5478 /* ===================================================================*/
5479 /* The branch terminates at pattern end or | or ) */
5480
5481 case META_END:
5482 case META_ALT:
5483 case META_KET:
5484 *firstcuptr = firstcu;
5485 *firstcuflagsptr = firstcuflags;
5486 *reqcuptr = reqcu;
5487 *reqcuflagsptr = reqcuflags;
5488 *codeptr = code;
5489 *pptrptr = pptr;
5490 return okreturn;
5491
5492
5493 /* ===================================================================*/
5494 /* Handle single-character metacharacters. In multiline mode, ^ disables
5495 the setting of any following char as a first character. */
5496
5497 case META_CIRCUMFLEX:
5498 if ((options & PCRE2_MULTILINE) != 0)
5499 {
5500 if (firstcuflags == REQ_UNSET)
5501 zerofirstcuflags = firstcuflags = REQ_NONE;
5502 *code++ = OP_CIRCM;
5503 }
5504 else *code++ = OP_CIRC;
5505 break;
5506
5507 case META_DOLLAR:
5508 *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
5509 break;
5510
5511 /* There can never be a first char if '.' is first, whatever happens about
5512 repeats. The value of reqcu doesn't change either. */
5513
5514 case META_DOT:
5515 matched_char = TRUE;
5516 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5517 zerofirstcu = firstcu;
5518 zerofirstcuflags = firstcuflags;
5519 zeroreqcu = reqcu;
5520 zeroreqcuflags = reqcuflags;
5521 *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
5522 break;
5523
5524
5525 /* ===================================================================*/
5526 /* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.
5527 Otherwise, an initial ']' is taken as a data character. When empty classes
5528 are allowed, [] must always fail, so generate OP_FAIL, whereas [^] must
5529 match any character, so generate OP_ALLANY. */
5530
5531 case META_CLASS_EMPTY:
5532 case META_CLASS_EMPTY_NOT:
5533 matched_char = TRUE;
5534 *code++ = (meta == META_CLASS_EMPTY_NOT)? OP_ALLANY : OP_FAIL;
5535 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5536 zerofirstcu = firstcu;
5537 zerofirstcuflags = firstcuflags;
5538 break;
5539
5540
5541 /* ===================================================================*/
5542 /* Non-empty character class. If the included characters are all < 256, we
5543 build a 32-byte bitmap of the permitted characters, except in the special
5544 case where there is only one such character. For negated classes, we build
5545 the map as usual, then invert it at the end. However, we use a different
5546 opcode so that data characters > 255 can be handled correctly.
5547
5548 If the class contains characters outside the 0-255 range, a different
5549 opcode is compiled. It may optionally have a bit map for characters < 256,
    but those above are explicitly listed afterwards. A flag code unit
5551 tells whether the bitmap is present, and whether this is a negated class or
5552 not. */
5553
5554 case META_CLASS_NOT:
5555 case META_CLASS:
5556 matched_char = TRUE;
5557 negate_class = meta == META_CLASS_NOT;
5558
5559 /* We can optimize the case of a single character in a class by generating
5560 OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's
5561 negative. In the negative case there can be no first char if this item is
5562 first, whatever repeat count may follow. In the case of reqcu, save the
5563 previous value for reinstating. */
5564
5565 /* NOTE: at present this optimization is not effective if the only
5566 character in a class in 32-bit, non-UCP mode has its top bit set. */
5567
5568 if (pptr[1] < META_END && pptr[2] == META_CLASS_END)
5569 {
5570 #ifdef SUPPORT_UNICODE
5571 uint32_t d;
5572 #endif
5573 uint32_t c = pptr[1];
5574
5575 pptr += 2; /* Move on to class end */
5576 if (meta == META_CLASS) /* A positive one-char class can be */
5577 { /* handled as a normal literal character. */
5578 meta = c; /* Set up the character */
5579 goto NORMAL_CHAR_SET;
5580 }
5581
5582 /* Handle a negative one-character class */
5583
5584 zeroreqcu = reqcu;
5585 zeroreqcuflags = reqcuflags;
5586 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5587 zerofirstcu = firstcu;
5588 zerofirstcuflags = firstcuflags;
5589
5590 /* For caseless UTF or UCP mode, check whether this character has more
5591 than one other case. If so, generate a special OP_NOTPROP item instead of
5592 OP_NOTI. */
5593
5594 #ifdef SUPPORT_UNICODE
5595 if ((utf||ucp) && (options & PCRE2_CASELESS) != 0 &&
5596 (d = UCD_CASESET(c)) != 0)
5597 {
5598 *code++ = OP_NOTPROP;
5599 *code++ = PT_CLIST;
5600 *code++ = d;
5601 break; /* We are finished with this class */
5602 }
5603 #endif
5604 /* Char has only one other case, or UCP not available */
5605
5606 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
5607 code += PUTCHAR(c, code);
5608 break; /* We are finished with this class */
5609 } /* End of 1-char optimization */
5610
5611 /* Handle character classes that contain more than just one literal
5612 character. If there are exactly two characters in a positive class, see if
5613 they are case partners. This can be optimized to generate a caseless single
5614 character match (which also sets first/required code units if relevant). */
5615
5616 if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END &&
5617 pptr[3] == META_CLASS_END)
5618 {
5619 uint32_t c = pptr[1];
5620
5621 #ifdef SUPPORT_UNICODE
5622 if (UCD_CASESET(c) == 0)
5623 #endif
5624 {
5625 uint32_t d;
5626
5627 #ifdef SUPPORT_UNICODE
5628 if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
5629 #endif
5630 {
5631 #if PCRE2_CODE_UNIT_WIDTH != 8
5632 if (c > 255) d = c; else
5633 #endif
5634 d = TABLE_GET(c, cb->fcc, c);
5635 }
5636
5637 if (c != d && pptr[2] == d)
5638 {
5639 pptr += 3; /* Move on to class end */
5640 meta = c;
5641 if ((options & PCRE2_CASELESS) == 0)
5642 {
5643 reset_caseful = TRUE;
5644 options |= PCRE2_CASELESS;
5645 req_caseopt = REQ_CASELESS;
5646 }
5647 goto CLASS_CASELESS_CHAR;
5648 }
5649 }
5650 }
5651
5652 /* If a non-extended class contains a negative special such as \S, we need
5653 to flip the negation flag at the end, so that support for characters > 255
5654 works correctly (they are all included in the class). An extended class may
5655 need to insert specific matching or non-matching code for wide characters.
5656 */
5657
5658 should_flip_negation = match_all_or_no_wide_chars = FALSE;
5659
5660 /* Extended class (xclass) will be used when characters > 255
5661 might match. */
5662
5663 #ifdef SUPPORT_WIDE_CHARS
5664 xclass = FALSE;
5665 class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
5666 class_uchardata_base = class_uchardata; /* Save the start */
5667 #endif
5668
5669 /* For optimization purposes, we track some properties of the class:
5670 class_has_8bitchar will be non-zero if the class contains at least one
5671 character with a code point less than 256; xclass_has_prop will be TRUE if
5672 Unicode property checks are present in the class. */
5673
5674 class_has_8bitchar = 0;
5675 #ifdef SUPPORT_WIDE_CHARS
5676 xclass_has_prop = FALSE;
5677 #endif
5678
5679 /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
5680 in a temporary bit of memory, in case the class contains fewer than two
5681 8-bit characters because in that case the compiled code doesn't use the bit
5682 map. */
5683
5684 memset(classbits, 0, 32 * sizeof(uint8_t));
5685
5686 /* Process items until META_CLASS_END is reached. */
5687
5688 while ((meta = *(++pptr)) != META_CLASS_END)
5689 {
5690 /* Handle POSIX classes such as [:alpha:] etc. */
5691
5692 if (meta == META_POSIX || meta == META_POSIX_NEG)
5693 {
5694 BOOL local_negate = (meta == META_POSIX_NEG);
5695 int posix_class = *(++pptr);
5696 int taboffset, tabopt;
5697 uint8_t pbits[32];
5698
5699 should_flip_negation = local_negate; /* Note negative special */
5700
5701 /* If matching is caseless, upper and lower are converted to alpha.
5702 This relies on the fact that the class table starts with alpha,
5703 lower, upper as the first 3 entries. */
5704
5705 if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)
5706 posix_class = 0;
5707
5708 /* When PCRE2_UCP is set, some of the POSIX classes are converted to
5709 different escape sequences that use Unicode properties \p or \P.
5710 Others that are not available via \p or \P have to generate
5711 XCL_PROP/XCL_NOTPROP directly, which is done here. */
5712
5713 #ifdef SUPPORT_UNICODE
5714 if ((options & PCRE2_UCP) != 0) switch(posix_class)
5715 {
5716 case PC_GRAPH:
5717 case PC_PRINT:
5718 case PC_PUNCT:
5719 *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
5720 *class_uchardata++ = (PCRE2_UCHAR)
5721 ((posix_class == PC_GRAPH)? PT_PXGRAPH :
5722 (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT);
5723 *class_uchardata++ = 0;
5724 xclass_has_prop = TRUE;
5725 goto CONTINUE_CLASS;
5726
5727 /* For the other POSIX classes (ascii, xdigit) we are going to
5728 fall through to the non-UCP case and build a bit map for
5729 characters with code points less than 256. However, if we are in
5730 a negated POSIX class, characters with code points greater than
5731 255 must either all match or all not match, depending on whether
5732 the whole class is not or is negated. For example, for
5733 [[:^ascii:]... they must all match, whereas for [^[:^xdigit:]...
5734 they must not.
5735
5736 In the special case where there are no xclass items, this is
5737 automatically handled by the use of OP_CLASS or OP_NCLASS, but an
5738 explicit range is needed for OP_XCLASS. Setting a flag here
5739 causes the range to be generated later when it is known that
5740 OP_XCLASS is required. In the 8-bit library this is relevant only in
5741 utf mode, since no wide characters can exist otherwise. */
5742
5743 default:
5744 #if PCRE2_CODE_UNIT_WIDTH == 8
5745 if (utf)
5746 #endif
5747 match_all_or_no_wide_chars |= local_negate;
5748 break;
5749 }
5750 #endif /* SUPPORT_UNICODE */
5751
5752 /* In the non-UCP case, or when UCP makes no difference, we build the
5753 bit map for the POSIX class in a chunk of local store because we may
5754 be adding and subtracting from it, and we don't want to subtract bits
5755 that may be in the main map already. At the end we or the result into
5756 the bit map that is being built. */
5757
5758 posix_class *= 3;
5759
5760 /* Copy in the first table (always present) */
5761
5762 memcpy(pbits, cbits + posix_class_maps[posix_class],
5763 32 * sizeof(uint8_t));
5764
5765 /* If there is a second table, add or remove it as required. */
5766
5767 taboffset = posix_class_maps[posix_class + 1];
5768 tabopt = posix_class_maps[posix_class + 2];
5769
5770 if (taboffset >= 0)
5771 {
5772 if (tabopt >= 0)
5773 for (i = 0; i < 32; i++) pbits[i] |= cbits[(int)i + taboffset];
5774 else
5775 for (i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset];
5776 }
5777
5778 /* Now see if we need to remove any special characters. An option
5779 value of 1 removes vertical space and 2 removes underscore. */
5780
5781 if (tabopt < 0) tabopt = -tabopt;
5782 if (tabopt == 1) pbits[1] &= ~0x3c;
5783 else if (tabopt == 2) pbits[11] &= 0x7f;
5784
5785 /* Add the POSIX table or its complement into the main table that is
5786 being built and we are done. */
5787
5788 if (local_negate)
5789 for (i = 0; i < 32; i++) classbits[i] |= ~pbits[i];
5790 else
5791 for (i = 0; i < 32; i++) classbits[i] |= pbits[i];
5792
5793 /* Every class contains at least one < 256 character. */
5794
5795 class_has_8bitchar = 1;
5796 goto CONTINUE_CLASS; /* End of POSIX handling */
5797 }
5798
5799 /* Other than POSIX classes, the only items we should encounter are
5800 \d-type escapes and literal characters (possibly as ranges). */
5801
5802 if (meta == META_BIGVALUE)
5803 {
5804 meta = *(++pptr);
5805 goto CLASS_LITERAL;
5806 }
5807
5808 /* Any other non-literal must be an escape */
5809
5810 if (meta >= META_END)
5811 {
5812 if (META_CODE(meta) != META_ESCAPE)
5813 {
5814 #ifdef DEBUG_SHOW_PARSED
5815 fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x "
5816 "in character class\n", meta);
5817 #endif
5818 *errorcodeptr = ERR89; /* Internal error - unrecognized. */
5819 return 0;
5820 }
5821 escape = META_DATA(meta);
5822
5823 /* Every class contains at least one < 256 character. */
5824
5825 class_has_8bitchar++;
5826
5827 switch(escape)
5828 {
5829 case ESC_d:
5830 for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
5831 break;
5832
5833 case ESC_D:
5834 should_flip_negation = TRUE;
5835 for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_digit];
5836 break;
5837
5838 case ESC_w:
5839 for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
5840 break;
5841
5842 case ESC_W:
5843 should_flip_negation = TRUE;
5844 for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_word];
5845 break;
5846
5847 /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5848 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5849 previously set by something earlier in the character class.
5850 Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5851 we could just adjust the appropriate bit. From PCRE 8.34 we no
5852 longer treat \s and \S specially. */
5853
5854 case ESC_s:
5855 for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
5856 break;
5857
5858 case ESC_S:
5859 should_flip_negation = TRUE;
5860 for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_space];
5861 break;
5862
5863 /* When adding the horizontal or vertical space lists to a class, or
5864 their complements, disable PCRE2_CASELESS, because it just wastes
5865 time, and in the "not-x" UTF cases can create unwanted duplicates in
5866 the XCLASS list (provoked by characters that have more than one other
5867 case and by both cases being in the same "not-x" sublist). */
5868
5869 case ESC_h:
5870 (void)add_list_to_class(classbits, &class_uchardata,
5871 options & ~PCRE2_CASELESS, cb, PRIV(hspace_list), NOTACHAR);
5872 break;
5873
5874 case ESC_H:
5875 (void)add_not_list_to_class(classbits, &class_uchardata,
5876 options & ~PCRE2_CASELESS, cb, PRIV(hspace_list));
5877 break;
5878
5879 case ESC_v:
5880 (void)add_list_to_class(classbits, &class_uchardata,
5881 options & ~PCRE2_CASELESS, cb, PRIV(vspace_list), NOTACHAR);
5882 break;
5883
5884 case ESC_V:
5885 (void)add_not_list_to_class(classbits, &class_uchardata,
5886 options & ~PCRE2_CASELESS, cb, PRIV(vspace_list));
5887 break;
5888
5889 /* If Unicode is not supported, \P and \p are not allowed and are
5890 faulted at parse time, so will never appear here. */
5891
5892 #ifdef SUPPORT_UNICODE
5893 case ESC_p:
5894 case ESC_P:
5895 {
5896 uint32_t ptype = *(++pptr) >> 16;
5897 uint32_t pdata = *pptr & 0xffff;
5898 *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP;
5899 *class_uchardata++ = ptype;
5900 *class_uchardata++ = pdata;
5901 xclass_has_prop = TRUE;
5902 class_has_8bitchar--; /* Undo! */
5903 }
5904 break;
5905 #endif
5906 }
5907
5908 goto CONTINUE_CLASS;
5909 } /* End handling \d-type escapes */
5910
5911 /* A literal character may be followed by a range meta. At parse time
5912 there are checks for out-of-order characters, for ranges where the two
5913 characters are equal, and for hyphens that cannot indicate a range. At
5914 this point, therefore, no checking is needed. */
5915
5916 else
5917 {
5918 uint32_t c, d;
5919
5920 CLASS_LITERAL:
5921 c = d = meta;
5922
5923 /* Remember if \r or \n were explicitly used */
5924
5925 if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
5926
5927 /* Process a character range */
5928
5929 if (pptr[1] == META_RANGE_LITERAL || pptr[1] == META_RANGE_ESCAPED)
5930 {
5931 #ifdef EBCDIC
5932 BOOL range_is_literal = (pptr[1] == META_RANGE_LITERAL);
5933 #endif
5934 pptr += 2;
5935 d = *pptr;
5936 if (d == META_BIGVALUE) d = *(++pptr);
5937
5938 /* Remember an explicit \r or \n, and add the range to the class. */
5939
5940 if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
5941
5942 /* In an EBCDIC environment, Perl treats alphabetic ranges specially
5943 because there are holes in the encoding, and simply using the range
5944 A-Z (for example) would include the characters in the holes. This
5945 applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
5946
5947 #ifdef EBCDIC
5948 if (range_is_literal &&
5949 (cb->ctypes[c] & ctype_letter) != 0 &&
5950 (cb->ctypes[d] & ctype_letter) != 0 &&
5951 (c <= CHAR_z) == (d <= CHAR_z))
5952 {
5953 uint32_t uc = (d <= CHAR_z)? 0 : 64;
5954 uint32_t C = c - uc;
5955 uint32_t D = d - uc;
5956
5957 if (C <= CHAR_i)
5958 {
5959 class_has_8bitchar +=
5960 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
5961 ((D < CHAR_i)? D : CHAR_i) + uc);
5962 C = CHAR_j;
5963 }
5964
5965 if (C <= D && C <= CHAR_r)
5966 {
5967 class_has_8bitchar +=
5968 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
5969 ((D < CHAR_r)? D : CHAR_r) + uc);
5970 C = CHAR_s;
5971 }
5972
5973 if (C <= D)
5974 {
5975 class_has_8bitchar +=
5976 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
5977 D + uc);
5978 }
5979 }
5980 else
5981 #endif
5982 /* Not an EBCDIC special range */
5983
5984 class_has_8bitchar +=
5985 add_to_class(classbits, &class_uchardata, options, cb, c, d);
5986 goto CONTINUE_CLASS; /* Go get the next char in the class */
5987 } /* End of range handling */
5988
5989
5990 /* Handle a single character. */
5991
5992 class_has_8bitchar +=
5993 add_to_class(classbits, &class_uchardata, options, cb, meta, meta);
5994 }
5995
5996 /* Continue to the next item in the class. */
5997
5998 CONTINUE_CLASS:
5999
6000 #ifdef SUPPORT_WIDE_CHARS
6001 /* If any wide characters or Unicode properties have been encountered,
6002 set xclass = TRUE. Then, in the pre-compile phase, accumulate the length
6003 of the extra data and reset the pointer. This is so that very large
6004 classes that contain a zillion wide characters or Unicode property tests
6005 do not overwrite the workspace (which is on the stack). */
6006
6007 if (class_uchardata > class_uchardata_base)
6008 {
6009 xclass = TRUE;
6010 if (lengthptr != NULL)
6011 {
6012 *lengthptr += class_uchardata - class_uchardata_base;
6013 class_uchardata = class_uchardata_base;
6014 }
6015 }
6016 #endif
6017
6018 continue; /* Needed to avoid error when not supporting wide chars */
6019 } /* End of main class-processing loop */
6020
6021 /* If this class is the first thing in the branch, there can be no first
6022 char setting, whatever the repeat count. Any reqcu setting must remain
6023 unchanged after any kind of repeat. */
6024
6025 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6026 zerofirstcu = firstcu;
6027 zerofirstcuflags = firstcuflags;
6028 zeroreqcu = reqcu;
6029 zeroreqcuflags = reqcuflags;
6030
6031 /* If there are characters with values > 255, or Unicode property settings
6032 (\p or \P), we have to compile an extended class, with its own opcode,
6033 unless there were no property settings and there was a negated special such
6034 as \S in the class, and PCRE2_UCP is not set, because in that case all
6035 characters > 255 are in or not in the class, so any that were explicitly
6036 given as well can be ignored.
6037
6038 In the UCP case, if certain negated POSIX classes ([:^ascii:] or
6039 [:^xdigit:]) were present in a class, we either have to match or not match
6040 all wide characters (depending on whether the whole class is or is not
6041 negated). This requirement is indicated by match_all_or_no_wide_chars being
6042 true. We do this by including an explicit range, which works in both cases.
6043 This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there
6044 cannot be any wide characters in 8-bit non-UTF mode.
6045
6046 When there *are* properties in a positive UTF-8 or any 16-bit or 32-bit
6047 class where \S etc is present without PCRE2_UCP, causing an extended class
6048 to be compiled, we make sure that all characters > 255 are included by
6049 forcing match_all_or_no_wide_chars to be true.
6050
6051 If, when generating an xclass, there are no characters < 256, we can omit
6052 the bitmap in the actual compiled code. */
6053
6054 #ifdef SUPPORT_WIDE_CHARS /* Defined for 16/32 bits, or 8-bit with Unicode */
6055 if (xclass && (
6056 #ifdef SUPPORT_UNICODE
6057 (options & PCRE2_UCP) != 0 ||
6058 #endif
6059 xclass_has_prop || !should_flip_negation))
6060 {
6061 if (match_all_or_no_wide_chars || (
6062 #if PCRE2_CODE_UNIT_WIDTH == 8
6063 utf &&
6064 #endif
6065 should_flip_negation && !negate_class && (options & PCRE2_UCP) == 0))
6066 {
6067 *class_uchardata++ = XCL_RANGE;
6068 if (utf) /* Will always be utf in the 8-bit library */
6069 {
6070 class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
6071 class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata);
6072 }
6073 else /* Can only happen for the 16-bit & 32-bit libraries */
6074 {
6075 #if PCRE2_CODE_UNIT_WIDTH == 16
6076 *class_uchardata++ = 0x100;
6077 *class_uchardata++ = 0xffffu;
6078 #elif PCRE2_CODE_UNIT_WIDTH == 32
6079 *class_uchardata++ = 0x100;
6080 *class_uchardata++ = 0xffffffffu;
6081 #endif
6082 }
6083 }
6084 *class_uchardata++ = XCL_END; /* Marks the end of extra data */
6085 *code++ = OP_XCLASS;
6086 code += LINK_SIZE;
6087 *code = negate_class? XCL_NOT:0;
6088 if (xclass_has_prop) *code |= XCL_HASPROP;
6089
6090 /* If the map is required, move up the extra data to make room for it;
6091 otherwise just move the code pointer to the end of the extra data. */
6092
6093 if (class_has_8bitchar > 0)
6094 {
6095 *code++ |= XCL_MAP;
6096 (void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,
6097 CU2BYTES(class_uchardata - code));
6098 if (negate_class && !xclass_has_prop)
6099 {
6100 /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
6101 for (i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
6102 }
6103 memcpy(code, classbits, 32);
6104 code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
6105 }
6106 else code = class_uchardata;
6107
6108 /* Now fill in the complete length of the item */
6109
6110 PUT(previous, 1, (int)(code - previous));
6111 break; /* End of class handling */
6112 }
6113 #endif /* SUPPORT_WIDE_CHARS */
6114
6115 /* If there are no characters > 255, or they are all to be included or
6116 excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
6117 whole class was negated and whether there were negative specials such as \S
6118 (non-UCP) in the class. Then copy the 32-byte map into the code vector,
6119 negating it if necessary. */
6120
6121 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
6122 if (lengthptr == NULL) /* Save time in the pre-compile phase */
6123 {
6124 if (negate_class)
6125 {
6126 /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
6127 for (i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
6128 }
6129 memcpy(code, classbits, 32);
6130 }
6131 code += 32 / sizeof(PCRE2_UCHAR);
6132 break; /* End of class processing */
6133
6134
6135 /* ===================================================================*/
6136 /* Deal with (*VERB)s. */
6137
6138 /* Check for open captures before ACCEPT and close those that are within
6139 the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an
6140 assertion. In the first pass, just accumulate the length required;
6141 otherwise hitting (*ACCEPT) inside many nested parentheses can cause
6142 workspace overflow. Do not set firstcu after *ACCEPT. */
6143
6144 case META_ACCEPT:
6145 cb->had_accept = had_accept = TRUE;
6146 for (oc = cb->open_caps;
6147 oc != NULL && oc->assert_depth >= cb->assert_depth;
6148 oc = oc->next)
6149 {
6150 if (lengthptr != NULL)
6151 {
6152 *lengthptr += CU2BYTES(1) + IMM2_SIZE;
6153 }
6154 else
6155 {
6156 *code++ = OP_CLOSE;
6157 PUT2INC(code, 0, oc->number);
6158 }
6159 }
6160 *code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6161 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6162 break;
6163
6164 case META_PRUNE:
6165 case META_SKIP:
6166 cb->had_pruneorskip = TRUE;
6167 /* Fall through */
6168 case META_COMMIT:
6169 case META_FAIL:
6170 *code++ = verbops[(meta - META_MARK) >> 16];
6171 break;
6172
6173 case META_THEN:
6174 cb->external_flags |= PCRE2_HASTHEN;
6175 *code++ = OP_THEN;
6176 break;
6177
6178 /* Handle verbs with arguments. Arguments can be very long, especially in
6179 16- and 32-bit modes, and can overflow the workspace in the first pass.
6180 However, the argument length is constrained to be small enough to fit in
6181 one code unit. This check happens in parse_regex(). In the first pass,
6182 instead of putting the argument into memory, we just update the length
6183 counter and set up an empty argument. */
6184
6185 case META_THEN_ARG:
6186 cb->external_flags |= PCRE2_HASTHEN;
6187 goto VERB_ARG;
6188
6189 case META_PRUNE_ARG:
6190 case META_SKIP_ARG:
6191 cb->had_pruneorskip = TRUE;
6192 /* Fall through */
6193 case META_MARK:
6194 case META_COMMIT_ARG:
6195 VERB_ARG:
6196 *code++ = verbops[(meta - META_MARK) >> 16];
6197 /* The length is in characters. */
6198 verbarglen = *(++pptr);
6199 verbculen = 0;
6200 tempcode = code++;
6201 for (i = 0; i < (int)verbarglen; i++)
6202 {
6203 meta = *(++pptr);
6204 #ifdef SUPPORT_UNICODE
6205 if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
6206 #endif
6207 {
6208 mclength = 1;
6209 mcbuffer[0] = meta;
6210 }
6211 if (lengthptr != NULL) *lengthptr += mclength; else
6212 {
6213 memcpy(code, mcbuffer, CU2BYTES(mclength));
6214 code += mclength;
6215 verbculen += mclength;
6216 }
6217 }
6218
6219 *tempcode = verbculen; /* Fill in the code unit length */
6220 *code++ = 0; /* Terminating zero */
6221 break;
6222
6223
6224 /* ===================================================================*/
6225 /* Handle options change. The new setting must be passed back for use in
6226 subsequent branches. Reset the greedy defaults and the case value for
6227 firstcu and reqcu. */
6228
6229 case META_OPTIONS:
6230 *optionsptr = options = *(++pptr);
6231 greedy_default = ((options & PCRE2_UNGREEDY) != 0);
6232 greedy_non_default = greedy_default ^ 1;
6233 req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6234 break;
6235
6236
6237 /* ===================================================================*/
6238 /* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous
6239 because it could be a numerical check on recursion, or a name check on a
6240 group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that
6241 we can handle it either way. We first try for a name; if not found, process
6242 the number. */
6243
6244 case META_COND_RNUMBER: /* (?(Rdigits) */
6245 case META_COND_NAME: /* (?(name) or (?'name') or ?(<name>) */
6246 case META_COND_RNAME: /* (?(R&name) - test for recursion */
6247 bravalue = OP_COND;
6248 {
6249 int count, index;
6250 PCRE2_SPTR name;
6251 named_group *ng = cb->named_groups;
6252 uint32_t length = *(++pptr);
6253
6254 GETPLUSOFFSET(offset, pptr);
6255 name = cb->start_pattern + offset;
6256
6257 /* In the first pass, the names generated in the pre-pass are available,
6258 but the main name table has not yet been created. Scan the list of names
6259 generated in the pre-pass in order to get a number and whether or not
6260 this name is duplicated. If it is not duplicated, we can handle it as a
6261 numerical group. */
6262
6263 for (i = 0; i < cb->names_found; i++, ng++)
6264 {
6265 if (length == ng->length &&
6266 PRIV(strncmp)(name, ng->name, length) == 0)
6267 {
6268 if (!ng->isdup)
6269 {
6270 code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6271 PUT2(code, 2+LINK_SIZE, ng->number);
6272 if (ng->number > cb->top_backref) cb->top_backref = ng->number;
6273 skipunits = 1+IMM2_SIZE;
6274 goto GROUP_PROCESS_NOTE_EMPTY;
6275 }
6276 break; /* Found a duplicated name */
6277 }
6278 }
6279
6280 /* If the name was not found we have a bad reference, unless we are
6281 dealing with R<digits>, which is treated as a recursion test by number.
6282 */
6283
6284 if (i >= cb->names_found)
6285 {
6286 groupnumber = 0;
6287 if (meta == META_COND_RNUMBER)
6288 {
6289 for (i = 1; i < (int)length; i++)
6290 {
6291 groupnumber = groupnumber * 10 + name[i] - CHAR_0;
6292 if (groupnumber > MAX_GROUP_NUMBER)
6293 {
6294 *errorcodeptr = ERR61;
6295 cb->erroroffset = offset + i;
6296 return 0;
6297 }
6298 }
6299 }
6300
6301 if (meta != META_COND_RNUMBER || groupnumber > cb->bracount)
6302 {
6303 *errorcodeptr = ERR15;
6304 cb->erroroffset = offset;
6305 return 0;
6306 }
6307
6308 /* (?(Rdigits) treated as a recursion reference by number. A value of
6309 zero (which is the result of both (?(R) and (?(R0)) means "any", and is
6310 translated into RREF_ANY (which is 0xffff). */
6311
6312 if (groupnumber == 0) groupnumber = RREF_ANY;
6313 code[1+LINK_SIZE] = OP_RREF;
6314 PUT2(code, 2+LINK_SIZE, groupnumber);
6315 skipunits = 1+IMM2_SIZE;
6316 goto GROUP_PROCESS_NOTE_EMPTY;
6317 }
6318
6319 /* A duplicated name was found. Note that if an R<digits> name is found
6320 (META_COND_RNUMBER), it is a reference test, not a recursion test. */
6321
6322 code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6323
6324 /* We have a duplicated name. In the compile pass we have to search the
6325 main table in order to get the index and count values. */
6326
6327 count = 0; /* Values for first pass (avoids compiler warning) */
6328 index = 0;
6329 if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6330 &count, errorcodeptr, cb)) return 0;
6331
6332 /* Add one to the opcode to change CREF/RREF into DNCREF/DNRREF and
6333 insert appropriate data values. */
6334
6335 code[1+LINK_SIZE]++;
6336 skipunits = 1+2*IMM2_SIZE;
6337 PUT2(code, 2+LINK_SIZE, index);
6338 PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6339 }
6340 goto GROUP_PROCESS_NOTE_EMPTY;
6341
6342 /* The DEFINE condition is always false. Its internal groups may never
6343 be called, so matched_char must remain false, hence the jump to
6344 GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
6345
6346 case META_COND_DEFINE:
6347 bravalue = OP_COND;
6348 GETPLUSOFFSET(offset, pptr);
6349 code[1+LINK_SIZE] = OP_DEFINE;
6350 skipunits = 1;
6351 goto GROUP_PROCESS;
6352
6353 /* Conditional test of a group's being set. */
6354
6355 case META_COND_NUMBER:
6356 bravalue = OP_COND;
6357 GETPLUSOFFSET(offset, pptr);
6358 groupnumber = *(++pptr);
6359 if (groupnumber > cb->bracount)
6360 {
6361 *errorcodeptr = ERR15;
6362 cb->erroroffset = offset;
6363 return 0;
6364 }
6365 if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
6366 offset -= 2; /* Point at initial ( for too many branches error */
6367 code[1+LINK_SIZE] = OP_CREF;
6368 skipunits = 1+IMM2_SIZE;
6369 PUT2(code, 2+LINK_SIZE, groupnumber);
6370 goto GROUP_PROCESS_NOTE_EMPTY;
6371
6372 /* Test for the PCRE2 version. */
6373
6374 case META_COND_VERSION:
6375 bravalue = OP_COND;
6376 if (pptr[1] > 0)
6377 code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) ||
6378 (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))?
6379 OP_TRUE : OP_FALSE;
6380 else
6381 code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])?
6382 OP_TRUE : OP_FALSE;
6383 skipunits = 1;
6384 pptr += 3;
6385 goto GROUP_PROCESS_NOTE_EMPTY;
6386
6387 /* The condition is an assertion, possibly preceded by a callout. */
6388
6389 case META_COND_ASSERT:
6390 bravalue = OP_COND;
6391 goto GROUP_PROCESS_NOTE_EMPTY;
6392
6393
6394 /* ===================================================================*/
6395 /* Handle all kinds of nested bracketed groups. The non-capturing,
6396 non-conditional cases are here; others come to GROUP_PROCESS via goto. */
6397
6398 case META_LOOKAHEAD:
6399 bravalue = OP_ASSERT;
6400 cb->assert_depth += 1;
6401 goto GROUP_PROCESS;
6402
6403 case META_LOOKAHEAD_NA:
6404 bravalue = OP_ASSERT_NA;
6405 cb->assert_depth += 1;
6406 goto GROUP_PROCESS;
6407
6408 /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6409 thing to do, but Perl allows all assertions to be quantified, and when
6410 they contain capturing parentheses there may be a potential use for
6411 this feature. Not that that applies to a quantified (?!) but we allow
6412 it for uniformity. */
6413
6414 case META_LOOKAHEADNOT:
6415 if (pptr[1] == META_KET &&
6416 (pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY))
6417 {
6418 *code++ = OP_FAIL;
6419 pptr++;
6420 }
6421 else
6422 {
6423 bravalue = OP_ASSERT_NOT;
6424 cb->assert_depth += 1;
6425 goto GROUP_PROCESS;
6426 }
6427 break;
6428
6429 case META_LOOKBEHIND:
6430 bravalue = OP_ASSERTBACK;
6431 cb->assert_depth += 1;
6432 goto GROUP_PROCESS;
6433
6434 case META_LOOKBEHINDNOT:
6435 bravalue = OP_ASSERTBACK_NOT;
6436 cb->assert_depth += 1;
6437 goto GROUP_PROCESS;
6438
6439 case META_LOOKBEHIND_NA:
6440 bravalue = OP_ASSERTBACK_NA;
6441 cb->assert_depth += 1;
6442 goto GROUP_PROCESS;
6443
6444 case META_ATOMIC:
6445 bravalue = OP_ONCE;
6446 goto GROUP_PROCESS_NOTE_EMPTY;
6447
6448 case META_SCRIPT_RUN:
6449 bravalue = OP_SCRIPT_RUN;
6450 goto GROUP_PROCESS_NOTE_EMPTY;
6451
6452 case META_NOCAPTURE:
6453 bravalue = OP_BRA;
6454 /* Fall through */
6455
6456 /* Process nested bracketed regex. The nesting depth is maintained for the
6457 benefit of the stackguard function. The test for too deep nesting is now
6458 done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
6459 others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
6460 note of whether or not they may match an empty string. */
6461
6462 GROUP_PROCESS_NOTE_EMPTY:
6463 note_group_empty = TRUE;
6464
6465 GROUP_PROCESS:
6466 cb->parens_depth += 1;
6467 *code = bravalue;
6468 pptr++;
6469 tempcode = code;
6470 tempreqvary = cb->req_varyopt; /* Save value before group */
6471 length_prevgroup = 0; /* Initialize for pre-compile phase */
6472
6473 if ((group_return =
6474 compile_regex(
6475 options, /* The option state */
6476 &tempcode, /* Where to put code (updated) */
6477 &pptr, /* Input pointer (updated) */
6478 errorcodeptr, /* Where to put an error message */
6479 skipunits, /* Skip over bracket number */
6480 &subfirstcu, /* For possible first char */
6481 &subfirstcuflags,
6482 &subreqcu, /* For possible last char */
6483 &subreqcuflags,
6484 bcptr, /* Current branch chain */
6485 cb, /* Compile data block */
6486 (lengthptr == NULL)? NULL : /* Actual compile phase */
6487 &length_prevgroup /* Pre-compile phase */
6488 )) == 0)
6489 return 0; /* Error */
6490
6491 cb->parens_depth -= 1;
6492
6493 /* If that was a non-conditional significant group (not an assertion, not a
6494 DEFINE) that matches at least one character, then the current item matches
6495 a character. Conditionals are handled below. */
6496
6497 if (note_group_empty && bravalue != OP_COND && group_return > 0)
6498 matched_char = TRUE;
6499
6500 /* If we've just compiled an assertion, pop the assert depth. */
6501
6502 if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NA)
6503 cb->assert_depth -= 1;
6504
6505 /* At the end of compiling, code is still pointing to the start of the
6506 group, while tempcode has been updated to point past the end of the group.
6507 The parsed pattern pointer (pptr) is on the closing META_KET.
6508
6509 If this is a conditional bracket, check that there are no more than
6510 two branches in the group, or just one if it's a DEFINE group. We do this
6511 in the real compile phase, not in the pre-pass, where the whole group may
6512 not be available. */
6513
6514 if (bravalue == OP_COND && lengthptr == NULL)
6515 {
6516 PCRE2_UCHAR *tc = code;
6517 int condcount = 0;
6518
6519 do {
6520 condcount++;
6521 tc += GET(tc,1);
6522 }
6523 while (*tc != OP_KET);
6524
6525 /* A DEFINE group is never obeyed inline (the "condition" is always
6526 false). It must have only one branch. Having checked this, change the
6527 opcode to OP_FALSE. */
6528
6529 if (code[LINK_SIZE+1] == OP_DEFINE)
6530 {
6531 if (condcount > 1)
6532 {
6533 cb->erroroffset = offset;
6534 *errorcodeptr = ERR54;
6535 return 0;
6536 }
6537 code[LINK_SIZE+1] = OP_FALSE;
6538 bravalue = OP_DEFINE; /* A flag to suppress char handling below */
6539 }
6540
6541 /* A "normal" conditional group. If there is just one branch, we must not
6542 make use of its firstcu or reqcu, because this is equivalent to an
6543 empty second branch. Also, it may match an empty string. If there are two
6544 branches, this item must match a character if the group must. */
6545
6546 else
6547 {
6548 if (condcount > 2)
6549 {
6550 cb->erroroffset = offset;
6551 *errorcodeptr = ERR27;
6552 return 0;
6553 }
6554 if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
6555 else if (group_return > 0) matched_char = TRUE;
6556 }
6557 }
6558
6559 /* In the pre-compile phase, update the length by the length of the group,
6560 less the brackets at either end. Then reduce the compiled code to just a
6561 set of non-capturing brackets so that it doesn't use much memory if it is
6562 duplicated by a quantifier. */
6563
6564 if (lengthptr != NULL)
6565 {
6566 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
6567 {
6568 *errorcodeptr = ERR20;
6569 return 0;
6570 }
6571 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
6572 code++; /* This already contains bravalue */
6573 PUTINC(code, 0, 1 + LINK_SIZE);
6574 *code++ = OP_KET;
6575 PUTINC(code, 0, 1 + LINK_SIZE);
6576 break; /* No need to waste time with special character handling */
6577 }
6578
6579 /* Otherwise update the main code pointer to the end of the group. */
6580
6581 code = tempcode;
6582
6583 /* For a DEFINE group, required and first character settings are not
6584 relevant. */
6585
6586 if (bravalue == OP_DEFINE) break;
6587
6588 /* Handle updating of the required and first code units for other types of
6589 group. Update for normal brackets of all kinds, and conditions with two
6590 branches (see code above). If the bracket is followed by a quantifier with
6591 zero repeat, we have to back off. Hence the definition of zeroreqcu and
6592 zerofirstcu outside the main loop so that they can be accessed for the back
6593 off. */
6594
6595 zeroreqcu = reqcu;
6596 zeroreqcuflags = reqcuflags;
6597 zerofirstcu = firstcu;
6598 zerofirstcuflags = firstcuflags;
6599 groupsetfirstcu = FALSE;
6600
6601 if (bravalue >= OP_ONCE) /* Not an assertion */
6602 {
6603 /* If we have not yet set a firstcu in this branch, take it from the
6604 subpattern, remembering that it was set here so that a repeat of more
6605 than one can replicate it as reqcu if necessary. If the subpattern has
6606 no firstcu, set "none" for the whole branch. In both cases, a zero
6607 repeat forces firstcu to "none". */
6608
6609 if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
6610 {
6611 if (subfirstcuflags >= 0)
6612 {
6613 firstcu = subfirstcu;
6614 firstcuflags = subfirstcuflags;
6615 groupsetfirstcu = TRUE;
6616 }
6617 else firstcuflags = REQ_NONE;
6618 zerofirstcuflags = REQ_NONE;
6619 }
6620
6621 /* If firstcu was previously set, convert the subpattern's firstcu
6622 into reqcu if there wasn't one, using the vary flag that was in
6623 existence beforehand. */
6624
6625 else if (subfirstcuflags >= 0 && subreqcuflags < 0)
6626 {
6627 subreqcu = subfirstcu;
6628 subreqcuflags = subfirstcuflags | tempreqvary;
6629 }
6630
6631 /* If the subpattern set a required code unit (or set a first code unit
6632 that isn't really the first code unit - see above), set it. */
6633
6634 if (subreqcuflags >= 0)
6635 {
6636 reqcu = subreqcu;
6637 reqcuflags = subreqcuflags;
6638 }
6639 }
6640
6641 /* For a forward assertion, we take the reqcu, if set, provided that the
6642 group has also set a firstcu. This can be helpful if the pattern that
6643 follows the assertion doesn't set a different char. For example, it's
6644 useful for /(?=abcde).+/. We can't set firstcu for an assertion, however,
6645 because it leads to an incorrect effect for patterns such as /(?=a)a.+/ when
6646 the "real" "a" would then become a reqcu instead of a firstcu. This is
6647 overcome by a scan at the end if there's no firstcu, looking for an
6648 asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
6649 we must only take the reqcu when the group also set a firstcu. Otherwise,
6650 in that example, 'X' ends up set for both. */
6651
6652 else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) &&
6653 subreqcuflags >= 0 && subfirstcuflags >= 0)
6654 {
6655 reqcu = subreqcu;
6656 reqcuflags = subreqcuflags;
6657 }
6658
6659 break; /* End of nested group handling */
6660
6661
6662 /* ===================================================================*/
6663 /* Handle named backreferences and recursions. */
6664
6665 case META_BACKREF_BYNAME:
6666 case META_RECURSE_BYNAME:
6667 {
6668 int count, index;
6669 PCRE2_SPTR name;
6670 BOOL is_dupname = FALSE;
6671 named_group *ng = cb->named_groups;
6672 uint32_t length = *(++pptr);
6673
6674 GETPLUSOFFSET(offset, pptr);
6675 name = cb->start_pattern + offset;
6676
6677 /* In the first pass, the names generated in the pre-pass are available,
6678 but the main name table has not yet been created. Scan the list of names
6679 generated in the pre-pass in order to get a number and whether or not
6680 this name is duplicated. */
6681
6682 groupnumber = 0;
6683 for (i = 0; i < cb->names_found; i++, ng++)
6684 {
6685 if (length == ng->length &&
6686 PRIV(strncmp)(name, ng->name, length) == 0)
6687 {
6688 is_dupname = ng->isdup;
6689 groupnumber = ng->number;
6690
6691 /* For a recursion, that's all that is needed. We can now go to
6692 the code that handles numerical recursion, applying it to the first
6693 group with the given name. */
6694
6695 if (meta == META_RECURSE_BYNAME)
6696 {
6697 meta_arg = groupnumber;
6698 goto HANDLE_NUMERICAL_RECURSION;
6699 }
6700
6701 /* For a back reference, update the back reference map and the
6702 maximum back reference. */
6703
6704 cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
6705 if (groupnumber > cb->top_backref)
6706 cb->top_backref = groupnumber;
6707 }
6708 }
6709
6710 /* If the name was not found we have a bad reference. */
6711
6712 if (groupnumber == 0)
6713 {
6714 *errorcodeptr = ERR15;
6715 cb->erroroffset = offset;
6716 return 0;
6717 }
6718
6719 /* If a back reference name is not duplicated, we can handle it as
6720 a numerical reference. */
6721
6722 if (!is_dupname)
6723 {
6724 meta_arg = groupnumber;
6725 goto HANDLE_SINGLE_REFERENCE;
6726 }
6727
6728 /* If a back reference name is duplicated, we generate a different
6729 opcode to a numerical back reference. In the second pass we must
6730 search for the index and count in the final name table. */
6731
6732 count = 0; /* Values for first pass (avoids compiler warning) */
6733 index = 0;
6734 if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6735 &count, errorcodeptr, cb)) return 0;
6736
6737 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6738 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
6739 PUT2INC(code, 0, index);
6740 PUT2INC(code, 0, count);
6741 }
6742 break;
6743
6744
6745 /* ===================================================================*/
6746 /* Handle a numerical callout. */
6747
6748 case META_CALLOUT_NUMBER:
6749 code[0] = OP_CALLOUT;
6750 PUT(code, 1, pptr[1]); /* Offset to next pattern item */
6751 PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */
6752 code[1 + 2*LINK_SIZE] = pptr[3];
6753 pptr += 3;
6754 code += PRIV(OP_lengths)[OP_CALLOUT];
6755 break;
6756
6757
6758 /* ===================================================================*/
6759 /* Handle a callout with a string argument. In the pre-pass we just compute
6760 the length without generating anything. The length in pptr[3] includes both
6761 delimiters; in the actual compile only the first one is copied, but a
6762 terminating zero is added. Any doubled delimiters within the string make
6763 this an overestimate, but it is not worth bothering about. */
6764
6765 case META_CALLOUT_STRING:
6766 if (lengthptr != NULL)
6767 {
6768 *lengthptr += pptr[3] + (1 + 4*LINK_SIZE);
6769 pptr += 3;
6770 SKIPOFFSET(pptr);
6771 }
6772
6773 /* In the real compile we can copy the string. The starting delimiter is
6774 included so that the client can discover it if they want. We also pass the
6775 start offset to help a script language give better error messages. */
6776
6777 else
6778 {
6779 PCRE2_SPTR pp;
6780 uint32_t delimiter;
6781 uint32_t length = pptr[3];
6782 PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
6783
6784 code[0] = OP_CALLOUT_STR;
6785 PUT(code, 1, pptr[1]); /* Offset to next pattern item */
6786 PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */
6787
6788 pptr += 3;
6789 GETPLUSOFFSET(offset, pptr); /* Offset to string in pattern */
6790 pp = cb->start_pattern + offset;
6791 delimiter = *callout_string++ = *pp++;
6792 if (delimiter == CHAR_LEFT_CURLY_BRACKET)
6793 delimiter = CHAR_RIGHT_CURLY_BRACKET;
6794 PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1)); /* One after delimiter */
6795
6796 /* The syntax of the pattern was checked in the parsing scan. The length
6797 includes both delimiters, but we have passed the opening one just above,
6798 so we reduce length before testing it. The test is for > 1 because we do
6799 not want to copy the final delimiter. This also ensures that pp[1] is
6800 accessible. */
6801
6802 while (--length > 1)
6803 {
6804 if (*pp == delimiter && pp[1] == delimiter)
6805 {
6806 *callout_string++ = delimiter;
6807 pp += 2;
6808 length--;
6809 }
6810 else *callout_string++ = *pp++;
6811 }
6812 *callout_string++ = CHAR_NUL;
6813
6814 /* Set the length of the entire item, then advance to its end. */
6815
6816 PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code));
6817 code = callout_string;
6818 }
6819 break;
6820
6821
6822 /* ===================================================================*/
6823 /* Handle repetition. The different types are all sorted out in the parsing
6824 pass. */
6825
6826 case META_MINMAX_PLUS:
6827 case META_MINMAX_QUERY:
6828 case META_MINMAX:
6829 repeat_min = *(++pptr);
6830 repeat_max = *(++pptr);
6831 goto REPEAT;
6832
6833 case META_ASTERISK:
6834 case META_ASTERISK_PLUS:
6835 case META_ASTERISK_QUERY:
6836 repeat_min = 0;
6837 repeat_max = REPEAT_UNLIMITED;
6838 goto REPEAT;
6839
6840 case META_PLUS:
6841 case META_PLUS_PLUS:
6842 case META_PLUS_QUERY:
6843 repeat_min = 1;
6844 repeat_max = REPEAT_UNLIMITED;
6845 goto REPEAT;
6846
6847 case META_QUERY:
6848 case META_QUERY_PLUS:
6849 case META_QUERY_QUERY:
6850 repeat_min = 0;
6851 repeat_max = 1;
6852
6853 REPEAT:
6854 if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
6855
6856 /* Remember whether this is a variable length repeat, and default to
6857 single-char opcodes. */
6858
6859 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
6860 op_type = 0;
6861
6862 /* Adjust first and required code units for a zero repeat. */
6863
6864 if (repeat_min == 0)
6865 {
6866 firstcu = zerofirstcu;
6867 firstcuflags = zerofirstcuflags;
6868 reqcu = zeroreqcu;
6869 reqcuflags = zeroreqcuflags;
6870 }
6871
6872 /* Note the greediness and possessiveness. */
6873
6874 switch (meta)
6875 {
6876 case META_MINMAX_PLUS:
6877 case META_ASTERISK_PLUS:
6878 case META_PLUS_PLUS:
6879 case META_QUERY_PLUS:
6880 repeat_type = 0; /* Force greedy */
6881 possessive_quantifier = TRUE;
6882 break;
6883
6884 case META_MINMAX_QUERY:
6885 case META_ASTERISK_QUERY:
6886 case META_PLUS_QUERY:
6887 case META_QUERY_QUERY:
6888 repeat_type = greedy_non_default;
6889 possessive_quantifier = FALSE;
6890 break;
6891
6892 default:
6893 repeat_type = greedy_default;
6894 possessive_quantifier = FALSE;
6895 break;
6896 }
6897
6898 /* Save start of previous item, in case we have to move it up in order to
6899 insert something before it, and remember what it was. */
6900
6901 tempcode = previous;
6902 op_previous = *previous;
6903
6904 /* Now handle repetition for the different types of item. If the repeat
6905 minimum and the repeat maximum are both 1, we can ignore the quantifier for
6906 non-parenthesized items, as they have only one alternative. For anything in
6907 parentheses, we must not ignore if {1} is possessive. */
6908
6909 switch (op_previous)
6910 {
6911 /* If previous was a character or negated character match, abolish the
6912 item and generate a repeat item instead. If a char item has a minimum of
6913 more than one, ensure that it is set in reqcu - it might not be if a
6914 sequence such as x{3} is the first thing in a branch because the x will
6915 have gone into firstcu instead. */
6916
6917 case OP_CHAR:
6918 case OP_CHARI:
6919 case OP_NOT:
6920 case OP_NOTI:
6921 if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
6922 op_type = chartypeoffset[op_previous - OP_CHAR];
6923
6924 /* Deal with UTF characters that take up more than one code unit. */
6925
6926 #ifdef MAYBE_UTF_MULTI
6927 if (utf && NOT_FIRSTCU(code[-1]))
6928 {
6929 PCRE2_UCHAR *lastchar = code - 1;
6930 BACKCHAR(lastchar);
6931 mclength = (uint32_t)(code - lastchar); /* Length of UTF character */
6932 memcpy(mcbuffer, lastchar, CU2BYTES(mclength)); /* Save the char */
6933 }
6934 else
6935 #endif /* MAYBE_UTF_MULTI */
6936
6937 /* Handle the case of a single code unit - either with no UTF support, or
6938 with UTF disabled, or for a single-code-unit UTF character. */
6939 {
6940 mcbuffer[0] = code[-1];
6941 mclength = 1;
6942 if (op_previous <= OP_CHARI && repeat_min > 1)
6943 {
6944 reqcu = mcbuffer[0];
6945 reqcuflags = req_caseopt | cb->req_varyopt;
6946 }
6947 }
6948 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
6949
6950 /* If previous was a character class or a back reference, we put the
6951 repeat stuff after it, but just skip the item if the repeat was {0,0}. */
6952
6953 #ifdef SUPPORT_WIDE_CHARS
6954 case OP_XCLASS:
6955 #endif
6956 case OP_CLASS:
6957 case OP_NCLASS:
6958 case OP_REF:
6959 case OP_REFI:
6960 case OP_DNREF:
6961 case OP_DNREFI:
6962
6963 if (repeat_max == 0)
6964 {
6965 code = previous;
6966 goto END_REPEAT;
6967 }
6968 if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
6969
6970 if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)
6971 *code++ = OP_CRSTAR + repeat_type;
6972 else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED)
6973 *code++ = OP_CRPLUS + repeat_type;
6974 else if (repeat_min == 0 && repeat_max == 1)
6975 *code++ = OP_CRQUERY + repeat_type;
6976 else
6977 {
6978 *code++ = OP_CRRANGE + repeat_type;
6979 PUT2INC(code, 0, repeat_min);
6980 if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0; /* 2-byte encoding for max */
6981 PUT2INC(code, 0, repeat_max);
6982 }
6983 break;
6984
6985 /* If previous is OP_FAIL, it was generated by an empty class []
6986 (PCRE2_ALLOW_EMPTY_CLASS is set). The other ways in which OP_FAIL can be
6987 generated, that is by (*FAIL) or (?!), disallow a quantifier at parse
6988 time. We can just ignore this repeat. */
6989
6990 case OP_FAIL:
6991 goto END_REPEAT;
6992
6993 /* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets
6994 because pcre2_match() could not handle backtracking into recursively
6995 called groups. Now that this backtracking is available, we no longer need
6996 to do this. However, we still need to replicate recursions as we do for
6997 groups so as to have independent backtracking points. We can replicate
6998 for the minimum number of repeats directly. For optional repeats we now
6999 wrap the recursion in OP_BRA brackets and make use of the bracket
7000 repetition. */
7001
7002 case OP_RECURSE:
7003 if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7004 goto END_REPEAT;
7005
7006 /* Generate unwrapped repeats for a non-zero minimum, except when the
7007 minimum is 1 and the maximum unlimited, because that can be handled with
7008 OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the
7009 minimum, we just need to generate the appropriate additional copies.
7010 Otherwise we need to generate one more, to simulate the situation when
7011 the minimum is zero. */
7012
7013 if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED))
7014 {
7015 int replicate = repeat_min;
7016 if (repeat_min == repeat_max) replicate--;
7017
7018 /* In the pre-compile phase, we don't actually do the replication. We
7019 just adjust the length as if we had. Do some paranoid checks for
7020 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
7021 integer type when available, otherwise double. */
7022
7023 if (lengthptr != NULL)
7024 {
7025 PCRE2_SIZE delta = replicate*(1 + LINK_SIZE);
7026 if ((INT64_OR_DOUBLE)replicate*
7027 (INT64_OR_DOUBLE)(1 + LINK_SIZE) >
7028 (INT64_OR_DOUBLE)INT_MAX ||
7029 OFLOW_MAX - *lengthptr < delta)
7030 {
7031 *errorcodeptr = ERR20;
7032 return 0;
7033 }
7034 *lengthptr += delta;
7035 }
7036
7037 else for (i = 0; i < replicate; i++)
7038 {
7039 memcpy(code, previous, CU2BYTES(1 + LINK_SIZE));
7040 previous = code;
7041 code += 1 + LINK_SIZE;
7042 }
7043
7044 /* If the number of repeats is fixed, we are done. Otherwise, adjust
7045 the counts and fall through. */
7046
7047 if (repeat_min == repeat_max) break;
7048 if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7049 repeat_min = 0;
7050 }
7051
7052 /* Wrap the recursion call in OP_BRA brackets. */
7053
7054 (void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE));
7055 op_previous = *previous = OP_BRA;
7056 PUT(previous, 1, 2 + 2*LINK_SIZE);
7057 previous[2 + 2*LINK_SIZE] = OP_KET;
7058 PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
7059 code += 2 + 2 * LINK_SIZE;
7060 length_prevgroup = 3 + 3*LINK_SIZE;
7061 group_return = -1; /* Set "may match empty string" */
7062
7063 /* Now treat as a repeated OP_BRA. */
7064 /* Fall through */
7065
7066 /* If previous was a bracket group, we may have to replicate it in
7067 certain cases. Note that at this point we can encounter only the "basic"
7068 bracket opcodes such as BRA and CBRA, as this is the place where they get
7069 converted into the more special varieties such as BRAPOS and SBRA.
7070 Originally, PCRE did not allow repetition of assertions, but now it does,
7071 for Perl compatibility. */
7072
7073 case OP_ASSERT:
7074 case OP_ASSERT_NOT:
7075 case OP_ASSERT_NA:
7076 case OP_ASSERTBACK:
7077 case OP_ASSERTBACK_NOT:
7078 case OP_ASSERTBACK_NA:
7079 case OP_ONCE:
7080 case OP_SCRIPT_RUN:
7081 case OP_BRA:
7082 case OP_CBRA:
7083 case OP_COND:
7084 {
7085 int len = (int)(code - previous);
7086 PCRE2_UCHAR *bralink = NULL;
7087 PCRE2_UCHAR *brazeroptr = NULL;
7088
7089 if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7090 goto END_REPEAT;
7091
7092 /* Repeating a DEFINE group (or any group where the condition is always
7093 FALSE and there is only one branch) is pointless, but Perl allows the
7094 syntax, so we just ignore the repeat. */
7095
7096 if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
7097 previous[GET(previous, 1)] != OP_ALT)
7098 goto END_REPEAT;
7099
7100 /* Perl allows all assertions to be quantified, and when they contain
7101 capturing parentheses and/or are optional there are potential uses for
7102 this feature. PCRE2 used to force the maximum quantifier to 1 on the
7103 invalid grounds that further repetition was never useful. This was
7104 always a bit pointless, since an assertion could be wrapped with a
7105 repeated group to achieve the effect. General repetition is now
7106 permitted, but if the maximum is unlimited it is set to one more than
7107 the minimum. */
7108
7109 if (op_previous < OP_ONCE) /* Assertion */
7110 {
7111 if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1;
7112 }
7113
7114 /* The case of a zero minimum is special because of the need to stick
7115 OP_BRAZERO in front of it, and because the group appears once in the
7116 data, whereas in other cases it appears the minimum number of times. For
7117 this reason, it is simplest to treat this case separately, as otherwise
7118 the code gets far too messy. There are several special subcases when the
7119 minimum is zero. */
7120
7121 if (repeat_min == 0)
7122 {
7123 /* If the maximum is also zero, we used to just omit the group from
7124 the output altogether, like this:
7125
7126 ** if (repeat_max == 0)
7127 ** {
7128 ** code = previous;
7129 ** goto END_REPEAT;
7130 ** }
7131
7132 However, that fails when a group or a subgroup within it is
7133 referenced as a subroutine from elsewhere in the pattern, so now we
7134 stick in OP_SKIPZERO in front of it so that it is skipped on
7135 execution. As we don't have a list of which groups are referenced, we
7136 cannot do this selectively.
7137
7138 If the maximum is 1 or unlimited, we just have to stick in the
7139 BRAZERO and do no more at this point. */
7140
7141 if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED)
7142 {
7143 (void)memmove(previous + 1, previous, CU2BYTES(len));
7144 code++;
7145 if (repeat_max == 0)
7146 {
7147 *previous++ = OP_SKIPZERO;
7148 goto END_REPEAT;
7149 }
7150 brazeroptr = previous; /* Save for possessive optimizing */
7151 *previous++ = OP_BRAZERO + repeat_type;
7152 }
7153
7154 /* If the maximum is greater than 1 and limited, we have to replicate
7155 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
7156 The first one has to be handled carefully because it's the original
7157 copy, which has to be moved up. The remainder can be handled by code
7158 that is common with the non-zero minimum case below. We have to
7159 adjust the value of repeat_max, since one less copy is required. */
7160
7161 else
7162 {
7163 int linkoffset;
7164 (void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
7165 code += 2 + LINK_SIZE;
7166 *previous++ = OP_BRAZERO + repeat_type;
7167 *previous++ = OP_BRA;
7168
7169 /* We chain together the bracket link offset fields that have to be
7170 filled in later when the ends of the brackets are reached. */
7171
7172 linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink);
7173 bralink = previous;
7174 PUTINC(previous, 0, linkoffset);
7175 }
7176
7177 if (repeat_max != REPEAT_UNLIMITED) repeat_max--;
7178 }
7179
7180 /* If the minimum is greater than zero, replicate the group as many
7181 times as necessary, and adjust the maximum to the number of subsequent
7182 copies that we need. */
7183
7184 else
7185 {
7186 if (repeat_min > 1)
7187 {
7188 /* In the pre-compile phase, we don't actually do the replication.
7189 We just adjust the length as if we had. Do some paranoid checks for
7190 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
7191 integer type when available, otherwise double. */
7192
7193 if (lengthptr != NULL)
7194 {
7195 PCRE2_SIZE delta = (repeat_min - 1)*length_prevgroup;
7196 if ((INT64_OR_DOUBLE)(repeat_min - 1)*
7197 (INT64_OR_DOUBLE)length_prevgroup >
7198 (INT64_OR_DOUBLE)INT_MAX ||
7199 OFLOW_MAX - *lengthptr < delta)
7200 {
7201 *errorcodeptr = ERR20;
7202 return 0;
7203 }
7204 *lengthptr += delta;
7205 }
7206
7207 /* This is compiling for real. If there is a set first code unit
7208 for the group, and we have not yet set a "required code unit", set
7209 it. */
7210
7211 else
7212 {
7213 if (groupsetfirstcu && reqcuflags < 0)
7214 {
7215 reqcu = firstcu;
7216 reqcuflags = firstcuflags;
7217 }
7218 for (i = 1; (uint32_t)i < repeat_min; i++)
7219 {
7220 memcpy(code, previous, CU2BYTES(len));
7221 code += len;
7222 }
7223 }
7224 }
7225
7226 if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7227 }
7228
7229 /* This code is common to both the zero and non-zero minimum cases. If
7230 the maximum is limited, it replicates the group in a nested fashion,
7231 remembering the bracket starts on a stack. In the case of a zero
7232 minimum, the first one was set up above. In all cases the repeat_max
7233 now specifies the number of additional copies needed. Again, we must
7234 remember to replicate entries on the forward reference list. */
7235
7236 if (repeat_max != REPEAT_UNLIMITED)
7237 {
7238 /* In the pre-compile phase, we don't actually do the replication. We
7239 just adjust the length as if we had. For each repetition we must add
7240 1 to the length for BRAZERO and for all but the last repetition we
7241 must add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
7242 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type
7243 is a 64-bit integer type when available, otherwise double. */
7244
7245 if (lengthptr != NULL && repeat_max > 0)
7246 {
7247 PCRE2_SIZE delta = repeat_max*(length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
7248 2 - 2*LINK_SIZE; /* Last one doesn't nest */
7249 if ((INT64_OR_DOUBLE)repeat_max *
7250 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
7251 > (INT64_OR_DOUBLE)INT_MAX ||
7252 OFLOW_MAX - *lengthptr < delta)
7253 {
7254 *errorcodeptr = ERR20;
7255 return 0;
7256 }
7257 *lengthptr += delta;
7258 }
7259
7260 /* This is compiling for real */
7261
7262 else for (i = repeat_max - 1; i >= 0; i--)
7263 {
7264 *code++ = OP_BRAZERO + repeat_type;
7265
7266 /* All but the final copy start a new nesting, maintaining the
7267 chain of brackets outstanding. */
7268
7269 if (i != 0)
7270 {
7271 int linkoffset;
7272 *code++ = OP_BRA;
7273 linkoffset = (bralink == NULL)? 0 : (int)(code - bralink);
7274 bralink = code;
7275 PUTINC(code, 0, linkoffset);
7276 }
7277
7278 memcpy(code, previous, CU2BYTES(len));
7279 code += len;
7280 }
7281
7282 /* Now chain through the pending brackets, and fill in their length
7283 fields (which are holding the chain links pro tem). */
7284
7285 while (bralink != NULL)
7286 {
7287 int oldlinkoffset;
7288 int linkoffset = (int)(code - bralink + 1);
7289 PCRE2_UCHAR *bra = code - linkoffset;
7290 oldlinkoffset = GET(bra, 1);
7291 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
7292 *code++ = OP_KET;
7293 PUTINC(code, 0, linkoffset);
7294 PUT(bra, 1, linkoffset);
7295 }
7296 }
7297
7298 /* If the maximum is unlimited, set a repeater in the final copy. For
7299 SCRIPT_RUN and ONCE brackets, that's all we need to do. However,
7300 possessively repeated ONCE brackets can be converted into non-capturing
7301 brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this
7302 saves having to deal with possessive ONCEs specially.
7303
7304 Otherwise, when we are doing the actual compile phase, check to see
7305 whether this group is one that could match an empty string. If so,
7306 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
7307 that runtime checking can be done. [This check is also applied to ONCE
7308 and SCRIPT_RUN groups at runtime, but in a different way.]
7309
7310 Then, if the quantifier was possessive and the bracket is not a
7311 conditional, we convert the BRA code to the POS form, and the KET code
7312 to KETRPOS. (It turns out to be convenient at runtime to detect this
7313 kind of subpattern at both the start and at the end.) The use of
7314 special opcodes makes it possible to reduce greatly the stack usage in
7315 pcre2_match(). If the group is preceded by OP_BRAZERO, convert this to
7316 OP_BRAPOSZERO.
7317
7318 Then, if the minimum number of matches is 1 or 0, cancel the possessive
7319 flag so that the default action below, of wrapping everything inside
7320 atomic brackets, does not happen. When the minimum is greater than 1,
7321 there will be earlier copies of the group, and so we still have to wrap
7322 the whole thing. */
7323
7324 else
7325 {
7326 PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
7327 PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
7328
7329 /* Convert possessive ONCE brackets to non-capturing */
7330
7331 if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
7332
7333 /* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need
7334 to do is to set the KET. */
7335
7336 if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN)
7337 *ketcode = OP_KETRMAX + repeat_type;
7338
7339 /* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs
7340 (which have been converted to non-capturing above). */
7341
7342 else
7343 {
7344 /* In the compile phase, adjust the opcode if the group can match
7345 an empty string. For a conditional group with only one branch, the
7346 value of group_return will not show "could be empty", so we must
7347 check that separately. */
7348
7349 if (lengthptr == NULL)
7350 {
7351 if (group_return < 0) *bracode += OP_SBRA - OP_BRA;
7352 if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
7353 *bracode = OP_SCOND;
7354 }
7355
7356 /* Handle possessive quantifiers. */
7357
7358 if (possessive_quantifier)
7359 {
7360 /* For COND brackets, we wrap the whole thing in a possessively
7361 repeated non-capturing bracket, because we have not invented POS
7362 versions of the COND opcodes. */
7363
7364 if (*bracode == OP_COND || *bracode == OP_SCOND)
7365 {
7366 int nlen = (int)(code - bracode);
7367 (void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
7368 code += 1 + LINK_SIZE;
7369 nlen += 1 + LINK_SIZE;
7370 *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
7371 *code++ = OP_KETRPOS;
7372 PUTINC(code, 0, nlen);
7373 PUT(bracode, 1, nlen);
7374 }
7375
7376 /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
7377
7378 else
7379 {
7380 *bracode += 1; /* Switch to xxxPOS opcodes */
7381 *ketcode = OP_KETRPOS;
7382 }
7383
7384 /* If the minimum is zero, mark it as possessive, then unset the
7385 possessive flag when the minimum is 0 or 1. */
7386
7387 if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
7388 if (repeat_min < 2) possessive_quantifier = FALSE;
7389 }
7390
7391 /* Non-possessive quantifier */
7392
7393 else *ketcode = OP_KETRMAX + repeat_type;
7394 }
7395 }
7396 }
7397 break;
7398
7399 /* If previous was a character type match (\d or similar), abolish it and
7400 create a suitable repeat item. The code is shared with single-character
7401 repeats by setting op_type to add a suitable offset into repeat_type.
7402 Note that the Unicode property types will be present only when
7403 SUPPORT_UNICODE is defined, but we don't wrap the little bits of code
7404 here because it just makes it horribly messy. */
7405
7406 default:
7407 if (op_previous >= OP_EODN) /* Not a character type - internal error */
7408 {
7409 *errorcodeptr = ERR10;
7410 return 0;
7411 }
7412 else
7413 {
7414 int prop_type, prop_value;
7415 PCRE2_UCHAR *oldcode;
7416
7417 if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7418
7419 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
7420 mclength = 0; /* Not a character */
7421
7422 if (op_previous == OP_PROP || op_previous == OP_NOTPROP)
7423 {
7424 prop_type = previous[1];
7425 prop_value = previous[2];
7426 }
7427 else
7428 {
7429 /* Come here from just above with a character in mcbuffer/mclength. */
7430 OUTPUT_SINGLE_REPEAT:
7431 prop_type = prop_value = -1;
7432 }
7433
7434 /* At this point, if prop_type == prop_value == -1 we either have a
7435 character in mcbuffer when mclength is greater than zero, or we have
7436 mclength zero, in which case there is a non-property character type in
7437 op_previous. If prop_type/value are not negative, we have a property
7438 character type in op_previous. */
7439
7440 oldcode = code; /* Save where we were */
7441 code = previous; /* Usually overwrite previous item */
7442
7443 /* If the maximum is zero then the minimum must also be zero; Perl allows
7444 this case, so we do too - by simply omitting the item altogether. */
7445
7446 if (repeat_max == 0) goto END_REPEAT;
7447
7448 /* Combine the op_type with the repeat_type */
7449
7450 repeat_type += op_type;
7451
7452 /* A minimum of zero is handled either as the special case * or ?, or as
7453 an UPTO, with the maximum given. */
7454
7455 if (repeat_min == 0)
7456 {
7457 if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;
7458 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
7459 else
7460 {
7461 *code++ = OP_UPTO + repeat_type;
7462 PUT2INC(code, 0, repeat_max);
7463 }
7464 }
7465
7466 /* A repeat minimum of 1 is optimized into some special cases. If the
7467 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
7468 left in place and, if the maximum is greater than 1, we use OP_UPTO with
7469 one less than the maximum. */
7470
7471 else if (repeat_min == 1)
7472 {
7473 if (repeat_max == REPEAT_UNLIMITED)
7474 *code++ = OP_PLUS + repeat_type;
7475 else
7476 {
7477 code = oldcode; /* Leave previous item in place */
7478 if (repeat_max == 1) goto END_REPEAT;
7479 *code++ = OP_UPTO + repeat_type;
7480 PUT2INC(code, 0, repeat_max - 1);
7481 }
7482 }
7483
7484 /* The case {n,n} is just an EXACT, while the general case {n,m} is
7485 handled as an EXACT followed by an UPTO or STAR or QUERY. */
7486
7487 else
7488 {
7489 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
7490 PUT2INC(code, 0, repeat_min);
7491
7492 /* Unless repeat_max equals repeat_min, fill in the data for EXACT,
7493 and then generate the second opcode. For a repeated Unicode property
7494 match, there are two extra values that define the required property,
7495 and mclength is set zero to indicate this. */
7496
7497 if (repeat_max != repeat_min)
7498 {
7499 if (mclength > 0)
7500 {
7501 memcpy(code, mcbuffer, CU2BYTES(mclength));
7502 code += mclength;
7503 }
7504 else
7505 {
7506 *code++ = op_previous;
7507 if (prop_type >= 0)
7508 {
7509 *code++ = prop_type;
7510 *code++ = prop_value;
7511 }
7512 }
7513
7514 /* Now set up the following opcode */
7515
7516 if (repeat_max == REPEAT_UNLIMITED)
7517 *code++ = OP_STAR + repeat_type;
7518 else
7519 {
7520 repeat_max -= repeat_min;
7521 if (repeat_max == 1)
7522 {
7523 *code++ = OP_QUERY + repeat_type;
7524 }
7525 else
7526 {
7527 *code++ = OP_UPTO + repeat_type;
7528 PUT2INC(code, 0, repeat_max);
7529 }
7530 }
7531 }
7532 }
7533
7534 /* Fill in the character or character type for the final opcode. */
7535
7536 if (mclength > 0)
7537 {
7538 memcpy(code, mcbuffer, CU2BYTES(mclength));
7539 code += mclength;
7540 }
7541 else
7542 {
7543 *code++ = op_previous;
7544 if (prop_type >= 0)
7545 {
7546 *code++ = prop_type;
7547 *code++ = prop_value;
7548 }
7549 }
7550 }
7551 break;
7552 } /* End of switch on different op_previous values */
7553
7554
7555 /* If the character following a repeat is '+', possessive_quantifier is
7556 TRUE. For some opcodes, there are special alternative opcodes for this
7557 case. For anything else, we wrap the entire repeated item inside OP_ONCE
7558 brackets. Logically, the '+' notation is just syntactic sugar, taken from
7559 Sun's Java package, but the special opcodes can optimize it.
7560
7561 Some (but not all) possessively repeated subpatterns have already been
7562 completely handled in the code just above. For them, possessive_quantifier
7563 is always FALSE at this stage. Note that the repeated item starts at
7564 tempcode, not at previous, which might be the first part of a string whose
7565 (former) last char we repeated. */
7566
7567 if (possessive_quantifier)
7568 {
7569 int len;
7570
7571 /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
7572 However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
7573 {5,}, or {5,10}). We skip over an EXACT item; if the length of what
7574 remains is greater than zero, there's a further opcode that can be
7575 handled. If not, do nothing, leaving the EXACT alone. */
7576
7577 switch(*tempcode)
7578 {
7579 case OP_TYPEEXACT:
7580 tempcode += PRIV(OP_lengths)[*tempcode] +
7581 ((tempcode[1 + IMM2_SIZE] == OP_PROP
7582 || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
7583 break;
7584
7585 /* CHAR opcodes are used for exacts whose count is 1. */
7586
7587 case OP_CHAR:
7588 case OP_CHARI:
7589 case OP_NOT:
7590 case OP_NOTI:
7591 case OP_EXACT:
7592 case OP_EXACTI:
7593 case OP_NOTEXACT:
7594 case OP_NOTEXACTI:
7595 tempcode += PRIV(OP_lengths)[*tempcode];
7596 #ifdef SUPPORT_UNICODE
7597 if (utf && HAS_EXTRALEN(tempcode[-1]))
7598 tempcode += GET_EXTRALEN(tempcode[-1]);
7599 #endif
7600 break;
7601
7602 /* For the class opcodes, the repeat operator appears at the end;
7603 adjust tempcode to point to it. */
7604
7605 case OP_CLASS:
7606 case OP_NCLASS:
7607 tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
7608 break;
7609
7610 #ifdef SUPPORT_WIDE_CHARS
7611 case OP_XCLASS:
7612 tempcode += GET(tempcode, 1);
7613 break;
7614 #endif
7615 }
7616
7617 /* If tempcode is equal to code (which points to the end of the repeated
7618 item), it means we have skipped an EXACT item but there is no following
7619 QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
7620 all other cases, tempcode will be pointing to the repeat opcode, and will
7621 be less than code, so the value of len will be greater than 0. */
7622
7623 len = (int)(code - tempcode);
7624 if (len > 0)
7625 {
7626 unsigned int repcode = *tempcode;
7627
7628 /* There is a table for possessifying opcodes, all of which are less
7629 than OP_CALLOUT. A zero entry means there is no possessified version.
7630 */
7631
7632 if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
7633 *tempcode = opcode_possessify[repcode];
7634
7635 /* For opcode without a special possessified version, wrap the item in
7636 ONCE brackets. */
7637
7638 else
7639 {
7640 (void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
7641 code += 1 + LINK_SIZE;
7642 len += 1 + LINK_SIZE;
7643 tempcode[0] = OP_ONCE;
7644 *code++ = OP_KET;
7645 PUTINC(code, 0, len);
7646 PUT(tempcode, 1, len);
7647 }
7648 }
7649 }
7650
7651 /* We set the "follows varying string" flag for subsequently encountered
7652 reqcus if it isn't already set and we have just passed a varying length
7653 item. */
7654
7655 END_REPEAT:
7656 cb->req_varyopt |= reqvary;
7657 break;
7658
7659
7660 /* ===================================================================*/
7661 /* Handle a 32-bit data character with a value greater than META_END. */
7662
7663 case META_BIGVALUE:
7664 pptr++;
7665 goto NORMAL_CHAR;
7666
7667
7668 /* ===============================================================*/
7669 /* Handle a back reference by number, which is the meta argument. The
7670 pattern offsets for back references to group numbers less than 10 are held
7671 in a special vector, to avoid using more than two parsed pattern elements
7672 in 64-bit environments. We only need the offset to the first occurrence,
7673 because if that doesn't fail, subsequent ones will also be OK. */
7674
7675 case META_BACKREF:
7676 if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg];
7677 else GETPLUSOFFSET(offset, pptr);
7678
7679 if (meta_arg > cb->bracount)
7680 {
7681 cb->erroroffset = offset;
7682 *errorcodeptr = ERR15; /* Non-existent subpattern */
7683 return 0;
7684 }
7685
7686 /* Come here from named backref handling when the reference is to a
7687 single group (that is, not to a duplicated name). The back reference
7688 data will have already been updated. We must disable firstcu if not
7689 set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'
7690 later. */
7691
7692 HANDLE_SINGLE_REFERENCE:
7693 if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
7694 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
7695 PUT2INC(code, 0, meta_arg);
7696
7697 /* Update the map of back references, and keep the highest one. We
7698 could do this in parse_regex() for numerical back references, but not
7699 for named back references, because we don't know the numbers to which
7700 named back references refer. So we do it all in this function. */
7701
7702 cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;
7703 if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
7704 break;
7705
7706
7707 /* ===============================================================*/
7708 /* Handle recursion by inserting the number of the called group (which is
7709 the meta argument) after OP_RECURSE. At the end of compiling the pattern is
7710 scanned and these numbers are replaced by offsets within the pattern. It is
7711 done like this to avoid problems with forward references and adjusting
7712 offsets when groups are duplicated and moved (as discovered in previous
7713 implementations). Note that a recursion does not have a set first
7714 character. */
7715
7716 case META_RECURSE:
7717 GETPLUSOFFSET(offset, pptr);
7718 if (meta_arg > cb->bracount)
7719 {
7720 cb->erroroffset = offset;
7721 *errorcodeptr = ERR15; /* Non-existent subpattern */
7722 return 0;
7723 }
7724 HANDLE_NUMERICAL_RECURSION:
7725 *code = OP_RECURSE;
7726 PUT(code, 1, meta_arg);
7727 code += 1 + LINK_SIZE;
7728 groupsetfirstcu = FALSE;
7729 cb->had_recurse = TRUE;
7730 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7731 zerofirstcu = firstcu;
7732 zerofirstcuflags = firstcuflags;
7733 break;
7734
7735
7736 /* ===============================================================*/
7737 /* Handle capturing parentheses; the number is the meta argument. */
7738
7739 case META_CAPTURE:
7740 bravalue = OP_CBRA;
7741 skipunits = IMM2_SIZE;
7742 PUT2(code, 1+LINK_SIZE, meta_arg);
7743 cb->lastcapture = meta_arg;
7744 goto GROUP_PROCESS_NOTE_EMPTY;
7745
7746
7747 /* ===============================================================*/
7748 /* Handle escape sequence items. For ones like \d, the ESC_values are
7749 arranged to be the same as the corresponding OP_values in the default case
7750 when PCRE2_UCP is not set (which is the only case in which they will appear
7751 here).
7752
7753 Note: \Q and \E are never seen here, as they were dealt with in
7754 parse_pattern(). Neither are numerical back references or recursions, which
7755 were turned into META_BACKREF or META_RECURSE items, respectively. \k and
7756 \g, when followed by names, are turned into META_BACKREF_BYNAME or
7757 META_RECURSE_BYNAME. */
7758
7759 case META_ESCAPE:
7760
7761 /* We can test for escape sequences that consume a character because their
7762 values lie between ESC_b and ESC_Z; this may have to change if any new ones
7763 are ever created. For these sequences, we disable the setting of a first
7764 character if it hasn't already been set. */
7765
7766 if (meta_arg > ESC_b && meta_arg < ESC_Z)
7767 {
7768 matched_char = TRUE;
7769 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7770 }
7771
7772 /* Set values to reset to if this is followed by a zero repeat. */
7773
7774 zerofirstcu = firstcu;
7775 zerofirstcuflags = firstcuflags;
7776 zeroreqcu = reqcu;
7777 zeroreqcuflags = reqcuflags;
7778
7779 /* If Unicode is not supported, \P and \p are not allowed and are
7780 faulted at parse time, so will never appear here. */
7781
7782 #ifdef SUPPORT_UNICODE
7783 if (meta_arg == ESC_P || meta_arg == ESC_p)
7784 {
7785 uint32_t ptype = *(++pptr) >> 16;
7786 uint32_t pdata = *pptr & 0xffff;
7787
7788 /* The special case of \p{Any} is compiled to OP_ALLANY so as to benefit
7789 from the auto-anchoring code. */
7790
7791 if (meta_arg == ESC_p && ptype == PT_ANY)
7792 {
7793 *code++ = OP_ALLANY;
7794 }
7795 else
7796 {
7797 *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
7798 *code++ = ptype;
7799 *code++ = pdata;
7800 }
7801 break; /* End META_ESCAPE */
7802 }
7803 #endif
7804
7805 /* \K is forbidden in lookarounds since 10.38 because that's what Perl has
7806 done. However, there's an option, in case anyone was relying on it. */
7807
7808 if (cb->assert_depth > 0 && meta_arg == ESC_K &&
7809 (cb->cx->extra_options & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) == 0)
7810 {
7811 *errorcodeptr = ERR99;
7812 return 0;
7813 }
7814
7815 /* For the rest (including \X when Unicode is supported - if not it's
7816 faulted at parse time), the OP value is the escape value when PCRE2_UCP is
7817 not set; if it is set, these escapes do not show up here because they are
7818 converted into Unicode property tests in parse_regex(). Note that \b and \B
7819 do a one-character lookbehind, and \A also behaves as if it does. */
7820
7821 if (meta_arg == ESC_C) cb->external_flags |= PCRE2_HASBKC; /* Record */
7822 if ((meta_arg == ESC_b || meta_arg == ESC_B || meta_arg == ESC_A) &&
7823 cb->max_lookbehind == 0)
7824 cb->max_lookbehind = 1;
7825
7826 /* In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY
7827 instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds. */
7828
7829 #if PCRE2_CODE_UNIT_WIDTH == 32
7830 *code++ = (meta_arg == ESC_C)? OP_ALLANY : meta_arg;
7831 #else
7832 *code++ = (!utf && meta_arg == ESC_C)? OP_ALLANY : meta_arg;
7833 #endif
7834 break; /* End META_ESCAPE */
7835
7836
7837 /* ===================================================================*/
7838 /* Handle an unrecognized meta value. A parsed pattern value less than
7839 META_END is a literal. Otherwise we have a problem. */
7840
7841 default:
7842 if (meta >= META_END)
7843 {
7844 #ifdef DEBUG_SHOW_PARSED
7845 fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x\n", *pptr);
7846 #endif
7847 *errorcodeptr = ERR89; /* Internal error - unrecognized. */
7848 return 0;
7849 }
7850
7851 /* Handle a literal character. We come here by goto in the case of a
7852 32-bit, non-UTF character whose value is greater than META_END. */
7853
7854 NORMAL_CHAR:
7855 meta = *pptr; /* Get the full 32 bits */
7856 NORMAL_CHAR_SET: /* Character is already in meta */
7857 matched_char = TRUE;
7858
7859 /* For caseless UTF or UCP mode, check whether this character has more than
7860 one other case. If so, generate a special OP_PROP item instead of OP_CHARI.
7861 */
7862
7863 #ifdef SUPPORT_UNICODE
7864 if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
7865 {
7866 uint32_t caseset = UCD_CASESET(meta);
7867 if (caseset != 0)
7868 {
7869 *code++ = OP_PROP;
7870 *code++ = PT_CLIST;
7871 *code++ = caseset;
7872 if (firstcuflags == REQ_UNSET)
7873 firstcuflags = zerofirstcuflags = REQ_NONE;
7874 break; /* End handling this meta item */
7875 }
7876 }
7877 #endif
7878
7879 /* Caseful matches, or caseless and not one of the multicase characters. We
7880 come here by goto in the case of a positive class that contains only
7881 case-partners of a character with just two cases; matched_char has already
7882 been set TRUE and options fudged if necessary. */
7883
7884 CLASS_CASELESS_CHAR:
7885
7886 /* Get the character's code units into mcbuffer, with the length in
7887 mclength. When not in UTF mode, the length is always 1. */
7888
7889 #ifdef SUPPORT_UNICODE
7890 if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
7891 #endif
7892 {
7893 mclength = 1;
7894 mcbuffer[0] = meta;
7895 }
7896
7897 /* Generate the appropriate code */
7898
7899 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
7900 memcpy(code, mcbuffer, CU2BYTES(mclength));
7901 code += mclength;
7902
7903 /* Remember if \r or \n were seen */
7904
7905 if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
7906 cb->external_flags |= PCRE2_HASCRORLF;
7907
7908 /* Set the first and required code units appropriately. If no previous
7909 first code unit, set it from this character, but revert to none on a zero
7910 repeat. Otherwise, leave the firstcu value alone, and don't change it on
7911 a zero repeat. */
7912
7913 if (firstcuflags == REQ_UNSET)
7914 {
7915 zerofirstcuflags = REQ_NONE;
7916 zeroreqcu = reqcu;
7917 zeroreqcuflags = reqcuflags;
7918
7919 /* If the character is more than one code unit long, we can set a single
7920 firstcu only if it is not to be matched caselessly. Multiple possible
7921 starting code units may be picked up later in the studying code. */
7922
7923 if (mclength == 1 || req_caseopt == 0)
7924 {
7925 firstcu = mcbuffer[0];
7926 firstcuflags = req_caseopt;
7927 if (mclength != 1)
7928 {
7929 reqcu = code[-1];
7930 reqcuflags = cb->req_varyopt;
7931 }
7932 }
7933 else firstcuflags = reqcuflags = REQ_NONE;
7934 }
7935
7936 /* firstcu was previously set; we can set reqcu only if the length is
7937 1 or the matching is caseful. */
7938
7939 else
7940 {
7941 zerofirstcu = firstcu;
7942 zerofirstcuflags = firstcuflags;
7943 zeroreqcu = reqcu;
7944 zeroreqcuflags = reqcuflags;
7945 if (mclength == 1 || req_caseopt == 0)
7946 {
7947 reqcu = code[-1];
7948 reqcuflags = req_caseopt | cb->req_varyopt;
7949 }
7950 }
7951
7952 /* If caselessness was temporarily instated, reset it. */
7953
7954 if (reset_caseful)
7955 {
7956 options &= ~PCRE2_CASELESS;
7957 req_caseopt = 0;
7958 reset_caseful = FALSE;
7959 }
7960
7961 break; /* End literal character handling */
7962 } /* End of big switch */
7963 } /* End of big loop */
7964
7965 /* Control never reaches here. */
7966 }
7967
7968
7969
7970 /*************************************************
7971 * Compile regex: a sequence of alternatives *
7972 *************************************************/
7973
7974 /* On entry, pptr is pointing past the bracket meta, but on return it points to
7975 the closing bracket or META_END. The code variable is pointing at the code unit
7976 into which the BRA operator has been stored. This function is used during the
7977 pre-compile phase when we are trying to find out the amount of memory needed,
7978 as well as during the real compile phase. The value of lengthptr distinguishes
7979 the two phases.
7980
7981 Arguments:
7982 options option bits, including any changes for this subpattern
7983 codeptr -> the address of the current code pointer
7984 pptrptr -> the address of the current parsed pattern pointer
7985 errorcodeptr -> pointer to error code variable
7986 skipunits skip this many code units at start (for brackets and OP_COND)
7987 firstcuptr place to put the first required code unit
7988 firstcuflagsptr place to put the first code unit flags, or a negative number
7989 reqcuptr place to put the last required code unit
7990 reqcuflagsptr place to put the last required code unit flags, or a negative number
7991 bcptr pointer to the chain of currently open branches
7992 cb points to the data block with tables pointers etc.
7993 lengthptr NULL during the real compile phase
7994 points to length accumulator during pre-compile phase
7995
7996 Returns: 0 There has been an error
7997 +1 Success, this group must match at least one character
7998 -1 Success, this group may match an empty string
7999 */
8000
8001 static int
compile_regex(uint32_t options,PCRE2_UCHAR ** codeptr,uint32_t ** pptrptr,int * errorcodeptr,uint32_t skipunits,uint32_t * firstcuptr,int32_t * firstcuflagsptr,uint32_t * reqcuptr,int32_t * reqcuflagsptr,branch_chain * bcptr,compile_block * cb,PCRE2_SIZE * lengthptr)8002 compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
8003 int *errorcodeptr, uint32_t skipunits, uint32_t *firstcuptr,
8004 int32_t *firstcuflagsptr, uint32_t *reqcuptr,int32_t *reqcuflagsptr,
8005 branch_chain *bcptr, compile_block *cb, PCRE2_SIZE *lengthptr)
8006 {
8007 PCRE2_UCHAR *code = *codeptr;
8008 PCRE2_UCHAR *last_branch = code;
8009 PCRE2_UCHAR *start_bracket = code;
8010 BOOL lookbehind;
8011 open_capitem capitem;
8012 int capnumber = 0;
8013 int okreturn = 1;
8014 uint32_t *pptr = *pptrptr;
8015 uint32_t firstcu, reqcu;
8016 uint32_t lookbehindlength;
8017 int32_t firstcuflags, reqcuflags;
8018 uint32_t branchfirstcu, branchreqcu;
8019 int32_t branchfirstcuflags, branchreqcuflags;
8020 PCRE2_SIZE length;
8021 branch_chain bc;
8022
8023 /* If set, call the external function that checks for stack availability. */
8024
8025 if (cb->cx->stack_guard != NULL &&
8026 cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
8027 {
8028 *errorcodeptr= ERR33;
8029 return 0;
8030 }
8031
8032 /* Miscellaneous initialization */
8033
8034 bc.outer = bcptr;
8035 bc.current_branch = code;
8036
8037 firstcu = reqcu = 0;
8038 firstcuflags = reqcuflags = REQ_UNSET;
8039
8040 /* Accumulate the length for use in the pre-compile phase. Start with the
8041 length of the BRA and KET and any extra code units that are required at the
8042 beginning. We accumulate in a local variable to save frequent testing of
8043 lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
8044 start and end of each alternative, because compiled items are discarded during
8045 the pre-compile phase so that the workspace is not exceeded. */
8046
8047 length = 2 + 2*LINK_SIZE + skipunits;
8048
8049 /* Remember if this is a lookbehind assertion, and if it is, save its length
8050 and skip over the pattern offset. */
8051
8052 lookbehind = *code == OP_ASSERTBACK ||
8053 *code == OP_ASSERTBACK_NOT ||
8054 *code == OP_ASSERTBACK_NA;
8055
8056 if (lookbehind)
8057 {
8058 lookbehindlength = META_DATA(pptr[-1]);
8059 pptr += SIZEOFFSET;
8060 }
8061 else lookbehindlength = 0;
8062
8063 /* If this is a capturing subpattern, add to the chain of open capturing items
8064 so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
8065 need be tested here; changing this opcode to one of its variants, e.g.
8066 OP_SCBRAPOS, happens later, after the group has been compiled. */
8067
8068 if (*code == OP_CBRA)
8069 {
8070 capnumber = GET2(code, 1 + LINK_SIZE);
8071 capitem.number = capnumber;
8072 capitem.next = cb->open_caps;
8073 capitem.assert_depth = cb->assert_depth;
8074 cb->open_caps = &capitem;
8075 }
8076
8077 /* Offset is set zero to mark that this bracket is still open */
8078
8079 PUT(code, 1, 0);
8080 code += 1 + LINK_SIZE + skipunits;
8081
8082 /* Loop for each alternative branch */
8083
8084 for (;;)
8085 {
8086 int branch_return;
8087
8088 /* Insert OP_REVERSE if this is as lookbehind assertion. */
8089
8090 if (lookbehind && lookbehindlength > 0)
8091 {
8092 *code++ = OP_REVERSE;
8093 PUTINC(code, 0, lookbehindlength);
8094 length += 1 + LINK_SIZE;
8095 }
8096
8097 /* Now compile the branch; in the pre-compile phase its length gets added
8098 into the length. */
8099
8100 if ((branch_return =
8101 compile_branch(&options, &code, &pptr, errorcodeptr, &branchfirstcu,
8102 &branchfirstcuflags, &branchreqcu, &branchreqcuflags, &bc,
8103 cb, (lengthptr == NULL)? NULL : &length)) == 0)
8104 return 0;
8105
8106 /* If a branch can match an empty string, so can the whole group. */
8107
8108 if (branch_return < 0) okreturn = -1;
8109
8110 /* In the real compile phase, there is some post-processing to be done. */
8111
8112 if (lengthptr == NULL)
8113 {
8114 /* If this is the first branch, the firstcu and reqcu values for the
8115 branch become the values for the regex. */
8116
8117 if (*last_branch != OP_ALT)
8118 {
8119 firstcu = branchfirstcu;
8120 firstcuflags = branchfirstcuflags;
8121 reqcu = branchreqcu;
8122 reqcuflags = branchreqcuflags;
8123 }
8124
8125 /* If this is not the first branch, the first char and reqcu have to
8126 match the values from all the previous branches, except that if the
8127 previous value for reqcu didn't have REQ_VARY set, it can still match,
8128 and we set REQ_VARY for the group from this branch's value. */
8129
8130 else
8131 {
8132 /* If we previously had a firstcu, but it doesn't match the new branch,
8133 we have to abandon the firstcu for the regex, but if there was
8134 previously no reqcu, it takes on the value of the old firstcu. */
8135
8136 if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
8137 {
8138 if (firstcuflags >= 0)
8139 {
8140 if (reqcuflags < 0)
8141 {
8142 reqcu = firstcu;
8143 reqcuflags = firstcuflags;
8144 }
8145 }
8146 firstcuflags = REQ_NONE;
8147 }
8148
8149 /* If we (now or from before) have no firstcu, a firstcu from the
8150 branch becomes a reqcu if there isn't a branch reqcu. */
8151
8152 if (firstcuflags < 0 && branchfirstcuflags >= 0 &&
8153 branchreqcuflags < 0)
8154 {
8155 branchreqcu = branchfirstcu;
8156 branchreqcuflags = branchfirstcuflags;
8157 }
8158
8159 /* Now ensure that the reqcus match */
8160
8161 if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
8162 reqcu != branchreqcu)
8163 reqcuflags = REQ_NONE;
8164 else
8165 {
8166 reqcu = branchreqcu;
8167 reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY if present */
8168 }
8169 }
8170 }
8171
8172 /* Handle reaching the end of the expression, either ')' or end of pattern.
8173 In the real compile phase, go back through the alternative branches and
8174 reverse the chain of offsets, with the field in the BRA item now becoming an
8175 offset to the first alternative. If there are no alternatives, it points to
8176 the end of the group. The length in the terminating ket is always the length
8177 of the whole bracketed item. Return leaving the pointer at the terminating
8178 char. */
8179
8180 if (META_CODE(*pptr) != META_ALT)
8181 {
8182 if (lengthptr == NULL)
8183 {
8184 PCRE2_SIZE branch_length = code - last_branch;
8185 do
8186 {
8187 PCRE2_SIZE prev_length = GET(last_branch, 1);
8188 PUT(last_branch, 1, branch_length);
8189 branch_length = prev_length;
8190 last_branch -= branch_length;
8191 }
8192 while (branch_length > 0);
8193 }
8194
8195 /* Fill in the ket */
8196
8197 *code = OP_KET;
8198 PUT(code, 1, (int)(code - start_bracket));
8199 code += 1 + LINK_SIZE;
8200
8201 /* If it was a capturing subpattern, remove the block from the chain. */
8202
8203 if (capnumber > 0) cb->open_caps = cb->open_caps->next;
8204
8205 /* Set values to pass back */
8206
8207 *codeptr = code;
8208 *pptrptr = pptr;
8209 *firstcuptr = firstcu;
8210 *firstcuflagsptr = firstcuflags;
8211 *reqcuptr = reqcu;
8212 *reqcuflagsptr = reqcuflags;
8213 if (lengthptr != NULL)
8214 {
8215 if (OFLOW_MAX - *lengthptr < length)
8216 {
8217 *errorcodeptr = ERR20;
8218 return 0;
8219 }
8220 *lengthptr += length;
8221 }
8222 return okreturn;
8223 }
8224
8225 /* Another branch follows. In the pre-compile phase, we can move the code
8226 pointer back to where it was for the start of the first branch. (That is,
8227 pretend that each branch is the only one.)
8228
8229 In the real compile phase, insert an ALT node. Its length field points back
8230 to the previous branch while the bracket remains open. At the end the chain
8231 is reversed. It's done like this so that the start of the bracket has a
8232 zero offset until it is closed, making it possible to detect recursion. */
8233
8234 if (lengthptr != NULL)
8235 {
8236 code = *codeptr + 1 + LINK_SIZE + skipunits;
8237 length += 1 + LINK_SIZE;
8238 }
8239 else
8240 {
8241 *code = OP_ALT;
8242 PUT(code, 1, (int)(code - last_branch));
8243 bc.current_branch = last_branch = code;
8244 code += 1 + LINK_SIZE;
8245 }
8246
8247 /* Set the lookbehind length (if not in a lookbehind the value will be zero)
8248 and then advance past the vertical bar. */
8249
8250 lookbehindlength = META_DATA(*pptr);
8251 pptr++;
8252 }
8253 /* Control never reaches here */
8254 }
8255
8256
8257
8258 /*************************************************
8259 * Check for anchored pattern *
8260 *************************************************/
8261
8262 /* Try to find out if this is an anchored regular expression. Consider each
8263 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8264 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8265 it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8266 be found, because ^ generates OP_CIRCM in that mode.
8267
8268 We can also consider a regex to be anchored if OP_SOM starts all its branches.
8269 This is the code for \G, which means "match at start of match position, taking
8270 into account the match offset".
8271
8272 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8273 because that will try the rest of the pattern at all possible matching points,
8274 so there is no point trying again.... er ....
8275
8276 .... except when the .* appears inside capturing parentheses, and there is a
8277 subsequent back reference to those parentheses. We haven't enough information
8278 to catch that case precisely.
8279
8280 At first, the best we could do was to detect when .* was in capturing brackets
8281 and the highest back reference was greater than or equal to that level.
8282 However, by keeping a bitmap of the first 31 back references, we can catch some
8283 of the more common cases more precisely.
8284
8285 ... A second exception is when the .* appears inside an atomic group, because
8286 this prevents the number of characters it matches from being adjusted.
8287
8288 Arguments:
8289 code points to start of the compiled pattern
8290 bracket_map a bitmap of which brackets we are inside while testing; this
8291 handles up to substring 31; after that we just have to take
8292 the less precise approach
8293 cb points to the compile data block
8294 atomcount atomic group level
8295 inassert TRUE if in an assertion
8296
8297 Returns: TRUE or FALSE
8298 */
8299
8300 static BOOL
is_anchored(PCRE2_SPTR code,unsigned int bracket_map,compile_block * cb,int atomcount,BOOL inassert)8301 is_anchored(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8302 int atomcount, BOOL inassert)
8303 {
8304 do {
8305 PCRE2_SPTR scode = first_significant_code(
8306 code + PRIV(OP_lengths)[*code], FALSE);
8307 int op = *scode;
8308
8309 /* Non-capturing brackets */
8310
8311 if (op == OP_BRA || op == OP_BRAPOS ||
8312 op == OP_SBRA || op == OP_SBRAPOS)
8313 {
8314 if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8315 return FALSE;
8316 }
8317
8318 /* Capturing brackets */
8319
8320 else if (op == OP_CBRA || op == OP_CBRAPOS ||
8321 op == OP_SCBRA || op == OP_SCBRAPOS)
8322 {
8323 int n = GET2(scode, 1+LINK_SIZE);
8324 int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8325 if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE;
8326 }
8327
8328 /* Positive forward assertion */
8329
8330 else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8331 {
8332 if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8333 }
8334
8335 /* Condition. If there is no second branch, it can't be anchored. */
8336
8337 else if (op == OP_COND || op == OP_SCOND)
8338 {
8339 if (scode[GET(scode,1)] != OP_ALT) return FALSE;
8340 if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8341 return FALSE;
8342 }
8343
8344 /* Atomic groups */
8345
8346 else if (op == OP_ONCE)
8347 {
8348 if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert))
8349 return FALSE;
8350 }
8351
8352 /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8353 it isn't in brackets that are or may be referenced or inside an atomic
8354 group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
8355 because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
8356 with the subject "aab", which matches "b", i.e. not at the start of a line.
8357 There is also an option that disables auto-anchoring. */
8358
8359 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8360 op == OP_TYPEPOSSTAR))
8361 {
8362 if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
8363 atomcount > 0 || cb->had_pruneorskip || inassert ||
8364 (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8365 return FALSE;
8366 }
8367
8368 /* Check for explicit anchoring */
8369
8370 else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8371
8372 code += GET(code, 1);
8373 }
8374 while (*code == OP_ALT); /* Loop for each alternative */
8375 return TRUE;
8376 }
8377
8378
8379
8380 /*************************************************
8381 * Check for starting with ^ or .* *
8382 *************************************************/
8383
8384 /* This is called to find out if every branch starts with ^ or .* so that
8385 "first char" processing can be done to speed things up in multiline
8386 matching and for non-DOTALL patterns that start with .* (which must start at
8387 the beginning or after \n). As in the case of is_anchored() (see above), we
8388 have to take account of back references to capturing brackets that contain .*
8389 because in that case we can't make the assumption. Also, the appearance of .*
8390 inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
8391 or *SKIP does not count, because once again the assumption no longer holds.
8392
8393 Arguments:
8394 code points to start of the compiled pattern or a group
8395 bracket_map a bitmap of which brackets we are inside while testing; this
8396 handles up to substring 31; after that we just have to take
8397 the less precise approach
8398 cb points to the compile data
8399 atomcount atomic group level
8400 inassert TRUE if in an assertion
8401
8402 Returns: TRUE or FALSE
8403 */
8404
8405 static BOOL
is_startline(PCRE2_SPTR code,unsigned int bracket_map,compile_block * cb,int atomcount,BOOL inassert)8406 is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8407 int atomcount, BOOL inassert)
8408 {
8409 do {
8410 PCRE2_SPTR scode = first_significant_code(
8411 code + PRIV(OP_lengths)[*code], FALSE);
8412 int op = *scode;
8413
8414 /* If we are at the start of a conditional assertion group, *both* the
8415 conditional assertion *and* what follows the condition must satisfy the test
8416 for start of line. Other kinds of condition fail. Note that there may be an
8417 auto-callout at the start of a condition. */
8418
8419 if (op == OP_COND)
8420 {
8421 scode += 1 + LINK_SIZE;
8422
8423 if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8424 else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
8425
8426 switch (*scode)
8427 {
8428 case OP_CREF:
8429 case OP_DNCREF:
8430 case OP_RREF:
8431 case OP_DNRREF:
8432 case OP_FAIL:
8433 case OP_FALSE:
8434 case OP_TRUE:
8435 return FALSE;
8436
8437 default: /* Assertion */
8438 if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8439 do scode += GET(scode, 1); while (*scode == OP_ALT);
8440 scode += 1 + LINK_SIZE;
8441 break;
8442 }
8443 scode = first_significant_code(scode, FALSE);
8444 op = *scode;
8445 }
8446
8447 /* Non-capturing brackets */
8448
8449 if (op == OP_BRA || op == OP_BRAPOS ||
8450 op == OP_SBRA || op == OP_SBRAPOS)
8451 {
8452 if (!is_startline(scode, bracket_map, cb, atomcount, inassert))
8453 return FALSE;
8454 }
8455
8456 /* Capturing brackets */
8457
8458 else if (op == OP_CBRA || op == OP_CBRAPOS ||
8459 op == OP_SCBRA || op == OP_SCBRAPOS)
8460 {
8461 int n = GET2(scode, 1+LINK_SIZE);
8462 int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8463 if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
8464 }
8465
8466 /* Positive forward assertions */
8467
8468 else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8469 {
8470 if (!is_startline(scode, bracket_map, cb, atomcount, TRUE))
8471 return FALSE;
8472 }
8473
8474 /* Atomic brackets */
8475
8476 else if (op == OP_ONCE)
8477 {
8478 if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert))
8479 return FALSE;
8480 }
8481
8482 /* .* means "start at start or after \n" if it isn't in atomic brackets or
8483 brackets that may be referenced or an assertion, and as long as the pattern
8484 does not contain *PRUNE or *SKIP, because these break the feature. Consider,
8485 for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
8486 i.e. not at the start of a line. There is also an option that disables this
8487 optimization. */
8488
8489 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8490 {
8491 if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
8492 atomcount > 0 || cb->had_pruneorskip || inassert ||
8493 (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8494 return FALSE;
8495 }
8496
8497 /* Check for explicit circumflex; anything else gives a FALSE result. Note
8498 in particular that this includes atomic brackets OP_ONCE because the number
8499 of characters matched by .* cannot be adjusted inside them. */
8500
8501 else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8502
8503 /* Move on to the next alternative */
8504
8505 code += GET(code, 1);
8506 }
8507 while (*code == OP_ALT); /* Loop for each alternative */
8508 return TRUE;
8509 }
8510
8511
8512
8513 /*************************************************
8514 * Scan compiled regex for recursion reference *
8515 *************************************************/
8516
8517 /* This function scans through a compiled pattern until it finds an instance of
8518 OP_RECURSE.
8519
8520 Arguments:
8521 code points to start of expression
8522 utf TRUE in UTF mode
8523
8524 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
8525 */
8526
8527 static PCRE2_SPTR
find_recurse(PCRE2_SPTR code,BOOL utf)8528 find_recurse(PCRE2_SPTR code, BOOL utf)
8529 {
8530 for (;;)
8531 {
8532 PCRE2_UCHAR c = *code;
8533 if (c == OP_END) return NULL;
8534 if (c == OP_RECURSE) return code;
8535
8536 /* XCLASS is used for classes that cannot be represented just by a bit map.
8537 This includes negated single high-valued characters. CALLOUT_STR is used for
8538 callouts with string arguments. In both cases the length in the table is
8539 zero; the actual length is stored in the compiled code. */
8540
8541 if (c == OP_XCLASS) code += GET(code, 1);
8542 else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
8543
8544 /* Otherwise, we can get the item's length from the table, except that for
8545 repeated character types, we have to test for \p and \P, which have an extra
8546 two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,
8547 we must add in its length. */
8548
8549 else
8550 {
8551 switch(c)
8552 {
8553 case OP_TYPESTAR:
8554 case OP_TYPEMINSTAR:
8555 case OP_TYPEPLUS:
8556 case OP_TYPEMINPLUS:
8557 case OP_TYPEQUERY:
8558 case OP_TYPEMINQUERY:
8559 case OP_TYPEPOSSTAR:
8560 case OP_TYPEPOSPLUS:
8561 case OP_TYPEPOSQUERY:
8562 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
8563 break;
8564
8565 case OP_TYPEPOSUPTO:
8566 case OP_TYPEUPTO:
8567 case OP_TYPEMINUPTO:
8568 case OP_TYPEEXACT:
8569 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
8570 code += 2;
8571 break;
8572
8573 case OP_MARK:
8574 case OP_COMMIT_ARG:
8575 case OP_PRUNE_ARG:
8576 case OP_SKIP_ARG:
8577 case OP_THEN_ARG:
8578 code += code[1];
8579 break;
8580 }
8581
8582 /* Add in the fixed length from the table */
8583
8584 code += PRIV(OP_lengths)[c];
8585
8586 /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
8587 be followed by a multi-unit character. The length in the table is a
8588 minimum, so we have to arrange to skip the extra units. */
8589
8590 #ifdef MAYBE_UTF_MULTI
8591 if (utf) switch(c)
8592 {
8593 case OP_CHAR:
8594 case OP_CHARI:
8595 case OP_NOT:
8596 case OP_NOTI:
8597 case OP_EXACT:
8598 case OP_EXACTI:
8599 case OP_NOTEXACT:
8600 case OP_NOTEXACTI:
8601 case OP_UPTO:
8602 case OP_UPTOI:
8603 case OP_NOTUPTO:
8604 case OP_NOTUPTOI:
8605 case OP_MINUPTO:
8606 case OP_MINUPTOI:
8607 case OP_NOTMINUPTO:
8608 case OP_NOTMINUPTOI:
8609 case OP_POSUPTO:
8610 case OP_POSUPTOI:
8611 case OP_NOTPOSUPTO:
8612 case OP_NOTPOSUPTOI:
8613 case OP_STAR:
8614 case OP_STARI:
8615 case OP_NOTSTAR:
8616 case OP_NOTSTARI:
8617 case OP_MINSTAR:
8618 case OP_MINSTARI:
8619 case OP_NOTMINSTAR:
8620 case OP_NOTMINSTARI:
8621 case OP_POSSTAR:
8622 case OP_POSSTARI:
8623 case OP_NOTPOSSTAR:
8624 case OP_NOTPOSSTARI:
8625 case OP_PLUS:
8626 case OP_PLUSI:
8627 case OP_NOTPLUS:
8628 case OP_NOTPLUSI:
8629 case OP_MINPLUS:
8630 case OP_MINPLUSI:
8631 case OP_NOTMINPLUS:
8632 case OP_NOTMINPLUSI:
8633 case OP_POSPLUS:
8634 case OP_POSPLUSI:
8635 case OP_NOTPOSPLUS:
8636 case OP_NOTPOSPLUSI:
8637 case OP_QUERY:
8638 case OP_QUERYI:
8639 case OP_NOTQUERY:
8640 case OP_NOTQUERYI:
8641 case OP_MINQUERY:
8642 case OP_MINQUERYI:
8643 case OP_NOTMINQUERY:
8644 case OP_NOTMINQUERYI:
8645 case OP_POSQUERY:
8646 case OP_POSQUERYI:
8647 case OP_NOTPOSQUERY:
8648 case OP_NOTPOSQUERYI:
8649 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
8650 break;
8651 }
8652 #else
8653 (void)(utf); /* Keep compiler happy by referencing function argument */
8654 #endif /* MAYBE_UTF_MULTI */
8655 }
8656 }
8657 }
8658
8659
8660
8661 /*************************************************
8662 * Check for asserted fixed first code unit *
8663 *************************************************/
8664
8665 /* During compilation, the "first code unit" settings from forward assertions
8666 are discarded, because they can cause conflicts with actual literals that
8667 follow. However, if we end up without a first code unit setting for an
8668 unanchored pattern, it is worth scanning the regex to see if there is an
8669 initial asserted first code unit. If all branches start with the same asserted
8670 code unit, or with a non-conditional bracket all of whose alternatives start
8671 with the same asserted code unit (recurse ad lib), then we return that code
8672 unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
8673 REQ_NONE in the flags.
8674
8675 Arguments:
8676 code points to start of compiled pattern
8677 flags points to the first code unit flags
8678 inassert non-zero if in an assertion
8679
8680 Returns: the fixed first code unit, or 0 with REQ_NONE in flags
8681 */
8682
8683 static uint32_t
find_firstassertedcu(PCRE2_SPTR code,int32_t * flags,uint32_t inassert)8684 find_firstassertedcu(PCRE2_SPTR code, int32_t *flags, uint32_t inassert)
8685 {
8686 uint32_t c = 0;
8687 int cflags = REQ_NONE;
8688
8689 *flags = REQ_NONE;
8690 do {
8691 uint32_t d;
8692 int dflags;
8693 int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
8694 *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
8695 PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
8696 PCRE2_UCHAR op = *scode;
8697
8698 switch(op)
8699 {
8700 default:
8701 return 0;
8702
8703 case OP_BRA:
8704 case OP_BRAPOS:
8705 case OP_CBRA:
8706 case OP_SCBRA:
8707 case OP_CBRAPOS:
8708 case OP_SCBRAPOS:
8709 case OP_ASSERT:
8710 case OP_ASSERT_NA:
8711 case OP_ONCE:
8712 case OP_SCRIPT_RUN:
8713 d = find_firstassertedcu(scode, &dflags, inassert +
8714 ((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0));
8715 if (dflags < 0)
8716 return 0;
8717 if (cflags < 0) { c = d; cflags = dflags; }
8718 else if (c != d || cflags != dflags) return 0;
8719 break;
8720
8721 case OP_EXACT:
8722 scode += IMM2_SIZE;
8723 /* Fall through */
8724
8725 case OP_CHAR:
8726 case OP_PLUS:
8727 case OP_MINPLUS:
8728 case OP_POSPLUS:
8729 if (inassert == 0) return 0;
8730 if (cflags < 0) { c = scode[1]; cflags = 0; }
8731 else if (c != scode[1]) return 0;
8732 break;
8733
8734 case OP_EXACTI:
8735 scode += IMM2_SIZE;
8736 /* Fall through */
8737
8738 case OP_CHARI:
8739 case OP_PLUSI:
8740 case OP_MINPLUSI:
8741 case OP_POSPLUSI:
8742 if (inassert == 0) return 0;
8743
8744 /* If the character is more than one code unit long, we cannot set its
8745 first code unit when matching caselessly. Later scanning may pick up
8746 multiple code units. */
8747
8748 #ifdef SUPPORT_UNICODE
8749 #if PCRE2_CODE_UNIT_WIDTH == 8
8750 if (scode[1] >= 0x80) return 0;
8751 #elif PCRE2_CODE_UNIT_WIDTH == 16
8752 if (scode[1] >= 0xd800 && scode[1] <= 0xdfff) return 0;
8753 #endif
8754 #endif
8755
8756 if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
8757 else if (c != scode[1]) return 0;
8758 break;
8759 }
8760
8761 code += GET(code, 1);
8762 }
8763 while (*code == OP_ALT);
8764
8765 *flags = cflags;
8766 return c;
8767 }
8768
8769
8770
8771 /*************************************************
8772 * Add an entry to the name/number table *
8773 *************************************************/
8774
8775 /* This function is called between compiling passes to add an entry to the
8776 name/number table, maintaining alphabetical order. Checking for permitted
8777 and forbidden duplicates has already been done.
8778
8779 Arguments:
8780 cb the compile data block
8781 name the name to add
8782 length the length of the name
8783 groupno the group number
8784 tablecount the count of names in the table so far
8785
8786 Returns: nothing
8787 */
8788
8789 static void
add_name_to_table(compile_block * cb,PCRE2_SPTR name,int length,unsigned int groupno,uint32_t tablecount)8790 add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length,
8791 unsigned int groupno, uint32_t tablecount)
8792 {
8793 uint32_t i;
8794 PCRE2_UCHAR *slot = cb->name_table;
8795
8796 for (i = 0; i < tablecount; i++)
8797 {
8798 int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length));
8799 if (crc == 0 && slot[IMM2_SIZE+length] != 0)
8800 crc = -1; /* Current name is a substring */
8801
8802 /* Make space in the table and break the loop for an earlier name. For a
8803 duplicate or later name, carry on. We do this for duplicates so that in the
8804 simple case (when ?(| is not used) they are in order of their numbers. In all
8805 cases they are in the order in which they appear in the pattern. */
8806
8807 if (crc < 0)
8808 {
8809 (void)memmove(slot + cb->name_entry_size, slot,
8810 CU2BYTES((tablecount - i) * cb->name_entry_size));
8811 break;
8812 }
8813
8814 /* Continue the loop for a later or duplicate name */
8815
8816 slot += cb->name_entry_size;
8817 }
8818
8819 PUT2(slot, 0, groupno);
8820 memcpy(slot + IMM2_SIZE, name, CU2BYTES(length));
8821
8822 /* Add a terminating zero and fill the rest of the slot with zeroes so that
8823 the memory is all initialized. Otherwise valgrind moans about uninitialized
8824 memory when saving serialized compiled patterns. */
8825
8826 memset(slot + IMM2_SIZE + length, 0,
8827 CU2BYTES(cb->name_entry_size - length - IMM2_SIZE));
8828 }
8829
8830
8831
8832 /*************************************************
8833 * Skip in parsed pattern *
8834 *************************************************/
8835
8836 /* This function is called to skip parts of the parsed pattern when finding the
8837 length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
8838 the end of the branch, it is called to skip over an internal lookaround or
8839 (DEFINE) group, and it is also called to skip to the end of a class, during
8840 which it will never encounter nested groups (but there's no need to have
8841 special code for that).
8842
8843 When called to find the end of a branch or group, pptr must point to the first
8844 meta code inside the branch, not the branch-starting code. In other cases it
8845 can point to the item that causes the function to be called.
8846
8847 Arguments:
8848 pptr current pointer to skip from
8849 skiptype PSKIP_CLASS when skipping to end of class
8850 PSKIP_ALT when META_ALT ends the skip
8851 PSKIP_KET when only META_KET ends the skip
8852
8853 Returns: new value of pptr
8854 NULL if META_END is reached - should never occur
8855 or for an unknown meta value - likewise
8856 */
8857
8858 static uint32_t *
parsed_skip(uint32_t * pptr,uint32_t skiptype)8859 parsed_skip(uint32_t *pptr, uint32_t skiptype)
8860 {
8861 uint32_t nestlevel = 0;
8862
8863 for (;; pptr++)
8864 {
8865 uint32_t meta = META_CODE(*pptr);
8866
8867 switch(meta)
8868 {
8869 default: /* Just skip over most items */
8870 if (meta < META_END) continue; /* Literal */
8871 break;
8872
8873 /* This should never occur. */
8874
8875 case META_END:
8876 return NULL;
8877
8878 /* The data for these items is variable in length. */
8879
8880 case META_BACKREF: /* Offset is present only if group >= 10 */
8881 if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;
8882 break;
8883
8884 case META_ESCAPE: /* A few escapes are followed by data items. */
8885 switch (META_DATA(*pptr))
8886 {
8887 case ESC_P:
8888 case ESC_p:
8889 pptr += 1;
8890 break;
8891
8892 case ESC_g:
8893 case ESC_k:
8894 pptr += 1 + SIZEOFFSET;
8895 break;
8896 }
8897 break;
8898
8899 case META_MARK: /* Add the length of the name. */
8900 case META_COMMIT_ARG:
8901 case META_PRUNE_ARG:
8902 case META_SKIP_ARG:
8903 case META_THEN_ARG:
8904 pptr += pptr[1];
8905 break;
8906
8907 /* These are the "active" items in this loop. */
8908
8909 case META_CLASS_END:
8910 if (skiptype == PSKIP_CLASS) return pptr;
8911 break;
8912
8913 case META_ATOMIC:
8914 case META_CAPTURE:
8915 case META_COND_ASSERT:
8916 case META_COND_DEFINE:
8917 case META_COND_NAME:
8918 case META_COND_NUMBER:
8919 case META_COND_RNAME:
8920 case META_COND_RNUMBER:
8921 case META_COND_VERSION:
8922 case META_LOOKAHEAD:
8923 case META_LOOKAHEADNOT:
8924 case META_LOOKAHEAD_NA:
8925 case META_LOOKBEHIND:
8926 case META_LOOKBEHINDNOT:
8927 case META_LOOKBEHIND_NA:
8928 case META_NOCAPTURE:
8929 case META_SCRIPT_RUN:
8930 nestlevel++;
8931 break;
8932
8933 case META_ALT:
8934 if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;
8935 break;
8936
8937 case META_KET:
8938 if (nestlevel == 0) return pptr;
8939 nestlevel--;
8940 break;
8941 }
8942
8943 /* The extra data item length for each meta is in a table. */
8944
8945 meta = (meta >> 16) & 0x7fff;
8946 if (meta >= sizeof(meta_extra_lengths)) return NULL;
8947 pptr += meta_extra_lengths[meta];
8948 }
8949 /* Control never reaches here */
8950 return pptr;
8951 }
8952
8953
8954
8955 /*************************************************
8956 * Find length of a parsed group *
8957 *************************************************/
8958
/* This is called for nested groups within a branch of a lookbehind whose
length is being computed. If all the branches in the nested group have the same
length, that is OK. On entry, the pointer must be at the first element after
the group initializing code. On exit it points to META_KET. Caching is used to
improve processing speed when the same capturing group occurs many times.
8964
8965 Arguments:
8966 pptrptr pointer to pointer in the parsed pattern
8967 isinline FALSE if a reference or recursion; TRUE for inline group
8968 errcodeptr pointer to the errorcode
8969 lcptr pointer to the loop counter
8970 group number of captured group or -1 for a non-capturing group
8971 recurses chain of recurse_check to catch mutual recursion
8972 cb pointer to the compile data
8973
8974 Returns: the group length or a negative number
8975 */
8976
8977 static int
get_grouplength(uint32_t ** pptrptr,BOOL isinline,int * errcodeptr,int * lcptr,int group,parsed_recurse_check * recurses,compile_block * cb)8978 get_grouplength(uint32_t **pptrptr, BOOL isinline, int *errcodeptr, int *lcptr,
8979 int group, parsed_recurse_check *recurses, compile_block *cb)
8980 {
8981 int branchlength;
8982 int grouplength = -1;
8983
8984 /* The cache can be used only if there is no possibility of there being two
8985 groups with the same number. We do not need to set the end pointer for a group
8986 that is being processed as a back reference or recursion, but we must do so for
8987 an inline group. */
8988
8989 if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
8990 {
8991 uint32_t groupinfo = cb->groupinfo[group];
8992 if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1;
8993 if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
8994 {
8995 if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
8996 return groupinfo & GI_FIXED_LENGTH_MASK;
8997 }
8998 }
8999
9000 /* Scan the group. In this case we find the end pointer of necessity. */
9001
9002 for(;;)
9003 {
9004 branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
9005 if (branchlength < 0) goto ISNOTFIXED;
9006 if (grouplength == -1) grouplength = branchlength;
9007 else if (grouplength != branchlength) goto ISNOTFIXED;
9008 if (**pptrptr == META_KET) break;
9009 *pptrptr += 1; /* Skip META_ALT */
9010 }
9011
9012 if (group > 0)
9013 cb->groupinfo[group] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
9014 return grouplength;
9015
9016 ISNOTFIXED:
9017 if (group > 0) cb->groupinfo[group] |= GI_NOT_FIXED_LENGTH;
9018 return -1;
9019 }
9020
9021
9022
9023 /*************************************************
9024 * Find length of a parsed branch *
9025 *************************************************/
9026
9027 /* Return a fixed length for a branch in a lookbehind, giving an error if the
9028 length is not fixed. On entry, *pptrptr points to the first element inside the
9029 branch. On exit it is set to point to the ALT or KET.
9030
9031 Arguments:
9032 pptrptr pointer to pointer in the parsed pattern
9033 errcodeptr pointer to error code
9034 lcptr pointer to loop counter
9035 recurses chain of recurse_check to catch mutual recursion
9036 cb pointer to compile block
9037
9038 Returns: the length, or a negative value on error
9039 */
9040
9041 static int
get_branchlength(uint32_t ** pptrptr,int * errcodeptr,int * lcptr,parsed_recurse_check * recurses,compile_block * cb)9042 get_branchlength(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
9043 parsed_recurse_check *recurses, compile_block *cb)
9044 {
9045 int branchlength = 0;
9046 int grouplength;
9047 uint32_t lastitemlength = 0;
9048 uint32_t *pptr = *pptrptr;
9049 PCRE2_SIZE offset;
9050 parsed_recurse_check this_recurse;
9051
9052 /* A large and/or complex regex can take too long to process. This can happen
9053 more often when (?| groups are present in the pattern because their length
9054 cannot be cached. */
9055
9056 if ((*lcptr)++ > 2000)
9057 {
9058 *errcodeptr = ERR35; /* Lookbehind is too complicated */
9059 return -1;
9060 }
9061
9062 /* Scan the branch, accumulating the length. */
9063
9064 for (;; pptr++)
9065 {
9066 parsed_recurse_check *r;
9067 uint32_t *gptr, *gptrend;
9068 uint32_t escape;
9069 uint32_t group = 0;
9070 uint32_t itemlength = 0;
9071
9072 if (*pptr < META_END)
9073 {
9074 itemlength = 1;
9075 }
9076
9077 else switch (META_CODE(*pptr))
9078 {
9079 case META_KET:
9080 case META_ALT:
9081 goto EXIT;
9082
9083 /* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the
9084 actual termination. */
9085
9086 case META_ACCEPT:
9087 case META_FAIL:
9088 pptr = parsed_skip(pptr, PSKIP_ALT);
9089 if (pptr == NULL) goto PARSED_SKIP_FAILED;
9090 goto EXIT;
9091
9092 case META_MARK:
9093 case META_COMMIT_ARG:
9094 case META_PRUNE_ARG:
9095 case META_SKIP_ARG:
9096 case META_THEN_ARG:
9097 pptr += pptr[1] + 1;
9098 break;
9099
9100 case META_CIRCUMFLEX:
9101 case META_COMMIT:
9102 case META_DOLLAR:
9103 case META_PRUNE:
9104 case META_SKIP:
9105 case META_THEN:
9106 break;
9107
9108 case META_OPTIONS:
9109 pptr += 1;
9110 break;
9111
9112 case META_BIGVALUE:
9113 itemlength = 1;
9114 pptr += 1;
9115 break;
9116
9117 case META_CLASS:
9118 case META_CLASS_NOT:
9119 itemlength = 1;
9120 pptr = parsed_skip(pptr, PSKIP_CLASS);
9121 if (pptr == NULL) goto PARSED_SKIP_FAILED;
9122 break;
9123
9124 case META_CLASS_EMPTY_NOT:
9125 case META_DOT:
9126 itemlength = 1;
9127 break;
9128
9129 case META_CALLOUT_NUMBER:
9130 pptr += 3;
9131 break;
9132
9133 case META_CALLOUT_STRING:
9134 pptr += 3 + SIZEOFFSET;
9135 break;
9136
9137 /* Only some escapes consume a character. Of those, \R and \X are never
9138 allowed because they might match more than character. \C is allowed only in
9139 32-bit and non-UTF 8/16-bit modes. */
9140
9141 case META_ESCAPE:
9142 escape = META_DATA(*pptr);
9143 if (escape == ESC_R || escape == ESC_X) return -1;
9144 if (escape > ESC_b && escape < ESC_Z)
9145 {
9146 #if PCRE2_CODE_UNIT_WIDTH != 32
9147 if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C)
9148 {
9149 *errcodeptr = ERR36;
9150 return -1;
9151 }
9152 #endif
9153 itemlength = 1;
9154 if (escape == ESC_p || escape == ESC_P) pptr++; /* Skip prop data */
9155 }
9156 break;
9157
9158 /* Lookaheads do not contribute to the length of this branch, but they may
9159 contain lookbehinds within them whose lengths need to be set. */
9160
9161 case META_LOOKAHEAD:
9162 case META_LOOKAHEADNOT:
9163 case META_LOOKAHEAD_NA:
9164 *errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb, lcptr);
9165 if (*errcodeptr != 0) return -1;
9166
9167 /* Ignore any qualifiers that follow a lookahead assertion. */
9168
9169 switch (pptr[1])
9170 {
9171 case META_ASTERISK:
9172 case META_ASTERISK_PLUS:
9173 case META_ASTERISK_QUERY:
9174 case META_PLUS:
9175 case META_PLUS_PLUS:
9176 case META_PLUS_QUERY:
9177 case META_QUERY:
9178 case META_QUERY_PLUS:
9179 case META_QUERY_QUERY:
9180 pptr++;
9181 break;
9182
9183 case META_MINMAX:
9184 case META_MINMAX_PLUS:
9185 case META_MINMAX_QUERY:
9186 pptr += 3;
9187 break;
9188
9189 default:
9190 break;
9191 }
9192 break;
9193
9194 /* A nested lookbehind does not contribute any length to this lookbehind,
9195 but must itself be checked and have its lengths set. */
9196
9197 case META_LOOKBEHIND:
9198 case META_LOOKBEHINDNOT:
9199 case META_LOOKBEHIND_NA:
9200 if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))
9201 return -1;
9202 break;
9203
9204 /* Back references and recursions are handled by very similar code. At this
9205 stage, the names generated in the parsing pass are available, but the main
9206 name table has not yet been created. So for the named varieties, scan the
9207 list of names in order to get the number of the first one in the pattern,
9208 and whether or not this name is duplicated. */
9209
9210 case META_BACKREF_BYNAME:
9211 if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)
9212 goto ISNOTFIXED;
9213 /* Fall through */
9214
9215 case META_RECURSE_BYNAME:
9216 {
9217 int i;
9218 PCRE2_SPTR name;
9219 BOOL is_dupname = FALSE;
9220 named_group *ng = cb->named_groups;
9221 uint32_t meta_code = META_CODE(*pptr);
9222 uint32_t length = *(++pptr);
9223
9224 GETPLUSOFFSET(offset, pptr);
9225 name = cb->start_pattern + offset;
9226 for (i = 0; i < cb->names_found; i++, ng++)
9227 {
9228 if (length == ng->length && PRIV(strncmp)(name, ng->name, length) == 0)
9229 {
9230 group = ng->number;
9231 is_dupname = ng->isdup;
9232 break;
9233 }
9234 }
9235
9236 if (group == 0)
9237 {
9238 *errcodeptr = ERR15; /* Non-existent subpattern */
9239 cb->erroroffset = offset;
9240 return -1;
9241 }
9242
9243 /* A numerical back reference can be fixed length if duplicate capturing
9244 groups are not being used. A non-duplicate named back reference can also
9245 be handled. */
9246
9247 if (meta_code == META_RECURSE_BYNAME ||
9248 (!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0))
9249 goto RECURSE_OR_BACKREF_LENGTH; /* Handle as a numbered version. */
9250 }
9251 goto ISNOTFIXED; /* Duplicate name or number */
9252
9253 /* The offset values for back references < 10 are in a separate vector
9254 because otherwise they would use more than two parsed pattern elements on
9255 64-bit systems. */
9256
9257 case META_BACKREF:
9258 if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 ||
9259 (cb->external_flags & PCRE2_DUPCAPUSED) != 0)
9260 goto ISNOTFIXED;
9261 group = META_DATA(*pptr);
9262 if (group < 10)
9263 {
9264 offset = cb->small_ref_offset[group];
9265 goto RECURSE_OR_BACKREF_LENGTH;
9266 }
9267
9268 /* Fall through */
9269 /* For groups >= 10 - picking up group twice does no harm. */
9270
9271 /* A true recursion implies not fixed length, but a subroutine call may
9272 be OK. Back reference "recursions" are also failed. */
9273
9274 case META_RECURSE:
9275 group = META_DATA(*pptr);
9276 GETPLUSOFFSET(offset, pptr);
9277
9278 RECURSE_OR_BACKREF_LENGTH:
9279 if (group > cb->bracount)
9280 {
9281 cb->erroroffset = offset;
9282 *errcodeptr = ERR15; /* Non-existent subpattern */
9283 return -1;
9284 }
9285 if (group == 0) goto ISNOTFIXED; /* Local recursion */
9286 for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)
9287 {
9288 if (META_CODE(*gptr) == META_BIGVALUE) gptr++;
9289 else if (*gptr == (META_CAPTURE | group)) break;
9290 }
9291
9292 /* We must start the search for the end of the group at the first meta code
9293 inside the group. Otherwise it will be treated as an enclosed group. */
9294
9295 gptrend = parsed_skip(gptr + 1, PSKIP_KET);
9296 if (gptrend == NULL) goto PARSED_SKIP_FAILED;
9297 if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED; /* Local recursion */
9298 for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;
9299 if (r != NULL) goto ISNOTFIXED; /* Mutual recursion */
9300 this_recurse.prev = recurses;
9301 this_recurse.groupptr = gptr;
9302
9303 /* We do not need to know the position of the end of the group, that is,
9304 gptr is not used after the call to get_grouplength(). Setting the second
9305 argument FALSE stops it scanning for the end when the length can be found
9306 in the cache. */
9307
9308 gptr++;
9309 grouplength = get_grouplength(&gptr, FALSE, errcodeptr, lcptr, group,
9310 &this_recurse, cb);
9311 if (grouplength < 0)
9312 {
9313 if (*errcodeptr == 0) goto ISNOTFIXED;
9314 return -1; /* Error already set */
9315 }
9316 itemlength = grouplength;
9317 break;
9318
9319 /* A (DEFINE) group is never obeyed inline and so it does not contribute to
9320 the length of this branch. Skip from the following item to the next
9321 unpaired ket. */
9322
9323 case META_COND_DEFINE:
9324 pptr = parsed_skip(pptr + 1, PSKIP_KET);
9325 break;
9326
9327 /* Check other nested groups - advance past the initial data for each type
9328 and then seek a fixed length with get_grouplength(). */
9329
9330 case META_COND_NAME:
9331 case META_COND_NUMBER:
9332 case META_COND_RNAME:
9333 case META_COND_RNUMBER:
9334 pptr += 2 + SIZEOFFSET;
9335 goto CHECK_GROUP;
9336
9337 case META_COND_ASSERT:
9338 pptr += 1;
9339 goto CHECK_GROUP;
9340
9341 case META_COND_VERSION:
9342 pptr += 4;
9343 goto CHECK_GROUP;
9344
9345 case META_CAPTURE:
9346 group = META_DATA(*pptr);
9347 /* Fall through */
9348
9349 case META_ATOMIC:
9350 case META_NOCAPTURE:
9351 case META_SCRIPT_RUN:
9352 pptr++;
9353 CHECK_GROUP:
9354 grouplength = get_grouplength(&pptr, TRUE, errcodeptr, lcptr, group,
9355 recurses, cb);
9356 if (grouplength < 0) return -1;
9357 itemlength = grouplength;
9358 break;
9359
9360 /* Exact repetition is OK; variable repetition is not. A repetition of zero
9361 must subtract the length that has already been added. */
9362
9363 case META_MINMAX:
9364 case META_MINMAX_PLUS:
9365 case META_MINMAX_QUERY:
9366 if (pptr[1] == pptr[2])
9367 {
9368 switch(pptr[1])
9369 {
9370 case 0:
9371 branchlength -= lastitemlength;
9372 break;
9373
9374 case 1:
9375 itemlength = 0;
9376 break;
9377
9378 default: /* Check for integer overflow */
9379 if (lastitemlength != 0 && /* Should not occur, but just in case */
9380 INT_MAX/lastitemlength < pptr[1] - 1)
9381 {
9382 *errcodeptr = ERR87; /* Integer overflow; lookbehind too big */
9383 return -1;
9384 }
9385 itemlength = (pptr[1] - 1) * lastitemlength;
9386 break;
9387 }
9388 pptr += 2;
9389 break;
9390 }
9391 /* Fall through */
9392
9393 /* Any other item means this branch does not have a fixed length. */
9394
9395 default:
9396 ISNOTFIXED:
9397 *errcodeptr = ERR25; /* Not fixed length */
9398 return -1;
9399 }
9400
9401 /* Add the item length to the branchlength, checking for integer overflow and
9402 for the branch length exceeding the limit. */
9403
9404 if (INT_MAX - branchlength < (int)itemlength ||
9405 (branchlength += itemlength) > LOOKBEHIND_MAX)
9406 {
9407 *errcodeptr = ERR87;
9408 return -1;
9409 }
9410
9411 /* Save this item length for use if the next item is a quantifier. */
9412
9413 lastitemlength = itemlength;
9414 }
9415
9416 EXIT:
9417 *pptrptr = pptr;
9418 return branchlength;
9419
9420 PARSED_SKIP_FAILED:
9421 *errcodeptr = ERR90;
9422 return -1;
9423 }
9424
9425
9426
9427 /*************************************************
9428 * Set lengths in a lookbehind *
9429 *************************************************/
9430
9431 /* This function is called for each lookbehind, to set the lengths in its
9432 branches. An error occurs if any branch does not have a fixed length that is
9433 less than the maximum (65535). On exit, the pointer must be left on the final
9434 ket.
9435
9436 The function also maintains the max_lookbehind value. Any lookbehind branch
9437 that contains a nested lookbehind may actually look further back than the
9438 length of the branch. The additional amount is passed back from
9439 get_branchlength() as an "extra" value.
9440
9441 Arguments:
9442 pptrptr pointer to pointer in the parsed pattern
9443 errcodeptr pointer to error code
9444 lcptr pointer to loop counter
9445 recurses chain of recurse_check to catch mutual recursion
9446 cb pointer to compile block
9447
9448 Returns: TRUE if all is well
9449 FALSE otherwise, with error code and offset set
9450 */
9451
9452 static BOOL
set_lookbehind_lengths(uint32_t ** pptrptr,int * errcodeptr,int * lcptr,parsed_recurse_check * recurses,compile_block * cb)9453 set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
9454 parsed_recurse_check *recurses, compile_block *cb)
9455 {
9456 PCRE2_SIZE offset;
9457 int branchlength;
9458 uint32_t *bptr = *pptrptr;
9459
9460 READPLUSOFFSET(offset, bptr); /* Offset for error messages */
9461 *pptrptr += SIZEOFFSET;
9462
9463 do
9464 {
9465 *pptrptr += 1;
9466 branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
9467 if (branchlength < 0)
9468 {
9469 /* The errorcode and offset may already be set from a nested lookbehind. */
9470 if (*errcodeptr == 0) *errcodeptr = ERR25;
9471 if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
9472 return FALSE;
9473 }
9474 if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;
9475 *bptr |= branchlength; /* branchlength never more than 65535 */
9476 bptr = *pptrptr;
9477 }
9478 while (*bptr == META_ALT);
9479
9480 return TRUE;
9481 }
9482
9483
9484
9485 /*************************************************
9486 * Check parsed pattern lookbehinds *
9487 *************************************************/
9488
/* This function is called at the end of parsing a pattern if any lookbehinds
were encountered. It scans the parsed pattern for them, calling
set_lookbehind_lengths() for each one. At the start, the errorcode is zero and
the error offset is marked unset. This enables the functions above not to
override settings from deeper nestings.
9494
9495 This function is called recursively from get_branchlength() for lookaheads in
9496 order to process any lookbehinds that they may contain. It stops when it hits a
9497 non-nested closing parenthesis in this case, returning a pointer to it.
9498
9499 Arguments
9500 pptr points to where to start (start of pattern or start of lookahead)
9501 retptr if not NULL, return the ket pointer here
9502 recurses chain of recurse_check to catch mutual recursion
9503 cb points to the compile block
9504 lcptr points to loop counter
9505
9506 Returns: 0 on success, or an errorcode (cb->erroroffset will be set)
9507 */
9508
9509 static int
check_lookbehinds(uint32_t * pptr,uint32_t ** retptr,parsed_recurse_check * recurses,compile_block * cb,int * lcptr)9510 check_lookbehinds(uint32_t *pptr, uint32_t **retptr,
9511 parsed_recurse_check *recurses, compile_block *cb, int *lcptr)
9512 {
9513 int errorcode = 0;
9514 int nestlevel = 0;
9515
9516 cb->erroroffset = PCRE2_UNSET;
9517
9518 for (; *pptr != META_END; pptr++)
9519 {
9520 if (*pptr < META_END) continue; /* Literal */
9521
9522 switch (META_CODE(*pptr))
9523 {
9524 default:
9525 return ERR70; /* Unrecognized meta code */
9526
9527 case META_ESCAPE:
9528 if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
9529 pptr += 1;
9530 break;
9531
9532 case META_KET:
9533 if (--nestlevel < 0)
9534 {
9535 if (retptr != NULL) *retptr = pptr;
9536 return 0;
9537 }
9538 break;
9539
9540 case META_ATOMIC:
9541 case META_CAPTURE:
9542 case META_COND_ASSERT:
9543 case META_LOOKAHEAD:
9544 case META_LOOKAHEADNOT:
9545 case META_LOOKAHEAD_NA:
9546 case META_NOCAPTURE:
9547 case META_SCRIPT_RUN:
9548 nestlevel++;
9549 break;
9550
9551 case META_ACCEPT:
9552 case META_ALT:
9553 case META_ASTERISK:
9554 case META_ASTERISK_PLUS:
9555 case META_ASTERISK_QUERY:
9556 case META_BACKREF:
9557 case META_CIRCUMFLEX:
9558 case META_CLASS:
9559 case META_CLASS_EMPTY:
9560 case META_CLASS_EMPTY_NOT:
9561 case META_CLASS_END:
9562 case META_CLASS_NOT:
9563 case META_COMMIT:
9564 case META_DOLLAR:
9565 case META_DOT:
9566 case META_FAIL:
9567 case META_PLUS:
9568 case META_PLUS_PLUS:
9569 case META_PLUS_QUERY:
9570 case META_PRUNE:
9571 case META_QUERY:
9572 case META_QUERY_PLUS:
9573 case META_QUERY_QUERY:
9574 case META_RANGE_ESCAPED:
9575 case META_RANGE_LITERAL:
9576 case META_SKIP:
9577 case META_THEN:
9578 break;
9579
9580 case META_RECURSE:
9581 pptr += SIZEOFFSET;
9582 break;
9583
9584 case META_BACKREF_BYNAME:
9585 case META_RECURSE_BYNAME:
9586 pptr += 1 + SIZEOFFSET;
9587 break;
9588
9589 case META_COND_DEFINE:
9590 pptr += SIZEOFFSET;
9591 nestlevel++;
9592 break;
9593
9594 case META_COND_NAME:
9595 case META_COND_NUMBER:
9596 case META_COND_RNAME:
9597 case META_COND_RNUMBER:
9598 pptr += 1 + SIZEOFFSET;
9599 nestlevel++;
9600 break;
9601
9602 case META_COND_VERSION:
9603 pptr += 3;
9604 nestlevel++;
9605 break;
9606
9607 case META_CALLOUT_STRING:
9608 pptr += 3 + SIZEOFFSET;
9609 break;
9610
9611 case META_BIGVALUE:
9612 case META_OPTIONS:
9613 case META_POSIX:
9614 case META_POSIX_NEG:
9615 pptr += 1;
9616 break;
9617
9618 case META_MINMAX:
9619 case META_MINMAX_QUERY:
9620 case META_MINMAX_PLUS:
9621 pptr += 2;
9622 break;
9623
9624 case META_CALLOUT_NUMBER:
9625 pptr += 3;
9626 break;
9627
9628 case META_MARK:
9629 case META_COMMIT_ARG:
9630 case META_PRUNE_ARG:
9631 case META_SKIP_ARG:
9632 case META_THEN_ARG:
9633 pptr += 1 + pptr[1];
9634 break;
9635
9636 case META_LOOKBEHIND:
9637 case META_LOOKBEHINDNOT:
9638 case META_LOOKBEHIND_NA:
9639 if (!set_lookbehind_lengths(&pptr, &errorcode, lcptr, recurses, cb))
9640 return errorcode;
9641 break;
9642 }
9643 }
9644
9645 return 0;
9646 }
9647
9648
9649
9650 /*************************************************
9651 * External function to compile a pattern *
9652 *************************************************/
9653
9654 /* This function reads a regular expression in the form of a string and returns
9655 a pointer to a block of store holding a compiled version of the expression.
9656
9657 Arguments:
9658 pattern the regular expression
9659 patlen the length of the pattern, or PCRE2_ZERO_TERMINATED
9660 options option bits
9661 errorptr pointer to errorcode
9662 erroroffset pointer to error offset
9663 ccontext points to a compile context or is NULL
9664
9665 Returns: pointer to compiled data block, or NULL on error,
9666 with errorcode and erroroffset set
9667 */
9668
9669 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_compile(PCRE2_SPTR pattern,PCRE2_SIZE patlen,uint32_t options,int * errorptr,PCRE2_SIZE * erroroffset,pcre2_compile_context * ccontext)9670 pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
9671 int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
9672 {
9673 BOOL utf; /* Set TRUE for UTF mode */
9674 BOOL ucp; /* Set TRUE for UCP mode */
9675 BOOL has_lookbehind = FALSE; /* Set TRUE if a lookbehind is found */
9676 BOOL zero_terminated; /* Set TRUE for zero-terminated pattern */
9677 pcre2_real_code *re = NULL; /* What we will return */
9678 compile_block cb; /* "Static" compile-time data */
9679 const uint8_t *tables; /* Char tables base pointer */
9680
9681 PCRE2_UCHAR *code; /* Current pointer in compiled code */
9682 PCRE2_SPTR codestart; /* Start of compiled code */
9683 PCRE2_SPTR ptr; /* Current pointer in pattern */
9684 uint32_t *pptr; /* Current pointer in parsed pattern */
9685
9686 PCRE2_SIZE length = 1; /* Allow for final END opcode */
9687 PCRE2_SIZE usedlength; /* Actual length used */
9688 PCRE2_SIZE re_blocksize; /* Size of memory block */
9689 PCRE2_SIZE big32count = 0; /* 32-bit literals >= 0x80000000 */
9690 PCRE2_SIZE parsed_size_needed; /* Needed for parsed pattern */
9691
9692 int32_t firstcuflags, reqcuflags; /* Type of first/req code unit */
9693 uint32_t firstcu, reqcu; /* Value of first/req code unit */
9694 uint32_t setflags = 0; /* NL and BSR set flags */
9695
9696 uint32_t skipatstart; /* When checking (*UTF) etc */
9697 uint32_t limit_heap = UINT32_MAX;
9698 uint32_t limit_match = UINT32_MAX; /* Unset match limits */
9699 uint32_t limit_depth = UINT32_MAX;
9700
9701 int newline = 0; /* Unset; can be set by the pattern */
9702 int bsr = 0; /* Unset; can be set by the pattern */
9703 int errorcode = 0; /* Initialize to avoid compiler warn */
9704 int regexrc; /* Return from compile */
9705
9706 uint32_t i; /* Local loop counter */
9707
9708 /* Comments at the head of this file explain about these variables. */
9709
9710 uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];
9711 uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];
9712 named_group named_groups[NAMED_GROUP_LIST_SIZE];
9713
9714 /* The workspace is used in different ways in the different compiling phases.
9715 It needs to be 16-bit aligned for the preliminary parsing scan. */
9716
9717 uint32_t c16workspace[C16_WORK_SIZE];
9718 PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace;
9719
9720
9721 /* -------------- Check arguments and set up the pattern ----------------- */
9722
9723 /* There must be error code and offset pointers. */
9724
9725 if (errorptr == NULL || erroroffset == NULL) return NULL;
9726 *errorptr = ERR0;
9727 *erroroffset = 0;
9728
9729 /* There must be a pattern! */
9730
9731 if (pattern == NULL)
9732 {
9733 *errorptr = ERR16;
9734 return NULL;
9735 }
9736
9737 /* A NULL compile context means "use a default context" */
9738
9739 if (ccontext == NULL)
9740 ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
9741
9742 /* PCRE2_MATCH_INVALID_UTF implies UTF */
9743
9744 if ((options & PCRE2_MATCH_INVALID_UTF) != 0) options |= PCRE2_UTF;
9745
9746 /* Check that all undefined public option bits are zero. */
9747
9748 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 ||
9749 (ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)
9750 {
9751 *errorptr = ERR17;
9752 return NULL;
9753 }
9754
9755 if ((options & PCRE2_LITERAL) != 0 &&
9756 ((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 ||
9757 (ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))
9758 {
9759 *errorptr = ERR92;
9760 return NULL;
9761 }
9762
9763 /* A zero-terminated pattern is indicated by the special length value
9764 PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */
9765
9766 if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))
9767 patlen = PRIV(strlen)(pattern);
9768
9769 if (patlen > ccontext->max_pattern_length)
9770 {
9771 *errorptr = ERR88;
9772 return NULL;
9773 }
9774
9775 /* From here on, all returns from this function should end up going via the
9776 EXIT label. */
9777
9778
9779 /* ------------ Initialize the "static" compile data -------------- */
9780
9781 tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
9782
9783 cb.lcc = tables + lcc_offset; /* Individual */
9784 cb.fcc = tables + fcc_offset; /* character */
9785 cb.cbits = tables + cbits_offset; /* tables */
9786 cb.ctypes = tables + ctypes_offset;
9787
9788 cb.assert_depth = 0;
9789 cb.bracount = 0;
9790 cb.cx = ccontext;
9791 cb.dupnames = FALSE;
9792 cb.end_pattern = pattern + patlen;
9793 cb.erroroffset = 0;
9794 cb.external_flags = 0;
9795 cb.external_options = options;
9796 cb.groupinfo = stack_groupinfo;
9797 cb.had_recurse = FALSE;
9798 cb.lastcapture = 0;
9799 cb.max_lookbehind = 0;
9800 cb.name_entry_size = 0;
9801 cb.name_table = NULL;
9802 cb.named_groups = named_groups;
9803 cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
9804 cb.names_found = 0;
9805 cb.open_caps = NULL;
9806 cb.parens_depth = 0;
9807 cb.parsed_pattern = stack_parsed_pattern;
9808 cb.req_varyopt = 0;
9809 cb.start_code = cworkspace;
9810 cb.start_pattern = pattern;
9811 cb.start_workspace = cworkspace;
9812 cb.workspace_size = COMPILE_WORK_SIZE;
9813
9814 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
9815 references to help in deciding whether (.*) can be treated as anchored or not.
9816 */
9817
9818 cb.top_backref = 0;
9819 cb.backref_map = 0;
9820
9821 /* Escape sequences \1 to \9 are always back references, but as they are only
9822 two characters long, only two elements can be used in the parsed_pattern
9823 vector. The first contains the reference, and we'd like to use the second to
9824 record the offset in the pattern, so that forward references to non-existent
9825 groups can be diagnosed later with an offset. However, on 64-bit systems,
9826 PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first
9827 occurrence of \1 to \9, indexed by the second parsed_pattern value. All other
9828 references have enough space for the offset to be put into the parsed pattern.
9829 */
9830
9831 for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET;
9832
9833
9834 /* --------------- Start looking at the pattern --------------- */
9835
9836 /* Unless PCRE2_LITERAL is set, check for global one-time option settings at
9837 the start of the pattern, and remember the offset to the actual regex. With
9838 valgrind support, make the terminator of a zero-terminated pattern
9839 inaccessible. This catches bugs that would otherwise only show up for
9840 non-zero-terminated patterns. */
9841
9842 #ifdef SUPPORT_VALGRIND
9843 if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1));
9844 #endif
9845
9846 ptr = pattern;
9847 skipatstart = 0;
9848
9849 if ((options & PCRE2_LITERAL) == 0)
9850 {
9851 while (patlen - skipatstart >= 2 &&
9852 ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
9853 ptr[skipatstart+1] == CHAR_ASTERISK)
9854 {
9855 for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
9856 {
9857 uint32_t c, pp;
9858 pso *p = pso_list + i;
9859
9860 if (patlen - skipatstart - 2 >= p->length &&
9861 PRIV(strncmp_c8)(ptr + skipatstart + 2, (char *)(p->name),
9862 p->length) == 0)
9863 {
9864 skipatstart += p->length + 2;
9865 switch(p->type)
9866 {
9867 case PSO_OPT:
9868 cb.external_options |= p->value;
9869 break;
9870
9871 case PSO_FLG:
9872 setflags |= p->value;
9873 break;
9874
9875 case PSO_NL:
9876 newline = p->value;
9877 setflags |= PCRE2_NL_SET;
9878 break;
9879
9880 case PSO_BSR:
9881 bsr = p->value;
9882 setflags |= PCRE2_BSR_SET;
9883 break;
9884
9885 case PSO_LIMM:
9886 case PSO_LIMD:
9887 case PSO_LIMH:
9888 c = 0;
9889 pp = skipatstart;
9890 if (!IS_DIGIT(ptr[pp]))
9891 {
9892 errorcode = ERR60;
9893 ptr += pp;
9894 goto HAD_EARLY_ERROR;
9895 }
9896 while (IS_DIGIT(ptr[pp]))
9897 {
9898 if (c > UINT32_MAX / 10 - 1) break; /* Integer overflow */
9899 c = c*10 + (ptr[pp++] - CHAR_0);
9900 }
9901 if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS)
9902 {
9903 errorcode = ERR60;
9904 ptr += pp;
9905 goto HAD_EARLY_ERROR;
9906 }
9907 if (p->type == PSO_LIMH) limit_heap = c;
9908 else if (p->type == PSO_LIMM) limit_match = c;
9909 else limit_depth = c;
9910 skipatstart += pp - skipatstart;
9911 break;
9912 }
9913 break; /* Out of the table scan loop */
9914 }
9915 }
9916 if (i >= sizeof(pso_list)/sizeof(pso)) break; /* Out of pso loop */
9917 }
9918 }
9919
9920 /* End of pattern-start options; advance to start of real regex. */
9921
9922 ptr += skipatstart;
9923
9924 /* Can't support UTF or UCP if PCRE2 was built without Unicode support. */
9925
9926 #ifndef SUPPORT_UNICODE
9927 if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
9928 {
9929 errorcode = ERR32;
9930 goto HAD_EARLY_ERROR;
9931 }
9932 #endif
9933
9934 /* Check UTF. We have the original options in 'options', with that value as
9935 modified by (*UTF) etc in cb->external_options. The extra option
9936 PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
9937 surrogate code points cannot be represented in UTF-16. */
9938
9939 utf = (cb.external_options & PCRE2_UTF) != 0;
9940 if (utf)
9941 {
9942 if ((options & PCRE2_NEVER_UTF) != 0)
9943 {
9944 errorcode = ERR74;
9945 goto HAD_EARLY_ERROR;
9946 }
9947 if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
9948 (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
9949 goto HAD_ERROR; /* Offset was set by valid_utf() */
9950
9951 #if PCRE2_CODE_UNIT_WIDTH == 16
9952 if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
9953 {
9954 errorcode = ERR91;
9955 goto HAD_EARLY_ERROR;
9956 }
9957 #endif
9958 }
9959
9960 /* Check UCP lockout. */
9961
9962 ucp = (cb.external_options & PCRE2_UCP) != 0;
9963 if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
9964 {
9965 errorcode = ERR75;
9966 goto HAD_EARLY_ERROR;
9967 }
9968
9969 /* Process the BSR setting. */
9970
9971 if (bsr == 0) bsr = ccontext->bsr_convention;
9972
9973 /* Process the newline setting. */
9974
9975 if (newline == 0) newline = ccontext->newline_convention;
9976 cb.nltype = NLTYPE_FIXED;
9977 switch(newline)
9978 {
9979 case PCRE2_NEWLINE_CR:
9980 cb.nllen = 1;
9981 cb.nl[0] = CHAR_CR;
9982 break;
9983
9984 case PCRE2_NEWLINE_LF:
9985 cb.nllen = 1;
9986 cb.nl[0] = CHAR_NL;
9987 break;
9988
9989 case PCRE2_NEWLINE_NUL:
9990 cb.nllen = 1;
9991 cb.nl[0] = CHAR_NUL;
9992 break;
9993
9994 case PCRE2_NEWLINE_CRLF:
9995 cb.nllen = 2;
9996 cb.nl[0] = CHAR_CR;
9997 cb.nl[1] = CHAR_NL;
9998 break;
9999
10000 case PCRE2_NEWLINE_ANY:
10001 cb.nltype = NLTYPE_ANY;
10002 break;
10003
10004 case PCRE2_NEWLINE_ANYCRLF:
10005 cb.nltype = NLTYPE_ANYCRLF;
10006 break;
10007
10008 default:
10009 errorcode = ERR56;
10010 goto HAD_EARLY_ERROR;
10011 }
10012
10013 /* Pre-scan the pattern to do two things: (1) Discover the named groups and
10014 their numerical equivalents, so that this information is always available for
10015 the remaining processing. (2) At the same time, parse the pattern and put a
10016 processed version into the parsed_pattern vector. This has escapes interpreted
10017 and comments removed (amongst other things).
10018
10019 In all but one case, when PCRE2_AUTO_CALLOUT is not set, the number of unsigned
10020 32-bit ints in the parsed pattern is bounded by the length of the pattern plus
10021 one (for the terminator) plus four if PCRE2_EXTRA_WORD or PCRE2_EXTRA_LINE is
10022 set. The exceptional case is when running in 32-bit, non-UTF mode, when literal
10023 characters greater than META_END (0x80000000) have to be coded as two units. In
10024 this case, therefore, we scan the pattern to check for such values. */
10025
10026 #if PCRE2_CODE_UNIT_WIDTH == 32
10027 if (!utf)
10028 {
10029 PCRE2_SPTR p;
10030 for (p = ptr; p < cb.end_pattern; p++) if (*p >= META_END) big32count++;
10031 }
10032 #endif
10033
10034 /* Ensure that the parsed pattern buffer is big enough. When PCRE2_AUTO_CALLOUT
10035 is set we have to assume a numerical callout (4 elements) for each character
10036 plus one at the end. This is overkill, but memory is plentiful these days. For
10037 many smaller patterns the vector on the stack (which was set up above) can be
10038 used. */
10039
10040 parsed_size_needed = patlen - skipatstart + big32count;
10041
10042 if ((ccontext->extra_options &
10043 (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0)
10044 parsed_size_needed += 4;
10045
10046 if ((options & PCRE2_AUTO_CALLOUT) != 0)
10047 parsed_size_needed = (parsed_size_needed + 1) * 5;
10048
10049 if (parsed_size_needed >= PARSED_PATTERN_DEFAULT_SIZE)
10050 {
10051 uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(
10052 (parsed_size_needed + 1) * sizeof(uint32_t), ccontext->memctl.memory_data);
10053 if (heap_parsed_pattern == NULL)
10054 {
10055 *errorptr = ERR21;
10056 goto EXIT;
10057 }
10058 cb.parsed_pattern = heap_parsed_pattern;
10059 }
10060 cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed + 1;
10061
10062 /* Do the parsing scan. */
10063
10064 errorcode = parse_regex(ptr, cb.external_options, &has_lookbehind, &cb);
10065 if (errorcode != 0) goto HAD_CB_ERROR;
10066
10067 /* Workspace is needed to remember information about numbered groups: whether a
10068 group can match an empty string and what its fixed length is. This is done to
10069 avoid the possibility of recursive references causing very long compile times
10070 when checking these features. Unnumbered groups do not have this exposure since
10071 they cannot be referenced. We use an indexed vector for this purpose. If there
10072 are sufficiently few groups, the default vector on the stack, as set up above,
10073 can be used. Otherwise we have to get/free a special vector. The vector must be
10074 initialized to zero. */
10075
10076 if (cb.bracount >= GROUPINFO_DEFAULT_SIZE)
10077 {
10078 cb.groupinfo = ccontext->memctl.malloc(
10079 (cb.bracount + 1)*sizeof(uint32_t), ccontext->memctl.memory_data);
10080 if (cb.groupinfo == NULL)
10081 {
10082 errorcode = ERR21;
10083 cb.erroroffset = 0;
10084 goto HAD_CB_ERROR;
10085 }
10086 }
10087 memset(cb.groupinfo, 0, (cb.bracount + 1) * sizeof(uint32_t));
10088
10089 /* If there were any lookbehinds, scan the parsed pattern to figure out their
10090 lengths. */
10091
10092 if (has_lookbehind)
10093 {
10094 int loopcount = 0;
10095 errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb, &loopcount);
10096 if (errorcode != 0) goto HAD_CB_ERROR;
10097 }
10098
10099 /* For debugging, there is a function that shows the parsed data vector. */
10100
10101 #ifdef DEBUG_SHOW_PARSED
10102 fprintf(stderr, "+++ Pre-scan complete:\n");
10103 show_parsed(&cb);
10104 #endif
10105
10106 /* For debugging capturing information this code can be enabled. */
10107
10108 #ifdef DEBUG_SHOW_CAPTURES
10109 {
10110 named_group *ng = cb.named_groups;
10111 fprintf(stderr, "+++Captures: %d\n", cb.bracount);
10112 for (i = 0; i < cb.names_found; i++, ng++)
10113 {
10114 fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
10115 }
10116 }
10117 #endif
10118
10119 /* Pretend to compile the pattern while actually just accumulating the amount
10120 of memory required in the 'length' variable. This behaviour is triggered by
10121 passing a non-NULL final argument to compile_regex(). We pass a block of
10122 workspace (cworkspace) for it to compile parts of the pattern into; the
10123 compiled code is discarded when it is no longer needed, so hopefully this
10124 workspace will never overflow, though there is a test for its doing so.
10125
10126 On error, errorcode will be set non-zero, so we don't need to look at the
10127 result of the function. The initial options have been put into the cb block,
10128 but we still have to pass a separate options variable (the first argument)
10129 because the options may change as the pattern is processed. */
10130
10131 cb.erroroffset = patlen; /* For any subsequent errors that do not set it */
10132 pptr = cb.parsed_pattern;
10133 code = cworkspace;
10134 *code = OP_BRA;
10135
10136 (void)compile_regex(cb.external_options, &code, &pptr, &errorcode, 0, &firstcu,
10137 &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, &length);
10138
10139 if (errorcode != 0) goto HAD_CB_ERROR; /* Offset is in cb.erroroffset */
10140
10141 /* This should be caught in compile_regex(), but just in case... */
10142
10143 if (length > MAX_PATTERN_SIZE)
10144 {
10145 errorcode = ERR20;
10146 goto HAD_CB_ERROR;
10147 }
10148
10149 /* Compute the size of, and then get and initialize, the data block for storing
10150 the compiled pattern and names table. Integer overflow should no longer be
10151 possible because nowadays we limit the maximum value of cb.names_found and
10152 cb.name_entry_size. */
10153
10154 re_blocksize = sizeof(pcre2_real_code) +
10155 CU2BYTES(length +
10156 (PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
10157 re = (pcre2_real_code *)
10158 ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
10159 if (re == NULL)
10160 {
10161 errorcode = ERR21;
10162 goto HAD_CB_ERROR;
10163 }
10164
10165 /* The compiler may put padding at the end of the pcre2_real_code structure in
10166 order to round it up to a multiple of 4 or 8 bytes. This means that when a
10167 compiled pattern is copied (for example, when serialized) undefined bytes are
10168 read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
10169 write to the last 8 bytes of the structure before setting the fields. */
10170
10171 memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
10172 re->memctl = ccontext->memctl;
10173 re->tables = tables;
10174 re->executable_jit = NULL;
10175 memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
10176 re->blocksize = re_blocksize;
10177 re->magic_number = MAGIC_NUMBER;
10178 re->compile_options = options;
10179 re->overall_options = cb.external_options;
10180 re->extra_options = ccontext->extra_options;
10181 re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
10182 re->limit_heap = limit_heap;
10183 re->limit_match = limit_match;
10184 re->limit_depth = limit_depth;
10185 re->first_codeunit = 0;
10186 re->last_codeunit = 0;
10187 re->bsr_convention = bsr;
10188 re->newline_convention = newline;
10189 re->max_lookbehind = 0;
10190 re->minlength = 0;
10191 re->top_bracket = 0;
10192 re->top_backref = 0;
10193 re->name_entry_size = cb.name_entry_size;
10194 re->name_count = cb.names_found;
10195
10196 /* The basic block is immediately followed by the name table, and the compiled
10197 code follows after that. */
10198
10199 codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) +
10200 re->name_entry_size * re->name_count;
10201
10202 /* Update the compile data block for the actual compile. The starting points of
10203 the name/number translation table and of the code are passed around in the
10204 compile data block. The start/end pattern and initial options are already set
10205 from the pre-compile phase, as is the name_entry_size field. */
10206
10207 cb.parens_depth = 0;
10208 cb.assert_depth = 0;
10209 cb.lastcapture = 0;
10210 cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
10211 cb.start_code = codestart;
10212 cb.req_varyopt = 0;
10213 cb.had_accept = FALSE;
10214 cb.had_pruneorskip = FALSE;
10215 cb.open_caps = NULL;
10216
10217 /* If any named groups were found, create the name/number table from the list
10218 created in the pre-pass. */
10219
10220 if (cb.names_found > 0)
10221 {
10222 named_group *ng = cb.named_groups;
10223 for (i = 0; i < cb.names_found; i++, ng++)
10224 add_name_to_table(&cb, ng->name, ng->length, ng->number, i);
10225 }
10226
10227 /* Set up a starting, non-extracting bracket, then compile the expression. On
10228 error, errorcode will be set non-zero, so we don't need to look at the result
10229 of the function here. */
10230
10231 pptr = cb.parsed_pattern;
10232 code = (PCRE2_UCHAR *)codestart;
10233 *code = OP_BRA;
10234 regexrc = compile_regex(re->overall_options, &code, &pptr, &errorcode, 0,
10235 &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, NULL);
10236 if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
10237 re->top_bracket = cb.bracount;
10238 re->top_backref = cb.top_backref;
10239 re->max_lookbehind = cb.max_lookbehind;
10240
10241 if (cb.had_accept)
10242 {
10243 reqcu = 0; /* Must disable after (*ACCEPT) */
10244 reqcuflags = REQ_NONE;
10245 re->flags |= PCRE2_HASACCEPT; /* Disables minimum length */
10246 }
10247
10248 /* Fill in the final opcode and check for disastrous overflow. If no overflow,
10249 but the estimated length exceeds the really used length, adjust the value of
10250 re->blocksize, and if valgrind support is configured, mark the extra allocated
10251 memory as unaddressable, so that any out-of-bound reads can be detected. */
10252
10253 *code++ = OP_END;
10254 usedlength = code - codestart;
10255 if (usedlength > length) errorcode = ERR23; else
10256 {
10257 re->blocksize -= CU2BYTES(length - usedlength);
10258 #ifdef SUPPORT_VALGRIND
10259 VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
10260 #endif
10261 }
10262
10263 /* Scan the pattern for recursion/subroutine calls and convert the group
10264 numbers into offsets. Maintain a small cache so that repeated groups containing
10265 recursions are efficiently handled. */
10266
10267 #define RSCAN_CACHE_SIZE 8
10268
10269 if (errorcode == 0 && cb.had_recurse)
10270 {
10271 PCRE2_UCHAR *rcode;
10272 PCRE2_SPTR rgroup;
10273 unsigned int ccount = 0;
10274 int start = RSCAN_CACHE_SIZE;
10275 recurse_cache rc[RSCAN_CACHE_SIZE];
10276
10277 for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf);
10278 rcode != NULL;
10279 rcode = (PCRE2_UCHAR *)find_recurse(rcode + 1 + LINK_SIZE, utf))
10280 {
10281 int p, groupnumber;
10282
10283 groupnumber = (int)GET(rcode, 1);
10284 if (groupnumber == 0) rgroup = codestart; else
10285 {
10286 PCRE2_SPTR search_from = codestart;
10287 rgroup = NULL;
10288 for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
10289 {
10290 if (groupnumber == rc[p].groupnumber)
10291 {
10292 rgroup = rc[p].group;
10293 break;
10294 }
10295
10296 /* Group n+1 must always start to the right of group n, so we can save
10297 search time below when the new group number is greater than any of the
10298 previously found groups. */
10299
10300 if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;
10301 }
10302
10303 if (rgroup == NULL)
10304 {
10305 rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);
10306 if (rgroup == NULL)
10307 {
10308 errorcode = ERR53;
10309 break;
10310 }
10311 if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
10312 rc[start].groupnumber = groupnumber;
10313 rc[start].group = rgroup;
10314 if (ccount < RSCAN_CACHE_SIZE) ccount++;
10315 }
10316 }
10317
10318 PUT(rcode, 1, rgroup - codestart);
10319 }
10320 }
10321
10322 /* In rare debugging situations we sometimes need to look at the compiled code
10323 at this stage. */
10324
10325 #ifdef DEBUG_CALL_PRINTINT
10326 pcre2_printint(re, stderr, TRUE);
10327 fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
10328 #endif
10329
10330 /* Unless disabled, check whether any single character iterators can be
10331 auto-possessified. The function overwrites the appropriate opcode values, so
10332 the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
10333 used in this code because at least one compiler gives a warning about loss of
10334 "const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the
10335 function call. */
10336
10337 if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
10338 {
10339 PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
10340 if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80;
10341 }
10342
10343 /* Failed to compile, or error while post-processing. */
10344
10345 if (errorcode != 0) goto HAD_CB_ERROR;
10346
10347 /* Successful compile. If the anchored option was not passed, set it if
10348 we can determine that the pattern is anchored by virtue of ^ characters or \A
10349 or anything else, such as starting with non-atomic .* when DOTALL is set and
10350 there are no occurrences of *PRUNE or *SKIP (though there is an option to
10351 disable this case). */
10352
10353 if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10354 is_anchored(codestart, 0, &cb, 0, FALSE))
10355 re->overall_options |= PCRE2_ANCHORED;
10356
10357 /* Set up the first code unit or startline flag, the required code unit, and
10358 then study the pattern. This code need not be obeyed if PCRE2_NO_START_OPTIMIZE
10359 is set, as the data it would create will not be used. Note that a first code
10360 unit (but not the startline flag) is useful for anchored patterns because it
10361 can still give a quick "no match" and also avoid searching for a last code
10362 unit. */
10363
10364 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
10365 {
10366 int minminlength = 0; /* For minimal minlength from first/required CU */
10367
10368 /* If we do not have a first code unit, see if there is one that is asserted
10369 (these are not saved during the compile because they can cause conflicts with
10370 actual literals that follow). */
10371
10372 if (firstcuflags < 0)
10373 firstcu = find_firstassertedcu(codestart, &firstcuflags, 0);
10374
10375 /* Save the data for a first code unit. The existence of one means the
10376 minimum length must be at least 1. */
10377
10378 if (firstcuflags >= 0)
10379 {
10380 re->first_codeunit = firstcu;
10381 re->flags |= PCRE2_FIRSTSET;
10382 minminlength++;
10383
10384 /* Handle caseless first code units. */
10385
10386 if ((firstcuflags & REQ_CASELESS) != 0)
10387 {
10388 if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
10389 {
10390 if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
10391 }
10392
10393 /* The first code unit is > 128 in UTF or UCP mode, or > 255 otherwise.
10394 In 8-bit UTF mode, codepoints in the range 128-255 are introductory code
10395 points and cannot have another case, but if UCP is set they may do. */
10396
10397 #ifdef SUPPORT_UNICODE
10398 #if PCRE2_CODE_UNIT_WIDTH == 8
10399 else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
10400 re->flags |= PCRE2_FIRSTCASELESS;
10401 #else
10402 else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
10403 UCD_OTHERCASE(firstcu) != firstcu)
10404 re->flags |= PCRE2_FIRSTCASELESS;
10405 #endif
10406 #endif /* SUPPORT_UNICODE */
10407 }
10408 }
10409
10410 /* When there is no first code unit, for non-anchored patterns, see if we can
10411 set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
10412 branches start with ^ and also when all branches start with non-atomic .* for
10413 non-DOTALL matches when *PRUNE and SKIP are not present. (There is an option
10414 that disables this case.) */
10415
10416 else if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10417 is_startline(codestart, 0, &cb, 0, FALSE))
10418 re->flags |= PCRE2_STARTLINE;
10419
10420 /* Handle the "required code unit", if one is set. In the UTF case we can
10421 increment the minimum minimum length only if we are sure this really is a
10422 different character and not a non-starting code unit of the first character,
10423 because the minimum length count is in characters, not code units. */
10424
10425 if (reqcuflags >= 0)
10426 {
10427 #if PCRE2_CODE_UNIT_WIDTH == 16
10428 if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */
10429 firstcuflags < 0 || /* First not set */
10430 (firstcu & 0xf800) != 0xd800 || /* First not surrogate */
10431 (reqcu & 0xfc00) != 0xdc00) /* Req not low surrogate */
10432 #elif PCRE2_CODE_UNIT_WIDTH == 8
10433 if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */
10434 firstcuflags < 0 || /* First not set */
10435 (firstcu & 0x80) == 0 || /* First is ASCII */
10436 (reqcu & 0x80) == 0) /* Req is ASCII */
10437 #endif
10438 {
10439 minminlength++;
10440 }
10441
10442 /* In the case of an anchored pattern, set up the value only if it follows
10443 a variable length item in the pattern. */
10444
10445 if ((re->overall_options & PCRE2_ANCHORED) == 0 ||
10446 (reqcuflags & REQ_VARY) != 0)
10447 {
10448 re->last_codeunit = reqcu;
10449 re->flags |= PCRE2_LASTSET;
10450
10451 /* Handle caseless required code units as for first code units (above). */
10452
10453 if ((reqcuflags & REQ_CASELESS) != 0)
10454 {
10455 if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
10456 {
10457 if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
10458 }
10459 #ifdef SUPPORT_UNICODE
10460 #if PCRE2_CODE_UNIT_WIDTH == 8
10461 else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
10462 re->flags |= PCRE2_LASTCASELESS;
10463 #else
10464 else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
10465 UCD_OTHERCASE(reqcu) != reqcu)
10466 re->flags |= PCRE2_LASTCASELESS;
10467 #endif
10468 #endif /* SUPPORT_UNICODE */
10469 }
10470 }
10471 }
10472
10473 /* Study the compiled pattern to set up information such as a bitmap of
10474 starting code units and a minimum matching length. */
10475
10476 if (PRIV(study)(re) != 0)
10477 {
10478 errorcode = ERR31;
10479 goto HAD_CB_ERROR;
10480 }
10481
10482 /* If study() set a bitmap of starting code units, it implies a minimum
10483 length of at least one. */
10484
10485 if ((re->flags & PCRE2_FIRSTMAPSET) != 0 && minminlength == 0)
10486 minminlength = 1;
10487
10488 /* If the minimum length set (or not set) by study() is less than the minimum
10489 implied by required code units, override it. */
10490
10491 if (re->minlength < minminlength) re->minlength = minminlength;
10492 } /* End of start-of-match optimizations. */
10493
10494 /* Control ends up here in all cases. When running under valgrind, make a
10495 pattern's terminating zero defined again. If memory was obtained for the parsed
10496 version of the pattern, free it before returning. Also free the list of named
10497 groups if a larger one had to be obtained, and likewise the group information
10498 vector. */
10499
10500 EXIT:
10501 #ifdef SUPPORT_VALGRIND
10502 if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1));
10503 #endif
10504 if (cb.parsed_pattern != stack_parsed_pattern)
10505 ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);
10506 if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
10507 ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
10508 if (cb.groupinfo != stack_groupinfo)
10509 ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
10510 return re; /* Will be NULL after an error */
10511
10512 /* Errors discovered in parse_regex() set the offset value in the compile
10513 block. Errors discovered before it is called must compute it from the ptr
10514 value. After parse_regex() is called, the offset in the compile block is set to
10515 the end of the pattern, but certain errors in compile_regex() may reset it if
10516 an offset is available in the parsed pattern. */
10517
10518 HAD_CB_ERROR:
10519 ptr = pattern + cb.erroroffset;
10520
10521 HAD_EARLY_ERROR:
10522 *erroroffset = ptr - pattern;
10523
10524 HAD_ERROR:
10525 *errorptr = errorcode;
10526 pcre2_code_free(re);
10527 re = NULL;
10528 goto EXIT;
10529 }
10530
10531 /* End of pcre2_compile.c */
10532