1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016-2020 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45
/* These three names are set up ahead of the pcre2_internal.h include that
follows; presumably macros in that header refer to them - confirm there. */

#define NLBLOCK   cb              /* Block containing newline information */
#define PSSTART   start_pattern   /* Field containing processed string start */
#define PSEND     end_pattern     /* Field containing processed string end */
49
50 #include "pcre2_internal.h"
51
52 /* In rare error cases debugging might require calling pcre2_printint(). */
53
#if 0   /* Change to 1 to pull in pcre2_printint.c for debugging */

/* PRINTABLE(c) is true for code values in the range treated as printable;
the bounds differ between EBCDIC and non-EBCDIC builds. Note that the argument
is evaluated twice. */

#ifndef EBCDIC
#define PRINTABLE(c) ((c) >= 32 && (c) < 127)
#else
#define PRINTABLE(c) ((c) >= 64 && (c) < 255)
#endif

#include "pcre2_printint.c"
#define DEBUG_CALL_PRINTINT

#endif  /* disabled debugging support */
63
64 /* Other debugging code can be enabled by these defines. */
65
66 /* #define DEBUG_SHOW_CAPTURES */
67 /* #define DEBUG_SHOW_PARSED */
68
69 /* There are a few things that vary with different code unit sizes. Handle them
70 by defining macros in order to minimize #if usage. */
71
/* XDIGIT(c) yields the xdigitab entry for a code unit. In the 8-bit library
every code unit is a valid index; in the wider libraries anything for which
MAX_255() is false yields 0xff without touching the table (and the argument is
evaluated twice). STRING_UTFn_RIGHTPAR expands to two comma-separated items:
the width-specific string and its length. */

#if PCRE2_CODE_UNIT_WIDTH == 8
#define XDIGIT(c)             xdigitab[c]
#define STRING_UTFn_RIGHTPAR  STRING_UTF8_RIGHTPAR, 5

#elif PCRE2_CODE_UNIT_WIDTH == 16
#define XDIGIT(c)             (MAX_255(c)? xdigitab[c] : 0xff)
#define STRING_UTFn_RIGHTPAR  STRING_UTF16_RIGHTPAR, 6

#else  /* 32-bit */
#define XDIGIT(c)             (MAX_255(c)? xdigitab[c] : 0xff)
#define STRING_UTFn_RIGHTPAR  STRING_UTF32_RIGHTPAR, 6
#endif
86
87 /* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which
88 consists of uint32_t elements. Assume that if uint32_t can't hold it, two of
89 them will be able to (i.e. assume a 64-bit world). */
90
/* In every variant, s is a PCRE2_SIZE lvalue (or, for PUTOFFSET, a value) and
p is a modifiable pointer into the uint32_t parsed pattern vector:

  PUTOFFSET(s,p)      store s at p, advancing p past it
  GETOFFSET(s,p)      load s from p, advancing p past it
  GETPLUSOFFSET(s,p)  load s from the element(s) after p[0], leaving p on the
                        last element that was read
  READPLUSOFFSET(s,p) as GETPLUSOFFSET, but p is not changed
  SKIPOFFSET(p)       advance p over one stored offset
  SIZEOFFSET          number of uint32_t elements one offset occupies

All arguments are parenthesized in the expansions so that expressions such as
"a|b" or "*pp" may be passed safely, and the multi-statement forms are wrapped
in do { } while (0) so that they behave as a single statement (for example
before an "else"). Every use must be followed by a semicolon. Arguments may be
evaluated more than once, so they must not have side effects. */

#if PCRE2_SIZE_MAX <= UINT32_MAX
#define PUTOFFSET(s,p)      (*(p)++ = (s))
#define GETOFFSET(s,p)      ((s) = *(p)++)
#define GETPLUSOFFSET(s,p)  ((s) = *(++(p)))
#define READPLUSOFFSET(s,p) ((s) = (p)[1])
#define SKIPOFFSET(p)       ((p)++)
#define SIZEOFFSET 1
#else
#define PUTOFFSET(s,p) \
  do { \
  *(p)++ = (uint32_t)((s) >> 32); \
  *(p)++ = (uint32_t)((s) & 0xffffffff); \
  } while (0)
#define GETOFFSET(s,p) \
  do { \
  (s) = ((PCRE2_SIZE)(p)[0] << 32) | (PCRE2_SIZE)(p)[1]; \
  (p) += 2; \
  } while (0)
#define GETPLUSOFFSET(s,p) \
  do { \
  (s) = ((PCRE2_SIZE)(p)[1] << 32) | (PCRE2_SIZE)(p)[2]; \
  (p) += 2; \
  } while (0)
#define READPLUSOFFSET(s,p) \
  do { (s) = ((PCRE2_SIZE)(p)[1] << 32) | (PCRE2_SIZE)(p)[2]; } while (0)
#define SKIPOFFSET(p)       ((p) += 2)
#define SIZEOFFSET 2
#endif
110
111 /* Macros for manipulating elements of the parsed pattern vector. */
112
/* META_CODE(x) keeps the top 16 bits of an element (the item code), META_DATA(x)
keeps the bottom 16 bits (the item's data), and META_DIFF(x,y) is the difference
between two elements shifted down by 16 bits, that is, the distance between two
codes counted in code steps. The arguments are parenthesized so that compound
expressions can be passed. */

#define META_CODE(x)   ((x) & 0xffff0000u)
#define META_DATA(x)   ((x) & 0x0000ffffu)
#define META_DIFF(x,y) (((x)-(y))>>16)
116
/* Forward declarations of functions, to allow mutual recursion */
118
119 #ifdef SUPPORT_UNICODE
120 static unsigned int
121 add_list_to_class_internal(uint8_t *, PCRE2_UCHAR **, uint32_t,
122 compile_block *, const uint32_t *, unsigned int);
123 #endif
124
125 static int
126 compile_regex(uint32_t, PCRE2_UCHAR **, uint32_t **, int *, uint32_t,
127 uint32_t *, int32_t *, uint32_t *, int32_t *, branch_chain *,
128 compile_block *, PCRE2_SIZE *);
129
130 static int
131 get_branchlength(uint32_t **, int *, int *, parsed_recurse_check *,
132 compile_block *);
133
134 static BOOL
135 set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,
136 compile_block *);
137
138 static int
139 check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *,
140 compile_block *);
141
142
143 /*************************************************
144 * Code parameters and static tables *
145 *************************************************/
146
/* Largest group number and largest explicit repeat count. REPEAT_UNLIMITED is
one more than the largest count, so it can never clash with a real count. */

#define MAX_GROUP_NUMBER   65535u
#define MAX_REPEAT_COUNT   65535u
#define REPEAT_UNLIMITED   (MAX_REPEAT_COUNT + 1)
150
151 /* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in
152 different ways in the different pattern scans. The parsing and group-
153 identifying pre-scan uses it to handle nesting, and needs it to be 16-bit
154 aligned for this. Having defined the size in code units, we set up
155 C16_WORK_SIZE as the number of elements in the 16-bit vector.
156
157 During the first compiling phase, when determining how much memory is required,
158 the regex is partly compiled into this space, but the compiled parts are
159 discarded as soon as they can be, so that hopefully there will never be an
160 overrun. The code does, however, check for an overrun, which can occur for
161 pathological patterns. The size of the workspace depends on LINK_SIZE because
162 the length of compiled items varies with this.
163
164 In the real compile phase, this workspace is not currently used. */
165
/* COMPILE_WORK_SIZE is a count of code units and scales with LINK_SIZE.
C16_WORK_SIZE is the same amount of memory expressed as a count of uint16_t
elements. */

#define COMPILE_WORK_SIZE  (3000*LINK_SIZE)

#define C16_WORK_SIZE  ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t))
170
171 /* A uint32_t vector is used for caching information about the size of
172 capturing groups, to improve performance. A default is created on the stack of
173 this size. */
174
#define GROUPINFO_DEFAULT_SIZE  256

/* Overrun tests compare against a size reduced by this margin, so that an
overrun is noticed before anything is written beyond the end of the data
block. */

#define WORK_SIZE_SAFETY_MARGIN  (100)

/* Number of slots in the initial list that remembers named groups during the
pre-compile. The list starts on the stack and, like the workspace, is expanded
if it proves too small. */

#define NAMED_GROUP_LIST_SIZE  20

/* The pre-compiling pass turns the pattern into a parsed pattern held in a
vector of uint32_t. A vector of this size on the stack is used for short
patterns; longer ones use heap memory. */

#define PARSED_PATTERN_DEFAULT_SIZE  1024

/* Upper limit used when checking that the variable holding the compiled
pattern length does not overflow. It is kept a little below INT_MAX so that
group terminating code units can be added without checking every time. */

#define OFLOW_MAX  (INT_MAX - 20)
201
202 /* Code values for parsed patterns, which are stored in a vector of 32-bit
203 unsigned ints. Values less than META_END are literal data values. The coding
204 for identifying the item is in the top 16-bits, leaving 16 bits for the
205 additional data that some of them need. The META_CODE, META_DATA, and META_DIFF
206 macros are used to manipulate parsed pattern elements.
207
208 NOTE: When these definitions are changed, the table of extra lengths for each
209 code (meta_extra_lengths, just below) must be updated to remain in step. */
210
#define META_END              0x80000000u  /* End of pattern */

/* General items, in alphabetical order. Each code is 0x00010000 more than the
one before it. */

#define META_ALT              0x80010000u  /* alternation */
#define META_ATOMIC           0x80020000u  /* atomic group */
#define META_BACKREF          0x80030000u  /* Back ref */
#define META_BACKREF_BYNAME   0x80040000u  /* \k'name' */
#define META_BIGVALUE         0x80050000u  /* Next is a literal > META_END */
#define META_CALLOUT_NUMBER   0x80060000u  /* (?C with numerical argument */
#define META_CALLOUT_STRING   0x80070000u  /* (?C with string argument */
#define META_CAPTURE          0x80080000u  /* Capturing parenthesis */
#define META_CIRCUMFLEX       0x80090000u  /* ^ metacharacter */
#define META_CLASS            0x800a0000u  /* start non-empty class */
#define META_CLASS_EMPTY      0x800b0000u  /* empty class */
#define META_CLASS_EMPTY_NOT  0x800c0000u  /* negative empty class */
#define META_CLASS_END        0x800d0000u  /* end of non-empty class */
#define META_CLASS_NOT        0x800e0000u  /* start non-empty negative class */
#define META_COND_ASSERT      0x800f0000u  /* (?(?assertion)... */
#define META_COND_DEFINE      0x80100000u  /* (?(DEFINE)... */
#define META_COND_NAME        0x80110000u  /* (?(<name>)... */
#define META_COND_NUMBER      0x80120000u  /* (?(digits)... */
#define META_COND_RNAME       0x80130000u  /* (?(R&name)... */
#define META_COND_RNUMBER     0x80140000u  /* (?(Rdigits)... */
#define META_COND_VERSION     0x80150000u  /* (?(VERSION<op>x.y)... */
#define META_DOLLAR           0x80160000u  /* $ metacharacter */
#define META_DOT              0x80170000u  /* . metacharacter */
#define META_ESCAPE           0x80180000u  /* \d and friends */
#define META_KET              0x80190000u  /* closing parenthesis */
#define META_NOCAPTURE        0x801a0000u  /* no capture parens */
#define META_OPTIONS          0x801b0000u  /* (?i) and friends */
#define META_POSIX            0x801c0000u  /* POSIX class item */
#define META_POSIX_NEG        0x801d0000u  /* negative POSIX class item */
#define META_RANGE_ESCAPED    0x801e0000u  /* range with at least one escape */
#define META_RANGE_LITERAL    0x801f0000u  /* range defined literally */
#define META_RECURSE          0x80200000u  /* Recursion */
#define META_RECURSE_BYNAME   0x80210000u  /* (?&name) */
#define META_SCRIPT_RUN       0x80220000u  /* (*script_run:...) */

/* The four assertions that may be used as conditions. They must stay together
so that it is easy to check that an assertion is present where one is expected
in a conditional group. */

#define META_LOOKAHEAD        0x80230000u  /* (?= */
#define META_LOOKAHEADNOT     0x80240000u  /* (?! */
#define META_LOOKBEHIND       0x80250000u  /* (?<= */
#define META_LOOKBEHINDNOT    0x80260000u  /* (?<! */

/* Non-atomic assertions, which cannot be conditions */

#define META_LOOKAHEAD_NA     0x80270000u  /* (*napla: */
#define META_LOOKBEHIND_NA    0x80280000u  /* (*naplb: */

/* Backtracking verbs. The order and the consecutive values must be kept, and
each _ARG code for COMMIT, PRUNE, SKIP, and THEN must immediately follow the
code for the version without an argument. */

#define META_MARK             0x80290000u  /* (*MARK) */
#define META_ACCEPT           0x802a0000u  /* (*ACCEPT) */
#define META_FAIL             0x802b0000u  /* (*FAIL) */
#define META_COMMIT           0x802c0000u  /* (*COMMIT) */
#define META_COMMIT_ARG       0x802d0000u  /*   ... with an argument */
#define META_PRUNE            0x802e0000u  /* (*PRUNE) */
#define META_PRUNE_ARG        0x802f0000u  /*   ... with an argument */
#define META_SKIP             0x80300000u  /* (*SKIP) */
#define META_SKIP_ARG         0x80310000u  /*   ... with an argument */
#define META_THEN             0x80320000u  /* (*THEN) */
#define META_THEN_ARG         0x80330000u  /*   ... with an argument */

/* Quantifiers. These must be kept all together, in groups of three adjacent
values: the plain form, then the "+" form, then the "?" form. */

#define META_ASTERISK         0x80340000u  /* *  */
#define META_ASTERISK_PLUS    0x80350000u  /* *+ */
#define META_ASTERISK_QUERY   0x80360000u  /* *? */
#define META_PLUS             0x80370000u  /* +  */
#define META_PLUS_PLUS        0x80380000u  /* ++ */
#define META_PLUS_QUERY       0x80390000u  /* +? */
#define META_QUERY            0x803a0000u  /* ?  */
#define META_QUERY_PLUS       0x803b0000u  /* ?+ */
#define META_QUERY_QUERY      0x803c0000u  /* ?? */
#define META_MINMAX           0x803d0000u  /* {n,m}  repeat */
#define META_MINMAX_PLUS      0x803e0000u  /* {n,m}+ repeat */
#define META_MINMAX_QUERY     0x803f0000u  /* {n,m}? repeat */

#define META_FIRST_QUANTIFIER META_ASTERISK
#define META_LAST_QUANTIFIER  META_MINMAX_QUERY

/* This is a special "meta code" that is used only to distinguish (*asr: from
(*sr: in the table of alphabetic assertions. It is never stored in the parsed
pattern, because (*asr: is turned into (*sr:(*atomic: at that stage. It
therefore needs no entry in the table of extra lengths, which is why a high
value is used. */

#define META_ATOMIC_SCRIPT_RUN 0x8fff0000u
301
302 /* Table of extra lengths for each of the meta codes. Must be kept in step with
303 the definitions above. For some items these values are a basic length to which
304 a variable amount has to be added. */
305
306 static unsigned char meta_extra_lengths[] = {
307 0, /* META_END */
308 0, /* META_ALT */
309 0, /* META_ATOMIC */
310 0, /* META_BACKREF - more if group is >= 10 */
311 1+SIZEOFFSET, /* META_BACKREF_BYNAME */
312 1, /* META_BIGVALUE */
313 3, /* META_CALLOUT_NUMBER */
314 3+SIZEOFFSET, /* META_CALLOUT_STRING */
315 0, /* META_CAPTURE */
316 0, /* META_CIRCUMFLEX */
317 0, /* META_CLASS */
318 0, /* META_CLASS_EMPTY */
319 0, /* META_CLASS_EMPTY_NOT */
320 0, /* META_CLASS_END */
321 0, /* META_CLASS_NOT */
322 0, /* META_COND_ASSERT */
323 SIZEOFFSET, /* META_COND_DEFINE */
324 1+SIZEOFFSET, /* META_COND_NAME */
325 1+SIZEOFFSET, /* META_COND_NUMBER */
326 1+SIZEOFFSET, /* META_COND_RNAME */
327 1+SIZEOFFSET, /* META_COND_RNUMBER */
328 3, /* META_COND_VERSION */
329 0, /* META_DOLLAR */
330 0, /* META_DOT */
331 0, /* META_ESCAPE - more for ESC_P, ESC_p, ESC_g, ESC_k */
332 0, /* META_KET */
333 0, /* META_NOCAPTURE */
334 1, /* META_OPTIONS */
335 1, /* META_POSIX */
336 1, /* META_POSIX_NEG */
337 0, /* META_RANGE_ESCAPED */
338 0, /* META_RANGE_LITERAL */
339 SIZEOFFSET, /* META_RECURSE */
340 1+SIZEOFFSET, /* META_RECURSE_BYNAME */
341 0, /* META_SCRIPT_RUN */
342 0, /* META_LOOKAHEAD */
343 0, /* META_LOOKAHEADNOT */
344 SIZEOFFSET, /* META_LOOKBEHIND */
345 SIZEOFFSET, /* META_LOOKBEHINDNOT */
346 0, /* META_LOOKAHEAD_NA */
347 SIZEOFFSET, /* META_LOOKBEHIND_NA */
348 1, /* META_MARK - plus the string length */
349 0, /* META_ACCEPT */
350 0, /* META_FAIL */
351 0, /* META_COMMIT */
352 1, /* META_COMMIT_ARG - plus the string length */
353 0, /* META_PRUNE */
354 1, /* META_PRUNE_ARG - plus the string length */
355 0, /* META_SKIP */
356 1, /* META_SKIP_ARG - plus the string length */
357 0, /* META_THEN */
358 1, /* META_THEN_ARG - plus the string length */
359 0, /* META_ASTERISK */
360 0, /* META_ASTERISK_PLUS */
361 0, /* META_ASTERISK_QUERY */
362 0, /* META_PLUS */
363 0, /* META_PLUS_PLUS */
364 0, /* META_PLUS_QUERY */
365 0, /* META_QUERY */
366 0, /* META_QUERY_PLUS */
367 0, /* META_QUERY_QUERY */
368 2, /* META_MINMAX */
369 2, /* META_MINMAX_PLUS */
370 2 /* META_MINMAX_QUERY */
371 };
372
373 /* Types for skipping parts of a parsed pattern. */
374
enum {
  PSKIP_ALT,     /* value 0 */
  PSKIP_CLASS,   /* value 1 */
  PSKIP_KET      /* value 2 */
};
376
377 /* Macro for setting individual bits in class bitmaps. It took some
378 experimenting to figure out how to stop gcc 5.3.0 from warning with
379 -Wconversion. This version gets a warning:
380
381 #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1u << ((b)&7))
382
383 Let's hope the apparently less efficient version isn't actually so bad if the
384 compiler is clever with identical subexpressions. */
385
/* SETBIT(a,b) sets bit number b in the byte map a: bit (b & 7) of byte a[b/8].
Both arguments are parenthesized, so a may be a pointer expression such as
"map + n". Each argument is evaluated more than once, so neither may have side
effects. */

#define SETBIT(a,b) ((a)[(b)/8] = (uint8_t)((a)[(b)/8] | (1u << ((b)&7))))
387
388 /* Private flags added to firstcu and reqcu. */
389
/* Bit flags */
#define REQ_CASELESS  0x1u   /* Indicates caselessness */
#define REQ_VARY      0x2u   /* reqcu followed non-literal item */

/* Negative values for the firstcu and reqcu flags */
#define REQ_UNSET     (-2)   /* Not yet found anything */
#define REQ_NONE      (-1)   /* Found not fixed char */
395
396 /* These flags are used in the groupinfo vector. */
397
#define GI_SET_FIXED_LENGTH   0x80000000u   /* top bit */
#define GI_NOT_FIXED_LENGTH   0x40000000u   /* next bit down */
#define GI_FIXED_LENGTH_MASK  0x0000ffffu   /* low 16 bits */
401
402 /* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
403 and is fast (a good compiler can turn it into a subtraction and unsigned
404 comparison). */
405
/* Yields 1 when x lies between CHAR_0 and CHAR_9 inclusive, otherwise 0. The
argument may be evaluated twice. */

#define IS_DIGIT(x) (!((x) < CHAR_0 || (x) > CHAR_9))
407
408 /* Table to identify hex digits. The tables in chartables are dependent on the
409 locale, and may mark arbitrary characters as digits. We want to recognize only
0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
411 costs 256 bytes, but it is a lot faster than doing character value tests (at
412 least in some simple cases I timed), and in some applications one wants PCRE2
413 to compile efficiently as well as match efficiently. The value in the table is
414 the binary hex digit value, or 0xff for non-hex digits. */
415
416 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
417 UTF-8 mode. */
418
#ifndef EBCDIC

/* Sixteen entries per row; the comment gives the code of the first entry in
the row. Only '0'-'9' (0x30-0x39), 'A'-'F' (0x41-0x46), and 'a'-'f'
(0x61-0x66) have values other than 0xff. */

static const uint8_t xdigitab[256] =
  {
  /* 0x00 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x10 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x20 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x30 */ 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,  /* 0 - 7 */
             0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff,  /* 8 - ? */
  /* 0x40 */ 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,  /* @ - G */
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x50 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x60 */ 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,  /* ` - g */
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x70 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x80 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x90 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xa0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xb0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xc0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xd0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xe0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xf0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
  };

#else

/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode.
Here the entries other than 0xff are at 0x81-0x86, 0xc1-0xc6, and 0xf0-0xf9. */

static const uint8_t xdigitab[256] =
  {
  /* 0x00 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x10 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x20 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x30 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x40 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x50 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x60 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x70 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x80 */ 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,  /* 128 - g */
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x90 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xa0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xb0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xc0 */ 0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,  /* { - G */
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xd0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xe0 */ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
             0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xf0 */ 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,  /* 0 - 7 */
             0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff   /* 8 - 255 */
  };
#endif  /* EBCDIC */
494
495
496 /* Table for handling alphanumeric escaped characters. Positive returns are
497 simple data values; negative values are for special things like \d and so on.
498 Zero means further processing is needed (for things like \x), or the escape is
499 invalid. */
500
501 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
502 in UTF-8 mode. It runs from '0' to 'z'. */
503
504 #ifndef EBCDIC
/* ESCAPES_FIRST and ESCAPES_LAST are the first and last characters covered by
the escapes table. UPPER_CASE(c) subtracts 32 from a lower case letter to give
its upper case form; the argument is parenthesized so that an expression can
be passed. */

#define ESCAPES_FIRST       CHAR_0
#define ESCAPES_LAST        CHAR_z
#define UPPER_CASE(c)       ((c)-32)
508
509 static const short int escapes[] = {
510 0, 0,
511 0, 0,
512 0, 0,
513 0, 0,
514 0, 0,
515 CHAR_COLON, CHAR_SEMICOLON,
516 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
517 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
518 CHAR_COMMERCIAL_AT, -ESC_A,
519 -ESC_B, -ESC_C,
520 -ESC_D, -ESC_E,
521 0, -ESC_G,
522 -ESC_H, 0,
523 0, -ESC_K,
524 0, 0,
525 -ESC_N, 0,
526 -ESC_P, -ESC_Q,
527 -ESC_R, -ESC_S,
528 0, 0,
529 -ESC_V, -ESC_W,
530 -ESC_X, 0,
531 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
532 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
533 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
534 CHAR_GRAVE_ACCENT, CHAR_BEL,
535 -ESC_b, 0,
536 -ESC_d, CHAR_ESC,
537 CHAR_FF, 0,
538 -ESC_h, 0,
539 0, -ESC_k,
540 0, 0,
541 CHAR_LF, 0,
542 -ESC_p, 0,
543 CHAR_CR, -ESC_s,
544 CHAR_HT, 0,
545 -ESC_v, -ESC_w,
546 0, 0,
547 -ESC_z
548 };
549
550 #else
551
552 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
553 It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code
554 is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a
555 because it is defined as 'a', which of course picks up the ASCII value. */
556
/* In a real EBCDIC environment the upper case letters are 64 above the lower
case ones. When the EBCDIC code is being tested on an ASCII system, the table
bounds are given as explicit EBCDIC code values and the ASCII case offset is
used. UPPER_CASE's argument is parenthesized so that an expression can be
passed. */

#if 'a' == 0x81                    /* Check for a real EBCDIC environment */
#define ESCAPES_FIRST       CHAR_a
#define ESCAPES_LAST        CHAR_9
#define UPPER_CASE(c)       ((c)+64)
#else                              /* Testing in an ASCII environment */
#define ESCAPES_FIRST  ((unsigned char)'\x81')   /* EBCDIC 'a' */
#define ESCAPES_LAST   ((unsigned char)'\xf9')   /* EBCDIC '9' */
#define UPPER_CASE(c)  ((c)-32)
#endif
566
567 static const short int escapes[] = {
568 /* 80 */ CHAR_BEL, -ESC_b, 0, -ESC_d, CHAR_ESC, CHAR_FF, 0,
569 /* 88 */ -ESC_h, 0, 0, '{', 0, 0, 0, 0,
570 /* 90 */ 0, 0, -ESC_k, 0, 0, CHAR_LF, 0, -ESC_p,
571 /* 98 */ 0, CHAR_CR, 0, '}', 0, 0, 0, 0,
572 /* A0 */ 0, '~', -ESC_s, CHAR_HT, 0, -ESC_v, -ESC_w, 0,
573 /* A8 */ 0, -ESC_z, 0, 0, 0, '[', 0, 0,
574 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
575 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
576 /* C0 */ '{', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G,
577 /* C8 */ -ESC_H, 0, 0, 0, 0, 0, 0, 0,
578 /* D0 */ '}', 0, -ESC_K, 0, 0, -ESC_N, 0, -ESC_P,
579 /* D8 */ -ESC_Q, -ESC_R, 0, 0, 0, 0, 0, 0,
580 /* E0 */ '\\', 0, -ESC_S, 0, 0, -ESC_V, -ESC_W, -ESC_X,
581 /* E8 */ 0, -ESC_Z, 0, 0, 0, 0, 0, 0,
582 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
583 /* F8 */ 0, 0
584 };
585
586 /* We also need a table of characters that may follow \c in an EBCDIC
587 environment for characters 0-31. */
588
/* The 32 characters from '@' to '_' in this order, followed by the string's
terminating zero. The string is constant data, hence the const qualifier. */

static const unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
590
591 #endif /* EBCDIC */
592
593
594 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
595 searched linearly. Put all the names into a single string, in order to reduce
596 the number of relocations when a shared library is dynamically linked. The
597 string is built from string macros so that it works in UTF-8 mode on EBCDIC
598 platforms. */
599
600 typedef struct verbitem {
601 unsigned int len; /* Length of verb name */
602 uint32_t meta; /* Base META_ code */
603 int has_arg; /* Argument requirement */
604 } verbitem;
605
606 static const char verbnames[] =
607 "\0" /* Empty name is a shorthand for MARK */
608 STRING_MARK0
609 STRING_ACCEPT0
610 STRING_F0
611 STRING_FAIL0
612 STRING_COMMIT0
613 STRING_PRUNE0
614 STRING_SKIP0
615 STRING_THEN;
616
617 static const verbitem verbs[] = {
618 { 0, META_MARK, +1 }, /* > 0 => must have an argument */
619 { 4, META_MARK, +1 },
620 { 6, META_ACCEPT, -1 }, /* < 0 => Optional argument, convert to pre-MARK */
621 { 1, META_FAIL, -1 },
622 { 4, META_FAIL, -1 },
623 { 6, META_COMMIT, 0 },
624 { 5, META_PRUNE, 0 }, /* Optional argument; bump META code if found */
625 { 4, META_SKIP, 0 },
626 { 4, META_THEN, 0 }
627 };
628
629 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
630
631 /* Verb opcodes, indexed by their META code offset from META_MARK. */
632
633 static const uint32_t verbops[] = {
634 OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,
635 OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };
636
637 /* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. */
638
639 typedef struct alasitem {
640 unsigned int len; /* Length of name */
641 uint32_t meta; /* Base META_ code */
642 } alasitem;
643
644 static const char alasnames[] =
645 STRING_pla0
646 STRING_plb0
647 STRING_napla0
648 STRING_naplb0
649 STRING_nla0
650 STRING_nlb0
651 STRING_positive_lookahead0
652 STRING_positive_lookbehind0
653 STRING_non_atomic_positive_lookahead0
654 STRING_non_atomic_positive_lookbehind0
655 STRING_negative_lookahead0
656 STRING_negative_lookbehind0
657 STRING_atomic0
658 STRING_sr0
659 STRING_asr0
660 STRING_script_run0
661 STRING_atomic_script_run;
662
663 static const alasitem alasmeta[] = {
664 { 3, META_LOOKAHEAD },
665 { 3, META_LOOKBEHIND },
666 { 5, META_LOOKAHEAD_NA },
667 { 5, META_LOOKBEHIND_NA },
668 { 3, META_LOOKAHEADNOT },
669 { 3, META_LOOKBEHINDNOT },
670 { 18, META_LOOKAHEAD },
671 { 19, META_LOOKBEHIND },
672 { 29, META_LOOKAHEAD_NA },
673 { 30, META_LOOKBEHIND_NA },
674 { 18, META_LOOKAHEADNOT },
675 { 19, META_LOOKBEHINDNOT },
676 { 6, META_ATOMIC },
677 { 2, META_SCRIPT_RUN }, /* sr = script run */
678 { 3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */
679 { 10, META_SCRIPT_RUN }, /* script run */
680 { 17, META_ATOMIC_SCRIPT_RUN } /* atomic script run */
681 };
682
683 static const int alascount = sizeof(alasmeta)/sizeof(alasitem);
684
685 /* Offsets from OP_STAR for case-independent and negative repeat opcodes. */
686
687 static uint32_t chartypeoffset[] = {
688 OP_STAR - OP_STAR, OP_STARI - OP_STAR,
689 OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };
690
691 /* Tables of names of POSIX character classes and their lengths. The names are
692 now all in a single string, to reduce the number of relocations when a shared
693 library is dynamically loaded. The list of lengths is terminated by a zero
694 length entry. The first three must be alpha, lower, upper, as this is assumed
695 for handling case independence. The indices for graph, print, and punct are
696 needed, so identify them. */
697
698 static const char posix_names[] =
699 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
700 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
701 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
702 STRING_word0 STRING_xdigit;
703
704 static const uint8_t posix_name_lengths[] = {
705 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
706
707 #define PC_GRAPH 8
708 #define PC_PRINT 9
709 #define PC_PUNCT 10
710
711 /* Table of class bit maps for each POSIX class. Each class is formed from a
712 base map, with an optional addition or removal of another map. Then, for some
713 classes, there is some additional tweaking: for [:blank:] the vertical space
714 characters are removed, and for [:alpha:] and [:alnum:] the underscore
715 character is removed. The triples in the table consist of the base map offset,
716 second map offset or -1 if no second map, and a non-negative value for map
717 addition or a negative value for map subtraction (if there are two maps). The
718 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
719 remove vertical space characters, 2 => remove underscore. */
720
721 static const int posix_class_maps[] = {
722 cbit_word, cbit_digit, -2, /* alpha */
723 cbit_lower, -1, 0, /* lower */
724 cbit_upper, -1, 0, /* upper */
725 cbit_word, -1, 2, /* alnum - word without underscore */
726 cbit_print, cbit_cntrl, 0, /* ascii */
727 cbit_space, -1, 1, /* blank - a GNU extension */
728 cbit_cntrl, -1, 0, /* cntrl */
729 cbit_digit, -1, 0, /* digit */
730 cbit_graph, -1, 0, /* graph */
731 cbit_print, -1, 0, /* print */
732 cbit_punct, -1, 0, /* punct */
733 cbit_space, -1, 0, /* space */
734 cbit_word, -1, 0, /* word - a Perl extension */
735 cbit_xdigit,-1, 0 /* xdigit */
736 };
737
#ifdef SUPPORT_UNICODE

/* The POSIX class Unicode property substitutes that are used in UCP mode must
be in the order of the POSIX class names, defined above. There is a pair of
values for each class. The table is constant data, hence the const
qualifier. */

static const int posix_substitutes[] = {
  PT_GC, ucp_L,     /* alpha */
  PT_PC, ucp_Ll,    /* lower */
  PT_PC, ucp_Lu,    /* upper */
  PT_ALNUM, 0,      /* alnum */
  -1, 0,            /* ascii, treat as non-UCP */
  -1, 1,            /* blank, treat as \h */
  PT_PC, ucp_Cc,    /* cntrl */
  PT_PC, ucp_Nd,    /* digit */
  PT_PXGRAPH, 0,    /* graph */
  PT_PXPRINT, 0,    /* print */
  PT_PXPUNCT, 0,    /* punct */
  PT_PXSPACE, 0,    /* space */   /* Xps is POSIX space, but from 8.34 */
  PT_WORD, 0,       /* word  */   /* Perl and POSIX space are the same */
  -1, 0             /* xdigit, treat as non-UCP */
};

/* The number of pairs in the table. The divisor is taken from the table's own
element type, so that the count stays right whatever that type is. */

#define POSIX_SUBSIZE \
  (sizeof(posix_substitutes) / (2*sizeof(posix_substitutes[0])))
#endif  /* SUPPORT_UNICODE */
761
762 /* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset
763 are allowed. */
764
/* Option bits that are accepted when PCRE2_LITERAL is set */

#define PUBLIC_LITERAL_COMPILE_OPTIONS \
  (PCRE2_ANCHORED | \
   PCRE2_AUTO_CALLOUT | \
   PCRE2_CASELESS | \
   PCRE2_ENDANCHORED | \
   PCRE2_FIRSTLINE | \
   PCRE2_LITERAL | \
   PCRE2_MATCH_INVALID_UTF | \
   PCRE2_NO_START_OPTIMIZE | \
   PCRE2_NO_UTF_CHECK | \
   PCRE2_USE_OFFSET_LIMIT | \
   PCRE2_UTF)

/* All accepted option bits: the literal subset plus the rest */

#define PUBLIC_COMPILE_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_OPTIONS | \
   PCRE2_ALLOW_EMPTY_CLASS | \
   PCRE2_ALT_BSUX | \
   PCRE2_ALT_CIRCUMFLEX | \
   PCRE2_ALT_VERBNAMES | \
   PCRE2_DOLLAR_ENDONLY | \
   PCRE2_DOTALL | \
   PCRE2_DUPNAMES | \
   PCRE2_EXTENDED | \
   PCRE2_EXTENDED_MORE | \
   PCRE2_MATCH_UNSET_BACKREF | \
   PCRE2_MULTILINE | \
   PCRE2_NEVER_BACKSLASH_C | \
   PCRE2_NEVER_UCP | \
   PCRE2_NEVER_UTF | \
   PCRE2_NO_AUTO_CAPTURE | \
   PCRE2_NO_AUTO_POSSESS | \
   PCRE2_NO_DOTSTAR_ANCHOR | \
   PCRE2_UCP | \
   PCRE2_UNGREEDY)

/* The same two masks for the extra options */

#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
  (PCRE2_EXTRA_MATCH_LINE | \
   PCRE2_EXTRA_MATCH_WORD)

#define PUBLIC_COMPILE_EXTRA_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS | \
   PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES | \
   PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL | \
   PCRE2_EXTRA_ESCAPED_CR_IS_LF | \
   PCRE2_EXTRA_ALT_BSUX)
786
787 /* Compile time error code numbers. They are given names so that they can more
788 easily be tracked. When a new number is added, the tables called eint1 and
789 eint2 in pcre2posix.c may need to be updated, and a new error text must be
790 added to compile_error_texts in pcre2_error.c. */
791
792 enum { ERR0 = COMPILE_ERROR_BASE,
793 ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR10,
794 ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
795 ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30,
796 ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
797 ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
798 ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
799 ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
800 ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
801 ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90,
802 ERR91, ERR92, ERR93, ERR94, ERR95, ERR96, ERR97, ERR98 };
803
804 /* This is a table of start-of-pattern options such as (*UTF) and settings such
805 as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
806 compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
807 generic and always supported. */
808
809 enum { PSO_OPT, /* Value is an option bit */
810 PSO_FLG, /* Value is a flag bit */
811 PSO_NL, /* Value is a newline type */
812 PSO_BSR, /* Value is a \R type */
813 PSO_LIMH, /* Read integer value for heap limit */
814 PSO_LIMM, /* Read integer value for match limit */
815 PSO_LIMD }; /* Read integer value for depth limit */
816
817 typedef struct pso {
818 const uint8_t *name;
819 uint16_t length;
820 uint16_t type;
821 uint32_t value;
822 } pso;
823
824 /* NB: STRING_UTFn_RIGHTPAR contains the length as well */
825
826 static pso pso_list[] = {
827 { (uint8_t *)STRING_UTFn_RIGHTPAR, PSO_OPT, PCRE2_UTF },
828 { (uint8_t *)STRING_UTF_RIGHTPAR, 4, PSO_OPT, PCRE2_UTF },
829 { (uint8_t *)STRING_UCP_RIGHTPAR, 4, PSO_OPT, PCRE2_UCP },
830 { (uint8_t *)STRING_NOTEMPTY_RIGHTPAR, 9, PSO_FLG, PCRE2_NOTEMPTY_SET },
831 { (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR, 17, PSO_FLG, PCRE2_NE_ATST_SET },
832 { (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR, 16, PSO_OPT, PCRE2_NO_AUTO_POSSESS },
833 { (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR },
834 { (uint8_t *)STRING_NO_JIT_RIGHTPAR, 7, PSO_FLG, PCRE2_NOJIT },
835 { (uint8_t *)STRING_NO_START_OPT_RIGHTPAR, 13, PSO_OPT, PCRE2_NO_START_OPTIMIZE },
836 { (uint8_t *)STRING_LIMIT_HEAP_EQ, 11, PSO_LIMH, 0 },
837 { (uint8_t *)STRING_LIMIT_MATCH_EQ, 12, PSO_LIMM, 0 },
838 { (uint8_t *)STRING_LIMIT_DEPTH_EQ, 12, PSO_LIMD, 0 },
839 { (uint8_t *)STRING_LIMIT_RECURSION_EQ, 16, PSO_LIMD, 0 },
840 { (uint8_t *)STRING_CR_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_CR },
841 { (uint8_t *)STRING_LF_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_LF },
842 { (uint8_t *)STRING_CRLF_RIGHTPAR, 5, PSO_NL, PCRE2_NEWLINE_CRLF },
843 { (uint8_t *)STRING_ANY_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_ANY },
844 { (uint8_t *)STRING_NUL_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_NUL },
845 { (uint8_t *)STRING_ANYCRLF_RIGHTPAR, 8, PSO_NL, PCRE2_NEWLINE_ANYCRLF },
846 { (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_ANYCRLF },
847 { (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_UNICODE }
848 };
849
850 /* This table is used when converting repeating opcodes into possessified
851 versions as a result of an explicit possessive quantifier such as ++. A zero
852 value means there is no possessified version - in those cases the item in
853 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
854 because all relevant opcodes are less than that. */
855
856 static const uint8_t opcode_possessify[] = {
857 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 15 */
858 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 - 31 */
859
860 0, /* NOTI */
861 OP_POSSTAR, 0, /* STAR, MINSTAR */
862 OP_POSPLUS, 0, /* PLUS, MINPLUS */
863 OP_POSQUERY, 0, /* QUERY, MINQUERY */
864 OP_POSUPTO, 0, /* UPTO, MINUPTO */
865 0, /* EXACT */
866 0, 0, 0, 0, /* POS{STAR,PLUS,QUERY,UPTO} */
867
868 OP_POSSTARI, 0, /* STARI, MINSTARI */
869 OP_POSPLUSI, 0, /* PLUSI, MINPLUSI */
870 OP_POSQUERYI, 0, /* QUERYI, MINQUERYI */
871 OP_POSUPTOI, 0, /* UPTOI, MINUPTOI */
872 0, /* EXACTI */
873 0, 0, 0, 0, /* POS{STARI,PLUSI,QUERYI,UPTOI} */
874
875 OP_NOTPOSSTAR, 0, /* NOTSTAR, NOTMINSTAR */
876 OP_NOTPOSPLUS, 0, /* NOTPLUS, NOTMINPLUS */
877 OP_NOTPOSQUERY, 0, /* NOTQUERY, NOTMINQUERY */
878 OP_NOTPOSUPTO, 0, /* NOTUPTO, NOTMINUPTO */
879 0, /* NOTEXACT */
880 0, 0, 0, 0, /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
881
882 OP_NOTPOSSTARI, 0, /* NOTSTARI, NOTMINSTARI */
883 OP_NOTPOSPLUSI, 0, /* NOTPLUSI, NOTMINPLUSI */
884 OP_NOTPOSQUERYI, 0, /* NOTQUERYI, NOTMINQUERYI */
885 OP_NOTPOSUPTOI, 0, /* NOTUPTOI, NOTMINUPTOI */
886 0, /* NOTEXACTI */
887 0, 0, 0, 0, /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
888
889 OP_TYPEPOSSTAR, 0, /* TYPESTAR, TYPEMINSTAR */
890 OP_TYPEPOSPLUS, 0, /* TYPEPLUS, TYPEMINPLUS */
891 OP_TYPEPOSQUERY, 0, /* TYPEQUERY, TYPEMINQUERY */
892 OP_TYPEPOSUPTO, 0, /* TYPEUPTO, TYPEMINUPTO */
893 0, /* TYPEEXACT */
894 0, 0, 0, 0, /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
895
896 OP_CRPOSSTAR, 0, /* CRSTAR, CRMINSTAR */
897 OP_CRPOSPLUS, 0, /* CRPLUS, CRMINPLUS */
898 OP_CRPOSQUERY, 0, /* CRQUERY, CRMINQUERY */
899 OP_CRPOSRANGE, 0, /* CRRANGE, CRMINRANGE */
900 0, 0, 0, 0, /* CRPOS{STAR,PLUS,QUERY,RANGE} */
901
902 0, 0, 0, /* CLASS, NCLASS, XCLASS */
903 0, 0, /* REF, REFI */
904 0, 0, /* DNREF, DNREFI */
905 0, 0 /* RECURSE, CALLOUT */
906 };
907
908
909 #ifdef DEBUG_SHOW_PARSED
910 /*************************************************
911 * Show the parsed pattern for debugging *
912 *************************************************/
913
914 /* For debugging the pre-scan, this code, which outputs the parsed data vector,
915 can be enabled. */
916
show_parsed(compile_block * cb)917 static void show_parsed(compile_block *cb)
918 {
919 uint32_t *pptr = cb->parsed_pattern;
920
921 for (;;)
922 {
923 int max, min;
924 PCRE2_SIZE offset;
925 uint32_t i;
926 uint32_t length;
927 uint32_t meta_arg = META_DATA(*pptr);
928
929 fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr);
930
931 if (*pptr < META_END)
932 {
933 if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", *pptr);
934 pptr++;
935 }
936
937 else switch (META_CODE(*pptr++))
938 {
939 default:
940 fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n");
941 return;
942
943 case META_END:
944 fprintf(stderr, "META_END\n");
945 return;
946
947 case META_CAPTURE:
948 fprintf(stderr, "META_CAPTURE %d", meta_arg);
949 break;
950
951 case META_RECURSE:
952 GETOFFSET(offset, pptr);
953 fprintf(stderr, "META_RECURSE %d %zd", meta_arg, offset);
954 break;
955
956 case META_BACKREF:
957 if (meta_arg < 10)
958 offset = cb->small_ref_offset[meta_arg];
959 else
960 GETOFFSET(offset, pptr);
961 fprintf(stderr, "META_BACKREF %d %zd", meta_arg, offset);
962 break;
963
964 case META_ESCAPE:
965 if (meta_arg == ESC_P || meta_arg == ESC_p)
966 {
967 uint32_t ptype = *pptr >> 16;
968 uint32_t pvalue = *pptr++ & 0xffff;
969 fprintf(stderr, "META \\%c %d %d", (meta_arg == ESC_P)? 'P':'p',
970 ptype, pvalue);
971 }
972 else
973 {
974 uint32_t cc;
975 /* There's just one escape we might have here that isn't negated in the
976 escapes table. */
977 if (meta_arg == ESC_g) cc = CHAR_g;
978 else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++)
979 {
980 if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break;
981 }
982 if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;
983 fprintf(stderr, "META \\%c", cc);
984 }
985 break;
986
987 case META_MINMAX:
988 min = *pptr++;
989 max = *pptr++;
990 if (max != REPEAT_UNLIMITED)
991 fprintf(stderr, "META {%d,%d}", min, max);
992 else
993 fprintf(stderr, "META {%d,}", min);
994 break;
995
996 case META_MINMAX_QUERY:
997 min = *pptr++;
998 max = *pptr++;
999 if (max != REPEAT_UNLIMITED)
1000 fprintf(stderr, "META {%d,%d}?", min, max);
1001 else
1002 fprintf(stderr, "META {%d,}?", min);
1003 break;
1004
1005 case META_MINMAX_PLUS:
1006 min = *pptr++;
1007 max = *pptr++;
1008 if (max != REPEAT_UNLIMITED)
1009 fprintf(stderr, "META {%d,%d}+", min, max);
1010 else
1011 fprintf(stderr, "META {%d,}+", min);
1012 break;
1013
1014 case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break;
1015 case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break;
1016 case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break;
1017 case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break;
1018 case META_DOT: fprintf(stderr, "META_DOT"); break;
1019 case META_ASTERISK: fprintf(stderr, "META *"); break;
1020 case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break;
1021 case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break;
1022 case META_PLUS: fprintf(stderr, "META +"); break;
1023 case META_PLUS_QUERY: fprintf(stderr, "META +?"); break;
1024 case META_PLUS_PLUS: fprintf(stderr, "META ++"); break;
1025 case META_QUERY: fprintf(stderr, "META ?"); break;
1026 case META_QUERY_QUERY: fprintf(stderr, "META ??"); break;
1027 case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break;
1028
1029 case META_ATOMIC: fprintf(stderr, "META (?>"); break;
1030 case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
1031 case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
1032 case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
1033 case META_LOOKAHEAD_NA: fprintf(stderr, "META (*napla:"); break;
1034 case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;
1035 case META_KET: fprintf(stderr, "META )"); break;
1036 case META_ALT: fprintf(stderr, "META | %d", meta_arg); break;
1037
1038 case META_CLASS: fprintf(stderr, "META ["); break;
1039 case META_CLASS_NOT: fprintf(stderr, "META [^"); break;
1040 case META_CLASS_END: fprintf(stderr, "META ]"); break;
1041 case META_CLASS_EMPTY: fprintf(stderr, "META []"); break;
1042 case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break;
1043
1044 case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;
1045 case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;
1046
1047 case META_POSIX: fprintf(stderr, "META_POSIX %d", *pptr++); break;
1048 case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %d", *pptr++); break;
1049
1050 case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;
1051 case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;
1052 case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break;
1053 case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break;
1054 case META_SKIP: fprintf(stderr, "META (*SKIP)"); break;
1055 case META_THEN: fprintf(stderr, "META (*THEN)"); break;
1056
1057 case META_OPTIONS: fprintf(stderr, "META_OPTIONS 0x%02x", *pptr++); break;
1058
1059 case META_LOOKBEHIND:
1060 fprintf(stderr, "META (?<= %d offset=", meta_arg);
1061 GETOFFSET(offset, pptr);
1062 fprintf(stderr, "%zd", offset);
1063 break;
1064
1065 case META_LOOKBEHIND_NA:
1066 fprintf(stderr, "META (*naplb: %d offset=", meta_arg);
1067 GETOFFSET(offset, pptr);
1068 fprintf(stderr, "%zd", offset);
1069 break;
1070
1071 case META_LOOKBEHINDNOT:
1072 fprintf(stderr, "META (?<! %d offset=", meta_arg);
1073 GETOFFSET(offset, pptr);
1074 fprintf(stderr, "%zd", offset);
1075 break;
1076
1077 case META_CALLOUT_NUMBER:
1078 fprintf(stderr, "META (?C%d) next=%d/%d", pptr[2], pptr[0],
1079 pptr[1]);
1080 pptr += 3;
1081 break;
1082
1083 case META_CALLOUT_STRING:
1084 {
1085 uint32_t patoffset = *pptr++; /* Offset of next pattern item */
1086 uint32_t patlength = *pptr++; /* Length of next pattern item */
1087 fprintf(stderr, "META (?Cstring) length=%d offset=", *pptr++);
1088 GETOFFSET(offset, pptr);
1089 fprintf(stderr, "%zd next=%d/%d", offset, patoffset, patlength);
1090 }
1091 break;
1092
1093 case META_RECURSE_BYNAME:
1094 fprintf(stderr, "META (?(&name) length=%d offset=", *pptr++);
1095 GETOFFSET(offset, pptr);
1096 fprintf(stderr, "%zd", offset);
1097 break;
1098
1099 case META_BACKREF_BYNAME:
1100 fprintf(stderr, "META_BACKREF_BYNAME length=%d offset=", *pptr++);
1101 GETOFFSET(offset, pptr);
1102 fprintf(stderr, "%zd", offset);
1103 break;
1104
1105 case META_COND_NUMBER:
1106 fprintf(stderr, "META_COND_NUMBER %d offset=", pptr[SIZEOFFSET]);
1107 GETOFFSET(offset, pptr);
1108 fprintf(stderr, "%zd", offset);
1109 pptr++;
1110 break;
1111
1112 case META_COND_DEFINE:
1113 fprintf(stderr, "META (?(DEFINE) offset=");
1114 GETOFFSET(offset, pptr);
1115 fprintf(stderr, "%zd", offset);
1116 break;
1117
1118 case META_COND_VERSION:
1119 fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
1120 fprintf(stderr, "%d.", *pptr++);
1121 fprintf(stderr, "%d)", *pptr++);
1122 break;
1123
1124 case META_COND_NAME:
1125 fprintf(stderr, "META (?(<name>) length=%d offset=", *pptr++);
1126 GETOFFSET(offset, pptr);
1127 fprintf(stderr, "%zd", offset);
1128 break;
1129
1130 case META_COND_RNAME:
1131 fprintf(stderr, "META (?(R&name) length=%d offset=", *pptr++);
1132 GETOFFSET(offset, pptr);
1133 fprintf(stderr, "%zd", offset);
1134 break;
1135
1136 /* This is kept as a name, because it might be. */
1137
1138 case META_COND_RNUMBER:
1139 fprintf(stderr, "META (?(Rnumber) length=%d offset=", *pptr++);
1140 GETOFFSET(offset, pptr);
1141 fprintf(stderr, "%zd", offset);
1142 break;
1143
1144 case META_MARK:
1145 fprintf(stderr, "META (*MARK:");
1146 goto SHOWARG;
1147
1148 case META_COMMIT_ARG:
1149 fprintf(stderr, "META (*COMMIT:");
1150 goto SHOWARG;
1151
1152 case META_PRUNE_ARG:
1153 fprintf(stderr, "META (*PRUNE:");
1154 goto SHOWARG;
1155
1156 case META_SKIP_ARG:
1157 fprintf(stderr, "META (*SKIP:");
1158 goto SHOWARG;
1159
1160 case META_THEN_ARG:
1161 fprintf(stderr, "META (*THEN:");
1162 SHOWARG:
1163 length = *pptr++;
1164 for (i = 0; i < length; i++)
1165 {
1166 uint32_t cc = *pptr++;
1167 if (cc > 32 && cc < 128) fprintf(stderr, "%c", cc);
1168 else fprintf(stderr, "\\x{%x}", cc);
1169 }
1170 fprintf(stderr, ") length=%u", length);
1171 break;
1172 }
1173 fprintf(stderr, "\n");
1174 }
1175 return;
1176 }
1177 #endif /* DEBUG_SHOW_PARSED */
1178
1179
1180
1181 /*************************************************
1182 * Copy compiled code *
1183 *************************************************/
1184
1185 /* Compiled JIT code cannot be copied, so the new compiled block has no
1186 associated JIT data. */
1187
1188 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy(const pcre2_code * code)1189 pcre2_code_copy(const pcre2_code *code)
1190 {
1191 PCRE2_SIZE* ref_count;
1192 pcre2_code *newcode;
1193
1194 if (code == NULL) return NULL;
1195 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1196 if (newcode == NULL) return NULL;
1197 memcpy(newcode, code, code->blocksize);
1198 newcode->executable_jit = NULL;
1199
1200 /* If the code is one that has been deserialized, increment the reference count
1201 in the decoded tables. */
1202
1203 if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1204 {
1205 ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1206 (*ref_count)++;
1207 }
1208
1209 return newcode;
1210 }
1211
1212
1213
1214 /*************************************************
1215 * Copy compiled code and character tables *
1216 *************************************************/
1217
1218 /* Compiled JIT code cannot be copied, so the new compiled block has no
1219 associated JIT data. This version of code_copy also makes a separate copy of
1220 the character tables. */
1221
1222 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy_with_tables(const pcre2_code * code)1223 pcre2_code_copy_with_tables(const pcre2_code *code)
1224 {
1225 PCRE2_SIZE* ref_count;
1226 pcre2_code *newcode;
1227 uint8_t *newtables;
1228
1229 if (code == NULL) return NULL;
1230 newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1231 if (newcode == NULL) return NULL;
1232 memcpy(newcode, code, code->blocksize);
1233 newcode->executable_jit = NULL;
1234
1235 newtables = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE),
1236 code->memctl.memory_data);
1237 if (newtables == NULL)
1238 {
1239 code->memctl.free((void *)newcode, code->memctl.memory_data);
1240 return NULL;
1241 }
1242 memcpy(newtables, code->tables, TABLES_LENGTH);
1243 ref_count = (PCRE2_SIZE *)(newtables + TABLES_LENGTH);
1244 *ref_count = 1;
1245
1246 newcode->tables = newtables;
1247 newcode->flags |= PCRE2_DEREF_TABLES;
1248 return newcode;
1249 }
1250
1251
1252
1253 /*************************************************
1254 * Free compiled code *
1255 *************************************************/
1256
1257 PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_code_free(pcre2_code * code)1258 pcre2_code_free(pcre2_code *code)
1259 {
1260 PCRE2_SIZE* ref_count;
1261
1262 if (code != NULL)
1263 {
1264 if (code->executable_jit != NULL)
1265 PRIV(jit_free)(code->executable_jit, &code->memctl);
1266
1267 if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1268 {
1269 /* Decoded tables belong to the codes after deserialization, and they must
1270 be freed when there are no more references to them. The *ref_count should
1271 always be > 0. */
1272
1273 ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1274 if (*ref_count > 0)
1275 {
1276 (*ref_count)--;
1277 if (*ref_count == 0)
1278 code->memctl.free((void *)code->tables, code->memctl.memory_data);
1279 }
1280 }
1281
1282 code->memctl.free(code, code->memctl.memory_data);
1283 }
1284 }
1285
1286
1287
1288 /*************************************************
1289 * Read a number, possibly signed *
1290 *************************************************/
1291
1292 /* This function is used to read numbers in the pattern. The initial pointer
1293 must be the sign or first digit of the number. When relative values (introduced
1294 by + or -) are allowed, they are relative group numbers, and the result must be
1295 greater than zero.
1296
1297 Arguments:
1298 ptrptr points to the character pointer variable
1299 ptrend points to the end of the input string
1300 allow_sign if < 0, sign not allowed; if >= 0, sign is relative to this
1301 max_value the largest number allowed
1302 max_error the error to give for an over-large number
1303 intptr where to put the result
1304 errcodeptr where to put an error code
1305
1306 Returns: TRUE - a number was read
1307 FALSE - errorcode == 0 => no number was found
1308 errorcode != 0 => an error occurred
1309 */
1310
1311 static BOOL
read_number(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,int32_t allow_sign,uint32_t max_value,uint32_t max_error,int * intptr,int * errorcodeptr)1312 read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,
1313 uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr)
1314 {
1315 int sign = 0;
1316 uint32_t n = 0;
1317 PCRE2_SPTR ptr = *ptrptr;
1318 BOOL yield = FALSE;
1319
1320 *errorcodeptr = 0;
1321
1322 if (allow_sign >= 0 && ptr < ptrend)
1323 {
1324 if (*ptr == CHAR_PLUS)
1325 {
1326 sign = +1;
1327 max_value -= allow_sign;
1328 ptr++;
1329 }
1330 else if (*ptr == CHAR_MINUS)
1331 {
1332 sign = -1;
1333 ptr++;
1334 }
1335 }
1336
1337 if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE;
1338 while (ptr < ptrend && IS_DIGIT(*ptr))
1339 {
1340 n = n * 10 + *ptr++ - CHAR_0;
1341 if (n > max_value)
1342 {
1343 *errorcodeptr = max_error;
1344 goto EXIT;
1345 }
1346 }
1347
1348 if (allow_sign >= 0 && sign != 0)
1349 {
1350 if (n == 0)
1351 {
1352 *errorcodeptr = ERR26; /* +0 and -0 are not allowed */
1353 goto EXIT;
1354 }
1355
1356 if (sign > 0) n += allow_sign;
1357 else if ((int)n > allow_sign)
1358 {
1359 *errorcodeptr = ERR15; /* Non-existent subpattern */
1360 goto EXIT;
1361 }
1362 else n = allow_sign + 1 - n;
1363 }
1364
1365 yield = TRUE;
1366
1367 EXIT:
1368 *intptr = n;
1369 *ptrptr = ptr;
1370 return yield;
1371 }
1372
1373
1374
1375 /*************************************************
1376 * Read repeat counts *
1377 *************************************************/
1378
1379 /* Read an item of the form {n,m} and return the values if non-NULL pointers
1380 are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a
1381 larger value is used for "unlimited". We have to use signed arguments for
1382 read_number() because it is capable of returning a signed value.
1383
1384 Arguments:
1385 ptrptr points to pointer to character after'{'
1386 ptrend pointer to end of input
1387 minp if not NULL, pointer to int for min
1388 maxp if not NULL, pointer to int for max (-1 if no max)
1389 returned as -1 if no max
1390 errorcodeptr points to error code variable
1391
1392 Returns: FALSE if not a repeat quantifier, errorcode set zero
1393 FALSE on error, with errorcode set non-zero
1394 TRUE on success, with pointer updated to point after '}'
1395 */
1396
1397 static BOOL
read_repeat_counts(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,uint32_t * minp,uint32_t * maxp,int * errorcodeptr)1398 read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp,
1399 uint32_t *maxp, int *errorcodeptr)
1400 {
1401 PCRE2_SPTR p = *ptrptr;
1402 BOOL yield = FALSE;
1403 int32_t min = 0;
1404 int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */
1405
1406 /* NB read_number() initializes the error code to zero. The only error is for a
1407 number that is too big. */
1408
1409 if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))
1410 goto EXIT;
1411
1412 if (p >= ptrend) goto EXIT;
1413
1414 if (*p == CHAR_RIGHT_CURLY_BRACKET)
1415 {
1416 p++;
1417 max = min;
1418 }
1419
1420 else
1421 {
1422 if (*p++ != CHAR_COMMA || p >= ptrend) goto EXIT;
1423 if (*p != CHAR_RIGHT_CURLY_BRACKET)
1424 {
1425 if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max,
1426 errorcodeptr) || p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1427 goto EXIT;
1428 if (max < min)
1429 {
1430 *errorcodeptr = ERR4;
1431 goto EXIT;
1432 }
1433 }
1434 p++;
1435 }
1436
1437 yield = TRUE;
1438 if (minp != NULL) *minp = (uint32_t)min;
1439 if (maxp != NULL) *maxp = (uint32_t)max;
1440
1441 /* Update the pattern pointer on success, or after an error, but not when
1442 the result is "not a repeat quantifier". */
1443
1444 EXIT:
1445 if (yield || *errorcodeptr != 0) *ptrptr = p;
1446 return yield;
1447 }
1448
1449
1450
1451 /*************************************************
1452 * Handle escapes *
1453 *************************************************/
1454
1455 /* This function is called when a \ has been encountered. It either returns a
1456 positive value for a simple escape such as \d, or 0 for a data character, which
1457 is placed in chptr. A backreference to group n is returned as negative n. On
1458 entry, ptr is pointing at the character after \. On exit, it points after the
1459 final code unit of the escape sequence.
1460
1461 This function is also called from pcre2_substitute() to handle escape sequences
1462 in replacement strings. In this case, the cb argument is NULL, and in the case
1463 of escapes that have further processing, only sequences that define a data
1464 character are recognised. The isclass argument is not relevant; the options
1465 argument is the final value of the compiled pattern's options.
1466
Arguments:
  ptrptr         points to the input position pointer
  ptrend         points to the end of the input
  chptr          points to a returned data character
  errorcodeptr   points to the errorcode variable (containing zero)
  options        the current options bits
  extra_options  the current extra options bits
  isclass        TRUE if inside a character class
  cb             compile data block or NULL when called from pcre2_substitute()
1475
1476 Returns: zero => a data character
1477 positive => a special escape sequence
1478 negative => a numerical back reference
1479 on error, errorcodeptr is set non-zero
1480 */
1481
1482 int
PRIV(check_escape)1483 PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
1484 int *errorcodeptr, uint32_t options, uint32_t extra_options, BOOL isclass,
1485 compile_block *cb)
1486 {
1487 BOOL utf = (options & PCRE2_UTF) != 0;
1488 PCRE2_SPTR ptr = *ptrptr;
1489 uint32_t c, cc;
1490 int escape = 0;
1491 int i;
1492
1493 /* If backslash is at the end of the string, it's an error. */
1494
1495 if (ptr >= ptrend)
1496 {
1497 *errorcodeptr = ERR1;
1498 return 0;
1499 }
1500
1501 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
1502 *errorcodeptr = 0; /* Be optimistic */
1503
1504 /* Non-alphanumerics are literals, so we just leave the value in c. An initial
1505 value test saves a memory lookup for code points outside the alphanumeric
1506 range. */
1507
1508 if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {} /* Definitely literal */
1509
1510 /* Otherwise, do a table lookup. Non-zero values need little processing here. A
1511 positive value is a literal value for something like \n. A negative value is
1512 the negation of one of the ESC_ macros that is passed back for handling by the
1513 calling function. Some extra checking is needed for \N because only \N{U+dddd}
1514 is supported. If the value is zero, further processing is handled below. */
1515
1516 else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
1517 {
1518 if (i > 0)
1519 {
1520 c = (uint32_t)i;
1521 if (c == CHAR_CR && (extra_options & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
1522 c = CHAR_LF;
1523 }
1524 else /* Negative table entry */
1525 {
1526 escape = -i; /* Else return a special escape */
1527 if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))
1528 cb->external_flags |= PCRE2_HASBKPORX; /* Note \P, \p, or \X */
1529
1530 /* Perl supports \N{name} for character names and \N{U+dddd} for numerical
1531 Unicode code points, as well as plain \N for "not newline". PCRE does not
1532 support \N{name}. However, it does support quantification such as \N{2,3},
1533 so if \N{ is not followed by U+dddd we check for a quantifier. */
1534
1535 if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1536 {
1537 PCRE2_SPTR p = ptr + 1;
1538
1539 /* \N{U+ can be handled by the \x{ code. However, this construction is
1540 not valid in EBCDIC environments because it specifies a Unicode
1541 character, not a codepoint in the local code. For example \N{U+0041}
1542 must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode
1543 casing semantics for the entire pattern, so allow it only in UTF (i.e.
1544 Unicode) mode. */
1545
1546 if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)
1547 {
1548 #ifdef EBCDIC
1549 *errorcodeptr = ERR93;
1550 #else
1551 if (utf)
1552 {
1553 ptr = p + 1;
1554 escape = 0; /* Not a fancy escape after all */
1555 goto COME_FROM_NU;
1556 }
1557 else *errorcodeptr = ERR93;
1558 #endif
1559 }
1560
1561 /* Give an error if what follows is not a quantifier, but don't override
1562 an error set by the quantifier reader (e.g. number overflow). */
1563
1564 else
1565 {
1566 if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&
1567 *errorcodeptr == 0)
1568 *errorcodeptr = ERR37;
1569 }
1570 }
1571 }
1572 }
1573
1574 /* Escapes that need further processing, including those that are unknown, have
1575 a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
1576 \o, and \x are recognized (\u and \U can never appear as they are used for case
1577 forcing). */
1578
1579 else
1580 {
1581 int s;
1582 PCRE2_SPTR oldptr;
1583 BOOL overflow;
1584 BOOL alt_bsux =
1585 ((options & PCRE2_ALT_BSUX) | (extra_options & PCRE2_EXTRA_ALT_BSUX)) != 0;
1586
1587 /* Filter calls from pcre2_substitute(). */
1588
1589 if (cb == NULL)
1590 {
1591 if (c != CHAR_c && c != CHAR_o && c != CHAR_x)
1592 {
1593 *errorcodeptr = ERR3;
1594 return 0;
1595 }
1596 alt_bsux = FALSE; /* Do not modify \x handling */
1597 }
1598
1599 switch (c)
1600 {
1601 /* A number of Perl escapes are not handled by PCRE. We give an explicit
1602 error. */
1603
1604 case CHAR_F:
1605 case CHAR_l:
1606 case CHAR_L:
1607 *errorcodeptr = ERR37;
1608 break;
1609
1610 /* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX
1611 is set. Otherwise, \u must be followed by exactly four hex digits or, if
1612 PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.
1613 Otherwise it is a lowercase u letter. This gives some compatibility with
1614 ECMAScript (aka JavaScript). */
1615
1616 case CHAR_u:
1617 if (!alt_bsux) *errorcodeptr = ERR37; else
1618 {
1619 uint32_t xc;
1620
1621 if (ptr >= ptrend) break;
1622 if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
1623 (extra_options & PCRE2_EXTRA_ALT_BSUX) != 0)
1624 {
1625 PCRE2_SPTR hptr = ptr + 1;
1626 cc = 0;
1627
1628 while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
1629 {
1630 if ((cc & 0xf0000000) != 0) /* Test for 32-bit overflow */
1631 {
1632 *errorcodeptr = ERR77;
1633 ptr = hptr; /* Show where */
1634 break; /* *hptr != } will cause another break below */
1635 }
1636 cc = (cc << 4) | xc;
1637 hptr++;
1638 }
1639
1640 if (hptr == ptr + 1 || /* No hex digits */
1641 hptr >= ptrend || /* Hit end of input */
1642 *hptr != CHAR_RIGHT_CURLY_BRACKET) /* No } terminator */
1643 break; /* Hex escape not recognized */
1644
1645 c = cc; /* Accept the code point */
1646 ptr = hptr + 1;
1647 }
1648
1649 else /* Must be exactly 4 hex digits */
1650 {
1651 if (ptrend - ptr < 4) break; /* Less than 4 chars */
1652 if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */
1653 if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
1654 cc = (cc << 4) | xc;
1655 if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */
1656 cc = (cc << 4) | xc;
1657 if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */
1658 c = (cc << 4) | xc;
1659 ptr += 4;
1660 }
1661
1662 if (utf)
1663 {
1664 if (c > 0x10ffffU) *errorcodeptr = ERR77;
1665 else
1666 if (c >= 0xd800 && c <= 0xdfff &&
1667 (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1668 *errorcodeptr = ERR73;
1669 }
1670 else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
1671 }
1672 break;
1673
1674 /* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set,
1675 in which case it is an upper case letter. */
1676
1677 case CHAR_U:
1678 if (!alt_bsux) *errorcodeptr = ERR37;
1679 break;
1680
1681 /* In a character class, \g is just a literal "g". Outside a character
1682 class, \g must be followed by one of a number of specific things:
1683
1684 (1) A number, either plain or braced. If positive, it is an absolute
1685 backreference. If negative, it is a relative backreference. This is a Perl
1686 5.10 feature.
1687
1688 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1689 is part of Perl's movement towards a unified syntax for back references. As
1690 this is synonymous with \k{name}, we fudge it up by pretending it really
1691 was \k{name}.
1692
1693 (3) For Oniguruma compatibility we also support \g followed by a name or a
1694 number either in angle brackets or in single quotes. However, these are
1695 (possibly recursive) subroutine calls, _not_ backreferences. We return
1696 the ESC_g code.
1697
1698 Summary: Return a negative number for a numerical back reference, ESC_k for
1699 a named back reference, and ESC_g for a named or numbered subroutine call.
1700 */
1701
1702 case CHAR_g:
1703 if (isclass) break;
1704
1705 if (ptr >= ptrend)
1706 {
1707 *errorcodeptr = ERR57;
1708 break;
1709 }
1710
1711 if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE)
1712 {
1713 escape = ESC_g;
1714 break;
1715 }
1716
1717 /* If there is a brace delimiter, try to read a numerical reference. If
1718 there isn't one, assume we have a name and treat it as \k. */
1719
1720 if (*ptr == CHAR_LEFT_CURLY_BRACKET)
1721 {
1722 PCRE2_SPTR p = ptr + 1;
1723 if (!read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1724 errorcodeptr))
1725 {
1726 if (*errorcodeptr == 0) escape = ESC_k; /* No number found */
1727 break;
1728 }
1729 if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1730 {
1731 *errorcodeptr = ERR57;
1732 break;
1733 }
1734 ptr = p + 1;
1735 }
1736
1737 /* Read an undelimited number */
1738
1739 else
1740 {
1741 if (!read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1742 errorcodeptr))
1743 {
1744 if (*errorcodeptr == 0) *errorcodeptr = ERR57; /* No number found */
1745 break;
1746 }
1747 }
1748
1749 if (s <= 0)
1750 {
1751 *errorcodeptr = ERR15;
1752 break;
1753 }
1754
1755 escape = -s;
1756 break;
1757
1758 /* The handling of escape sequences consisting of a string of digits
1759 starting with one that is not zero is not straightforward. Perl has changed
1760 over the years. Nowadays \g{} for backreferences and \o{} for octal are
1761 recommended to avoid the ambiguities in the old syntax.
1762
1763 Outside a character class, the digits are read as a decimal number. If the
1764 number is less than 10, or if there are that many previous extracting left
1765 brackets, it is a back reference. Otherwise, up to three octal digits are
1766 read to form an escaped character code. Thus \123 is likely to be octal 123
1767 (cf \0123, which is octal 012 followed by the literal 3).
1768
1769 Inside a character class, \ followed by a digit is always either a literal
1770 8 or 9 or an octal number. */
1771
1772 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1773 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1774
1775 if (!isclass)
1776 {
1777 oldptr = ptr;
1778 ptr--; /* Back to the digit */
1779 if (!read_number(&ptr, ptrend, -1, INT_MAX/10 - 1, ERR61, &s,
1780 errorcodeptr))
1781 break;
1782
1783 /* \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
1784 are octal escapes if there are not that many previous captures. */
1785
1786 if (s < 10 || oldptr[-1] >= CHAR_8 || s <= (int)cb->bracount)
1787 {
1788 if (s > (int)MAX_GROUP_NUMBER) *errorcodeptr = ERR61;
1789 else escape = -s; /* Indicates a back reference */
1790 break;
1791 }
1792 ptr = oldptr; /* Put the pointer back and fall through */
1793 }
1794
1795 /* Handle a digit following \ when the number is not a back reference, or
1796 we are within a character class. If the first digit is 8 or 9, Perl used to
1797 generate a binary zero and then treat the digit as a following literal. At
1798 least by Perl 5.18 this changed so as not to insert the binary zero. */
1799
1800 if (c >= CHAR_8) break;
1801
1802 /* Fall through */
1803
1804 /* \0 always starts an octal number, but we may drop through to here with a
1805 larger first octal digit. The original code used just to take the least
1806 significant 8 bits of octal numbers (I think this is what early Perls used
1807 to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1808 but no more than 3 octal digits. */
1809
1810 case CHAR_0:
1811 c -= CHAR_0;
1812 while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1813 c = c * 8 + *ptr++ - CHAR_0;
1814 #if PCRE2_CODE_UNIT_WIDTH == 8
1815 if (!utf && c > 0xff) *errorcodeptr = ERR51;
1816 #endif
1817 break;
1818
1819 /* \o is a relatively new Perl feature, supporting a more general way of
1820 specifying character codes in octal. The only supported form is \o{ddd}. */
1821
1822 case CHAR_o:
1823 if (ptr >= ptrend || *ptr++ != CHAR_LEFT_CURLY_BRACKET)
1824 {
1825 ptr--;
1826 *errorcodeptr = ERR55;
1827 }
1828 else if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1829 *errorcodeptr = ERR78;
1830 else
1831 {
1832 c = 0;
1833 overflow = FALSE;
1834 while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1835 {
1836 cc = *ptr++;
1837 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1838 #if PCRE2_CODE_UNIT_WIDTH == 32
1839 if (c >= 0x20000000l) { overflow = TRUE; break; }
1840 #endif
1841 c = (c << 3) + (cc - CHAR_0);
1842 #if PCRE2_CODE_UNIT_WIDTH == 8
1843 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1844 #elif PCRE2_CODE_UNIT_WIDTH == 16
1845 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1846 #elif PCRE2_CODE_UNIT_WIDTH == 32
1847 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1848 #endif
1849 }
1850 if (overflow)
1851 {
1852 while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1853 *errorcodeptr = ERR34;
1854 }
1855 else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1856 {
1857 if (utf && c >= 0xd800 && c <= 0xdfff &&
1858 (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1859 {
1860 ptr--;
1861 *errorcodeptr = ERR73;
1862 }
1863 }
1864 else
1865 {
1866 ptr--;
1867 *errorcodeptr = ERR64;
1868 }
1869 }
1870 break;
1871
1872 /* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed
1873 by two hexadecimal digits. Otherwise it is a lowercase x letter. */
1874
1875 case CHAR_x:
1876 if (alt_bsux)
1877 {
1878 uint32_t xc;
1879 if (ptrend - ptr < 2) break; /* Less than 2 characters */
1880 if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */
1881 if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
1882 c = (cc << 4) | xc;
1883 ptr += 2;
1884 }
1885
1886 /* Handle \x in Perl's style. \x{ddd} is a character code which can be
1887 greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
1888 digits. If not, { used to be treated as a data character. However, Perl
1889 seems to read hex digits up to the first non-such, and ignore the rest, so
1890 that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1891 now gives an error. */
1892
1893 else
1894 {
1895 if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1896 {
1897 #ifndef EBCDIC
1898 COME_FROM_NU:
1899 #endif
1900 if (++ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1901 {
1902 *errorcodeptr = ERR78;
1903 break;
1904 }
1905 c = 0;
1906 overflow = FALSE;
1907
1908 while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff)
1909 {
1910 ptr++;
1911 if (c == 0 && cc == 0) continue; /* Leading zeroes */
1912 #if PCRE2_CODE_UNIT_WIDTH == 32
1913 if (c >= 0x10000000l) { overflow = TRUE; break; }
1914 #endif
1915 c = (c << 4) | cc;
1916 if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
1917 {
1918 overflow = TRUE;
1919 break;
1920 }
1921 }
1922
1923 if (overflow)
1924 {
1925 while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
1926 *errorcodeptr = ERR34;
1927 }
1928 else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1929 {
1930 if (utf && c >= 0xd800 && c <= 0xdfff &&
1931 (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1932 {
1933 ptr--;
1934 *errorcodeptr = ERR73;
1935 }
1936 }
1937
1938 /* If the sequence of hex digits does not end with '}', give an error.
1939 We used just to recognize this construct and fall through to the normal
1940 \x handling, but nowadays Perl gives an error, which seems much more
1941 sensible, so we do too. */
1942
1943 else
1944 {
1945 ptr--;
1946 *errorcodeptr = ERR67;
1947 }
1948 } /* End of \x{} processing */
1949
1950 /* Read up to two hex digits after \x */
1951
1952 else
1953 {
1954 c = 0;
1955 if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */
1956 ptr++;
1957 c = cc;
1958 if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */
1959 ptr++;
1960 c = (c << 4) | cc;
1961 } /* End of \xdd handling */
1962 } /* End of Perl-style \x handling */
1963 break;
1964
1965 /* The handling of \c is different in ASCII and EBCDIC environments. In an
1966 ASCII (or Unicode) environment, an error is given if the character
1967 following \c is not a printable ASCII character. Otherwise, the following
1968 character is upper-cased if it is a letter, and after that the 0x40 bit is
1969 flipped. The result is the value of the escape.
1970
1971 In an EBCDIC environment the handling of \c is compatible with the
1972 specification in the perlebcdic document. The following character must be
1973 a letter or one of small number of special characters. These provide a
1974 means of defining the character values 0-31.
1975
1976 For testing the EBCDIC handling of \c in an ASCII environment, recognize
1977 the EBCDIC value of 'c' explicitly. */
1978
1979 #if defined EBCDIC && 'a' != 0x81
1980 case 0x83:
1981 #else
1982 case CHAR_c:
1983 #endif
1984 if (ptr >= ptrend)
1985 {
1986 *errorcodeptr = ERR2;
1987 break;
1988 }
1989 c = *ptr;
1990 if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
1991
1992 /* Handle \c in an ASCII/Unicode environment. */
1993
1994 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1995 if (c < 32 || c > 126) /* Excludes all non-printable ASCII */
1996 {
1997 *errorcodeptr = ERR68;
1998 break;
1999 }
2000 c ^= 0x40;
2001
2002 /* Handle \c in an EBCDIC environment. The special case \c? is converted to
2003 255 (0xff) or 95 (0x5f) if other characters suggest we are using the
2004 POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.)
2005 The other valid sequences correspond to a list of specific characters. */
2006
2007 #else
2008 if (c == CHAR_QUESTION_MARK)
2009 c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
2010 else
2011 {
2012 for (i = 0; i < 32; i++)
2013 {
2014 if (c == ebcdic_escape_c[i]) break;
2015 }
2016 if (i < 32) c = i; else *errorcodeptr = ERR68;
2017 }
2018 #endif /* EBCDIC */
2019
2020 ptr++;
2021 break;
2022
2023 /* Any other alphanumeric following \ is an error. Perl gives an error only
2024 if in warning mode, but PCRE doesn't have a warning mode. */
2025
2026 default:
2027 *errorcodeptr = ERR3;
2028 *ptrptr = ptr - 1; /* Point to the character at fault */
2029 return 0;
2030 }
2031 }
2032
2033 /* Set the pointer to the next character before returning. */
2034
2035 *ptrptr = ptr;
2036 *chptr = c;
2037 return escape;
2038 }
2039
2040
2041
2042 #ifdef SUPPORT_UNICODE
2043 /*************************************************
2044 * Handle \P and \p *
2045 *************************************************/
2046
2047 /* This function is called after \P or \p has been encountered, provided that
2048 PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
2049 contents of ptrptr are pointing after the P or p. On exit, it is left pointing
2050 after the final code unit of the escape sequence.
2051
2052 Arguments:
2053 ptrptr the pattern position pointer
2054 negptr a boolean that is set TRUE for negation else FALSE
2055 ptypeptr an unsigned int that is set to the type value
2056 pdataptr an unsigned int that is set to the detailed property value
2057 errorcodeptr the error code variable
2058 cb the compile data
2059
2060 Returns: TRUE if the type value was found, or FALSE for an invalid type
2061 */
2062
2063 static BOOL
get_ucp(PCRE2_SPTR * ptrptr,BOOL * negptr,uint16_t * ptypeptr,uint16_t * pdataptr,int * errorcodeptr,compile_block * cb)2064 get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr,
2065 uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)
2066 {
2067 PCRE2_UCHAR c;
2068 PCRE2_SIZE i, bot, top;
2069 PCRE2_SPTR ptr = *ptrptr;
2070 PCRE2_UCHAR name[32];
2071
2072 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2073 c = *ptr++;
2074 *negptr = FALSE;
2075
2076 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
2077 negation. */
2078
2079 if (c == CHAR_LEFT_CURLY_BRACKET)
2080 {
2081 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2082 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
2083 {
2084 *negptr = TRUE;
2085 ptr++;
2086 }
2087 for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
2088 {
2089 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2090 c = *ptr++;
2091 if (c == CHAR_NUL) goto ERROR_RETURN;
2092 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
2093 name[i] = c;
2094 }
2095 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
2096 name[i] = 0;
2097 }
2098
2099 /* Otherwise there is just one following character, which must be an ASCII
2100 letter. */
2101
2102 else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0)
2103 {
2104 name[0] = c;
2105 name[1] = 0;
2106 }
2107 else goto ERROR_RETURN;
2108
2109 *ptrptr = ptr;
2110
2111 /* Search for a recognized property name using binary chop. */
2112
2113 bot = 0;
2114 top = PRIV(utt_size);
2115
2116 while (bot < top)
2117 {
2118 int r;
2119 i = (bot + top) >> 1;
2120 r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
2121 if (r == 0)
2122 {
2123 *ptypeptr = PRIV(utt)[i].type;
2124 *pdataptr = PRIV(utt)[i].value;
2125 return TRUE;
2126 }
2127 if (r > 0) bot = i + 1; else top = i;
2128 }
2129 *errorcodeptr = ERR47; /* Unrecognized name */
2130 return FALSE;
2131
2132 ERROR_RETURN: /* Malformed \P or \p */
2133 *errorcodeptr = ERR46;
2134 *ptrptr = ptr;
2135 return FALSE;
2136 }
2137 #endif
2138
2139
2140
2141 /*************************************************
2142 * Check for POSIX class syntax *
2143 *************************************************/
2144
2145 /* This function is called when the sequence "[:" or "[." or "[=" is
2146 encountered in a character class. It checks whether this is followed by a
2147 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2148 reach an unescaped ']' without the special preceding character, return FALSE.
2149
2150 Originally, this function only recognized a sequence of letters between the
2151 terminators, but it seems that Perl recognizes any sequence of characters,
2152 though of course unknown POSIX names are subsequently rejected. Perl gives an
2153 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2154 didn't consider this to be a POSIX class. Likewise for [:1234:].
2155
2156 The problem in trying to be exactly like Perl is in the handling of escapes. We
2157 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2158 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2159 below handles the special cases \\ and \], but does not try to do any other
2160 escape processing. This makes it different from Perl for cases such as
2161 [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
2162 not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
2163 when Perl does, I think.
2164
2165 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2166 It seems that the appearance of a nested POSIX class supersedes an apparent
2167 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2168 a digit. This is handled by returning FALSE if the start of a new group with
2169 the same terminator is encountered, since the next closing sequence must close
2170 the nested group, not the outer one.
2171
2172 In Perl, unescaped square brackets may also appear as part of class names. For
2173 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2174 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2175 seem right at all. PCRE does not allow closing square brackets in POSIX class
2176 names.
2177
2178 Arguments:
2179 ptr pointer to the character after the initial [ (colon, dot, equals)
2180 ptrend pointer to the end of the pattern
2181 endptr where to return a pointer to the terminating ':', '.', or '='
2182
2183 Returns: TRUE or FALSE
2184 */
2185
2186 static BOOL
check_posix_syntax(PCRE2_SPTR ptr,PCRE2_SPTR ptrend,PCRE2_SPTR * endptr)2187 check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)
2188 {
2189 PCRE2_UCHAR terminator; /* Don't combine these lines; the Solaris cc */
2190 terminator = *ptr++; /* compiler warns about "non-constant" initializer. */
2191
2192 for (; ptrend - ptr >= 2; ptr++)
2193 {
2194 if (*ptr == CHAR_BACKSLASH &&
2195 (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
2196 ptr++;
2197
2198 else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
2199 *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2200
2201 else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2202 {
2203 *endptr = ptr;
2204 return TRUE;
2205 }
2206 }
2207
2208 return FALSE;
2209 }
2210
2211
2212
2213 /*************************************************
2214 * Check POSIX class name *
2215 *************************************************/
2216
2217 /* This function is called to check the name given in a POSIX-style class entry
2218 such as [:alnum:].
2219
2220 Arguments:
2221 ptr points to the first letter
2222 len the length of the name
2223
2224 Returns: a value representing the name, or -1 if unknown
2225 */
2226
2227 static int
check_posix_name(PCRE2_SPTR ptr,int len)2228 check_posix_name(PCRE2_SPTR ptr, int len)
2229 {
2230 const char *pn = posix_names;
2231 int yield = 0;
2232 while (posix_name_lengths[yield] != 0)
2233 {
2234 if (len == posix_name_lengths[yield] &&
2235 PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
2236 pn += posix_name_lengths[yield] + 1;
2237 yield++;
2238 }
2239 return -1;
2240 }
2241
2242
2243
2244 /*************************************************
2245 * Read a subpattern or VERB name *
2246 *************************************************/
2247
2248 /* This function is called from parse_regex() below whenever it needs to read
2249 the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial
2250 pointer must be to the character before the name. If that character is '*' we
2251 are reading a verb or alpha assertion name. The pointer is updated to point
2252 after the name, for a VERB or alpha assertion name, or after the name's
2253 terminator for a subpattern name. Returning both the offset and the name
2254 pointer is redundant information, but some callers use one and some the other,
2255 so it is simplest just to return both.
2256
2257 Arguments:
2258 ptrptr points to the character pointer variable
2259 ptrend points to the end of the input string
2260 utf true if the input is UTF-encoded
2261 terminator the terminator of a subpattern name must be this
2262 offsetptr where to put the offset from the start of the pattern
2263 nameptr where to put a pointer to the name in the input
2264 namelenptr where to put the length of the name
2265 errorcodeptr where to put an error code
2266 cb pointer to the compile data block
2267
2268 Returns: TRUE if a name was read
2269 FALSE otherwise, with error code set
2270 */
2271
2272 static BOOL
read_name(PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,BOOL utf,uint32_t terminator,PCRE2_SIZE * offsetptr,PCRE2_SPTR * nameptr,uint32_t * namelenptr,int * errorcodeptr,compile_block * cb)2273 read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator,
2274 PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,
2275 int *errorcodeptr, compile_block *cb)
2276 {
2277 PCRE2_SPTR ptr = *ptrptr;
2278 BOOL is_group = (*ptr != CHAR_ASTERISK);
2279
2280 if (++ptr >= ptrend) /* No characters in name */
2281 {
2282 *errorcodeptr = is_group? ERR62: /* Subpattern name expected */
2283 ERR60; /* Verb not recognized or malformed */
2284 goto FAILED;
2285 }
2286
2287 *nameptr = ptr;
2288 *offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
2289
2290 /* In UTF mode, a group name may contain letters and decimal digits as defined
2291 by Unicode properties, and underscores, but must not start with a digit. */
2292
2293 #ifdef SUPPORT_UNICODE
2294 if (utf && is_group)
2295 {
2296 uint32_t c, type;
2297
2298 GETCHAR(c, ptr);
2299 type = UCD_CHARTYPE(c);
2300
2301 if (type == ucp_Nd)
2302 {
2303 *errorcodeptr = ERR44;
2304 goto FAILED;
2305 }
2306
2307 for(;;)
2308 {
2309 if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
2310 c != CHAR_UNDERSCORE) break;
2311 ptr++;
2312 FORWARDCHARTEST(ptr, ptrend);
2313 if (ptr >= ptrend) break;
2314 GETCHAR(c, ptr);
2315 type = UCD_CHARTYPE(c);
2316 }
2317 }
2318 else
2319 #else
2320 (void)utf; /* Avoid compiler warning */
2321 #endif /* SUPPORT_UNICODE */
2322
2323 /* Handle non-group names and group names in non-UTF modes. A group name must
2324 not start with a digit. If either of the others start with a digit it just
2325 won't be recognized. */
2326
2327 {
2328 if (is_group && IS_DIGIT(*ptr))
2329 {
2330 *errorcodeptr = ERR44;
2331 goto FAILED;
2332 }
2333
2334 while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0)
2335 {
2336 ptr++;
2337 }
2338 }
2339
2340 /* Check name length */
2341
2342 if (ptr > *nameptr + MAX_NAME_SIZE)
2343 {
2344 *errorcodeptr = ERR48;
2345 goto FAILED;
2346 }
2347 *namelenptr = ptr - *nameptr;
2348
2349 /* Subpattern names must not be empty, and their terminator is checked here.
2350 (What follows a verb or alpha assertion name is checked separately.) */
2351
2352 if (is_group)
2353 {
2354 if (ptr == *nameptr)
2355 {
2356 *errorcodeptr = ERR62; /* Subpattern name expected */
2357 goto FAILED;
2358 }
2359 if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator)
2360 {
2361 *errorcodeptr = ERR42;
2362 goto FAILED;
2363 }
2364 ptr++;
2365 }
2366
2367 *ptrptr = ptr;
2368 return TRUE;
2369
2370 FAILED:
2371 *ptrptr = ptr;
2372 return FALSE;
2373 }
2374
2375
2376
2377 /*************************************************
2378 * Manage callouts at start of cycle *
2379 *************************************************/
2380
2381 /* At the start of a new item in parse_regex() we are able to record the
2382 details of the previous item in a prior callout, and also to set up an
2383 automatic callout if enabled. Avoid having two adjacent automatic callouts,
2384 which would otherwise happen for items such as \Q that contribute nothing to
2385 the parsed pattern.
2386
2387 Arguments:
2388 ptr current pattern pointer
2389 pcalloutptr points to a pointer to previous callout, or NULL
2390 auto_callout TRUE if auto_callouts are enabled
2391 parsed_pattern the parsed pattern pointer
2392 cb compile block
2393
2394 Returns: possibly updated parsed_pattern pointer.
2395 */
2396
2397 static uint32_t *
manage_callouts(PCRE2_SPTR ptr,uint32_t ** pcalloutptr,BOOL auto_callout,uint32_t * parsed_pattern,compile_block * cb)2398 manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,
2399 uint32_t *parsed_pattern, compile_block *cb)
2400 {
2401 uint32_t *previous_callout = *pcalloutptr;
2402
2403 if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr -
2404 cb->start_pattern - (PCRE2_SIZE)previous_callout[1]);
2405
2406 if (!auto_callout) previous_callout = NULL; else
2407 {
2408 if (previous_callout == NULL ||
2409 previous_callout != parsed_pattern - 4 ||
2410 previous_callout[3] != 255)
2411 {
2412 previous_callout = parsed_pattern; /* Set up new automatic callout */
2413 parsed_pattern += 4;
2414 previous_callout[0] = META_CALLOUT_NUMBER;
2415 previous_callout[2] = 0;
2416 previous_callout[3] = 255;
2417 }
2418 previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
2419 }
2420
2421 *pcalloutptr = previous_callout;
2422 return parsed_pattern;
2423 }
2424
2425
2426
2427 /*************************************************
2428 * Parse regex and identify named groups *
2429 *************************************************/
2430
2431 /* This function is called first of all. It scans the pattern and does two
2432 things: (1) It identifies capturing groups and makes a table of named capturing
2433 groups so that information about them is fully available to both the compiling
2434 scans. (2) It writes a parsed version of the pattern with comments omitted and
2435 escapes processed into the parsed_pattern vector.
2436
2437 Arguments:
2438 ptr points to the start of the pattern
2439 options compiling dynamic options (may change during the scan)
2440 has_lookbehind points to a boolean, set TRUE if a lookbehind is found
2441 cb pointer to the compile data block
2442
2443 Returns: zero on success or a non-zero error code, with the
2444 error offset placed in the cb field
2445 */
2446
2447 /* A structure and some flags for dealing with nested groups. */
2448
/* One saved entry per nested group. NOTE(review): the meaning of each field is
not visible in this part of the file; presumably the NSF_* bits below are the
values kept in "flags" - confirm against parse_regex(). */

typedef struct nest_save {
  uint16_t nest_depth;
  uint16_t reset_group;
  uint16_t max_group;
  uint16_t flags;
  uint32_t options;
} nest_save;

#define NSF_RESET       (1u << 0)
#define NSF_CONDASSERT  (1u << 1)
#define NSF_ATOMICSR    (1u << 2)

/* Every option that can be changed from inside a pattern has to be tracked
while parsing. A few (PCRE2_EXTENDED, for instance) are acted on entirely by
the parser, but all of them are tracked so that META_OPTIONS items carry the
right values into the main compiling phase. */

#define PARSE_TRACKED_OPTIONS ( \
  PCRE2_CASELESS        | \
  PCRE2_DOTALL          | \
  PCRE2_DUPNAMES        | \
  PCRE2_EXTENDED        | \
  PCRE2_EXTENDED_MORE   | \
  PCRE2_MULTILINE       | \
  PCRE2_NO_AUTO_CAPTURE | \
  PCRE2_UNGREEDY)

/* States used while analyzing ranges in character classes. The two OK values
must stay last. */

enum {
  RANGE_NO         = 0,
  RANGE_STARTED    = 1,
  RANGE_OK_ESCAPED = 2,
  RANGE_OK_LITERAL = 3
};
2474
2475 /* Only in 32-bit mode can there be literals > META_END. A macro encapsulates
2476 the storing of literal values in the main parsed pattern, where they can always
2477 be quantified. */
2478
/* PARSED_LITERAL(c, p) stores the literal c through the output pointer p,
advancing p, and then sets okquantifier (a variable that must be in scope
wherever the macro is used) to TRUE. In 32-bit mode a value >= META_END is
preceded by META_BIGVALUE. Both variants are wrapped in do { } while (0) so
that a use followed by a semicolon is exactly one statement and is therefore
safe as the body of an unbraced if or else; the arguments are parenthesized so
that expressions may be passed. */

#if PCRE2_CODE_UNIT_WIDTH == 32
#define PARSED_LITERAL(c, p) \
  do \
    { \
    if ((c) >= META_END) *(p)++ = META_BIGVALUE; \
    *(p)++ = (c); \
    okquantifier = TRUE; \
    } \
  while (0)
#else
#define PARSED_LITERAL(c, p) \
  do \
    { \
    *(p)++ = (c); \
    okquantifier = TRUE; \
    } \
  while (0)
#endif
2489
2490 /* Here's the actual function. */
2491
parse_regex(PCRE2_SPTR ptr,uint32_t options,BOOL * has_lookbehind,compile_block * cb)2492 static int parse_regex(PCRE2_SPTR ptr, uint32_t options, BOOL *has_lookbehind,
2493 compile_block *cb)
2494 {
2495 uint32_t c;
2496 uint32_t delimiter;
2497 uint32_t namelen;
2498 uint32_t class_range_state;
2499 uint32_t *verblengthptr = NULL; /* Value avoids compiler warning */
2500 uint32_t *verbstartptr = NULL;
2501 uint32_t *previous_callout = NULL;
2502 uint32_t *parsed_pattern = cb->parsed_pattern;
2503 uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
2504 uint32_t meta_quantifier = 0;
2505 uint32_t add_after_mark = 0;
2506 uint32_t extra_options = cb->cx->extra_options;
2507 uint16_t nest_depth = 0;
2508 int after_manual_callout = 0;
2509 int expect_cond_assert = 0;
2510 int errorcode = 0;
2511 int escape;
2512 int i;
2513 BOOL inescq = FALSE;
2514 BOOL inverbname = FALSE;
2515 BOOL utf = (options & PCRE2_UTF) != 0;
2516 BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;
2517 BOOL isdupname;
2518 BOOL negate_class;
2519 BOOL okquantifier = FALSE;
2520 PCRE2_SPTR thisptr;
2521 PCRE2_SPTR name;
2522 PCRE2_SPTR ptrend = cb->end_pattern;
2523 PCRE2_SPTR verbnamestart = NULL; /* Value avoids compiler warning */
2524 named_group *ng;
2525 nest_save *top_nest, *end_nests;
2526
2527 /* Insert leading items for word and line matching (features provided for the
2528 benefit of pcre2grep). */
2529
2530 if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
2531 {
2532 *parsed_pattern++ = META_CIRCUMFLEX;
2533 *parsed_pattern++ = META_NOCAPTURE;
2534 }
2535 else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
2536 {
2537 *parsed_pattern++ = META_ESCAPE + ESC_b;
2538 *parsed_pattern++ = META_NOCAPTURE;
2539 }
2540
2541 /* If the pattern is actually a literal string, process it separately to avoid
2542 cluttering up the main loop. */
2543
2544 if ((options & PCRE2_LITERAL) != 0)
2545 {
2546 while (ptr < ptrend)
2547 {
2548 if (parsed_pattern >= parsed_pattern_end)
2549 {
2550 errorcode = ERR63; /* Internal error (parsed pattern overflow) */
2551 goto FAILED;
2552 }
2553 thisptr = ptr;
2554 GETCHARINCTEST(c, ptr);
2555 if (auto_callout)
2556 parsed_pattern = manage_callouts(thisptr, &previous_callout,
2557 auto_callout, parsed_pattern, cb);
2558 PARSED_LITERAL(c, parsed_pattern);
2559 }
2560 goto PARSED_END;
2561 }
2562
2563 /* Process a real regex which may contain meta-characters. */
2564
2565 top_nest = NULL;
2566 end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
2567
2568 /* The size of the nest_save structure might not be a factor of the size of the
2569 workspace. Therefore we must round down end_nests so as to correctly avoid
2570 creating a nest_save that spans the end of the workspace. */
2571
2572 end_nests = (nest_save *)((char *)end_nests -
2573 ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
2574
2575 /* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */
2576
2577 if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;
2578
2579 /* Now scan the pattern */
2580
2581 while (ptr < ptrend)
2582 {
2583 int prev_expect_cond_assert;
2584 uint32_t min_repeat, max_repeat;
2585 uint32_t set, unset, *optset;
2586 uint32_t terminator;
2587 uint32_t prev_meta_quantifier;
2588 BOOL prev_okquantifier;
2589 PCRE2_SPTR tempptr;
2590 PCRE2_SIZE offset;
2591
2592 if (parsed_pattern >= parsed_pattern_end)
2593 {
2594 errorcode = ERR63; /* Internal error (parsed pattern overflow) */
2595 goto FAILED;
2596 }
2597
2598 if (nest_depth > cb->cx->parens_nest_limit)
2599 {
2600 errorcode = ERR19;
2601 goto FAILED; /* Parentheses too deeply nested */
2602 }
2603
2604 /* Get next input character, save its position for callout handling. */
2605
2606 thisptr = ptr;
2607 GETCHARINCTEST(c, ptr);
2608
2609 /* Copy quoted literals until \E, allowing for the possibility of automatic
2610 callouts, except when processing a (*VERB) "name". */
2611
2612 if (inescq)
2613 {
2614 if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
2615 {
2616 inescq = FALSE;
2617 ptr++; /* Skip E */
2618 }
2619 else
2620 {
2621 if (expect_cond_assert > 0) /* A literal is not allowed if we are */
2622 { /* expecting a conditional assertion, */
2623 ptr--; /* but an empty \Q\E sequence is OK. */
2624 errorcode = ERR28;
2625 goto FAILED;
2626 }
2627 if (inverbname)
2628 { /* Don't use PARSED_LITERAL() because it */
2629 #if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */
2630 if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2631 #endif
2632 *parsed_pattern++ = c;
2633 }
2634 else
2635 {
2636 if (after_manual_callout-- <= 0)
2637 parsed_pattern = manage_callouts(thisptr, &previous_callout,
2638 auto_callout, parsed_pattern, cb);
2639 PARSED_LITERAL(c, parsed_pattern);
2640 }
2641 meta_quantifier = 0;
2642 }
2643 continue; /* Next character */
2644 }
2645
2646 /* If we are processing the "name" part of a (*VERB:NAME) item, all
2647 characters up to the closing parenthesis are literals except when
2648 PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q
2649 and \E and escaped characters are allowed (no character types such as \d). If
2650 PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do
2651 this by not entering the special (*VERB:NAME) processing - they are then
2652 picked up below. Note that c is a character, not a code unit, so we must not
2653 use MAX_255 to test its size because MAX_255 tests code units and is assumed
2654 TRUE in 8-bit mode. */
2655
2656 if (inverbname &&
2657 (
2658 /* EITHER: not both options set */
2659 ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) !=
2660 (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) ||
2661 #ifdef SUPPORT_UNICODE
2662 /* OR: character > 255 AND not Unicode Pattern White Space */
2663 (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) ||
2664 #endif
2665 /* OR: not a # comment or isspace() white space */
2666 (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0
2667 #ifdef SUPPORT_UNICODE
2668 /* and not CHAR_NEL when Unicode is supported */
2669 && c != CHAR_NEL
2670 #endif
2671 )))
2672 {
2673 PCRE2_SIZE verbnamelength;
2674
2675 switch(c)
2676 {
2677 default: /* Don't use PARSED_LITERAL() because it */
2678 #if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */
2679 if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2680 #endif
2681 *parsed_pattern++ = c;
2682 break;
2683
2684 case CHAR_RIGHT_PARENTHESIS:
2685 inverbname = FALSE;
2686 /* This is the length in characters */
2687 verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);
2688 /* But the limit on the length is in code units */
2689 if (ptr - verbnamestart - 1 > (int)MAX_MARK)
2690 {
2691 ptr--;
2692 errorcode = ERR76;
2693 goto FAILED;
2694 }
2695 *verblengthptr = (uint32_t)verbnamelength;
2696
2697 /* If this name was on a verb such as (*ACCEPT) which does not continue,
2698 a (*MARK) was generated for the name. We now add the original verb as the
2699 next item. */
2700
2701 if (add_after_mark != 0)
2702 {
2703 *parsed_pattern++ = add_after_mark;
2704 add_after_mark = 0;
2705 }
2706 break;
2707
2708 case CHAR_BACKSLASH:
2709 if ((options & PCRE2_ALT_VERBNAMES) != 0)
2710 {
2711 escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
2712 cb->cx->extra_options, FALSE, cb);
2713 if (errorcode != 0) goto FAILED;
2714 }
2715 else escape = 0; /* Treat all as literal */
2716
2717 switch(escape)
2718 {
2719 case 0: /* Don't use PARSED_LITERAL() because it */
2720 #if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */
2721 if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2722 #endif
2723 *parsed_pattern++ = c;
2724 break;
2725
2726 case ESC_Q:
2727 inescq = TRUE;
2728 break;
2729
2730 case ESC_E: /* Ignore */
2731 break;
2732
2733 default:
2734 errorcode = ERR40; /* Invalid in verb name */
2735 goto FAILED;
2736 }
2737 }
2738 continue; /* Next character in pattern */
2739 }
2740
2741 /* Not a verb name character. At this point we must process everything that
2742 must not change the quantification state. This is mainly comments, but we
2743 handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as
2744 A+, as in Perl. An isolated \E is ignored. */
2745
2746 if (c == CHAR_BACKSLASH && ptr < ptrend)
2747 {
2748 if (*ptr == CHAR_Q || *ptr == CHAR_E)
2749 {
2750 inescq = *ptr == CHAR_Q;
2751 ptr++;
2752 continue;
2753 }
2754 }
2755
2756 /* Skip over whitespace and # comments in extended mode. Note that c is a
2757 character, not a code unit, so we must not use MAX_255 to test its size
2758 because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The
2759 whitespace characters are those designated as "Pattern White Space" by
2760 Unicode, which are the isspace() characters plus CHAR_NEL (next line), which is
2761 U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a
2762 subset of space characters that match \h and \v. */
2763
2764 if ((options & PCRE2_EXTENDED) != 0)
2765 {
2766 if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;
2767 #ifdef SUPPORT_UNICODE
2768 if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue;
2769 #endif
2770 if (c == CHAR_NUMBER_SIGN)
2771 {
2772 while (ptr < ptrend)
2773 {
2774 if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */
2775 { /* IS_NEWLINE sets cb->nllen. */
2776 ptr += cb->nllen;
2777 break;
2778 }
2779 ptr++;
2780 #ifdef SUPPORT_UNICODE
2781 if (utf) FORWARDCHARTEST(ptr, ptrend);
2782 #endif
2783 }
2784 continue; /* Next character in pattern */
2785 }
2786 }
2787
2788 /* Skip over bracketed comments */
2789
2790 if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 &&
2791 ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
2792 {
2793 while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);
2794 if (ptr >= ptrend)
2795 {
2796 errorcode = ERR18; /* A special error for missing ) in a comment */
2797 goto FAILED; /* to make it easier to debug. */
2798 }
2799 ptr++;
2800 continue; /* Next character in pattern */
2801 }
2802
2803 /* If the next item is not a quantifier, fill in length of any previous
2804 callout and create an auto callout if required. */
2805
2806 if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&
2807 (c != CHAR_LEFT_CURLY_BRACKET ||
2808 (tempptr = ptr,
2809 !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))
2810 {
2811 if (after_manual_callout-- <= 0)
2812 parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,
2813 parsed_pattern, cb);
2814 }
2815
2816 /* If expect_cond_assert is 2, we have just passed (?( and are expecting an
2817 assertion, possibly preceded by a callout. If the value is 1, we have just
2818 had the callout and expect an assertion. There must be at least 3 more
2819 characters in all cases. When expect_cond_assert is 2, we know that the
2820 current character is an opening parenthesis, as otherwise we wouldn't be
2821 here. However, when it is 1, we need to check, and it's easiest just to check
2822 always. Note that expect_cond_assert may be negative, since all callouts just
2823 decrement it. */
2824
2825 if (expect_cond_assert > 0)
2826 {
2827 BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
2828 (ptr[0] == CHAR_QUESTION_MARK || ptr[0] == CHAR_ASTERISK);
2829 if (ok)
2830 {
2831 if (ptr[0] == CHAR_ASTERISK) /* New alpha assertion format, possibly */
2832 {
2833 ok = MAX_255(ptr[1]) && (cb->ctypes[ptr[1]] & ctype_lcletter) != 0;
2834 }
2835 else switch(ptr[1]) /* Traditional symbolic format */
2836 {
2837 case CHAR_C:
2838 ok = expect_cond_assert == 2;
2839 break;
2840
2841 case CHAR_EQUALS_SIGN:
2842 case CHAR_EXCLAMATION_MARK:
2843 break;
2844
2845 case CHAR_LESS_THAN_SIGN:
2846 ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;
2847 break;
2848
2849 default:
2850 ok = FALSE;
2851 }
2852 }
2853
2854 if (!ok)
2855 {
2856 ptr--; /* Adjust error offset */
2857 errorcode = ERR28;
2858 goto FAILED;
2859 }
2860 }
2861
2862 /* Remember whether we are expecting a conditional assertion, and set the
2863 default for this item. */
2864
2865 prev_expect_cond_assert = expect_cond_assert;
2866 expect_cond_assert = 0;
2867
2868 /* Remember quantification status for the previous significant item, then set
2869 default for this item. */
2870
2871 prev_okquantifier = okquantifier;
2872 prev_meta_quantifier = meta_quantifier;
2873 okquantifier = FALSE;
2874 meta_quantifier = 0;
2875
2876 /* If the previous significant item was a quantifier, adjust the parsed code
2877 if there is a following modifier. The base meta value is always followed by
2878 the PLUS and QUERY values, in that order. We do this here rather than after
2879 reading a quantifier so that intervening comments and /x whitespace can be
2880 ignored without having to replicate code. */
2881
2882 if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS))
2883 {
2884 parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] =
2885 prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?
2886 0x00020000u : 0x00010000u);
2887 continue; /* Next character in pattern */
2888 }
2889
2890
2891 /* Process the next item in the main part of a pattern. */
2892
2893 switch(c)
2894 {
2895 default: /* Non-special character */
2896 PARSED_LITERAL(c, parsed_pattern);
2897 break;
2898
2899
2900 /* ---- Escape sequence ---- */
2901
2902 case CHAR_BACKSLASH:
2903 tempptr = ptr;
2904 escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
2905 cb->cx->extra_options, FALSE, cb);
2906 if (errorcode != 0)
2907 {
2908 ESCAPE_FAILED:
2909 if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
2910 goto FAILED;
2911 ptr = tempptr;
2912 if (ptr >= ptrend) c = CHAR_BACKSLASH; else
2913 {
2914 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
2915 }
2916 escape = 0; /* Treat as literal character */
2917 }
2918
2919 /* The escape was a data escape or literal character. */
2920
2921 if (escape == 0)
2922 {
2923 PARSED_LITERAL(c, parsed_pattern);
2924 }
2925
2926 /* The escape was a back (or forward) reference. We keep the offset in
2927 order to give a more useful diagnostic for a bad forward reference. For
2928 references to groups numbered less than 10 we can't use more than two items
2929 in parsed_pattern because they may be just two characters in the input (and
2930 in a 64-bit world an offset may need two elements). So for them, the offset
2931 of the first occurrence is held in a special vector. */
2932
2933 else if (escape < 0)
2934 {
2935 offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 1);
2936 escape = -escape;
2937 *parsed_pattern++ = META_BACKREF | (uint32_t)escape;
2938 if (escape < 10)
2939 {
2940 if (cb->small_ref_offset[escape] == PCRE2_UNSET)
2941 cb->small_ref_offset[escape] = offset;
2942 }
2943 else
2944 {
2945 PUTOFFSET(offset, parsed_pattern);
2946 }
2947 okquantifier = TRUE;
2948 }
2949
2950 /* The escape was a character class such as \d etc. or other special
2951 escape indicator such as \A or \X. Most of them generate just a single
2952 parsed item, but \P and \p are followed by a 16-bit type and a 16-bit
2953 value. They are supported only when Unicode is available. The type and
2954 value are packed into a single 32-bit value so that the whole sequence
2955 uses only two elements in the parsed_vector. This is because the same
2956 coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is
2957 set.
2958
2959 There are also some cases where the escape sequence is followed by a name:
2960 \k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>
2961 and \g'name' are subroutine calls by name; \g{name} is a synonym for
2962 \k{name}. Note that \g<number> and \g'number' are handled by check_escape()
2963 and returned as a negative value (handled above). A name is coded as an
2964 offset into the pattern and a length. */
2965
2966 else switch (escape)
2967 {
2968 case ESC_C:
2969 #ifdef NEVER_BACKSLASH_C
2970 errorcode = ERR85;
2971 goto ESCAPE_FAILED;
2972 #else
2973 if ((options & PCRE2_NEVER_BACKSLASH_C) != 0)
2974 {
2975 errorcode = ERR83;
2976 goto ESCAPE_FAILED;
2977 }
2978 #endif
2979 okquantifier = TRUE;
2980 *parsed_pattern++ = META_ESCAPE + escape;
2981 break;
2982
2983 case ESC_X:
2984 #ifndef SUPPORT_UNICODE
2985 errorcode = ERR45; /* Supported only with Unicode support */
2986 goto ESCAPE_FAILED;
2987 #endif
2988 case ESC_H:
2989 case ESC_h:
2990 case ESC_N:
2991 case ESC_R:
2992 case ESC_V:
2993 case ESC_v:
2994 okquantifier = TRUE;
2995 *parsed_pattern++ = META_ESCAPE + escape;
2996 break;
2997
2998 default: /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */
2999 *parsed_pattern++ = META_ESCAPE + escape;
3000 break;
3001
3002 /* Escapes that change in UCP mode. Note that PCRE2_UCP will never be set
3003 without Unicode support because it is checked when pcre2_compile() is
3004 called. */
3005
3006 case ESC_d:
3007 case ESC_D:
3008 case ESC_s:
3009 case ESC_S:
3010 case ESC_w:
3011 case ESC_W:
3012 okquantifier = TRUE;
3013 if ((options & PCRE2_UCP) == 0)
3014 {
3015 *parsed_pattern++ = META_ESCAPE + escape;
3016 }
3017 else
3018 {
3019 *parsed_pattern++ = META_ESCAPE +
3020 ((escape == ESC_d || escape == ESC_s || escape == ESC_w)?
3021 ESC_p : ESC_P);
3022 switch(escape)
3023 {
3024 case ESC_d:
3025 case ESC_D:
3026 *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
3027 break;
3028
3029 case ESC_s:
3030 case ESC_S:
3031 *parsed_pattern++ = PT_SPACE << 16;
3032 break;
3033
3034 case ESC_w:
3035 case ESC_W:
3036 *parsed_pattern++ = PT_WORD << 16;
3037 break;
3038 }
3039 }
3040 break;
3041
3042 /* Unicode property matching */
3043
3044 case ESC_P:
3045 case ESC_p:
3046 #ifdef SUPPORT_UNICODE
3047 {
3048 BOOL negated;
3049 uint16_t ptype = 0, pdata = 0;
3050 if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3051 goto ESCAPE_FAILED;
3052 if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3053 *parsed_pattern++ = META_ESCAPE + escape;
3054 *parsed_pattern++ = (ptype << 16) | pdata;
3055 okquantifier = TRUE;
3056 }
3057 #else
3058 errorcode = ERR45;
3059 goto ESCAPE_FAILED;
3060 #endif
3061 break; /* End \P and \p */
3062
3063 /* When \g is used with quotes or angle brackets as delimiters, it is a
3064 numerical or named subroutine call, and control comes here. When used
3065 with brace delimiters it is a numerical back reference and does not come
3066 here because check_escape() returns it directly as a reference. \k is
3067 always a named back reference. */
3068
3069 case ESC_g:
3070 case ESC_k:
3071 if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET &&
3072 *ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE))
3073 {
3074 errorcode = (escape == ESC_g)? ERR57 : ERR69;
3075 goto ESCAPE_FAILED;
3076 }
3077 terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
3078 CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
3079 CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
3080
3081 /* For a non-braced \g, check for a numerical recursion. */
3082
3083 if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)
3084 {
3085 PCRE2_SPTR p = ptr + 1;
3086
3087 if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
3088 &errorcode))
3089 {
3090 if (p >= ptrend || *p != terminator)
3091 {
3092 errorcode = ERR57;
3093 goto ESCAPE_FAILED;
3094 }
3095 ptr = p;
3096 goto SET_RECURSION;
3097 }
3098 if (errorcode != 0) goto ESCAPE_FAILED;
3099 }
3100
3101 /* Not a numerical recursion */
3102
3103 if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
3104 &errorcode, cb)) goto ESCAPE_FAILED;
3105
3106 /* \k and \g when used with braces are back references, whereas \g used
3107 with quotes or angle brackets is a recursion */
3108
3109 *parsed_pattern++ =
3110 (escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)?
3111 META_BACKREF_BYNAME : META_RECURSE_BYNAME;
3112 *parsed_pattern++ = namelen;
3113
3114 PUTOFFSET(offset, parsed_pattern);
3115 okquantifier = TRUE;
3116 break; /* End special escape processing */
3117 }
3118 break; /* End escape sequence processing */
3119
3120
3121 /* ---- Single-character special items ---- */
3122
3123 case CHAR_CIRCUMFLEX_ACCENT:
3124 *parsed_pattern++ = META_CIRCUMFLEX;
3125 break;
3126
3127 case CHAR_DOLLAR_SIGN:
3128 *parsed_pattern++ = META_DOLLAR;
3129 break;
3130
3131 case CHAR_DOT:
3132 *parsed_pattern++ = META_DOT;
3133 okquantifier = TRUE;
3134 break;
3135
3136
3137 /* ---- Single-character quantifiers ---- */
3138
3139 case CHAR_ASTERISK:
3140 meta_quantifier = META_ASTERISK;
3141 goto CHECK_QUANTIFIER;
3142
3143 case CHAR_PLUS:
3144 meta_quantifier = META_PLUS;
3145 goto CHECK_QUANTIFIER;
3146
3147 case CHAR_QUESTION_MARK:
3148 meta_quantifier = META_QUERY;
3149 goto CHECK_QUANTIFIER;
3150
3151
3152 /* ---- Potential {n,m} quantifier ---- */
3153
3154 case CHAR_LEFT_CURLY_BRACKET:
3155 if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,
3156 &errorcode))
3157 {
3158 if (errorcode != 0) goto FAILED; /* Error in quantifier. */
3159 PARSED_LITERAL(c, parsed_pattern); /* Not a quantifier */
3160 break; /* No more quantifier processing */
3161 }
3162 meta_quantifier = META_MINMAX;
3163 /* Fall through */
3164
3165
3166 /* ---- Quantifier post-processing ---- */
3167
3168 /* Check that a quantifier is allowed after the previous item. */
3169
3170 CHECK_QUANTIFIER:
3171 if (!prev_okquantifier)
3172 {
3173 errorcode = ERR9;
3174 goto FAILED_BACK;
3175 }
3176
3177 /* Most (*VERB)s are not allowed to be quantified, but an ungreedy
3178 quantifier can be useful for (*ACCEPT) - meaning "succeed on backtrack", a
3179 sort of negated (*COMMIT). We therefore allow (*ACCEPT) to be quantified by
3180 wrapping it in non-capturing brackets, but we have to allow for a preceding
3181 (*MARK) for when (*ACCEPT) has an argument. */
3182
3183 if (parsed_pattern[-1] == META_ACCEPT)
3184 {
3185 uint32_t *p;
3186 for (p = parsed_pattern - 1; p >= verbstartptr; p--) p[1] = p[0];
3187 *verbstartptr = META_NOCAPTURE;
3188 parsed_pattern[1] = META_KET;
3189 parsed_pattern += 2;
3190 }
3191
3192 /* Now we can put the quantifier into the parsed pattern vector. At this
3193 stage, we have only the basic quantifier. The check for a following + or ?
3194 modifier happens at the top of the loop, after any intervening comments
3195 have been removed. */
3196
3197 *parsed_pattern++ = meta_quantifier;
3198 if (c == CHAR_LEFT_CURLY_BRACKET)
3199 {
3200 *parsed_pattern++ = min_repeat;
3201 *parsed_pattern++ = max_repeat;
3202 }
3203 break;
3204
3205
3206 /* ---- Character class ---- */
3207
3208 case CHAR_LEFT_SQUARE_BRACKET:
3209 okquantifier = TRUE;
3210
3211 /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
3212 used for "start of word" and "end of word". As these are otherwise illegal
3213 sequences, we don't break anything by recognizing them. They are replaced
3214 by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
3215 erroneous and are handled by the normal code below. */
3216
3217 if (ptrend - ptr >= 6 &&
3218 (PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 ||
3219 PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0))
3220 {
3221 *parsed_pattern++ = META_ESCAPE + ESC_b;
3222
3223 if (ptr[2] == CHAR_LESS_THAN_SIGN)
3224 {
3225 *parsed_pattern++ = META_LOOKAHEAD;
3226 }
3227 else
3228 {
3229 *parsed_pattern++ = META_LOOKBEHIND;
3230 *has_lookbehind = TRUE;
3231
3232 /* The offset is used only for the "non-fixed length" error; this won't
3233 occur here, so just store zero. */
3234
3235 PUTOFFSET((PCRE2_SIZE)0, parsed_pattern);
3236 }
3237
3238 if ((options & PCRE2_UCP) == 0)
3239 *parsed_pattern++ = META_ESCAPE + ESC_w;
3240 else
3241 {
3242 *parsed_pattern++ = META_ESCAPE + ESC_p;
3243 *parsed_pattern++ = PT_WORD << 16;
3244 }
3245 *parsed_pattern++ = META_KET;
3246 ptr += 6;
3247 break;
3248 }
3249
3250 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3251 they are encountered at the top level, so we'll do that too. */
3252
3253 if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3254 *ptr == CHAR_EQUALS_SIGN) &&
3255 check_posix_syntax(ptr, ptrend, &tempptr))
3256 {
3257 errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;
3258 goto FAILED;
3259 }
3260
3261 /* Process a regular character class. If the first character is '^', set
3262 the negation flag. If the first few characters (either before or after ^)
3263 are \Q\E or \E or space or tab in extended-more mode, we skip them too.
3264 This makes for compatibility with Perl. */
3265
3266 negate_class = FALSE;
3267 while (ptr < ptrend)
3268 {
3269 GETCHARINCTEST(c, ptr);
3270 if (c == CHAR_BACKSLASH)
3271 {
3272 if (ptr < ptrend && *ptr == CHAR_E) ptr++;
3273 else if (ptrend - ptr >= 3 &&
3274 PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3275 ptr += 3;
3276 else
3277 break;
3278 }
3279 else if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3280 (c == CHAR_SPACE || c == CHAR_HT)) /* Note: just these two */
3281 continue;
3282 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3283 negate_class = TRUE;
3284 else break;
3285 }
3286
3287 /* Now the real contents of the class; c has the first "real" character.
3288 Empty classes are permitted only if the option is set. */
3289
3290 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3291 (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)
3292 {
3293 *parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;
3294 break; /* End of class processing */
3295 }
3296
3297 /* Process a non-empty class. */
3298
3299 *parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;
3300 class_range_state = RANGE_NO;
3301
3302 /* In an EBCDIC environment, Perl treats alphabetic ranges specially
3303 because there are holes in the encoding, and simply using the range A-Z
3304 (for example) would include the characters in the holes. This applies only
3305 to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]
3306 in this respect. In order to accommodate this, we keep track of whether
3307 character values are literal or not, and a state variable for handling
3308 ranges. */
3309
3310 /* Loop for the contents of the class */
3311
3312 for (;;)
3313 {
3314 BOOL char_is_literal = TRUE;
3315
3316 /* Inside \Q...\E everything is literal except \E */
3317
3318 if (inescq)
3319 {
3320 if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3321 {
3322 inescq = FALSE; /* Reset literal state */
3323 ptr++; /* Skip the 'E' */
3324 goto CLASS_CONTINUE;
3325 }
3326 goto CLASS_LITERAL;
3327 }
3328
3329 /* Skip over space and tab (only) in extended-more mode. */
3330
3331 if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3332 (c == CHAR_SPACE || c == CHAR_HT))
3333 goto CLASS_CONTINUE;
3334
3335 /* Handle POSIX class names. Perl allows a negation extension of the
3336 form [:^name:]. A square bracket that doesn't match the syntax is
3337 treated as a literal. We also recognize the POSIX constructions
3338 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3339 5.6 and 5.8 do. */
3340
3341 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3342 ptrend - ptr >= 3 &&
3343 (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3344 *ptr == CHAR_EQUALS_SIGN) &&
3345 check_posix_syntax(ptr, ptrend, &tempptr))
3346 {
3347 BOOL posix_negate = FALSE;
3348 int posix_class;
3349
3350 /* Perl treats a hyphen before a POSIX class as a literal, not the
3351 start of a range. However, it gives a warning in its warning mode. PCRE
3352 does not have a warning mode, so we give an error, because this is
3353 likely an error on the user's part. */
3354
3355 if (class_range_state == RANGE_STARTED)
3356 {
3357 errorcode = ERR50;
3358 goto FAILED;
3359 }
3360
3361 if (*ptr != CHAR_COLON)
3362 {
3363 errorcode = ERR13;
3364 goto FAILED_BACK;
3365 }
3366
3367 if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
3368 {
3369 posix_negate = TRUE;
3370 ptr++;
3371 }
3372
3373 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3374 if (posix_class < 0)
3375 {
3376 errorcode = ERR30;
3377 goto FAILED;
3378 }
3379 ptr = tempptr + 2;
3380
3381 /* Perl treats a hyphen after a POSIX class as a literal, not the
3382 start of a range. However, it gives a warning in its warning mode
3383 unless the hyphen is the last character in the class. PCRE does not
3384 have a warning mode, so we give an error, because this is likely an
3385 error on the user's part. */
3386
3387 if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3388 ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3389 {
3390 errorcode = ERR50;
3391 goto FAILED;
3392 }
3393
3394 /* Set "a hyphen is not the start of a range" for the -] case, and also
3395 in case the POSIX class is followed by \E or \Q\E (possibly repeated -
3396 fuzzers do that kind of thing) and *then* a hyphen. This causes that
3397 hyphen to be treated as a literal. I don't think it's worth setting up
3398 special apparatus to do otherwise. */
3399
3400 class_range_state = RANGE_NO;
3401
3402 /* When PCRE2_UCP is set, some of the POSIX classes are converted to
3403 use Unicode properties \p or \P or, in one case, \h or \H. The
3404 substitutes table has two values per class, containing the type and
3405 value of a \p or \P item. The special cases are specified with a
3406 negative type: a non-zero value causes \h or \H to be used, and a zero
3407 value falls through to behave like a non-UCP POSIX class. */
3408
3409 #ifdef SUPPORT_UNICODE
3410 if ((options & PCRE2_UCP) != 0)
3411 {
3412 int ptype = posix_substitutes[2*posix_class];
3413 int pvalue = posix_substitutes[2*posix_class + 1];
3414 if (ptype >= 0)
3415 {
3416 *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);
3417 *parsed_pattern++ = (ptype << 16) | pvalue;
3418 goto CLASS_CONTINUE;
3419 }
3420
3421 if (pvalue != 0)
3422 {
3423 *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h);
3424 goto CLASS_CONTINUE;
3425 }
3426
3427 /* Fall through */
3428 }
3429 #endif /* SUPPORT_UNICODE */
3430
3431 /* Non-UCP POSIX class */
3432
3433 *parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;
3434 *parsed_pattern++ = posix_class;
3435 }
3436
3437 /* Handle potential start of range */
3438
3439 else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)
3440 {
3441 *parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?
3442 META_RANGE_LITERAL : META_RANGE_ESCAPED;
3443 class_range_state = RANGE_STARTED;
3444 }
3445
3446 /* Handle a literal character */
3447
3448 else if (c != CHAR_BACKSLASH)
3449 {
3450 CLASS_LITERAL:
3451 if (class_range_state == RANGE_STARTED)
3452 {
3453 if (c == parsed_pattern[-2]) /* Optimize one-char range */
3454 parsed_pattern--;
3455 else if (parsed_pattern[-2] > c) /* Check range is in order */
3456 {
3457 errorcode = ERR8;
3458 goto FAILED_BACK;
3459 }
3460 else
3461 {
3462 if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL)
3463 parsed_pattern[-1] = META_RANGE_ESCAPED;
3464 PARSED_LITERAL(c, parsed_pattern);
3465 }
3466 class_range_state = RANGE_NO;
3467 }
3468 else /* Potential start of range */
3469 {
3470 class_range_state = char_is_literal?
3471 RANGE_OK_LITERAL : RANGE_OK_ESCAPED;
3472 PARSED_LITERAL(c, parsed_pattern);
3473 }
3474 }
3475
3476 /* Handle escapes in a class */
3477
3478 else
3479 {
3480 tempptr = ptr;
3481 escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3482 cb->cx->extra_options, TRUE, cb);
3483
3484 if (errorcode != 0)
3485 {
3486 if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3487 goto FAILED;
3488 ptr = tempptr;
3489 if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3490 {
3491 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
3492 }
3493 escape = 0; /* Treat as literal character */
3494 }
3495
3496 switch(escape)
3497 {
3498 case 0: /* Escaped character code point is in c */
3499 char_is_literal = FALSE;
3500 goto CLASS_LITERAL;
3501
3502 case ESC_b:
3503 c = CHAR_BS; /* \b is backspace in a class */
3504 char_is_literal = FALSE;
3505 goto CLASS_LITERAL;
3506
3507 case ESC_Q:
3508 inescq = TRUE; /* Enter literal mode */
3509 goto CLASS_CONTINUE;
3510
3511 case ESC_E: /* Ignore orphan \E */
3512 goto CLASS_CONTINUE;
3513
3514 case ESC_B: /* Always an error in a class */
3515 case ESC_R:
3516 case ESC_X:
3517 errorcode = ERR7;
3518 ptr--;
3519 goto FAILED;
3520 }
3521
3522 /* The second part of a range can be a single-character escape
3523 sequence (detected above), but not any of the other escapes. Perl
3524 treats a hyphen as a literal in such circumstances. However, in Perl's
3525 warning mode, a warning is given, so PCRE now faults it, as it is
3526 almost certainly a mistake on the user's part. */
3527
3528 if (class_range_state == RANGE_STARTED)
3529 {
3530 errorcode = ERR50;
3531 goto FAILED; /* Not CLASS_ESCAPE_FAILED; always an error */
3532 }
3533
3534 /* Of the remaining escapes, only those that define characters are
3535 allowed in a class. None may start a range. */
3536
3537 class_range_state = RANGE_NO;
3538 switch(escape)
3539 {
3540 case ESC_N:
3541 errorcode = ERR71;
3542 goto FAILED;
3543
3544 case ESC_H:
3545 case ESC_h:
3546 case ESC_V:
3547 case ESC_v:
3548 *parsed_pattern++ = META_ESCAPE + escape;
3549 break;
3550
3551 /* These escapes are converted to Unicode property tests when
3552 PCRE2_UCP is set. */
3553
3554 case ESC_d:
3555 case ESC_D:
3556 case ESC_s:
3557 case ESC_S:
3558 case ESC_w:
3559 case ESC_W:
3560 if ((options & PCRE2_UCP) == 0)
3561 {
3562 *parsed_pattern++ = META_ESCAPE + escape;
3563 }
3564 else
3565 {
3566 *parsed_pattern++ = META_ESCAPE +
3567 ((escape == ESC_d || escape == ESC_s || escape == ESC_w)?
3568 ESC_p : ESC_P);
3569 switch(escape)
3570 {
3571 case ESC_d:
3572 case ESC_D:
3573 *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
3574 break;
3575
3576 case ESC_s:
3577 case ESC_S:
3578 *parsed_pattern++ = PT_SPACE << 16;
3579 break;
3580
3581 case ESC_w:
3582 case ESC_W:
3583 *parsed_pattern++ = PT_WORD << 16;
3584 break;
3585 }
3586 }
3587 break;
3588
3589 /* Explicit Unicode property matching */
3590
3591 case ESC_P:
3592 case ESC_p:
3593 #ifdef SUPPORT_UNICODE
3594 {
3595 BOOL negated;
3596 uint16_t ptype = 0, pdata = 0;
3597 if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3598 goto FAILED;
3599 if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3600 *parsed_pattern++ = META_ESCAPE + escape;
3601 *parsed_pattern++ = (ptype << 16) | pdata;
3602 }
3603 #else
3604 errorcode = ERR45;
3605 goto FAILED;
3606 #endif
3607 break; /* End \P and \p */
3608
3609 default: /* All others are not allowed in a class */
3610 errorcode = ERR7;
3611 ptr--;
3612 goto FAILED;
3613 }
3614
3615 /* Perl gives a warning unless a following hyphen is the last character
3616 in the class. PCRE throws an error. */
3617
3618 if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3619 ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3620 {
3621 errorcode = ERR50;
3622 goto FAILED;
3623 }
3624 }
3625
3626 /* Proceed to next thing in the class. */
3627
3628 CLASS_CONTINUE:
3629 if (ptr >= ptrend)
3630 {
3631 errorcode = ERR6; /* Missing terminating ']' */
3632 goto FAILED;
3633 }
3634 GETCHARINCTEST(c, ptr);
3635 if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
3636 } /* End of class-processing loop */
3637
3638 /* -] at the end of a class is a literal '-' */
3639
3640 if (class_range_state == RANGE_STARTED)
3641 {
3642 parsed_pattern[-1] = CHAR_MINUS;
3643 class_range_state = RANGE_NO;
3644 }
3645
3646 *parsed_pattern++ = META_CLASS_END;
3647 break; /* End of character class */
3648
3649
3650 /* ---- Opening parenthesis ---- */
3651
3652 case CHAR_LEFT_PARENTHESIS:
3653 if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3654
3655 /* If ( is not followed by ? it is either a capture or a special verb or an
3656 alpha assertion or a positive non-atomic lookahead. */
3657
3658 if (*ptr != CHAR_QUESTION_MARK)
3659 {
3660 const char *vn;
3661
3662 /* Handle capturing brackets (or non-capturing if auto-capture is turned
3663 off). */
3664
3665 if (*ptr != CHAR_ASTERISK)
3666 {
3667 nest_depth++;
3668 if ((options & PCRE2_NO_AUTO_CAPTURE) == 0)
3669 {
3670 if (cb->bracount >= MAX_GROUP_NUMBER)
3671 {
3672 errorcode = ERR97;
3673 goto FAILED;
3674 }
3675 cb->bracount++;
3676 *parsed_pattern++ = META_CAPTURE | cb->bracount;
3677 }
3678 else *parsed_pattern++ = META_NOCAPTURE;
3679 }
3680
3681 /* Do nothing for (* followed by end of pattern or ) so it gives a "bad
3682 quantifier" error rather than "(*MARK) must have an argument". */
3683
3684 else if (ptrend - ptr <= 1 || (c = ptr[1]) == CHAR_RIGHT_PARENTHESIS)
3685 break;
3686
3687 /* Handle "alpha assertions" such as (*pla:...). Most of these are
3688 synonyms for the historical symbolic assertions, but the script run and
3689 non-atomic lookaround ones are new. They are distinguished by starting
3690 with a lower case letter, which is detected by looking up ctype_lcletter in
3691 the cb->ctypes table. */
3692
3693 else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
3694 {
3695 uint32_t meta;
3696
3697 vn = alasnames;
3698 if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
3699 &errorcode, cb)) goto FAILED;
3700 if (ptr >= ptrend || *ptr != CHAR_COLON)
3701 {
3702 errorcode = ERR95; /* Malformed */
3703 goto FAILED;
3704 }
3705
3706 /* Scan the table of alpha assertion names */
3707
3708 for (i = 0; i < alascount; i++)
3709 {
3710 if (namelen == alasmeta[i].len &&
3711 PRIV(strncmp_c8)(name, vn, namelen) == 0)
3712 break;
3713 vn += alasmeta[i].len + 1;
3714 }
3715
3716 if (i >= alascount)
3717 {
3718 errorcode = ERR95; /* Alpha assertion not recognized */
3719 goto FAILED;
3720 }
3721
3722 /* Check for expecting an assertion condition. If so, only atomic
3723 lookaround assertions are valid. */
3724
3725 meta = alasmeta[i].meta;
3726 if (prev_expect_cond_assert > 0 &&
3727 (meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT))
3728 {
3729 errorcode = (meta == META_LOOKAHEAD_NA || meta == META_LOOKBEHIND_NA)?
3730 ERR98 : ERR28; /* (Atomic) assertion expected */
3731 goto FAILED;
3732 }
3733
3734 /* The lookaround alphabetic synonyms can mostly be handled by jumping
3735 to the code that handles the traditional symbolic forms. */
3736
3737 switch(meta)
3738 {
3739 default:
3740 errorcode = ERR89; /* Unknown code; should never occur because */
3741 goto FAILED; /* the meta values come from a table above. */
3742
3743 case META_ATOMIC:
3744 goto ATOMIC_GROUP;
3745
3746 case META_LOOKAHEAD:
3747 goto POSITIVE_LOOK_AHEAD;
3748
3749 case META_LOOKAHEAD_NA:
3750 goto POSITIVE_NONATOMIC_LOOK_AHEAD;
3751
3752 case META_LOOKAHEADNOT:
3753 goto NEGATIVE_LOOK_AHEAD;
3754
3755 case META_LOOKBEHIND:
3756 case META_LOOKBEHINDNOT:
3757 case META_LOOKBEHIND_NA:
3758 *parsed_pattern++ = meta;
3759 ptr--;
3760 goto POST_LOOKBEHIND;
3761
3762 /* The script run facilities are handled here. Unicode support is
3763 required (give an error if not, as this is a security issue). Always
3764 record a META_SCRIPT_RUN item. Then, for the atomic version, insert
3765 META_ATOMIC and remember that we need two META_KETs at the end. */
3766
3767 case META_SCRIPT_RUN:
3768 case META_ATOMIC_SCRIPT_RUN:
3769 #ifdef SUPPORT_UNICODE
3770 *parsed_pattern++ = META_SCRIPT_RUN;
3771 nest_depth++;
3772 ptr++;
3773 if (meta == META_ATOMIC_SCRIPT_RUN)
3774 {
3775 *parsed_pattern++ = META_ATOMIC;
3776 if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
3777 else if (++top_nest >= end_nests)
3778 {
3779 errorcode = ERR84;
3780 goto FAILED;
3781 }
3782 top_nest->nest_depth = nest_depth;
3783 top_nest->flags = NSF_ATOMICSR;
3784 top_nest->options = options & PARSE_TRACKED_OPTIONS;
3785 }
3786 break;
3787 #else /* SUPPORT_UNICODE */
3788 errorcode = ERR96;
3789 goto FAILED;
3790 #endif
3791 }
3792 }
3793
3794
3795 /* ---- Handle (*VERB) and (*VERB:NAME) ---- */
3796
3797 else
3798 {
3799 vn = verbnames;
3800 if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
3801 &errorcode, cb)) goto FAILED;
3802 if (ptr >= ptrend || (*ptr != CHAR_COLON &&
3803 *ptr != CHAR_RIGHT_PARENTHESIS))
3804 {
3805 errorcode = ERR60; /* Malformed */
3806 goto FAILED;
3807 }
3808
3809 /* Scan the table of verb names */
3810
3811 for (i = 0; i < verbcount; i++)
3812 {
3813 if (namelen == verbs[i].len &&
3814 PRIV(strncmp_c8)(name, vn, namelen) == 0)
3815 break;
3816 vn += verbs[i].len + 1;
3817 }
3818
3819 if (i >= verbcount)
3820 {
3821 errorcode = ERR60; /* Verb not recognized */
3822 goto FAILED;
3823 }
3824
3825 /* An empty argument is treated as no argument. */
3826
3827 if (*ptr == CHAR_COLON && ptr + 1 < ptrend &&
3828 ptr[1] == CHAR_RIGHT_PARENTHESIS)
3829 ptr++; /* Advance to the closing parens */
3830
3831 /* Check for mandatory non-empty argument; this is (*MARK) */
3832
3833 if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON)
3834 {
3835 errorcode = ERR66;
3836 goto FAILED;
3837 }
3838
3839 /* Remember where this verb, possibly with a preceding (*MARK), starts,
3840 for handling quantified (*ACCEPT). */
3841
3842 verbstartptr = parsed_pattern;
3843 okquantifier = (verbs[i].meta == META_ACCEPT);
3844
3845 /* It appears that Perl allows any characters whatsoever, other than a
3846 closing parenthesis, to appear in arguments ("names"), so we no longer
3847 insist on letters, digits, and underscores. Perl does not, however, do
3848 any interpretation within arguments, and has no means of including a
3849 closing parenthesis. PCRE supports escape processing but only when it
3850 is requested by an option. We set inverbname TRUE here, and let the
3851 main loop take care of this so that escape and \x processing is done by
3852 the main code above. */
3853
3854 if (*ptr++ == CHAR_COLON) /* Skip past : or ) */
3855 {
3856 /* Some optional arguments can be treated as a preceding (*MARK) */
3857
3858 if (verbs[i].has_arg < 0)
3859 {
3860 add_after_mark = verbs[i].meta;
3861 *parsed_pattern++ = META_MARK;
3862 }
3863
3864 /* The remaining verbs with arguments (except *MARK) need a different
3865 opcode. */
3866
3867 else
3868 {
3869 *parsed_pattern++ = verbs[i].meta +
3870 ((verbs[i].meta != META_MARK)? 0x00010000u:0);
3871 }
3872
3873 /* Set up for reading the name in the main loop. */
3874
3875 verblengthptr = parsed_pattern++;
3876 verbnamestart = ptr;
3877 inverbname = TRUE;
3878 }
3879 else /* No verb "name" argument */
3880 {
3881 *parsed_pattern++ = verbs[i].meta;
3882 }
3883 } /* End of (*VERB) handling */
3884 break; /* Done with this parenthesis */
3885 } /* End of groups that don't start with (? */
3886
3887
3888 /* ---- Items starting (? ---- */
3889
3890 /* The type of item is determined by what follows (?. Handle (?| and option
3891 changes under "default" because both need a new block on the nest stack.
3892 Comments starting with (?# are handled above. Note that there is some
3893 ambiguity about the sequence (?- because if a digit follows it's a relative
3894 recursion or subroutine call whereas otherwise it's an option unsetting. */
3895
3896 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3897
3898 switch(*ptr)
3899 {
3900 default:
3901 if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1]))
3902 goto RECURSION_BYNUMBER; /* The + case is handled by CHAR_PLUS */
3903
3904 /* We now have either (?| or a (possibly empty) option setting,
3905 optionally followed by a non-capturing group. */
3906
3907 nest_depth++;
3908 if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
3909 else if (++top_nest >= end_nests)
3910 {
3911 errorcode = ERR84;
3912 goto FAILED;
3913 }
3914 top_nest->nest_depth = nest_depth;
3915 top_nest->flags = 0;
3916 top_nest->options = options & PARSE_TRACKED_OPTIONS;
3917
3918 /* Start of non-capturing group that resets the capture count for each
3919 branch. */
3920
3921 if (*ptr == CHAR_VERTICAL_LINE)
3922 {
3923 top_nest->reset_group = (uint16_t)cb->bracount;
3924 top_nest->max_group = (uint16_t)cb->bracount;
3925 top_nest->flags |= NSF_RESET;
3926 cb->external_flags |= PCRE2_DUPCAPUSED;
3927 *parsed_pattern++ = META_NOCAPTURE;
3928 ptr++;
3929 }
3930
3931 /* Scan for options imnsxJU to be set or unset. */
3932
3933 else
3934 {
3935 BOOL hyphenok = TRUE;
3936 uint32_t oldoptions = options;
3937
3938 top_nest->reset_group = 0;
3939 top_nest->max_group = 0;
3940 set = unset = 0;
3941 optset = &set;
3942
3943 /* ^ at the start unsets imnsx and disables the subsequent use of - */
3944
3945 if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)
3946 {
3947 options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE|
3948 PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE);
3949 hyphenok = FALSE;
3950 ptr++;
3951 }
3952
3953 while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&
3954 *ptr != CHAR_COLON)
3955 {
3956 switch (*ptr++)
3957 {
3958 case CHAR_MINUS:
3959 if (!hyphenok)
3960 {
3961 errorcode = ERR94;
3962 ptr--; /* Correct the offset */
3963 goto FAILED;
3964 }
3965 optset = &unset;
3966 hyphenok = FALSE;
3967 break;
3968
3969 case CHAR_J: /* Record that it changed in the external options */
3970 *optset |= PCRE2_DUPNAMES;
3971 cb->external_flags |= PCRE2_JCHANGED;
3972 break;
3973
3974 case CHAR_i: *optset |= PCRE2_CASELESS; break;
3975 case CHAR_m: *optset |= PCRE2_MULTILINE; break;
3976 case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break;
3977 case CHAR_s: *optset |= PCRE2_DOTALL; break;
3978 case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
3979
3980 /* If x appears twice it sets the extended extended option. */
3981
3982 case CHAR_x:
3983 *optset |= PCRE2_EXTENDED;
3984 if (ptr < ptrend && *ptr == CHAR_x)
3985 {
3986 *optset |= PCRE2_EXTENDED_MORE;
3987 ptr++;
3988 }
3989 break;
3990
3991 default:
3992 errorcode = ERR11;
3993 ptr--; /* Correct the offset */
3994 goto FAILED;
3995 }
3996 }
3997
3998 /* If we are setting extended without extended-more, ensure that any
3999 existing extended-more gets unset. Also, unsetting extended must also
4000 unset extended-more. */
4001
4002 if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED ||
4003 (unset & PCRE2_EXTENDED) != 0)
4004 unset |= PCRE2_EXTENDED_MORE;
4005
4006 options = (options | set) & (~unset);
4007
4008 /* If the options ended with ')' this is not the start of a nested
4009 group with option changes, so the options change at this level.
4010 In this case, if the previous level set up a nest block, discard the
4011 one we have just created. Otherwise adjust it for the previous level.
4012 If the options ended with ':' we are starting a non-capturing group,
4013 possibly with an options setting. */
4014
4015 if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4016 if (*ptr++ == CHAR_RIGHT_PARENTHESIS)
4017 {
4018 nest_depth--; /* This is not a nested group after all. */
4019 if (top_nest > (nest_save *)(cb->start_workspace) &&
4020 (top_nest-1)->nest_depth == nest_depth) top_nest--;
4021 else top_nest->nest_depth = nest_depth;
4022 }
4023 else *parsed_pattern++ = META_NOCAPTURE;
4024
4025 /* If nothing changed, no need to record. */
4026
4027 if (options != oldoptions)
4028 {
4029 *parsed_pattern++ = META_OPTIONS;
4030 *parsed_pattern++ = options;
4031 }
4032 } /* End options processing */
4033 break; /* End default case after (? */
4034
4035
4036 /* ---- Python syntax support ---- */
4037
4038 case CHAR_P:
4039 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4040
4041 /* (?P<name> is the same as (?<name>, which defines a named group. */
4042
4043 if (*ptr == CHAR_LESS_THAN_SIGN)
4044 {
4045 terminator = CHAR_GREATER_THAN_SIGN;
4046 goto DEFINE_NAME;
4047 }
4048
4049 /* (?P>name) is the same as (?&name), which is a recursion or subroutine
4050 call. */
4051
4052 if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME;
4053
4054 /* (?P=name) is the same as \k<name>, a back reference by name. Anything
4055 else after (?P is an error. */
4056
4057 if (*ptr != CHAR_EQUALS_SIGN)
4058 {
4059 errorcode = ERR41;
4060 goto FAILED;
4061 }
4062 if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4063 &namelen, &errorcode, cb)) goto FAILED;
4064 *parsed_pattern++ = META_BACKREF_BYNAME;
4065 *parsed_pattern++ = namelen;
4066 PUTOFFSET(offset, parsed_pattern);
4067 okquantifier = TRUE;
4068 break; /* End of (?P processing */
4069
4070
4071 /* ---- Recursion/subroutine calls by number ---- */
4072
4073 case CHAR_R:
4074 i = 0; /* (?R) == (?R0) */
4075 ptr++;
4076 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4077 {
4078 errorcode = ERR58;
4079 goto FAILED;
4080 }
4081 goto SET_RECURSION;
4082
4083 /* An item starting (?- followed by a digit comes here via the "default"
4084 case because (?- followed by a non-digit is an options setting. */
4085
4086 case CHAR_PLUS:
4087 if (ptrend - ptr < 2 || !IS_DIGIT(ptr[1]))
4088 {
4089 errorcode = ERR29; /* Missing number */
4090 goto FAILED;
4091 }
4092 /* Fall through */
4093
4094 case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
4095 case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
4096 RECURSION_BYNUMBER:
4097 if (!read_number(&ptr, ptrend,
4098 (IS_DIGIT(*ptr))? -1:(int)(cb->bracount), /* + and - are relative */
4099 MAX_GROUP_NUMBER, ERR61,
4100 &i, &errorcode)) goto FAILED;
4101 if (i < 0) /* NB (?0) is permitted */
4102 {
4103 errorcode = ERR15; /* Unknown group */
4104 goto FAILED_BACK;
4105 }
4106 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4107 goto UNCLOSED_PARENTHESIS;
4108
4109 SET_RECURSION:
4110 *parsed_pattern++ = META_RECURSE | (uint32_t)i;
4111 offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4112 ptr++;
4113 PUTOFFSET(offset, parsed_pattern);
4114 okquantifier = TRUE;
4115 break; /* End of recursive call by number handling */
4116
4117
4118 /* ---- Recursion/subroutine calls by name ---- */
4119
4120 case CHAR_AMPERSAND:
4121 RECURSE_BY_NAME:
4122 if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4123 &namelen, &errorcode, cb)) goto FAILED;
4124 *parsed_pattern++ = META_RECURSE_BYNAME;
4125 *parsed_pattern++ = namelen;
4126 PUTOFFSET(offset, parsed_pattern);
4127 okquantifier = TRUE;
4128 break;
4129
4130 /* ---- Callout with numerical or string argument ---- */
4131
4132 case CHAR_C:
4133 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4134
4135 /* If the previous item was a condition starting (?(? an assertion,
4136 optionally preceded by a callout, is expected. This is checked later on,
4137 during actual compilation. However we need to identify this kind of
4138 assertion in this pass because it must not be qualified. The value of
4139 expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4140 for a callout - still leaving a positive value that identifies the
4141 assertion. Multiple callouts or any other items will make it zero or
4142 less, which doesn't matter because they will cause an error later. */
4143
4144 expect_cond_assert = prev_expect_cond_assert - 1;
4145
4146 /* If previous_callout is not NULL, it means this follows a previous
4147 callout. If it was a manual callout, do nothing; this means its "length
4148 of next pattern item" field will remain zero. If it was an automatic
4149 callout, abolish it. */
4150
4151 if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 &&
4152 previous_callout == parsed_pattern - 4 &&
4153 parsed_pattern[-1] == 255)
4154 parsed_pattern = previous_callout;
4155
4156 /* Save for updating next pattern item length, and skip one item before
4157 completing. */
4158
4159 previous_callout = parsed_pattern;
4160 after_manual_callout = 1;
4161
4162 /* Handle a string argument; specific delimiter is required. */
4163
4164 if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
4165 {
4166 PCRE2_SIZE calloutlength;
4167 PCRE2_SPTR startptr = ptr;
4168
4169 delimiter = 0;
4170 for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
4171 {
4172 if (*ptr == PRIV(callout_start_delims)[i])
4173 {
4174 delimiter = PRIV(callout_end_delims)[i];
4175 break;
4176 }
4177 }
4178 if (delimiter == 0)
4179 {
4180 errorcode = ERR82;
4181 goto FAILED;
4182 }
4183
4184 *parsed_pattern = META_CALLOUT_STRING;
4185 parsed_pattern += 3; /* Skip pattern info */
4186
4187 for (;;)
4188 {
4189 if (++ptr >= ptrend)
4190 {
4191 errorcode = ERR81;
4192 ptr = startptr; /* To give a more useful message */
4193 goto FAILED;
4194 }
4195 if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter))
4196 break;
4197 }
4198
4199 calloutlength = (PCRE2_SIZE)(ptr - startptr);
4200 if (calloutlength > UINT32_MAX)
4201 {
4202 errorcode = ERR72;
4203 goto FAILED;
4204 }
4205 *parsed_pattern++ = (uint32_t)calloutlength;
4206 offset = (PCRE2_SIZE)(startptr - cb->start_pattern);
4207 PUTOFFSET(offset, parsed_pattern);
4208 }
4209
4210 /* Handle a callout with an optional numerical argument, which must be
4211 less than or equal to 255. A missing argument gives 0. */
4212
4213 else
4214 {
4215 int n = 0;
4216 *parsed_pattern = META_CALLOUT_NUMBER; /* Numerical callout */
4217 parsed_pattern += 3; /* Skip pattern info */
4218 while (ptr < ptrend && IS_DIGIT(*ptr))
4219 {
4220 n = n * 10 + *ptr++ - CHAR_0;
4221 if (n > 255)
4222 {
4223 errorcode = ERR38;
4224 goto FAILED;
4225 }
4226 }
4227 *parsed_pattern++ = n;
4228 }
4229
4230 /* Both formats must have a closing parenthesis */
4231
4232 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4233 {
4234 errorcode = ERR39;
4235 goto FAILED;
4236 }
4237 ptr++;
4238
4239 /* Remember the offset to the next item in the pattern, and set a default
4240 length. This should get updated after the next item is read. */
4241
4242 previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
4243 previous_callout[2] = 0;
4244 break; /* End callout */
4245
4246
4247 /* ---- Conditional group ---- */
4248
4249 /* A condition can be an assertion, a number (referring to a numbered
4250 group's having been set), a name (referring to a named group), or 'R',
4251 referring to overall recursion. R<digits> and R&name are also permitted
4252 for recursion state tests. Numbers may be preceded by + or - to specify a
4253 relative group number.
4254
4255 There are several syntaxes for testing a named group: (?(name)) is used
4256 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4257
4258 There are two unfortunate ambiguities. 'R' can be the recursive thing or
4259 the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be
4260 the Perl DEFINE feature or the Python named test. We look for a name
4261 first; if not found, we try the other case.
4262
4263 For compatibility with auto-callouts, we allow a callout to be specified
4264 before a condition that is an assertion. */
4265
4266 case CHAR_LEFT_PARENTHESIS:
4267 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4268 nest_depth++;
4269
4270 /* If the next character is ? or * there must be an assertion next
4271 (optionally preceded by a callout). We do not check this here, but
4272 instead we set expect_cond_assert to 2. If this is still greater than
4273 zero (callouts decrement it) when the next assertion is read, it will be
4274 marked as a condition that must not be repeated. A value greater than
4275 zero also causes checking that an assertion (possibly with callout)
4276 follows. */
4277
4278 if (*ptr == CHAR_QUESTION_MARK || *ptr == CHAR_ASTERISK)
4279 {
4280 *parsed_pattern++ = META_COND_ASSERT;
4281 ptr--; /* Pull pointer back to the opening parenthesis. */
4282 expect_cond_assert = 2;
4283 break; /* End of conditional */
4284 }
4285
4286 /* Handle (?([+-]number)... */
4287
4288 if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
4289 &errorcode))
4290 {
4291 if (i <= 0)
4292 {
4293 errorcode = ERR15;
4294 goto FAILED;
4295 }
4296 *parsed_pattern++ = META_COND_NUMBER;
4297 offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4298 PUTOFFSET(offset, parsed_pattern);
4299 *parsed_pattern++ = i;
4300 }
4301 else if (errorcode != 0) goto FAILED; /* Number too big */
4302
4303 /* No number found. Handle the special case (?(VERSION[>]=n.m)... */
4304
4305 else if (ptrend - ptr >= 10 &&
4306 PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
4307 ptr[7] != CHAR_RIGHT_PARENTHESIS)
4308 {
4309 uint32_t ge = 0;
4310 int major = 0;
4311 int minor = 0;
4312
4313 ptr += 7;
4314 if (*ptr == CHAR_GREATER_THAN_SIGN)
4315 {
4316 ge = 1;
4317 ptr++;
4318 }
4319
4320 /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
4321 references its argument twice. */
4322
4323 if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
4324 goto BAD_VERSION_CONDITION;
4325
4326 if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode))
4327 goto FAILED;
4328
4329 if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4330 if (*ptr == CHAR_DOT)
4331 {
4332 if (++ptr >= ptrend || !IS_DIGIT(*ptr)) goto BAD_VERSION_CONDITION;
4333 minor = (*ptr++ - CHAR_0) * 10;
4334 if (IS_DIGIT(*ptr)) minor += *ptr++ - CHAR_0;
4335 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4336 goto BAD_VERSION_CONDITION;
4337 }
4338
4339 *parsed_pattern++ = META_COND_VERSION;
4340 *parsed_pattern++ = ge;
4341 *parsed_pattern++ = major;
4342 *parsed_pattern++ = minor;
4343 }
4344
4345 /* All the remaining cases now require us to read a name. We cannot at
4346 this stage distinguish ambiguous cases such as (?(R12) which might be a
4347 recursion test by number or a name, because the named groups have not yet
4348 all been identified. Those cases are treated as names, but given a
4349 different META code. */
4350
4351 else
4352 {
4353 BOOL was_r_ampersand = FALSE;
4354
4355 if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND)
4356 {
4357 terminator = CHAR_RIGHT_PARENTHESIS;
4358 was_r_ampersand = TRUE;
4359 ptr++;
4360 }
4361 else if (*ptr == CHAR_LESS_THAN_SIGN)
4362 terminator = CHAR_GREATER_THAN_SIGN;
4363 else if (*ptr == CHAR_APOSTROPHE)
4364 terminator = CHAR_APOSTROPHE;
4365 else
4366 {
4367 terminator = CHAR_RIGHT_PARENTHESIS;
4368 ptr--; /* Point to char before name */
4369 }
4370 if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4371 &errorcode, cb)) goto FAILED;
4372
4373 /* Handle (?(R&name) */
4374
4375 if (was_r_ampersand)
4376 {
4377 *parsed_pattern = META_COND_RNAME;
4378 ptr--; /* Back to closing parens */
4379 }
4380
4381 /* Handle (?(name). If the name is "DEFINE" we identify it with a
4382 special code. Likewise if the name consists of R followed only by
4383 digits. Otherwise, handle it like a quoted name. */
4384
4385 else if (terminator == CHAR_RIGHT_PARENTHESIS)
4386 {
4387 if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
4388 *parsed_pattern = META_COND_DEFINE;
4389 else
4390 {
4391 for (i = 1; i < (int)namelen; i++)
4392 if (!IS_DIGIT(name[i])) break;
4393 *parsed_pattern = (*name == CHAR_R && i >= (int)namelen)?
4394 META_COND_RNUMBER : META_COND_NAME;
4395 }
4396 ptr--; /* Back to closing parens */
4397 }
4398
4399 /* Handle (?('name') or (?(<name>) */
4400
4401 else *parsed_pattern = META_COND_NAME;
4402
4403 /* All these cases except DEFINE end with the name length and offset;
4404 DEFINE just has an offset (for the "too many branches" error). */
4405
4406 if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen;
4407 PUTOFFSET(offset, parsed_pattern);
4408 } /* End cases that read a name */
4409
4410 /* Check the closing parenthesis of the condition */
4411
4412 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4413 {
4414 errorcode = ERR24;
4415 goto FAILED;
4416 }
4417 ptr++;
4418 break; /* End of condition processing */
4419
4420
4421 /* ---- Atomic group ---- */
4422
4423 case CHAR_GREATER_THAN_SIGN:
4424 ATOMIC_GROUP: /* Come from (*atomic: */
4425 *parsed_pattern++ = META_ATOMIC;
4426 nest_depth++;
4427 ptr++;
4428 break;
4429
4430
4431 /* ---- Lookahead assertions ---- */
4432
4433 case CHAR_EQUALS_SIGN:
4434 POSITIVE_LOOK_AHEAD: /* Come from (*pla: */
4435 *parsed_pattern++ = META_LOOKAHEAD;
4436 ptr++;
4437 goto POST_ASSERTION;
4438
4439 case CHAR_ASTERISK:
4440 POSITIVE_NONATOMIC_LOOK_AHEAD: /* Come from (?* */
4441 *parsed_pattern++ = META_LOOKAHEAD_NA;
4442 ptr++;
4443 goto POST_ASSERTION;
4444
4445 case CHAR_EXCLAMATION_MARK:
4446 NEGATIVE_LOOK_AHEAD: /* Come from (*nla: */
4447 *parsed_pattern++ = META_LOOKAHEADNOT;
4448 ptr++;
4449 goto POST_ASSERTION;
4450
4451
4452 /* ---- Lookbehind assertions ---- */
4453
4454 /* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?<
4455 is the start of the name of a capturing group. */
4456
4457 case CHAR_LESS_THAN_SIGN:
4458 if (ptrend - ptr <= 1 ||
4459 (ptr[1] != CHAR_EQUALS_SIGN &&
4460 ptr[1] != CHAR_EXCLAMATION_MARK &&
4461 ptr[1] != CHAR_ASTERISK))
4462 {
4463 terminator = CHAR_GREATER_THAN_SIGN;
4464 goto DEFINE_NAME;
4465 }
4466 *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
4467 META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)?
4468 META_LOOKBEHINDNOT : META_LOOKBEHIND_NA;
4469
4470 POST_LOOKBEHIND: /* Come from (*plb: (*naplb: and (*nlb: */
4471 *has_lookbehind = TRUE;
4472 offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4473 PUTOFFSET(offset, parsed_pattern);
4474 ptr += 2;
4475 /* Fall through */
4476
4477 /* If the previous item was a condition starting (?(? an assertion,
4478 optionally preceded by a callout, is expected. This is checked later on,
4479 during actual compilation. However we need to identify this kind of
4480 assertion in this pass because it must not be qualified. The value of
4481 expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4482 for a callout - still leaving a positive value that identifies the
4483 assertion. Multiple callouts or any other items will make it zero or
4484 less, which doesn't matter because they will cause an error later. */
4485
4486 POST_ASSERTION:
4487 nest_depth++;
4488 if (prev_expect_cond_assert > 0)
4489 {
4490 if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4491 else if (++top_nest >= end_nests)
4492 {
4493 errorcode = ERR84;
4494 goto FAILED;
4495 }
4496 top_nest->nest_depth = nest_depth;
4497 top_nest->flags = NSF_CONDASSERT;
4498 top_nest->options = options & PARSE_TRACKED_OPTIONS;
4499 }
4500 break;
4501
4502
4503 /* ---- Define a named group ---- */
4504
4505 /* A named group may be defined as (?'name') or (?<name>). In the latter
4506 case we jump to DEFINE_NAME from the disambiguation of (?< above with the
4507 terminator set to '>'. */
4508
4509 case CHAR_APOSTROPHE:
4510 terminator = CHAR_APOSTROPHE; /* Terminator */
4511
4512 DEFINE_NAME:
4513 if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4514 &errorcode, cb)) goto FAILED;
4515
4516 /* We have a name for this capturing group. It is also assigned a number,
4517 which is its primary means of identification. */
4518
4519 if (cb->bracount >= MAX_GROUP_NUMBER)
4520 {
4521 errorcode = ERR97;
4522 goto FAILED;
4523 }
4524 cb->bracount++;
4525 *parsed_pattern++ = META_CAPTURE | cb->bracount;
4526 nest_depth++;
4527
4528 /* Check not too many names */
4529
4530 if (cb->names_found >= MAX_NAME_COUNT)
4531 {
4532 errorcode = ERR49;
4533 goto FAILED;
4534 }
4535
4536 /* Adjust the entry size to accommodate the longest name found. */
4537
4538 if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
4539 cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
4540
4541 /* Scan the list to check for duplicates. For duplicate names, if the
4542 number is the same, break the loop, which causes the name to be
4543 discarded; otherwise, if DUPNAMES is not set, give an error.
4544 If it is set, allow the name with a different number, but continue
4545 scanning in case this is a duplicate with the same number. For
4546 non-duplicate names, give an error if the number is duplicated. */
4547
4548 isdupname = FALSE;
4549 ng = cb->named_groups;
4550 for (i = 0; i < cb->names_found; i++, ng++)
4551 {
4552 if (namelen == ng->length &&
4553 PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0)
4554 {
4555 if (ng->number == cb->bracount) break;
4556 if ((options & PCRE2_DUPNAMES) == 0)
4557 {
4558 errorcode = ERR43;
4559 goto FAILED;
4560 }
4561 isdupname = ng->isdup = TRUE; /* Mark as a duplicate */
4562 cb->dupnames = TRUE; /* Duplicate names exist */
4563 }
4564 else if (ng->number == cb->bracount)
4565 {
4566 errorcode = ERR65;
4567 goto FAILED;
4568 }
4569 }
4570
4571 if (i < cb->names_found) break; /* Ignore duplicate with same number */
4572
4573 /* Increase the list size if necessary */
4574
4575 if (cb->names_found >= cb->named_group_list_size)
4576 {
4577 uint32_t newsize = cb->named_group_list_size * 2;
4578 named_group *newspace =
4579 cb->cx->memctl.malloc(newsize * sizeof(named_group),
4580 cb->cx->memctl.memory_data);
4581 if (newspace == NULL)
4582 {
4583 errorcode = ERR21;
4584 goto FAILED;
4585 }
4586
4587 memcpy(newspace, cb->named_groups,
4588 cb->named_group_list_size * sizeof(named_group));
4589 if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
4590 cb->cx->memctl.free((void *)cb->named_groups,
4591 cb->cx->memctl.memory_data);
4592 cb->named_groups = newspace;
4593 cb->named_group_list_size = newsize;
4594 }
4595
4596 /* Add this name to the list */
4597
4598 cb->named_groups[cb->names_found].name = name;
4599 cb->named_groups[cb->names_found].length = (uint16_t)namelen;
4600 cb->named_groups[cb->names_found].number = cb->bracount;
4601 cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname;
4602 cb->names_found++;
4603 break;
4604 } /* End of (? switch */
4605 break; /* End of ( handling */
4606
4607
4608 /* ---- Branch terminators ---- */
4609
4610 /* Alternation: reset the capture count if we are in a (?| group. */
4611
4612 case CHAR_VERTICAL_LINE:
4613 if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
4614 (top_nest->flags & NSF_RESET) != 0)
4615 {
4616 if (cb->bracount > top_nest->max_group)
4617 top_nest->max_group = (uint16_t)cb->bracount;
4618 cb->bracount = top_nest->reset_group;
4619 }
4620 *parsed_pattern++ = META_ALT;
4621 break;
4622
4623 /* End of group; reset the capture count to the maximum if we are in a (?|
4624 group and/or reset the options that are tracked during parsing. Disallow
4625 quantifier for a condition that is an assertion. */
4626
4627 case CHAR_RIGHT_PARENTHESIS:
4628 okquantifier = TRUE;
4629 if (top_nest != NULL && top_nest->nest_depth == nest_depth)
4630 {
4631 options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options;
4632 if ((top_nest->flags & NSF_RESET) != 0 &&
4633 top_nest->max_group > cb->bracount)
4634 cb->bracount = top_nest->max_group;
4635 if ((top_nest->flags & NSF_CONDASSERT) != 0)
4636 okquantifier = FALSE;
4637
4638 if ((top_nest->flags & NSF_ATOMICSR) != 0)
4639 {
4640 *parsed_pattern++ = META_KET;
4641 }
4642
4643 if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
4644 else top_nest--;
4645 }
4646 if (nest_depth == 0) /* Unmatched closing parenthesis */
4647 {
4648 errorcode = ERR22;
4649 goto FAILED_BACK;
4650 }
4651 nest_depth--;
4652 *parsed_pattern++ = META_KET;
4653 break;
4654 } /* End of switch on pattern character */
4655 } /* End of main character scan loop */
4656
4657 /* End of pattern reached. Check for missing ) at the end of a verb name. */
4658
4659 if (inverbname && ptr >= ptrend)
4660 {
4661 errorcode = ERR60;
4662 goto FAILED;
4663 }
4664
4665 /* Manage callout for the final item */
4666
4667 PARSED_END:
4668 parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
4669 parsed_pattern, cb);
4670
4671 /* Insert trailing items for word and line matching (features provided for the
4672 benefit of pcre2grep). */
4673
4674 if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
4675 {
4676 *parsed_pattern++ = META_KET;
4677 *parsed_pattern++ = META_DOLLAR;
4678 }
4679 else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
4680 {
4681 *parsed_pattern++ = META_KET;
4682 *parsed_pattern++ = META_ESCAPE + ESC_b;
4683 }
4684
4685 /* Terminate the parsed pattern, then return success if all groups are closed.
4686 Otherwise we have unclosed parentheses. */
4687
4688 if (parsed_pattern >= parsed_pattern_end)
4689 {
4690 errorcode = ERR63; /* Internal error (parsed pattern overflow) */
4691 goto FAILED;
4692 }
4693
4694 *parsed_pattern = META_END;
4695 if (nest_depth == 0) return 0;
4696
4697 UNCLOSED_PARENTHESIS:
4698 errorcode = ERR14;
4699
4700 /* Come here for all failures. */
4701
4702 FAILED:
4703 cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4704 return errorcode;
4705
4706 /* Some errors need to indicate the previous character. */
4707
4708 FAILED_BACK:
4709 ptr--;
4710 goto FAILED;
4711
4712 /* This failure happens several times. */
4713
4714 BAD_VERSION_CONDITION:
4715 errorcode = ERR79;
4716 goto FAILED;
4717 }
4718
4719
4720
4721 /*************************************************
4722 * Find first significant opcode *
4723 *************************************************/
4724
4725 /* This is called by several functions that scan a compiled expression looking
4726 for a fixed first character, or an anchoring opcode etc. It skips over things
4727 that do not influence this. For some calls, it makes sense to skip negative
4728 forward and all backward assertions, and also the \b assertion; for others it
4729 does not.
4730
4731 Arguments:
4732 code pointer to the start of the group
4733 skipassert TRUE if certain assertions are to be skipped
4734
4735 Returns: pointer to the first significant opcode
4736 */
4737
4738 static const PCRE2_UCHAR*
first_significant_code(PCRE2_SPTR code,BOOL skipassert)4739 first_significant_code(PCRE2_SPTR code, BOOL skipassert)
4740 {
4741 for (;;)
4742 {
4743 switch ((int)*code)
4744 {
4745 case OP_ASSERT_NOT:
4746 case OP_ASSERTBACK:
4747 case OP_ASSERTBACK_NOT:
4748 case OP_ASSERTBACK_NA:
4749 if (!skipassert) return code;
4750 do code += GET(code, 1); while (*code == OP_ALT);
4751 code += PRIV(OP_lengths)[*code];
4752 break;
4753
4754 case OP_WORD_BOUNDARY:
4755 case OP_NOT_WORD_BOUNDARY:
4756 if (!skipassert) return code;
4757 /* Fall through */
4758
4759 case OP_CALLOUT:
4760 case OP_CREF:
4761 case OP_DNCREF:
4762 case OP_RREF:
4763 case OP_DNRREF:
4764 case OP_FALSE:
4765 case OP_TRUE:
4766 code += PRIV(OP_lengths)[*code];
4767 break;
4768
4769 case OP_CALLOUT_STR:
4770 code += GET(code, 1 + 2*LINK_SIZE);
4771 break;
4772
4773 case OP_SKIPZERO:
4774 code += 2 + GET(code, 2) + LINK_SIZE;
4775 break;
4776
4777 case OP_COND:
4778 case OP_SCOND:
4779 if (code[1+LINK_SIZE] != OP_FALSE || /* Not DEFINE */
4780 code[GET(code, 1)] != OP_KET) /* More than one branch */
4781 return code;
4782 code += GET(code, 1) + 1 + LINK_SIZE;
4783 break;
4784
4785 case OP_MARK:
4786 case OP_COMMIT_ARG:
4787 case OP_PRUNE_ARG:
4788 case OP_SKIP_ARG:
4789 case OP_THEN_ARG:
4790 code += code[1] + PRIV(OP_lengths)[*code];
4791 break;
4792
4793 default:
4794 return code;
4795 }
4796 }
4797 /* Control never reaches here */
4798 }
4799
4800
4801
4802 #ifdef SUPPORT_UNICODE
4803 /*************************************************
4804 * Get othercase range *
4805 *************************************************/
4806
/* Given the start and end of a class range in UCP mode, this function walks
upwards through the characters and reports, one call at a time, the ranges of
"other case" characters that correspond to them. The start value is advanced on
each call so that the next call carries on where this one stopped. A character
that has several other cases is reported by itself, using a distinct return
value.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        -1 when no more
               0 when a range is returned
              >0 the CASESET offset for char with multiple other cases
                in this case, ocptr contains the original
*/

static int
get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr,
  uint32_t *odptr)
{
uint32_t ch = *cptr;
uint32_t oc_first = 0;
uint32_t oc_expected;

/* Step forward to the first character that has any other case. One that
belongs to a multi-case set is handed back immediately with its set offset. */

while (ch <= d)
  {
  unsigned int setoffset = UCD_CASESET(ch);
  if (setoffset != 0)
    {
    *ocptr = ch;        /* Character that has the set */
    *cptr = ch + 1;     /* Rest of input range */
    return (int)setoffset;
    }
  oc_first = UCD_OTHERCASE(ch);
  if (oc_first != ch) break;
  ch++;
  }

if (ch > d) return -1;  /* Ran off the end without finding anything */

/* ch has exactly one other case. Keep going for as long as the following
characters stay inside the input range, are not in a multi-case set, and have
other cases that continue the run contiguously. */

*ocptr = oc_first;
oc_expected = oc_first + 1;
ch++;

while (ch <= d && UCD_CASESET(ch) == 0 && UCD_OTHERCASE(ch) == oc_expected)
  {
  oc_expected++;
  ch++;
  }

*odptr = oc_expected - 1;   /* End of othercase range */
*cptr = ch;                 /* Rest of input range */
return 0;
}
4864 #endif /* SUPPORT_UNICODE */
4865
4866
4867
4868 /*************************************************
4869 * Add a character or range to a class (internal) *
4870 *************************************************/
4871
4872 /* This function packages up the logic of adding a character or range of
4873 characters to a class. The character values in the arguments will be within the
4874 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
4875 called only from within the "add to class" group of functions, some of which
4876 are recursive and mutually recursive. The external entry point is
4877 add_to_class().
4878
4879 Arguments:
4880 classbits the bit map for characters < 256
4881 uchardptr points to the pointer for extra data
4882 options the options word
4883 cb compile data
4884 start start of range character
4885 end end of range character
4886
4887 Returns: the number of < 256 characters added
4888 the pointer to extra data is updated
4889 */
4890
4891 static unsigned int
add_to_class_internal(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,uint32_t start,uint32_t end)4892 add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
4893 uint32_t options, compile_block *cb, uint32_t start, uint32_t end)
4894 {
4895 uint32_t c;
4896 uint32_t classbits_end = (end <= 0xff ? end : 0xff);
4897 unsigned int n8 = 0;
4898
4899 /* If caseless matching is required, scan the range and process alternate
4900 cases. In Unicode, there are 8-bit characters that have alternate cases that
4901 are greater than 255 and vice-versa. Sometimes we can just extend the original
4902 range. */
4903
4904 if ((options & PCRE2_CASELESS) != 0)
4905 {
4906 #ifdef SUPPORT_UNICODE
4907 if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
4908 {
4909 int rc;
4910 uint32_t oc, od;
4911
4912 options &= ~PCRE2_CASELESS; /* Remove for recursive calls */
4913 c = start;
4914
4915 while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
4916 {
4917 /* Handle a single character that has more than one other case. */
4918
4919 if (rc > 0) n8 += add_list_to_class_internal(classbits, uchardptr, options, cb,
4920 PRIV(ucd_caseless_sets) + rc, oc);
4921
4922 /* Do nothing if the other case range is within the original range. */
4923
4924 else if (oc >= cb->class_range_start && od <= cb->class_range_end) continue;
4925
4926 /* Extend the original range if there is overlap, noting that if oc < c, we
4927 can't have od > end because a subrange is always shorter than the basic
4928 range. Otherwise, use a recursive call to add the additional range. */
4929
4930 else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
4931 else if (od > end && oc <= end + 1)
4932 {
4933 end = od; /* Extend upwards */
4934 if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
4935 }
4936 else n8 += add_to_class_internal(classbits, uchardptr, options, cb, oc, od);
4937 }
4938 }
4939 else
4940 #endif /* SUPPORT_UNICODE */
4941
4942 /* Not UTF mode */
4943
4944 for (c = start; c <= classbits_end; c++)
4945 {
4946 SETBIT(classbits, cb->fcc[c]);
4947 n8++;
4948 }
4949 }
4950
4951 /* Now handle the originally supplied range. Adjust the final value according
4952 to the bit length - this means that the same lists of (e.g.) horizontal spaces
4953 can be used in all cases. */
4954
4955 if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR)
4956 end = MAX_NON_UTF_CHAR;
4957
4958 if (start > cb->class_range_start && end < cb->class_range_end) return n8;
4959
4960 /* Use the bitmap for characters < 256. Otherwise use extra data.*/
4961
4962 for (c = start; c <= classbits_end; c++)
4963 {
4964 /* Regardless of start, c will always be <= 255. */
4965 SETBIT(classbits, c);
4966 n8++;
4967 }
4968
4969 #ifdef SUPPORT_WIDE_CHARS
4970 if (start <= 0xff) start = 0xff + 1;
4971
4972 if (end >= start)
4973 {
4974 PCRE2_UCHAR *uchardata = *uchardptr;
4975
4976 #ifdef SUPPORT_UNICODE
4977 if ((options & PCRE2_UTF) != 0)
4978 {
4979 if (start < end)
4980 {
4981 *uchardata++ = XCL_RANGE;
4982 uchardata += PRIV(ord2utf)(start, uchardata);
4983 uchardata += PRIV(ord2utf)(end, uchardata);
4984 }
4985 else if (start == end)
4986 {
4987 *uchardata++ = XCL_SINGLE;
4988 uchardata += PRIV(ord2utf)(start, uchardata);
4989 }
4990 }
4991 else
4992 #endif /* SUPPORT_UNICODE */
4993
4994 /* Without UTF support, character values are constrained by the bit length,
4995 and can only be > 256 for 16-bit and 32-bit libraries. */
4996
4997 #if PCRE2_CODE_UNIT_WIDTH == 8
4998 {}
4999 #else
5000 if (start < end)
5001 {
5002 *uchardata++ = XCL_RANGE;
5003 *uchardata++ = start;
5004 *uchardata++ = end;
5005 }
5006 else if (start == end)
5007 {
5008 *uchardata++ = XCL_SINGLE;
5009 *uchardata++ = start;
5010 }
5011 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
5012 *uchardptr = uchardata; /* Updata extra data pointer */
5013 }
5014 #else /* SUPPORT_WIDE_CHARS */
5015 (void)uchardptr; /* Avoid compiler warning */
5016 #endif /* SUPPORT_WIDE_CHARS */
5017
5018 return n8; /* Number of 8-bit characters */
5019 }
5020
5021
5022
5023 #ifdef SUPPORT_UNICODE
5024 /*************************************************
5025 * Add a list of characters to a class (internal) *
5026 *************************************************/
5027
5028 /* This function is used for adding a list of case-equivalent characters to a
5029 class when in UTF mode. This function is called only from within
5030 add_to_class_internal(), with which it is mutually recursive.
5031
5032 Arguments:
5033 classbits the bit map for characters < 256
5034 uchardptr points to the pointer for extra data
5035 options the options word
5036 cb contains pointers to tables etc.
5037 p points to row of 32-bit values, terminated by NOTACHAR
5038 except character to omit; this is used when adding lists of
5039 case-equivalent characters to avoid including the one we
5040 already know about
5041
5042 Returns: the number of < 256 characters added
5043 the pointer to extra data is updated
5044 */
5045
5046 static unsigned int
add_list_to_class_internal(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,const uint32_t * p,unsigned int except)5047 add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5048 uint32_t options, compile_block *cb, const uint32_t *p, unsigned int except)
5049 {
5050 unsigned int n8 = 0;
5051 while (p[0] < NOTACHAR)
5052 {
5053 unsigned int n = 0;
5054 if (p[0] != except)
5055 {
5056 while(p[n+1] == p[0] + n + 1) n++;
5057 n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]);
5058 }
5059 p += n + 1;
5060 }
5061 return n8;
5062 }
5063 #endif
5064
5065
5066
5067 /*************************************************
5068 * External entry point for add range to class *
5069 *************************************************/
5070
5071 /* This function sets the overall range so that the internal functions can try
5072 to avoid duplication when handling case-independence.
5073
5074 Arguments:
5075 classbits the bit map for characters < 256
5076 uchardptr points to the pointer for extra data
5077 options the options word
5078 cb compile data
5079 start start of range character
5080 end end of range character
5081
5082 Returns: the number of < 256 characters added
5083 the pointer to extra data is updated
5084 */
5085
5086 static unsigned int
add_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,uint32_t start,uint32_t end)5087 add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5088 compile_block *cb, uint32_t start, uint32_t end)
5089 {
5090 cb->class_range_start = start;
5091 cb->class_range_end = end;
5092 return add_to_class_internal(classbits, uchardptr, options, cb, start, end);
5093 }
5094
5095
5096 /*************************************************
5097 * External entry point for add list to class *
5098 *************************************************/
5099
5100 /* This function is used for adding a list of horizontal or vertical whitespace
5101 characters to a class. The list must be in order so that ranges of characters
5102 can be detected and handled appropriately. This function sets the overall range
5103 so that the internal functions can try to avoid duplication when handling
5104 case-independence.
5105
5106 Arguments:
5107 classbits the bit map for characters < 256
5108 uchardptr points to the pointer for extra data
5109 options the options word
5110 cb contains pointers to tables etc.
5111 p points to row of 32-bit values, terminated by NOTACHAR
5112 except character to omit; this is used when adding lists of
5113 case-equivalent characters to avoid including the one we
5114 already know about
5115
5116 Returns: the number of < 256 characters added
5117 the pointer to extra data is updated
5118 */
5119
5120 static unsigned int
add_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,const uint32_t * p,unsigned int except)5121 add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5122 compile_block *cb, const uint32_t *p, unsigned int except)
5123 {
5124 unsigned int n8 = 0;
5125 while (p[0] < NOTACHAR)
5126 {
5127 unsigned int n = 0;
5128 if (p[0] != except)
5129 {
5130 while(p[n+1] == p[0] + n + 1) n++;
5131 cb->class_range_start = p[0];
5132 cb->class_range_end = p[n];
5133 n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]);
5134 }
5135 p += n + 1;
5136 }
5137 return n8;
5138 }
5139
5140
5141
5142 /*************************************************
5143 * Add characters not in a list to a class *
5144 *************************************************/
5145
5146 /* This function is used for adding the complement of a list of horizontal or
5147 vertical whitespace to a class. The list must be in order.
5148
5149 Arguments:
5150 classbits the bit map for characters < 256
5151 uchardptr points to the pointer for extra data
5152 options the options word
5153 cb contains pointers to tables etc.
5154 p points to row of 32-bit values, terminated by NOTACHAR
5155
5156 Returns: the number of < 256 characters added
5157 the pointer to extra data is updated
5158 */
5159
5160 static unsigned int
add_not_list_to_class(uint8_t * classbits,PCRE2_UCHAR ** uchardptr,uint32_t options,compile_block * cb,const uint32_t * p)5161 add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5162 uint32_t options, compile_block *cb, const uint32_t *p)
5163 {
5164 BOOL utf = (options & PCRE2_UTF) != 0;
5165 unsigned int n8 = 0;
5166 if (p[0] > 0)
5167 n8 += add_to_class(classbits, uchardptr, options, cb, 0, p[0] - 1);
5168 while (p[0] < NOTACHAR)
5169 {
5170 while (p[1] == p[0] + 1) p++;
5171 n8 += add_to_class(classbits, uchardptr, options, cb, p[0] + 1,
5172 (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
5173 p++;
5174 }
5175 return n8;
5176 }
5177
5178
5179
5180 /*************************************************
5181 * Find details of duplicate group names *
5182 *************************************************/
5183
5184 /* This is called from compile_branch() when it needs to know the index and
5185 count of duplicates in the names table when processing named backreferences,
5186 either directly, or as conditions.
5187
5188 Arguments:
5189 name points to the name
5190 length the length of the name
5191 indexptr where to put the index
5192 countptr where to put the count of duplicates
5193 errorcodeptr where to put an error code
5194 cb the compile block
5195
5196 Returns: TRUE if OK, FALSE if not, error code set
5197 */
5198
5199 static BOOL
find_dupname_details(PCRE2_SPTR name,uint32_t length,int * indexptr,int * countptr,int * errorcodeptr,compile_block * cb)5200 find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr,
5201 int *countptr, int *errorcodeptr, compile_block *cb)
5202 {
5203 uint32_t i, groupnumber;
5204 int count;
5205 PCRE2_UCHAR *slot = cb->name_table;
5206
5207 /* Find the first entry in the table */
5208
5209 for (i = 0; i < cb->names_found; i++)
5210 {
5211 if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) == 0 &&
5212 slot[IMM2_SIZE+length] == 0) break;
5213 slot += cb->name_entry_size;
5214 }
5215
5216 /* This should not occur, because this function is called only when we know we
5217 have duplicate names. Give an internal error. */
5218
5219 if (i >= cb->names_found)
5220 {
5221 *errorcodeptr = ERR53;
5222 cb->erroroffset = name - cb->start_pattern;
5223 return FALSE;
5224 }
5225
5226 /* Record the index and then see how many duplicates there are, updating the
5227 backref map and maximum back reference as we do. */
5228
5229 *indexptr = i;
5230 count = 0;
5231
5232 for (;;)
5233 {
5234 count++;
5235 groupnumber = GET2(slot,0);
5236 cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
5237 if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
5238 if (++i >= cb->names_found) break;
5239 slot += cb->name_entry_size;
5240 if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) != 0 ||
5241 (slot+IMM2_SIZE)[length] != 0) break;
5242 }
5243
5244 *countptr = count;
5245 return TRUE;
5246 }
5247
5248
5249
5250 /*************************************************
5251 * Compile one branch *
5252 *************************************************/
5253
/* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If
the options are changed during the branch, the pointer is used to change the
external options bits. This function is used during the pre-compile phase when
we are trying to find out the amount of memory needed, as well as during the
real compile phase. The value of lengthptr distinguishes the two phases.
5259
5260 Arguments:
5261 optionsptr pointer to the option bits
5262 codeptr points to the pointer to the current code point
5263 pptrptr points to the current parsed pattern pointer
5264 errorcodeptr points to error code variable
5265 firstcuptr place to put the first required code unit
5266 firstcuflagsptr place to put the first code unit flags, or a negative number
5267 reqcuptr place to put the last required code unit
5268 reqcuflagsptr place to put the last required code unit flags, or a negative number
5269 bcptr points to current branch chain
5270 cb contains pointers to tables etc.
5271 lengthptr NULL during the real compile phase
5272 points to length accumulator during pre-compile phase
5273
5274 Returns: 0 There's been an error, *errorcodeptr is non-zero
5275 +1 Success, this branch must match at least one character
5276 -1 Success, this branch may match an empty string
5277 */
5278
5279 static int
compile_branch(uint32_t * optionsptr,PCRE2_UCHAR ** codeptr,uint32_t ** pptrptr,int * errorcodeptr,uint32_t * firstcuptr,int32_t * firstcuflagsptr,uint32_t * reqcuptr,int32_t * reqcuflagsptr,branch_chain * bcptr,compile_block * cb,PCRE2_SIZE * lengthptr)5280 compile_branch(uint32_t *optionsptr, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
5281 int *errorcodeptr, uint32_t *firstcuptr, int32_t *firstcuflagsptr,
5282 uint32_t *reqcuptr, int32_t *reqcuflagsptr, branch_chain *bcptr,
5283 compile_block *cb, PCRE2_SIZE *lengthptr)
5284 {
5285 int bravalue = 0;
5286 int okreturn = -1;
5287 int group_return = 0;
5288 uint32_t repeat_min = 0, repeat_max = 0; /* To please picky compilers */
5289 uint32_t greedy_default, greedy_non_default;
5290 uint32_t repeat_type, op_type;
5291 uint32_t options = *optionsptr; /* May change dynamically */
5292 uint32_t firstcu, reqcu;
5293 uint32_t zeroreqcu, zerofirstcu;
5294 uint32_t escape;
5295 uint32_t *pptr = *pptrptr;
5296 uint32_t meta, meta_arg;
5297 int32_t firstcuflags, reqcuflags;
5298 int32_t zeroreqcuflags, zerofirstcuflags;
5299 int32_t req_caseopt, reqvary, tempreqvary;
5300 PCRE2_SIZE offset = 0;
5301 PCRE2_SIZE length_prevgroup = 0;
5302 PCRE2_UCHAR *code = *codeptr;
5303 PCRE2_UCHAR *last_code = code;
5304 PCRE2_UCHAR *orig_code = code;
5305 PCRE2_UCHAR *tempcode;
5306 PCRE2_UCHAR *previous = NULL;
5307 PCRE2_UCHAR op_previous;
5308 BOOL groupsetfirstcu = FALSE;
5309 BOOL had_accept = FALSE;
5310 BOOL matched_char = FALSE;
5311 BOOL previous_matched_char = FALSE;
5312 BOOL reset_caseful = FALSE;
5313 const uint8_t *cbits = cb->cbits;
5314 uint8_t classbits[32];
5315
5316 /* We can fish out the UTF setting once and for all into a BOOL, but we must
5317 not do this for other options (e.g. PCRE2_EXTENDED) because they may change
5318 dynamically as we process the pattern. */
5319
5320 #ifdef SUPPORT_UNICODE
5321 BOOL utf = (options & PCRE2_UTF) != 0;
5322 BOOL ucp = (options & PCRE2_UCP) != 0;
5323 #else /* No Unicode support */
5324 BOOL utf = FALSE;
5325 #endif
5326
5327 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
5328 class_uchardata always so that it can be passed to add_to_class() always,
5329 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
5330 alternative calls for the different cases. */
5331
5332 PCRE2_UCHAR *class_uchardata;
5333 #ifdef SUPPORT_WIDE_CHARS
5334 BOOL xclass;
5335 PCRE2_UCHAR *class_uchardata_base;
5336 #endif
5337
5338 /* Set up the default and non-default settings for greediness */
5339
5340 greedy_default = ((options & PCRE2_UNGREEDY) != 0);
5341 greedy_non_default = greedy_default ^ 1;
5342
5343 /* Initialize no first unit, no required unit. REQ_UNSET means "no char
5344 matching encountered yet". It gets changed to REQ_NONE if we hit something that
5345 matches a non-fixed first unit; reqcu just remains unset if we never find one.
5346
5347 When we hit a repeat whose minimum is zero, we may have to adjust these values
5348 to take the zero repeat into account. This is implemented by setting them to
5349 zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
5350 item types that can be repeated set these backoff variables appropriately. */
5351
5352 firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
5353 firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
5354
5355 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
5356 according to the current setting of the caseless flag. The REQ_CASELESS value
5357 leaves the lower 28 bit empty. It is added into the firstcu or reqcu variables
5358 to record the case status of the value. This is used only for ASCII characters.
5359 */
5360
5361 req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS:0;
5362
5363 /* Switch on next META item until the end of the branch */
5364
5365 for (;; pptr++)
5366 {
5367 #ifdef SUPPORT_WIDE_CHARS
5368 BOOL xclass_has_prop;
5369 #endif
5370 BOOL negate_class;
5371 BOOL should_flip_negation;
5372 BOOL match_all_or_no_wide_chars;
5373 BOOL possessive_quantifier;
5374 BOOL note_group_empty;
5375 int class_has_8bitchar;
5376 int i;
5377 uint32_t mclength;
5378 uint32_t skipunits;
5379 uint32_t subreqcu, subfirstcu;
5380 uint32_t groupnumber;
5381 uint32_t verbarglen, verbculen;
5382 int32_t subreqcuflags, subfirstcuflags; /* Must be signed */
5383 open_capitem *oc;
5384 PCRE2_UCHAR mcbuffer[8];
5385
5386 /* Get next META item in the pattern and its potential argument. */
5387
5388 meta = META_CODE(*pptr);
5389 meta_arg = META_DATA(*pptr);
5390
5391 /* If we are in the pre-compile phase, accumulate the length used for the
5392 previous cycle of this loop, unless the next item is a quantifier. */
5393
5394 if (lengthptr != NULL)
5395 {
5396 if (code > cb->start_workspace + cb->workspace_size -
5397 WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
5398 {
5399 *errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)?
5400 ERR52 : ERR86;
5401 return 0;
5402 }
5403
5404 /* There is at least one situation where code goes backwards: this is the
5405 case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier
5406 is processed, the whole class is eliminated. However, it is created first,
5407 so we have to allow memory for it. Therefore, don't ever reduce the length
5408 at this point. */
5409
5410 if (code < last_code) code = last_code;
5411
5412 /* If the next thing is not a quantifier, we add the length of the previous
5413 item into the total, and reset the code pointer to the start of the
5414 workspace. Otherwise leave the previous item available to be quantified. */
5415
5416 if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5417 {
5418 if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))
5419 {
5420 *errorcodeptr = ERR20; /* Integer overflow */
5421 return 0;
5422 }
5423 *lengthptr += (PCRE2_SIZE)(code - orig_code);
5424 if (*lengthptr > MAX_PATTERN_SIZE)
5425 {
5426 *errorcodeptr = ERR20; /* Pattern is too large */
5427 return 0;
5428 }
5429 code = orig_code;
5430 }
5431
5432 /* Remember where this code item starts so we can catch the "backwards"
5433 case above next time round. */
5434
5435 last_code = code;
5436 }
5437
5438 /* Process the next parsed pattern item. If it is not a quantifier, remember
5439 where it starts so that it can be quantified when a quantifier follows.
5440 Checking for the legality of quantifiers happens in parse_regex(), except for
5441 a quantifier after an assertion that is a condition. */
5442
5443 if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5444 {
5445 previous = code;
5446 if (matched_char && !had_accept) okreturn = 1;
5447 }
5448
5449 previous_matched_char = matched_char;
5450 matched_char = FALSE;
5451 note_group_empty = FALSE;
5452 skipunits = 0; /* Default value for most subgroups */
5453
5454 switch(meta)
5455 {
5456 /* ===================================================================*/
5457 /* The branch terminates at pattern end or | or ) */
5458
5459 case META_END:
5460 case META_ALT:
5461 case META_KET:
5462 *firstcuptr = firstcu;
5463 *firstcuflagsptr = firstcuflags;
5464 *reqcuptr = reqcu;
5465 *reqcuflagsptr = reqcuflags;
5466 *codeptr = code;
5467 *pptrptr = pptr;
5468 return okreturn;
5469
5470
5471 /* ===================================================================*/
5472 /* Handle single-character metacharacters. In multiline mode, ^ disables
5473 the setting of any following char as a first character. */
5474
5475 case META_CIRCUMFLEX:
5476 if ((options & PCRE2_MULTILINE) != 0)
5477 {
5478 if (firstcuflags == REQ_UNSET)
5479 zerofirstcuflags = firstcuflags = REQ_NONE;
5480 *code++ = OP_CIRCM;
5481 }
5482 else *code++ = OP_CIRC;
5483 break;
5484
5485 case META_DOLLAR:
5486 *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
5487 break;
5488
5489 /* There can never be a first char if '.' is first, whatever happens about
5490 repeats. The value of reqcu doesn't change either. */
5491
5492 case META_DOT:
5493 matched_char = TRUE;
5494 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5495 zerofirstcu = firstcu;
5496 zerofirstcuflags = firstcuflags;
5497 zeroreqcu = reqcu;
5498 zeroreqcuflags = reqcuflags;
5499 *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
5500 break;
5501
5502
5503 /* ===================================================================*/
5504 /* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.
5505 Otherwise, an initial ']' is taken as a data character. When empty classes
5506 are allowed, [] must always fail, so generate OP_FAIL, whereas [^] must
5507 match any character, so generate OP_ALLANY. */
5508
5509 case META_CLASS_EMPTY:
5510 case META_CLASS_EMPTY_NOT:
5511 matched_char = TRUE;
5512 *code++ = (meta == META_CLASS_EMPTY_NOT)? OP_ALLANY : OP_FAIL;
5513 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5514 zerofirstcu = firstcu;
5515 zerofirstcuflags = firstcuflags;
5516 break;
5517
5518
5519 /* ===================================================================*/
5520 /* Non-empty character class. If the included characters are all < 256, we
5521 build a 32-byte bitmap of the permitted characters, except in the special
5522 case where there is only one such character. For negated classes, we build
5523 the map as usual, then invert it at the end. However, we use a different
5524 opcode so that data characters > 255 can be handled correctly.
5525
5526 If the class contains characters outside the 0-255 range, a different
5527 opcode is compiled. It may optionally have a bit map for characters < 256,
5528 but those above are are explicitly listed afterwards. A flag code unit
5529 tells whether the bitmap is present, and whether this is a negated class or
5530 not. */
5531
5532 case META_CLASS_NOT:
5533 case META_CLASS:
5534 matched_char = TRUE;
5535 negate_class = meta == META_CLASS_NOT;
5536
5537 /* We can optimize the case of a single character in a class by generating
5538 OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's
5539 negative. In the negative case there can be no first char if this item is
5540 first, whatever repeat count may follow. In the case of reqcu, save the
5541 previous value for reinstating. */
5542
5543 /* NOTE: at present this optimization is not effective if the only
5544 character in a class in 32-bit, non-UCP mode has its top bit set. */
5545
5546 if (pptr[1] < META_END && pptr[2] == META_CLASS_END)
5547 {
5548 #ifdef SUPPORT_UNICODE
5549 uint32_t d;
5550 #endif
5551 uint32_t c = pptr[1];
5552
5553 pptr += 2; /* Move on to class end */
5554 if (meta == META_CLASS) /* A positive one-char class can be */
5555 { /* handled as a normal literal character. */
5556 meta = c; /* Set up the character */
5557 goto NORMAL_CHAR_SET;
5558 }
5559
5560 /* Handle a negative one-character class */
5561
5562 zeroreqcu = reqcu;
5563 zeroreqcuflags = reqcuflags;
5564 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5565 zerofirstcu = firstcu;
5566 zerofirstcuflags = firstcuflags;
5567
5568 /* For caseless UTF or UCP mode, check whether this character has more
5569 than one other case. If so, generate a special OP_NOTPROP item instead of
5570 OP_NOTI. */
5571
5572 #ifdef SUPPORT_UNICODE
5573 if ((utf||ucp) && (options & PCRE2_CASELESS) != 0 &&
5574 (d = UCD_CASESET(c)) != 0)
5575 {
5576 *code++ = OP_NOTPROP;
5577 *code++ = PT_CLIST;
5578 *code++ = d;
5579 break; /* We are finished with this class */
5580 }
5581 #endif
5582 /* Char has only one other case, or UCP not available */
5583
5584 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
5585 code += PUTCHAR(c, code);
5586 break; /* We are finished with this class */
5587 } /* End of 1-char optimization */
5588
5589 /* Handle character classes that contain more than just one literal
5590 character. If there are exactly two characters in a positive class, see if
5591 they are case partners. This can be optimized to generate a caseless single
5592 character match (which also sets first/required code units if relevant). */
5593
5594 if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END &&
5595 pptr[3] == META_CLASS_END)
5596 {
5597 uint32_t c = pptr[1];
5598
5599 #ifdef SUPPORT_UNICODE
5600 if (UCD_CASESET(c) == 0)
5601 #endif
5602 {
5603 uint32_t d;
5604
5605 #ifdef SUPPORT_UNICODE
5606 if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
5607 #endif
5608 {
5609 #if PCRE2_CODE_UNIT_WIDTH != 8
5610 if (c > 255) d = c; else
5611 #endif
5612 d = TABLE_GET(c, cb->fcc, c);
5613 }
5614
5615 if (c != d && pptr[2] == d)
5616 {
5617 pptr += 3; /* Move on to class end */
5618 meta = c;
5619 if ((options & PCRE2_CASELESS) == 0)
5620 {
5621 reset_caseful = TRUE;
5622 options |= PCRE2_CASELESS;
5623 req_caseopt = REQ_CASELESS;
5624 }
5625 goto CLASS_CASELESS_CHAR;
5626 }
5627 }
5628 }
5629
5630 /* If a non-extended class contains a negative special such as \S, we need
5631 to flip the negation flag at the end, so that support for characters > 255
5632 works correctly (they are all included in the class). An extended class may
5633 need to insert specific matching or non-matching code for wide characters.
5634 */
5635
5636 should_flip_negation = match_all_or_no_wide_chars = FALSE;
5637
5638 /* Extended class (xclass) will be used when characters > 255
5639 might match. */
5640
5641 #ifdef SUPPORT_WIDE_CHARS
5642 xclass = FALSE;
5643 class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
5644 class_uchardata_base = class_uchardata; /* Save the start */
5645 #endif
5646
5647 /* For optimization purposes, we track some properties of the class:
5648 class_has_8bitchar will be non-zero if the class contains at least one
5649 character with a code point less than 256; xclass_has_prop will be TRUE if
5650 Unicode property checks are present in the class. */
5651
5652 class_has_8bitchar = 0;
5653 #ifdef SUPPORT_WIDE_CHARS
5654 xclass_has_prop = FALSE;
5655 #endif
5656
5657 /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
5658 in a temporary bit of memory, in case the class contains fewer than two
5659 8-bit characters because in that case the compiled code doesn't use the bit
5660 map. */
5661
5662 memset(classbits, 0, 32 * sizeof(uint8_t));
5663
5664 /* Process items until META_CLASS_END is reached. */
5665
5666 while ((meta = *(++pptr)) != META_CLASS_END)
5667 {
5668 /* Handle POSIX classes such as [:alpha:] etc. */
5669
5670 if (meta == META_POSIX || meta == META_POSIX_NEG)
5671 {
5672 BOOL local_negate = (meta == META_POSIX_NEG);
5673 int posix_class = *(++pptr);
5674 int taboffset, tabopt;
5675 uint8_t pbits[32];
5676
5677 should_flip_negation = local_negate; /* Note negative special */
5678
5679 /* If matching is caseless, upper and lower are converted to alpha.
5680 This relies on the fact that the class table starts with alpha,
5681 lower, upper as the first 3 entries. */
5682
5683 if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)
5684 posix_class = 0;
5685
5686 /* When PCRE2_UCP is set, some of the POSIX classes are converted to
5687 different escape sequences that use Unicode properties \p or \P.
5688 Others that are not available via \p or \P have to generate
5689 XCL_PROP/XCL_NOTPROP directly, which is done here. */
5690
5691 #ifdef SUPPORT_UNICODE
5692 if ((options & PCRE2_UCP) != 0) switch(posix_class)
5693 {
5694 case PC_GRAPH:
5695 case PC_PRINT:
5696 case PC_PUNCT:
5697 *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
5698 *class_uchardata++ = (PCRE2_UCHAR)
5699 ((posix_class == PC_GRAPH)? PT_PXGRAPH :
5700 (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT);
5701 *class_uchardata++ = 0;
5702 xclass_has_prop = TRUE;
5703 goto CONTINUE_CLASS;
5704
5705 /* For the other POSIX classes (ascii, xdigit) we are going to
5706 fall through to the non-UCP case and build a bit map for
5707 characters with code points less than 256. However, if we are in
5708 a negated POSIX class, characters with code points greater than
5709 255 must either all match or all not match, depending on whether
5710 the whole class is not or is negated. For example, for
5711 [[:^ascii:]... they must all match, whereas for [^[:^xdigit:]...
5712 they must not.
5713
5714 In the special case where there are no xclass items, this is
5715 automatically handled by the use of OP_CLASS or OP_NCLASS, but an
5716 explicit range is needed for OP_XCLASS. Setting a flag here
5717 causes the range to be generated later when it is known that
5718 OP_XCLASS is required. In the 8-bit library this is relevant only in
5719 utf mode, since no wide characters can exist otherwise. */
5720
5721 default:
5722 #if PCRE2_CODE_UNIT_WIDTH == 8
5723 if (utf)
5724 #endif
5725 match_all_or_no_wide_chars |= local_negate;
5726 break;
5727 }
5728 #endif /* SUPPORT_UNICODE */
5729
5730 /* In the non-UCP case, or when UCP makes no difference, we build the
5731 bit map for the POSIX class in a chunk of local store because we may
5732 be adding and subtracting from it, and we don't want to subtract bits
5733 that may be in the main map already. At the end we or the result into
5734 the bit map that is being built. */
5735
5736 posix_class *= 3;
5737
5738 /* Copy in the first table (always present) */
5739
5740 memcpy(pbits, cbits + posix_class_maps[posix_class],
5741 32 * sizeof(uint8_t));
5742
5743 /* If there is a second table, add or remove it as required. */
5744
5745 taboffset = posix_class_maps[posix_class + 1];
5746 tabopt = posix_class_maps[posix_class + 2];
5747
5748 if (taboffset >= 0)
5749 {
5750 if (tabopt >= 0)
5751 for (i = 0; i < 32; i++) pbits[i] |= cbits[(int)i + taboffset];
5752 else
5753 for (i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset];
5754 }
5755
5756 /* Now see if we need to remove any special characters. An option
5757 value of 1 removes vertical space and 2 removes underscore. */
5758
5759 if (tabopt < 0) tabopt = -tabopt;
5760 if (tabopt == 1) pbits[1] &= ~0x3c;
5761 else if (tabopt == 2) pbits[11] &= 0x7f;
5762
5763 /* Add the POSIX table or its complement into the main table that is
5764 being built and we are done. */
5765
5766 if (local_negate)
5767 for (i = 0; i < 32; i++) classbits[i] |= ~pbits[i];
5768 else
5769 for (i = 0; i < 32; i++) classbits[i] |= pbits[i];
5770
5771 /* Every class contains at least one < 256 character. */
5772
5773 class_has_8bitchar = 1;
5774 goto CONTINUE_CLASS; /* End of POSIX handling */
5775 }
5776
5777 /* Other than POSIX classes, the only items we should encounter are
5778 \d-type escapes and literal characters (possibly as ranges). */
5779
5780 if (meta == META_BIGVALUE)
5781 {
5782 meta = *(++pptr);
5783 goto CLASS_LITERAL;
5784 }
5785
5786 /* Any other non-literal must be an escape */
5787
5788 if (meta >= META_END)
5789 {
5790 if (META_CODE(meta) != META_ESCAPE)
5791 {
5792 #ifdef DEBUG_SHOW_PARSED
5793 fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x "
5794 "in character class\n", meta);
5795 #endif
5796 *errorcodeptr = ERR89; /* Internal error - unrecognized. */
5797 return 0;
5798 }
5799 escape = META_DATA(meta);
5800
5801 /* Every class contains at least one < 256 character. */
5802
5803 class_has_8bitchar++;
5804
5805 switch(escape)
5806 {
5807 case ESC_d:
5808 for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
5809 break;
5810
5811 case ESC_D:
5812 should_flip_negation = TRUE;
5813 for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_digit];
5814 break;
5815
5816 case ESC_w:
5817 for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
5818 break;
5819
5820 case ESC_W:
5821 should_flip_negation = TRUE;
5822 for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_word];
5823 break;
5824
5825 /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5826 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5827 previously set by something earlier in the character class.
5828 Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5829 we could just adjust the appropriate bit. From PCRE 8.34 we no
5830 longer treat \s and \S specially. */
5831
5832 case ESC_s:
5833 for (i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
5834 break;
5835
5836 case ESC_S:
5837 should_flip_negation = TRUE;
5838 for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_space];
5839 break;
5840
5841 /* When adding the horizontal or vertical space lists to a class, or
5842 their complements, disable PCRE2_CASELESS, because it just wastes
5843 time, and in the "not-x" UTF cases can create unwanted duplicates in
5844 the XCLASS list (provoked by characters that have more than one other
5845 case and by both cases being in the same "not-x" sublist). */
5846
5847 case ESC_h:
5848 (void)add_list_to_class(classbits, &class_uchardata,
5849 options & ~PCRE2_CASELESS, cb, PRIV(hspace_list), NOTACHAR);
5850 break;
5851
5852 case ESC_H:
5853 (void)add_not_list_to_class(classbits, &class_uchardata,
5854 options & ~PCRE2_CASELESS, cb, PRIV(hspace_list));
5855 break;
5856
5857 case ESC_v:
5858 (void)add_list_to_class(classbits, &class_uchardata,
5859 options & ~PCRE2_CASELESS, cb, PRIV(vspace_list), NOTACHAR);
5860 break;
5861
5862 case ESC_V:
5863 (void)add_not_list_to_class(classbits, &class_uchardata,
5864 options & ~PCRE2_CASELESS, cb, PRIV(vspace_list));
5865 break;
5866
5867 /* If Unicode is not supported, \P and \p are not allowed and are
5868 faulted at parse time, so will never appear here. */
5869
5870 #ifdef SUPPORT_UNICODE
5871 case ESC_p:
5872 case ESC_P:
5873 {
5874 uint32_t ptype = *(++pptr) >> 16;
5875 uint32_t pdata = *pptr & 0xffff;
5876 *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP;
5877 *class_uchardata++ = ptype;
5878 *class_uchardata++ = pdata;
5879 xclass_has_prop = TRUE;
5880 class_has_8bitchar--; /* Undo! */
5881 }
5882 break;
5883 #endif
5884 }
5885
5886 goto CONTINUE_CLASS;
5887 } /* End handling \d-type escapes */
5888
5889 /* A literal character may be followed by a range meta. At parse time
5890 there are checks for out-of-order characters, for ranges where the two
5891 characters are equal, and for hyphens that cannot indicate a range. At
5892 this point, therefore, no checking is needed. */
5893
5894 else
5895 {
5896 uint32_t c, d;
5897
5898 CLASS_LITERAL:
5899 c = d = meta;
5900
5901 /* Remember if \r or \n were explicitly used */
5902
5903 if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
5904
5905 /* Process a character range */
5906
5907 if (pptr[1] == META_RANGE_LITERAL || pptr[1] == META_RANGE_ESCAPED)
5908 {
5909 #ifdef EBCDIC
5910 BOOL range_is_literal = (pptr[1] == META_RANGE_LITERAL);
5911 #endif
5912 pptr += 2;
5913 d = *pptr;
5914 if (d == META_BIGVALUE) d = *(++pptr);
5915
5916 /* Remember an explicit \r or \n, and add the range to the class. */
5917
5918 if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
5919
5920 /* In an EBCDIC environment, Perl treats alphabetic ranges specially
5921 because there are holes in the encoding, and simply using the range
5922 A-Z (for example) would include the characters in the holes. This
5923 applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
5924
5925 #ifdef EBCDIC
5926 if (range_is_literal &&
5927 (cb->ctypes[c] & ctype_letter) != 0 &&
5928 (cb->ctypes[d] & ctype_letter) != 0 &&
5929 (c <= CHAR_z) == (d <= CHAR_z))
5930 {
5931 uint32_t uc = (d <= CHAR_z)? 0 : 64;
5932 uint32_t C = c - uc;
5933 uint32_t D = d - uc;
5934
5935 if (C <= CHAR_i)
5936 {
5937 class_has_8bitchar +=
5938 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
5939 ((D < CHAR_i)? D : CHAR_i) + uc);
5940 C = CHAR_j;
5941 }
5942
5943 if (C <= D && C <= CHAR_r)
5944 {
5945 class_has_8bitchar +=
5946 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
5947 ((D < CHAR_r)? D : CHAR_r) + uc);
5948 C = CHAR_s;
5949 }
5950
5951 if (C <= D)
5952 {
5953 class_has_8bitchar +=
5954 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
5955 D + uc);
5956 }
5957 }
5958 else
5959 #endif
5960 /* Not an EBCDIC special range */
5961
5962 class_has_8bitchar +=
5963 add_to_class(classbits, &class_uchardata, options, cb, c, d);
5964 goto CONTINUE_CLASS; /* Go get the next char in the class */
5965 } /* End of range handling */
5966
5967
5968 /* Handle a single character. */
5969
5970 class_has_8bitchar +=
5971 add_to_class(classbits, &class_uchardata, options, cb, meta, meta);
5972 }
5973
5974 /* Continue to the next item in the class. */
5975
5976 CONTINUE_CLASS:
5977
5978 #ifdef SUPPORT_WIDE_CHARS
5979 /* If any wide characters or Unicode properties have been encountered,
5980 set xclass = TRUE. Then, in the pre-compile phase, accumulate the length
5981 of the extra data and reset the pointer. This is so that very large
5982 classes that contain a zillion wide characters or Unicode property tests
5983 do not overwrite the workspace (which is on the stack). */
5984
5985 if (class_uchardata > class_uchardata_base)
5986 {
5987 xclass = TRUE;
5988 if (lengthptr != NULL)
5989 {
5990 *lengthptr += class_uchardata - class_uchardata_base;
5991 class_uchardata = class_uchardata_base;
5992 }
5993 }
5994 #endif
5995
5996 continue; /* Needed to avoid error when not supporting wide chars */
5997 } /* End of main class-processing loop */
5998
5999 /* If this class is the first thing in the branch, there can be no first
6000 char setting, whatever the repeat count. Any reqcu setting must remain
6001 unchanged after any kind of repeat. */
6002
6003 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6004 zerofirstcu = firstcu;
6005 zerofirstcuflags = firstcuflags;
6006 zeroreqcu = reqcu;
6007 zeroreqcuflags = reqcuflags;
6008
6009 /* If there are characters with values > 255, or Unicode property settings
6010 (\p or \P), we have to compile an extended class, with its own opcode,
6011 unless there were no property settings and there was a negated special such
6012 as \S in the class, and PCRE2_UCP is not set, because in that case all
6013 characters > 255 are in or not in the class, so any that were explicitly
6014 given as well can be ignored.
6015
6016 In the UCP case, if certain negated POSIX classes ([:^ascii:] or
6017 [:^xdigit:]) were present in a class, we either have to match or not match
6018 all wide characters (depending on whether the whole class is or is not
6019 negated). This requirement is indicated by match_all_or_no_wide_chars being
6020 true. We do this by including an explicit range, which works in both cases.
6021 This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there
6022 cannot be any wide characters in 8-bit non-UTF mode.
6023
6024 When there *are* properties in a positive UTF-8 or any 16-bit or 32-bit
6025 class where \S etc is present without PCRE2_UCP, causing an extended class
6026 to be compiled, we make sure that all characters > 255 are included by
6027 forcing match_all_or_no_wide_chars to be true.
6028
6029 If, when generating an xclass, there are no characters < 256, we can omit
6030 the bitmap in the actual compiled code. */
6031
6032 #ifdef SUPPORT_WIDE_CHARS /* Defined for 16/32 bits, or 8-bit with Unicode */
6033 if (xclass && (
6034 #ifdef SUPPORT_UNICODE
6035 (options & PCRE2_UCP) != 0 ||
6036 #endif
6037 xclass_has_prop || !should_flip_negation))
6038 {
6039 if (match_all_or_no_wide_chars || (
6040 #if PCRE2_CODE_UNIT_WIDTH == 8
6041 utf &&
6042 #endif
6043 should_flip_negation && !negate_class && (options & PCRE2_UCP) == 0))
6044 {
6045 *class_uchardata++ = XCL_RANGE;
6046 if (utf) /* Will always be utf in the 8-bit library */
6047 {
6048 class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
6049 class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata);
6050 }
6051 else /* Can only happen for the 16-bit & 32-bit libraries */
6052 {
6053 #if PCRE2_CODE_UNIT_WIDTH == 16
6054 *class_uchardata++ = 0x100;
6055 *class_uchardata++ = 0xffffu;
6056 #elif PCRE2_CODE_UNIT_WIDTH == 32
6057 *class_uchardata++ = 0x100;
6058 *class_uchardata++ = 0xffffffffu;
6059 #endif
6060 }
6061 }
6062 *class_uchardata++ = XCL_END; /* Marks the end of extra data */
6063 *code++ = OP_XCLASS;
6064 code += LINK_SIZE;
6065 *code = negate_class? XCL_NOT:0;
6066 if (xclass_has_prop) *code |= XCL_HASPROP;
6067
6068 /* If the map is required, move up the extra data to make room for it;
6069 otherwise just move the code pointer to the end of the extra data. */
6070
6071 if (class_has_8bitchar > 0)
6072 {
6073 *code++ |= XCL_MAP;
6074 (void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,
6075 CU2BYTES(class_uchardata - code));
6076 if (negate_class && !xclass_has_prop)
6077 {
6078 /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
6079 for (i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
6080 }
6081 memcpy(code, classbits, 32);
6082 code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
6083 }
6084 else code = class_uchardata;
6085
6086 /* Now fill in the complete length of the item */
6087
6088 PUT(previous, 1, (int)(code - previous));
6089 break; /* End of class handling */
6090 }
6091 #endif /* SUPPORT_WIDE_CHARS */
6092
6093 /* If there are no characters > 255, or they are all to be included or
6094 excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
6095 whole class was negated and whether there were negative specials such as \S
6096 (non-UCP) in the class. Then copy the 32-byte map into the code vector,
6097 negating it if necessary. */
6098
6099 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
6100 if (lengthptr == NULL) /* Save time in the pre-compile phase */
6101 {
6102 if (negate_class)
6103 {
6104 /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
6105 for (i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
6106 }
6107 memcpy(code, classbits, 32);
6108 }
6109 code += 32 / sizeof(PCRE2_UCHAR);
6110 break; /* End of class processing */
6111
6112
6113 /* ===================================================================*/
6114 /* Deal with (*VERB)s. */
6115
6116 /* Check for open captures before ACCEPT and close those that are within
6117 the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an
6118 assertion. In the first pass, just accumulate the length required;
6119 otherwise hitting (*ACCEPT) inside many nested parentheses can cause
6120 workspace overflow. Do not set firstcu after *ACCEPT. */
6121
6122 case META_ACCEPT:
6123 cb->had_accept = had_accept = TRUE;
6124 for (oc = cb->open_caps;
6125 oc != NULL && oc->assert_depth >= cb->assert_depth;
6126 oc = oc->next)
6127 {
6128 if (lengthptr != NULL)
6129 {
6130 *lengthptr += CU2BYTES(1) + IMM2_SIZE;
6131 }
6132 else
6133 {
6134 *code++ = OP_CLOSE;
6135 PUT2INC(code, 0, oc->number);
6136 }
6137 }
6138 *code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6139 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6140 break;
6141
6142 case META_PRUNE:
6143 case META_SKIP:
6144 cb->had_pruneorskip = TRUE;
6145 /* Fall through */
6146 case META_COMMIT:
6147 case META_FAIL:
6148 *code++ = verbops[(meta - META_MARK) >> 16];
6149 break;
6150
6151 case META_THEN:
6152 cb->external_flags |= PCRE2_HASTHEN;
6153 *code++ = OP_THEN;
6154 break;
6155
6156 /* Handle verbs with arguments. Arguments can be very long, especially in
6157 16- and 32-bit modes, and can overflow the workspace in the first pass.
6158 However, the argument length is constrained to be small enough to fit in
6159 one code unit. This check happens in parse_regex(). In the first pass,
6160 instead of putting the argument into memory, we just update the length
6161 counter and set up an empty argument. */
6162
6163 case META_THEN_ARG:
6164 cb->external_flags |= PCRE2_HASTHEN;
6165 goto VERB_ARG;
6166
6167 case META_PRUNE_ARG:
6168 case META_SKIP_ARG:
6169 cb->had_pruneorskip = TRUE;
6170 /* Fall through */
6171 case META_MARK:
6172 case META_COMMIT_ARG:
6173 VERB_ARG:
6174 *code++ = verbops[(meta - META_MARK) >> 16];
6175 /* The length is in characters. */
6176 verbarglen = *(++pptr);
6177 verbculen = 0;
6178 tempcode = code++;
6179 for (i = 0; i < (int)verbarglen; i++)
6180 {
6181 meta = *(++pptr);
6182 #ifdef SUPPORT_UNICODE
6183 if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
6184 #endif
6185 {
6186 mclength = 1;
6187 mcbuffer[0] = meta;
6188 }
6189 if (lengthptr != NULL) *lengthptr += mclength; else
6190 {
6191 memcpy(code, mcbuffer, CU2BYTES(mclength));
6192 code += mclength;
6193 verbculen += mclength;
6194 }
6195 }
6196
6197 *tempcode = verbculen; /* Fill in the code unit length */
6198 *code++ = 0; /* Terminating zero */
6199 break;
6200
6201
6202 /* ===================================================================*/
6203 /* Handle options change. The new setting must be passed back for use in
6204 subsequent branches. Reset the greedy defaults and the case value for
6205 firstcu and reqcu. */
6206
6207 case META_OPTIONS:
6208 *optionsptr = options = *(++pptr);
6209 greedy_default = ((options & PCRE2_UNGREEDY) != 0);
6210 greedy_non_default = greedy_default ^ 1;
6211 req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6212 break;
6213
6214
6215 /* ===================================================================*/
6216 /* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous
6217 because it could be a numerical check on recursion, or a name check on a
6218 group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that
6219 we can handle it either way. We first try for a name; if not found, process
6220 the number. */
6221
6222 case META_COND_RNUMBER: /* (?(Rdigits) */
6223 case META_COND_NAME: /* (?(name) or (?'name') or ?(<name>) */
6224 case META_COND_RNAME: /* (?(R&name) - test for recursion */
6225 bravalue = OP_COND;
6226 {
6227 int count, index;
6228 PCRE2_SPTR name;
6229 named_group *ng = cb->named_groups;
6230 uint32_t length = *(++pptr);
6231
6232 GETPLUSOFFSET(offset, pptr);
6233 name = cb->start_pattern + offset;
6234
6235 /* In the first pass, the names generated in the pre-pass are available,
6236 but the main name table has not yet been created. Scan the list of names
6237 generated in the pre-pass in order to get a number and whether or not
6238 this name is duplicated. If it is not duplicated, we can handle it as a
6239 numerical group. */
6240
6241 for (i = 0; i < cb->names_found; i++, ng++)
6242 {
6243 if (length == ng->length &&
6244 PRIV(strncmp)(name, ng->name, length) == 0)
6245 {
6246 if (!ng->isdup)
6247 {
6248 code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6249 PUT2(code, 2+LINK_SIZE, ng->number);
6250 if (ng->number > cb->top_backref) cb->top_backref = ng->number;
6251 skipunits = 1+IMM2_SIZE;
6252 goto GROUP_PROCESS_NOTE_EMPTY;
6253 }
6254 break; /* Found a duplicated name */
6255 }
6256 }
6257
6258 /* If the name was not found we have a bad reference, unless we are
6259 dealing with R<digits>, which is treated as a recursion test by number.
6260 */
6261
6262 if (i >= cb->names_found)
6263 {
6264 groupnumber = 0;
6265 if (meta == META_COND_RNUMBER)
6266 {
6267 for (i = 1; i < (int)length; i++)
6268 {
6269 groupnumber = groupnumber * 10 + name[i] - CHAR_0;
6270 if (groupnumber > MAX_GROUP_NUMBER)
6271 {
6272 *errorcodeptr = ERR61;
6273 cb->erroroffset = offset + i;
6274 return 0;
6275 }
6276 }
6277 }
6278
6279 if (meta != META_COND_RNUMBER || groupnumber > cb->bracount)
6280 {
6281 *errorcodeptr = ERR15;
6282 cb->erroroffset = offset;
6283 return 0;
6284 }
6285
6286 /* (?(Rdigits) is treated as a recursion test by number. A value of
6287 zero (which is the result of both (?(R) and (?(R0)) means "any", and is
6288 translated into RREF_ANY (which is 0xffff). */
6289
6290 if (groupnumber == 0) groupnumber = RREF_ANY;
6291 code[1+LINK_SIZE] = OP_RREF;
6292 PUT2(code, 2+LINK_SIZE, groupnumber);
6293 skipunits = 1+IMM2_SIZE;
6294 goto GROUP_PROCESS_NOTE_EMPTY;
6295 }
6296
6297 /* A duplicated name was found. Note that if an R<digits> name is found
6298 (META_COND_RNUMBER), it is a reference test, not a recursion test. */
6299
6300 code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6301
6302 /* We have a duplicated name. In the compile pass we have to search the
6303 main table in order to get the index and count values. */
6304
6305 count = 0; /* Values for first pass (avoids compiler warning) */
6306 index = 0;
6307 if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6308 &count, errorcodeptr, cb)) return 0;
6309
6310 /* Add one to the opcode to change CREF/RREF into DNCREF/DNRREF and
6311 insert appropriate data values. */
6312
6313 code[1+LINK_SIZE]++;
6314 skipunits = 1+2*IMM2_SIZE;
6315 PUT2(code, 2+LINK_SIZE, index);
6316 PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6317 }
6318 goto GROUP_PROCESS_NOTE_EMPTY;
6319
6320 /* The DEFINE condition is always false. Its internal groups may never
6321 be called, so matched_char must remain false, hence the jump to
6322 GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
6323
6324 case META_COND_DEFINE:
6325 bravalue = OP_COND;
6326 GETPLUSOFFSET(offset, pptr);
6327 code[1+LINK_SIZE] = OP_DEFINE;
6328 skipunits = 1;
6329 goto GROUP_PROCESS;
6330
6331 /* Conditional test of a group's being set. */
6332
6333 case META_COND_NUMBER:
6334 bravalue = OP_COND;
6335 GETPLUSOFFSET(offset, pptr);
6336 groupnumber = *(++pptr);
6337 if (groupnumber > cb->bracount)
6338 {
6339 *errorcodeptr = ERR15;
6340 cb->erroroffset = offset;
6341 return 0;
6342 }
6343 if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
6344 offset -= 2; /* Point at initial ( for too many branches error */
6345 code[1+LINK_SIZE] = OP_CREF;
6346 skipunits = 1+IMM2_SIZE;
6347 PUT2(code, 2+LINK_SIZE, groupnumber);
6348 goto GROUP_PROCESS_NOTE_EMPTY;
6349
6350 /* Test for the PCRE2 version. */
6351
6352 case META_COND_VERSION:
6353 bravalue = OP_COND;
6354 if (pptr[1] > 0)
6355 code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) ||
6356 (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))?
6357 OP_TRUE : OP_FALSE;
6358 else
6359 code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])?
6360 OP_TRUE : OP_FALSE;
6361 skipunits = 1;
6362 pptr += 3;
6363 goto GROUP_PROCESS_NOTE_EMPTY;
6364
6365 /* The condition is an assertion, possibly preceded by a callout. */
6366
6367 case META_COND_ASSERT:
6368 bravalue = OP_COND;
6369 goto GROUP_PROCESS_NOTE_EMPTY;
6370
6371
6372 /* ===================================================================*/
6373 /* Handle all kinds of nested bracketed groups. The non-capturing,
6374 non-conditional cases are here; others come to GROUP_PROCESS via goto. */
6375
6376 case META_LOOKAHEAD:
6377 bravalue = OP_ASSERT;
6378 cb->assert_depth += 1;
6379 goto GROUP_PROCESS;
6380
6381 case META_LOOKAHEAD_NA:
6382 bravalue = OP_ASSERT_NA;
6383 cb->assert_depth += 1;
6384 goto GROUP_PROCESS;
6385
6386 /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6387 thing to do, but Perl allows all assertions to be quantified, and when
6388 they contain capturing parentheses there may be a potential use for
6389 this feature. Not that that applies to a quantified (?!) but we allow
6390 it for uniformity. */
6391
6392 case META_LOOKAHEADNOT:
6393 if (pptr[1] == META_KET &&
6394 (pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY))
6395 {
6396 *code++ = OP_FAIL;
6397 pptr++;
6398 }
6399 else
6400 {
6401 bravalue = OP_ASSERT_NOT;
6402 cb->assert_depth += 1;
6403 goto GROUP_PROCESS;
6404 }
6405 break;
6406
6407 case META_LOOKBEHIND:
6408 bravalue = OP_ASSERTBACK;
6409 cb->assert_depth += 1;
6410 goto GROUP_PROCESS;
6411
6412 case META_LOOKBEHINDNOT:
6413 bravalue = OP_ASSERTBACK_NOT;
6414 cb->assert_depth += 1;
6415 goto GROUP_PROCESS;
6416
6417 case META_LOOKBEHIND_NA:
6418 bravalue = OP_ASSERTBACK_NA;
6419 cb->assert_depth += 1;
6420 goto GROUP_PROCESS;
6421
6422 case META_ATOMIC:
6423 bravalue = OP_ONCE;
6424 goto GROUP_PROCESS_NOTE_EMPTY;
6425
6426 case META_SCRIPT_RUN:
6427 bravalue = OP_SCRIPT_RUN;
6428 goto GROUP_PROCESS_NOTE_EMPTY;
6429
6430 case META_NOCAPTURE:
6431 bravalue = OP_BRA;
6432 /* Fall through */
6433
6434 /* Process nested bracketed regex. The nesting depth is maintained for the
6435 benefit of the stackguard function. The test for too deep nesting is now
6436 done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
6437 others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
6438 note of whether or not they may match an empty string. */
6439
6440 GROUP_PROCESS_NOTE_EMPTY:
6441 note_group_empty = TRUE;
6442
6443 GROUP_PROCESS:
6444 cb->parens_depth += 1;
6445 *code = bravalue;
6446 pptr++;
6447 tempcode = code;
6448 tempreqvary = cb->req_varyopt; /* Save value before group */
6449 length_prevgroup = 0; /* Initialize for pre-compile phase */
6450
6451 if ((group_return =
6452 compile_regex(
6453 options, /* The option state */
6454 &tempcode, /* Where to put code (updated) */
6455 &pptr, /* Input pointer (updated) */
6456 errorcodeptr, /* Where to put an error message */
6457 skipunits, /* Skip over bracket number */
6458 &subfirstcu, /* For possible first char */
6459 &subfirstcuflags,
6460 &subreqcu, /* For possible last char */
6461 &subreqcuflags,
6462 bcptr, /* Current branch chain */
6463 cb, /* Compile data block */
6464 (lengthptr == NULL)? NULL : /* Actual compile phase */
6465 &length_prevgroup /* Pre-compile phase */
6466 )) == 0)
6467 return 0; /* Error */
6468
6469 cb->parens_depth -= 1;
6470
6471 /* If that was a non-conditional significant group (not an assertion, not a
6472 DEFINE) that matches at least one character, then the current item matches
6473 a character. Conditionals are handled below. */
6474
6475 if (note_group_empty && bravalue != OP_COND && group_return > 0)
6476 matched_char = TRUE;
6477
6478 /* If we've just compiled an assertion, pop the assert depth. */
6479
6480 if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NA)
6481 cb->assert_depth -= 1;
6482
6483 /* At the end of compiling, code is still pointing to the start of the
6484 group, while tempcode has been updated to point past the end of the group.
6485 The parsed pattern pointer (pptr) is on the closing META_KET.
6486
6487 If this is a conditional bracket, check that there are no more than
6488 two branches in the group, or just one if it's a DEFINE group. We do this
6489 in the real compile phase, not in the pre-pass, where the whole group may
6490 not be available. */
6491
6492 if (bravalue == OP_COND && lengthptr == NULL)
6493 {
6494 PCRE2_UCHAR *tc = code;
6495 int condcount = 0;
6496
6497 do {
6498 condcount++;
6499 tc += GET(tc,1);
6500 }
6501 while (*tc != OP_KET);
6502
6503 /* A DEFINE group is never obeyed inline (the "condition" is always
6504 false). It must have only one branch. Having checked this, change the
6505 opcode to OP_FALSE. */
6506
6507 if (code[LINK_SIZE+1] == OP_DEFINE)
6508 {
6509 if (condcount > 1)
6510 {
6511 cb->erroroffset = offset;
6512 *errorcodeptr = ERR54;
6513 return 0;
6514 }
6515 code[LINK_SIZE+1] = OP_FALSE;
6516 bravalue = OP_DEFINE; /* A flag to suppress char handling below */
6517 }
6518
6519 /* A "normal" conditional group. If there is just one branch, we must not
6520 make use of its firstcu or reqcu, because this is equivalent to an
6521 empty second branch. Also, it may match an empty string. If there are two
6522 branches, this item must match a character if the group must. */
6523
6524 else
6525 {
6526 if (condcount > 2)
6527 {
6528 cb->erroroffset = offset;
6529 *errorcodeptr = ERR27;
6530 return 0;
6531 }
6532 if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
6533 else if (group_return > 0) matched_char = TRUE;
6534 }
6535 }
6536
6537 /* In the pre-compile phase, update the length by the length of the group,
6538 less the brackets at either end. Then reduce the compiled code to just a
6539 set of non-capturing brackets so that it doesn't use much memory if it is
6540 duplicated by a quantifier. */
6541
6542 if (lengthptr != NULL)
6543 {
6544 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
6545 {
6546 *errorcodeptr = ERR20;
6547 return 0;
6548 }
6549 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
6550 code++; /* This already contains bravalue */
6551 PUTINC(code, 0, 1 + LINK_SIZE);
6552 *code++ = OP_KET;
6553 PUTINC(code, 0, 1 + LINK_SIZE);
6554 break; /* No need to waste time with special character handling */
6555 }
6556
6557 /* Otherwise update the main code pointer to the end of the group. */
6558
6559 code = tempcode;
6560
6561 /* For a DEFINE group, required and first character settings are not
6562 relevant. */
6563
6564 if (bravalue == OP_DEFINE) break;
6565
6566 /* Handle updating of the required and first code units for other types of
6567 group. Update for normal brackets of all kinds, and conditions with two
6568 branches (see code above). If the bracket is followed by a quantifier with
6569 zero repeat, we have to back off. Hence the definition of zeroreqcu and
6570 zerofirstcu outside the main loop so that they can be accessed for the back
6571 off. */
6572
6573 zeroreqcu = reqcu;
6574 zeroreqcuflags = reqcuflags;
6575 zerofirstcu = firstcu;
6576 zerofirstcuflags = firstcuflags;
6577 groupsetfirstcu = FALSE;
6578
6579 if (bravalue >= OP_ONCE) /* Not an assertion */
6580 {
6581 /* If we have not yet set a firstcu in this branch, take it from the
6582 subpattern, remembering that it was set here so that a repeat of more
6583 than one can replicate it as reqcu if necessary. If the subpattern has
6584 no firstcu, set "none" for the whole branch. In both cases, a zero
6585 repeat forces firstcu to "none". */
6586
6587 if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
6588 {
6589 if (subfirstcuflags >= 0)
6590 {
6591 firstcu = subfirstcu;
6592 firstcuflags = subfirstcuflags;
6593 groupsetfirstcu = TRUE;
6594 }
6595 else firstcuflags = REQ_NONE;
6596 zerofirstcuflags = REQ_NONE;
6597 }
6598
6599 /* If firstcu was previously set, convert the subpattern's firstcu
6600 into reqcu if there wasn't one, using the vary flag that was in
6601 existence beforehand. */
6602
6603 else if (subfirstcuflags >= 0 && subreqcuflags < 0)
6604 {
6605 subreqcu = subfirstcu;
6606 subreqcuflags = subfirstcuflags | tempreqvary;
6607 }
6608
6609 /* If the subpattern set a required code unit (or set a first code unit
6610 that isn't really the first code unit - see above), set it. */
6611
6612 if (subreqcuflags >= 0)
6613 {
6614 reqcu = subreqcu;
6615 reqcuflags = subreqcuflags;
6616 }
6617 }
6618
6619 /* For a forward assertion, we take the reqcu, if set, provided that the
6620 group has also set a firstcu. This can be helpful if the pattern that
6621 follows the assertion doesn't set a different char. For example, it's
6622 useful for /(?=abcde).+/. We can't set firstcu for an assertion, however
6623 because it leads to an incorrect effect for patterns such as /(?=a)a.+/ when
6624 the "real" "a" would then become a reqcu instead of a firstcu. This is
6625 overcome by a scan at the end if there's no firstcu, looking for an
6626 asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
6627 we must only take the reqcu when the group also set a firstcu. Otherwise,
6628 in that example, 'X' ends up set for both. */
6629
6630 else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) &&
6631 subreqcuflags >= 0 && subfirstcuflags >= 0)
6632 {
6633 reqcu = subreqcu;
6634 reqcuflags = subreqcuflags;
6635 }
6636
6637 break; /* End of nested group handling */
6638
6639
6640 /* ===================================================================*/
6641 /* Handle named backreferences and recursions. */
6642
6643 case META_BACKREF_BYNAME:
6644 case META_RECURSE_BYNAME:
6645 {
6646 int count, index;
6647 PCRE2_SPTR name;
6648 BOOL is_dupname = FALSE;
6649 named_group *ng = cb->named_groups;
6650 uint32_t length = *(++pptr);
6651
6652 GETPLUSOFFSET(offset, pptr);
6653 name = cb->start_pattern + offset;
6654
6655 /* In the first pass, the names generated in the pre-pass are available,
6656 but the main name table has not yet been created. Scan the list of names
6657 generated in the pre-pass in order to get a number and whether or not
6658 this name is duplicated. */
6659
6660 groupnumber = 0;
6661 for (i = 0; i < cb->names_found; i++, ng++)
6662 {
6663 if (length == ng->length &&
6664 PRIV(strncmp)(name, ng->name, length) == 0)
6665 {
6666 is_dupname = ng->isdup;
6667 groupnumber = ng->number;
6668
6669 /* For a recursion, that's all that is needed. We can now go to
6670 the code that handles numerical recursion, applying it to the first
6671 group with the given name. */
6672
6673 if (meta == META_RECURSE_BYNAME)
6674 {
6675 meta_arg = groupnumber;
6676 goto HANDLE_NUMERICAL_RECURSION;
6677 }
6678
6679 /* For a back reference, update the back reference map and the
6680 maximum back reference. */
6681
6682 cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
6683 if (groupnumber > cb->top_backref)
6684 cb->top_backref = groupnumber;
6685 }
6686 }
6687
6688 /* If the name was not found we have a bad reference. */
6689
6690 if (groupnumber == 0)
6691 {
6692 *errorcodeptr = ERR15;
6693 cb->erroroffset = offset;
6694 return 0;
6695 }
6696
6697 /* If a back reference name is not duplicated, we can handle it as
6698 a numerical reference. */
6699
6700 if (!is_dupname)
6701 {
6702 meta_arg = groupnumber;
6703 goto HANDLE_SINGLE_REFERENCE;
6704 }
6705
6706 /* If a back reference name is duplicated, we generate a different
6707 opcode to a numerical back reference. In the second pass we must
6708 search for the index and count in the final name table. */
6709
6710 count = 0; /* Values for first pass (avoids compiler warning) */
6711 index = 0;
6712 if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6713 &count, errorcodeptr, cb)) return 0;
6714
6715 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6716 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
6717 PUT2INC(code, 0, index);
6718 PUT2INC(code, 0, count);
6719 }
6720 break;
6721
6722
6723 /* ===================================================================*/
6724 /* Handle a numerical callout. */
6725
6726 case META_CALLOUT_NUMBER:
6727 code[0] = OP_CALLOUT;
6728 PUT(code, 1, pptr[1]); /* Offset to next pattern item */
6729 PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */
6730 code[1 + 2*LINK_SIZE] = pptr[3];
6731 pptr += 3;
6732 code += PRIV(OP_lengths)[OP_CALLOUT];
6733 break;
6734
6735
6736 /* ===================================================================*/
6737 /* Handle a callout with a string argument. In the pre-pass we just compute
6738 the length without generating anything. The length in pptr[3] includes both
6739 delimiters; in the actual compile only the first one is copied, but a
6740 terminating zero is added. Any doubled delimiters within the string make
6741 this an overestimate, but it is not worth bothering about. */
6742
6743 case META_CALLOUT_STRING:
6744 if (lengthptr != NULL)
6745 {
6746 *lengthptr += pptr[3] + (1 + 4*LINK_SIZE);
6747 pptr += 3;
6748 SKIPOFFSET(pptr);
6749 }
6750
6751 /* In the real compile we can copy the string. The starting delimiter is
6752 included so that the client can discover it if they want. We also pass the
6753 start offset to help a script language give better error messages. */
6754
6755 else
6756 {
6757 PCRE2_SPTR pp;
6758 uint32_t delimiter;
6759 uint32_t length = pptr[3];
6760 PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
6761
6762 code[0] = OP_CALLOUT_STR;
6763 PUT(code, 1, pptr[1]); /* Offset to next pattern item */
6764 PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */
6765
6766 pptr += 3;
6767 GETPLUSOFFSET(offset, pptr); /* Offset to string in pattern */
6768 pp = cb->start_pattern + offset;
6769 delimiter = *callout_string++ = *pp++;
6770 if (delimiter == CHAR_LEFT_CURLY_BRACKET)
6771 delimiter = CHAR_RIGHT_CURLY_BRACKET;
6772 PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1)); /* One after delimiter */
6773
6774 /* The syntax of the pattern was checked in the parsing scan. The length
6775 includes both delimiters, but we have passed the opening one just above,
6776 so we reduce length before testing it. The test is for > 1 because we do
6777 not want to copy the final delimiter. This also ensures that pp[1] is
6778 accessible. */
6779
6780 while (--length > 1)
6781 {
6782 if (*pp == delimiter && pp[1] == delimiter)
6783 {
6784 *callout_string++ = delimiter;
6785 pp += 2;
6786 length--;
6787 }
6788 else *callout_string++ = *pp++;
6789 }
6790 *callout_string++ = CHAR_NUL;
6791
6792 /* Set the length of the entire item, then advance to its end. */
6793
6794 PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code));
6795 code = callout_string;
6796 }
6797 break;
6798
6799
6800 /* ===================================================================*/
6801 /* Handle repetition. The different types are all sorted out in the parsing
6802 pass. */
6803
6804 case META_MINMAX_PLUS:
6805 case META_MINMAX_QUERY:
6806 case META_MINMAX:
6807 repeat_min = *(++pptr);
6808 repeat_max = *(++pptr);
6809 goto REPEAT;
6810
6811 case META_ASTERISK:
6812 case META_ASTERISK_PLUS:
6813 case META_ASTERISK_QUERY:
6814 repeat_min = 0;
6815 repeat_max = REPEAT_UNLIMITED;
6816 goto REPEAT;
6817
6818 case META_PLUS:
6819 case META_PLUS_PLUS:
6820 case META_PLUS_QUERY:
6821 repeat_min = 1;
6822 repeat_max = REPEAT_UNLIMITED;
6823 goto REPEAT;
6824
6825 case META_QUERY:
6826 case META_QUERY_PLUS:
6827 case META_QUERY_QUERY:
6828 repeat_min = 0;
6829 repeat_max = 1;
6830
6831 REPEAT:
6832 if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
6833
6834 /* Remember whether this is a variable length repeat, and default to
6835 single-char opcodes. */
6836
6837 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
6838 op_type = 0;
6839
6840 /* Adjust first and required code units for a zero repeat. */
6841
6842 if (repeat_min == 0)
6843 {
6844 firstcu = zerofirstcu;
6845 firstcuflags = zerofirstcuflags;
6846 reqcu = zeroreqcu;
6847 reqcuflags = zeroreqcuflags;
6848 }
6849
6850 /* Note the greediness and possessiveness. */
6851
6852 switch (meta)
6853 {
6854 case META_MINMAX_PLUS:
6855 case META_ASTERISK_PLUS:
6856 case META_PLUS_PLUS:
6857 case META_QUERY_PLUS:
6858 repeat_type = 0; /* Force greedy */
6859 possessive_quantifier = TRUE;
6860 break;
6861
6862 case META_MINMAX_QUERY:
6863 case META_ASTERISK_QUERY:
6864 case META_PLUS_QUERY:
6865 case META_QUERY_QUERY:
6866 repeat_type = greedy_non_default;
6867 possessive_quantifier = FALSE;
6868 break;
6869
6870 default:
6871 repeat_type = greedy_default;
6872 possessive_quantifier = FALSE;
6873 break;
6874 }
6875
6876 /* Save start of previous item, in case we have to move it up in order to
6877 insert something before it, and remember what it was. */
6878
6879 tempcode = previous;
6880 op_previous = *previous;
6881
6882 /* Now handle repetition for the different types of item. If the repeat
6883 minimum and the repeat maximum are both 1, we can ignore the quantifier for
6884 non-parenthesized items, as they have only one alternative. For anything in
6885 parentheses, we must not ignore if {1} is possessive. */
6886
6887 switch (op_previous)
6888 {
6889 /* If previous was a character or negated character match, abolish the
6890 item and generate a repeat item instead. If a char item has a minimum of
6891 more than one, ensure that it is set in reqcu - it might not be if a
6892 sequence such as x{3} is the first thing in a branch because the x will
6893 have gone into firstcu instead. */
6894
6895 case OP_CHAR:
6896 case OP_CHARI:
6897 case OP_NOT:
6898 case OP_NOTI:
6899 if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
6900 op_type = chartypeoffset[op_previous - OP_CHAR];
6901
6902 /* Deal with UTF characters that take up more than one code unit. */
6903
6904 #ifdef MAYBE_UTF_MULTI
6905 if (utf && NOT_FIRSTCU(code[-1]))
6906 {
6907 PCRE2_UCHAR *lastchar = code - 1;
6908 BACKCHAR(lastchar);
6909 mclength = (uint32_t)(code - lastchar); /* Length of UTF character */
6910 memcpy(mcbuffer, lastchar, CU2BYTES(mclength)); /* Save the char */
6911 }
6912 else
6913 #endif /* MAYBE_UTF_MULTI */
6914
6915 /* Handle the case of a single code unit - either with no UTF support, or
6916 with UTF disabled, or for a single-code-unit UTF character. */
6917 {
6918 mcbuffer[0] = code[-1];
6919 mclength = 1;
6920 if (op_previous <= OP_CHARI && repeat_min > 1)
6921 {
6922 reqcu = mcbuffer[0];
6923 reqcuflags = req_caseopt | cb->req_varyopt;
6924 }
6925 }
6926 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
6927
6928 /* If previous was a character class or a back reference, we put the
6929 repeat stuff after it, but just skip the item if the repeat was {0,0}. */
6930
6931 #ifdef SUPPORT_WIDE_CHARS
6932 case OP_XCLASS:
6933 #endif
6934 case OP_CLASS:
6935 case OP_NCLASS:
6936 case OP_REF:
6937 case OP_REFI:
6938 case OP_DNREF:
6939 case OP_DNREFI:
6940
6941 if (repeat_max == 0)
6942 {
6943 code = previous;
6944 goto END_REPEAT;
6945 }
6946 if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
6947
6948 if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)
6949 *code++ = OP_CRSTAR + repeat_type;
6950 else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED)
6951 *code++ = OP_CRPLUS + repeat_type;
6952 else if (repeat_min == 0 && repeat_max == 1)
6953 *code++ = OP_CRQUERY + repeat_type;
6954 else
6955 {
6956 *code++ = OP_CRRANGE + repeat_type;
6957 PUT2INC(code, 0, repeat_min);
6958 if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0; /* 2-byte encoding for max */
6959 PUT2INC(code, 0, repeat_max);
6960 }
6961 break;
6962
6963 /* If previous is OP_FAIL, it was generated by an empty class []
6964 (PCRE2_ALLOW_EMPTY_CLASS is set). The other ways in which OP_FAIL can be
6965 generated, that is by (*FAIL) or (?!), disallow a quantifier at parse
6966 time. We can just ignore this repeat. */
6967
6968 case OP_FAIL:
6969 goto END_REPEAT;
6970
6971 /* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets
6972 because pcre2_match() could not handle backtracking into recursively
6973 called groups. Now that this backtracking is available, we no longer need
6974 to do this. However, we still need to replicate recursions as we do for
6975 groups so as to have independent backtracking points. We can replicate
6976 for the minimum number of repeats directly. For optional repeats we now
6977 wrap the recursion in OP_BRA brackets and make use of the bracket
6978 repetition. */
6979
6980 case OP_RECURSE:
6981 if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
6982 goto END_REPEAT;
6983
6984 /* Generate unwrapped repeats for a non-zero minimum, except when the
6985 minimum is 1 and the maximum unlimited, because that can be handled with
6986 OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the
6987 minimum, we just need to generate the appropriate additional copies.
6988 Otherwise we need to generate one more, to simulate the situation when
6989 the minimum is zero. */
6990
6991 if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED))
6992 {
6993 int replicate = repeat_min;
6994 if (repeat_min == repeat_max) replicate--;
6995
6996 /* In the pre-compile phase, we don't actually do the replication. We
6997 just adjust the length as if we had. Do some paranoid checks for
6998 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
6999 integer type when available, otherwise double. */
7000
7001 if (lengthptr != NULL)
7002 {
7003 PCRE2_SIZE delta = replicate*(1 + LINK_SIZE);
7004 if ((INT64_OR_DOUBLE)replicate*
7005 (INT64_OR_DOUBLE)(1 + LINK_SIZE) >
7006 (INT64_OR_DOUBLE)INT_MAX ||
7007 OFLOW_MAX - *lengthptr < delta)
7008 {
7009 *errorcodeptr = ERR20;
7010 return 0;
7011 }
7012 *lengthptr += delta;
7013 }
7014
7015 else for (i = 0; i < replicate; i++)
7016 {
7017 memcpy(code, previous, CU2BYTES(1 + LINK_SIZE));
7018 previous = code;
7019 code += 1 + LINK_SIZE;
7020 }
7021
7022 /* If the number of repeats is fixed, we are done. Otherwise, adjust
7023 the counts and fall through. */
7024
7025 if (repeat_min == repeat_max) break;
7026 if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7027 repeat_min = 0;
7028 }
7029
7030 /* Wrap the recursion call in OP_BRA brackets. */
7031
7032 (void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE));
7033 op_previous = *previous = OP_BRA;
7034 PUT(previous, 1, 2 + 2*LINK_SIZE);
7035 previous[2 + 2*LINK_SIZE] = OP_KET;
7036 PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
7037 code += 2 + 2 * LINK_SIZE;
7038 length_prevgroup = 3 + 3*LINK_SIZE;
7039 group_return = -1; /* Set "may match empty string" */
7040
7041 /* Now treat as a repeated OP_BRA. */
7042 /* Fall through */
7043
7044 /* If previous was a bracket group, we may have to replicate it in
7045 certain cases. Note that at this point we can encounter only the "basic"
7046 bracket opcodes such as BRA and CBRA, as this is the place where they get
7047 converted into the more special varieties such as BRAPOS and SBRA.
7048 Originally, PCRE did not allow repetition of assertions, but now it does,
7049 for Perl compatibility. */
7050
7051 case OP_ASSERT:
7052 case OP_ASSERT_NOT:
7053 case OP_ASSERT_NA:
7054 case OP_ASSERTBACK:
7055 case OP_ASSERTBACK_NOT:
7056 case OP_ASSERTBACK_NA:
7057 case OP_ONCE:
7058 case OP_SCRIPT_RUN:
7059 case OP_BRA:
7060 case OP_CBRA:
7061 case OP_COND:
7062 {
7063 int len = (int)(code - previous);
7064 PCRE2_UCHAR *bralink = NULL;
7065 PCRE2_UCHAR *brazeroptr = NULL;
7066
7067 if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7068 goto END_REPEAT;
7069
7070 /* Repeating a DEFINE group (or any group where the condition is always
7071 FALSE and there is only one branch) is pointless, but Perl allows the
7072 syntax, so we just ignore the repeat. */
7073
7074 if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
7075 previous[GET(previous, 1)] != OP_ALT)
7076 goto END_REPEAT;
7077
7078 /* Perl allows all assertions to be quantified, and when they contain
7079 capturing parentheses and/or are optional there are potential uses for
7080 this feature. PCRE2 used to force the maximum quantifier to 1 on the
7081 invalid grounds that further repetition was never useful. This was
7082 always a bit pointless, since an assertion could be wrapped with a
7083 repeated group to achieve the effect. General repetition is now
7084 permitted, but if the maximum is unlimited it is set to one more than
7085 the minimum. */
7086
7087 if (op_previous < OP_ONCE) /* Assertion */
7088 {
7089 if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1;
7090 }
7091
7092 /* The case of a zero minimum is special because of the need to stick
7093 OP_BRAZERO in front of it, and because the group appears once in the
7094 data, whereas in other cases it appears the minimum number of times. For
7095 this reason, it is simplest to treat this case separately, as otherwise
7096 the code gets far too messy. There are several special subcases when the
7097 minimum is zero. */
7098
7099 if (repeat_min == 0)
7100 {
7101 /* If the maximum is also zero, we used to just omit the group from
7102 the output altogether, like this:
7103
7104 ** if (repeat_max == 0)
7105 ** {
7106 ** code = previous;
7107 ** goto END_REPEAT;
7108 ** }
7109
7110 However, that fails when a group or a subgroup within it is
7111 referenced as a subroutine from elsewhere in the pattern, so now we
7112 stick in OP_SKIPZERO in front of it so that it is skipped on
7113 execution. As we don't have a list of which groups are referenced, we
7114 cannot do this selectively.
7115
7116 If the maximum is 1 or unlimited, we just have to stick in the
7117 BRAZERO and do no more at this point. */
7118
7119 if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED)
7120 {
7121 (void)memmove(previous + 1, previous, CU2BYTES(len));
7122 code++;
7123 if (repeat_max == 0)
7124 {
7125 *previous++ = OP_SKIPZERO;
7126 goto END_REPEAT;
7127 }
7128 brazeroptr = previous; /* Save for possessive optimizing */
7129 *previous++ = OP_BRAZERO + repeat_type;
7130 }
7131
7132 /* If the maximum is greater than 1 and limited, we have to replicate
7133 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
7134 The first one has to be handled carefully because it's the original
7135 copy, which has to be moved up. The remainder can be handled by code
7136 that is common with the non-zero minimum case below. We have to
7137 adjust the value of repeat_max, since one less copy is required. */
7138
7139 else
7140 {
7141 int linkoffset;
7142 (void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
7143 code += 2 + LINK_SIZE;
7144 *previous++ = OP_BRAZERO + repeat_type;
7145 *previous++ = OP_BRA;
7146
7147 /* We chain together the bracket link offset fields that have to be
7148 filled in later when the ends of the brackets are reached. */
7149
7150 linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink);
7151 bralink = previous;
7152 PUTINC(previous, 0, linkoffset);
7153 }
7154
7155 if (repeat_max != REPEAT_UNLIMITED) repeat_max--;
7156 }
7157
7158 /* If the minimum is greater than zero, replicate the group as many
7159 times as necessary, and adjust the maximum to the number of subsequent
7160 copies that we need. */
7161
7162 else
7163 {
7164 if (repeat_min > 1)
7165 {
7166 /* In the pre-compile phase, we don't actually do the replication.
7167 We just adjust the length as if we had. Do some paranoid checks for
7168 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
7169 integer type when available, otherwise double. */
7170
7171 if (lengthptr != NULL)
7172 {
7173 PCRE2_SIZE delta = (repeat_min - 1)*length_prevgroup;
7174 if ((INT64_OR_DOUBLE)(repeat_min - 1)*
7175 (INT64_OR_DOUBLE)length_prevgroup >
7176 (INT64_OR_DOUBLE)INT_MAX ||
7177 OFLOW_MAX - *lengthptr < delta)
7178 {
7179 *errorcodeptr = ERR20;
7180 return 0;
7181 }
7182 *lengthptr += delta;
7183 }
7184
7185 /* This is compiling for real. If there is a set first code unit
7186 for the group, and we have not yet set a "required code unit", set
7187 it. */
7188
7189 else
7190 {
7191 if (groupsetfirstcu && reqcuflags < 0)
7192 {
7193 reqcu = firstcu;
7194 reqcuflags = firstcuflags;
7195 }
7196 for (i = 1; (uint32_t)i < repeat_min; i++)
7197 {
7198 memcpy(code, previous, CU2BYTES(len));
7199 code += len;
7200 }
7201 }
7202 }
7203
7204 if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7205 }
7206
7207 /* This code is common to both the zero and non-zero minimum cases. If
7208 the maximum is limited, it replicates the group in a nested fashion,
7209 remembering the bracket starts on a stack. In the case of a zero
7210 minimum, the first one was set up above. In all cases the repeat_max
7211 now specifies the number of additional copies needed. Again, we must
7212 remember to replicate entries on the forward reference list. */
7213
7214 if (repeat_max != REPEAT_UNLIMITED)
7215 {
7216 /* In the pre-compile phase, we don't actually do the replication. We
7217 just adjust the length as if we had. For each repetition we must add
7218 1 to the length for BRAZERO and for all but the last repetition we
7219 must add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some
7220 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type
7221 is a 64-bit integer type when available, otherwise double. */
7222
7223 if (lengthptr != NULL && repeat_max > 0)
7224 {
7225 PCRE2_SIZE delta = repeat_max*(length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
7226 2 - 2*LINK_SIZE; /* Last one doesn't nest */
7227 if ((INT64_OR_DOUBLE)repeat_max *
7228 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
7229 > (INT64_OR_DOUBLE)INT_MAX ||
7230 OFLOW_MAX - *lengthptr < delta)
7231 {
7232 *errorcodeptr = ERR20;
7233 return 0;
7234 }
7235 *lengthptr += delta;
7236 }
7237
7238 /* This is compiling for real */
7239
7240 else for (i = repeat_max - 1; i >= 0; i--)
7241 {
7242 *code++ = OP_BRAZERO + repeat_type;
7243
7244 /* All but the final copy start a new nesting, maintaining the
7245 chain of brackets outstanding. */
7246
7247 if (i != 0)
7248 {
7249 int linkoffset;
7250 *code++ = OP_BRA;
7251 linkoffset = (bralink == NULL)? 0 : (int)(code - bralink);
7252 bralink = code;
7253 PUTINC(code, 0, linkoffset);
7254 }
7255
7256 memcpy(code, previous, CU2BYTES(len));
7257 code += len;
7258 }
7259
7260 /* Now chain through the pending brackets, and fill in their length
7261 fields (which are holding the chain links pro tem). */
7262
7263 while (bralink != NULL)
7264 {
7265 int oldlinkoffset;
7266 int linkoffset = (int)(code - bralink + 1);
7267 PCRE2_UCHAR *bra = code - linkoffset;
7268 oldlinkoffset = GET(bra, 1);
7269 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
7270 *code++ = OP_KET;
7271 PUTINC(code, 0, linkoffset);
7272 PUT(bra, 1, linkoffset);
7273 }
7274 }
7275
7276 /* If the maximum is unlimited, set a repeater in the final copy. For
7277 SCRIPT_RUN and ONCE brackets, that's all we need to do. However,
7278 possessively repeated ONCE brackets can be converted into non-capturing
7279 brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this
7280 saves having to deal with possessive ONCEs specially.
7281
7282 Otherwise, when we are doing the actual compile phase, check to see
7283 whether this group is one that could match an empty string. If so,
7284 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
7285 that runtime checking can be done. [This check is also applied to ONCE
7286 and SCRIPT_RUN groups at runtime, but in a different way.]
7287
7288 Then, if the quantifier was possessive and the bracket is not a
7289 conditional, we convert the BRA code to the POS form, and the KET code
7290 to KETRPOS. (It turns out to be convenient at runtime to detect this
7291 kind of subpattern at both the start and at the end.) The use of
7292 special opcodes makes it possible to reduce greatly the stack usage in
7293 pcre2_match(). If the group is preceded by OP_BRAZERO, convert this to
7294 OP_BRAPOSZERO.
7295
7296 Then, if the minimum number of matches is 1 or 0, cancel the possessive
7297 flag so that the default action below, of wrapping everything inside
7298 atomic brackets, does not happen. When the minimum is greater than 1,
7299 there will be earlier copies of the group, and so we still have to wrap
7300 the whole thing. */
7301
7302 else
7303 {
7304 PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
7305 PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
7306
7307 /* Convert possessive ONCE brackets to non-capturing */
7308
7309 if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
7310
7311 /* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need
7312 to do is to set the KET. */
7313
7314 if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN)
7315 *ketcode = OP_KETRMAX + repeat_type;
7316
7317 /* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs
7318 (which have been converted to non-capturing above). */
7319
7320 else
7321 {
7322 /* In the compile phase, adjust the opcode if the group can match
7323 an empty string. For a conditional group with only one branch, the
7324 value of group_return will not show "could be empty", so we must
7325 check that separately. */
7326
7327 if (lengthptr == NULL)
7328 {
7329 if (group_return < 0) *bracode += OP_SBRA - OP_BRA;
7330 if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
7331 *bracode = OP_SCOND;
7332 }
7333
7334 /* Handle possessive quantifiers. */
7335
7336 if (possessive_quantifier)
7337 {
7338 /* For COND brackets, we wrap the whole thing in a possessively
7339 repeated non-capturing bracket, because we have not invented POS
7340 versions of the COND opcodes. */
7341
7342 if (*bracode == OP_COND || *bracode == OP_SCOND)
7343 {
7344 int nlen = (int)(code - bracode);
7345 (void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
7346 code += 1 + LINK_SIZE;
7347 nlen += 1 + LINK_SIZE;
7348 *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
7349 *code++ = OP_KETRPOS;
7350 PUTINC(code, 0, nlen);
7351 PUT(bracode, 1, nlen);
7352 }
7353
7354 /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
7355
7356 else
7357 {
7358 *bracode += 1; /* Switch to xxxPOS opcodes */
7359 *ketcode = OP_KETRPOS;
7360 }
7361
7362 /* If the minimum is zero, mark it as possessive, then unset the
7363 possessive flag when the minimum is 0 or 1. */
7364
7365 if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
7366 if (repeat_min < 2) possessive_quantifier = FALSE;
7367 }
7368
7369 /* Non-possessive quantifier */
7370
7371 else *ketcode = OP_KETRMAX + repeat_type;
7372 }
7373 }
7374 }
7375 break;
7376
7377 /* If previous was a character type match (\d or similar), abolish it and
7378 create a suitable repeat item. The code is shared with single-character
7379 repeats by setting op_type to add a suitable offset into repeat_type.
7380 Note that the Unicode property types will be present only when
7381 SUPPORT_UNICODE is defined, but we don't wrap the little bits of code
7382 here because it just makes it horribly messy. */
7383
7384 default:
7385 if (op_previous >= OP_EODN) /* Not a character type - internal error */
7386 {
7387 *errorcodeptr = ERR10;
7388 return 0;
7389 }
7390 else
7391 {
7392 int prop_type, prop_value;
7393 PCRE2_UCHAR *oldcode;
7394
7395 if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7396
7397 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
7398 mclength = 0; /* Not a character */
7399
7400 if (op_previous == OP_PROP || op_previous == OP_NOTPROP)
7401 {
7402 prop_type = previous[1];
7403 prop_value = previous[2];
7404 }
7405 else
7406 {
7407 /* Come here from just above with a character in mcbuffer/mclength. */
7408 OUTPUT_SINGLE_REPEAT:
7409 prop_type = prop_value = -1;
7410 }
7411
7412 /* At this point, if prop_type == prop_value == -1 we either have a
7413 character in mcbuffer when mclength is greater than zero, or we have
7414 mclength zero, in which case there is a non-property character type in
7415 op_previous. If prop_type/value are not negative, we have a property
7416 character type in op_previous. */
7417
7418 oldcode = code; /* Save where we were */
7419 code = previous; /* Usually overwrite previous item */
7420
7421 /* If the maximum is zero then the minimum must also be zero; Perl allows
7422 this case, so we do too - by simply omitting the item altogether. */
7423
7424 if (repeat_max == 0) goto END_REPEAT;
7425
7426 /* Combine the op_type with the repeat_type */
7427
7428 repeat_type += op_type;
7429
7430 /* A minimum of zero is handled either as the special case * or ?, or as
7431 an UPTO, with the maximum given. */
7432
7433 if (repeat_min == 0)
7434 {
7435 if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;
7436 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
7437 else
7438 {
7439 *code++ = OP_UPTO + repeat_type;
7440 PUT2INC(code, 0, repeat_max);
7441 }
7442 }
7443
7444 /* A repeat minimum of 1 is optimized into some special cases. If the
7445 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
7446 left in place and, if the maximum is greater than 1, we use OP_UPTO with
7447 one less than the maximum. */
7448
7449 else if (repeat_min == 1)
7450 {
7451 if (repeat_max == REPEAT_UNLIMITED)
7452 *code++ = OP_PLUS + repeat_type;
7453 else
7454 {
7455 code = oldcode; /* Leave previous item in place */
7456 if (repeat_max == 1) goto END_REPEAT;
7457 *code++ = OP_UPTO + repeat_type;
7458 PUT2INC(code, 0, repeat_max - 1);
7459 }
7460 }
7461
7462 /* The case {n,n} is just an EXACT, while the general case {n,m} is
7463 handled as an EXACT followed by an UPTO or STAR or QUERY. */
7464
7465 else
7466 {
7467 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
7468 PUT2INC(code, 0, repeat_min);
7469
7470 /* Unless repeat_max equals repeat_min, fill in the data for EXACT,
7471 and then generate the second opcode. For a repeated Unicode property
7472 match, there are two extra values that define the required property,
7473 and mclength is set zero to indicate this. */
7474
7475 if (repeat_max != repeat_min)
7476 {
7477 if (mclength > 0)
7478 {
7479 memcpy(code, mcbuffer, CU2BYTES(mclength));
7480 code += mclength;
7481 }
7482 else
7483 {
7484 *code++ = op_previous;
7485 if (prop_type >= 0)
7486 {
7487 *code++ = prop_type;
7488 *code++ = prop_value;
7489 }
7490 }
7491
7492 /* Now set up the following opcode */
7493
7494 if (repeat_max == REPEAT_UNLIMITED)
7495 *code++ = OP_STAR + repeat_type;
7496 else
7497 {
7498 repeat_max -= repeat_min;
7499 if (repeat_max == 1)
7500 {
7501 *code++ = OP_QUERY + repeat_type;
7502 }
7503 else
7504 {
7505 *code++ = OP_UPTO + repeat_type;
7506 PUT2INC(code, 0, repeat_max);
7507 }
7508 }
7509 }
7510 }
7511
7512 /* Fill in the character or character type for the final opcode. */
7513
7514 if (mclength > 0)
7515 {
7516 memcpy(code, mcbuffer, CU2BYTES(mclength));
7517 code += mclength;
7518 }
7519 else
7520 {
7521 *code++ = op_previous;
7522 if (prop_type >= 0)
7523 {
7524 *code++ = prop_type;
7525 *code++ = prop_value;
7526 }
7527 }
7528 }
7529 break;
7530 } /* End of switch on different op_previous values */
7531
7532
7533 /* If the character following a repeat is '+', possessive_quantifier is
7534 TRUE. For some opcodes, there are special alternative opcodes for this
7535 case. For anything else, we wrap the entire repeated item inside OP_ONCE
7536 brackets. Logically, the '+' notation is just syntactic sugar, taken from
7537 Sun's Java package, but the special opcodes can optimize it.
7538
7539 Some (but not all) possessively repeated subpatterns have already been
7540 completely handled in the code just above. For them, possessive_quantifier
7541 is always FALSE at this stage. Note that the repeated item starts at
7542 tempcode, not at previous, which might be the first part of a string whose
7543 (former) last char we repeated. */
7544
7545 if (possessive_quantifier)
7546 {
7547 int len;
7548
7549 /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
7550 However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
7551 {5,}, or {5,10}). We skip over an EXACT item; if the length of what
7552 remains is greater than zero, there's a further opcode that can be
7553 handled. If not, do nothing, leaving the EXACT alone. */
7554
7555 switch(*tempcode)
7556 {
7557 case OP_TYPEEXACT:
7558 tempcode += PRIV(OP_lengths)[*tempcode] +
7559 ((tempcode[1 + IMM2_SIZE] == OP_PROP
7560 || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
7561 break;
7562
7563 /* CHAR opcodes are used for exacts whose count is 1. */
7564
7565 case OP_CHAR:
7566 case OP_CHARI:
7567 case OP_NOT:
7568 case OP_NOTI:
7569 case OP_EXACT:
7570 case OP_EXACTI:
7571 case OP_NOTEXACT:
7572 case OP_NOTEXACTI:
7573 tempcode += PRIV(OP_lengths)[*tempcode];
7574 #ifdef SUPPORT_UNICODE
7575 if (utf && HAS_EXTRALEN(tempcode[-1]))
7576 tempcode += GET_EXTRALEN(tempcode[-1]);
7577 #endif
7578 break;
7579
7580 /* For the class opcodes, the repeat operator appears at the end;
7581 adjust tempcode to point to it. */
7582
7583 case OP_CLASS:
7584 case OP_NCLASS:
7585 tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
7586 break;
7587
7588 #ifdef SUPPORT_WIDE_CHARS
7589 case OP_XCLASS:
7590 tempcode += GET(tempcode, 1);
7591 break;
7592 #endif
7593 }
7594
7595 /* If tempcode is equal to code (which points to the end of the repeated
7596 item), it means we have skipped an EXACT item but there is no following
7597 QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
7598 all other cases, tempcode will be pointing to the repeat opcode, and will
7599 be less than code, so the value of len will be greater than 0. */
7600
7601 len = (int)(code - tempcode);
7602 if (len > 0)
7603 {
7604 unsigned int repcode = *tempcode;
7605
7606 /* There is a table for possessifying opcodes, all of which are less
7607 than OP_CALLOUT. A zero entry means there is no possessified version.
7608 */
7609
7610 if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
7611 *tempcode = opcode_possessify[repcode];
7612
7613 /* For opcode without a special possessified version, wrap the item in
7614 ONCE brackets. */
7615
7616 else
7617 {
7618 (void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
7619 code += 1 + LINK_SIZE;
7620 len += 1 + LINK_SIZE;
7621 tempcode[0] = OP_ONCE;
7622 *code++ = OP_KET;
7623 PUTINC(code, 0, len);
7624 PUT(tempcode, 1, len);
7625 }
7626 }
7627 }
7628
7629 /* We set the "follows varying string" flag for subsequently encountered
7630 reqcus if it isn't already set and we have just passed a varying length
7631 item. */
7632
7633 END_REPEAT:
7634 cb->req_varyopt |= reqvary;
7635 break;
7636
7637
7638 /* ===================================================================*/
7639 /* Handle a 32-bit data character with a value greater than META_END. */
7640
7641 case META_BIGVALUE:
7642 pptr++;
7643 goto NORMAL_CHAR;
7644
7645
7646 /* ===============================================================*/
7647 /* Handle a back reference by number, which is the meta argument. The
7648 pattern offsets for back references to group numbers less than 10 are held
7649 in a special vector, to avoid using more than two parsed pattern elements
7650 in 64-bit environments. We only need the offset to the first occurrence,
7651 because if that doesn't fail, subsequent ones will also be OK. */
7652
7653 case META_BACKREF:
7654 if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg];
7655 else GETPLUSOFFSET(offset, pptr);
7656
7657 if (meta_arg > cb->bracount)
7658 {
7659 cb->erroroffset = offset;
7660 *errorcodeptr = ERR15; /* Non-existent subpattern */
7661 return 0;
7662 }
7663
7664 /* Come here from named backref handling when the reference is to a
7665 single group (that is, not to a duplicated name). The back reference
7666 data will have already been updated. We must disable firstcu if not
7667 set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'
7668 later. */
7669
7670 HANDLE_SINGLE_REFERENCE:
7671 if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
7672 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
7673 PUT2INC(code, 0, meta_arg);
7674
7675 /* Update the map of back references, and keep the highest one. We
7676 could do this in parse_regex() for numerical back references, but not
7677 for named back references, because we don't know the numbers to which
7678 named back references refer. So we do it all in this function. */
7679
7680 cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;
7681 if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
7682 break;
7683
7684
7685 /* ===============================================================*/
7686 /* Handle recursion by inserting the number of the called group (which is
7687 the meta argument) after OP_RECURSE. At the end of compiling the pattern is
7688 scanned and these numbers are replaced by offsets within the pattern. It is
7689 done like this to avoid problems with forward references and adjusting
7690 offsets when groups are duplicated and moved (as discovered in previous
7691 implementations). Note that a recursion does not have a set first
7692 character. */
7693
7694 case META_RECURSE:
7695 GETPLUSOFFSET(offset, pptr);
7696 if (meta_arg > cb->bracount)
7697 {
7698 cb->erroroffset = offset;
7699 *errorcodeptr = ERR15; /* Non-existent subpattern */
7700 return 0;
7701 }
7702 HANDLE_NUMERICAL_RECURSION:
7703 *code = OP_RECURSE;
7704 PUT(code, 1, meta_arg);
7705 code += 1 + LINK_SIZE;
7706 groupsetfirstcu = FALSE;
7707 cb->had_recurse = TRUE;
7708 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7709 zerofirstcu = firstcu;
7710 zerofirstcuflags = firstcuflags;
7711 break;
7712
7713
7714 /* ===============================================================*/
7715 /* Handle capturing parentheses; the number is the meta argument. */
7716
7717 case META_CAPTURE:
7718 bravalue = OP_CBRA;
7719 skipunits = IMM2_SIZE;
7720 PUT2(code, 1+LINK_SIZE, meta_arg);
7721 cb->lastcapture = meta_arg;
7722 goto GROUP_PROCESS_NOTE_EMPTY;
7723
7724
7725 /* ===============================================================*/
7726 /* Handle escape sequence items. For ones like \d, the ESC_values are
7727 arranged to be the same as the corresponding OP_values in the default case
7728 when PCRE2_UCP is not set (which is the only case in which they will appear
7729 here).
7730
7731 Note: \Q and \E are never seen here, as they were dealt with in
7732 parse_pattern(). Neither are numerical back references or recursions, which
7733 were turned into META_BACKREF or META_RECURSE items, respectively. \k and
7734 \g, when followed by names, are turned into META_BACKREF_BYNAME or
7735 META_RECURSE_BYNAME. */
7736
7737 case META_ESCAPE:
7738
7739 /* We can test for escape sequences that consume a character because their
7740 values lie between ESC_b and ESC_Z; this may have to change if any new ones
7741 are ever created. For these sequences, we disable the setting of a first
7742 character if it hasn't already been set. */
7743
7744 if (meta_arg > ESC_b && meta_arg < ESC_Z)
7745 {
7746 matched_char = TRUE;
7747 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7748 }
7749
7750 /* Set values to reset to if this is followed by a zero repeat. */
7751
7752 zerofirstcu = firstcu;
7753 zerofirstcuflags = firstcuflags;
7754 zeroreqcu = reqcu;
7755 zeroreqcuflags = reqcuflags;
7756
7757 /* If Unicode is not supported, \P and \p are not allowed and are
7758 faulted at parse time, so will never appear here. */
7759
7760 #ifdef SUPPORT_UNICODE
7761 if (meta_arg == ESC_P || meta_arg == ESC_p)
7762 {
7763 uint32_t ptype = *(++pptr) >> 16;
7764 uint32_t pdata = *pptr & 0xffff;
7765
7766 /* The special case of \p{Any} is compiled to OP_ALLANY so as to benefit
7767 from the auto-anchoring code. */
7768
7769 if (meta_arg == ESC_p && ptype == PT_ANY)
7770 {
7771 *code++ = OP_ALLANY;
7772 }
7773 else
7774 {
7775 *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
7776 *code++ = ptype;
7777 *code++ = pdata;
7778 }
7779 break; /* End META_ESCAPE */
7780 }
7781 #endif
7782
7783 /* For the rest (including \X when Unicode is supported - if not it's
7784 faulted at parse time), the OP value is the escape value when PCRE2_UCP is
7785 not set; if it is set, these escapes do not show up here because they are
7786 converted into Unicode property tests in parse_regex(). Note that \b and \B
7787 do a one-character lookbehind, and \A also behaves as if it does. */
7788
7789 if (meta_arg == ESC_C) cb->external_flags |= PCRE2_HASBKC; /* Record */
7790 if ((meta_arg == ESC_b || meta_arg == ESC_B || meta_arg == ESC_A) &&
7791 cb->max_lookbehind == 0)
7792 cb->max_lookbehind = 1;
7793
7794 /* In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY
7795 instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds. */
7796
7797 #if PCRE2_CODE_UNIT_WIDTH == 32
7798 *code++ = (meta_arg == ESC_C)? OP_ALLANY : meta_arg;
7799 #else
7800 *code++ = (!utf && meta_arg == ESC_C)? OP_ALLANY : meta_arg;
7801 #endif
7802 break; /* End META_ESCAPE */
7803
7804
7805 /* ===================================================================*/
7806 /* Handle an unrecognized meta value. A parsed pattern value less than
7807 META_END is a literal. Otherwise we have a problem. */
7808
7809 default:
7810 if (meta >= META_END)
7811 {
7812 #ifdef DEBUG_SHOW_PARSED
7813 fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x\n", *pptr);
7814 #endif
7815 *errorcodeptr = ERR89; /* Internal error - unrecognized. */
7816 return 0;
7817 }
7818
7819 /* Handle a literal character. We come here by goto in the case of a
7820 32-bit, non-UTF character whose value is greater than META_END. */
7821
7822 NORMAL_CHAR:
7823 meta = *pptr; /* Get the full 32 bits */
7824 NORMAL_CHAR_SET: /* Character is already in meta */
7825 matched_char = TRUE;
7826
7827 /* For caseless UTF or UCP mode, check whether this character has more than
7828 one other case. If so, generate a special OP_PROP item instead of OP_CHARI.
7829 */
7830
7831 #ifdef SUPPORT_UNICODE
7832 if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
7833 {
7834 uint32_t caseset = UCD_CASESET(meta);
7835 if (caseset != 0)
7836 {
7837 *code++ = OP_PROP;
7838 *code++ = PT_CLIST;
7839 *code++ = caseset;
7840 if (firstcuflags == REQ_UNSET)
7841 firstcuflags = zerofirstcuflags = REQ_NONE;
7842 break; /* End handling this meta item */
7843 }
7844 }
7845 #endif
7846
7847 /* Caseful matches, or caseless and not one of the multicase characters. We
7848 come here by goto in the case of a positive class that contains only
7849 case-partners of a character with just two cases; matched_char has already
7850 been set TRUE and options fudged if necessary. */
7851
7852 CLASS_CASELESS_CHAR:
7853
7854 /* Get the character's code units into mcbuffer, with the length in
7855 mclength. When not in UTF mode, the length is always 1. */
7856
7857 #ifdef SUPPORT_UNICODE
7858 if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
7859 #endif
7860 {
7861 mclength = 1;
7862 mcbuffer[0] = meta;
7863 }
7864
7865 /* Generate the appropriate code */
7866
7867 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
7868 memcpy(code, mcbuffer, CU2BYTES(mclength));
7869 code += mclength;
7870
7871 /* Remember if \r or \n were seen */
7872
7873 if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
7874 cb->external_flags |= PCRE2_HASCRORLF;
7875
7876 /* Set the first and required code units appropriately. If no previous
7877 first code unit, set it from this character, but revert to none on a zero
7878 repeat. Otherwise, leave the firstcu value alone, and don't change it on
7879 a zero repeat. */
7880
7881 if (firstcuflags == REQ_UNSET)
7882 {
7883 zerofirstcuflags = REQ_NONE;
7884 zeroreqcu = reqcu;
7885 zeroreqcuflags = reqcuflags;
7886
7887 /* If the character is more than one code unit long, we can set a single
7888 firstcu only if it is not to be matched caselessly. Multiple possible
7889 starting code units may be picked up later in the studying code. */
7890
7891 if (mclength == 1 || req_caseopt == 0)
7892 {
7893 firstcu = mcbuffer[0];
7894 firstcuflags = req_caseopt;
7895 if (mclength != 1)
7896 {
7897 reqcu = code[-1];
7898 reqcuflags = cb->req_varyopt;
7899 }
7900 }
7901 else firstcuflags = reqcuflags = REQ_NONE;
7902 }
7903
7904 /* firstcu was previously set; we can set reqcu only if the length is
7905 1 or the matching is caseful. */
7906
7907 else
7908 {
7909 zerofirstcu = firstcu;
7910 zerofirstcuflags = firstcuflags;
7911 zeroreqcu = reqcu;
7912 zeroreqcuflags = reqcuflags;
7913 if (mclength == 1 || req_caseopt == 0)
7914 {
7915 reqcu = code[-1];
7916 reqcuflags = req_caseopt | cb->req_varyopt;
7917 }
7918 }
7919
7920 /* If caselessness was temporarily instated, reset it. */
7921
7922 if (reset_caseful)
7923 {
7924 options &= ~PCRE2_CASELESS;
7925 req_caseopt = 0;
7926 reset_caseful = FALSE;
7927 }
7928
7929 break; /* End literal character handling */
7930 } /* End of big switch */
7931 } /* End of big loop */
7932
7933 /* Control never reaches here. */
7934 }
7935
7936
7937
7938 /*************************************************
7939 * Compile regex: a sequence of alternatives *
7940 *************************************************/
7941
7942 /* On entry, pptr is pointing past the bracket meta, but on return it points to
7943 the closing bracket or META_END. The code variable is pointing at the code unit
7944 into which the BRA operator has been stored. This function is used during the
7945 pre-compile phase when we are trying to find out the amount of memory needed,
7946 as well as during the real compile phase. The value of lengthptr distinguishes
7947 the two phases.
7948
7949 Arguments:
7950 options option bits, including any changes for this subpattern
7951 codeptr -> the address of the current code pointer
7952 pptrptr -> the address of the current parsed pattern pointer
7953 errorcodeptr -> pointer to error code variable
7954 skipunits skip this many code units at start (for brackets and OP_COND)
7955 firstcuptr place to put the first required code unit
7956 firstcuflagsptr place to put the first code unit flags, or a negative number
7957 reqcuptr place to put the last required code unit
7958 reqcuflagsptr place to put the last required code unit flags, or a negative number
7959 bcptr pointer to the chain of currently open branches
7960 cb points to the data block with tables pointers etc.
7961 lengthptr NULL during the real compile phase
7962 points to length accumulator during pre-compile phase
7963
7964 Returns: 0 There has been an error
7965 +1 Success, this group must match at least one character
7966 -1 Success, this group may match an empty string
7967 */
7968
7969 static int
compile_regex(uint32_t options,PCRE2_UCHAR ** codeptr,uint32_t ** pptrptr,int * errorcodeptr,uint32_t skipunits,uint32_t * firstcuptr,int32_t * firstcuflagsptr,uint32_t * reqcuptr,int32_t * reqcuflagsptr,branch_chain * bcptr,compile_block * cb,PCRE2_SIZE * lengthptr)7970 compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
7971 int *errorcodeptr, uint32_t skipunits, uint32_t *firstcuptr,
7972 int32_t *firstcuflagsptr, uint32_t *reqcuptr,int32_t *reqcuflagsptr,
7973 branch_chain *bcptr, compile_block *cb, PCRE2_SIZE *lengthptr)
7974 {
7975 PCRE2_UCHAR *code = *codeptr;
7976 PCRE2_UCHAR *last_branch = code;
7977 PCRE2_UCHAR *start_bracket = code;
7978 BOOL lookbehind;
7979 open_capitem capitem;
7980 int capnumber = 0;
7981 int okreturn = 1;
7982 uint32_t *pptr = *pptrptr;
7983 uint32_t firstcu, reqcu;
7984 uint32_t lookbehindlength;
7985 int32_t firstcuflags, reqcuflags;
7986 uint32_t branchfirstcu, branchreqcu;
7987 int32_t branchfirstcuflags, branchreqcuflags;
7988 PCRE2_SIZE length;
7989 branch_chain bc;
7990
7991 /* If set, call the external function that checks for stack availability. */
7992
7993 if (cb->cx->stack_guard != NULL &&
7994 cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
7995 {
7996 *errorcodeptr= ERR33;
7997 return 0;
7998 }
7999
8000 /* Miscellaneous initialization */
8001
8002 bc.outer = bcptr;
8003 bc.current_branch = code;
8004
8005 firstcu = reqcu = 0;
8006 firstcuflags = reqcuflags = REQ_UNSET;
8007
8008 /* Accumulate the length for use in the pre-compile phase. Start with the
8009 length of the BRA and KET and any extra code units that are required at the
8010 beginning. We accumulate in a local variable to save frequent testing of
8011 lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
8012 start and end of each alternative, because compiled items are discarded during
8013 the pre-compile phase so that the workspace is not exceeded. */
8014
8015 length = 2 + 2*LINK_SIZE + skipunits;
8016
8017 /* Remember if this is a lookbehind assertion, and if it is, save its length
8018 and skip over the pattern offset. */
8019
8020 lookbehind = *code == OP_ASSERTBACK ||
8021 *code == OP_ASSERTBACK_NOT ||
8022 *code == OP_ASSERTBACK_NA;
8023
8024 if (lookbehind)
8025 {
8026 lookbehindlength = META_DATA(pptr[-1]);
8027 pptr += SIZEOFFSET;
8028 }
8029 else lookbehindlength = 0;
8030
8031 /* If this is a capturing subpattern, add to the chain of open capturing items
8032 so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
8033 need be tested here; changing this opcode to one of its variants, e.g.
8034 OP_SCBRAPOS, happens later, after the group has been compiled. */
8035
8036 if (*code == OP_CBRA)
8037 {
8038 capnumber = GET2(code, 1 + LINK_SIZE);
8039 capitem.number = capnumber;
8040 capitem.next = cb->open_caps;
8041 capitem.assert_depth = cb->assert_depth;
8042 cb->open_caps = &capitem;
8043 }
8044
8045 /* Offset is set zero to mark that this bracket is still open */
8046
8047 PUT(code, 1, 0);
8048 code += 1 + LINK_SIZE + skipunits;
8049
8050 /* Loop for each alternative branch */
8051
8052 for (;;)
8053 {
8054 int branch_return;
8055
8056 /* Insert OP_REVERSE if this is as lookbehind assertion. */
8057
8058 if (lookbehind && lookbehindlength > 0)
8059 {
8060 *code++ = OP_REVERSE;
8061 PUTINC(code, 0, lookbehindlength);
8062 length += 1 + LINK_SIZE;
8063 }
8064
8065 /* Now compile the branch; in the pre-compile phase its length gets added
8066 into the length. */
8067
8068 if ((branch_return =
8069 compile_branch(&options, &code, &pptr, errorcodeptr, &branchfirstcu,
8070 &branchfirstcuflags, &branchreqcu, &branchreqcuflags, &bc,
8071 cb, (lengthptr == NULL)? NULL : &length)) == 0)
8072 return 0;
8073
8074 /* If a branch can match an empty string, so can the whole group. */
8075
8076 if (branch_return < 0) okreturn = -1;
8077
8078 /* In the real compile phase, there is some post-processing to be done. */
8079
8080 if (lengthptr == NULL)
8081 {
8082 /* If this is the first branch, the firstcu and reqcu values for the
8083 branch become the values for the regex. */
8084
8085 if (*last_branch != OP_ALT)
8086 {
8087 firstcu = branchfirstcu;
8088 firstcuflags = branchfirstcuflags;
8089 reqcu = branchreqcu;
8090 reqcuflags = branchreqcuflags;
8091 }
8092
8093 /* If this is not the first branch, the first char and reqcu have to
8094 match the values from all the previous branches, except that if the
8095 previous value for reqcu didn't have REQ_VARY set, it can still match,
8096 and we set REQ_VARY for the group from this branch's value. */
8097
8098 else
8099 {
8100 /* If we previously had a firstcu, but it doesn't match the new branch,
8101 we have to abandon the firstcu for the regex, but if there was
8102 previously no reqcu, it takes on the value of the old firstcu. */
8103
8104 if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
8105 {
8106 if (firstcuflags >= 0)
8107 {
8108 if (reqcuflags < 0)
8109 {
8110 reqcu = firstcu;
8111 reqcuflags = firstcuflags;
8112 }
8113 }
8114 firstcuflags = REQ_NONE;
8115 }
8116
8117 /* If we (now or from before) have no firstcu, a firstcu from the
8118 branch becomes a reqcu if there isn't a branch reqcu. */
8119
8120 if (firstcuflags < 0 && branchfirstcuflags >= 0 &&
8121 branchreqcuflags < 0)
8122 {
8123 branchreqcu = branchfirstcu;
8124 branchreqcuflags = branchfirstcuflags;
8125 }
8126
8127 /* Now ensure that the reqcus match */
8128
8129 if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
8130 reqcu != branchreqcu)
8131 reqcuflags = REQ_NONE;
8132 else
8133 {
8134 reqcu = branchreqcu;
8135 reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY if present */
8136 }
8137 }
8138 }
8139
8140 /* Handle reaching the end of the expression, either ')' or end of pattern.
8141 In the real compile phase, go back through the alternative branches and
8142 reverse the chain of offsets, with the field in the BRA item now becoming an
8143 offset to the first alternative. If there are no alternatives, it points to
8144 the end of the group. The length in the terminating ket is always the length
8145 of the whole bracketed item. Return leaving the pointer at the terminating
8146 char. */
8147
8148 if (META_CODE(*pptr) != META_ALT)
8149 {
8150 if (lengthptr == NULL)
8151 {
8152 PCRE2_SIZE branch_length = code - last_branch;
8153 do
8154 {
8155 PCRE2_SIZE prev_length = GET(last_branch, 1);
8156 PUT(last_branch, 1, branch_length);
8157 branch_length = prev_length;
8158 last_branch -= branch_length;
8159 }
8160 while (branch_length > 0);
8161 }
8162
8163 /* Fill in the ket */
8164
8165 *code = OP_KET;
8166 PUT(code, 1, (int)(code - start_bracket));
8167 code += 1 + LINK_SIZE;
8168
8169 /* If it was a capturing subpattern, remove the block from the chain. */
8170
8171 if (capnumber > 0) cb->open_caps = cb->open_caps->next;
8172
8173 /* Set values to pass back */
8174
8175 *codeptr = code;
8176 *pptrptr = pptr;
8177 *firstcuptr = firstcu;
8178 *firstcuflagsptr = firstcuflags;
8179 *reqcuptr = reqcu;
8180 *reqcuflagsptr = reqcuflags;
8181 if (lengthptr != NULL)
8182 {
8183 if (OFLOW_MAX - *lengthptr < length)
8184 {
8185 *errorcodeptr = ERR20;
8186 return 0;
8187 }
8188 *lengthptr += length;
8189 }
8190 return okreturn;
8191 }
8192
8193 /* Another branch follows. In the pre-compile phase, we can move the code
8194 pointer back to where it was for the start of the first branch. (That is,
8195 pretend that each branch is the only one.)
8196
8197 In the real compile phase, insert an ALT node. Its length field points back
8198 to the previous branch while the bracket remains open. At the end the chain
8199 is reversed. It's done like this so that the start of the bracket has a
8200 zero offset until it is closed, making it possible to detect recursion. */
8201
8202 if (lengthptr != NULL)
8203 {
8204 code = *codeptr + 1 + LINK_SIZE + skipunits;
8205 length += 1 + LINK_SIZE;
8206 }
8207 else
8208 {
8209 *code = OP_ALT;
8210 PUT(code, 1, (int)(code - last_branch));
8211 bc.current_branch = last_branch = code;
8212 code += 1 + LINK_SIZE;
8213 }
8214
8215 /* Set the lookbehind length (if not in a lookbehind the value will be zero)
8216 and then advance past the vertical bar. */
8217
8218 lookbehindlength = META_DATA(*pptr);
8219 pptr++;
8220 }
8221 /* Control never reaches here */
8222 }
8223
8224
8225
8226 /*************************************************
8227 * Check for anchored pattern *
8228 *************************************************/
8229
8230 /* Try to find out if this is an anchored regular expression. Consider each
8231 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8232 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8233 it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8234 be found, because ^ generates OP_CIRCM in that mode.
8235
8236 We can also consider a regex to be anchored if OP_SOM starts all its branches.
8237 This is the code for \G, which means "match at start of match position, taking
8238 into account the match offset".
8239
8240 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8241 because that will try the rest of the pattern at all possible matching points,
8242 so there is no point trying again.... er ....
8243
8244 .... except when the .* appears inside capturing parentheses, and there is a
8245 subsequent back reference to those parentheses. We haven't enough information
8246 to catch that case precisely.
8247
8248 At first, the best we could do was to detect when .* was in capturing brackets
8249 and the highest back reference was greater than or equal to that level.
8250 However, by keeping a bitmap of the first 31 back references, we can catch some
8251 of the more common cases more precisely.
8252
8253 ... A second exception is when the .* appears inside an atomic group, because
8254 this prevents the number of characters it matches from being adjusted.
8255
8256 Arguments:
8257 code points to start of the compiled pattern
8258 bracket_map a bitmap of which brackets we are inside while testing; this
8259 handles up to substring 31; after that we just have to take
8260 the less precise approach
8261 cb points to the compile data block
8262 atomcount atomic group level
8263 inassert TRUE if in an assertion
8264
8265 Returns: TRUE or FALSE
8266 */
8267
8268 static BOOL
is_anchored(PCRE2_SPTR code,unsigned int bracket_map,compile_block * cb,int atomcount,BOOL inassert)8269 is_anchored(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8270 int atomcount, BOOL inassert)
8271 {
8272 do {
8273 PCRE2_SPTR scode = first_significant_code(
8274 code + PRIV(OP_lengths)[*code], FALSE);
8275 int op = *scode;
8276
8277 /* Non-capturing brackets */
8278
8279 if (op == OP_BRA || op == OP_BRAPOS ||
8280 op == OP_SBRA || op == OP_SBRAPOS)
8281 {
8282 if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8283 return FALSE;
8284 }
8285
8286 /* Capturing brackets */
8287
8288 else if (op == OP_CBRA || op == OP_CBRAPOS ||
8289 op == OP_SCBRA || op == OP_SCBRAPOS)
8290 {
8291 int n = GET2(scode, 1+LINK_SIZE);
8292 int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8293 if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE;
8294 }
8295
8296 /* Positive forward assertion */
8297
8298 else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8299 {
8300 if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8301 }
8302
8303 /* Condition. If there is no second branch, it can't be anchored. */
8304
8305 else if (op == OP_COND || op == OP_SCOND)
8306 {
8307 if (scode[GET(scode,1)] != OP_ALT) return FALSE;
8308 if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8309 return FALSE;
8310 }
8311
8312 /* Atomic groups */
8313
8314 else if (op == OP_ONCE)
8315 {
8316 if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert))
8317 return FALSE;
8318 }
8319
8320 /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8321 it isn't in brackets that are or may be referenced or inside an atomic
8322 group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
8323 because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
8324 with the subject "aab", which matches "b", i.e. not at the start of a line.
8325 There is also an option that disables auto-anchoring. */
8326
8327 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8328 op == OP_TYPEPOSSTAR))
8329 {
8330 if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
8331 atomcount > 0 || cb->had_pruneorskip || inassert ||
8332 (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8333 return FALSE;
8334 }
8335
8336 /* Check for explicit anchoring */
8337
8338 else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8339
8340 code += GET(code, 1);
8341 }
8342 while (*code == OP_ALT); /* Loop for each alternative */
8343 return TRUE;
8344 }
8345
8346
8347
8348 /*************************************************
8349 * Check for starting with ^ or .* *
8350 *************************************************/
8351
8352 /* This is called to find out if every branch starts with ^ or .* so that
8353 "first char" processing can be done to speed things up in multiline
8354 matching and for non-DOTALL patterns that start with .* (which must start at
8355 the beginning or after \n). As in the case of is_anchored() (see above), we
8356 have to take account of back references to capturing brackets that contain .*
8357 because in that case we can't make the assumption. Also, the appearance of .*
8358 inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
8359 or *SKIP does not count, because once again the assumption no longer holds.
8360
8361 Arguments:
8362 code points to start of the compiled pattern or a group
8363 bracket_map a bitmap of which brackets we are inside while testing; this
8364 handles up to substring 31; after that we just have to take
8365 the less precise approach
8366 cb points to the compile data
8367 atomcount atomic group level
8368 inassert TRUE if in an assertion
8369
8370 Returns: TRUE or FALSE
8371 */
8372
8373 static BOOL
is_startline(PCRE2_SPTR code,unsigned int bracket_map,compile_block * cb,int atomcount,BOOL inassert)8374 is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8375 int atomcount, BOOL inassert)
8376 {
8377 do {
8378 PCRE2_SPTR scode = first_significant_code(
8379 code + PRIV(OP_lengths)[*code], FALSE);
8380 int op = *scode;
8381
8382 /* If we are at the start of a conditional assertion group, *both* the
8383 conditional assertion *and* what follows the condition must satisfy the test
8384 for start of line. Other kinds of condition fail. Note that there may be an
8385 auto-callout at the start of a condition. */
8386
8387 if (op == OP_COND)
8388 {
8389 scode += 1 + LINK_SIZE;
8390
8391 if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8392 else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
8393
8394 switch (*scode)
8395 {
8396 case OP_CREF:
8397 case OP_DNCREF:
8398 case OP_RREF:
8399 case OP_DNRREF:
8400 case OP_FAIL:
8401 case OP_FALSE:
8402 case OP_TRUE:
8403 return FALSE;
8404
8405 default: /* Assertion */
8406 if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8407 do scode += GET(scode, 1); while (*scode == OP_ALT);
8408 scode += 1 + LINK_SIZE;
8409 break;
8410 }
8411 scode = first_significant_code(scode, FALSE);
8412 op = *scode;
8413 }
8414
8415 /* Non-capturing brackets */
8416
8417 if (op == OP_BRA || op == OP_BRAPOS ||
8418 op == OP_SBRA || op == OP_SBRAPOS)
8419 {
8420 if (!is_startline(scode, bracket_map, cb, atomcount, inassert))
8421 return FALSE;
8422 }
8423
8424 /* Capturing brackets */
8425
8426 else if (op == OP_CBRA || op == OP_CBRAPOS ||
8427 op == OP_SCBRA || op == OP_SCBRAPOS)
8428 {
8429 int n = GET2(scode, 1+LINK_SIZE);
8430 int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8431 if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
8432 }
8433
8434 /* Positive forward assertions */
8435
8436 else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8437 {
8438 if (!is_startline(scode, bracket_map, cb, atomcount, TRUE))
8439 return FALSE;
8440 }
8441
8442 /* Atomic brackets */
8443
8444 else if (op == OP_ONCE)
8445 {
8446 if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert))
8447 return FALSE;
8448 }
8449
8450 /* .* means "start at start or after \n" if it isn't in atomic brackets or
8451 brackets that may be referenced or an assertion, and as long as the pattern
8452 does not contain *PRUNE or *SKIP, because these break the feature. Consider,
8453 for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
8454 i.e. not at the start of a line. There is also an option that disables this
8455 optimization. */
8456
8457 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8458 {
8459 if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
8460 atomcount > 0 || cb->had_pruneorskip || inassert ||
8461 (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8462 return FALSE;
8463 }
8464
8465 /* Check for explicit circumflex; anything else gives a FALSE result. Note
8466 in particular that this includes atomic brackets OP_ONCE because the number
8467 of characters matched by .* cannot be adjusted inside them. */
8468
8469 else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8470
8471 /* Move on to the next alternative */
8472
8473 code += GET(code, 1);
8474 }
8475 while (*code == OP_ALT); /* Loop for each alternative */
8476 return TRUE;
8477 }
8478
8479
8480
8481 /*************************************************
8482 * Scan compiled regex for recursion reference *
8483 *************************************************/
8484
8485 /* This function scans through a compiled pattern until it finds an instance of
8486 OP_RECURSE.
8487
8488 Arguments:
8489 code points to start of expression
8490 utf TRUE in UTF mode
8491
8492 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
8493 */
8494
8495 static PCRE2_SPTR
find_recurse(PCRE2_SPTR code,BOOL utf)8496 find_recurse(PCRE2_SPTR code, BOOL utf)
8497 {
8498 for (;;)
8499 {
8500 PCRE2_UCHAR c = *code;
8501 if (c == OP_END) return NULL;
8502 if (c == OP_RECURSE) return code;
8503
8504 /* XCLASS is used for classes that cannot be represented just by a bit map.
8505 This includes negated single high-valued characters. CALLOUT_STR is used for
8506 callouts with string arguments. In both cases the length in the table is
8507 zero; the actual length is stored in the compiled code. */
8508
8509 if (c == OP_XCLASS) code += GET(code, 1);
8510 else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
8511
8512 /* Otherwise, we can get the item's length from the table, except that for
8513 repeated character types, we have to test for \p and \P, which have an extra
8514 two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,
8515 we must add in its length. */
8516
8517 else
8518 {
8519 switch(c)
8520 {
8521 case OP_TYPESTAR:
8522 case OP_TYPEMINSTAR:
8523 case OP_TYPEPLUS:
8524 case OP_TYPEMINPLUS:
8525 case OP_TYPEQUERY:
8526 case OP_TYPEMINQUERY:
8527 case OP_TYPEPOSSTAR:
8528 case OP_TYPEPOSPLUS:
8529 case OP_TYPEPOSQUERY:
8530 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
8531 break;
8532
8533 case OP_TYPEPOSUPTO:
8534 case OP_TYPEUPTO:
8535 case OP_TYPEMINUPTO:
8536 case OP_TYPEEXACT:
8537 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
8538 code += 2;
8539 break;
8540
8541 case OP_MARK:
8542 case OP_COMMIT_ARG:
8543 case OP_PRUNE_ARG:
8544 case OP_SKIP_ARG:
8545 case OP_THEN_ARG:
8546 code += code[1];
8547 break;
8548 }
8549
8550 /* Add in the fixed length from the table */
8551
8552 code += PRIV(OP_lengths)[c];
8553
8554 /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
8555 be followed by a multi-unit character. The length in the table is a
8556 minimum, so we have to arrange to skip the extra units. */
8557
8558 #ifdef MAYBE_UTF_MULTI
8559 if (utf) switch(c)
8560 {
8561 case OP_CHAR:
8562 case OP_CHARI:
8563 case OP_NOT:
8564 case OP_NOTI:
8565 case OP_EXACT:
8566 case OP_EXACTI:
8567 case OP_NOTEXACT:
8568 case OP_NOTEXACTI:
8569 case OP_UPTO:
8570 case OP_UPTOI:
8571 case OP_NOTUPTO:
8572 case OP_NOTUPTOI:
8573 case OP_MINUPTO:
8574 case OP_MINUPTOI:
8575 case OP_NOTMINUPTO:
8576 case OP_NOTMINUPTOI:
8577 case OP_POSUPTO:
8578 case OP_POSUPTOI:
8579 case OP_NOTPOSUPTO:
8580 case OP_NOTPOSUPTOI:
8581 case OP_STAR:
8582 case OP_STARI:
8583 case OP_NOTSTAR:
8584 case OP_NOTSTARI:
8585 case OP_MINSTAR:
8586 case OP_MINSTARI:
8587 case OP_NOTMINSTAR:
8588 case OP_NOTMINSTARI:
8589 case OP_POSSTAR:
8590 case OP_POSSTARI:
8591 case OP_NOTPOSSTAR:
8592 case OP_NOTPOSSTARI:
8593 case OP_PLUS:
8594 case OP_PLUSI:
8595 case OP_NOTPLUS:
8596 case OP_NOTPLUSI:
8597 case OP_MINPLUS:
8598 case OP_MINPLUSI:
8599 case OP_NOTMINPLUS:
8600 case OP_NOTMINPLUSI:
8601 case OP_POSPLUS:
8602 case OP_POSPLUSI:
8603 case OP_NOTPOSPLUS:
8604 case OP_NOTPOSPLUSI:
8605 case OP_QUERY:
8606 case OP_QUERYI:
8607 case OP_NOTQUERY:
8608 case OP_NOTQUERYI:
8609 case OP_MINQUERY:
8610 case OP_MINQUERYI:
8611 case OP_NOTMINQUERY:
8612 case OP_NOTMINQUERYI:
8613 case OP_POSQUERY:
8614 case OP_POSQUERYI:
8615 case OP_NOTPOSQUERY:
8616 case OP_NOTPOSQUERYI:
8617 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
8618 break;
8619 }
8620 #else
8621 (void)(utf); /* Keep compiler happy by referencing function argument */
8622 #endif /* MAYBE_UTF_MULTI */
8623 }
8624 }
8625 }
8626
8627
8628
8629 /*************************************************
8630 * Check for asserted fixed first code unit *
8631 *************************************************/
8632
8633 /* During compilation, the "first code unit" settings from forward assertions
8634 are discarded, because they can cause conflicts with actual literals that
8635 follow. However, if we end up without a first code unit setting for an
8636 unanchored pattern, it is worth scanning the regex to see if there is an
8637 initial asserted first code unit. If all branches start with the same asserted
8638 code unit, or with a non-conditional bracket all of whose alternatives start
8639 with the same asserted code unit (recurse ad lib), then we return that code
8640 unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
8641 REQ_NONE in the flags.
8642
8643 Arguments:
8644 code points to start of compiled pattern
8645 flags points to the first code unit flags
8646 inassert non-zero if in an assertion
8647
8648 Returns: the fixed first code unit, or 0 with REQ_NONE in flags
8649 */
8650
8651 static uint32_t
find_firstassertedcu(PCRE2_SPTR code,int32_t * flags,uint32_t inassert)8652 find_firstassertedcu(PCRE2_SPTR code, int32_t *flags, uint32_t inassert)
8653 {
8654 uint32_t c = 0;
8655 int cflags = REQ_NONE;
8656
8657 *flags = REQ_NONE;
8658 do {
8659 uint32_t d;
8660 int dflags;
8661 int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
8662 *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
8663 PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
8664 PCRE2_UCHAR op = *scode;
8665
8666 switch(op)
8667 {
8668 default:
8669 return 0;
8670
8671 case OP_BRA:
8672 case OP_BRAPOS:
8673 case OP_CBRA:
8674 case OP_SCBRA:
8675 case OP_CBRAPOS:
8676 case OP_SCBRAPOS:
8677 case OP_ASSERT:
8678 case OP_ASSERT_NA:
8679 case OP_ONCE:
8680 case OP_SCRIPT_RUN:
8681 d = find_firstassertedcu(scode, &dflags, inassert +
8682 ((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0));
8683 if (dflags < 0)
8684 return 0;
8685 if (cflags < 0) { c = d; cflags = dflags; }
8686 else if (c != d || cflags != dflags) return 0;
8687 break;
8688
8689 case OP_EXACT:
8690 scode += IMM2_SIZE;
8691 /* Fall through */
8692
8693 case OP_CHAR:
8694 case OP_PLUS:
8695 case OP_MINPLUS:
8696 case OP_POSPLUS:
8697 if (inassert == 0) return 0;
8698 if (cflags < 0) { c = scode[1]; cflags = 0; }
8699 else if (c != scode[1]) return 0;
8700 break;
8701
8702 case OP_EXACTI:
8703 scode += IMM2_SIZE;
8704 /* Fall through */
8705
8706 case OP_CHARI:
8707 case OP_PLUSI:
8708 case OP_MINPLUSI:
8709 case OP_POSPLUSI:
8710 if (inassert == 0) return 0;
8711
8712 /* If the character is more than one code unit long, we cannot set its
8713 first code unit when matching caselessly. Later scanning may pick up
8714 multiple code units. */
8715
8716 #ifdef SUPPORT_UNICODE
8717 #if PCRE2_CODE_UNIT_WIDTH == 8
8718 if (scode[1] >= 0x80) return 0;
8719 #elif PCRE2_CODE_UNIT_WIDTH == 16
8720 if (scode[1] >= 0xd800 && scode[1] <= 0xdfff) return 0;
8721 #endif
8722 #endif
8723
8724 if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
8725 else if (c != scode[1]) return 0;
8726 break;
8727 }
8728
8729 code += GET(code, 1);
8730 }
8731 while (*code == OP_ALT);
8732
8733 *flags = cflags;
8734 return c;
8735 }
8736
8737
8738
8739 /*************************************************
8740 * Add an entry to the name/number table *
8741 *************************************************/
8742
8743 /* This function is called between compiling passes to add an entry to the
8744 name/number table, maintaining alphabetical order. Checking for permitted
8745 and forbidden duplicates has already been done.
8746
8747 Arguments:
8748 cb the compile data block
8749 name the name to add
8750 length the length of the name
8751 groupno the group number
8752 tablecount the count of names in the table so far
8753
8754 Returns: nothing
8755 */
8756
8757 static void
add_name_to_table(compile_block * cb,PCRE2_SPTR name,int length,unsigned int groupno,uint32_t tablecount)8758 add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length,
8759 unsigned int groupno, uint32_t tablecount)
8760 {
8761 uint32_t i;
8762 PCRE2_UCHAR *slot = cb->name_table;
8763
8764 for (i = 0; i < tablecount; i++)
8765 {
8766 int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length));
8767 if (crc == 0 && slot[IMM2_SIZE+length] != 0)
8768 crc = -1; /* Current name is a substring */
8769
8770 /* Make space in the table and break the loop for an earlier name. For a
8771 duplicate or later name, carry on. We do this for duplicates so that in the
8772 simple case (when ?(| is not used) they are in order of their numbers. In all
8773 cases they are in the order in which they appear in the pattern. */
8774
8775 if (crc < 0)
8776 {
8777 (void)memmove(slot + cb->name_entry_size, slot,
8778 CU2BYTES((tablecount - i) * cb->name_entry_size));
8779 break;
8780 }
8781
8782 /* Continue the loop for a later or duplicate name */
8783
8784 slot += cb->name_entry_size;
8785 }
8786
8787 PUT2(slot, 0, groupno);
8788 memcpy(slot + IMM2_SIZE, name, CU2BYTES(length));
8789
8790 /* Add a terminating zero and fill the rest of the slot with zeroes so that
8791 the memory is all initialized. Otherwise valgrind moans about uninitialized
8792 memory when saving serialized compiled patterns. */
8793
8794 memset(slot + IMM2_SIZE + length, 0,
8795 CU2BYTES(cb->name_entry_size - length - IMM2_SIZE));
8796 }
8797
8798
8799
8800 /*************************************************
8801 * Skip in parsed pattern *
8802 *************************************************/
8803
8804 /* This function is called to skip parts of the parsed pattern when finding the
8805 length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
8806 the end of the branch, it is called to skip over an internal lookaround or
8807 (DEFINE) group, and it is also called to skip to the end of a class, during
8808 which it will never encounter nested groups (but there's no need to have
8809 special code for that).
8810
8811 When called to find the end of a branch or group, pptr must point to the first
8812 meta code inside the branch, not the branch-starting code. In other cases it
8813 can point to the item that causes the function to be called.
8814
8815 Arguments:
8816 pptr current pointer to skip from
8817 skiptype PSKIP_CLASS when skipping to end of class
8818 PSKIP_ALT when META_ALT ends the skip
8819 PSKIP_KET when only META_KET ends the skip
8820
8821 Returns: new value of pptr
8822 NULL if META_END is reached - should never occur
8823 or for an unknown meta value - likewise
8824 */
8825
8826 static uint32_t *
parsed_skip(uint32_t * pptr,uint32_t skiptype)8827 parsed_skip(uint32_t *pptr, uint32_t skiptype)
8828 {
8829 uint32_t nestlevel = 0;
8830
8831 for (;; pptr++)
8832 {
8833 uint32_t meta = META_CODE(*pptr);
8834
8835 switch(meta)
8836 {
8837 default: /* Just skip over most items */
8838 if (meta < META_END) continue; /* Literal */
8839 break;
8840
8841 /* This should never occur. */
8842
8843 case META_END:
8844 return NULL;
8845
8846 /* The data for these items is variable in length. */
8847
8848 case META_BACKREF: /* Offset is present only if group >= 10 */
8849 if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;
8850 break;
8851
8852 case META_ESCAPE: /* A few escapes are followed by data items. */
8853 switch (META_DATA(*pptr))
8854 {
8855 case ESC_P:
8856 case ESC_p:
8857 pptr += 1;
8858 break;
8859
8860 case ESC_g:
8861 case ESC_k:
8862 pptr += 1 + SIZEOFFSET;
8863 break;
8864 }
8865 break;
8866
8867 case META_MARK: /* Add the length of the name. */
8868 case META_COMMIT_ARG:
8869 case META_PRUNE_ARG:
8870 case META_SKIP_ARG:
8871 case META_THEN_ARG:
8872 pptr += pptr[1];
8873 break;
8874
8875 /* These are the "active" items in this loop. */
8876
8877 case META_CLASS_END:
8878 if (skiptype == PSKIP_CLASS) return pptr;
8879 break;
8880
8881 case META_ATOMIC:
8882 case META_CAPTURE:
8883 case META_COND_ASSERT:
8884 case META_COND_DEFINE:
8885 case META_COND_NAME:
8886 case META_COND_NUMBER:
8887 case META_COND_RNAME:
8888 case META_COND_RNUMBER:
8889 case META_COND_VERSION:
8890 case META_LOOKAHEAD:
8891 case META_LOOKAHEADNOT:
8892 case META_LOOKAHEAD_NA:
8893 case META_LOOKBEHIND:
8894 case META_LOOKBEHINDNOT:
8895 case META_LOOKBEHIND_NA:
8896 case META_NOCAPTURE:
8897 case META_SCRIPT_RUN:
8898 nestlevel++;
8899 break;
8900
8901 case META_ALT:
8902 if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;
8903 break;
8904
8905 case META_KET:
8906 if (nestlevel == 0) return pptr;
8907 nestlevel--;
8908 break;
8909 }
8910
8911 /* The extra data item length for each meta is in a table. */
8912
8913 meta = (meta >> 16) & 0x7fff;
8914 if (meta >= sizeof(meta_extra_lengths)) return NULL;
8915 pptr += meta_extra_lengths[meta];
8916 }
8917 /* Control never reaches here */
8918 return pptr;
8919 }
8920
8921
8922
8923 /*************************************************
8924 * Find length of a parsed group *
8925 *************************************************/
8926
/* This is called for nested groups within a branch of a lookbehind whose
length is being computed. If all the branches in the nested group have the same
length, that is OK. On entry, the pointer must be at the first element after
the group initializing code. On exit it points to META_KET. Caching is used to
improve processing speed when the same capturing group occurs many times.
8932
8933 Arguments:
8934 pptrptr pointer to pointer in the parsed pattern
8935 isinline FALSE if a reference or recursion; TRUE for inline group
8936 errcodeptr pointer to the errorcode
8937 lcptr pointer to the loop counter
8938 group number of captured group or -1 for a non-capturing group
8939 recurses chain of recurse_check to catch mutual recursion
8940 cb pointer to the compile data
8941
8942 Returns: the group length or a negative number
8943 */
8944
8945 static int
get_grouplength(uint32_t ** pptrptr,BOOL isinline,int * errcodeptr,int * lcptr,int group,parsed_recurse_check * recurses,compile_block * cb)8946 get_grouplength(uint32_t **pptrptr, BOOL isinline, int *errcodeptr, int *lcptr,
8947 int group, parsed_recurse_check *recurses, compile_block *cb)
8948 {
8949 int branchlength;
8950 int grouplength = -1;
8951
8952 /* The cache can be used only if there is no possibility of there being two
8953 groups with the same number. We do not need to set the end pointer for a group
8954 that is being processed as a back reference or recursion, but we must do so for
8955 an inline group. */
8956
8957 if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
8958 {
8959 uint32_t groupinfo = cb->groupinfo[group];
8960 if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1;
8961 if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
8962 {
8963 if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
8964 return groupinfo & GI_FIXED_LENGTH_MASK;
8965 }
8966 }
8967
8968 /* Scan the group. In this case we find the end pointer of necessity. */
8969
8970 for(;;)
8971 {
8972 branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
8973 if (branchlength < 0) goto ISNOTFIXED;
8974 if (grouplength == -1) grouplength = branchlength;
8975 else if (grouplength != branchlength) goto ISNOTFIXED;
8976 if (**pptrptr == META_KET) break;
8977 *pptrptr += 1; /* Skip META_ALT */
8978 }
8979
8980 if (group > 0)
8981 cb->groupinfo[group] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
8982 return grouplength;
8983
8984 ISNOTFIXED:
8985 if (group > 0) cb->groupinfo[group] |= GI_NOT_FIXED_LENGTH;
8986 return -1;
8987 }
8988
8989
8990
8991 /*************************************************
8992 * Find length of a parsed branch *
8993 *************************************************/
8994
8995 /* Return a fixed length for a branch in a lookbehind, giving an error if the
8996 length is not fixed. On entry, *pptrptr points to the first element inside the
8997 branch. On exit it is set to point to the ALT or KET.
8998
8999 Arguments:
9000 pptrptr pointer to pointer in the parsed pattern
9001 errcodeptr pointer to error code
9002 lcptr pointer to loop counter
9003 recurses chain of recurse_check to catch mutual recursion
9004 cb pointer to compile block
9005
9006 Returns: the length, or a negative value on error
9007 */
9008
9009 static int
get_branchlength(uint32_t ** pptrptr,int * errcodeptr,int * lcptr,parsed_recurse_check * recurses,compile_block * cb)9010 get_branchlength(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
9011 parsed_recurse_check *recurses, compile_block *cb)
9012 {
9013 int branchlength = 0;
9014 int grouplength;
9015 uint32_t lastitemlength = 0;
9016 uint32_t *pptr = *pptrptr;
9017 PCRE2_SIZE offset;
9018 parsed_recurse_check this_recurse;
9019
9020 /* A large and/or complex regex can take too long to process. This can happen
9021 more often when (?| groups are present in the pattern because their length
9022 cannot be cached. */
9023
9024 if ((*lcptr)++ > 2000)
9025 {
9026 *errcodeptr = ERR35; /* Lookbehind is too complicated */
9027 return -1;
9028 }
9029
9030 /* Scan the branch, accumulating the length. */
9031
9032 for (;; pptr++)
9033 {
9034 parsed_recurse_check *r;
9035 uint32_t *gptr, *gptrend;
9036 uint32_t escape;
9037 uint32_t group = 0;
9038 uint32_t itemlength = 0;
9039
9040 if (*pptr < META_END)
9041 {
9042 itemlength = 1;
9043 }
9044
9045 else switch (META_CODE(*pptr))
9046 {
9047 case META_KET:
9048 case META_ALT:
9049 goto EXIT;
9050
9051 /* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the
9052 actual termination. */
9053
9054 case META_ACCEPT:
9055 case META_FAIL:
9056 pptr = parsed_skip(pptr, PSKIP_ALT);
9057 if (pptr == NULL) goto PARSED_SKIP_FAILED;
9058 goto EXIT;
9059
9060 case META_MARK:
9061 case META_COMMIT_ARG:
9062 case META_PRUNE_ARG:
9063 case META_SKIP_ARG:
9064 case META_THEN_ARG:
9065 pptr += pptr[1] + 1;
9066 break;
9067
9068 case META_CIRCUMFLEX:
9069 case META_COMMIT:
9070 case META_DOLLAR:
9071 case META_PRUNE:
9072 case META_SKIP:
9073 case META_THEN:
9074 break;
9075
9076 case META_OPTIONS:
9077 pptr += 1;
9078 break;
9079
9080 case META_BIGVALUE:
9081 itemlength = 1;
9082 pptr += 1;
9083 break;
9084
9085 case META_CLASS:
9086 case META_CLASS_NOT:
9087 itemlength = 1;
9088 pptr = parsed_skip(pptr, PSKIP_CLASS);
9089 if (pptr == NULL) goto PARSED_SKIP_FAILED;
9090 break;
9091
9092 case META_CLASS_EMPTY_NOT:
9093 case META_DOT:
9094 itemlength = 1;
9095 break;
9096
9097 case META_CALLOUT_NUMBER:
9098 pptr += 3;
9099 break;
9100
9101 case META_CALLOUT_STRING:
9102 pptr += 3 + SIZEOFFSET;
9103 break;
9104
9105 /* Only some escapes consume a character. Of those, \R and \X are never
9106 allowed because they might match more than character. \C is allowed only in
9107 32-bit and non-UTF 8/16-bit modes. */
9108
9109 case META_ESCAPE:
9110 escape = META_DATA(*pptr);
9111 if (escape == ESC_R || escape == ESC_X) return -1;
9112 if (escape > ESC_b && escape < ESC_Z)
9113 {
9114 #if PCRE2_CODE_UNIT_WIDTH != 32
9115 if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C)
9116 {
9117 *errcodeptr = ERR36;
9118 return -1;
9119 }
9120 #endif
9121 itemlength = 1;
9122 if (escape == ESC_p || escape == ESC_P) pptr++; /* Skip prop data */
9123 }
9124 break;
9125
9126 /* Lookaheads do not contribute to the length of this branch, but they may
9127 contain lookbehinds within them whose lengths need to be set. */
9128
9129 case META_LOOKAHEAD:
9130 case META_LOOKAHEADNOT:
9131 case META_LOOKAHEAD_NA:
9132 *errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb);
9133 if (*errcodeptr != 0) return -1;
9134
9135 /* Ignore any qualifiers that follow a lookahead assertion. */
9136
9137 switch (pptr[1])
9138 {
9139 case META_ASTERISK:
9140 case META_ASTERISK_PLUS:
9141 case META_ASTERISK_QUERY:
9142 case META_PLUS:
9143 case META_PLUS_PLUS:
9144 case META_PLUS_QUERY:
9145 case META_QUERY:
9146 case META_QUERY_PLUS:
9147 case META_QUERY_QUERY:
9148 pptr++;
9149 break;
9150
9151 case META_MINMAX:
9152 case META_MINMAX_PLUS:
9153 case META_MINMAX_QUERY:
9154 pptr += 3;
9155 break;
9156
9157 default:
9158 break;
9159 }
9160 break;
9161
9162 /* A nested lookbehind does not contribute any length to this lookbehind,
9163 but must itself be checked and have its lengths set. */
9164
9165 case META_LOOKBEHIND:
9166 case META_LOOKBEHINDNOT:
9167 case META_LOOKBEHIND_NA:
9168 if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))
9169 return -1;
9170 break;
9171
9172 /* Back references and recursions are handled by very similar code. At this
9173 stage, the names generated in the parsing pass are available, but the main
9174 name table has not yet been created. So for the named varieties, scan the
9175 list of names in order to get the number of the first one in the pattern,
9176 and whether or not this name is duplicated. */
9177
9178 case META_BACKREF_BYNAME:
9179 if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)
9180 goto ISNOTFIXED;
9181 /* Fall through */
9182
9183 case META_RECURSE_BYNAME:
9184 {
9185 int i;
9186 PCRE2_SPTR name;
9187 BOOL is_dupname = FALSE;
9188 named_group *ng = cb->named_groups;
9189 uint32_t meta_code = META_CODE(*pptr);
9190 uint32_t length = *(++pptr);
9191
9192 GETPLUSOFFSET(offset, pptr);
9193 name = cb->start_pattern + offset;
9194 for (i = 0; i < cb->names_found; i++, ng++)
9195 {
9196 if (length == ng->length && PRIV(strncmp)(name, ng->name, length) == 0)
9197 {
9198 group = ng->number;
9199 is_dupname = ng->isdup;
9200 break;
9201 }
9202 }
9203
9204 if (group == 0)
9205 {
9206 *errcodeptr = ERR15; /* Non-existent subpattern */
9207 cb->erroroffset = offset;
9208 return -1;
9209 }
9210
9211 /* A numerical back reference can be fixed length if duplicate capturing
9212 groups are not being used. A non-duplicate named back reference can also
9213 be handled. */
9214
9215 if (meta_code == META_RECURSE_BYNAME ||
9216 (!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0))
9217 goto RECURSE_OR_BACKREF_LENGTH; /* Handle as a numbered version. */
9218 }
9219 goto ISNOTFIXED; /* Duplicate name or number */
9220
9221 /* The offset values for back references < 10 are in a separate vector
9222 because otherwise they would use more than two parsed pattern elements on
9223 64-bit systems. */
9224
9225 case META_BACKREF:
9226 if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 ||
9227 (cb->external_flags & PCRE2_DUPCAPUSED) != 0)
9228 goto ISNOTFIXED;
9229 group = META_DATA(*pptr);
9230 if (group < 10)
9231 {
9232 offset = cb->small_ref_offset[group];
9233 goto RECURSE_OR_BACKREF_LENGTH;
9234 }
9235
9236 /* Fall through */
9237 /* For groups >= 10 - picking up group twice does no harm. */
9238
9239 /* A true recursion implies not fixed length, but a subroutine call may
9240 be OK. Back reference "recursions" are also failed. */
9241
9242 case META_RECURSE:
9243 group = META_DATA(*pptr);
9244 GETPLUSOFFSET(offset, pptr);
9245
9246 RECURSE_OR_BACKREF_LENGTH:
9247 if (group > cb->bracount)
9248 {
9249 cb->erroroffset = offset;
9250 *errcodeptr = ERR15; /* Non-existent subpattern */
9251 return -1;
9252 }
9253 if (group == 0) goto ISNOTFIXED; /* Local recursion */
9254 for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)
9255 {
9256 if (META_CODE(*gptr) == META_BIGVALUE) gptr++;
9257 else if (*gptr == (META_CAPTURE | group)) break;
9258 }
9259
9260 /* We must start the search for the end of the group at the first meta code
9261 inside the group. Otherwise it will be treated as an enclosed group. */
9262
9263 gptrend = parsed_skip(gptr + 1, PSKIP_KET);
9264 if (gptrend == NULL) goto PARSED_SKIP_FAILED;
9265 if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED; /* Local recursion */
9266 for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;
9267 if (r != NULL) goto ISNOTFIXED; /* Mutual recursion */
9268 this_recurse.prev = recurses;
9269 this_recurse.groupptr = gptr;
9270
9271 /* We do not need to know the position of the end of the group, that is,
9272 gptr is not used after the call to get_grouplength(). Setting the second
9273 argument FALSE stops it scanning for the end when the length can be found
9274 in the cache. */
9275
9276 gptr++;
9277 grouplength = get_grouplength(&gptr, FALSE, errcodeptr, lcptr, group,
9278 &this_recurse, cb);
9279 if (grouplength < 0)
9280 {
9281 if (*errcodeptr == 0) goto ISNOTFIXED;
9282 return -1; /* Error already set */
9283 }
9284 itemlength = grouplength;
9285 break;
9286
9287 /* A (DEFINE) group is never obeyed inline and so it does not contribute to
9288 the length of this branch. Skip from the following item to the next
9289 unpaired ket. */
9290
9291 case META_COND_DEFINE:
9292 pptr = parsed_skip(pptr + 1, PSKIP_KET);
9293 break;
9294
9295 /* Check other nested groups - advance past the initial data for each type
9296 and then seek a fixed length with get_grouplength(). */
9297
9298 case META_COND_NAME:
9299 case META_COND_NUMBER:
9300 case META_COND_RNAME:
9301 case META_COND_RNUMBER:
9302 pptr += 2 + SIZEOFFSET;
9303 goto CHECK_GROUP;
9304
9305 case META_COND_ASSERT:
9306 pptr += 1;
9307 goto CHECK_GROUP;
9308
9309 case META_COND_VERSION:
9310 pptr += 4;
9311 goto CHECK_GROUP;
9312
9313 case META_CAPTURE:
9314 group = META_DATA(*pptr);
9315 /* Fall through */
9316
9317 case META_ATOMIC:
9318 case META_NOCAPTURE:
9319 case META_SCRIPT_RUN:
9320 pptr++;
9321 CHECK_GROUP:
9322 grouplength = get_grouplength(&pptr, TRUE, errcodeptr, lcptr, group,
9323 recurses, cb);
9324 if (grouplength < 0) return -1;
9325 itemlength = grouplength;
9326 break;
9327
9328 /* Exact repetition is OK; variable repetition is not. A repetition of zero
9329 must subtract the length that has already been added. */
9330
9331 case META_MINMAX:
9332 case META_MINMAX_PLUS:
9333 case META_MINMAX_QUERY:
9334 if (pptr[1] == pptr[2])
9335 {
9336 switch(pptr[1])
9337 {
9338 case 0:
9339 branchlength -= lastitemlength;
9340 break;
9341
9342 case 1:
9343 itemlength = 0;
9344 break;
9345
9346 default: /* Check for integer overflow */
9347 if (lastitemlength != 0 && /* Should not occur, but just in case */
9348 INT_MAX/lastitemlength < pptr[1] - 1)
9349 {
9350 *errcodeptr = ERR87; /* Integer overflow; lookbehind too big */
9351 return -1;
9352 }
9353 itemlength = (pptr[1] - 1) * lastitemlength;
9354 break;
9355 }
9356 pptr += 2;
9357 break;
9358 }
9359 /* Fall through */
9360
9361 /* Any other item means this branch does not have a fixed length. */
9362
9363 default:
9364 ISNOTFIXED:
9365 *errcodeptr = ERR25; /* Not fixed length */
9366 return -1;
9367 }
9368
9369 /* Add the item length to the branchlength, checking for integer overflow and
9370 for the branch length exceeding the limit. */
9371
9372 if (INT_MAX - branchlength < (int)itemlength ||
9373 (branchlength += itemlength) > LOOKBEHIND_MAX)
9374 {
9375 *errcodeptr = ERR87;
9376 return -1;
9377 }
9378
9379 /* Save this item length for use if the next item is a quantifier. */
9380
9381 lastitemlength = itemlength;
9382 }
9383
9384 EXIT:
9385 *pptrptr = pptr;
9386 return branchlength;
9387
9388 PARSED_SKIP_FAILED:
9389 *errcodeptr = ERR90;
9390 return -1;
9391 }
9392
9393
9394
9395 /*************************************************
9396 * Set lengths in a lookbehind *
9397 *************************************************/
9398
9399 /* This function is called for each lookbehind, to set the lengths in its
9400 branches. An error occurs if any branch does not have a fixed length that is
9401 less than the maximum (65535). On exit, the pointer must be left on the final
9402 ket.
9403
9404 The function also maintains the max_lookbehind value. Any lookbehind branch
9405 that contains a nested lookbehind may actually look further back than the
9406 length of the branch. The additional amount is passed back from
9407 get_branchlength() as an "extra" value.
9408
9409 Arguments:
9410 pptrptr pointer to pointer in the parsed pattern
9411 errcodeptr pointer to error code
9412 lcptr pointer to loop counter
9413 recurses chain of recurse_check to catch mutual recursion
9414 cb pointer to compile block
9415
9416 Returns: TRUE if all is well
9417 FALSE otherwise, with error code and offset set
9418 */
9419
9420 static BOOL
set_lookbehind_lengths(uint32_t ** pptrptr,int * errcodeptr,int * lcptr,parsed_recurse_check * recurses,compile_block * cb)9421 set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
9422 parsed_recurse_check *recurses, compile_block *cb)
9423 {
9424 PCRE2_SIZE offset;
9425 int branchlength;
9426 uint32_t *bptr = *pptrptr;
9427
9428 READPLUSOFFSET(offset, bptr); /* Offset for error messages */
9429 *pptrptr += SIZEOFFSET;
9430
9431 do
9432 {
9433 *pptrptr += 1;
9434 branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
9435 if (branchlength < 0)
9436 {
9437 /* The errorcode and offset may already be set from a nested lookbehind. */
9438 if (*errcodeptr == 0) *errcodeptr = ERR25;
9439 if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
9440 return FALSE;
9441 }
9442 if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;
9443 *bptr |= branchlength; /* branchlength never more than 65535 */
9444 bptr = *pptrptr;
9445 }
9446 while (*bptr == META_ALT);
9447
9448 return TRUE;
9449 }
9450
9451
9452
9453 /*************************************************
9454 * Check parsed pattern lookbehinds *
9455 *************************************************/
9456
/* This function is called at the end of parsing a pattern if any lookbehinds
were encountered. It scans the parsed pattern for them, calling
set_lookbehind_lengths() for each one. At the start, the errorcode is zero and
the error offset is marked unset. This enables the functions above not to
override settings from deeper nestings.
9462
9463 This function is called recursively from get_branchlength() for lookaheads in
9464 order to process any lookbehinds that they may contain. It stops when it hits a
9465 non-nested closing parenthesis in this case, returning a pointer to it.
9466
9467 Arguments
9468 pptr points to where to start (start of pattern or start of lookahead)
9469 retptr if not NULL, return the ket pointer here
9470 recurses chain of recurse_check to catch mutual recursion
9471 cb points to the compile block
9472
9473 Returns: 0 on success, or an errorcode (cb->erroroffset will be set)
9474 */
9475
9476 static int
check_lookbehinds(uint32_t * pptr,uint32_t ** retptr,parsed_recurse_check * recurses,compile_block * cb)9477 check_lookbehinds(uint32_t *pptr, uint32_t **retptr,
9478 parsed_recurse_check *recurses, compile_block *cb)
9479 {
9480 int errorcode = 0;
9481 int loopcount = 0;
9482 int nestlevel = 0;
9483
9484 cb->erroroffset = PCRE2_UNSET;
9485
9486 for (; *pptr != META_END; pptr++)
9487 {
9488 if (*pptr < META_END) continue; /* Literal */
9489
9490 switch (META_CODE(*pptr))
9491 {
9492 default:
9493 return ERR70; /* Unrecognized meta code */
9494
9495 case META_ESCAPE:
9496 if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
9497 pptr += 1;
9498 break;
9499
9500 case META_KET:
9501 if (--nestlevel < 0)
9502 {
9503 if (retptr != NULL) *retptr = pptr;
9504 return 0;
9505 }
9506 break;
9507
9508 case META_ATOMIC:
9509 case META_CAPTURE:
9510 case META_COND_ASSERT:
9511 case META_LOOKAHEAD:
9512 case META_LOOKAHEADNOT:
9513 case META_LOOKAHEAD_NA:
9514 case META_NOCAPTURE:
9515 case META_SCRIPT_RUN:
9516 nestlevel++;
9517 break;
9518
9519 case META_ACCEPT:
9520 case META_ALT:
9521 case META_ASTERISK:
9522 case META_ASTERISK_PLUS:
9523 case META_ASTERISK_QUERY:
9524 case META_BACKREF:
9525 case META_CIRCUMFLEX:
9526 case META_CLASS:
9527 case META_CLASS_EMPTY:
9528 case META_CLASS_EMPTY_NOT:
9529 case META_CLASS_END:
9530 case META_CLASS_NOT:
9531 case META_COMMIT:
9532 case META_DOLLAR:
9533 case META_DOT:
9534 case META_FAIL:
9535 case META_PLUS:
9536 case META_PLUS_PLUS:
9537 case META_PLUS_QUERY:
9538 case META_PRUNE:
9539 case META_QUERY:
9540 case META_QUERY_PLUS:
9541 case META_QUERY_QUERY:
9542 case META_RANGE_ESCAPED:
9543 case META_RANGE_LITERAL:
9544 case META_SKIP:
9545 case META_THEN:
9546 break;
9547
9548 case META_RECURSE:
9549 pptr += SIZEOFFSET;
9550 break;
9551
9552 case META_BACKREF_BYNAME:
9553 case META_RECURSE_BYNAME:
9554 pptr += 1 + SIZEOFFSET;
9555 break;
9556
9557 case META_COND_DEFINE:
9558 pptr += SIZEOFFSET;
9559 nestlevel++;
9560 break;
9561
9562 case META_COND_NAME:
9563 case META_COND_NUMBER:
9564 case META_COND_RNAME:
9565 case META_COND_RNUMBER:
9566 pptr += 1 + SIZEOFFSET;
9567 nestlevel++;
9568 break;
9569
9570 case META_COND_VERSION:
9571 pptr += 3;
9572 nestlevel++;
9573 break;
9574
9575 case META_CALLOUT_STRING:
9576 pptr += 3 + SIZEOFFSET;
9577 break;
9578
9579 case META_BIGVALUE:
9580 case META_OPTIONS:
9581 case META_POSIX:
9582 case META_POSIX_NEG:
9583 pptr += 1;
9584 break;
9585
9586 case META_MINMAX:
9587 case META_MINMAX_QUERY:
9588 case META_MINMAX_PLUS:
9589 pptr += 2;
9590 break;
9591
9592 case META_CALLOUT_NUMBER:
9593 pptr += 3;
9594 break;
9595
9596 case META_MARK:
9597 case META_COMMIT_ARG:
9598 case META_PRUNE_ARG:
9599 case META_SKIP_ARG:
9600 case META_THEN_ARG:
9601 pptr += 1 + pptr[1];
9602 break;
9603
9604 case META_LOOKBEHIND:
9605 case META_LOOKBEHINDNOT:
9606 case META_LOOKBEHIND_NA:
9607 if (!set_lookbehind_lengths(&pptr, &errorcode, &loopcount, recurses, cb))
9608 return errorcode;
9609 break;
9610 }
9611 }
9612
9613 return 0;
9614 }
9615
9616
9617
9618 /*************************************************
9619 * External function to compile a pattern *
9620 *************************************************/
9621
9622 /* This function reads a regular expression in the form of a string and returns
9623 a pointer to a block of store holding a compiled version of the expression.
9624
9625 Arguments:
9626 pattern the regular expression
9627 patlen the length of the pattern, or PCRE2_ZERO_TERMINATED
9628 options option bits
9629 errorptr pointer to errorcode
9630 erroroffset pointer to error offset
9631 ccontext points to a compile context or is NULL
9632
9633 Returns: pointer to compiled data block, or NULL on error,
9634 with errorcode and erroroffset set
9635 */
9636
9637 PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_compile(PCRE2_SPTR pattern,PCRE2_SIZE patlen,uint32_t options,int * errorptr,PCRE2_SIZE * erroroffset,pcre2_compile_context * ccontext)9638 pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
9639 int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
9640 {
9641 BOOL utf; /* Set TRUE for UTF mode */
9642 BOOL ucp; /* Set TRUE for UCP mode */
9643 BOOL has_lookbehind = FALSE; /* Set TRUE if a lookbehind is found */
9644 BOOL zero_terminated; /* Set TRUE for zero-terminated pattern */
9645 pcre2_real_code *re = NULL; /* What we will return */
9646 compile_block cb; /* "Static" compile-time data */
9647 const uint8_t *tables; /* Char tables base pointer */
9648
9649 PCRE2_UCHAR *code; /* Current pointer in compiled code */
9650 PCRE2_SPTR codestart; /* Start of compiled code */
9651 PCRE2_SPTR ptr; /* Current pointer in pattern */
9652 uint32_t *pptr; /* Current pointer in parsed pattern */
9653
9654 PCRE2_SIZE length = 1; /* Allow for final END opcode */
9655 PCRE2_SIZE usedlength; /* Actual length used */
9656 PCRE2_SIZE re_blocksize; /* Size of memory block */
9657 PCRE2_SIZE big32count = 0; /* 32-bit literals >= 0x80000000 */
9658 PCRE2_SIZE parsed_size_needed; /* Needed for parsed pattern */
9659
9660 int32_t firstcuflags, reqcuflags; /* Type of first/req code unit */
9661 uint32_t firstcu, reqcu; /* Value of first/req code unit */
9662 uint32_t setflags = 0; /* NL and BSR set flags */
9663
9664 uint32_t skipatstart; /* When checking (*UTF) etc */
9665 uint32_t limit_heap = UINT32_MAX;
9666 uint32_t limit_match = UINT32_MAX; /* Unset match limits */
9667 uint32_t limit_depth = UINT32_MAX;
9668
9669 int newline = 0; /* Unset; can be set by the pattern */
9670 int bsr = 0; /* Unset; can be set by the pattern */
9671 int errorcode = 0; /* Initialize to avoid compiler warn */
9672 int regexrc; /* Return from compile */
9673
9674 uint32_t i; /* Local loop counter */
9675
9676 /* Comments at the head of this file explain about these variables. */
9677
9678 uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];
9679 uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];
9680 named_group named_groups[NAMED_GROUP_LIST_SIZE];
9681
9682 /* The workspace is used in different ways in the different compiling phases.
9683 It needs to be 16-bit aligned for the preliminary parsing scan. */
9684
9685 uint32_t c16workspace[C16_WORK_SIZE];
9686 PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace;
9687
9688
9689 /* -------------- Check arguments and set up the pattern ----------------- */
9690
9691 /* There must be error code and offset pointers. */
9692
9693 if (errorptr == NULL || erroroffset == NULL) return NULL;
9694 *errorptr = ERR0;
9695 *erroroffset = 0;
9696
9697 /* There must be a pattern! */
9698
9699 if (pattern == NULL)
9700 {
9701 *errorptr = ERR16;
9702 return NULL;
9703 }
9704
9705 /* A NULL compile context means "use a default context" */
9706
9707 if (ccontext == NULL)
9708 ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
9709
9710 /* PCRE2_MATCH_INVALID_UTF implies UTF */
9711
9712 if ((options & PCRE2_MATCH_INVALID_UTF) != 0) options |= PCRE2_UTF;
9713
9714 /* Check that all undefined public option bits are zero. */
9715
9716 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 ||
9717 (ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)
9718 {
9719 *errorptr = ERR17;
9720 return NULL;
9721 }
9722
9723 if ((options & PCRE2_LITERAL) != 0 &&
9724 ((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 ||
9725 (ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))
9726 {
9727 *errorptr = ERR92;
9728 return NULL;
9729 }
9730
9731 /* A zero-terminated pattern is indicated by the special length value
9732 PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */
9733
9734 if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))
9735 patlen = PRIV(strlen)(pattern);
9736
9737 if (patlen > ccontext->max_pattern_length)
9738 {
9739 *errorptr = ERR88;
9740 return NULL;
9741 }
9742
9743 /* From here on, all returns from this function should end up going via the
9744 EXIT label. */
9745
9746
9747 /* ------------ Initialize the "static" compile data -------------- */
9748
9749 tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
9750
9751 cb.lcc = tables + lcc_offset; /* Individual */
9752 cb.fcc = tables + fcc_offset; /* character */
9753 cb.cbits = tables + cbits_offset; /* tables */
9754 cb.ctypes = tables + ctypes_offset;
9755
9756 cb.assert_depth = 0;
9757 cb.bracount = 0;
9758 cb.cx = ccontext;
9759 cb.dupnames = FALSE;
9760 cb.end_pattern = pattern + patlen;
9761 cb.erroroffset = 0;
9762 cb.external_flags = 0;
9763 cb.external_options = options;
9764 cb.groupinfo = stack_groupinfo;
9765 cb.had_recurse = FALSE;
9766 cb.lastcapture = 0;
9767 cb.max_lookbehind = 0;
9768 cb.name_entry_size = 0;
9769 cb.name_table = NULL;
9770 cb.named_groups = named_groups;
9771 cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
9772 cb.names_found = 0;
9773 cb.open_caps = NULL;
9774 cb.parens_depth = 0;
9775 cb.parsed_pattern = stack_parsed_pattern;
9776 cb.req_varyopt = 0;
9777 cb.start_code = cworkspace;
9778 cb.start_pattern = pattern;
9779 cb.start_workspace = cworkspace;
9780 cb.workspace_size = COMPILE_WORK_SIZE;
9781
9782 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
9783 references to help in deciding whether (.*) can be treated as anchored or not.
9784 */
9785
9786 cb.top_backref = 0;
9787 cb.backref_map = 0;
9788
9789 /* Escape sequences \1 to \9 are always back references, but as they are only
9790 two characters long, only two elements can be used in the parsed_pattern
9791 vector. The first contains the reference, and we'd like to use the second to
9792 record the offset in the pattern, so that forward references to non-existent
9793 groups can be diagnosed later with an offset. However, on 64-bit systems,
9794 PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first
9795 occurrence of \1 to \9, indexed by the second parsed_pattern value. All other
9796 references have enough space for the offset to be put into the parsed pattern.
9797 */
9798
9799 for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET;
9800
9801
9802 /* --------------- Start looking at the pattern --------------- */
9803
9804 /* Unless PCRE2_LITERAL is set, check for global one-time option settings at
9805 the start of the pattern, and remember the offset to the actual regex. With
9806 valgrind support, make the terminator of a zero-terminated pattern
9807 inaccessible. This catches bugs that would otherwise only show up for
9808 non-zero-terminated patterns. */
9809
9810 #ifdef SUPPORT_VALGRIND
9811 if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1));
9812 #endif
9813
9814 ptr = pattern;
9815 skipatstart = 0;
9816
9817 if ((options & PCRE2_LITERAL) == 0)
9818 {
9819 while (patlen - skipatstart >= 2 &&
9820 ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
9821 ptr[skipatstart+1] == CHAR_ASTERISK)
9822 {
9823 for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
9824 {
9825 uint32_t c, pp;
9826 pso *p = pso_list + i;
9827
9828 if (patlen - skipatstart - 2 >= p->length &&
9829 PRIV(strncmp_c8)(ptr + skipatstart + 2, (char *)(p->name),
9830 p->length) == 0)
9831 {
9832 skipatstart += p->length + 2;
9833 switch(p->type)
9834 {
9835 case PSO_OPT:
9836 cb.external_options |= p->value;
9837 break;
9838
9839 case PSO_FLG:
9840 setflags |= p->value;
9841 break;
9842
9843 case PSO_NL:
9844 newline = p->value;
9845 setflags |= PCRE2_NL_SET;
9846 break;
9847
9848 case PSO_BSR:
9849 bsr = p->value;
9850 setflags |= PCRE2_BSR_SET;
9851 break;
9852
9853 case PSO_LIMM:
9854 case PSO_LIMD:
9855 case PSO_LIMH:
9856 c = 0;
9857 pp = skipatstart;
9858 if (!IS_DIGIT(ptr[pp]))
9859 {
9860 errorcode = ERR60;
9861 ptr += pp;
9862 goto HAD_EARLY_ERROR;
9863 }
9864 while (IS_DIGIT(ptr[pp]))
9865 {
9866 if (c > UINT32_MAX / 10 - 1) break; /* Integer overflow */
9867 c = c*10 + (ptr[pp++] - CHAR_0);
9868 }
9869 if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS)
9870 {
9871 errorcode = ERR60;
9872 ptr += pp;
9873 goto HAD_EARLY_ERROR;
9874 }
9875 if (p->type == PSO_LIMH) limit_heap = c;
9876 else if (p->type == PSO_LIMM) limit_match = c;
9877 else limit_depth = c;
9878 skipatstart += pp - skipatstart;
9879 break;
9880 }
9881 break; /* Out of the table scan loop */
9882 }
9883 }
9884 if (i >= sizeof(pso_list)/sizeof(pso)) break; /* Out of pso loop */
9885 }
9886 }
9887
9888 /* End of pattern-start options; advance to start of real regex. */
9889
9890 ptr += skipatstart;
9891
9892 /* Can't support UTF or UCP if PCRE2 was built without Unicode support. */
9893
9894 #ifndef SUPPORT_UNICODE
9895 if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
9896 {
9897 errorcode = ERR32;
9898 goto HAD_EARLY_ERROR;
9899 }
9900 #endif
9901
9902 /* Check UTF. We have the original options in 'options', with that value as
9903 modified by (*UTF) etc in cb->external_options. The extra option
9904 PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
9905 surrogate code points cannot be represented in UTF-16. */
9906
9907 utf = (cb.external_options & PCRE2_UTF) != 0;
9908 if (utf)
9909 {
9910 if ((options & PCRE2_NEVER_UTF) != 0)
9911 {
9912 errorcode = ERR74;
9913 goto HAD_EARLY_ERROR;
9914 }
9915 if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
9916 (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
9917 goto HAD_ERROR; /* Offset was set by valid_utf() */
9918
9919 #if PCRE2_CODE_UNIT_WIDTH == 16
9920 if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
9921 {
9922 errorcode = ERR91;
9923 goto HAD_EARLY_ERROR;
9924 }
9925 #endif
9926 }
9927
9928 /* Check UCP lockout. */
9929
9930 ucp = (cb.external_options & PCRE2_UCP) != 0;
9931 if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
9932 {
9933 errorcode = ERR75;
9934 goto HAD_EARLY_ERROR;
9935 }
9936
9937 /* Process the BSR setting. */
9938
9939 if (bsr == 0) bsr = ccontext->bsr_convention;
9940
9941 /* Process the newline setting. */
9942
9943 if (newline == 0) newline = ccontext->newline_convention;
9944 cb.nltype = NLTYPE_FIXED;
9945 switch(newline)
9946 {
9947 case PCRE2_NEWLINE_CR:
9948 cb.nllen = 1;
9949 cb.nl[0] = CHAR_CR;
9950 break;
9951
9952 case PCRE2_NEWLINE_LF:
9953 cb.nllen = 1;
9954 cb.nl[0] = CHAR_NL;
9955 break;
9956
9957 case PCRE2_NEWLINE_NUL:
9958 cb.nllen = 1;
9959 cb.nl[0] = CHAR_NUL;
9960 break;
9961
9962 case PCRE2_NEWLINE_CRLF:
9963 cb.nllen = 2;
9964 cb.nl[0] = CHAR_CR;
9965 cb.nl[1] = CHAR_NL;
9966 break;
9967
9968 case PCRE2_NEWLINE_ANY:
9969 cb.nltype = NLTYPE_ANY;
9970 break;
9971
9972 case PCRE2_NEWLINE_ANYCRLF:
9973 cb.nltype = NLTYPE_ANYCRLF;
9974 break;
9975
9976 default:
9977 errorcode = ERR56;
9978 goto HAD_EARLY_ERROR;
9979 }
9980
9981 /* Pre-scan the pattern to do two things: (1) Discover the named groups and
9982 their numerical equivalents, so that this information is always available for
9983 the remaining processing. (2) At the same time, parse the pattern and put a
9984 processed version into the parsed_pattern vector. This has escapes interpreted
9985 and comments removed (amongst other things).
9986
9987 In all but one case, when PCRE2_AUTO_CALLOUT is not set, the number of unsigned
9988 32-bit ints in the parsed pattern is bounded by the length of the pattern plus
9989 one (for the terminator) plus four if PCRE2_EXTRA_WORD or PCRE2_EXTRA_LINE is
9990 set. The exceptional case is when running in 32-bit, non-UTF mode, when literal
9991 characters greater than META_END (0x80000000) have to be coded as two units. In
9992 this case, therefore, we scan the pattern to check for such values. */
9993
9994 #if PCRE2_CODE_UNIT_WIDTH == 32
9995 if (!utf)
9996 {
9997 PCRE2_SPTR p;
9998 for (p = ptr; p < cb.end_pattern; p++) if (*p >= META_END) big32count++;
9999 }
10000 #endif
10001
10002 /* Ensure that the parsed pattern buffer is big enough. When PCRE2_AUTO_CALLOUT
10003 is set we have to assume a numerical callout (4 elements) for each character
10004 plus one at the end. This is overkill, but memory is plentiful these days. For
10005 many smaller patterns the vector on the stack (which was set up above) can be
10006 used. */
10007
10008 parsed_size_needed = patlen - skipatstart + big32count;
10009
10010 if ((ccontext->extra_options &
10011 (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0)
10012 parsed_size_needed += 4;
10013
10014 if ((options & PCRE2_AUTO_CALLOUT) != 0)
10015 parsed_size_needed = (parsed_size_needed + 1) * 5;
10016
10017 if (parsed_size_needed >= PARSED_PATTERN_DEFAULT_SIZE)
10018 {
10019 uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(
10020 (parsed_size_needed + 1) * sizeof(uint32_t), ccontext->memctl.memory_data);
10021 if (heap_parsed_pattern == NULL)
10022 {
10023 *errorptr = ERR21;
10024 goto EXIT;
10025 }
10026 cb.parsed_pattern = heap_parsed_pattern;
10027 }
10028 cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed + 1;
10029
10030 /* Do the parsing scan. */
10031
10032 errorcode = parse_regex(ptr, cb.external_options, &has_lookbehind, &cb);
10033 if (errorcode != 0) goto HAD_CB_ERROR;
10034
10035 /* Workspace is needed to remember information about numbered groups: whether a
10036 group can match an empty string and what its fixed length is. This is done to
10037 avoid the possibility of recursive references causing very long compile times
10038 when checking these features. Unnumbered groups do not have this exposure since
10039 they cannot be referenced. We use an indexed vector for this purpose. If there
10040 are sufficiently few groups, the default vector on the stack, as set up above,
10041 can be used. Otherwise we have to get/free a special vector. The vector must be
10042 initialized to zero. */
10043
10044 if (cb.bracount >= GROUPINFO_DEFAULT_SIZE)
10045 {
10046 cb.groupinfo = ccontext->memctl.malloc(
10047 (cb.bracount + 1)*sizeof(uint32_t), ccontext->memctl.memory_data);
10048 if (cb.groupinfo == NULL)
10049 {
10050 errorcode = ERR21;
10051 cb.erroroffset = 0;
10052 goto HAD_CB_ERROR;
10053 }
10054 }
10055 memset(cb.groupinfo, 0, (cb.bracount + 1) * sizeof(uint32_t));
10056
10057 /* If there were any lookbehinds, scan the parsed pattern to figure out their
10058 lengths. */
10059
10060 if (has_lookbehind)
10061 {
10062 errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb);
10063 if (errorcode != 0) goto HAD_CB_ERROR;
10064 }
10065
10066 /* For debugging, there is a function that shows the parsed data vector. */
10067
10068 #ifdef DEBUG_SHOW_PARSED
10069 fprintf(stderr, "+++ Pre-scan complete:\n");
10070 show_parsed(&cb);
10071 #endif
10072
10073 /* For debugging capturing information this code can be enabled. */
10074
10075 #ifdef DEBUG_SHOW_CAPTURES
10076 {
10077 named_group *ng = cb.named_groups;
10078 fprintf(stderr, "+++Captures: %d\n", cb.bracount);
10079 for (i = 0; i < cb.names_found; i++, ng++)
10080 {
10081 fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
10082 }
10083 }
10084 #endif
10085
10086 /* Pretend to compile the pattern while actually just accumulating the amount
10087 of memory required in the 'length' variable. This behaviour is triggered by
10088 passing a non-NULL final argument to compile_regex(). We pass a block of
10089 workspace (cworkspace) for it to compile parts of the pattern into; the
10090 compiled code is discarded when it is no longer needed, so hopefully this
10091 workspace will never overflow, though there is a test for its doing so.
10092
10093 On error, errorcode will be set non-zero, so we don't need to look at the
10094 result of the function. The initial options have been put into the cb block,
10095 but we still have to pass a separate options variable (the first argument)
10096 because the options may change as the pattern is processed. */
10097
10098 cb.erroroffset = patlen; /* For any subsequent errors that do not set it */
10099 pptr = cb.parsed_pattern;
10100 code = cworkspace;
10101 *code = OP_BRA;
10102
10103 (void)compile_regex(cb.external_options, &code, &pptr, &errorcode, 0, &firstcu,
10104 &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, &length);
10105
10106 if (errorcode != 0) goto HAD_CB_ERROR; /* Offset is in cb.erroroffset */
10107
10108 /* This should be caught in compile_regex(), but just in case... */
10109
10110 if (length > MAX_PATTERN_SIZE)
10111 {
10112 errorcode = ERR20;
10113 goto HAD_CB_ERROR;
10114 }
10115
10116 /* Compute the size of, and then get and initialize, the data block for storing
10117 the compiled pattern and names table. Integer overflow should no longer be
10118 possible because nowadays we limit the maximum value of cb.names_found and
10119 cb.name_entry_size. */
10120
10121 re_blocksize = sizeof(pcre2_real_code) +
10122 CU2BYTES(length +
10123 (PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
10124 re = (pcre2_real_code *)
10125 ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
10126 if (re == NULL)
10127 {
10128 errorcode = ERR21;
10129 goto HAD_CB_ERROR;
10130 }
10131
10132 /* The compiler may put padding at the end of the pcre2_real_code structure in
10133 order to round it up to a multiple of 4 or 8 bytes. This means that when a
10134 compiled pattern is copied (for example, when serialized) undefined bytes are
10135 read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
10136 write to the last 8 bytes of the structure before setting the fields. */
10137
10138 memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
10139 re->memctl = ccontext->memctl;
10140 re->tables = tables;
10141 re->executable_jit = NULL;
10142 memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
10143 re->blocksize = re_blocksize;
10144 re->magic_number = MAGIC_NUMBER;
10145 re->compile_options = options;
10146 re->overall_options = cb.external_options;
10147 re->extra_options = ccontext->extra_options;
10148 re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
10149 re->limit_heap = limit_heap;
10150 re->limit_match = limit_match;
10151 re->limit_depth = limit_depth;
10152 re->first_codeunit = 0;
10153 re->last_codeunit = 0;
10154 re->bsr_convention = bsr;
10155 re->newline_convention = newline;
10156 re->max_lookbehind = 0;
10157 re->minlength = 0;
10158 re->top_bracket = 0;
10159 re->top_backref = 0;
10160 re->name_entry_size = cb.name_entry_size;
10161 re->name_count = cb.names_found;
10162
10163 /* The basic block is immediately followed by the name table, and the compiled
10164 code follows after that. */
10165
10166 codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) +
10167 re->name_entry_size * re->name_count;
10168
10169 /* Update the compile data block for the actual compile. The starting points of
10170 the name/number translation table and of the code are passed around in the
10171 compile data block. The start/end pattern and initial options are already set
10172 from the pre-compile phase, as is the name_entry_size field. */
10173
10174 cb.parens_depth = 0;
10175 cb.assert_depth = 0;
10176 cb.lastcapture = 0;
10177 cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
10178 cb.start_code = codestart;
10179 cb.req_varyopt = 0;
10180 cb.had_accept = FALSE;
10181 cb.had_pruneorskip = FALSE;
10182 cb.open_caps = NULL;
10183
10184 /* If any named groups were found, create the name/number table from the list
10185 created in the pre-pass. */
10186
10187 if (cb.names_found > 0)
10188 {
10189 named_group *ng = cb.named_groups;
10190 for (i = 0; i < cb.names_found; i++, ng++)
10191 add_name_to_table(&cb, ng->name, ng->length, ng->number, i);
10192 }
10193
10194 /* Set up a starting, non-extracting bracket, then compile the expression. On
10195 error, errorcode will be set non-zero, so we don't need to look at the result
10196 of the function here. */
10197
10198 pptr = cb.parsed_pattern;
10199 code = (PCRE2_UCHAR *)codestart;
10200 *code = OP_BRA;
10201 regexrc = compile_regex(re->overall_options, &code, &pptr, &errorcode, 0,
10202 &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, NULL);
10203 if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
10204 re->top_bracket = cb.bracount;
10205 re->top_backref = cb.top_backref;
10206 re->max_lookbehind = cb.max_lookbehind;
10207
10208 if (cb.had_accept)
10209 {
10210 reqcu = 0; /* Must disable after (*ACCEPT) */
10211 reqcuflags = REQ_NONE;
10212 re->flags |= PCRE2_HASACCEPT; /* Disables minimum length */
10213 }
10214
10215 /* Fill in the final opcode and check for disastrous overflow. If no overflow,
10216 but the estimated length exceeds the really used length, adjust the value of
10217 re->blocksize, and if valgrind support is configured, mark the extra allocated
10218 memory as unaddressable, so that any out-of-bound reads can be detected. */
10219
10220 *code++ = OP_END;
10221 usedlength = code - codestart;
10222 if (usedlength > length) errorcode = ERR23; else
10223 {
10224 re->blocksize -= CU2BYTES(length - usedlength);
10225 #ifdef SUPPORT_VALGRIND
10226 VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
10227 #endif
10228 }
10229
10230 /* Scan the pattern for recursion/subroutine calls and convert the group
10231 numbers into offsets. Maintain a small cache so that repeated groups containing
10232 recursions are efficiently handled. */
10233
10234 #define RSCAN_CACHE_SIZE 8
10235
10236 if (errorcode == 0 && cb.had_recurse)
10237 {
10238 PCRE2_UCHAR *rcode;
10239 PCRE2_SPTR rgroup;
10240 unsigned int ccount = 0;
10241 int start = RSCAN_CACHE_SIZE;
10242 recurse_cache rc[RSCAN_CACHE_SIZE];
10243
10244 for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf);
10245 rcode != NULL;
10246 rcode = (PCRE2_UCHAR *)find_recurse(rcode + 1 + LINK_SIZE, utf))
10247 {
10248 int p, groupnumber;
10249
10250 groupnumber = (int)GET(rcode, 1);
10251 if (groupnumber == 0) rgroup = codestart; else
10252 {
10253 PCRE2_SPTR search_from = codestart;
10254 rgroup = NULL;
10255 for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
10256 {
10257 if (groupnumber == rc[p].groupnumber)
10258 {
10259 rgroup = rc[p].group;
10260 break;
10261 }
10262
10263 /* Group n+1 must always start to the right of group n, so we can save
10264 search time below when the new group number is greater than any of the
10265 previously found groups. */
10266
10267 if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;
10268 }
10269
10270 if (rgroup == NULL)
10271 {
10272 rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);
10273 if (rgroup == NULL)
10274 {
10275 errorcode = ERR53;
10276 break;
10277 }
10278 if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
10279 rc[start].groupnumber = groupnumber;
10280 rc[start].group = rgroup;
10281 if (ccount < RSCAN_CACHE_SIZE) ccount++;
10282 }
10283 }
10284
10285 PUT(rcode, 1, rgroup - codestart);
10286 }
10287 }
10288
10289 /* In rare debugging situations we sometimes need to look at the compiled code
10290 at this stage. */
10291
10292 #ifdef DEBUG_CALL_PRINTINT
10293 pcre2_printint(re, stderr, TRUE);
10294 fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
10295 #endif
10296
10297 /* Unless disabled, check whether any single character iterators can be
10298 auto-possessified. The function overwrites the appropriate opcode values, so
10299 the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
10300 used in this code because at least one compiler gives a warning about loss of
10301 "const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the
10302 function call. */
10303
10304 if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
10305 {
10306 PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
10307 if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80;
10308 }
10309
10310 /* Failed to compile, or error while post-processing. */
10311
10312 if (errorcode != 0) goto HAD_CB_ERROR;
10313
10314 /* Successful compile. If the anchored option was not passed, set it if
10315 we can determine that the pattern is anchored by virtue of ^ characters or \A
10316 or anything else, such as starting with non-atomic .* when DOTALL is set and
10317 there are no occurrences of *PRUNE or *SKIP (though there is an option to
10318 disable this case). */
10319
10320 if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10321 is_anchored(codestart, 0, &cb, 0, FALSE))
10322 re->overall_options |= PCRE2_ANCHORED;
10323
10324 /* Set up the first code unit or startline flag, the required code unit, and
10325 then study the pattern. This code need not be obeyed if PCRE2_NO_START_OPTIMIZE
10326 is set, as the data it would create will not be used. Note that a first code
10327 unit (but not the startline flag) is useful for anchored patterns because it
10328 can still give a quick "no match" and also avoid searching for a last code
10329 unit. */
10330
10331 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
10332 {
10333 int minminlength = 0; /* For minimal minlength from first/required CU */
10334
10335 /* If we do not have a first code unit, see if there is one that is asserted
10336 (these are not saved during the compile because they can cause conflicts with
10337 actual literals that follow). */
10338
10339 if (firstcuflags < 0)
10340 firstcu = find_firstassertedcu(codestart, &firstcuflags, 0);
10341
10342 /* Save the data for a first code unit. The existence of one means the
10343 minimum length must be at least 1. */
10344
10345 if (firstcuflags >= 0)
10346 {
10347 re->first_codeunit = firstcu;
10348 re->flags |= PCRE2_FIRSTSET;
10349 minminlength++;
10350
10351 /* Handle caseless first code units. */
10352
10353 if ((firstcuflags & REQ_CASELESS) != 0)
10354 {
10355 if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
10356 {
10357 if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
10358 }
10359
10360 /* The first code unit is > 128 in UTF or UCP mode, or > 255 otherwise.
10361 In 8-bit UTF mode, codepoints in the range 128-255 are introductory code
10362 points and cannot have another case, but if UCP is set they may do. */
10363
10364 #ifdef SUPPORT_UNICODE
10365 #if PCRE2_CODE_UNIT_WIDTH == 8
10366 else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
10367 re->flags |= PCRE2_FIRSTCASELESS;
10368 #else
10369 else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
10370 UCD_OTHERCASE(firstcu) != firstcu)
10371 re->flags |= PCRE2_FIRSTCASELESS;
10372 #endif
10373 #endif /* SUPPORT_UNICODE */
10374 }
10375 }
10376
10377 /* When there is no first code unit, for non-anchored patterns, see if we can
10378 set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
10379 branches start with ^ and also when all branches start with non-atomic .* for
10380 non-DOTALL matches when *PRUNE and SKIP are not present. (There is an option
10381 that disables this case.) */
10382
10383 else if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10384 is_startline(codestart, 0, &cb, 0, FALSE))
10385 re->flags |= PCRE2_STARTLINE;
10386
10387 /* Handle the "required code unit", if one is set. In the UTF case we can
10388 increment the minimum minimum length only if we are sure this really is a
10389 different character and not a non-starting code unit of the first character,
10390 because the minimum length count is in characters, not code units. */
10391
10392 if (reqcuflags >= 0)
10393 {
10394 #if PCRE2_CODE_UNIT_WIDTH == 16
10395 if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */
10396 firstcuflags < 0 || /* First not set */
10397 (firstcu & 0xf800) != 0xd800 || /* First not surrogate */
10398 (reqcu & 0xfc00) != 0xdc00) /* Req not low surrogate */
10399 #elif PCRE2_CODE_UNIT_WIDTH == 8
10400 if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */
10401 firstcuflags < 0 || /* First not set */
10402 (firstcu & 0x80) == 0 || /* First is ASCII */
10403 (reqcu & 0x80) == 0) /* Req is ASCII */
10404 #endif
10405 {
10406 minminlength++;
10407 }
10408
10409 /* In the case of an anchored pattern, set up the value only if it follows
10410 a variable length item in the pattern. */
10411
10412 if ((re->overall_options & PCRE2_ANCHORED) == 0 ||
10413 (reqcuflags & REQ_VARY) != 0)
10414 {
10415 re->last_codeunit = reqcu;
10416 re->flags |= PCRE2_LASTSET;
10417
10418 /* Handle caseless required code units as for first code units (above). */
10419
10420 if ((reqcuflags & REQ_CASELESS) != 0)
10421 {
10422 if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
10423 {
10424 if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
10425 }
10426 #ifdef SUPPORT_UNICODE
10427 #if PCRE2_CODE_UNIT_WIDTH == 8
10428 else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
10429 re->flags |= PCRE2_LASTCASELESS;
10430 #else
10431 else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
10432 UCD_OTHERCASE(reqcu) != reqcu)
10433 re->flags |= PCRE2_LASTCASELESS;
10434 #endif
10435 #endif /* SUPPORT_UNICODE */
10436 }
10437 }
10438 }
10439
10440 /* Study the compiled pattern to set up information such as a bitmap of
10441 starting code units and a minimum matching length. */
10442
10443 if (PRIV(study)(re) != 0)
10444 {
10445 errorcode = ERR31;
10446 goto HAD_CB_ERROR;
10447 }
10448
10449 /* If study() set a bitmap of starting code units, it implies a minimum
10450 length of at least one. */
10451
10452 if ((re->flags & PCRE2_FIRSTMAPSET) != 0 && minminlength == 0)
10453 minminlength = 1;
10454
10455 /* If the minimum length set (or not set) by study() is less than the minimum
10456 implied by required code units, override it. */
10457
10458 if (re->minlength < minminlength) re->minlength = minminlength;
10459 } /* End of start-of-match optimizations. */
10460
10461 /* Control ends up here in all cases. When running under valgrind, make a
10462 pattern's terminating zero defined again. If memory was obtained for the parsed
10463 version of the pattern, free it before returning. Also free the list of named
10464 groups if a larger one had to be obtained, and likewise the group information
10465 vector. */
10466
10467 EXIT:
10468 #ifdef SUPPORT_VALGRIND
10469 if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1));
10470 #endif
10471 if (cb.parsed_pattern != stack_parsed_pattern)
10472 ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);
10473 if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
10474 ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
10475 if (cb.groupinfo != stack_groupinfo)
10476 ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
10477 return re; /* Will be NULL after an error */
10478
10479 /* Errors discovered in parse_regex() set the offset value in the compile
10480 block. Errors discovered before it is called must compute it from the ptr
10481 value. After parse_regex() is called, the offset in the compile block is set to
10482 the end of the pattern, but certain errors in compile_regex() may reset it if
10483 an offset is available in the parsed pattern. */
10484
10485 HAD_CB_ERROR:
10486 ptr = pattern + cb.erroroffset;
10487
10488 HAD_EARLY_ERROR:
10489 *erroroffset = ptr - pattern;
10490
10491 HAD_ERROR:
10492 *errorptr = errorcode;
10493 pcre2_code_free(re);
10494 re = NULL;
10495 goto EXIT;
10496 }
10497
10498 /* End of pcre2_compile.c */
10499