1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2014 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #include "config.h"
46
47 #define NLBLOCK cd /* Block containing newline information */
48 #define PSSTART start_pattern /* Field containing pattern start */
49 #define PSEND end_pattern /* Field containing pattern end */
50
51 #include "pcre_internal.h"
52
53
54 /* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
55 is also used by pcretest. PCRE_DEBUG is not defined when building a production
56 library. We do not need to select pcre16_printint.c specially, because the
57 COMPILE_PCREx macro will already be appropriately set. */
58
59 #ifdef PCRE_DEBUG
60 /* pcre_printint.c should not include any headers */
61 #define PCRE_INCLUDED
62 #include "pcre_printint.c"
63 #undef PCRE_INCLUDED
64 #endif
65
66
/* Macro for setting an individual bit in a class bitmap. Bit number b is set
in the byte array a, which holds eight bits per byte with the least significant
bit first. The array argument and the whole expansion are parenthesized so that
the macro can be given a pointer expression (for example "p + 4") and can be
used inside a larger expression. Note that b is evaluated twice, so it must not
have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)&7)))
70
/* Upper bound used when checking that the integer holding the compiled pattern
length is not about to overflow. It is set a little below INT_MAX so that the
bytes that terminate a group can be added in without having to repeat the
check every time. */

#define OFLOW_MAX (INT_MAX - 20)
77
78 /* Definitions to allow mutual recursion */
79
80 static int
81 add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
82 const pcre_uint32 *, unsigned int);
83
84 static BOOL
85 compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
86 pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
87 compile_data *, int *);
88
89
90
91 /*************************************************
92 * Code parameters and static tables *
93 *************************************************/
94
95 /* This value specifies the size of stack workspace that is used during the
96 first pre-compile phase that determines how much memory is required. The regex
97 is partly compiled into this space, but the compiled parts are discarded as
98 soon as they can be, so that hopefully there will never be an overrun. The code
99 does, however, check for an overrun. The largest amount I've seen used is 218,
100 so this number is very generous.
101
102 The same workspace is used during the second, actual compile phase for
103 remembering forward references to groups so that they can be filled in at the
104 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
105 is 4 there is plenty of room for most patterns. However, the memory can get
106 filled up by repetitions of forward references, for example patterns like
107 /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
108 that the workspace is expanded using malloc() in this situation. The value
109 below is therefore a minimum, and we put a maximum on it for safety. The
110 minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
111 kicks in at the same number of forward references in all cases. */
112
#define COMPILE_WORK_SIZE      (2048*LINK_SIZE)         /* Minimum size */
#define COMPILE_WORK_SIZE_MAX  (COMPILE_WORK_SIZE*100)  /* Safety ceiling */
115
/* Number of slots in the initial vector that records named groups during the
pre-compile. The vector lives on the stack; if it turns out to be too small it
is expanded with malloc(), in the same way as the workspace. */

#define NAMED_GROUP_LIST_SIZE  20
122
/* Overrun tests compare against a size reduced by this margin, so that an
overrun is detected before anything actually runs off the end of the data
block. */

#define WORK_SIZE_SAFETY_MARGIN (100)
127
/* Private flag bits that are added to firstchar and reqchar. */

#define REQ_CASELESS    0x01    /* Bit 0: indicates caselessness */
#define REQ_VARY        0x02    /* Bit 1: reqchar followed non-literal item */

/* Negative values for the firstchar and reqchar flags. */

#define REQ_NONE        (-1)
#define REQ_UNSET       (-2)
135
/* Flag used with repeated characters. The suffix is written in upper case so
that it cannot be mistaken for the digit one. */

#define UTF_LENGTH     0x10000000L    /* The char contains its length. */
139
140 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
141 are simple data values; negative values are for special things like \d and so
142 on. Zero means further processing is needed (for things like \x), or the escape
143 is invalid. */
144
145 #ifndef EBCDIC
146
147 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
148 in UTF-8 mode. */
149
150 static const short int escapes[] = {
151 0, 0,
152 0, 0,
153 0, 0,
154 0, 0,
155 0, 0,
156 CHAR_COLON, CHAR_SEMICOLON,
157 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
158 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
159 CHAR_COMMERCIAL_AT, -ESC_A,
160 -ESC_B, -ESC_C,
161 -ESC_D, -ESC_E,
162 0, -ESC_G,
163 -ESC_H, 0,
164 0, -ESC_K,
165 0, 0,
166 -ESC_N, 0,
167 -ESC_P, -ESC_Q,
168 -ESC_R, -ESC_S,
169 0, 0,
170 -ESC_V, -ESC_W,
171 -ESC_X, 0,
172 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
173 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
174 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
175 CHAR_GRAVE_ACCENT, ESC_a,
176 -ESC_b, 0,
177 -ESC_d, ESC_e,
178 ESC_f, 0,
179 -ESC_h, 0,
180 0, -ESC_k,
181 0, 0,
182 ESC_n, 0,
183 -ESC_p, 0,
184 ESC_r, -ESC_s,
185 ESC_tee, 0,
186 -ESC_v, -ESC_w,
187 0, 0,
188 -ESC_z
189 };
190
191 #else
192
193 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
194
195 static const short int escapes[] = {
196 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
197 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
198 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
199 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
200 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
201 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
202 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
203 /* 80 */ 0, ESC_a, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
204 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
205 /* 90 */ 0, 0, -ESC_k, 0, 0, ESC_n, 0, -ESC_p,
206 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
207 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
208 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
209 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
210 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
211 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
212 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
213 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
214 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
215 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
216 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
217 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
218 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
219 };
220
/* We also need a table of characters that may follow \c in an EBCDIC
environment for characters 0-31. The table holds 32 characters (plus the
terminating zero of the string literal). It is lookup data only, so it is
declared const like the other static tables in this module.
NOTE(review): this assumes no code elsewhere in the file writes to the table;
confirm against its users. */

static const unsigned char ebcdic_escape_c[] =
  "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
225
226 #endif
227
228
229 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
230 searched linearly. Put all the names into a single string, in order to reduce
231 the number of relocations when a shared library is dynamically linked. The
232 string is built from string macros so that it works in UTF-8 mode on EBCDIC
233 platforms. */
234
/* Description of one verb: the length of its name and the opcodes to use with
and without an argument. */

struct verbitem {
  int   len;       /* Length of verb name */
  int   op;        /* Op when no arg, or -1 if arg mandatory */
  int   op_arg;    /* Op when arg present, or -1 if not allowed */
};

typedef struct verbitem verbitem;
240
241 static const char verbnames[] =
242 "\0" /* Empty name is a shorthand for MARK */
243 STRING_MARK0
244 STRING_ACCEPT0
245 STRING_COMMIT0
246 STRING_F0
247 STRING_FAIL0
248 STRING_PRUNE0
249 STRING_SKIP0
250 STRING_THEN;
251
252 static const verbitem verbs[] = {
253 { 0, -1, OP_MARK },
254 { 4, -1, OP_MARK },
255 { 6, OP_ACCEPT, -1 },
256 { 6, OP_COMMIT, -1 },
257 { 1, OP_FAIL, -1 },
258 { 4, OP_FAIL, -1 },
259 { 5, OP_PRUNE, OP_PRUNE_ARG },
260 { 4, OP_SKIP, OP_SKIP_ARG },
261 { 4, OP_THEN, OP_THEN_ARG }
262 };
263
264 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
265
266
267 /* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in
268 another regex library. */
269
270 static const pcre_uchar sub_start_of_word[] = {
271 CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
272 CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' };
273
274 static const pcre_uchar sub_end_of_word[] = {
275 CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
276 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
277 CHAR_RIGHT_PARENTHESIS, '\0' };
278
279
280 /* Tables of names of POSIX character classes and their lengths. The names are
281 now all in a single string, to reduce the number of relocations when a shared
282 library is dynamically loaded. The list of lengths is terminated by a zero
283 length entry. The first three must be alpha, lower, upper, as this is assumed
284 for handling case independence. The indices for graph, print, and punct are
285 needed, so identify them. */
286
287 static const char posix_names[] =
288 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
289 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
290 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
291 STRING_word0 STRING_xdigit;
292
293 static const pcre_uint8 posix_name_lengths[] = {
294 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
295
/* Indices of graph, print, and punct in the POSIX class name list. */

#define PC_GRAPH   8
#define PC_PRINT   9
#define PC_PUNCT  10
299
300
301 /* Table of class bit maps for each POSIX class. Each class is formed from a
302 base map, with an optional addition or removal of another map. Then, for some
303 classes, there is some additional tweaking: for [:blank:] the vertical space
304 characters are removed, and for [:alpha:] and [:alnum:] the underscore
305 character is removed. The triples in the table consist of the base map offset,
306 second map offset or -1 if no second map, and a non-negative value for map
307 addition or a negative value for map subtraction (if there are two maps). The
308 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
309 remove vertical space characters, 2 => remove underscore. */
310
311 static const int posix_class_maps[] = {
312 cbit_word, cbit_digit, -2, /* alpha */
313 cbit_lower, -1, 0, /* lower */
314 cbit_upper, -1, 0, /* upper */
315 cbit_word, -1, 2, /* alnum - word without underscore */
316 cbit_print, cbit_cntrl, 0, /* ascii */
317 cbit_space, -1, 1, /* blank - a GNU extension */
318 cbit_cntrl, -1, 0, /* cntrl */
319 cbit_digit, -1, 0, /* digit */
320 cbit_graph, -1, 0, /* graph */
321 cbit_print, -1, 0, /* print */
322 cbit_punct, -1, 0, /* punct */
323 cbit_space, -1, 0, /* space */
324 cbit_word, -1, 0, /* word - a Perl extension */
325 cbit_xdigit,-1, 0 /* xdigit */
326 };
327
328 /* Table of substitutes for \d etc when PCRE_UCP is set. They are replaced by
329 Unicode property escapes. */
330
331 #ifdef SUPPORT_UCP
332 static const pcre_uchar string_PNd[] = {
333 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
334 CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
335 static const pcre_uchar string_pNd[] = {
336 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
337 CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
338 static const pcre_uchar string_PXsp[] = {
339 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
340 CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
341 static const pcre_uchar string_pXsp[] = {
342 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
343 CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
344 static const pcre_uchar string_PXwd[] = {
345 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
346 CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
347 static const pcre_uchar string_pXwd[] = {
348 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
349 CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
350
351 static const pcre_uchar *substitutes[] = {
352 string_PNd, /* \D */
353 string_pNd, /* \d */
354 string_PXsp, /* \S */ /* Xsp is Perl space, but from 8.34, Perl */
355 string_pXsp, /* \s */ /* space and POSIX space are the same. */
356 string_PXwd, /* \W */
357 string_pXwd /* \w */
358 };
359
360 /* The POSIX class substitutes must be in the order of the POSIX class names,
361 defined above, and there are both positive and negative cases. NULL means no
362 general substitute of a Unicode property escape (\p or \P). However, for some
363 POSIX classes (e.g. graph, print, punct) a special property code is compiled
364 directly. */
365
366 static const pcre_uchar string_pL[] = {
367 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
368 CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
369 static const pcre_uchar string_pLl[] = {
370 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
371 CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
372 static const pcre_uchar string_pLu[] = {
373 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
374 CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
375 static const pcre_uchar string_pXan[] = {
376 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
377 CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
378 static const pcre_uchar string_h[] = {
379 CHAR_BACKSLASH, CHAR_h, '\0' };
380 static const pcre_uchar string_pXps[] = {
381 CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
382 CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
383 static const pcre_uchar string_PL[] = {
384 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
385 CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
386 static const pcre_uchar string_PLl[] = {
387 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
388 CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
389 static const pcre_uchar string_PLu[] = {
390 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
391 CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
392 static const pcre_uchar string_PXan[] = {
393 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
394 CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
395 static const pcre_uchar string_H[] = {
396 CHAR_BACKSLASH, CHAR_H, '\0' };
397 static const pcre_uchar string_PXps[] = {
398 CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
399 CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
400
401 static const pcre_uchar *posix_substitutes[] = {
402 string_pL, /* alpha */
403 string_pLl, /* lower */
404 string_pLu, /* upper */
405 string_pXan, /* alnum */
406 NULL, /* ascii */
407 string_h, /* blank */
408 NULL, /* cntrl */
409 string_pNd, /* digit */
410 NULL, /* graph */
411 NULL, /* print */
412 NULL, /* punct */
413 string_pXps, /* space */ /* Xps is POSIX space, but from 8.34 */
414 string_pXwd, /* word */ /* Perl and POSIX space are the same */
415 NULL, /* xdigit */
416 /* Negated cases */
417 string_PL, /* ^alpha */
418 string_PLl, /* ^lower */
419 string_PLu, /* ^upper */
420 string_PXan, /* ^alnum */
421 NULL, /* ^ascii */
422 string_H, /* ^blank */
423 NULL, /* ^cntrl */
424 string_PNd, /* ^digit */
425 NULL, /* ^graph */
426 NULL, /* ^print */
427 NULL, /* ^punct */
428 string_PXps, /* ^space */ /* Xps is POSIX space, but from 8.34 */
429 string_PXwd, /* ^word */ /* Perl and POSIX space are the same */
430 NULL /* ^xdigit */
431 };
432 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
433 #endif
434
/* Two-level stringification: XSTRING() expands its argument before turning it
into a string literal, so a macro's value (not its name) is inserted. */

#define STRING(a)  #a
#define XSTRING(s) STRING(s)

/* The texts of compile-time error messages. These are "char *" because they
are passed to the outside world. Error numbers are documented, so a number must
never be re-used: always add a new error at the end instead. Messages marked
DEAD are no longer used.

All the messages live in one long string rather than in a table of strings, in
order to reduce the number of relocations needed when a shared library is
loaded dynamically. A table of offsets cannot be used, because the lengths of
inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, the code counts
through the substrings to the one that is wanted; this is not a performance
issue because the strings are used only when there is a compilation error.

Every substring ends with \0, including the last one, so the whole string ends
with \0\0, which can be detected when counting through. The number in front of
each message is its error number. */

static const char error_texts[] =
  /*  0 */ "no error\0"
  /*  1 */ "\\ at end of pattern\0"
  /*  2 */ "\\c at end of pattern\0"
  /*  3 */ "unrecognized character follows \\\0"
  /*  4 */ "numbers out of order in {} quantifier\0"
  /*  5 */ "number too big in {} quantifier\0"
  /*  6 */ "missing terminating ] for character class\0"
  /*  7 */ "invalid escape sequence in character class\0"
  /*  8 */ "range out of order in character class\0"
  /*  9 */ "nothing to repeat\0"
  /* 10 */ "internal error: invalid forward reference offset\0"
  /* 11 */ "internal error: unexpected repeat\0"
  /* 12 */ "unrecognized character after (? or (?-\0"
  /* 13 */ "POSIX named classes are supported only within a class\0"
  /* 14 */ "missing )\0"
  /* 15 */ "reference to non-existent subpattern\0"
  /* 16 */ "erroffset passed as NULL\0"
  /* 17 */ "unknown option bit(s) set\0"
  /* 18 */ "missing ) after comment\0"
  /* 19 */ "parentheses nested too deeply\0"  /** DEAD **/
  /* 20 */ "regular expression is too large\0"
  /* 21 */ "failed to get memory\0"
  /* 22 */ "unmatched parentheses\0"
  /* 23 */ "internal error: code overflow\0"
  /* 24 */ "unrecognized character after (?<\0"
  /* 25 */ "lookbehind assertion is not fixed length\0"
  /* 26 */ "malformed number or name after (?(\0"
  /* 27 */ "conditional group contains more than two branches\0"
  /* 28 */ "assertion expected after (?( or (?(?C)\0"
  /* 29 */ "(?R or (?[+-]digits must be followed by )\0"
  /* 30 */ "unknown POSIX class name\0"
  /* 31 */ "POSIX collating elements are not supported\0"
  /* 32 */ "this version of PCRE is compiled without UTF support\0"
  /* 33 */ "spare error\0"  /** DEAD **/
  /* 34 */ "character value in \\x{} or \\o{} is too large\0"
  /* 35 */ "invalid condition (?(0)\0"
  /* 36 */ "\\C not allowed in lookbehind assertion\0"
  /* 37 */ "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
  /* 38 */ "number after (?C is > 255\0"
  /* 39 */ "closing ) for (?C expected\0"
  /* 40 */ "recursive call could loop indefinitely\0"
  /* 41 */ "unrecognized character after (?P\0"
  /* 42 */ "syntax error in subpattern name (missing terminator)\0"
  /* 43 */ "two named subpatterns have the same name\0"
  /* 44 */ "invalid UTF-8 string\0"
  /* 45 */ "support for \\P, \\p, and \\X has not been compiled\0"
  /* 46 */ "malformed \\P or \\p sequence\0"
  /* 47 */ "unknown property name after \\P or \\p\0"
  /* 48 */ "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
  /* 49 */ "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
  /* 50 */ "repeated subpattern is too long\0"    /** DEAD **/
  /* 51 */ "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
  /* 52 */ "internal error: overran compiling workspace\0"
  /* 53 */ "internal error: previously-checked referenced subpattern not found\0"
  /* 54 */ "DEFINE group contains more than one branch\0"
  /* 55 */ "repeating a DEFINE group is not allowed\0"  /** DEAD **/
  /* 56 */ "inconsistent NEWLINE options\0"
  /* 57 */ "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
  /* 58 */ "a numbered reference must not be zero\0"
  /* 59 */ "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
  /* 60 */ "(*VERB) not recognized or malformed\0"
  /* 61 */ "number is too big\0"
  /* 62 */ "subpattern name expected\0"
  /* 63 */ "digit expected after (?+\0"
  /* 64 */ "] is an invalid data character in JavaScript compatibility mode\0"
  /* 65 */ "different names for subpatterns of the same number are not allowed\0"
  /* 66 */ "(*MARK) must have an argument\0"
  /* 67 */ "this version of PCRE is not compiled with Unicode property support\0"
#ifndef EBCDIC
  /* 68 */ "\\c must be followed by an ASCII character\0"
#else
  /* 68 */ "\\c must be followed by a letter or one of [\\]^_?\0"
#endif
  /* 69 */ "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
  /* 70 */ "internal error: unknown opcode in find_fixedlength()\0"
  /* 71 */ "\\N is not supported in a class\0"
  /* 72 */ "too many forward references\0"
  /* 73 */ "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
  /* 74 */ "invalid UTF-16 string\0"
  /* 75 */ "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
  /* 76 */ "character value in \\u.... sequence is too large\0"
  /* 77 */ "invalid UTF-32 string\0"
  /* 78 */ "setting UTF is disabled by the application\0"
  /* 79 */ "non-hex character in \\x{} (closing brace missing?)\0"
  /* 80 */ "non-octal character in \\o{} (closing brace missing?)\0"
  /* 81 */ "missing opening brace after \\o\0"
  /* 82 */ "parentheses are too deeply nested\0"
  /* 83 */ "invalid range in character class\0"
  /* 84 */ "group name must start with a non-digit\0"
  /* 85 */ "parentheses are too deeply nested (stack check)\0"
  /* 86 */ "digits missing in \\x{} or \\o{}\0"
  ;
562
563 /* Table to identify digits and hex digits. This is used when compiling
564 patterns. Note that the tables in chartables are dependent on the locale, and
565 may mark arbitrary characters as digits - but the PCRE compiling code expects
566 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
567 a private table here. It costs 256 bytes, but it is a lot faster than doing
568 character value tests (at least in some simple cases I timed), and in some
569 applications one wants PCRE to compile efficiently as well as match
570 efficiently.
571
572 For convenience, we use the same bit definitions as in chartables:
573
574 0x04 decimal digit
575 0x08 hexadecimal digit
576
577 Then we can use ctype_digit and ctype_xdigit in the code. */
578
/* Decimal digits are recognized by a direct range comparison instead of a
table lookup: this is much faster than a memory read, the resulting code is
simpler, and the compiler turns it into a subtraction and an unsigned
comparison. */

#define IS_DIGIT(x) (CHAR_0 <= (x) && (x) <= CHAR_9)
584
585 #ifndef EBCDIC
586
587 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
588 UTF-8 mode. */
589
590 static const pcre_uint8 digitab[] =
591 {
592 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
593 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
594 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
595 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
596 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
597 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
598 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
599 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
600 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
601 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
603 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
604 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
605 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
606 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
607 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
608 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
609 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
610 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
611 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
612 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
613 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
614 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
615 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
616 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
617 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
618 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
619 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
620 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
621 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
622 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
623 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
624
625 #else
626
627 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
628
629 static const pcre_uint8 digitab[] =
630 {
631 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
632 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
633 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
634 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
635 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
636 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
637 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
638 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
639 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
640 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
641 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
642 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
643 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
644 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
645 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
646 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
647 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
648 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
649 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
650 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
651 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
652 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
653 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
654 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
655 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
656 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
657 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
658 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
659 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
660 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
661 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
662 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
663
664 static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
665 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
666 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
667 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
668 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
669 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
670 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
671 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
672 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
673 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
674 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
675 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
676 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
677 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
678 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
679 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
680 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
681 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
682 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
683 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
684 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
685 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
686 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
687 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
688 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
689 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
690 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
691 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
692 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
693 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
694 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
695 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
696 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
697 #endif
698
699
700 /* This table is used to check whether auto-possessification is possible
701 between adjacent character-type opcodes. The left-hand (repeated) opcode is
702 used to select the row, and the right-hand opcode is used to select the column.
703 A value of 1 means that auto-possessification is OK. For example, the second
704 value in the first row means that \D+\d can be turned into \D++\d.
705
706 The Unicode property types (\P and \p) have to be present to fill out the table
707 because of what their opcode values are, but the table values should always be
708 zero because property types are handled separately in the code. The last four
709 columns apply to items that cannot be repeated, so there is no need to have
710 rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
711 *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
712
/* Dimensions of autoposstab. Both are counted from FIRST_AUTOTAB_OP: the rows
run up to LAST_AUTOTAB_LEFT_OP and the columns up to LAST_AUTOTAB_RIGHT_OP. */
713 #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
714 #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
715
/* Indexed as autoposstab[left-hand item][right-hand item]. The comment at the
end of each row names the left-hand (repeated) item; the header line names the
right-hand item for each of the 21 columns. */
716 static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
717 /* \D \d \S \s \W \w . .+ \C \P \p \R \H \h \V \v \X \Z \z $ $M */
718 { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \D */
719 { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \d */
720 { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \S */
721 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \s */
722 { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \W */
723 { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \w */
724 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* . */
725 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* .+ */
726 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \C */
727 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \P */
728 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \p */
729 { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \R */
730 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \H */
731 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \h */
732 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \V */
733 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 }, /* \v */
734 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 } /* \X */
735 };
736
737
738 /* This table is used to check whether auto-possessification is possible
739 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
740 left-hand (repeated) opcode is used to select the row, and the right-hand
741 opcode is used to select the column. The values are as follows:
742
743 0 Always return FALSE (never auto-possessify)
744 1 Character groups are distinct (possessify if both are OP_PROP)
745 2 Check character categories in the same group (general or particular)
746 3 TRUE if the two opcodes are not the same (PROP vs NOTPROP)
747
748 4 Check left general category vs right particular category
749 5 Check right general category vs left particular category
750
751 6 Left alphanum vs right general category
752 7 Left space vs right general category
753 8 Left word vs right general category
754
755 9 Right alphanum vs left general category
756 10 Right space vs left general category
757 11 Right word vs left general category
758
759 12 Left alphanum vs right particular category
760 13 Left space vs right particular category
761 14 Left word vs right particular category
762
763 15 Right alphanum vs left particular category
764 16 Right space vs left particular category
765 17 Right word vs left particular category
766 */
767
/* Indexed as propposstab[left-hand property type][right-hand property type],
with rows and columns both in the PT_ order shown by the labels. Each entry is
one of the action codes 0-17 listed in the comment above. */
768 static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {
769 /* ANY LAMP GC PC SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
770 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_ANY */
771 { 0, 3, 0, 0, 0, 3, 1, 1, 0, 0, 0 }, /* PT_LAMP */
772 { 0, 0, 2, 4, 0, 9, 10, 10, 11, 0, 0 }, /* PT_GC */
773 { 0, 0, 5, 2, 0, 15, 16, 16, 17, 0, 0 }, /* PT_PC */
774 { 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, /* PT_SC */
775 { 0, 3, 6, 12, 0, 3, 1, 1, 0, 0, 0 }, /* PT_ALNUM */
776 { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_SPACE */
777 { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_PXSPACE */
778 { 0, 0, 8, 14, 0, 0, 1, 1, 3, 0, 0 }, /* PT_WORD */
779 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_CLIST */
780 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 } /* PT_UCNC */
781 };
782
783 /* This table is used to check whether auto-possessification is possible
784 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
785 specifies a general category and the other specifies a particular category. The
786 row is selected by the general category and the column by the particular
787 category. The value is 1 if the particular category is not part of the general
788 category. */
789
/* Indexed as catposstab[general category][particular category]: 7 rows (C, L,
M, N, P, S, Z) by 30 columns, in the order given by the row and column labels.
In each row the zero entries are the particular categories whose names start
with that row's letter. */
790 static const pcre_uint8 catposstab[7][30] = {
791 /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
792 { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* C */
793 { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* L */
794 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* M */
795 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* N */
796 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 }, /* P */
797 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 }, /* S */
798 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 } /* Z */
799 };
800
801 /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
802 a general or particular category. The properties in each row are those
803 that apply to the character set in question. Duplication means that a little
804 unnecessary work is done when checking, but this keeps things much simpler
805 because they can all use the same code. For more details see the comment where
806 this table is used.
807
808 Note: SPACE and PXSPACE used to be different because Perl excluded VT from
809 "space", but from Perl 5.18 it's included, so both categories are treated the
810 same here. */
811
/* One row of four ucp_ values for each of ALNUM, SPACE/PXSPACE and WORD, in
that order. The trailing comments record which entries are redundant padding;
how the four positions are interpreted is described where the table is used. */
812 static const pcre_uint8 posspropstab[3][4] = {
813 { ucp_L, ucp_N, ucp_N, ucp_Nl }, /* ALNUM, 3rd and 4th values redundant */
814 { ucp_Z, ucp_Z, ucp_C, ucp_Cc }, /* SPACE and PXSPACE, 2nd value redundant */
815 { ucp_L, ucp_N, ucp_P, ucp_Po } /* WORD */
816 };
817
818 /* This table is used when converting repeating opcodes into possessified
819 versions as a result of an explicit possessive quantifier such as ++. A zero
820 value means there is no possessified version - in those cases the item in
821 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
822 because all relevant opcodes are less than that. */
823
/* Indexed by opcode value. A non-zero entry is the possessive opcode to use in
place of the greedy repeat opcode at that index. The minimizing and the already
possessive repeat opcodes have zero entries, as do the EXACT opcodes and every
opcode that is not a repeat. The comment on each line names the opcodes that
the entries on that line correspond to. */
824 static const pcre_uint8 opcode_possessify[] = {
825 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 15 */
826 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 - 31 */
827
828 0, /* NOTI */
829 OP_POSSTAR, 0, /* STAR, MINSTAR */
830 OP_POSPLUS, 0, /* PLUS, MINPLUS */
831 OP_POSQUERY, 0, /* QUERY, MINQUERY */
832 OP_POSUPTO, 0, /* UPTO, MINUPTO */
833 0, /* EXACT */
834 0, 0, 0, 0, /* POS{STAR,PLUS,QUERY,UPTO} */
835
/* Caseless single-character repeats */
836 OP_POSSTARI, 0, /* STARI, MINSTARI */
837 OP_POSPLUSI, 0, /* PLUSI, MINPLUSI */
838 OP_POSQUERYI, 0, /* QUERYI, MINQUERYI */
839 OP_POSUPTOI, 0, /* UPTOI, MINUPTOI */
840 0, /* EXACTI */
841 0, 0, 0, 0, /* POS{STARI,PLUSI,QUERYI,UPTOI} */
842
/* Negated single-character repeats */
843 OP_NOTPOSSTAR, 0, /* NOTSTAR, NOTMINSTAR */
844 OP_NOTPOSPLUS, 0, /* NOTPLUS, NOTMINPLUS */
845 OP_NOTPOSQUERY, 0, /* NOTQUERY, NOTMINQUERY */
846 OP_NOTPOSUPTO, 0, /* NOTUPTO, NOTMINUPTO */
847 0, /* NOTEXACT */
848 0, 0, 0, 0, /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
849
/* Caseless negated single-character repeats */
850 OP_NOTPOSSTARI, 0, /* NOTSTARI, NOTMINSTARI */
851 OP_NOTPOSPLUSI, 0, /* NOTPLUSI, NOTMINPLUSI */
852 OP_NOTPOSQUERYI, 0, /* NOTQUERYI, NOTMINQUERYI */
853 OP_NOTPOSUPTOI, 0, /* NOTUPTOI, NOTMINUPTOI */
854 0, /* NOTEXACTI */
855 0, 0, 0, 0, /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
856
/* Character-type repeats */
857 OP_TYPEPOSSTAR, 0, /* TYPESTAR, TYPEMINSTAR */
858 OP_TYPEPOSPLUS, 0, /* TYPEPLUS, TYPEMINPLUS */
859 OP_TYPEPOSQUERY, 0, /* TYPEQUERY, TYPEMINQUERY */
860 OP_TYPEPOSUPTO, 0, /* TYPEUPTO, TYPEMINUPTO */
861 0, /* TYPEEXACT */
862 0, 0, 0, 0, /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
863
/* Repeats that follow a character class or back reference (CR opcodes) */
864 OP_CRPOSSTAR, 0, /* CRSTAR, CRMINSTAR */
865 OP_CRPOSPLUS, 0, /* CRPLUS, CRMINPLUS */
866 OP_CRPOSQUERY, 0, /* CRQUERY, CRMINQUERY */
867 OP_CRPOSRANGE, 0, /* CRRANGE, CRMINRANGE */
868 0, 0, 0, 0, /* CRPOS{STAR,PLUS,QUERY,RANGE} */
869
/* The remaining entries, up to the truncation point, have no possessive form */
870 0, 0, 0, /* CLASS, NCLASS, XCLASS */
871 0, 0, /* REF, REFI */
872 0, 0, /* DNREF, DNREFI */
873 0, 0 /* RECURSE, CALLOUT */
874 };
875
876
877
878 /*************************************************
879 * Find an error text *
880 *************************************************/
881
882 /* The error texts are now all in one long string, to save on relocations. As
883 some of the text is of unknown length, we can't use a table of offsets.
884 Instead, just count through the strings. This is not a performance issue
885 because it happens only when there has been a compilation error.
886
887 Argument: the error number
888 Returns: pointer to the error string
889 */
890
891 static const char *
find_error_text(int n)892 find_error_text(int n)
893 {
894 const char *s = error_texts;
895 for (; n > 0; n--)
896 {
897 while (*s++ != CHAR_NULL) {};
898 if (*s == CHAR_NULL) return "Error text not found (please report)";
899 }
900 return s;
901 }
902
903
904
905 /*************************************************
906 * Expand the workspace *
907 *************************************************/
908
909 /* This function is called during the second compiling phase, if the number of
910 forward references fills the existing workspace, which is originally a block on
911 the stack. A larger block is obtained from malloc() unless the ultimate limit
912 has been reached or the increase will be rather small.
913
914 Argument: pointer to the compile data block
915 Returns: 0 if all went well, else an error number
916 */
917
918 static int
expand_workspace(compile_data * cd)919 expand_workspace(compile_data *cd)
920 {
921 pcre_uchar *newspace;
922 int newsize = cd->workspace_size * 2;
923
924 if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
925 if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
926 newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
927 return ERR72;
928
929 newspace = (PUBL(malloc))(IN_UCHARS(newsize));
930 if (newspace == NULL) return ERR21;
931 memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
932 cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
933 if (cd->workspace_size > COMPILE_WORK_SIZE)
934 (PUBL(free))((void *)cd->start_workspace);
935 cd->start_workspace = newspace;
936 cd->workspace_size = newsize;
937 return 0;
938 }
939
940
941
942 /*************************************************
943 * Check for counted repeat *
944 *************************************************/
945
946 /* This function is called when a '{' is encountered in a place where it might
947 start a quantifier. It looks ahead to see if it really is a quantifier or not.
948 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
949 where the ddds are digits.
950
951 Arguments:
952 p pointer to the first char after '{'
953
954 Returns: TRUE or FALSE
955 */
956
957 static BOOL
is_counted_repeat(const pcre_uchar * p)958 is_counted_repeat(const pcre_uchar *p)
959 {
960 if (!IS_DIGIT(*p)) return FALSE;
961 p++;
962 while (IS_DIGIT(*p)) p++;
963 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
964
965 if (*p++ != CHAR_COMMA) return FALSE;
966 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
967
968 if (!IS_DIGIT(*p)) return FALSE;
969 p++;
970 while (IS_DIGIT(*p)) p++;
971
972 return (*p == CHAR_RIGHT_CURLY_BRACKET);
973 }
974
975
976
977 /*************************************************
978 * Handle escapes *
979 *************************************************/
980
981 /* This function is called when a \ has been encountered. It either returns a
982 positive value for a simple escape such as \d, or 0 for a data character which
983 will be placed in chptr. A backreference to group n is returned as negative n.
984 When UTF-8 is enabled, a positive value greater than 255 may be returned in
985 chptr. On entry, ptr is pointing at the \. On exit, it is on the final
986 character of the escape sequence.
987
988 Arguments:
989 ptrptr points to the pattern position pointer
990 chptr points to a returned data character
991 errorcodeptr points to the errorcode variable
992 bracount number of previous extracting brackets
993 options the options bits
994 isclass TRUE if inside a character class
995
996 Returns: zero => a data character
997 positive => a special escape sequence
998 negative => a back reference
999 on error, errorcodeptr is set
1000 */
1001
1002 static int
check_escape(const pcre_uchar ** ptrptr,pcre_uint32 * chptr,int * errorcodeptr,int bracount,int options,BOOL isclass)1003 check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
1004 int bracount, int options, BOOL isclass)
1005 {
/* Overview: fetch the character that follows the backslash, try the escapes[]
lookup table for an immediate answer, and otherwise dispatch on the character
in the switch below. The results are handed back at the end of the function:
the updated pattern pointer in *ptrptr, the character value in *chptr, and the
escape code (zero, positive, or a negated group number) as the return value.
Errors are reported only by storing a code in *errorcodeptr. */
1006 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
1007 BOOL utf = (options & PCRE_UTF8) != 0;
1008 const pcre_uchar *ptr = *ptrptr + 1;
1009 pcre_uint32 c;
1010 int escape = 0;
1011 int i;
1012
1013 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
1014 ptr--; /* Set pointer back to the last byte */
1015
1016 /* If backslash is at the end of the pattern, it's an error. */
1017
1018 if (c == CHAR_NULL) *errorcodeptr = ERR1;
1019
1020 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
1021 in a table. A non-zero result is something that can be returned immediately.
1022 Otherwise further processing may be required. */
1023
1024 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1025 /* Not alphanumeric */
1026 else if (c < CHAR_0 || c > CHAR_z) {}
/* A positive table entry is a data character, which replaces c. A negative
entry is minus an escape code, which becomes the return value. */
1027 else if ((i = escapes[c - CHAR_0]) != 0)
1028 { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1029
1030 #else /* EBCDIC coding */
1031 /* Not alphanumeric */
1032 else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
1033 else if ((i = escapes[c - 0x48]) != 0) { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1034 #endif
1035
1036 /* Escapes that need further processing, or are illegal. */
1037
/* Note that this branch is reached only when the table lookup above stored
zero in i. Two loops below (octal digits and \xdd) rely on i being zero. */
1038 else
1039 {
1040 const pcre_uchar *oldptr;
1041 BOOL braced, negated, overflow;
1042 int s;
1043
1044 switch (c)
1045 {
1046 /* A number of Perl escapes are not handled by PCRE. We give an explicit
1047 error. */
1048
1049 case CHAR_l:
1050 case CHAR_L:
1051 *errorcodeptr = ERR37;
1052 break;
1053
1054 case CHAR_u:
1055 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1056 {
1057 /* In JavaScript, \u must be followed by four hexadecimal digits.
1058 Otherwise it is a lowercase u letter. */
1059 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1060 && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
1061 && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
1062 && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
1063 {
1064 c = 0;
1065 for (i = 0; i < 4; ++i)
1066 {
1067 register pcre_uint32 cc = *(++ptr);
1068 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1069 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1070 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1071 #else /* EBCDIC coding */
1072 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1073 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1074 #endif
1075 }
1076
/* Reject a value that is too large for the current library width and mode;
in a UTF mode, also reject the surrogate range 0xd800-0xdfff. */
1077 #if defined COMPILE_PCRE8
1078 if (c > (utf ? 0x10ffffU : 0xffU))
1079 #elif defined COMPILE_PCRE16
1080 if (c > (utf ? 0x10ffffU : 0xffffU))
1081 #elif defined COMPILE_PCRE32
1082 if (utf && c > 0x10ffffU)
1083 #endif
1084 {
1085 *errorcodeptr = ERR76;
1086 }
1087 else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1088 }
1089 }
1090 else
1091 *errorcodeptr = ERR37;
1092 break;
1093
1094 case CHAR_U:
1095 /* In JavaScript, \U is an uppercase U letter. */
1096 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
1097 break;
1098
1099 /* In a character class, \g is just a literal "g". Outside a character
1100 class, \g must be followed by one of a number of specific things:
1101
1102 (1) A number, either plain or braced. If positive, it is an absolute
1103 backreference. If negative, it is a relative backreference. This is a Perl
1104 5.10 feature.
1105
1106 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1107 is part of Perl's movement towards a unified syntax for back references. As
1108 this is synonymous with \k{name}, we fudge it up by pretending it really
1109 was \k.
1110
1111 (3) For Oniguruma compatibility we also support \g followed by a name or a
1112 number either in angle brackets or in single quotes. However, these are
1113 (possibly recursive) subroutine calls, _not_ backreferences. Just return
1114 the ESC_g code (cf \k). */
1115
1116 case CHAR_g:
1117 if (isclass) break;
1118 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
1119 {
1120 escape = ESC_g;
1121 break;
1122 }
1123
1124 /* Handle the Perl-compatible cases */
1125
1126 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1127 {
/* Look ahead without moving ptr: if the braces hold anything other than
digits and minus signs, the item is treated as a name (case 2 above). */
1128 const pcre_uchar *p;
1129 for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
1130 if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
1131 if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
1132 {
1133 escape = ESC_k;
1134 break;
1135 }
1136 braced = TRUE;
1137 ptr++;
1138 }
1139 else braced = FALSE;
1140
1141 if (ptr[1] == CHAR_MINUS)
1142 {
1143 negated = TRUE;
1144 ptr++;
1145 }
1146 else negated = FALSE;
1147
1148 /* The integer range is limited by the machine's int representation. */
1149 s = 0;
1150 overflow = FALSE;
1151 while (IS_DIGIT(ptr[1]))
1152 {
/* The test is made before multiplying, so that s * 10 plus a digit always
stays within the range of an int. */
1153 if (s > INT_MAX / 10 - 1) /* Integer overflow */
1154 {
1155 overflow = TRUE;
1156 break;
1157 }
1158 s = s * 10 + (int)(*(++ptr) - CHAR_0);
1159 }
1160 if (overflow) /* Integer overflow */
1161 {
/* Skip the remaining digits so that ptr ends on the last one. */
1162 while (IS_DIGIT(ptr[1]))
1163 ptr++;
1164 *errorcodeptr = ERR61;
1165 break;
1166 }
1167
1168 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
1169 {
1170 *errorcodeptr = ERR57;
1171 break;
1172 }
1173
1174 if (s == 0)
1175 {
1176 *errorcodeptr = ERR58;
1177 break;
1178 }
1179
1180 if (negated)
1181 {
1182 if (s > bracount)
1183 {
1184 *errorcodeptr = ERR15;
1185 break;
1186 }
/* Convert the relative number to an absolute one by counting back from
bracount: -1 gives bracount, -2 gives bracount - 1, and so on. */
1187 s = bracount - (s - 1);
1188 }
1189
1190 escape = -s;
1191 break;
1192
1193 /* The handling of escape sequences consisting of a string of digits
1194 starting with one that is not zero is not straightforward. Perl has changed
1195 over the years. Nowadays \g{} for backreferences and \o{} for octal are
1196 recommended to avoid the ambiguities in the old syntax.
1197
1198 Outside a character class, the digits are read as a decimal number. If the
1199 number is less than 8 (used to be 10), or if there are that many previous
1200 extracting left brackets, then it is a back reference. Otherwise, up to
1201 three octal digits are read to form an escaped byte. Thus \123 is likely to
1202 be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1203 the octal value is greater than 377, the least significant 8 bits are
1204 taken. \8 and \9 are treated as the literal characters 8 and 9.
1205
1206 Inside a character class, \ followed by a digit is always either a literal
1207 8 or 9 or an octal number. */
1208
1209 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1210 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1211
1212 if (!isclass)
1213 {
1214 oldptr = ptr;
1215 /* The integer range is limited by the machine's int representation. */
1216 s = (int)(c -CHAR_0);
1217 overflow = FALSE;
1218 while (IS_DIGIT(ptr[1]))
1219 {
1220 if (s > INT_MAX / 10 - 1) /* Integer overflow */
1221 {
1222 overflow = TRUE;
1223 break;
1224 }
1225 s = s * 10 + (int)(*(++ptr) - CHAR_0);
1226 }
1227 if (overflow) /* Integer overflow */
1228 {
1229 while (IS_DIGIT(ptr[1]))
1230 ptr++;
1231 *errorcodeptr = ERR61;
1232 break;
1233 }
1234 if (s < 8 || s <= bracount) /* Check for back reference */
1235 {
1236 escape = -s;
1237 break;
1238 }
1239 ptr = oldptr; /* Put the pointer back and fall through */
1240 }
1241
1242 /* Handle a digit following \ when the number is not a back reference. If
1243 the first digit is 8 or 9, Perl used to generate a binary zero byte and
1244 then treat the digit as a following literal. At least by Perl 5.18 this
1245 changed so as not to insert the binary zero. */
1246
1247 if ((c = *ptr) >= CHAR_8) break;
1248
1249 /* Fall through with a digit less than 8 */
1250
1251 /* \0 always starts an octal number, but we may drop through to here with a
1252 larger first octal digit. The original code used just to take the least
1253 significant 8 bits of octal numbers (I think this is what early Perls used
1254 to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1255 but no more than 3 octal digits. */
1256
1257 case CHAR_0:
1258 c -= CHAR_0;
/* The variable i is zero on arrival here (see the note before the switch),
so this loop reads at most two more octal digits. */
1259 while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1260 c = c * 8 + *(++ptr) - CHAR_0;
1261 #ifdef COMPILE_PCRE8
1262 if (!utf && c > 0xff) *errorcodeptr = ERR51;
1263 #endif
1264 break;
1265
1266 /* \o is a relatively new Perl feature, supporting a more general way of
1267 specifying character codes in octal. The only supported form is \o{ddd}. */
1268
1269 case CHAR_o:
1270 if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
1271 if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR86; else
1272 {
1273 ptr += 2;
1274 c = 0;
1275 overflow = FALSE;
1276 while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
1277 {
1278 register pcre_uint32 cc = *ptr++;
1279 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
/* In the 32-bit library, check before shifting that the top three bits of
c are clear, so that the shift below cannot lose any bits. */
1280 #ifdef COMPILE_PCRE32
1281 if (c >= 0x20000000l) { overflow = TRUE; break; }
1282 #endif
1283 c = (c << 3) + cc - CHAR_0 ;
1284 #if defined COMPILE_PCRE8
1285 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1286 #elif defined COMPILE_PCRE16
1287 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1288 #elif defined COMPILE_PCRE32
1289 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1290 #endif
1291 }
1292 if (overflow)
1293 {
1294 while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1295 *errorcodeptr = ERR34;
1296 }
1297 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1298 {
1299 if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1300 }
1301 else *errorcodeptr = ERR80;
1302 }
1303 break;
1304
1305 /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
1306 digits. Otherwise it is a lowercase x letter. */
1307
1308 case CHAR_x:
1309 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1310 {
1311 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1312 && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1313 {
1314 c = 0;
1315 for (i = 0; i < 2; ++i)
1316 {
1317 register pcre_uint32 cc = *(++ptr);
1318 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1319 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1320 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1321 #else /* EBCDIC coding */
1322 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1323 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1324 #endif
1325 }
1326 }
1327 } /* End JavaScript handling */
1328
1329 /* Handle \x in Perl's style. \x{ddd} is a character number which can be
1330 greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
1331 digits. If not, { used to be treated as a data character. However, Perl
1332 seems to read hex digits up to the first non-such, and ignore the rest, so
1333 that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1334 now gives an error. */
1335
1336 else
1337 {
1338 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1339 {
1340 ptr += 2;
1341 if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1342 {
1343 *errorcodeptr = ERR86;
1344 break;
1345 }
1346 c = 0;
1347 overflow = FALSE;
1348 while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
1349 {
1350 register pcre_uint32 cc = *ptr++;
1351 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1352
/* In the 32-bit library, check before shifting that the top four bits of c
are clear, so that the shift below cannot lose any bits. */
1353 #ifdef COMPILE_PCRE32
1354 if (c >= 0x10000000l) { overflow = TRUE; break; }
1355 #endif
1356
1357 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1358 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1359 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1360 #else /* EBCDIC coding */
1361 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1362 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1363 #endif
1364
1365 #if defined COMPILE_PCRE8
1366 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1367 #elif defined COMPILE_PCRE16
1368 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1369 #elif defined COMPILE_PCRE32
1370 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1371 #endif
1372 }
1373
1374 if (overflow)
1375 {
1376 while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
1377 *errorcodeptr = ERR34;
1378 }
1379
1380 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1381 {
1382 if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1383 }
1384
1385 /* If the sequence of hex digits does not end with '}', give an error.
1386 We used just to recognize this construct and fall through to the normal
1387 \x handling, but nowadays Perl gives an error, which seems much more
1388 sensible, so we do too. */
1389
1390 else *errorcodeptr = ERR79;
1391 } /* End of \x{} processing */
1392
1393 /* Read a single-byte hex-defined char (up to two hex digits after \x) */
1394
1395 else
1396 {
1397 c = 0;
/* The variable i is still zero here (see the note before the switch), so
at most two hex digits are read. */
1398 while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1399 {
1400 pcre_uint32 cc; /* Some compilers don't like */
1401 cc = *(++ptr); /* ++ in initializers */
1402 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1403 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1404 c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1405 #else /* EBCDIC coding */
1406 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
1407 c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1408 #endif
1409 }
1410 } /* End of \xdd handling */
1411 } /* End of Perl-style \x handling */
1412 break;
1413
1414 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1415 An error is given if the byte following \c is not an ASCII character. This
1416 coding is ASCII-specific, but then the whole concept of \cx is
1417 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1418
1419 case CHAR_c:
1420 c = *(++ptr);
1421 if (c == CHAR_NULL)
1422 {
1423 *errorcodeptr = ERR2;
1424 break;
1425 }
1426 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1427 if (c > 127) /* Excludes all non-ASCII in either mode */
1428 {
1429 *errorcodeptr = ERR68;
1430 break;
1431 }
1432 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1433 c ^= 0x40;
1434 #else /* EBCDIC coding */
1435 if (c >= CHAR_a && c <= CHAR_z) c += 64;
1436 if (c == CHAR_QUESTION_MARK)
1437 c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
1438 else
1439 {
/* Search ebcdic_escape_c[] for the character; the index at which it is
found becomes the result, and failure to find it is an error. */
1440 for (i = 0; i < 32; i++)
1441 {
1442 if (c == ebcdic_escape_c[i]) break;
1443 }
1444 if (i < 32) c = i; else *errorcodeptr = ERR68;
1445 }
1446 #endif
1447 break;
1448
1449 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1450 other alphanumeric following \ is an error if PCRE_EXTRA was set;
1451 otherwise, for Perl compatibility, it is a literal. This code looks a bit
1452 odd, but there used to be some cases other than the default, and there may
1453 be again in future, so I haven't "optimized" it. */
1454
1455 default:
1456 if ((options & PCRE_EXTRA) != 0) switch(c)
1457 {
1458 default:
1459 *errorcodeptr = ERR3;
1460 break;
1461 }
1462 break;
1463 }
1464 }
1465
1466 /* Perl supports \N{name} for character names, as well as plain \N for "not
1467 newline". PCRE does not support \N{name}. However, it does support
1468 quantification such as \N{2,3}. */
1469
1470 if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1471 !is_counted_repeat(ptr+2))
1472 *errorcodeptr = ERR37;
1473
1474 /* If PCRE_UCP is set, we change the values for \d etc. */
1475
1476 if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
1477 escape += (ESC_DU - ESC_D);
1478
1479 /* Set the pointer to the final character before returning. */
1480
1481 *ptrptr = ptr;
1482 *chptr = c;
1483 return escape;
1484 }
1485
1486
1487
1488 #ifdef SUPPORT_UCP
1489 /*************************************************
1490 * Handle \P and \p *
1491 *************************************************/
1492
1493 /* This function is called after \P or \p has been encountered, provided that
1494 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1495 pointing at the P or p. On exit, it is pointing at the final character of the
1496 escape sequence.
1497
1498 Arguments:
1499 ptrptr points to the pattern position pointer
1500 negptr points to a boolean that is set TRUE for negation else FALSE
1501 ptypeptr points to an unsigned int that is set to the type value
1502 pdataptr points to an unsigned int that is set to the detailed property value
1503 errorcodeptr points to the error code variable
1504
1505 Returns: TRUE if the type value was found, or FALSE for an invalid type
1506 */
1507
1508 static BOOL
get_ucp(const pcre_uchar ** ptrptr,BOOL * negptr,unsigned int * ptypeptr,unsigned int * pdataptr,int * errorcodeptr)1509 get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
1510 unsigned int *pdataptr, int *errorcodeptr)
1511 {
1512 pcre_uchar c;
1513 int i, bot, top;
1514 const pcre_uchar *ptr = *ptrptr;
1515 pcre_uchar name[32];
1516
1517 c = *(++ptr);
1518 if (c == CHAR_NULL) goto ERROR_RETURN;
1519
1520 *negptr = FALSE;
1521
1522 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1523 negation. */
1524
1525 if (c == CHAR_LEFT_CURLY_BRACKET)
1526 {
1527 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1528 {
1529 *negptr = TRUE;
1530 ptr++;
1531 }
1532 for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1533 {
1534 c = *(++ptr);
1535 if (c == CHAR_NULL) goto ERROR_RETURN;
1536 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1537 name[i] = c;
1538 }
1539 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1540 name[i] = 0;
1541 }
1542
1543 /* Otherwise there is just one following character */
1544
1545 else
1546 {
1547 name[0] = c;
1548 name[1] = 0;
1549 }
1550
1551 *ptrptr = ptr;
1552
1553 /* Search for a recognized property name using binary chop */
1554
1555 bot = 0;
1556 top = PRIV(utt_size);
1557
1558 while (bot < top)
1559 {
1560 int r;
1561 i = (bot + top) >> 1;
1562 r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1563 if (r == 0)
1564 {
1565 *ptypeptr = PRIV(utt)[i].type;
1566 *pdataptr = PRIV(utt)[i].value;
1567 return TRUE;
1568 }
1569 if (r > 0) bot = i + 1; else top = i;
1570 }
1571
1572 *errorcodeptr = ERR47;
1573 *ptrptr = ptr;
1574 return FALSE;
1575
1576 ERROR_RETURN:
1577 *errorcodeptr = ERR46;
1578 *ptrptr = ptr;
1579 return FALSE;
1580 }
1581 #endif
1582
1583
1584
1585 /*************************************************
1586 * Read repeat counts *
1587 *************************************************/
1588
1589 /* Read an item of the form {n,m} and return the values. This is called only
1590 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1591 so the syntax is guaranteed to be correct, but we need to check the values.
1592
1593 Arguments:
1594 p pointer to first char after '{'
1595 minp pointer to int for min
1596 maxp pointer to int for max
1597 returned as -1 if no max
1598 errorcodeptr points to error code variable
1599
1600 Returns: pointer to '}' on success;
1601 current ptr on error, with errorcodeptr set non-zero
1602 */
1603
1604 static const pcre_uchar *
read_repeat_counts(const pcre_uchar * p,int * minp,int * maxp,int * errorcodeptr)1605 read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1606 {
1607 int min = 0;
1608 int max = -1;
1609
1610 while (IS_DIGIT(*p))
1611 {
1612 min = min * 10 + (int)(*p++ - CHAR_0);
1613 if (min > 65535)
1614 {
1615 *errorcodeptr = ERR5;
1616 return p;
1617 }
1618 }
1619
1620 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1621 {
1622 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1623 {
1624 max = 0;
1625 while(IS_DIGIT(*p))
1626 {
1627 max = max * 10 + (int)(*p++ - CHAR_0);
1628 if (max > 65535)
1629 {
1630 *errorcodeptr = ERR5;
1631 return p;
1632 }
1633 }
1634 if (max < min)
1635 {
1636 *errorcodeptr = ERR4;
1637 return p;
1638 }
1639 }
1640 }
1641
1642 *minp = min;
1643 *maxp = max;
1644 return p;
1645 }
1646
1647
1648
1649 /*************************************************
1650 * Find first significant op code *
1651 *************************************************/
1652
1653 /* This is called by several functions that scan a compiled expression looking
1654 for a fixed first character, or an anchoring op code etc. It skips over things
1655 that do not influence this. For some calls, it makes sense to skip negative
1656 forward and all backward assertions, and also the \b assertion; for others it
1657 does not.
1658
1659 Arguments:
1660 code pointer to the start of the group
1661 skipassert TRUE if certain assertions are to be skipped
1662
1663 Returns: pointer to the first significant opcode
1664 */
1665
1666 static const pcre_uchar*
first_significant_code(const pcre_uchar * code,BOOL skipassert)1667 first_significant_code(const pcre_uchar *code, BOOL skipassert)
1668 {
1669 for (;;)
1670 {
1671 switch ((int)*code)
1672 {
1673 case OP_ASSERT_NOT:
1674 case OP_ASSERTBACK:
1675 case OP_ASSERTBACK_NOT:
1676 if (!skipassert) return code;
1677 do code += GET(code, 1); while (*code == OP_ALT);
1678 code += PRIV(OP_lengths)[*code];
1679 break;
1680
1681 case OP_WORD_BOUNDARY:
1682 case OP_NOT_WORD_BOUNDARY:
1683 if (!skipassert) return code;
1684 /* Fall through */
1685
1686 case OP_CALLOUT:
1687 case OP_CREF:
1688 case OP_DNCREF:
1689 case OP_RREF:
1690 case OP_DNRREF:
1691 case OP_DEF:
1692 code += PRIV(OP_lengths)[*code];
1693 break;
1694
1695 default:
1696 return code;
1697 }
1698 }
1699 /* Control never reaches here */
1700 }
1701
1702
1703
1704 /*************************************************
1705 * Find the fixed length of a branch *
1706 *************************************************/
1707
1708 /* Scan a branch and compute the fixed length of subject that will match it,
1709 if the length is fixed. This is needed for dealing with backward assertions.
1710 In UTF8 mode, the result is in characters rather than bytes. The branch is
1711 temporarily terminated with OP_END when this function is called.
1712
1713 This function is called when a backward assertion is encountered, so that if it
1714 fails, the error message can point to the correct place in the pattern.
1715 However, we cannot do this when the assertion contains subroutine calls,
1716 because they can be forward references. We solve this by remembering this case
1717 and doing the check at the end; a flag specifies which mode we are running in.
1718
1719 Arguments:
1720 code points to the start of the pattern (the bracket)
1721 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
1722 atend TRUE if called when the pattern is complete
1723 cd the "compile data" structure
1724 recurses chain of recurse_check to catch mutual recursion
1725
1726 Returns: the fixed length,
1727 or -1 if there is no fixed length,
1728 or -2 if \C was encountered (in UTF-8 mode only)
1729 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1730 or -4 if an unknown opcode was encountered (internal error)
1731 */
1732
1733 static int
find_fixedlength(pcre_uchar * code,BOOL utf,BOOL atend,compile_data * cd,recurse_check * recurses)1734 find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd,
1735 recurse_check *recurses)
1736 {
1737 int length = -1;
1738 recurse_check this_recurse;
1739 register int branchlength = 0;
1740 register pcre_uchar *cc = code + 1 + LINK_SIZE;
1741
1742 /* Scan along the opcodes for this branch. If we get to the end of the
1743 branch, check the length against that of the other branches. */
1744
1745 for (;;)
1746 {
1747 int d;
1748 pcre_uchar *ce, *cs;
1749 register pcre_uchar op = *cc;
1750
1751 switch (op)
1752 {
1753 /* We only need to continue for OP_CBRA (normal capturing bracket) and
1754 OP_BRA (normal non-capturing bracket) because the other variants of these
1755 opcodes are all concerned with unlimited repeated groups, which of course
1756 are not of fixed length. */
1757
1758 case OP_CBRA:
1759 case OP_BRA:
1760 case OP_ONCE:
1761 case OP_ONCE_NC:
1762 case OP_COND:
1763 d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd,
1764 recurses);
1765 if (d < 0) return d;
1766 branchlength += d;
1767 do cc += GET(cc, 1); while (*cc == OP_ALT);
1768 cc += 1 + LINK_SIZE;
1769 break;
1770
1771 /* Reached end of a branch; if it's a ket it is the end of a nested call.
1772 If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1773 an ALT. If it is END it's the end of the outer call. All can be handled by
1774 the same code. Note that we must not include the OP_KETRxxx opcodes here,
1775 because they all imply an unlimited repeat. */
1776
1777 case OP_ALT:
1778 case OP_KET:
1779 case OP_END:
1780 case OP_ACCEPT:
1781 case OP_ASSERT_ACCEPT:
1782 if (length < 0) length = branchlength;
1783 else if (length != branchlength) return -1;
1784 if (*cc != OP_ALT) return length;
1785 cc += 1 + LINK_SIZE;
1786 branchlength = 0;
1787 break;
1788
1789 /* A true recursion implies not fixed length, but a subroutine call may
1790 be OK. If the subroutine is a forward reference, we can't deal with
1791 it until the end of the pattern, so return -3. */
1792
1793 case OP_RECURSE:
1794 if (!atend) return -3;
1795 cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1796 do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1797 if (cc > cs && cc < ce) return -1; /* Recursion */
1798 else /* Check for mutual recursion */
1799 {
1800 recurse_check *r = recurses;
1801 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
1802 if (r != NULL) return -1; /* Mutual recursion */
1803 }
1804 this_recurse.prev = recurses;
1805 this_recurse.group = cs;
1806 d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd, &this_recurse);
1807 if (d < 0) return d;
1808 branchlength += d;
1809 cc += 1 + LINK_SIZE;
1810 break;
1811
1812 /* Skip over assertive subpatterns */
1813
1814 case OP_ASSERT:
1815 case OP_ASSERT_NOT:
1816 case OP_ASSERTBACK:
1817 case OP_ASSERTBACK_NOT:
1818 do cc += GET(cc, 1); while (*cc == OP_ALT);
1819 cc += 1 + LINK_SIZE;
1820 break;
1821
1822 /* Skip over things that don't match chars */
1823
1824 case OP_MARK:
1825 case OP_PRUNE_ARG:
1826 case OP_SKIP_ARG:
1827 case OP_THEN_ARG:
1828 cc += cc[1] + PRIV(OP_lengths)[*cc];
1829 break;
1830
1831 case OP_CALLOUT:
1832 case OP_CIRC:
1833 case OP_CIRCM:
1834 case OP_CLOSE:
1835 case OP_COMMIT:
1836 case OP_CREF:
1837 case OP_DEF:
1838 case OP_DNCREF:
1839 case OP_DNRREF:
1840 case OP_DOLL:
1841 case OP_DOLLM:
1842 case OP_EOD:
1843 case OP_EODN:
1844 case OP_FAIL:
1845 case OP_NOT_WORD_BOUNDARY:
1846 case OP_PRUNE:
1847 case OP_REVERSE:
1848 case OP_RREF:
1849 case OP_SET_SOM:
1850 case OP_SKIP:
1851 case OP_SOD:
1852 case OP_SOM:
1853 case OP_THEN:
1854 case OP_WORD_BOUNDARY:
1855 cc += PRIV(OP_lengths)[*cc];
1856 break;
1857
1858 /* Handle literal characters */
1859
1860 case OP_CHAR:
1861 case OP_CHARI:
1862 case OP_NOT:
1863 case OP_NOTI:
1864 branchlength++;
1865 cc += 2;
1866 #ifdef SUPPORT_UTF
1867 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1868 #endif
1869 break;
1870
1871 /* Handle exact repetitions. The count is already in characters, but we
1872 need to skip over a multibyte character in UTF8 mode. */
1873
1874 case OP_EXACT:
1875 case OP_EXACTI:
1876 case OP_NOTEXACT:
1877 case OP_NOTEXACTI:
1878 branchlength += (int)GET2(cc,1);
1879 cc += 2 + IMM2_SIZE;
1880 #ifdef SUPPORT_UTF
1881 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1882 #endif
1883 break;
1884
1885 case OP_TYPEEXACT:
1886 branchlength += GET2(cc,1);
1887 if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
1888 cc += 2;
1889 cc += 1 + IMM2_SIZE + 1;
1890 break;
1891
1892 /* Handle single-char matchers */
1893
1894 case OP_PROP:
1895 case OP_NOTPROP:
1896 cc += 2;
1897 /* Fall through */
1898
1899 case OP_HSPACE:
1900 case OP_VSPACE:
1901 case OP_NOT_HSPACE:
1902 case OP_NOT_VSPACE:
1903 case OP_NOT_DIGIT:
1904 case OP_DIGIT:
1905 case OP_NOT_WHITESPACE:
1906 case OP_WHITESPACE:
1907 case OP_NOT_WORDCHAR:
1908 case OP_WORDCHAR:
1909 case OP_ANY:
1910 case OP_ALLANY:
1911 branchlength++;
1912 cc++;
1913 break;
1914
1915 /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1916 otherwise \C is coded as OP_ALLANY. */
1917
1918 case OP_ANYBYTE:
1919 return -2;
1920
1921 /* Check a class for variable quantification */
1922
1923 case OP_CLASS:
1924 case OP_NCLASS:
1925 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1926 case OP_XCLASS:
1927 /* The original code caused an unsigned overflow in 64 bit systems,
1928 so now we use a conditional statement. */
1929 if (op == OP_XCLASS)
1930 cc += GET(cc, 1);
1931 else
1932 cc += PRIV(OP_lengths)[OP_CLASS];
1933 #else
1934 cc += PRIV(OP_lengths)[OP_CLASS];
1935 #endif
1936
1937 switch (*cc)
1938 {
1939 case OP_CRSTAR:
1940 case OP_CRMINSTAR:
1941 case OP_CRPLUS:
1942 case OP_CRMINPLUS:
1943 case OP_CRQUERY:
1944 case OP_CRMINQUERY:
1945 case OP_CRPOSSTAR:
1946 case OP_CRPOSPLUS:
1947 case OP_CRPOSQUERY:
1948 return -1;
1949
1950 case OP_CRRANGE:
1951 case OP_CRMINRANGE:
1952 case OP_CRPOSRANGE:
1953 if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1954 branchlength += (int)GET2(cc,1);
1955 cc += 1 + 2 * IMM2_SIZE;
1956 break;
1957
1958 default:
1959 branchlength++;
1960 }
1961 break;
1962
1963 /* Anything else is variable length */
1964
1965 case OP_ANYNL:
1966 case OP_BRAMINZERO:
1967 case OP_BRAPOS:
1968 case OP_BRAPOSZERO:
1969 case OP_BRAZERO:
1970 case OP_CBRAPOS:
1971 case OP_EXTUNI:
1972 case OP_KETRMAX:
1973 case OP_KETRMIN:
1974 case OP_KETRPOS:
1975 case OP_MINPLUS:
1976 case OP_MINPLUSI:
1977 case OP_MINQUERY:
1978 case OP_MINQUERYI:
1979 case OP_MINSTAR:
1980 case OP_MINSTARI:
1981 case OP_MINUPTO:
1982 case OP_MINUPTOI:
1983 case OP_NOTMINPLUS:
1984 case OP_NOTMINPLUSI:
1985 case OP_NOTMINQUERY:
1986 case OP_NOTMINQUERYI:
1987 case OP_NOTMINSTAR:
1988 case OP_NOTMINSTARI:
1989 case OP_NOTMINUPTO:
1990 case OP_NOTMINUPTOI:
1991 case OP_NOTPLUS:
1992 case OP_NOTPLUSI:
1993 case OP_NOTPOSPLUS:
1994 case OP_NOTPOSPLUSI:
1995 case OP_NOTPOSQUERY:
1996 case OP_NOTPOSQUERYI:
1997 case OP_NOTPOSSTAR:
1998 case OP_NOTPOSSTARI:
1999 case OP_NOTPOSUPTO:
2000 case OP_NOTPOSUPTOI:
2001 case OP_NOTQUERY:
2002 case OP_NOTQUERYI:
2003 case OP_NOTSTAR:
2004 case OP_NOTSTARI:
2005 case OP_NOTUPTO:
2006 case OP_NOTUPTOI:
2007 case OP_PLUS:
2008 case OP_PLUSI:
2009 case OP_POSPLUS:
2010 case OP_POSPLUSI:
2011 case OP_POSQUERY:
2012 case OP_POSQUERYI:
2013 case OP_POSSTAR:
2014 case OP_POSSTARI:
2015 case OP_POSUPTO:
2016 case OP_POSUPTOI:
2017 case OP_QUERY:
2018 case OP_QUERYI:
2019 case OP_REF:
2020 case OP_REFI:
2021 case OP_DNREF:
2022 case OP_DNREFI:
2023 case OP_SBRA:
2024 case OP_SBRAPOS:
2025 case OP_SCBRA:
2026 case OP_SCBRAPOS:
2027 case OP_SCOND:
2028 case OP_SKIPZERO:
2029 case OP_STAR:
2030 case OP_STARI:
2031 case OP_TYPEMINPLUS:
2032 case OP_TYPEMINQUERY:
2033 case OP_TYPEMINSTAR:
2034 case OP_TYPEMINUPTO:
2035 case OP_TYPEPLUS:
2036 case OP_TYPEPOSPLUS:
2037 case OP_TYPEPOSQUERY:
2038 case OP_TYPEPOSSTAR:
2039 case OP_TYPEPOSUPTO:
2040 case OP_TYPEQUERY:
2041 case OP_TYPESTAR:
2042 case OP_TYPEUPTO:
2043 case OP_UPTO:
2044 case OP_UPTOI:
2045 return -1;
2046
2047 /* Catch unrecognized opcodes so that when new ones are added they
2048 are not forgotten, as has happened in the past. */
2049
2050 default:
2051 return -4;
2052 }
2053 }
2054 /* Control never gets here */
2055 }
2056
2057
2058
2059 /*************************************************
2060 * Scan compiled regex for specific bracket *
2061 *************************************************/
2062
2063 /* This little function scans through a compiled pattern until it finds a
2064 capturing bracket with the given number, or, if the number is negative, an
2065 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
2066 so that it can be called from pcre_study() when finding the minimum matching
2067 length.
2068
2069 Arguments:
2070 code points to start of expression
2071 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2072 number the required bracket number or negative to find a lookbehind
2073
2074 Returns: pointer to the opcode for the bracket, or NULL if not found
2075 */
2076
2077 const pcre_uchar *
PRIV(find_bracket)2078 PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2079 {
2080 for (;;)
2081 {
2082 register pcre_uchar c = *code;
2083
2084 if (c == OP_END) return NULL;
2085
2086 /* XCLASS is used for classes that cannot be represented just by a bit
2087 map. This includes negated single high-valued characters. The length in
2088 the table is zero; the actual length is stored in the compiled code. */
2089
2090 if (c == OP_XCLASS) code += GET(code, 1);
2091
2092 /* Handle recursion */
2093
2094 else if (c == OP_REVERSE)
2095 {
2096 if (number < 0) return (pcre_uchar *)code;
2097 code += PRIV(OP_lengths)[c];
2098 }
2099
2100 /* Handle capturing bracket */
2101
2102 else if (c == OP_CBRA || c == OP_SCBRA ||
2103 c == OP_CBRAPOS || c == OP_SCBRAPOS)
2104 {
2105 int n = (int)GET2(code, 1+LINK_SIZE);
2106 if (n == number) return (pcre_uchar *)code;
2107 code += PRIV(OP_lengths)[c];
2108 }
2109
2110 /* Otherwise, we can get the item's length from the table, except that for
2111 repeated character types, we have to test for \p and \P, which have an extra
2112 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2113 must add in its length. */
2114
2115 else
2116 {
2117 switch(c)
2118 {
2119 case OP_TYPESTAR:
2120 case OP_TYPEMINSTAR:
2121 case OP_TYPEPLUS:
2122 case OP_TYPEMINPLUS:
2123 case OP_TYPEQUERY:
2124 case OP_TYPEMINQUERY:
2125 case OP_TYPEPOSSTAR:
2126 case OP_TYPEPOSPLUS:
2127 case OP_TYPEPOSQUERY:
2128 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2129 break;
2130
2131 case OP_TYPEUPTO:
2132 case OP_TYPEMINUPTO:
2133 case OP_TYPEEXACT:
2134 case OP_TYPEPOSUPTO:
2135 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2136 code += 2;
2137 break;
2138
2139 case OP_MARK:
2140 case OP_PRUNE_ARG:
2141 case OP_SKIP_ARG:
2142 case OP_THEN_ARG:
2143 code += code[1];
2144 break;
2145 }
2146
2147 /* Add in the fixed length from the table */
2148
2149 code += PRIV(OP_lengths)[c];
2150
2151 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2152 a multi-byte character. The length in the table is a minimum, so we have to
2153 arrange to skip the extra bytes. */
2154
2155 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2156 if (utf) switch(c)
2157 {
2158 case OP_CHAR:
2159 case OP_CHARI:
2160 case OP_NOT:
2161 case OP_NOTI:
2162 case OP_EXACT:
2163 case OP_EXACTI:
2164 case OP_NOTEXACT:
2165 case OP_NOTEXACTI:
2166 case OP_UPTO:
2167 case OP_UPTOI:
2168 case OP_NOTUPTO:
2169 case OP_NOTUPTOI:
2170 case OP_MINUPTO:
2171 case OP_MINUPTOI:
2172 case OP_NOTMINUPTO:
2173 case OP_NOTMINUPTOI:
2174 case OP_POSUPTO:
2175 case OP_POSUPTOI:
2176 case OP_NOTPOSUPTO:
2177 case OP_NOTPOSUPTOI:
2178 case OP_STAR:
2179 case OP_STARI:
2180 case OP_NOTSTAR:
2181 case OP_NOTSTARI:
2182 case OP_MINSTAR:
2183 case OP_MINSTARI:
2184 case OP_NOTMINSTAR:
2185 case OP_NOTMINSTARI:
2186 case OP_POSSTAR:
2187 case OP_POSSTARI:
2188 case OP_NOTPOSSTAR:
2189 case OP_NOTPOSSTARI:
2190 case OP_PLUS:
2191 case OP_PLUSI:
2192 case OP_NOTPLUS:
2193 case OP_NOTPLUSI:
2194 case OP_MINPLUS:
2195 case OP_MINPLUSI:
2196 case OP_NOTMINPLUS:
2197 case OP_NOTMINPLUSI:
2198 case OP_POSPLUS:
2199 case OP_POSPLUSI:
2200 case OP_NOTPOSPLUS:
2201 case OP_NOTPOSPLUSI:
2202 case OP_QUERY:
2203 case OP_QUERYI:
2204 case OP_NOTQUERY:
2205 case OP_NOTQUERYI:
2206 case OP_MINQUERY:
2207 case OP_MINQUERYI:
2208 case OP_NOTMINQUERY:
2209 case OP_NOTMINQUERYI:
2210 case OP_POSQUERY:
2211 case OP_POSQUERYI:
2212 case OP_NOTPOSQUERY:
2213 case OP_NOTPOSQUERYI:
2214 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2215 break;
2216 }
2217 #else
2218 (void)(utf); /* Keep compiler happy by referencing function argument */
2219 #endif
2220 }
2221 }
2222 }
2223
2224
2225
2226 /*************************************************
2227 * Scan compiled regex for recursion reference *
2228 *************************************************/
2229
2230 /* This little function scans through a compiled pattern until it finds an
2231 instance of OP_RECURSE.
2232
2233 Arguments:
2234 code points to start of expression
2235 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2236
2237 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
2238 */
2239
2240 static const pcre_uchar *
find_recurse(const pcre_uchar * code,BOOL utf)2241 find_recurse(const pcre_uchar *code, BOOL utf)
2242 {
2243 for (;;)
2244 {
2245 register pcre_uchar c = *code;
2246 if (c == OP_END) return NULL;
2247 if (c == OP_RECURSE) return code;
2248
2249 /* XCLASS is used for classes that cannot be represented just by a bit
2250 map. This includes negated single high-valued characters. The length in
2251 the table is zero; the actual length is stored in the compiled code. */
2252
2253 if (c == OP_XCLASS) code += GET(code, 1);
2254
2255 /* Otherwise, we can get the item's length from the table, except that for
2256 repeated character types, we have to test for \p and \P, which have an extra
2257 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2258 must add in its length. */
2259
2260 else
2261 {
2262 switch(c)
2263 {
2264 case OP_TYPESTAR:
2265 case OP_TYPEMINSTAR:
2266 case OP_TYPEPLUS:
2267 case OP_TYPEMINPLUS:
2268 case OP_TYPEQUERY:
2269 case OP_TYPEMINQUERY:
2270 case OP_TYPEPOSSTAR:
2271 case OP_TYPEPOSPLUS:
2272 case OP_TYPEPOSQUERY:
2273 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2274 break;
2275
2276 case OP_TYPEPOSUPTO:
2277 case OP_TYPEUPTO:
2278 case OP_TYPEMINUPTO:
2279 case OP_TYPEEXACT:
2280 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2281 code += 2;
2282 break;
2283
2284 case OP_MARK:
2285 case OP_PRUNE_ARG:
2286 case OP_SKIP_ARG:
2287 case OP_THEN_ARG:
2288 code += code[1];
2289 break;
2290 }
2291
2292 /* Add in the fixed length from the table */
2293
2294 code += PRIV(OP_lengths)[c];
2295
2296 /* In UTF-8 mode, opcodes that are followed by a character may be followed
2297 by a multi-byte character. The length in the table is a minimum, so we have
2298 to arrange to skip the extra bytes. */
2299
2300 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2301 if (utf) switch(c)
2302 {
2303 case OP_CHAR:
2304 case OP_CHARI:
2305 case OP_NOT:
2306 case OP_NOTI:
2307 case OP_EXACT:
2308 case OP_EXACTI:
2309 case OP_NOTEXACT:
2310 case OP_NOTEXACTI:
2311 case OP_UPTO:
2312 case OP_UPTOI:
2313 case OP_NOTUPTO:
2314 case OP_NOTUPTOI:
2315 case OP_MINUPTO:
2316 case OP_MINUPTOI:
2317 case OP_NOTMINUPTO:
2318 case OP_NOTMINUPTOI:
2319 case OP_POSUPTO:
2320 case OP_POSUPTOI:
2321 case OP_NOTPOSUPTO:
2322 case OP_NOTPOSUPTOI:
2323 case OP_STAR:
2324 case OP_STARI:
2325 case OP_NOTSTAR:
2326 case OP_NOTSTARI:
2327 case OP_MINSTAR:
2328 case OP_MINSTARI:
2329 case OP_NOTMINSTAR:
2330 case OP_NOTMINSTARI:
2331 case OP_POSSTAR:
2332 case OP_POSSTARI:
2333 case OP_NOTPOSSTAR:
2334 case OP_NOTPOSSTARI:
2335 case OP_PLUS:
2336 case OP_PLUSI:
2337 case OP_NOTPLUS:
2338 case OP_NOTPLUSI:
2339 case OP_MINPLUS:
2340 case OP_MINPLUSI:
2341 case OP_NOTMINPLUS:
2342 case OP_NOTMINPLUSI:
2343 case OP_POSPLUS:
2344 case OP_POSPLUSI:
2345 case OP_NOTPOSPLUS:
2346 case OP_NOTPOSPLUSI:
2347 case OP_QUERY:
2348 case OP_QUERYI:
2349 case OP_NOTQUERY:
2350 case OP_NOTQUERYI:
2351 case OP_MINQUERY:
2352 case OP_MINQUERYI:
2353 case OP_NOTMINQUERY:
2354 case OP_NOTMINQUERYI:
2355 case OP_POSQUERY:
2356 case OP_POSQUERYI:
2357 case OP_NOTPOSQUERY:
2358 case OP_NOTPOSQUERYI:
2359 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2360 break;
2361 }
2362 #else
2363 (void)(utf); /* Keep compiler happy by referencing function argument */
2364 #endif
2365 }
2366 }
2367 }
2368
2369
2370
2371 /*************************************************
2372 * Scan compiled branch for non-emptiness *
2373 *************************************************/
2374
2375 /* This function scans through a branch of a compiled pattern to see whether it
2376 can match the empty string or not. It is called from could_be_empty()
2377 below and from compile_branch() when checking for an unlimited repeat of a
2378 group that can match nothing. Note that first_significant_code() skips over
2379 backward and negative forward assertions when its final argument is TRUE. If we
2380 hit an unclosed bracket, we return "empty" - this means we've struck an inner
2381 bracket whose current branch will already have been scanned.
2382
2383 Arguments:
2384 code points to start of search
2385 endcode points to where to stop
2386 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2387 cd contains pointers to tables etc.
2388 recurses chain of recurse_check to catch mutual recursion
2389
2390 Returns: TRUE if what is matched could be empty
2391 */
2392
2393 static BOOL
could_be_empty_branch(const pcre_uchar * code,const pcre_uchar * endcode,BOOL utf,compile_data * cd,recurse_check * recurses)2394 could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2395 BOOL utf, compile_data *cd, recurse_check *recurses)
2396 {
2397 register pcre_uchar c;
2398 recurse_check this_recurse;
2399
2400 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2401 code < endcode;
2402 code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2403 {
2404 const pcre_uchar *ccode;
2405
2406 c = *code;
2407
2408 /* Skip over forward assertions; the other assertions are skipped by
2409 first_significant_code() with a TRUE final argument. */
2410
2411 if (c == OP_ASSERT)
2412 {
2413 do code += GET(code, 1); while (*code == OP_ALT);
2414 c = *code;
2415 continue;
2416 }
2417
2418 /* For a recursion/subroutine call, if its end has been reached, which
2419 implies a backward reference subroutine call, we can scan it. If it's a
2420 forward reference subroutine call, we can't. To detect forward reference
2421 we have to scan up the list that is kept in the workspace. This function is
2422 called only when doing the real compile, not during the pre-compile that
2423 measures the size of the compiled pattern. */
2424
2425 if (c == OP_RECURSE)
2426 {
2427 const pcre_uchar *scode = cd->start_code + GET(code, 1);
2428 const pcre_uchar *endgroup = scode;
2429 BOOL empty_branch;
2430
2431 /* Test for forward reference or uncompleted reference. This is disabled
2432 when called to scan a completed pattern by setting cd->start_workspace to
2433 NULL. */
2434
2435 if (cd->start_workspace != NULL)
2436 {
2437 const pcre_uchar *tcode;
2438 for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
2439 if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2440 if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
2441 }
2442
2443 /* If the reference is to a completed group, we need to detect whether this
2444 is a recursive call, as otherwise there will be an infinite loop. If it is
2445 a recursion, just skip over it. Simple recursions are easily detected. For
2446 mutual recursions we keep a chain on the stack. */
2447
2448 do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
2449 if (code >= scode && code <= endgroup) continue; /* Simple recursion */
2450 else
2451 {
2452 recurse_check *r = recurses;
2453 for (r = recurses; r != NULL; r = r->prev)
2454 if (r->group == scode) break;
2455 if (r != NULL) continue; /* Mutual recursion */
2456 }
2457
2458 /* Completed reference; scan the referenced group, remembering it on the
2459 stack chain to detect mutual recursions. */
2460
2461 empty_branch = FALSE;
2462 this_recurse.prev = recurses;
2463 this_recurse.group = scode;
2464
2465 do
2466 {
2467 if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
2468 {
2469 empty_branch = TRUE;
2470 break;
2471 }
2472 scode += GET(scode, 1);
2473 }
2474 while (*scode == OP_ALT);
2475
2476 if (!empty_branch) return FALSE; /* All branches are non-empty */
2477 continue;
2478 }
2479
2480 /* Groups with zero repeats can of course be empty; skip them. */
2481
2482 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2483 c == OP_BRAPOSZERO)
2484 {
2485 code += PRIV(OP_lengths)[c];
2486 do code += GET(code, 1); while (*code == OP_ALT);
2487 c = *code;
2488 continue;
2489 }
2490
2491 /* A nested group that is already marked as "could be empty" can just be
2492 skipped. */
2493
2494 if (c == OP_SBRA || c == OP_SBRAPOS ||
2495 c == OP_SCBRA || c == OP_SCBRAPOS)
2496 {
2497 do code += GET(code, 1); while (*code == OP_ALT);
2498 c = *code;
2499 continue;
2500 }
2501
2502 /* For other groups, scan the branches. */
2503
2504 if (c == OP_BRA || c == OP_BRAPOS ||
2505 c == OP_CBRA || c == OP_CBRAPOS ||
2506 c == OP_ONCE || c == OP_ONCE_NC ||
2507 c == OP_COND || c == OP_SCOND)
2508 {
2509 BOOL empty_branch;
2510 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
2511
2512 /* If a conditional group has only one branch, there is a second, implied,
2513 empty branch, so just skip over the conditional, because it could be empty.
2514 Otherwise, scan the individual branches of the group. */
2515
2516 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2517 code += GET(code, 1);
2518 else
2519 {
2520 empty_branch = FALSE;
2521 do
2522 {
2523 if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd,
2524 recurses)) empty_branch = TRUE;
2525 code += GET(code, 1);
2526 }
2527 while (*code == OP_ALT);
2528 if (!empty_branch) return FALSE; /* All branches are non-empty */
2529 }
2530
2531 c = *code;
2532 continue;
2533 }
2534
2535 /* Handle the other opcodes */
2536
2537 switch (c)
2538 {
2539 /* Check for quantifiers after a class. XCLASS is used for classes that
2540 cannot be represented just by a bit map. This includes negated single
2541 high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2542 actual length is stored in the compiled code, so we must update "code"
2543 here. */
2544
2545 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2546 case OP_XCLASS:
2547 ccode = code += GET(code, 1);
2548 goto CHECK_CLASS_REPEAT;
2549 #endif
2550
2551 case OP_CLASS:
2552 case OP_NCLASS:
2553 ccode = code + PRIV(OP_lengths)[OP_CLASS];
2554
2555 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2556 CHECK_CLASS_REPEAT:
2557 #endif
2558
2559 switch (*ccode)
2560 {
2561 case OP_CRSTAR: /* These could be empty; continue */
2562 case OP_CRMINSTAR:
2563 case OP_CRQUERY:
2564 case OP_CRMINQUERY:
2565 case OP_CRPOSSTAR:
2566 case OP_CRPOSQUERY:
2567 break;
2568
2569 default: /* Non-repeat => class must match */
2570 case OP_CRPLUS: /* These repeats aren't empty */
2571 case OP_CRMINPLUS:
2572 case OP_CRPOSPLUS:
2573 return FALSE;
2574
2575 case OP_CRRANGE:
2576 case OP_CRMINRANGE:
2577 case OP_CRPOSRANGE:
2578 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
2579 break;
2580 }
2581 break;
2582
2583 /* Opcodes that must match a character */
2584
2585 case OP_ANY:
2586 case OP_ALLANY:
2587 case OP_ANYBYTE:
2588
2589 case OP_PROP:
2590 case OP_NOTPROP:
2591 case OP_ANYNL:
2592
2593 case OP_NOT_HSPACE:
2594 case OP_HSPACE:
2595 case OP_NOT_VSPACE:
2596 case OP_VSPACE:
2597 case OP_EXTUNI:
2598
2599 case OP_NOT_DIGIT:
2600 case OP_DIGIT:
2601 case OP_NOT_WHITESPACE:
2602 case OP_WHITESPACE:
2603 case OP_NOT_WORDCHAR:
2604 case OP_WORDCHAR:
2605
2606 case OP_CHAR:
2607 case OP_CHARI:
2608 case OP_NOT:
2609 case OP_NOTI:
2610
2611 case OP_PLUS:
2612 case OP_PLUSI:
2613 case OP_MINPLUS:
2614 case OP_MINPLUSI:
2615
2616 case OP_NOTPLUS:
2617 case OP_NOTPLUSI:
2618 case OP_NOTMINPLUS:
2619 case OP_NOTMINPLUSI:
2620
2621 case OP_POSPLUS:
2622 case OP_POSPLUSI:
2623 case OP_NOTPOSPLUS:
2624 case OP_NOTPOSPLUSI:
2625
2626 case OP_EXACT:
2627 case OP_EXACTI:
2628 case OP_NOTEXACT:
2629 case OP_NOTEXACTI:
2630
2631 case OP_TYPEPLUS:
2632 case OP_TYPEMINPLUS:
2633 case OP_TYPEPOSPLUS:
2634 case OP_TYPEEXACT:
2635
2636 return FALSE;
2637
2638 /* These are going to continue, as they may be empty, but we have to
2639 fudge the length for the \p and \P cases. */
2640
2641 case OP_TYPESTAR:
2642 case OP_TYPEMINSTAR:
2643 case OP_TYPEPOSSTAR:
2644 case OP_TYPEQUERY:
2645 case OP_TYPEMINQUERY:
2646 case OP_TYPEPOSQUERY:
2647 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2648 break;
2649
2650 /* Same for these */
2651
2652 case OP_TYPEUPTO:
2653 case OP_TYPEMINUPTO:
2654 case OP_TYPEPOSUPTO:
2655 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2656 code += 2;
2657 break;
2658
2659 /* End of branch */
2660
2661 case OP_KET:
2662 case OP_KETRMAX:
2663 case OP_KETRMIN:
2664 case OP_KETRPOS:
2665 case OP_ALT:
2666 return TRUE;
2667
2668 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2669 MINUPTO, and POSUPTO and their caseless and negative versions may be
2670 followed by a multibyte character. */
2671
2672 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2673 case OP_STAR:
2674 case OP_STARI:
2675 case OP_NOTSTAR:
2676 case OP_NOTSTARI:
2677
2678 case OP_MINSTAR:
2679 case OP_MINSTARI:
2680 case OP_NOTMINSTAR:
2681 case OP_NOTMINSTARI:
2682
2683 case OP_POSSTAR:
2684 case OP_POSSTARI:
2685 case OP_NOTPOSSTAR:
2686 case OP_NOTPOSSTARI:
2687
2688 case OP_QUERY:
2689 case OP_QUERYI:
2690 case OP_NOTQUERY:
2691 case OP_NOTQUERYI:
2692
2693 case OP_MINQUERY:
2694 case OP_MINQUERYI:
2695 case OP_NOTMINQUERY:
2696 case OP_NOTMINQUERYI:
2697
2698 case OP_POSQUERY:
2699 case OP_POSQUERYI:
2700 case OP_NOTPOSQUERY:
2701 case OP_NOTPOSQUERYI:
2702
2703 if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2704 break;
2705
2706 case OP_UPTO:
2707 case OP_UPTOI:
2708 case OP_NOTUPTO:
2709 case OP_NOTUPTOI:
2710
2711 case OP_MINUPTO:
2712 case OP_MINUPTOI:
2713 case OP_NOTMINUPTO:
2714 case OP_NOTMINUPTOI:
2715
2716 case OP_POSUPTO:
2717 case OP_POSUPTOI:
2718 case OP_NOTPOSUPTO:
2719 case OP_NOTPOSUPTOI:
2720
2721 if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2722 break;
2723 #endif
2724
2725 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2726 string. */
2727
2728 case OP_MARK:
2729 case OP_PRUNE_ARG:
2730 case OP_SKIP_ARG:
2731 case OP_THEN_ARG:
2732 code += code[1];
2733 break;
2734
2735 /* None of the remaining opcodes are required to match a character. */
2736
2737 default:
2738 break;
2739 }
2740 }
2741
2742 return TRUE;
2743 }
2744
2745
2746
2747 /*************************************************
2748 * Scan compiled regex for non-emptiness *
2749 *************************************************/
2750
2751 /* This function is called to check for left recursive calls. We want to check
2752 the current branch of the current pattern to see if it could match the empty
2753 string. If it could, we must look outwards for branches at other levels,
2754 stopping when we pass beyond the bracket which is the subject of the recursion.
2755 This function is called only during the real compile, not during the
2756 pre-compile.
2757
2758 Arguments:
2759 code points to start of the recursion
2760 endcode points to where to stop (current RECURSE item)
2761 bcptr points to the chain of current (unclosed) branch starts
2762 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2763 cd pointers to tables etc
2764
2765 Returns: TRUE if what is matched could be empty
2766 */
2767
2768 static BOOL
could_be_empty(const pcre_uchar * code,const pcre_uchar * endcode,branch_chain * bcptr,BOOL utf,compile_data * cd)2769 could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2770 branch_chain *bcptr, BOOL utf, compile_data *cd)
2771 {
2772 while (bcptr != NULL && bcptr->current_branch >= code)
2773 {
2774 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd, NULL))
2775 return FALSE;
2776 bcptr = bcptr->outer;
2777 }
2778 return TRUE;
2779 }
2780
2781
2782
2783 /*************************************************
2784 * Base opcode of repeated opcodes *
2785 *************************************************/
2786
2787 /* Returns the base opcode for repeated single character type opcodes. If the
2788 opcode is not a repeated character type, it returns with the original value.
2789
2790 Arguments: c opcode
2791 Returns: base opcode for the type
2792 */
2793
2794 static pcre_uchar
get_repeat_base(pcre_uchar c)2795 get_repeat_base(pcre_uchar c)
2796 {
2797 return (c > OP_TYPEPOSUPTO)? c :
2798 (c >= OP_TYPESTAR)? OP_TYPESTAR :
2799 (c >= OP_NOTSTARI)? OP_NOTSTARI :
2800 (c >= OP_NOTSTAR)? OP_NOTSTAR :
2801 (c >= OP_STARI)? OP_STARI :
2802 OP_STAR;
2803 }
2804
2805
2806
2807 #ifdef SUPPORT_UCP
/*************************************************
*        Check a character and a property        *
*************************************************/

/* This function is called by check_auto_possessive() when a property item
is adjacent to a fixed character.

Arguments:
  c            the character
  ptype        the property type
  pdata        the data for the type
  negated      TRUE if it's a negated property (\P or \p{^)

Returns:       TRUE if auto-possessifying is OK
*/
2823
2824 static BOOL
check_char_prop(pcre_uint32 c,unsigned int ptype,unsigned int pdata,BOOL negated)2825 check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
2826 BOOL negated)
2827 {
2828 const pcre_uint32 *p;
2829 const ucd_record *prop = GET_UCD(c);
2830
2831 switch(ptype)
2832 {
2833 case PT_LAMP:
2834 return (prop->chartype == ucp_Lu ||
2835 prop->chartype == ucp_Ll ||
2836 prop->chartype == ucp_Lt) == negated;
2837
2838 case PT_GC:
2839 return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2840
2841 case PT_PC:
2842 return (pdata == prop->chartype) == negated;
2843
2844 case PT_SC:
2845 return (pdata == prop->script) == negated;
2846
2847 /* These are specials */
2848
2849 case PT_ALNUM:
2850 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2851 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2852
2853 /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
2854 means that Perl space and POSIX space are now identical. PCRE was changed
2855 at release 8.34. */
2856
2857 case PT_SPACE: /* Perl space */
2858 case PT_PXSPACE: /* POSIX space */
2859 switch(c)
2860 {
2861 HSPACE_CASES:
2862 VSPACE_CASES:
2863 return negated;
2864
2865 default:
2866 return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
2867 }
2868 break; /* Control never reaches here */
2869
2870 case PT_WORD:
2871 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2872 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2873 c == CHAR_UNDERSCORE) == negated;
2874
2875 case PT_CLIST:
2876 p = PRIV(ucd_caseless_sets) + prop->caseset;
2877 for (;;)
2878 {
2879 if (c < *p) return !negated;
2880 if (c == *p++) return negated;
2881 }
2882 break; /* Control never reaches here */
2883 }
2884
2885 return FALSE;
2886 }
2887 #endif /* SUPPORT_UCP */
2888
2889
2890
2891 /*************************************************
2892 * Fill the character property list *
2893 *************************************************/
2894
2895 /* Checks whether the code points to an opcode that can take part in auto-
2896 possessification, and if so, fills a list with its properties.
2897
2898 Arguments:
2899 code points to start of expression
2900 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2901 fcc points to case-flipping table
2902 list points to output list
2903 list[0] will be filled with the opcode
2904 list[1] will be non-zero if this opcode
2905 can match an empty character string
2906 list[2..7] depends on the opcode
2907
2908 Returns: points to the start of the next opcode if *code is accepted
2909 NULL if *code is not accepted
2910 */
2911
2912 static const pcre_uchar *
get_chr_property_list(const pcre_uchar * code,BOOL utf,const pcre_uint8 * fcc,pcre_uint32 * list)2913 get_chr_property_list(const pcre_uchar *code, BOOL utf,
2914 const pcre_uint8 *fcc, pcre_uint32 *list)
2915 {
2916 pcre_uchar c = *code;
2917 pcre_uchar base;
2918 const pcre_uchar *end;
2919 pcre_uint32 chr;
2920
2921 #ifdef SUPPORT_UCP
2922 pcre_uint32 *clist_dest;
2923 const pcre_uint32 *clist_src;
2924 #else
2925 utf = utf; /* Suppress "unused parameter" compiler warning */
2926 #endif
2927
2928 list[0] = c;
2929 list[1] = FALSE;
2930 code++;
2931
2932 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
2933 {
2934 base = get_repeat_base(c);
2935 c -= (base - OP_STAR);
2936
2937 if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
2938 code += IMM2_SIZE;
2939
2940 list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
2941
2942 switch(base)
2943 {
2944 case OP_STAR:
2945 list[0] = OP_CHAR;
2946 break;
2947
2948 case OP_STARI:
2949 list[0] = OP_CHARI;
2950 break;
2951
2952 case OP_NOTSTAR:
2953 list[0] = OP_NOT;
2954 break;
2955
2956 case OP_NOTSTARI:
2957 list[0] = OP_NOTI;
2958 break;
2959
2960 case OP_TYPESTAR:
2961 list[0] = *code;
2962 code++;
2963 break;
2964 }
2965 c = list[0];
2966 }
2967
2968 switch(c)
2969 {
2970 case OP_NOT_DIGIT:
2971 case OP_DIGIT:
2972 case OP_NOT_WHITESPACE:
2973 case OP_WHITESPACE:
2974 case OP_NOT_WORDCHAR:
2975 case OP_WORDCHAR:
2976 case OP_ANY:
2977 case OP_ALLANY:
2978 case OP_ANYNL:
2979 case OP_NOT_HSPACE:
2980 case OP_HSPACE:
2981 case OP_NOT_VSPACE:
2982 case OP_VSPACE:
2983 case OP_EXTUNI:
2984 case OP_EODN:
2985 case OP_EOD:
2986 case OP_DOLL:
2987 case OP_DOLLM:
2988 return code;
2989
2990 case OP_CHAR:
2991 case OP_NOT:
2992 GETCHARINCTEST(chr, code);
2993 list[2] = chr;
2994 list[3] = NOTACHAR;
2995 return code;
2996
2997 case OP_CHARI:
2998 case OP_NOTI:
2999 list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
3000 GETCHARINCTEST(chr, code);
3001 list[2] = chr;
3002
3003 #ifdef SUPPORT_UCP
3004 if (chr < 128 || (chr < 256 && !utf))
3005 list[3] = fcc[chr];
3006 else
3007 list[3] = UCD_OTHERCASE(chr);
3008 #elif defined SUPPORT_UTF || !defined COMPILE_PCRE8
3009 list[3] = (chr < 256) ? fcc[chr] : chr;
3010 #else
3011 list[3] = fcc[chr];
3012 #endif
3013
3014 /* The othercase might be the same value. */
3015
3016 if (chr == list[3])
3017 list[3] = NOTACHAR;
3018 else
3019 list[4] = NOTACHAR;
3020 return code;
3021
3022 #ifdef SUPPORT_UCP
3023 case OP_PROP:
3024 case OP_NOTPROP:
3025 if (code[0] != PT_CLIST)
3026 {
3027 list[2] = code[0];
3028 list[3] = code[1];
3029 return code + 2;
3030 }
3031
3032 /* Convert only if we have enough space. */
3033
3034 clist_src = PRIV(ucd_caseless_sets) + code[1];
3035 clist_dest = list + 2;
3036 code += 2;
3037
3038 do {
3039 if (clist_dest >= list + 8)
3040 {
3041 /* Early return if there is not enough space. This should never
3042 happen, since all clists are shorter than 5 character now. */
3043 list[2] = code[0];
3044 list[3] = code[1];
3045 return code;
3046 }
3047 *clist_dest++ = *clist_src;
3048 }
3049 while(*clist_src++ != NOTACHAR);
3050
3051 /* All characters are stored. The terminating NOTACHAR
3052 is copied form the clist itself. */
3053
3054 list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
3055 return code;
3056 #endif
3057
3058 case OP_NCLASS:
3059 case OP_CLASS:
3060 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3061 case OP_XCLASS:
3062 if (c == OP_XCLASS)
3063 end = code + GET(code, 0) - 1;
3064 else
3065 #endif
3066 end = code + 32 / sizeof(pcre_uchar);
3067
3068 switch(*end)
3069 {
3070 case OP_CRSTAR:
3071 case OP_CRMINSTAR:
3072 case OP_CRQUERY:
3073 case OP_CRMINQUERY:
3074 case OP_CRPOSSTAR:
3075 case OP_CRPOSQUERY:
3076 list[1] = TRUE;
3077 end++;
3078 break;
3079
3080 case OP_CRPLUS:
3081 case OP_CRMINPLUS:
3082 case OP_CRPOSPLUS:
3083 end++;
3084 break;
3085
3086 case OP_CRRANGE:
3087 case OP_CRMINRANGE:
3088 case OP_CRPOSRANGE:
3089 list[1] = (GET2(end, 1) == 0);
3090 end += 1 + 2 * IMM2_SIZE;
3091 break;
3092 }
3093 list[2] = (pcre_uint32)(end - code);
3094 return end;
3095 }
3096 return NULL; /* Opcode not accepted */
3097 }
3098
3099
3100
3101 /*************************************************
3102 * Scan further character sets for match *
3103 *************************************************/
3104
3105 /* Checks whether the base and the current opcode have a common character, in
3106 which case the base cannot be possessified.
3107
3108 Arguments:
3109 code points to the byte code
3110 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
3111 cd static compile data
3112 base_list the data list of the base opcode
3113
3114 Returns: TRUE if the auto-possessification is possible
3115 */
3116
3117 static BOOL
compare_opcodes(const pcre_uchar * code,BOOL utf,const compile_data * cd,const pcre_uint32 * base_list,const pcre_uchar * base_end,int * rec_limit)3118 compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
3119 const pcre_uint32 *base_list, const pcre_uchar *base_end, int *rec_limit)
3120 {
3121 pcre_uchar c;
3122 pcre_uint32 list[8];
3123 const pcre_uint32 *chr_ptr;
3124 const pcre_uint32 *ochr_ptr;
3125 const pcre_uint32 *list_ptr;
3126 const pcre_uchar *next_code;
3127 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3128 const pcre_uchar *xclass_flags;
3129 #endif
3130 const pcre_uint8 *class_bitset;
3131 const pcre_uint8 *set1, *set2, *set_end;
3132 pcre_uint32 chr;
3133 BOOL accepted, invert_bits;
3134 BOOL entered_a_group = FALSE;
3135
3136 if (*rec_limit == 0) return FALSE;
3137 --(*rec_limit);
3138
3139 /* Note: the base_list[1] contains whether the current opcode has greedy
3140 (represented by a non-zero value) quantifier. This is a different from
3141 other character type lists, which stores here that the character iterator
3142 matches to an empty string (also represented by a non-zero value). */
3143
3144 for(;;)
3145 {
3146 /* All operations move the code pointer forward.
3147 Therefore infinite recursions are not possible. */
3148
3149 c = *code;
3150
3151 /* Skip over callouts */
3152
3153 if (c == OP_CALLOUT)
3154 {
3155 code += PRIV(OP_lengths)[c];
3156 continue;
3157 }
3158
3159 if (c == OP_ALT)
3160 {
3161 do code += GET(code, 1); while (*code == OP_ALT);
3162 c = *code;
3163 }
3164
3165 switch(c)
3166 {
3167 case OP_END:
3168 case OP_KETRPOS:
3169 /* TRUE only in greedy case. The non-greedy case could be replaced by
3170 an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
3171 uses more memory, which we cannot get at this stage.) */
3172
3173 return base_list[1] != 0;
3174
3175 case OP_KET:
3176 /* If the bracket is capturing, and referenced by an OP_RECURSE, or
3177 it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
3178 cannot be converted to a possessive form. */
3179
3180 if (base_list[1] == 0) return FALSE;
3181
3182 switch(*(code - GET(code, 1)))
3183 {
3184 case OP_ASSERT:
3185 case OP_ASSERT_NOT:
3186 case OP_ASSERTBACK:
3187 case OP_ASSERTBACK_NOT:
3188 case OP_ONCE:
3189 case OP_ONCE_NC:
3190 /* Atomic sub-patterns and assertions can always auto-possessify their
3191 last iterator. However, if the group was entered as a result of checking
3192 a previous iterator, this is not possible. */
3193
3194 return !entered_a_group;
3195 }
3196
3197 code += PRIV(OP_lengths)[c];
3198 continue;
3199
3200 case OP_ONCE:
3201 case OP_ONCE_NC:
3202 case OP_BRA:
3203 case OP_CBRA:
3204 next_code = code + GET(code, 1);
3205 code += PRIV(OP_lengths)[c];
3206
3207 while (*next_code == OP_ALT)
3208 {
3209 if (!compare_opcodes(code, utf, cd, base_list, base_end, rec_limit))
3210 return FALSE;
3211 code = next_code + 1 + LINK_SIZE;
3212 next_code += GET(next_code, 1);
3213 }
3214
3215 entered_a_group = TRUE;
3216 continue;
3217
3218 case OP_BRAZERO:
3219 case OP_BRAMINZERO:
3220
3221 next_code = code + 1;
3222 if (*next_code != OP_BRA && *next_code != OP_CBRA
3223 && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
3224
3225 do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3226
3227 /* The bracket content will be checked by the
3228 OP_BRA/OP_CBRA case above. */
3229 next_code += 1 + LINK_SIZE;
3230 if (!compare_opcodes(next_code, utf, cd, base_list, base_end, rec_limit))
3231 return FALSE;
3232
3233 code += PRIV(OP_lengths)[c];
3234 continue;
3235
3236 default:
3237 break;
3238 }
3239
3240 /* Check for a supported opcode, and load its properties. */
3241
3242 code = get_chr_property_list(code, utf, cd->fcc, list);
3243 if (code == NULL) return FALSE; /* Unsupported */
3244
3245 /* If either opcode is a small character list, set pointers for comparing
3246 characters from that list with another list, or with a property. */
3247
3248 if (base_list[0] == OP_CHAR)
3249 {
3250 chr_ptr = base_list + 2;
3251 list_ptr = list;
3252 }
3253 else if (list[0] == OP_CHAR)
3254 {
3255 chr_ptr = list + 2;
3256 list_ptr = base_list;
3257 }
3258
3259 /* Character bitsets can also be compared to certain opcodes. */
3260
3261 else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
3262 #ifdef COMPILE_PCRE8
3263 /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
3264 || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
3265 #endif
3266 )
3267 {
3268 #ifdef COMPILE_PCRE8
3269 if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
3270 #else
3271 if (base_list[0] == OP_CLASS)
3272 #endif
3273 {
3274 set1 = (pcre_uint8 *)(base_end - base_list[2]);
3275 list_ptr = list;
3276 }
3277 else
3278 {
3279 set1 = (pcre_uint8 *)(code - list[2]);
3280 list_ptr = base_list;
3281 }
3282
3283 invert_bits = FALSE;
3284 switch(list_ptr[0])
3285 {
3286 case OP_CLASS:
3287 case OP_NCLASS:
3288 set2 = (pcre_uint8 *)
3289 ((list_ptr == list ? code : base_end) - list_ptr[2]);
3290 break;
3291
3292 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3293 case OP_XCLASS:
3294 xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE;
3295 if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;
3296 if ((*xclass_flags & XCL_MAP) == 0)
3297 {
3298 /* No bits are set for characters < 256. */
3299 if (list[1] == 0) return TRUE;
3300 /* Might be an empty repeat. */
3301 continue;
3302 }
3303 set2 = (pcre_uint8 *)(xclass_flags + 1);
3304 break;
3305 #endif
3306
3307 case OP_NOT_DIGIT:
3308 invert_bits = TRUE;
3309 /* Fall through */
3310 case OP_DIGIT:
3311 set2 = (pcre_uint8 *)(cd->cbits + cbit_digit);
3312 break;
3313
3314 case OP_NOT_WHITESPACE:
3315 invert_bits = TRUE;
3316 /* Fall through */
3317 case OP_WHITESPACE:
3318 set2 = (pcre_uint8 *)(cd->cbits + cbit_space);
3319 break;
3320
3321 case OP_NOT_WORDCHAR:
3322 invert_bits = TRUE;
3323 /* Fall through */
3324 case OP_WORDCHAR:
3325 set2 = (pcre_uint8 *)(cd->cbits + cbit_word);
3326 break;
3327
3328 default:
3329 return FALSE;
3330 }
3331
3332 /* Because the sets are unaligned, we need
3333 to perform byte comparison here. */
3334 set_end = set1 + 32;
3335 if (invert_bits)
3336 {
3337 do
3338 {
3339 if ((*set1++ & ~(*set2++)) != 0) return FALSE;
3340 }
3341 while (set1 < set_end);
3342 }
3343 else
3344 {
3345 do
3346 {
3347 if ((*set1++ & *set2++) != 0) return FALSE;
3348 }
3349 while (set1 < set_end);
3350 }
3351
3352 if (list[1] == 0) return TRUE;
3353 /* Might be an empty repeat. */
3354 continue;
3355 }
3356
3357 /* Some property combinations also acceptable. Unicode property opcodes are
3358 processed specially; the rest can be handled with a lookup table. */
3359
3360 else
3361 {
3362 pcre_uint32 leftop, rightop;
3363
3364 leftop = base_list[0];
3365 rightop = list[0];
3366
3367 #ifdef SUPPORT_UCP
3368 accepted = FALSE; /* Always set in non-unicode case. */
3369 if (leftop == OP_PROP || leftop == OP_NOTPROP)
3370 {
3371 if (rightop == OP_EOD)
3372 accepted = TRUE;
3373 else if (rightop == OP_PROP || rightop == OP_NOTPROP)
3374 {
3375 int n;
3376 const pcre_uint8 *p;
3377 BOOL same = leftop == rightop;
3378 BOOL lisprop = leftop == OP_PROP;
3379 BOOL risprop = rightop == OP_PROP;
3380 BOOL bothprop = lisprop && risprop;
3381
3382 /* There's a table that specifies how each combination is to be
3383 processed:
3384 0 Always return FALSE (never auto-possessify)
3385 1 Character groups are distinct (possessify if both are OP_PROP)
3386 2 Check character categories in the same group (general or particular)
3387 3 Return TRUE if the two opcodes are not the same
3388 ... see comments below
3389 */
3390
3391 n = propposstab[base_list[2]][list[2]];
3392 switch(n)
3393 {
3394 case 0: break;
3395 case 1: accepted = bothprop; break;
3396 case 2: accepted = (base_list[3] == list[3]) != same; break;
3397 case 3: accepted = !same; break;
3398
3399 case 4: /* Left general category, right particular category */
3400 accepted = risprop && catposstab[base_list[3]][list[3]] == same;
3401 break;
3402
3403 case 5: /* Right general category, left particular category */
3404 accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
3405 break;
3406
3407 /* This code is logically tricky. Think hard before fiddling with it.
3408 The posspropstab table has four entries per row. Each row relates to
3409 one of PCRE's special properties such as ALNUM or SPACE or WORD.
3410 Only WORD actually needs all four entries, but using repeats for the
3411 others means they can all use the same code below.
3412
3413 The first two entries in each row are Unicode general categories, and
3414 apply always, because all the characters they include are part of the
3415 PCRE character set. The third and fourth entries are a general and a
3416 particular category, respectively, that include one or more relevant
3417 characters. One or the other is used, depending on whether the check
3418 is for a general or a particular category. However, in both cases the
3419 category contains more characters than the specials that are defined
3420 for the property being tested against. Therefore, it cannot be used
3421 in a NOTPROP case.
3422
3423 Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3424 Underscore is covered by ucp_P or ucp_Po. */
3425
3426 case 6: /* Left alphanum vs right general category */
3427 case 7: /* Left space vs right general category */
3428 case 8: /* Left word vs right general category */
3429 p = posspropstab[n-6];
3430 accepted = risprop && lisprop ==
3431 (list[3] != p[0] &&
3432 list[3] != p[1] &&
3433 (list[3] != p[2] || !lisprop));
3434 break;
3435
3436 case 9: /* Right alphanum vs left general category */
3437 case 10: /* Right space vs left general category */
3438 case 11: /* Right word vs left general category */
3439 p = posspropstab[n-9];
3440 accepted = lisprop && risprop ==
3441 (base_list[3] != p[0] &&
3442 base_list[3] != p[1] &&
3443 (base_list[3] != p[2] || !risprop));
3444 break;
3445
3446 case 12: /* Left alphanum vs right particular category */
3447 case 13: /* Left space vs right particular category */
3448 case 14: /* Left word vs right particular category */
3449 p = posspropstab[n-12];
3450 accepted = risprop && lisprop ==
3451 (catposstab[p[0]][list[3]] &&
3452 catposstab[p[1]][list[3]] &&
3453 (list[3] != p[3] || !lisprop));
3454 break;
3455
3456 case 15: /* Right alphanum vs left particular category */
3457 case 16: /* Right space vs left particular category */
3458 case 17: /* Right word vs left particular category */
3459 p = posspropstab[n-15];
3460 accepted = lisprop && risprop ==
3461 (catposstab[p[0]][base_list[3]] &&
3462 catposstab[p[1]][base_list[3]] &&
3463 (base_list[3] != p[3] || !risprop));
3464 break;
3465 }
3466 }
3467 }
3468
3469 else
3470 #endif /* SUPPORT_UCP */
3471
3472 accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3473 rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3474 autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3475
3476 if (!accepted) return FALSE;
3477
3478 if (list[1] == 0) return TRUE;
3479 /* Might be an empty repeat. */
3480 continue;
3481 }
3482
3483 /* Control reaches here only if one of the items is a small character list.
3484 All characters are checked against the other side. */
3485
3486 do
3487 {
3488 chr = *chr_ptr;
3489
3490 switch(list_ptr[0])
3491 {
3492 case OP_CHAR:
3493 ochr_ptr = list_ptr + 2;
3494 do
3495 {
3496 if (chr == *ochr_ptr) return FALSE;
3497 ochr_ptr++;
3498 }
3499 while(*ochr_ptr != NOTACHAR);
3500 break;
3501
3502 case OP_NOT:
3503 ochr_ptr = list_ptr + 2;
3504 do
3505 {
3506 if (chr == *ochr_ptr)
3507 break;
3508 ochr_ptr++;
3509 }
3510 while(*ochr_ptr != NOTACHAR);
3511 if (*ochr_ptr == NOTACHAR) return FALSE; /* Not found */
3512 break;
3513
3514 /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not*
3515 set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3516
3517 case OP_DIGIT:
3518 if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE;
3519 break;
3520
3521 case OP_NOT_DIGIT:
3522 if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE;
3523 break;
3524
3525 case OP_WHITESPACE:
3526 if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE;
3527 break;
3528
3529 case OP_NOT_WHITESPACE:
3530 if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE;
3531 break;
3532
3533 case OP_WORDCHAR:
3534 if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE;
3535 break;
3536
3537 case OP_NOT_WORDCHAR:
3538 if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE;
3539 break;
3540
3541 case OP_HSPACE:
3542 switch(chr)
3543 {
3544 HSPACE_CASES: return FALSE;
3545 default: break;
3546 }
3547 break;
3548
3549 case OP_NOT_HSPACE:
3550 switch(chr)
3551 {
3552 HSPACE_CASES: break;
3553 default: return FALSE;
3554 }
3555 break;
3556
3557 case OP_ANYNL:
3558 case OP_VSPACE:
3559 switch(chr)
3560 {
3561 VSPACE_CASES: return FALSE;
3562 default: break;
3563 }
3564 break;
3565
3566 case OP_NOT_VSPACE:
3567 switch(chr)
3568 {
3569 VSPACE_CASES: break;
3570 default: return FALSE;
3571 }
3572 break;
3573
3574 case OP_DOLL:
3575 case OP_EODN:
3576 switch (chr)
3577 {
3578 case CHAR_CR:
3579 case CHAR_LF:
3580 case CHAR_VT:
3581 case CHAR_FF:
3582 case CHAR_NEL:
3583 #ifndef EBCDIC
3584 case 0x2028:
3585 case 0x2029:
3586 #endif /* Not EBCDIC */
3587 return FALSE;
3588 }
3589 break;
3590
3591 case OP_EOD: /* Can always possessify before \z */
3592 break;
3593
3594 #ifdef SUPPORT_UCP
3595 case OP_PROP:
3596 case OP_NOTPROP:
3597 if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3598 list_ptr[0] == OP_NOTPROP))
3599 return FALSE;
3600 break;
3601 #endif
3602
3603 case OP_NCLASS:
3604 if (chr > 255) return FALSE;
3605 /* Fall through */
3606
3607 case OP_CLASS:
3608 if (chr > 255) break;
3609 class_bitset = (pcre_uint8 *)
3610 ((list_ptr == list ? code : base_end) - list_ptr[2]);
3611 if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE;
3612 break;
3613
3614 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3615 case OP_XCLASS:
3616 if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
3617 list_ptr[2] + LINK_SIZE, utf)) return FALSE;
3618 break;
3619 #endif
3620
3621 default:
3622 return FALSE;
3623 }
3624
3625 chr_ptr++;
3626 }
3627 while(*chr_ptr != NOTACHAR);
3628
3629 /* At least one character must be matched from this opcode. */
3630
3631 if (list[1] == 0) return TRUE;
3632 }
3633
3634 /* Control never reaches here. There used to be a fail-save return FALSE; here,
3635 but some compilers complain about an unreachable statement. */
3636
3637 }
3638
3639
3640
3641 /*************************************************
3642 * Scan compiled regex for auto-possession *
3643 *************************************************/
3644
3645 /* Replaces single character iterations with their possessive alternatives
3646 if appropriate. This function modifies the compiled opcode!
3647
3648 Arguments:
3649 code points to start of the byte code
3650 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
3651 cd static compile data
3652
3653 Returns: nothing
3654 */
3655
static void
auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)
{
register pcre_uchar c;
const pcre_uchar *end;
pcre_uchar *repeat_opcode;
pcre_uint32 list[8];
int rec_limit;

/* Each pass of this loop handles one item of the compiled pattern and then
steps over it. The loop is left at OP_END, or when an opcode value outside
the table is met. */

for (;;)
  {
  c = *code;

  /* When a pattern with bad UTF-8 encoding is compiled with NO_UTF_CHECK,
  it may compile without complaining, but may get into a loop here if the code
  pointer points to a bad value. This is, of course a documented possibility,
  when NO_UTF_CHECK is set, so it isn't a bug, but we can detect this case and
  just give up on this optimization. */

  if (c >= OP_TABLE_LENGTH) return;

  /* A repeated single item. The opcode is first reduced to its equivalent in
  the OP_STAR group. The property list is obtained only for values up to
  OP_MINUPTO; for anything higher "end" is NULL and the item is left alone
  (presumably these are the exact and already possessive forms - confirm
  against the opcode table). */

  if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
    {
    c -= get_repeat_base(c) - OP_STAR;
    end = (c <= OP_MINUPTO) ?
      get_chr_property_list(code, utf, cd->fcc, list) : NULL;

    /* For the base item, list[1] is non-zero when the quantifier is greedy;
    this is what compare_opcodes() expects in base_list[1]. */

    list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;

    /* The limit bounds the number of compare_opcodes() calls for this item. */

    rec_limit = 1000;
    if (end != NULL && compare_opcodes(end, utf, cd, list, end, &rec_limit))
      {
      /* The opcode in the pattern is adjusted by a difference instead of being
      overwritten, so the same statement applies whichever repeat group the
      original opcode belongs to (this relies on all groups having the same
      layout as the OP_STAR group). */

      switch(c)
        {
        case OP_STAR:
        *code += OP_POSSTAR - OP_STAR;
        break;

        case OP_MINSTAR:
        *code += OP_POSSTAR - OP_MINSTAR;
        break;

        case OP_PLUS:
        *code += OP_POSPLUS - OP_PLUS;
        break;

        case OP_MINPLUS:
        *code += OP_POSPLUS - OP_MINPLUS;
        break;

        case OP_QUERY:
        *code += OP_POSQUERY - OP_QUERY;
        break;

        case OP_MINQUERY:
        *code += OP_POSQUERY - OP_MINQUERY;
        break;

        case OP_UPTO:
        *code += OP_POSUPTO - OP_UPTO;
        break;

        case OP_MINUPTO:
        *code += OP_POSUPTO - OP_MINUPTO;
        break;
        }
      }
    c = *code;    /* Reload: the opcode may just have been changed */
    }

  /* A character class. Its repeat opcode, if there is one, follows the class
  data: an extended class holds its own length, the other classes have a
  32-byte bit map. */

  else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
    {
#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
    if (c == OP_XCLASS)
      repeat_opcode = code + GET(code, 1);
    else
#endif
      repeat_opcode = code + 1 + (32 / sizeof(pcre_uchar));

    c = *repeat_opcode;
    if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
      {
      /* end must not be NULL. */
      end = get_chr_property_list(code, utf, cd->fcc, list);

      /* Non-zero for an even opcode value (presumably the greedy forms have
      even values - confirm against the opcode table). */

      list[1] = (c & 1) == 0;

      rec_limit = 1000;
      if (compare_opcodes(end, utf, cd, list, end, &rec_limit))
        {
        /* Here the repeat opcode is simply overwritten with its possessive
        counterpart. */

        switch (c)
          {
          case OP_CRSTAR:
          case OP_CRMINSTAR:
          *repeat_opcode = OP_CRPOSSTAR;
          break;

          case OP_CRPLUS:
          case OP_CRMINPLUS:
          *repeat_opcode = OP_CRPOSPLUS;
          break;

          case OP_CRQUERY:
          case OP_CRMINQUERY:
          *repeat_opcode = OP_CRPOSQUERY;
          break;

          case OP_CRRANGE:
          case OP_CRMINRANGE:
          *repeat_opcode = OP_CRPOSRANGE;
          break;
          }
        }
      }
    c = *code;    /* Back to the class opcode for the stepping code below */
    }

  /* Step over the item. Some opcodes carry data that is not included in the
  fixed length taken from the table; that extra amount is added first. */

  switch(c)
    {
    case OP_END:
    return;

    /* A property item after a type repeat has two extra units. */

    case OP_TYPESTAR:
    case OP_TYPEMINSTAR:
    case OP_TYPEPLUS:
    case OP_TYPEMINPLUS:
    case OP_TYPEQUERY:
    case OP_TYPEMINQUERY:
    case OP_TYPEPOSSTAR:
    case OP_TYPEPOSPLUS:
    case OP_TYPEPOSQUERY:
    if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
    break;

    /* The same, but the type follows the repeat count. */

    case OP_TYPEUPTO:
    case OP_TYPEMINUPTO:
    case OP_TYPEEXACT:
    case OP_TYPEPOSUPTO:
    if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
      code += 2;
    break;

    /* An extended class holds its own length. */

#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
    case OP_XCLASS:
    code += GET(code, 1);
    break;
#endif

    /* For these, the unit after the opcode gives the extra amount to skip. */

    case OP_MARK:
    case OP_PRUNE_ARG:
    case OP_SKIP_ARG:
    case OP_THEN_ARG:
    code += code[1];
    break;
    }

  /* Add in the fixed length from the table */

  code += PRIV(OP_lengths)[c];

  /* In UTF-8 mode, opcodes that are followed by a character may be followed by
  a multi-byte character. The length in the table is a minimum, so we have to
  arrange to skip the extra bytes. */

#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
  if (utf) switch(c)
    {
    case OP_CHAR:
    case OP_CHARI:
    case OP_NOT:
    case OP_NOTI:
    case OP_STAR:
    case OP_MINSTAR:
    case OP_PLUS:
    case OP_MINPLUS:
    case OP_QUERY:
    case OP_MINQUERY:
    case OP_UPTO:
    case OP_MINUPTO:
    case OP_EXACT:
    case OP_POSSTAR:
    case OP_POSPLUS:
    case OP_POSQUERY:
    case OP_POSUPTO:
    case OP_STARI:
    case OP_MINSTARI:
    case OP_PLUSI:
    case OP_MINPLUSI:
    case OP_QUERYI:
    case OP_MINQUERYI:
    case OP_UPTOI:
    case OP_MINUPTOI:
    case OP_EXACTI:
    case OP_POSSTARI:
    case OP_POSPLUSI:
    case OP_POSQUERYI:
    case OP_POSUPTOI:
    case OP_NOTSTAR:
    case OP_NOTMINSTAR:
    case OP_NOTPLUS:
    case OP_NOTMINPLUS:
    case OP_NOTQUERY:
    case OP_NOTMINQUERY:
    case OP_NOTUPTO:
    case OP_NOTMINUPTO:
    case OP_NOTEXACT:
    case OP_NOTPOSSTAR:
    case OP_NOTPOSPLUS:
    case OP_NOTPOSQUERY:
    case OP_NOTPOSUPTO:
    case OP_NOTSTARI:
    case OP_NOTMINSTARI:
    case OP_NOTPLUSI:
    case OP_NOTMINPLUSI:
    case OP_NOTQUERYI:
    case OP_NOTMINQUERYI:
    case OP_NOTUPTOI:
    case OP_NOTMINUPTOI:
    case OP_NOTEXACTI:
    case OP_NOTPOSSTARI:
    case OP_NOTPOSPLUSI:
    case OP_NOTPOSQUERYI:
    case OP_NOTPOSUPTOI:
    /* code[-1] is the last unit that has been stepped over. */
    if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
    break;
    }
#else
  (void)(utf);  /* Keep compiler happy by referencing function argument */
#endif
  }
}
3885
3886
3887
3888 /*************************************************
3889 * Check for POSIX class syntax *
3890 *************************************************/
3891
3892 /* This function is called when the sequence "[:" or "[." or "[=" is
3893 encountered in a character class. It checks whether this is followed by a
3894 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
3895 reach an unescaped ']' without the special preceding character, return FALSE.
3896
3897 Originally, this function only recognized a sequence of letters between the
3898 terminators, but it seems that Perl recognizes any sequence of characters,
3899 though of course unknown POSIX names are subsequently rejected. Perl gives an
3900 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
3901 didn't consider this to be a POSIX class. Likewise for [:1234:].
3902
3903 The problem in trying to be exactly like Perl is in the handling of escapes. We
3904 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
3905 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
3906 below handles the special cases \\ and \], but does not try to do any other
3907 escape processing. This makes it different from Perl for cases such as
3908 [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
3909 not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
3910 when Perl does, I think.
3911
3912 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
3913 It seems that the appearance of a nested POSIX class supersedes an apparent
3914 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
3915 a digit.
3916
3917 In Perl, unescaped square brackets may also appear as part of class names. For
3918 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
3919 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
3920 seem right at all. PCRE does not allow closing square brackets in POSIX class
3921 names.
3922
3923 Arguments:
3924 ptr pointer to the initial [
3925 endptr where to return the end pointer
3926
3927 Returns: TRUE or FALSE
3928 */
3929
3930 static BOOL
check_posix_syntax(const pcre_uchar * ptr,const pcre_uchar ** endptr)3931 check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
3932 {
3933 pcre_uchar terminator; /* Don't combine these lines; the Solaris cc */
3934 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
3935 for (++ptr; *ptr != CHAR_NULL; ptr++)
3936 {
3937 if (*ptr == CHAR_BACKSLASH &&
3938 (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET ||
3939 ptr[1] == CHAR_BACKSLASH))
3940 ptr++;
3941 else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
3942 *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
3943 else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3944 {
3945 *endptr = ptr;
3946 return TRUE;
3947 }
3948 }
3949 return FALSE;
3950 }
3951
3952
3953
3954
3955 /*************************************************
3956 * Check POSIX class name *
3957 *************************************************/
3958
3959 /* This function is called to check the name given in a POSIX-style class entry
3960 such as [:alnum:].
3961
3962 Arguments:
3963 ptr points to the first letter
3964 len the length of the name
3965
3966 Returns: a value representing the name, or -1 if unknown
3967 */
3968
3969 static int
check_posix_name(const pcre_uchar * ptr,int len)3970 check_posix_name(const pcre_uchar *ptr, int len)
3971 {
3972 const char *pn = posix_names;
3973 register int yield = 0;
3974 while (posix_name_lengths[yield] != 0)
3975 {
3976 if (len == posix_name_lengths[yield] &&
3977 STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
3978 pn += posix_name_lengths[yield] + 1;
3979 yield++;
3980 }
3981 return -1;
3982 }
3983
3984
3985 /*************************************************
3986 * Adjust OP_RECURSE items in repeated group *
3987 *************************************************/
3988
3989 /* OP_RECURSE items contain an offset from the start of the regex to the group
3990 that is referenced. This means that groups can be replicated for fixed
3991 repetition simply by copying (because the recursion is allowed to refer to
3992 earlier groups that are outside the current group). However, when a group is
3993 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
3994 inserted before it, after it has been compiled. This means that any OP_RECURSE
3995 items within it that refer to the group itself or any contained groups have to
3996 have their offsets adjusted. That is one of the jobs of this function. Before it
3997 is called, the partially compiled regex must be temporarily terminated with
3998 OP_END.
3999
4000 This function has been extended to cope with forward references for recursions
4001 and subroutine calls. It must check the list of such references for the
4002 group we are dealing with. If it finds that one of the recursions in the
4003 current group is on this list, it does not adjust the value in the reference
4004 (which is a group number). After the group has been scanned, all the offsets in
4005 the forward reference list for the group are adjusted.
4006
4007 Arguments:
4008 group points to the start of the group
4009 adjust the amount by which the group is to be moved
4010 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
4011 cd contains pointers to tables etc.
4012 save_hwm_offset the hwm forward reference offset at the start of the group
4013
4014 Returns: nothing
4015 */
4016
4017 static void
adjust_recurse(pcre_uchar * group,int adjust,BOOL utf,compile_data * cd,size_t save_hwm_offset)4018 adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
4019 size_t save_hwm_offset)
4020 {
4021 int offset;
4022 pcre_uchar *hc;
4023 pcre_uchar *ptr = group;
4024
4025 while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
4026 {
4027 for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
4028 hc += LINK_SIZE)
4029 {
4030 offset = (int)GET(hc, 0);
4031 if (cd->start_code + offset == ptr + 1) break;
4032 }
4033
4034 /* If we have not found this recursion on the forward reference list, adjust
4035 the recursion's offset if it's after the start of this group. */
4036
4037 if (hc >= cd->hwm)
4038 {
4039 offset = (int)GET(ptr, 1);
4040 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
4041 }
4042
4043 ptr += 1 + LINK_SIZE;
4044 }
4045
4046 /* Now adjust all forward reference offsets for the group. */
4047
4048 for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
4049 hc += LINK_SIZE)
4050 {
4051 offset = (int)GET(hc, 0);
4052 PUT(hc, 0, offset + adjust);
4053 }
4054 }
4055
4056
4057
4058 /*************************************************
4059 * Insert an automatic callout point *
4060 *************************************************/
4061
4062 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
4063 callout points before each pattern item.
4064
4065 Arguments:
4066 code current code pointer
4067 ptr current pattern pointer
4068 cd pointers to tables etc
4069
4070 Returns: new code pointer
4071 */
4072
4073 static pcre_uchar *
auto_callout(pcre_uchar * code,const pcre_uchar * ptr,compile_data * cd)4074 auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
4075 {
4076 *code++ = OP_CALLOUT;
4077 *code++ = 255;
4078 PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
4079 PUT(code, LINK_SIZE, 0); /* Default length */
4080 return code + 2 * LINK_SIZE;
4081 }
4082
4083
4084
4085 /*************************************************
4086 * Complete a callout item *
4087 *************************************************/
4088
4089 /* A callout item contains the length of the next item in the pattern, which
4090 we can't fill in till after we have reached the relevant point. This is used
4091 for both automatic and manual callouts.
4092
4093 Arguments:
4094 previous_callout points to previous callout item
4095 ptr current pattern pointer
4096 cd pointers to tables etc
4097
4098 Returns: nothing
4099 */
4100
4101 static void
complete_callout(pcre_uchar * previous_callout,const pcre_uchar * ptr,compile_data * cd)4102 complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
4103 {
4104 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
4105 PUT(previous_callout, 2 + LINK_SIZE, length);
4106 }
4107
4108
4109
4110 #ifdef SUPPORT_UCP
4111 /*************************************************
4112 * Get othercase range *
4113 *************************************************/
4114
4115 /* This function is passed the start and end of a class range, in UTF-8 mode
4116 with UCP support. It searches up the characters, looking for ranges of
4117 characters in the "other" case. Each call returns the next one, updating the
4118 start address. A character with multiple other cases is returned on its own
4119 with a special return value.
4120
4121 Arguments:
4122 cptr points to starting character value; updated
4123 d end value
4124 ocptr where to put start of othercase range
4125 odptr where to put end of othercase range
4126
4127 Yield: -1 when no more
4128 0 when a range is returned
4129 >0 the CASESET offset for char with multiple other cases
4130 in this case, ocptr contains the original
4131 */
4132
4133 static int
get_othercase_range(pcre_uint32 * cptr,pcre_uint32 d,pcre_uint32 * ocptr,pcre_uint32 * odptr)4134 get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
4135 pcre_uint32 *odptr)
4136 {
4137 pcre_uint32 c, othercase, next;
4138 unsigned int co;
4139
4140 /* Find the first character that has an other case. If it has multiple other
4141 cases, return its case offset value. */
4142
4143 for (c = *cptr; c <= d; c++)
4144 {
4145 if ((co = UCD_CASESET(c)) != 0)
4146 {
4147 *ocptr = c++; /* Character that has the set */
4148 *cptr = c; /* Rest of input range */
4149 return (int)co;
4150 }
4151 if ((othercase = UCD_OTHERCASE(c)) != c) break;
4152 }
4153
4154 if (c > d) return -1; /* Reached end of range */
4155
4156 /* Found a character that has a single other case. Search for the end of the
4157 range, which is either the end of the input range, or a character that has zero
4158 or more than one other cases. */
4159
4160 *ocptr = othercase;
4161 next = othercase + 1;
4162
4163 for (++c; c <= d; c++)
4164 {
4165 if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
4166 next++;
4167 }
4168
4169 *odptr = next - 1; /* End of othercase range */
4170 *cptr = c; /* Rest of input range */
4171 return 0;
4172 }
4173 #endif /* SUPPORT_UCP */
4174
4175
4176
4177 /*************************************************
4178 * Add a character or range to a class *
4179 *************************************************/
4180
4181 /* This function packages up the logic of adding a character or range of
4182 characters to a class. The character values in the arguments will be within the
4183 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
4184 mutually recursive with the function immediately below.
4185
4186 Arguments:
4187 classbits the bit map for characters < 256
4188 uchardptr points to the pointer for extra data
4189 options the options word
4190 cd contains pointers to tables etc.
4191 start start of range character
4192 end end of range character
4193
4194 Returns: the number of < 256 characters added
4195 the pointer to extra data is updated
4196 */
4197
4198 static int
add_to_class(pcre_uint8 * classbits,pcre_uchar ** uchardptr,int options,compile_data * cd,pcre_uint32 start,pcre_uint32 end)4199 add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4200 compile_data *cd, pcre_uint32 start, pcre_uint32 end)
4201 {
4202 pcre_uint32 c;
4203 pcre_uint32 classbits_end = (end <= 0xff ? end : 0xff);
4204 int n8 = 0;
4205
4206 /* If caseless matching is required, scan the range and process alternate
4207 cases. In Unicode, there are 8-bit characters that have alternate cases that
4208 are greater than 255 and vice-versa. Sometimes we can just extend the original
4209 range. */
4210
4211 if ((options & PCRE_CASELESS) != 0)
4212 {
4213 #ifdef SUPPORT_UCP
4214 if ((options & PCRE_UTF8) != 0)
4215 {
4216 int rc;
4217 pcre_uint32 oc, od;
4218
4219 options &= ~PCRE_CASELESS; /* Remove for recursive calls */
4220 c = start;
4221
4222 while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
4223 {
4224 /* Handle a single character that has more than one other case. */
4225
4226 if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
4227 PRIV(ucd_caseless_sets) + rc, oc);
4228
4229 /* Do nothing if the other case range is within the original range. */
4230
4231 else if (oc >= start && od <= end) continue;
4232
4233 /* Extend the original range if there is overlap, noting that if oc < c, we
4234 can't have od > end because a subrange is always shorter than the basic
4235 range. Otherwise, use a recursive call to add the additional range. */
4236
4237 else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
4238 else if (od > end && oc <= end + 1)
4239 {
4240 end = od; /* Extend upwards */
4241 if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
4242 }
4243 else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
4244 }
4245 }
4246 else
4247 #endif /* SUPPORT_UCP */
4248
4249 /* Not UTF-mode, or no UCP */
4250
4251 for (c = start; c <= classbits_end; c++)
4252 {
4253 SETBIT(classbits, cd->fcc[c]);
4254 n8++;
4255 }
4256 }
4257
4258 /* Now handle the original range. Adjust the final value according to the bit
4259 length - this means that the same lists of (e.g.) horizontal spaces can be used
4260 in all cases. */
4261
4262 #if defined COMPILE_PCRE8
4263 #ifdef SUPPORT_UTF
4264 if ((options & PCRE_UTF8) == 0)
4265 #endif
4266 if (end > 0xff) end = 0xff;
4267
4268 #elif defined COMPILE_PCRE16
4269 #ifdef SUPPORT_UTF
4270 if ((options & PCRE_UTF16) == 0)
4271 #endif
4272 if (end > 0xffff) end = 0xffff;
4273
4274 #endif /* COMPILE_PCRE[8|16] */
4275
4276 /* Use the bitmap for characters < 256. Otherwise use extra data.*/
4277
4278 for (c = start; c <= classbits_end; c++)
4279 {
4280 /* Regardless of start, c will always be <= 255. */
4281 SETBIT(classbits, c);
4282 n8++;
4283 }
4284
4285 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4286 if (start <= 0xff) start = 0xff + 1;
4287
4288 if (end >= start)
4289 {
4290 pcre_uchar *uchardata = *uchardptr;
4291 #ifdef SUPPORT_UTF
4292 if ((options & PCRE_UTF8) != 0) /* All UTFs use the same flag bit */
4293 {
4294 if (start < end)
4295 {
4296 *uchardata++ = XCL_RANGE;
4297 uchardata += PRIV(ord2utf)(start, uchardata);
4298 uchardata += PRIV(ord2utf)(end, uchardata);
4299 }
4300 else if (start == end)
4301 {
4302 *uchardata++ = XCL_SINGLE;
4303 uchardata += PRIV(ord2utf)(start, uchardata);
4304 }
4305 }
4306 else
4307 #endif /* SUPPORT_UTF */
4308
4309 /* Without UTF support, character values are constrained by the bit length,
4310 and can only be > 256 for 16-bit and 32-bit libraries. */
4311
4312 #ifdef COMPILE_PCRE8
4313 {}
4314 #else
4315 if (start < end)
4316 {
4317 *uchardata++ = XCL_RANGE;
4318 *uchardata++ = start;
4319 *uchardata++ = end;
4320 }
4321 else if (start == end)
4322 {
4323 *uchardata++ = XCL_SINGLE;
4324 *uchardata++ = start;
4325 }
4326 #endif
4327
4328 *uchardptr = uchardata; /* Updata extra data pointer */
4329 }
4330 #endif /* SUPPORT_UTF || !COMPILE_PCRE8 */
4331
4332 return n8; /* Number of 8-bit characters */
4333 }
4334
4335
4336
4337
4338 /*************************************************
4339 * Add a list of characters to a class *
4340 *************************************************/
4341
4342 /* This function is used for adding a list of case-equivalent characters to a
4343 class, and also for adding a list of horizontal or vertical whitespace. If the
4344 list is in order (which it should be), ranges of characters are detected and
4345 handled appropriately. This function is mutually recursive with the function
4346 above.
4347
4348 Arguments:
4349 classbits the bit map for characters < 256
4350 uchardptr points to the pointer for extra data
4351 options the options word
4352 cd contains pointers to tables etc.
4353 p points to row of 32-bit values, terminated by NOTACHAR
4354 except character to omit; this is used when adding lists of
4355 case-equivalent characters to avoid including the one we
4356 already know about
4357
4358 Returns: the number of < 256 characters added
4359 the pointer to extra data is updated
4360 */
4361
4362 static int
add_list_to_class(pcre_uint8 * classbits,pcre_uchar ** uchardptr,int options,compile_data * cd,const pcre_uint32 * p,unsigned int except)4363 add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4364 compile_data *cd, const pcre_uint32 *p, unsigned int except)
4365 {
4366 int n8 = 0;
4367 while (p[0] < NOTACHAR)
4368 {
4369 int n = 0;
4370 if (p[0] != except)
4371 {
4372 while(p[n+1] == p[0] + n + 1) n++;
4373 n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
4374 }
4375 p += n + 1;
4376 }
4377 return n8;
4378 }
4379
4380
4381
4382 /*************************************************
4383 * Add characters not in a list to a class *
4384 *************************************************/
4385
4386 /* This function is used for adding the complement of a list of horizontal or
4387 vertical whitespace to a class. The list must be in order.
4388
4389 Arguments:
4390 classbits the bit map for characters < 256
4391 uchardptr points to the pointer for extra data
4392 options the options word
4393 cd contains pointers to tables etc.
4394 p points to row of 32-bit values, terminated by NOTACHAR
4395
4396 Returns: the number of < 256 characters added
4397 the pointer to extra data is updated
4398 */
4399
4400 static int
add_not_list_to_class(pcre_uint8 * classbits,pcre_uchar ** uchardptr,int options,compile_data * cd,const pcre_uint32 * p)4401 add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
4402 int options, compile_data *cd, const pcre_uint32 *p)
4403 {
4404 BOOL utf = (options & PCRE_UTF8) != 0;
4405 int n8 = 0;
4406 if (p[0] > 0)
4407 n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
4408 while (p[0] < NOTACHAR)
4409 {
4410 while (p[1] == p[0] + 1) p++;
4411 n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
4412 (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
4413 p++;
4414 }
4415 return n8;
4416 }
4417
4418
4419
4420 /*************************************************
4421 * Compile one branch *
4422 *************************************************/
4423
4424 /* Scan the pattern, compiling it into a vector. If the options are
4425 changed during the branch, the pointer is used to change the external options
4426 bits. This function is used during the pre-compile phase when we are trying
4427 to find out the amount of memory needed, as well as during the real compile
4428 phase. The value of lengthptr distinguishes the two phases.
4429
4430 Arguments:
4431 optionsptr pointer to the option bits
4432 codeptr points to the pointer to the current code point
4433 ptrptr points to the current pattern pointer
4434 errorcodeptr points to error code variable
4435 firstcharptr place to put the first required character
4436 firstcharflagsptr place to put the first character flags, or a negative number
4437 reqcharptr place to put the last required character
4438 reqcharflagsptr place to put the last required character flags, or a negative number
4439 bcptr points to current branch chain
4440 cond_depth conditional nesting depth
4441 cd contains pointers to tables etc.
4442 lengthptr NULL during the real compile phase
4443 points to length accumulator during pre-compile phase
4444
4445 Returns: TRUE on success
4446 FALSE, with *errorcodeptr set non-zero on error
4447 */
4448
4449 static BOOL
compile_branch(int * optionsptr,pcre_uchar ** codeptr,const pcre_uchar ** ptrptr,int * errorcodeptr,pcre_uint32 * firstcharptr,pcre_int32 * firstcharflagsptr,pcre_uint32 * reqcharptr,pcre_int32 * reqcharflagsptr,branch_chain * bcptr,int cond_depth,compile_data * cd,int * lengthptr)4450 compile_branch(int *optionsptr, pcre_uchar **codeptr,
4451 const pcre_uchar **ptrptr, int *errorcodeptr,
4452 pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
4453 pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
4454 branch_chain *bcptr, int cond_depth,
4455 compile_data *cd, int *lengthptr)
4456 {
4457 int repeat_type, op_type;
4458 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
4459 int bravalue = 0;
4460 int greedy_default, greedy_non_default;
4461 pcre_uint32 firstchar, reqchar;
4462 pcre_int32 firstcharflags, reqcharflags;
4463 pcre_uint32 zeroreqchar, zerofirstchar;
4464 pcre_int32 zeroreqcharflags, zerofirstcharflags;
4465 pcre_int32 req_caseopt, reqvary, tempreqvary;
4466 int options = *optionsptr; /* May change dynamically */
4467 int after_manual_callout = 0;
4468 int length_prevgroup = 0;
4469 register pcre_uint32 c;
4470 int escape;
4471 register pcre_uchar *code = *codeptr;
4472 pcre_uchar *last_code = code;
4473 pcre_uchar *orig_code = code;
4474 pcre_uchar *tempcode;
4475 BOOL inescq = FALSE;
4476 BOOL groupsetfirstchar = FALSE;
4477 const pcre_uchar *ptr = *ptrptr;
4478 const pcre_uchar *tempptr;
4479 const pcre_uchar *nestptr = NULL;
4480 pcre_uchar *previous = NULL;
4481 pcre_uchar *previous_callout = NULL;
4482 size_t item_hwm_offset = 0;
4483 pcre_uint8 classbits[32];
4484
4485 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
4486 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
4487 dynamically as we process the pattern. */
4488
4489 #ifdef SUPPORT_UTF
4490 /* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
4491 BOOL utf = (options & PCRE_UTF8) != 0;
4492 #ifndef COMPILE_PCRE32
4493 pcre_uchar utf_chars[6];
4494 #endif
4495 #else
4496 BOOL utf = FALSE;
4497 #endif
4498
4499 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
4500 class_uchardata always so that it can be passed to add_to_class() always,
4501 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
4502 alternative calls for the different cases. */
4503
4504 pcre_uchar *class_uchardata;
4505 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4506 BOOL xclass;
4507 pcre_uchar *class_uchardata_base;
4508 #endif
4509
4510 #ifdef PCRE_DEBUG
4511 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
4512 #endif
4513
4514 /* Set up the default and non-default settings for greediness */
4515
4516 greedy_default = ((options & PCRE_UNGREEDY) != 0);
4517 greedy_non_default = greedy_default ^ 1;
4518
4519 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
4520 matching encountered yet". It gets changed to REQ_NONE if we hit something that
4521 matches a non-fixed char first char; reqchar just remains unset if we never
4522 find one.
4523
4524 When we hit a repeat whose minimum is zero, we may have to adjust these values
4525 to take the zero repeat into account. This is implemented by setting them to
4526 zerofirstbyte and zeroreqchar when such a repeat is encountered. The individual
4527 item types that can be repeated set these backoff variables appropriately. */
4528
4529 firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
4530 firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
4531
4532 /* The variable req_caseopt contains either the REQ_CASELESS value
4533 or zero, according to the current setting of the caseless flag. The
4534 REQ_CASELESS leaves the lower 28 bit empty. It is added into the
4535 firstchar or reqchar variables to record the case status of the
4536 value. This is used only for ASCII characters. */
4537
4538 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
4539
4540 /* Switch on next character until the end of the branch */
4541
4542 for (;; ptr++)
4543 {
4544 BOOL negate_class;
4545 BOOL should_flip_negation;
4546 BOOL possessive_quantifier;
4547 BOOL is_quantifier;
4548 BOOL is_recurse;
4549 BOOL reset_bracount;
4550 int class_has_8bitchar;
4551 int class_one_char;
4552 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4553 BOOL xclass_has_prop;
4554 #endif
4555 int newoptions;
4556 int recno;
4557 int refsign;
4558 int skipbytes;
4559 pcre_uint32 subreqchar, subfirstchar;
4560 pcre_int32 subreqcharflags, subfirstcharflags;
4561 int terminator;
4562 unsigned int mclength;
4563 unsigned int tempbracount;
4564 pcre_uint32 ec;
4565 pcre_uchar mcbuffer[8];
4566
4567 /* Get next character in the pattern */
4568
4569 c = *ptr;
4570
4571 /* If we are at the end of a nested substitution, revert to the outer level
4572 string. Nesting only happens one level deep. */
4573
4574 if (c == CHAR_NULL && nestptr != NULL)
4575 {
4576 ptr = nestptr;
4577 nestptr = NULL;
4578 c = *ptr;
4579 }
4580
4581 /* If we are in the pre-compile phase, accumulate the length used for the
4582 previous cycle of this loop. */
4583
4584 if (lengthptr != NULL)
4585 {
4586 #ifdef PCRE_DEBUG
4587 if (code > cd->hwm) cd->hwm = code; /* High water info */
4588 #endif
4589 if (code > cd->start_workspace + cd->workspace_size -
4590 WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
4591 {
4592 *errorcodeptr = ERR52;
4593 goto FAILED;
4594 }
4595
4596 /* There is at least one situation where code goes backwards: this is the
4597 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
4598 the class is simply eliminated. However, it is created first, so we have to
4599 allow memory for it. Therefore, don't ever reduce the length at this point.
4600 */
4601
4602 if (code < last_code) code = last_code;
4603
4604 /* Paranoid check for integer overflow */
4605
4606 if (OFLOW_MAX - *lengthptr < code - last_code)
4607 {
4608 *errorcodeptr = ERR20;
4609 goto FAILED;
4610 }
4611
4612 *lengthptr += (int)(code - last_code);
4613 DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
4614 (int)(code - last_code), c, c));
4615
4616 /* If "previous" is set and it is not at the start of the work space, move
4617 it back to there, in order to avoid filling up the work space. Otherwise,
4618 if "previous" is NULL, reset the current code pointer to the start. */
4619
4620 if (previous != NULL)
4621 {
4622 if (previous > orig_code)
4623 {
4624 memmove(orig_code, previous, IN_UCHARS(code - previous));
4625 code -= previous - orig_code;
4626 previous = orig_code;
4627 }
4628 }
4629 else code = orig_code;
4630
4631 /* Remember where this code item starts so we can pick up the length
4632 next time round. */
4633
4634 last_code = code;
4635 }
4636
4637 /* In the real compile phase, just check the workspace used by the forward
4638 reference list. */
4639
4640 else if (cd->hwm > cd->start_workspace + cd->workspace_size)
4641 {
4642 *errorcodeptr = ERR52;
4643 goto FAILED;
4644 }
4645
4646 /* If in \Q...\E, check for the end; if not, we have a literal */
4647
4648 if (inescq && c != CHAR_NULL)
4649 {
4650 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4651 {
4652 inescq = FALSE;
4653 ptr++;
4654 continue;
4655 }
4656 else
4657 {
4658 if (previous_callout != NULL)
4659 {
4660 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
4661 complete_callout(previous_callout, ptr, cd);
4662 previous_callout = NULL;
4663 }
4664 if ((options & PCRE_AUTO_CALLOUT) != 0)
4665 {
4666 previous_callout = code;
4667 code = auto_callout(code, ptr, cd);
4668 }
4669 goto NORMAL_CHAR;
4670 }
4671 /* Control does not reach here. */
4672 }
4673
4674 /* In extended mode, skip white space and comments. We need a loop in order
4675 to check for more white space and more comments after a comment. */
4676
4677 if ((options & PCRE_EXTENDED) != 0)
4678 {
4679 for (;;)
4680 {
4681 while (MAX_255(c) && (cd->ctypes[c] & ctype_space) != 0) c = *(++ptr);
4682 if (c != CHAR_NUMBER_SIGN) break;
4683 ptr++;
4684 while (*ptr != CHAR_NULL)
4685 {
4686 if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */
4687 { /* IS_NEWLINE sets cd->nllen. */
4688 ptr += cd->nllen;
4689 break;
4690 }
4691 ptr++;
4692 #ifdef SUPPORT_UTF
4693 if (utf) FORWARDCHAR(ptr);
4694 #endif
4695 }
4696 c = *ptr; /* Either NULL or the char after a newline */
4697 }
4698 }
4699
4700 /* See if the next thing is a quantifier. */
4701
4702 is_quantifier =
4703 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4704 (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4705
4706 /* Fill in length of a previous callout, except when the next thing is a
4707 quantifier or when processing a property substitution string in UCP mode. */
4708
4709 if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
4710 after_manual_callout-- <= 0)
4711 {
4712 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
4713 complete_callout(previous_callout, ptr, cd);
4714 previous_callout = NULL;
4715 }
4716
4717 /* Create auto callout, except for quantifiers, or while processing property
4718 strings that are substituted for \w etc in UCP mode. */
4719
4720 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
4721 {
4722 previous_callout = code;
4723 code = auto_callout(code, ptr, cd);
4724 }
4725
4726 /* Process the next pattern item. */
4727
4728 switch(c)
4729 {
4730 /* ===================================================================*/
4731 case CHAR_NULL: /* The branch terminates at string end */
4732 case CHAR_VERTICAL_LINE: /* or | or ) */
4733 case CHAR_RIGHT_PARENTHESIS:
4734 *firstcharptr = firstchar;
4735 *firstcharflagsptr = firstcharflags;
4736 *reqcharptr = reqchar;
4737 *reqcharflagsptr = reqcharflags;
4738 *codeptr = code;
4739 *ptrptr = ptr;
4740 if (lengthptr != NULL)
4741 {
4742 if (OFLOW_MAX - *lengthptr < code - last_code)
4743 {
4744 *errorcodeptr = ERR20;
4745 goto FAILED;
4746 }
4747 *lengthptr += (int)(code - last_code); /* To include callout length */
4748 DPRINTF((">> end branch\n"));
4749 }
4750 return TRUE;
4751
4752
4753 /* ===================================================================*/
4754 /* Handle single-character metacharacters. In multiline mode, ^ disables
4755 the setting of any following char as a first character. */
4756
4757 case CHAR_CIRCUMFLEX_ACCENT:
4758 previous = NULL;
4759 if ((options & PCRE_MULTILINE) != 0)
4760 {
4761 if (firstcharflags == REQ_UNSET)
4762 zerofirstcharflags = firstcharflags = REQ_NONE;
4763 *code++ = OP_CIRCM;
4764 }
4765 else *code++ = OP_CIRC;
4766 break;
4767
4768 case CHAR_DOLLAR_SIGN:
4769 previous = NULL;
4770 *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
4771 break;
4772
4773 /* There can never be a first char if '.' is first, whatever happens about
4774 repeats. The value of reqchar doesn't change either. */
4775
4776 case CHAR_DOT:
4777 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4778 zerofirstchar = firstchar;
4779 zerofirstcharflags = firstcharflags;
4780 zeroreqchar = reqchar;
4781 zeroreqcharflags = reqcharflags;
4782 previous = code;
4783 item_hwm_offset = cd->hwm - cd->start_workspace;
4784 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
4785 break;
4786
4787
4788 /* ===================================================================*/
4789 /* Character classes. If the included characters are all < 256, we build a
4790 32-byte bitmap of the permitted characters, except in the special case
4791 where there is only one such character. For negated classes, we build the
4792 map as usual, then invert it at the end. However, we use a different opcode
4793 so that data characters > 255 can be handled correctly.
4794
4795 If the class contains characters outside the 0-255 range, a different
4796 opcode is compiled. It may optionally have a bit map for characters < 256,
4797 but those above are are explicitly listed afterwards. A flag byte tells
4798 whether the bitmap is present, and whether this is a negated class or not.
4799
4800 In JavaScript compatibility mode, an isolated ']' causes an error. In
4801 default (Perl) mode, it is treated as a data character. */
4802
4803 case CHAR_RIGHT_SQUARE_BRACKET:
4804 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4805 {
4806 *errorcodeptr = ERR64;
4807 goto FAILED;
4808 }
4809 goto NORMAL_CHAR;
4810
4811 /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
4812 used for "start of word" and "end of word". As these are otherwise illegal
4813 sequences, we don't break anything by recognizing them. They are replaced
4814 by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
4815 erroneous and are handled by the normal code below. */
4816
4817 case CHAR_LEFT_SQUARE_BRACKET:
4818 if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
4819 {
4820 nestptr = ptr + 7;
4821 ptr = sub_start_of_word - 1;
4822 continue;
4823 }
4824
4825 if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
4826 {
4827 nestptr = ptr + 7;
4828 ptr = sub_end_of_word - 1;
4829 continue;
4830 }
4831
4832 /* Handle a real character class. */
4833
4834 previous = code;
4835 item_hwm_offset = cd->hwm - cd->start_workspace;
4836
4837 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
4838 they are encountered at the top level, so we'll do that too. */
4839
4840 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4841 ptr[1] == CHAR_EQUALS_SIGN) &&
4842 check_posix_syntax(ptr, &tempptr))
4843 {
4844 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
4845 goto FAILED;
4846 }
4847
4848 /* If the first character is '^', set the negation flag and skip it. Also,
4849 if the first few characters (either before or after ^) are \Q\E or \E we
4850 skip them too. This makes for compatibility with Perl. */
4851
4852 negate_class = FALSE;
4853 for (;;)
4854 {
4855 c = *(++ptr);
4856 if (c == CHAR_BACKSLASH)
4857 {
4858 if (ptr[1] == CHAR_E)
4859 ptr++;
4860 else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
4861 ptr += 3;
4862 else
4863 break;
4864 }
4865 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4866 negate_class = TRUE;
4867 else break;
4868 }
4869
4870 /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
4871 an initial ']' is taken as a data character -- the code below handles
4872 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
4873 [^] must match any character, so generate OP_ALLANY. */
4874
4875 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4876 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4877 {
4878 *code++ = negate_class? OP_ALLANY : OP_FAIL;
4879 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4880 zerofirstchar = firstchar;
4881 zerofirstcharflags = firstcharflags;
4882 break;
4883 }
4884
4885 /* If a class contains a negative special such as \S, we need to flip the
4886 negation flag at the end, so that support for characters > 255 works
4887 correctly (they are all included in the class). */
4888
4889 should_flip_negation = FALSE;
4890
4891 /* Extended class (xclass) will be used when characters > 255
4892 might match. */
4893
4894 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4895 xclass = FALSE;
4896 class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
4897 class_uchardata_base = class_uchardata; /* Save the start */
4898 #endif
4899
4900 /* For optimization purposes, we track some properties of the class:
4901 class_has_8bitchar will be non-zero if the class contains at least one <
4902 256 character; class_one_char will be 1 if the class contains just one
4903 character; xclass_has_prop will be TRUE if unicode property checks
4904 are present in the class. */
4905
4906 class_has_8bitchar = 0;
4907 class_one_char = 0;
4908 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4909 xclass_has_prop = FALSE;
4910 #endif
4911
4912 /* Initialize the 32-char bit map to all zeros. We build the map in a
4913 temporary bit of memory, in case the class contains fewer than two
4914 8-bit characters because in that case the compiled code doesn't use the bit
4915 map. */
4916
4917 memset(classbits, 0, 32 * sizeof(pcre_uint8));
4918
4919 /* Process characters until ] is reached. By writing this as a "do" it
4920 means that an initial ] is taken as a data character. At the start of the
4921 loop, c contains the first byte of the character. */
4922
4923 if (c != CHAR_NULL) do
4924 {
4925 const pcre_uchar *oldptr;
4926
4927 #ifdef SUPPORT_UTF
4928 if (utf && HAS_EXTRALEN(c))
4929 { /* Braces are required because the */
4930 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
4931 }
4932 #endif
4933
4934 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4935 /* In the pre-compile phase, accumulate the length of any extra
4936 data and reset the pointer. This is so that very large classes that
4937 contain a zillion > 255 characters no longer overwrite the work space
4938 (which is on the stack). We have to remember that there was XCLASS data,
4939 however. */
4940
4941 if (class_uchardata > class_uchardata_base) xclass = TRUE;
4942
4943 if (lengthptr != NULL && class_uchardata > class_uchardata_base)
4944 {
4945 *lengthptr += (int)(class_uchardata - class_uchardata_base);
4946 class_uchardata = class_uchardata_base;
4947 }
4948 #endif
4949
4950 /* Inside \Q...\E everything is literal except \E */
4951
4952 if (inescq)
4953 {
4954 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
4955 {
4956 inescq = FALSE; /* Reset literal state */
4957 ptr++; /* Skip the 'E' */
4958 continue; /* Carry on with next */
4959 }
4960 goto CHECK_RANGE; /* Could be range if \E follows */
4961 }
4962
4963 /* Handle POSIX class names. Perl allows a negation extension of the
4964 form [:^name:]. A square bracket that doesn't match the syntax is
4965 treated as a literal. We also recognize the POSIX constructions
4966 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
4967 5.6 and 5.8 do. */
4968
4969 if (c == CHAR_LEFT_SQUARE_BRACKET &&
4970 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4971 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
4972 {
4973 BOOL local_negate = FALSE;
4974 int posix_class, taboffset, tabopt;
4975 register const pcre_uint8 *cbits = cd->cbits;
4976 pcre_uint8 pbits[32];
4977
4978 if (ptr[1] != CHAR_COLON)
4979 {
4980 *errorcodeptr = ERR31;
4981 goto FAILED;
4982 }
4983
4984 ptr += 2;
4985 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
4986 {
4987 local_negate = TRUE;
4988 should_flip_negation = TRUE; /* Note negative special */
4989 ptr++;
4990 }
4991
4992 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
4993 if (posix_class < 0)
4994 {
4995 *errorcodeptr = ERR30;
4996 goto FAILED;
4997 }
4998
4999 /* If matching is caseless, upper and lower are converted to
5000 alpha. This relies on the fact that the class table starts with
5001 alpha, lower, upper as the first 3 entries. */
5002
5003 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
5004 posix_class = 0;
5005
5006 /* When PCRE_UCP is set, some of the POSIX classes are converted to
5007 different escape sequences that use Unicode properties \p or \P. Others
5008 that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
5009 directly. */
5010
5011 #ifdef SUPPORT_UCP
5012 if ((options & PCRE_UCP) != 0)
5013 {
5014 unsigned int ptype = 0;
5015 int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
5016
5017 /* The posix_substitutes table specifies which POSIX classes can be
5018 converted to \p or \P items. */
5019
5020 if (posix_substitutes[pc] != NULL)
5021 {
5022 nestptr = tempptr + 1;
5023 ptr = posix_substitutes[pc] - 1;
5024 continue;
5025 }
5026
5027 /* There are three other classes that generate special property calls
5028 that are recognized only in an XCLASS. */
5029
5030 else switch(posix_class)
5031 {
5032 case PC_GRAPH:
5033 ptype = PT_PXGRAPH;
5034 /* Fall through */
5035 case PC_PRINT:
5036 if (ptype == 0) ptype = PT_PXPRINT;
5037 /* Fall through */
5038 case PC_PUNCT:
5039 if (ptype == 0) ptype = PT_PXPUNCT;
5040 *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
5041 *class_uchardata++ = ptype;
5042 *class_uchardata++ = 0;
5043 xclass_has_prop = TRUE;
5044 ptr = tempptr + 1;
5045 continue;
5046
5047 /* For the other POSIX classes (ascii, xdigit) we are going to fall
5048 through to the non-UCP case and build a bit map for characters with
5049 code points less than 256. If we are in a negated POSIX class
5050 within a non-negated overall class, characters with code points
5051 greater than 255 must all match. In the special case where we have
5052 not yet generated any xclass data, and this is the final item in
5053 the overall class, we need do nothing: later on, the opcode
5054 OP_NCLASS will be used to indicate that characters greater than 255
5055 are acceptable. If we have already seen an xclass item or one may
5056 follow (we have to assume that it might if this is not the end of
5057 the class), explicitly match all wide codepoints. */
5058
5059 default:
5060 if (!negate_class && local_negate &&
5061 (xclass || tempptr[2] != CHAR_RIGHT_SQUARE_BRACKET))
5062 {
5063 *class_uchardata++ = XCL_RANGE;
5064 class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
5065 class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
5066 }
5067 break;
5068 }
5069 }
5070 #endif
5071 /* In the non-UCP case, or when UCP makes no difference, we build the
5072 bit map for the POSIX class in a chunk of local store because we may be
5073 adding and subtracting from it, and we don't want to subtract bits that
5074 may be in the main map already. At the end we or the result into the
5075 bit map that is being built. */
5076
5077 posix_class *= 3;
5078
5079 /* Copy in the first table (always present) */
5080
5081 memcpy(pbits, cbits + posix_class_maps[posix_class],
5082 32 * sizeof(pcre_uint8));
5083
5084 /* If there is a second table, add or remove it as required. */
5085
5086 taboffset = posix_class_maps[posix_class + 1];
5087 tabopt = posix_class_maps[posix_class + 2];
5088
5089 if (taboffset >= 0)
5090 {
5091 if (tabopt >= 0)
5092 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
5093 else
5094 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
5095 }
5096
5097 /* Now see if we need to remove any special characters. An option
5098 value of 1 removes vertical space and 2 removes underscore. */
5099
5100 if (tabopt < 0) tabopt = -tabopt;
5101 if (tabopt == 1) pbits[1] &= ~0x3c;
5102 else if (tabopt == 2) pbits[11] &= 0x7f;
5103
5104 /* Add the POSIX table or its complement into the main table that is
5105 being built and we are done. */
5106
5107 if (local_negate)
5108 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
5109 else
5110 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
5111
5112 ptr = tempptr + 1;
5113 /* Every class contains at least one < 256 character. */
5114 class_has_8bitchar = 1;
5115 /* Every class contains at least two characters. */
5116 class_one_char = 2;
5117 continue; /* End of POSIX syntax handling */
5118 }
5119
5120 /* Backslash may introduce a single character, or it may introduce one
5121 of the specials, which just set a flag. The sequence \b is a special
5122 case. Inside a class (and only there) it is treated as backspace. We
5123 assume that other escapes have more than one character in them, so
5124 speculatively set both class_has_8bitchar and class_one_char bigger
5125 than one. Unrecognized escapes fall through and are either treated
5126 as literal characters (by default), or are faulted if
5127 PCRE_EXTRA is set. */
5128
5129 if (c == CHAR_BACKSLASH)
5130 {
5131 escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options,
5132 TRUE);
5133 if (*errorcodeptr != 0) goto FAILED;
5134 if (escape == 0) c = ec;
5135 else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
5136 else if (escape == ESC_N) /* \N is not supported in a class */
5137 {
5138 *errorcodeptr = ERR71;
5139 goto FAILED;
5140 }
5141 else if (escape == ESC_Q) /* Handle start of quoted string */
5142 {
5143 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5144 {
5145 ptr += 2; /* avoid empty string */
5146 }
5147 else inescq = TRUE;
5148 continue;
5149 }
5150 else if (escape == ESC_E) continue; /* Ignore orphan \E */
5151
5152 else
5153 {
5154 register const pcre_uint8 *cbits = cd->cbits;
5155 /* Every class contains at least two < 256 characters. */
5156 class_has_8bitchar++;
5157 /* Every class contains at least two characters. */
5158 class_one_char += 2;
5159
5160 switch (escape)
5161 {
5162 #ifdef SUPPORT_UCP
5163 case ESC_du: /* These are the values given for \d etc */
5164 case ESC_DU: /* when PCRE_UCP is set. We replace the */
5165 case ESC_wu: /* escape sequence with an appropriate \p */
5166 case ESC_WU: /* or \P to test Unicode properties instead */
5167 case ESC_su: /* of the default ASCII testing. */
5168 case ESC_SU:
5169 nestptr = ptr;
5170 ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */
5171 class_has_8bitchar--; /* Undo! */
5172 continue;
5173 #endif
5174 case ESC_d:
5175 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
5176 continue;
5177
5178 case ESC_D:
5179 should_flip_negation = TRUE;
5180 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
5181 continue;
5182
5183 case ESC_w:
5184 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
5185 continue;
5186
5187 case ESC_W:
5188 should_flip_negation = TRUE;
5189 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
5190 continue;
5191
5192 /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5193 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5194 previously set by something earlier in the character class.
5195 Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5196 we could just adjust the appropriate bit. From PCRE 8.34 we no
5197 longer treat \s and \S specially. */
5198
5199 case ESC_s:
5200 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
5201 continue;
5202
5203 case ESC_S:
5204 should_flip_negation = TRUE;
5205 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
5206 continue;
5207
5208 /* The rest apply in both UCP and non-UCP cases. */
5209
5210 case ESC_h:
5211 (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5212 PRIV(hspace_list), NOTACHAR);
5213 continue;
5214
5215 case ESC_H:
5216 (void)add_not_list_to_class(classbits, &class_uchardata, options,
5217 cd, PRIV(hspace_list));
5218 continue;
5219
5220 case ESC_v:
5221 (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5222 PRIV(vspace_list), NOTACHAR);
5223 continue;
5224
5225 case ESC_V:
5226 (void)add_not_list_to_class(classbits, &class_uchardata, options,
5227 cd, PRIV(vspace_list));
5228 continue;
5229
5230 case ESC_p:
5231 case ESC_P:
5232 #ifdef SUPPORT_UCP
5233 {
5234 BOOL negated;
5235 unsigned int ptype = 0, pdata = 0;
5236 if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
5237 goto FAILED;
5238 *class_uchardata++ = ((escape == ESC_p) != negated)?
5239 XCL_PROP : XCL_NOTPROP;
5240 *class_uchardata++ = ptype;
5241 *class_uchardata++ = pdata;
5242 xclass_has_prop = TRUE;
5243 class_has_8bitchar--; /* Undo! */
5244 continue;
5245 }
5246 #else
5247 *errorcodeptr = ERR45;
5248 goto FAILED;
5249 #endif
5250 /* Unrecognized escapes are faulted if PCRE is running in its
5251 strict mode. By default, for compatibility with Perl, they are
5252 treated as literals. */
5253
5254 default:
5255 if ((options & PCRE_EXTRA) != 0)
5256 {
5257 *errorcodeptr = ERR7;
5258 goto FAILED;
5259 }
5260 class_has_8bitchar--; /* Undo the speculative increase. */
5261 class_one_char -= 2; /* Undo the speculative increase. */
5262 c = *ptr; /* Get the final character and fall through */
5263 break;
5264 }
5265 }
5266
5267 /* Fall through if the escape just defined a single character (c >= 0).
5268 This may be greater than 256. */
5269
5270 escape = 0;
5271
5272 } /* End of backslash handling */
5273
5274 /* A character may be followed by '-' to form a range. However, Perl does
5275 not permit ']' to be the end of the range. A '-' character at the end is
5276 treated as a literal. Perl ignores orphaned \E sequences entirely. The
5277 code for handling \Q and \E is messy. */
5278
5279 CHECK_RANGE:
5280 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5281 {
5282 inescq = FALSE;
5283 ptr += 2;
5284 }
5285 oldptr = ptr;
5286
5287 /* Remember if \r or \n were explicitly used */
5288
5289 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5290
5291 /* Check for range */
5292
5293 if (!inescq && ptr[1] == CHAR_MINUS)
5294 {
5295 pcre_uint32 d;
5296 ptr += 2;
5297 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
5298
5299 /* If we hit \Q (not followed by \E) at this point, go into escaped
5300 mode. */
5301
5302 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
5303 {
5304 ptr += 2;
5305 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
5306 { ptr += 2; continue; }
5307 inescq = TRUE;
5308 break;
5309 }
5310
5311 /* Minus (hyphen) at the end of a class is treated as a literal, so put
5312 back the pointer and jump to handle the character that preceded it. */
5313
5314 if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
5315 {
5316 ptr = oldptr;
5317 goto CLASS_SINGLE_CHARACTER;
5318 }
5319
5320 /* Otherwise, we have a potential range; pick up the next character */
5321
5322 #ifdef SUPPORT_UTF
5323 if (utf)
5324 { /* Braces are required because the */
5325 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
5326 }
5327 else
5328 #endif
5329 d = *ptr; /* Not UTF-8 mode */
5330
5331 /* The second part of a range can be a single-character escape
5332 sequence, but not any of the other escapes. Perl treats a hyphen as a
5333 literal in such circumstances. However, in Perl's warning mode, a
5334 warning is given, so PCRE now faults it as it is almost certainly a
5335 mistake on the user's part. */
5336
5337 if (!inescq)
5338 {
5339 if (d == CHAR_BACKSLASH)
5340 {
5341 int descape;
5342 descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
5343 if (*errorcodeptr != 0) goto FAILED;
5344
5345 /* 0 means a character was put into d; \b is backspace; any other
5346 special causes an error. */
5347
5348 if (descape != 0)
5349 {
5350 if (descape == ESC_b) d = CHAR_BS; else
5351 {
5352 *errorcodeptr = ERR83;
5353 goto FAILED;
5354 }
5355 }
5356 }
5357
5358 /* A hyphen followed by a POSIX class is treated in the same way. */
5359
5360 else if (d == CHAR_LEFT_SQUARE_BRACKET &&
5361 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
5362 ptr[1] == CHAR_EQUALS_SIGN) &&
5363 check_posix_syntax(ptr, &tempptr))
5364 {
5365 *errorcodeptr = ERR83;
5366 goto FAILED;
5367 }
5368 }
5369
5370 /* Check that the two values are in the correct order. Optimize
5371 one-character ranges. */
5372
5373 if (d < c)
5374 {
5375 *errorcodeptr = ERR8;
5376 goto FAILED;
5377 }
5378 if (d == c) goto CLASS_SINGLE_CHARACTER; /* A few lines below */
5379
5380 /* We have found a character range, so single character optimizations
5381 cannot be done anymore. Any value greater than 1 indicates that there
5382 is more than one character. */
5383
5384 class_one_char = 2;
5385
5386 /* Remember an explicit \r or \n, and add the range to the class. */
5387
5388 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5389
5390 class_has_8bitchar +=
5391 add_to_class(classbits, &class_uchardata, options, cd, c, d);
5392
5393 continue; /* Go get the next char in the class */
5394 }
5395
5396 /* Handle a single character - we can get here for a normal non-escape
5397 char, or after \ that introduces a single character or for an apparent
5398 range that isn't. Only the value 1 matters for class_one_char, so don't
5399 increase it if it is already 2 or more ... just in case there's a class
5400 with a zillion characters in it. */
5401
5402 CLASS_SINGLE_CHARACTER:
5403 if (class_one_char < 2) class_one_char++;
5404
5405 /* If xclass_has_prop is false and class_one_char is 1, we have the first
5406 single character in the class, and there have been no prior ranges, or
5407 XCLASS items generated by escapes. If this is the final character in the
5408 class, we can optimize by turning the item into a 1-character OP_CHAR[I]
5409 if it's positive, or OP_NOT[I] if it's negative. In the positive case, it
5410 can cause firstchar to be set. Otherwise, there can be no first char if
5411 this item is first, whatever repeat count may follow. In the case of
5412 reqchar, save the previous value for reinstating. */
5413
5414 if (!inescq &&
5415 #ifdef SUPPORT_UCP
5416 !xclass_has_prop &&
5417 #endif
5418 class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
5419 {
5420 ptr++;
5421 zeroreqchar = reqchar;
5422 zeroreqcharflags = reqcharflags;
5423
5424 if (negate_class)
5425 {
5426 #ifdef SUPPORT_UCP
5427 int d;
5428 #endif
5429 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5430 zerofirstchar = firstchar;
5431 zerofirstcharflags = firstcharflags;
5432
5433 /* For caseless UTF-8 mode when UCP support is available, check
5434 whether this character has more than one other case. If so, generate
5435 a special OP_NOTPROP item instead of OP_NOTI. */
5436
5437 #ifdef SUPPORT_UCP
5438 if (utf && (options & PCRE_CASELESS) != 0 &&
5439 (d = UCD_CASESET(c)) != 0)
5440 {
5441 *code++ = OP_NOTPROP;
5442 *code++ = PT_CLIST;
5443 *code++ = d;
5444 }
5445 else
5446 #endif
5447 /* Char has only one other case, or UCP not available */
5448
5449 {
5450 *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
5451 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5452 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5453 code += PRIV(ord2utf)(c, code);
5454 else
5455 #endif
5456 *code++ = c;
5457 }
5458
5459 /* We are finished with this character class */
5460
5461 goto END_CLASS;
5462 }
5463
5464 /* For a single, positive character, get the value into mcbuffer, and
5465 then we can handle this with the normal one-character code. */
5466
5467 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5468 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5469 mclength = PRIV(ord2utf)(c, mcbuffer);
5470 else
5471 #endif
5472 {
5473 mcbuffer[0] = c;
5474 mclength = 1;
5475 }
5476 goto ONE_CHAR;
5477 } /* End of 1-char optimization */
5478
5479 /* There is more than one character in the class, or an XCLASS item
5480 has been generated. Add this character to the class. */
5481
5482 class_has_8bitchar +=
5483 add_to_class(classbits, &class_uchardata, options, cd, c, c);
5484 }
5485
5486 /* Loop until ']' reached. This "while" is the end of the "do" far above.
5487 If we are at the end of an internal nested string, revert to the outer
5488 string. */
5489
5490 while (((c = *(++ptr)) != CHAR_NULL ||
5491 (nestptr != NULL &&
5492 (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != CHAR_NULL)) &&
5493 (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
5494
5495 /* Check for missing terminating ']' */
5496
5497 if (c == CHAR_NULL)
5498 {
5499 *errorcodeptr = ERR6;
5500 goto FAILED;
5501 }
5502
5503 /* We will need an XCLASS if data has been placed in class_uchardata. In
5504 the second phase this is a sufficient test. However, in the pre-compile
5505 phase, class_uchardata gets emptied to prevent workspace overflow, so
5506 only if the very last character in the class needs XCLASS will it contain
5507 anything at this point. For this reason, xclass gets set TRUE above when
5508 class_uchardata is emptied, and that's why this code is the way it is here
5509 instead of just doing a test on class_uchardata below. */
5510
5511 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5512 if (class_uchardata > class_uchardata_base) xclass = TRUE;
5513 #endif
5514
5515 /* If this is the first thing in the branch, there can be no first char
5516 setting, whatever the repeat count. Any reqchar setting must remain
5517 unchanged after any kind of repeat. */
5518
5519 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5520 zerofirstchar = firstchar;
5521 zerofirstcharflags = firstcharflags;
5522 zeroreqchar = reqchar;
5523 zeroreqcharflags = reqcharflags;
5524
5525 /* If there are characters with values > 255, we have to compile an
5526 extended class, with its own opcode, unless there was a negated special
5527 such as \S in the class, and PCRE_UCP is not set, because in that case all
5528 characters > 255 are in the class, so any that were explicitly given as
5529 well can be ignored. If (when there are explicit characters > 255 that must
5530 be listed) there are no characters < 256, we can omit the bitmap in the
5531 actual compiled code. */
5532
5533 #ifdef SUPPORT_UTF
5534 if (xclass && (xclass_has_prop || !should_flip_negation ||
5535 (options & PCRE_UCP) != 0))
5536 #elif !defined COMPILE_PCRE8
5537 if (xclass && (xclass_has_prop || !should_flip_negation))
5538 #endif
5539 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5540 {
5541 *class_uchardata++ = XCL_END; /* Marks the end of extra data */
5542 *code++ = OP_XCLASS;
5543 code += LINK_SIZE;
5544 *code = negate_class? XCL_NOT:0;
5545 if (xclass_has_prop) *code |= XCL_HASPROP;
5546
5547 /* If the map is required, move up the extra data to make room for it;
5548 otherwise just move the code pointer to the end of the extra data. */
5549
5550 if (class_has_8bitchar > 0)
5551 {
5552 *code++ |= XCL_MAP;
5553 memmove(code + (32 / sizeof(pcre_uchar)), code,
5554 IN_UCHARS(class_uchardata - code));
5555 if (negate_class && !xclass_has_prop)
5556 for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5557 memcpy(code, classbits, 32);
5558 code = class_uchardata + (32 / sizeof(pcre_uchar));
5559 }
5560 else code = class_uchardata;
5561
5562 /* Now fill in the complete length of the item */
5563
5564 PUT(previous, 1, (int)(code - previous));
5565 break; /* End of class handling */
5566 }
5567
5568 /* Even though any XCLASS list is now discarded, we must allow for
5569 its memory. */
5570
5571 if (lengthptr != NULL)
5572 *lengthptr += (int)(class_uchardata - class_uchardata_base);
5573 #endif
5574
5575 /* If there are no characters > 255, or they are all to be included or
5576 excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
5577 whole class was negated and whether there were negative specials such as \S
5578 (non-UCP) in the class. Then copy the 32-byte map into the code vector,
5579 negating it if necessary. */
5580
5581 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
5582 if (lengthptr == NULL) /* Save time in the pre-compile phase */
5583 {
5584 if (negate_class)
5585 for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5586 memcpy(code, classbits, 32);
5587 }
5588 code += 32 / sizeof(pcre_uchar);
5589
5590 END_CLASS:
5591 break;
5592
5593
5594 /* ===================================================================*/
5595 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
5596 has been tested above. */
5597
5598 case CHAR_LEFT_CURLY_BRACKET:
5599 if (!is_quantifier) goto NORMAL_CHAR;
5600 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
5601 if (*errorcodeptr != 0) goto FAILED;
5602 goto REPEAT;
5603
5604 case CHAR_ASTERISK:
5605 repeat_min = 0;
5606 repeat_max = -1;
5607 goto REPEAT;
5608
5609 case CHAR_PLUS:
5610 repeat_min = 1;
5611 repeat_max = -1;
5612 goto REPEAT;
5613
5614 case CHAR_QUESTION_MARK:
5615 repeat_min = 0;
5616 repeat_max = 1;
5617
5618 REPEAT:
5619 if (previous == NULL)
5620 {
5621 *errorcodeptr = ERR9;
5622 goto FAILED;
5623 }
5624
5625 if (repeat_min == 0)
5626 {
5627 firstchar = zerofirstchar; /* Adjust for zero repeat */
5628 firstcharflags = zerofirstcharflags;
5629 reqchar = zeroreqchar; /* Ditto */
5630 reqcharflags = zeroreqcharflags;
5631 }
5632
5633 /* Remember whether this is a variable length repeat */
5634
5635 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
5636
5637 op_type = 0; /* Default single-char op codes */
5638 possessive_quantifier = FALSE; /* Default not possessive quantifier */
5639
5640 /* Save start of previous item, in case we have to move it up in order to
5641 insert something before it. */
5642
5643 tempcode = previous;
5644
5645 /* Before checking for a possessive quantifier, we must skip over
5646 whitespace and comments in extended mode because Perl allows white space at
5647 this point. */
5648
5649 if ((options & PCRE_EXTENDED) != 0)
5650 {
5651 const pcre_uchar *p = ptr + 1;
5652 for (;;)
5653 {
5654 while (MAX_255(*p) && (cd->ctypes[*p] & ctype_space) != 0) p++;
5655 if (*p != CHAR_NUMBER_SIGN) break;
5656 p++;
5657 while (*p != CHAR_NULL)
5658 {
5659 if (IS_NEWLINE(p)) /* For non-fixed-length newline cases, */
5660 { /* IS_NEWLINE sets cd->nllen. */
5661 p += cd->nllen;
5662 break;
5663 }
5664 p++;
5665 #ifdef SUPPORT_UTF
5666 if (utf) FORWARDCHAR(p);
5667 #endif
5668 } /* Loop for comment characters */
5669 } /* Loop for multiple comments */
5670 ptr = p - 1; /* Character before the next significant one. */
5671 }
5672
5673 /* If the next character is '+', we have a possessive quantifier. This
5674 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
5675 If the next character is '?' this is a minimizing repeat, by default,
5676 but if PCRE_UNGREEDY is set, it works the other way round. We change the
5677 repeat type to the non-default. */
5678
5679 if (ptr[1] == CHAR_PLUS)
5680 {
5681 repeat_type = 0; /* Force greedy */
5682 possessive_quantifier = TRUE;
5683 ptr++;
5684 }
5685 else if (ptr[1] == CHAR_QUESTION_MARK)
5686 {
5687 repeat_type = greedy_non_default;
5688 ptr++;
5689 }
5690 else repeat_type = greedy_default;
5691
5692 /* If previous was a recursion call, wrap it in atomic brackets so that
5693 previous becomes the atomic group. All recursions were so wrapped in the
5694 past, but it no longer happens for non-repeated recursions. In fact, the
5695 repeated ones could be re-implemented independently so as not to need this,
5696 but for the moment we rely on the code for repeating groups. */
5697
5698 if (*previous == OP_RECURSE)
5699 {
5700 memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
5701 *previous = OP_ONCE;
5702 PUT(previous, 1, 2 + 2*LINK_SIZE);
5703 previous[2 + 2*LINK_SIZE] = OP_KET;
5704 PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
5705 code += 2 + 2 * LINK_SIZE;
5706 length_prevgroup = 3 + 3*LINK_SIZE;
5707
5708 /* When actually compiling, we need to check whether this was a forward
5709 reference, and if so, adjust the offset. */
5710
5711 if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
5712 {
5713 int offset = GET(cd->hwm, -LINK_SIZE);
5714 if (offset == previous + 1 - cd->start_code)
5715 PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
5716 }
5717 }
5718
5719 /* Now handle repetition for the different types of item. */
5720
5721 /* If previous was a character or negated character match, abolish the item
5722 and generate a repeat item instead. If a char item has a minimum of more
5723 than one, ensure that it is set in reqchar - it might not be if a sequence
5724 such as x{3} is the first thing in a branch because the x will have gone
5725 into firstchar instead. */
5726
5727 if (*previous == OP_CHAR || *previous == OP_CHARI
5728 || *previous == OP_NOT || *previous == OP_NOTI)
5729 {
5730 switch (*previous)
5731 {
5732 default: /* Make compiler happy. */
5733 case OP_CHAR: op_type = OP_STAR - OP_STAR; break;
5734 case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
5735 case OP_NOT: op_type = OP_NOTSTAR - OP_STAR; break;
5736 case OP_NOTI: op_type = OP_NOTSTARI - OP_STAR; break;
5737 }
5738
5739 /* Deal with UTF characters that take up more than one character. It's
5740 easier to write this out separately than try to macrify it. Use c to
5741 hold the length of the character in bytes, plus UTF_LENGTH to flag that
5742 it's a length rather than a small character. */
5743
5744 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5745 if (utf && NOT_FIRSTCHAR(code[-1]))
5746 {
5747 pcre_uchar *lastchar = code - 1;
5748 BACKCHAR(lastchar);
5749 c = (int)(code - lastchar); /* Length of UTF-8 character */
5750 memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
5751 c |= UTF_LENGTH; /* Flag c as a length */
5752 }
5753 else
5754 #endif /* SUPPORT_UTF */
5755
5756 /* Handle the case of a single character - either with no UTF support, or
5757 with UTF disabled, or for a single character UTF character. */
5758 {
5759 c = code[-1];
5760 if (*previous <= OP_CHARI && repeat_min > 1)
5761 {
5762 reqchar = c;
5763 reqcharflags = req_caseopt | cd->req_varyopt;
5764 }
5765 }
5766
5767 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
5768 }
5769
5770 /* If previous was a character type match (\d or similar), abolish it and
5771 create a suitable repeat item. The code is shared with single-character
5772 repeats by setting op_type to add a suitable offset into repeat_type. Note
5773 that the Unicode property types will be present only when SUPPORT_UCP is
5774 defined, but we don't wrap the little bits of code here because it just
5775 makes it horribly messy. */
5776
5777 else if (*previous < OP_EODN)
5778 {
5779 pcre_uchar *oldcode;
5780 int prop_type, prop_value;
5781 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
5782 c = *previous;
5783
5784 OUTPUT_SINGLE_REPEAT:
5785 if (*previous == OP_PROP || *previous == OP_NOTPROP)
5786 {
5787 prop_type = previous[1];
5788 prop_value = previous[2];
5789 }
5790 else prop_type = prop_value = -1;
5791
5792 oldcode = code;
5793 code = previous; /* Usually overwrite previous item */
5794
5795 /* If the maximum is zero then the minimum must also be zero; Perl allows
5796 this case, so we do too - by simply omitting the item altogether. */
5797
5798 if (repeat_max == 0) goto END_REPEAT;
5799
5800 /* Combine the op_type with the repeat_type */
5801
5802 repeat_type += op_type;
5803
5804 /* A minimum of zero is handled either as the special case * or ?, or as
5805 an UPTO, with the maximum given. */
5806
5807 if (repeat_min == 0)
5808 {
5809 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
5810 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
5811 else
5812 {
5813 *code++ = OP_UPTO + repeat_type;
5814 PUT2INC(code, 0, repeat_max);
5815 }
5816 }
5817
5818 /* A repeat minimum of 1 is optimized into some special cases. If the
5819 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
5820 left in place and, if the maximum is greater than 1, we use OP_UPTO with
5821 one less than the maximum. */
5822
5823 else if (repeat_min == 1)
5824 {
5825 if (repeat_max == -1)
5826 *code++ = OP_PLUS + repeat_type;
5827 else
5828 {
5829 code = oldcode; /* leave previous item in place */
5830 if (repeat_max == 1) goto END_REPEAT;
5831 *code++ = OP_UPTO + repeat_type;
5832 PUT2INC(code, 0, repeat_max - 1);
5833 }
5834 }
5835
5836 /* The case {n,n} is just an EXACT, while the general case {n,m} is
5837 handled as an EXACT followed by an UPTO. */
5838
5839 else
5840 {
5841 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
5842 PUT2INC(code, 0, repeat_min);
5843
5844 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
5845 we have to insert the character for the previous code. For a repeated
5846 Unicode property match, there are two extra bytes that define the
5847 required property. In UTF-8 mode, long characters have their length in
5848 c, with the UTF_LENGTH bit as a flag. */
5849
5850 if (repeat_max < 0)
5851 {
5852 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5853 if (utf && (c & UTF_LENGTH) != 0)
5854 {
5855 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5856 code += c & 7;
5857 }
5858 else
5859 #endif
5860 {
5861 *code++ = c;
5862 if (prop_type >= 0)
5863 {
5864 *code++ = prop_type;
5865 *code++ = prop_value;
5866 }
5867 }
5868 *code++ = OP_STAR + repeat_type;
5869 }
5870
5871 /* Else insert an UPTO if the max is greater than the min, again
5872 preceded by the character, for the previously inserted code. If the
5873 UPTO is just for 1 instance, we can use QUERY instead. */
5874
5875 else if (repeat_max != repeat_min)
5876 {
5877 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5878 if (utf && (c & UTF_LENGTH) != 0)
5879 {
5880 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5881 code += c & 7;
5882 }
5883 else
5884 #endif
5885 *code++ = c;
5886 if (prop_type >= 0)
5887 {
5888 *code++ = prop_type;
5889 *code++ = prop_value;
5890 }
5891 repeat_max -= repeat_min;
5892
5893 if (repeat_max == 1)
5894 {
5895 *code++ = OP_QUERY + repeat_type;
5896 }
5897 else
5898 {
5899 *code++ = OP_UPTO + repeat_type;
5900 PUT2INC(code, 0, repeat_max);
5901 }
5902 }
5903 }
5904
5905 /* The character or character type itself comes last in all cases. */
5906
5907 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5908 if (utf && (c & UTF_LENGTH) != 0)
5909 {
5910 memcpy(code, utf_chars, IN_UCHARS(c & 7));
5911 code += c & 7;
5912 }
5913 else
5914 #endif
5915 *code++ = c;
5916
5917 /* For a repeated Unicode property match, there are two extra bytes that
5918 define the required property. */
5919
5920 #ifdef SUPPORT_UCP
5921 if (prop_type >= 0)
5922 {
5923 *code++ = prop_type;
5924 *code++ = prop_value;
5925 }
5926 #endif
5927 }
5928
5929 /* If previous was a character class or a back reference, we put the repeat
5930 stuff after it, but just skip the item if the repeat was {0,0}. */
5931
5932 else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
5933 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5934 *previous == OP_XCLASS ||
5935 #endif
5936 *previous == OP_REF || *previous == OP_REFI ||
5937 *previous == OP_DNREF || *previous == OP_DNREFI)
5938 {
5939 if (repeat_max == 0)
5940 {
5941 code = previous;
5942 goto END_REPEAT;
5943 }
5944
5945 if (repeat_min == 0 && repeat_max == -1)
5946 *code++ = OP_CRSTAR + repeat_type;
5947 else if (repeat_min == 1 && repeat_max == -1)
5948 *code++ = OP_CRPLUS + repeat_type;
5949 else if (repeat_min == 0 && repeat_max == 1)
5950 *code++ = OP_CRQUERY + repeat_type;
5951 else
5952 {
5953 *code++ = OP_CRRANGE + repeat_type;
5954 PUT2INC(code, 0, repeat_min);
5955 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
5956 PUT2INC(code, 0, repeat_max);
5957 }
5958 }
5959
5960 /* If previous was a bracket group, we may have to replicate it in certain
5961 cases. Note that at this point we can encounter only the "basic" bracket
5962 opcodes such as BRA and CBRA, as this is the place where they get converted
5963 into the more special varieties such as BRAPOS and SBRA. A test for >=
5964 OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
5965 ASSERTBACK_NOT, ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND.
5966 Originally, PCRE did not allow repetition of assertions, but now it does,
5967 for Perl compatibility. */
5968
5969 else if (*previous >= OP_ASSERT && *previous <= OP_COND)
5970 {
5971 register int i;
5972 int len = (int)(code - previous);
5973 size_t base_hwm_offset = item_hwm_offset;
5974 pcre_uchar *bralink = NULL;
5975 pcre_uchar *brazeroptr = NULL;
5976
5977 /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
5978 we just ignore the repeat. */
5979
5980 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
5981 goto END_REPEAT;
5982
5983 /* There is no sense in actually repeating assertions. The only potential
5984 use of repetition is in cases when the assertion is optional. Therefore,
5985 if the minimum is greater than zero, just ignore the repeat. If the
5986 maximum is not zero or one, set it to 1. */
5987
5988 if (*previous < OP_ONCE) /* Assertion */
5989 {
5990 if (repeat_min > 0) goto END_REPEAT;
5991 if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
5992 }
5993
5994 /* The case of a zero minimum is special because of the need to stick
5995 OP_BRAZERO in front of it, and because the group appears once in the
5996 data, whereas in other cases it appears the minimum number of times. For
5997 this reason, it is simplest to treat this case separately, as otherwise
5998 the code gets far too messy. There are several special subcases when the
5999 minimum is zero. */
6000
6001 if (repeat_min == 0)
6002 {
6003 /* If the maximum is also zero, we used to just omit the group from the
6004 output altogether, like this:
6005
6006 ** if (repeat_max == 0)
6007 ** {
6008 ** code = previous;
6009 ** goto END_REPEAT;
6010 ** }
6011
6012 However, that fails when a group or a subgroup within it is referenced
6013 as a subroutine from elsewhere in the pattern, so now we stick in
6014 OP_SKIPZERO in front of it so that it is skipped on execution. As we
6015 don't have a list of which groups are referenced, we cannot do this
6016 selectively.
6017
6018 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
6019 and do no more at this point. However, we do need to adjust any
6020 OP_RECURSE calls inside the group that refer to the group itself or any
6021 internal or forward referenced group, because the offset is from the
6022 start of the whole regex. Temporarily terminate the pattern while doing
6023 this. */
6024
6025 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
6026 {
6027 *code = OP_END;
6028 adjust_recurse(previous, 1, utf, cd, item_hwm_offset);
6029 memmove(previous + 1, previous, IN_UCHARS(len));
6030 code++;
6031 if (repeat_max == 0)
6032 {
6033 *previous++ = OP_SKIPZERO;
6034 goto END_REPEAT;
6035 }
6036 brazeroptr = previous; /* Save for possessive optimizing */
6037 *previous++ = OP_BRAZERO + repeat_type;
6038 }
6039
6040 /* If the maximum is greater than 1 and limited, we have to replicate
6041 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
6042 The first one has to be handled carefully because it's the original
6043 copy, which has to be moved up. The remainder can be handled by code
6044 that is common with the non-zero minimum case below. We have to
6045 adjust the value of repeat_max, since one less copy is required. Once
6046 again, we may have to adjust any OP_RECURSE calls inside the group. */
6047
6048 else
6049 {
6050 int offset;
6051 *code = OP_END;
6052 adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, item_hwm_offset);
6053 memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
6054 code += 2 + LINK_SIZE;
6055 *previous++ = OP_BRAZERO + repeat_type;
6056 *previous++ = OP_BRA;
6057
6058 /* We chain together the bracket offset fields that have to be
6059 filled in later when the ends of the brackets are reached. */
6060
6061 offset = (bralink == NULL)? 0 : (int)(previous - bralink);
6062 bralink = previous;
6063 PUTINC(previous, 0, offset);
6064 }
6065
6066 repeat_max--;
6067 }
6068
6069 /* If the minimum is greater than zero, replicate the group as many
6070 times as necessary, and adjust the maximum to the number of subsequent
6071 copies that we need. If we set a first char from the group, and didn't
6072 set a required char, copy the latter from the former. If there are any
6073 forward reference subroutine calls in the group, there will be entries on
6074 the workspace list; replicate these with an appropriate increment. */
6075
6076 else
6077 {
6078 if (repeat_min > 1)
6079 {
6080 /* In the pre-compile phase, we don't actually do the replication. We
6081 just adjust the length as if we had. Do some paranoid checks for
6082 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
6083 integer type when available, otherwise double. */
6084
6085 if (lengthptr != NULL)
6086 {
6087 int delta = (repeat_min - 1)*length_prevgroup;
6088 if ((INT64_OR_DOUBLE)(repeat_min - 1)*
6089 (INT64_OR_DOUBLE)length_prevgroup >
6090 (INT64_OR_DOUBLE)INT_MAX ||
6091 OFLOW_MAX - *lengthptr < delta)
6092 {
6093 *errorcodeptr = ERR20;
6094 goto FAILED;
6095 }
6096 *lengthptr += delta;
6097 }
6098
6099 /* This is compiling for real. If there is a set first byte for
6100 the group, and we have not yet set a "required byte", set it. Make
6101 sure there is enough workspace for copying forward references before
6102 doing the copy. */
6103
6104 else
6105 {
6106 if (groupsetfirstchar && reqcharflags < 0)
6107 {
6108 reqchar = firstchar;
6109 reqcharflags = firstcharflags;
6110 }
6111
6112 for (i = 1; i < repeat_min; i++)
6113 {
6114 pcre_uchar *hc;
6115 size_t this_hwm_offset = cd->hwm - cd->start_workspace;
6116 memcpy(code, previous, IN_UCHARS(len));
6117
6118 while (cd->hwm > cd->start_workspace + cd->workspace_size -
6119 WORK_SIZE_SAFETY_MARGIN -
6120 (this_hwm_offset - base_hwm_offset))
6121 {
6122 *errorcodeptr = expand_workspace(cd);
6123 if (*errorcodeptr != 0) goto FAILED;
6124 }
6125
6126 for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
6127 hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
6128 hc += LINK_SIZE)
6129 {
6130 PUT(cd->hwm, 0, GET(hc, 0) + len);
6131 cd->hwm += LINK_SIZE;
6132 }
6133 base_hwm_offset = this_hwm_offset;
6134 code += len;
6135 }
6136 }
6137 }
6138
6139 if (repeat_max > 0) repeat_max -= repeat_min;
6140 }
6141
6142 /* This code is common to both the zero and non-zero minimum cases. If
6143 the maximum is limited, it replicates the group in a nested fashion,
6144 remembering the bracket starts on a stack. In the case of a zero minimum,
6145 the first one was set up above. In all cases the repeat_max now specifies
6146 the number of additional copies needed. Again, we must remember to
6147 replicate entries on the forward reference list. */
6148
6149 if (repeat_max >= 0)
6150 {
6151 /* In the pre-compile phase, we don't actually do the replication. We
6152 just adjust the length as if we had. For each repetition we must add 1
6153 to the length for BRAZERO and for all but the last repetition we must
6154 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
6155 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
6156 a 64-bit integer type when available, otherwise double. */
6157
6158 if (lengthptr != NULL && repeat_max > 0)
6159 {
6160 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
6161 2 - 2*LINK_SIZE; /* Last one doesn't nest */
6162 if ((INT64_OR_DOUBLE)repeat_max *
6163 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
6164 > (INT64_OR_DOUBLE)INT_MAX ||
6165 OFLOW_MAX - *lengthptr < delta)
6166 {
6167 *errorcodeptr = ERR20;
6168 goto FAILED;
6169 }
6170 *lengthptr += delta;
6171 }
6172
6173 /* This is compiling for real */
6174
6175 else for (i = repeat_max - 1; i >= 0; i--)
6176 {
6177 pcre_uchar *hc;
6178 size_t this_hwm_offset = cd->hwm - cd->start_workspace;
6179
6180 *code++ = OP_BRAZERO + repeat_type;
6181
6182 /* All but the final copy start a new nesting, maintaining the
6183 chain of brackets outstanding. */
6184
6185 if (i != 0)
6186 {
6187 int offset;
6188 *code++ = OP_BRA;
6189 offset = (bralink == NULL)? 0 : (int)(code - bralink);
6190 bralink = code;
6191 PUTINC(code, 0, offset);
6192 }
6193
6194 memcpy(code, previous, IN_UCHARS(len));
6195
6196 /* Ensure there is enough workspace for forward references before
6197 copying them. */
6198
6199 while (cd->hwm > cd->start_workspace + cd->workspace_size -
6200 WORK_SIZE_SAFETY_MARGIN -
6201 (this_hwm_offset - base_hwm_offset))
6202 {
6203 *errorcodeptr = expand_workspace(cd);
6204 if (*errorcodeptr != 0) goto FAILED;
6205 }
6206
6207 for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
6208 hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
6209 hc += LINK_SIZE)
6210 {
6211 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
6212 cd->hwm += LINK_SIZE;
6213 }
6214 base_hwm_offset = this_hwm_offset;
6215 code += len;
6216 }
6217
6218 /* Now chain through the pending brackets, and fill in their length
6219 fields (which are holding the chain links pro tem). */
6220
6221 while (bralink != NULL)
6222 {
6223 int oldlinkoffset;
6224 int offset = (int)(code - bralink + 1);
6225 pcre_uchar *bra = code - offset;
6226 oldlinkoffset = GET(bra, 1);
6227 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
6228 *code++ = OP_KET;
6229 PUTINC(code, 0, offset);
6230 PUT(bra, 1, offset);
6231 }
6232 }
6233
6234 /* If the maximum is unlimited, set a repeater in the final copy. For
6235 ONCE brackets, that's all we need to do. However, possessively repeated
6236 ONCE brackets can be converted into non-capturing brackets, as the
6237 behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
6238 deal with possessive ONCEs specially.
6239
6240 Otherwise, when we are doing the actual compile phase, check to see
6241 whether this group is one that could match an empty string. If so,
6242 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
6243 that runtime checking can be done. [This check is also applied to ONCE
6244 groups at runtime, but in a different way.]
6245
6246 Then, if the quantifier was possessive and the bracket is not a
6247 conditional, we convert the BRA code to the POS form, and the KET code to
6248 KETRPOS. (It turns out to be convenient at runtime to detect this kind of
6249 subpattern at both the start and at the end.) The use of special opcodes
6250 makes it possible to reduce greatly the stack usage in pcre_exec(). If
6251 the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
6252
6253 Then, if the minimum number of matches is 1 or 0, cancel the possessive
6254 flag so that the default action below, of wrapping everything inside
6255 atomic brackets, does not happen. When the minimum is greater than 1,
6256 there will be earlier copies of the group, and so we still have to wrap
6257 the whole thing. */
6258
6259 else
6260 {
6261 pcre_uchar *ketcode = code - 1 - LINK_SIZE;
6262 pcre_uchar *bracode = ketcode - GET(ketcode, 1);
6263
6264 /* Convert possessive ONCE brackets to non-capturing */
6265
6266 if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
6267 possessive_quantifier) *bracode = OP_BRA;
6268
6269 /* For non-possessive ONCE brackets, all we need to do is to
6270 set the KET. */
6271
6272 if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
6273 *ketcode = OP_KETRMAX + repeat_type;
6274
6275 /* Handle non-ONCE brackets and possessive ONCEs (which have been
6276 converted to non-capturing above). */
6277
6278 else
6279 {
6280 /* In the compile phase, check for empty string matching. */
6281
6282 if (lengthptr == NULL)
6283 {
6284 pcre_uchar *scode = bracode;
6285 do
6286 {
6287 if (could_be_empty_branch(scode, ketcode, utf, cd, NULL))
6288 {
6289 *bracode += OP_SBRA - OP_BRA;
6290 break;
6291 }
6292 scode += GET(scode, 1);
6293 }
6294 while (*scode == OP_ALT);
6295 }
6296
6297 /* A conditional group with only one branch has an implicit empty
6298 alternative branch. */
6299
6300 if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
6301 *bracode = OP_SCOND;
6302
6303 /* Handle possessive quantifiers. */
6304
6305 if (possessive_quantifier)
6306 {
6307 /* For COND brackets, we wrap the whole thing in a possessively
6308 repeated non-capturing bracket, because we have not invented POS
6309 versions of the COND opcodes. Because we are moving code along, we
6310 must ensure that any pending recursive references are updated. */
6311
6312 if (*bracode == OP_COND || *bracode == OP_SCOND)
6313 {
6314 int nlen = (int)(code - bracode);
6315 *code = OP_END;
6316 adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6317 memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
6318 code += 1 + LINK_SIZE;
6319 nlen += 1 + LINK_SIZE;
6320 *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
6321 *code++ = OP_KETRPOS;
6322 PUTINC(code, 0, nlen);
6323 PUT(bracode, 1, nlen);
6324 }
6325
6326 /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
6327
6328 else
6329 {
6330 *bracode += 1; /* Switch to xxxPOS opcodes */
6331 *ketcode = OP_KETRPOS;
6332 }
6333
6334 /* If the minimum is zero, mark it as possessive, then unset the
6335 possessive flag when the minimum is 0 or 1. */
6336
6337 if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
6338 if (repeat_min < 2) possessive_quantifier = FALSE;
6339 }
6340
6341 /* Non-possessive quantifier */
6342
6343 else *ketcode = OP_KETRMAX + repeat_type;
6344 }
6345 }
6346 }
6347
6348 /* If previous is OP_FAIL, it was generated by an empty class [] in
6349 JavaScript mode. The other ways in which OP_FAIL can be generated, that is
6350 by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
6351 error above. We can just ignore the repeat in JS case. */
6352
6353 else if (*previous == OP_FAIL) goto END_REPEAT;
6354
6355 /* Else there's some kind of shambles */
6356
6357 else
6358 {
6359 *errorcodeptr = ERR11;
6360 goto FAILED;
6361 }
6362
6363 /* If the character following a repeat is '+', possessive_quantifier is
6364 TRUE. For some opcodes, there are special alternative opcodes for this
6365 case. For anything else, we wrap the entire repeated item inside OP_ONCE
6366 brackets. Logically, the '+' notation is just syntactic sugar, taken from
6367 Sun's Java package, but the special opcodes can optimize it.
6368
6369 Some (but not all) possessively repeated subpatterns have already been
6370 completely handled in the code just above. For them, possessive_quantifier
6371 is always FALSE at this stage. Note that the repeated item starts at
6372 tempcode, not at previous, which might be the first part of a string whose
6373 (former) last char we repeated. */
6374
6375 if (possessive_quantifier)
6376 {
6377 int len;
6378
6379 /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
6380 However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
6381 {5,}, or {5,10}). We skip over an EXACT item; if the length of what
6382 remains is greater than zero, there's a further opcode that can be
6383 handled. If not, do nothing, leaving the EXACT alone. */
6384
6385 switch(*tempcode)
6386 {
6387 case OP_TYPEEXACT:
6388 tempcode += PRIV(OP_lengths)[*tempcode] +
6389 ((tempcode[1 + IMM2_SIZE] == OP_PROP
6390 || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
6391 break;
6392
6393 /* CHAR opcodes are used for exacts whose count is 1. */
6394
6395 case OP_CHAR:
6396 case OP_CHARI:
6397 case OP_NOT:
6398 case OP_NOTI:
6399 case OP_EXACT:
6400 case OP_EXACTI:
6401 case OP_NOTEXACT:
6402 case OP_NOTEXACTI:
6403 tempcode += PRIV(OP_lengths)[*tempcode];
6404 #ifdef SUPPORT_UTF
6405 if (utf && HAS_EXTRALEN(tempcode[-1]))
6406 tempcode += GET_EXTRALEN(tempcode[-1]);
6407 #endif
6408 break;
6409
6410 /* For the class opcodes, the repeat operator appears at the end;
6411 adjust tempcode to point to it. */
6412
6413 case OP_CLASS:
6414 case OP_NCLASS:
6415 tempcode += 1 + 32/sizeof(pcre_uchar);
6416 break;
6417
6418 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6419 case OP_XCLASS:
6420 tempcode += GET(tempcode, 1);
6421 break;
6422 #endif
6423 }
6424
6425 /* If tempcode is equal to code (which points to the end of the repeated
6426 item), it means we have skipped an EXACT item but there is no following
6427 QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
6428 all other cases, tempcode will be pointing to the repeat opcode, and will
6429 be less than code, so the value of len will be greater than 0. */
6430
6431 len = (int)(code - tempcode);
6432 if (len > 0)
6433 {
6434 unsigned int repcode = *tempcode;
6435
6436 /* There is a table for possessifying opcodes, all of which are less
6437 than OP_CALLOUT. A zero entry means there is no possessified version.
6438 */
6439
6440 if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
6441 *tempcode = opcode_possessify[repcode];
6442
6443 /* For opcodes without a special possessified version, wrap the item in
6444 ONCE brackets. Because we are moving code along, we must ensure that any
6445 pending recursive references are updated. */
6446
6447 else
6448 {
6449 *code = OP_END;
6450 adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6451 memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6452 code += 1 + LINK_SIZE;
6453 len += 1 + LINK_SIZE;
6454 tempcode[0] = OP_ONCE;
6455 *code++ = OP_KET;
6456 PUTINC(code, 0, len);
6457 PUT(tempcode, 1, len);
6458 }
6459 }
6460
6461 #ifdef NEVER
6462 if (len > 0) switch (*tempcode)
6463 {
6464 case OP_STAR: *tempcode = OP_POSSTAR; break;
6465 case OP_PLUS: *tempcode = OP_POSPLUS; break;
6466 case OP_QUERY: *tempcode = OP_POSQUERY; break;
6467 case OP_UPTO: *tempcode = OP_POSUPTO; break;
6468
6469 case OP_STARI: *tempcode = OP_POSSTARI; break;
6470 case OP_PLUSI: *tempcode = OP_POSPLUSI; break;
6471 case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
6472 case OP_UPTOI: *tempcode = OP_POSUPTOI; break;
6473
6474 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
6475 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
6476 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
6477 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
6478
6479 case OP_NOTSTARI: *tempcode = OP_NOTPOSSTARI; break;
6480 case OP_NOTPLUSI: *tempcode = OP_NOTPOSPLUSI; break;
6481 case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
6482 case OP_NOTUPTOI: *tempcode = OP_NOTPOSUPTOI; break;
6483
6484 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
6485 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
6486 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
6487 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
6488
6489 case OP_CRSTAR: *tempcode = OP_CRPOSSTAR; break;
6490 case OP_CRPLUS: *tempcode = OP_CRPOSPLUS; break;
6491 case OP_CRQUERY: *tempcode = OP_CRPOSQUERY; break;
6492 case OP_CRRANGE: *tempcode = OP_CRPOSRANGE; break;
6493
6494 /* Because we are moving code along, we must ensure that any
6495 pending recursive references are updated. */
6496
6497 default:
6498 *code = OP_END;
6499 adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6500 memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6501 code += 1 + LINK_SIZE;
6502 len += 1 + LINK_SIZE;
6503 tempcode[0] = OP_ONCE;
6504 *code++ = OP_KET;
6505 PUTINC(code, 0, len);
6506 PUT(tempcode, 1, len);
6507 break;
6508 }
6509 #endif
6510 }
6511
6512 /* In all cases we no longer have a previous item. We also set the
6513 "follows varying string" flag for subsequently encountered reqchars if
6514 it isn't already set and we have just passed a varying length item. */
6515
6516 END_REPEAT:
6517 previous = NULL;
6518 cd->req_varyopt |= reqvary;
6519 break;
6520
6521
6522 /* ===================================================================*/
6523 /* Start of nested parenthesized sub-expression, or comment or lookahead or
6524 lookbehind or option setting or condition or all the other extended
6525 parenthesis forms. */
6526
6527 case CHAR_LEFT_PARENTHESIS:
6528 ptr++;
6529
6530 /* First deal with comments. Putting this code right at the start ensures
6531 that comments have no bad side effects. */
6532
6533 if (ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
6534 {
6535 ptr += 2;
6536 while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6537 if (*ptr == CHAR_NULL)
6538 {
6539 *errorcodeptr = ERR18;
6540 goto FAILED;
6541 }
6542 continue;
6543 }
6544
6545 /* Now deal with various "verbs" that can be introduced by '*'. */
6546
6547 if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
6548 || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
6549 {
6550 int i, namelen;
6551 int arglen = 0;
6552 const char *vn = verbnames;
6553 const pcre_uchar *name = ptr + 1;
6554 const pcre_uchar *arg = NULL;
6555 previous = NULL;
6556 ptr++;
6557 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
6558 namelen = (int)(ptr - name);
6559
6560 /* It appears that Perl allows any characters whatsoever, other than
6561 a closing parenthesis, to appear in arguments, so we no longer insist on
6562 letters, digits, and underscores. */
6563
6564 if (*ptr == CHAR_COLON)
6565 {
6566 arg = ++ptr;
6567 while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6568 arglen = (int)(ptr - arg);
6569 if ((unsigned int)arglen > MAX_MARK)
6570 {
6571 *errorcodeptr = ERR75;
6572 goto FAILED;
6573 }
6574 }
6575
6576 if (*ptr != CHAR_RIGHT_PARENTHESIS)
6577 {
6578 *errorcodeptr = ERR60;
6579 goto FAILED;
6580 }
6581
6582 /* Scan the table of verb names */
6583
6584 for (i = 0; i < verbcount; i++)
6585 {
6586 if (namelen == verbs[i].len &&
6587 STRNCMP_UC_C8(name, vn, namelen) == 0)
6588 {
6589 int setverb;
6590
6591 /* Check for open captures before ACCEPT and convert it to
6592 ASSERT_ACCEPT if in an assertion. */
6593
6594 if (verbs[i].op == OP_ACCEPT)
6595 {
6596 open_capitem *oc;
6597 if (arglen != 0)
6598 {
6599 *errorcodeptr = ERR59;
6600 goto FAILED;
6601 }
6602 cd->had_accept = TRUE;
6603 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6604 {
6605 *code++ = OP_CLOSE;
6606 PUT2INC(code, 0, oc->number);
6607 }
6608 setverb = *code++ =
6609 (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6610
6611 /* Do not set firstchar after *ACCEPT */
6612 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
6613 }
6614
6615 /* Handle other cases with/without an argument */
6616
6617 else if (arglen == 0)
6618 {
6619 if (verbs[i].op < 0) /* Argument is mandatory */
6620 {
6621 *errorcodeptr = ERR66;
6622 goto FAILED;
6623 }
6624 setverb = *code++ = verbs[i].op;
6625 }
6626
6627 else
6628 {
6629 if (verbs[i].op_arg < 0) /* Argument is forbidden */
6630 {
6631 *errorcodeptr = ERR59;
6632 goto FAILED;
6633 }
6634 setverb = *code++ = verbs[i].op_arg;
6635 if (lengthptr != NULL) /* In pass 1 just add in the length */
6636 { /* to avoid potential workspace */
6637 *lengthptr += arglen; /* overflow. */
6638 *code++ = 0;
6639 }
6640 else
6641 {
6642 *code++ = arglen;
6643 memcpy(code, arg, IN_UCHARS(arglen));
6644 code += arglen;
6645 }
6646 *code++ = 0;
6647 }
6648
6649 switch (setverb)
6650 {
6651 case OP_THEN:
6652 case OP_THEN_ARG:
6653 cd->external_flags |= PCRE_HASTHEN;
6654 break;
6655
6656 case OP_PRUNE:
6657 case OP_PRUNE_ARG:
6658 case OP_SKIP:
6659 case OP_SKIP_ARG:
6660 cd->had_pruneorskip = TRUE;
6661 break;
6662 }
6663
6664 break; /* Found verb, exit loop */
6665 }
6666
6667 vn += verbs[i].len + 1;
6668 }
6669
6670 if (i < verbcount) continue; /* Successfully handled a verb */
6671 *errorcodeptr = ERR60; /* Verb not recognized */
6672 goto FAILED;
6673 }
6674
6675 /* Initialize for "real" parentheses */
6676
6677 newoptions = options;
6678 skipbytes = 0;
6679 bravalue = OP_CBRA;
6680 item_hwm_offset = cd->hwm - cd->start_workspace;
6681 reset_bracount = FALSE;
6682
6683 /* Deal with the extended parentheses; all are introduced by '?', and the
6684 appearance of any of them means that this is not a capturing group. */
6685
6686 if (*ptr == CHAR_QUESTION_MARK)
6687 {
6688 int i, set, unset, namelen;
6689 int *optset;
6690 const pcre_uchar *name;
6691 pcre_uchar *slot;
6692
6693 switch (*(++ptr))
6694 {
6695 /* ------------------------------------------------------------ */
6696 case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */
6697 reset_bracount = TRUE;
6698 cd->dupgroups = TRUE; /* Record (?| encountered */
6699 /* Fall through */
6700
6701 /* ------------------------------------------------------------ */
6702 case CHAR_COLON: /* Non-capturing bracket */
6703 bravalue = OP_BRA;
6704 ptr++;
6705 break;
6706
6707
6708 /* ------------------------------------------------------------ */
6709 case CHAR_LEFT_PARENTHESIS:
6710 bravalue = OP_COND; /* Conditional group */
6711 tempptr = ptr;
6712
6713 /* A condition can be an assertion, a number (referring to a numbered
6714 group's having been set), a name (referring to a named group), or 'R',
6715 referring to recursion. R<digits> and R&name are also permitted for
6716 recursion tests.
6717
6718 There are ways of testing a named group: (?(name)) is used by Python;
6719 Perl 5.10 onwards uses (?(<name>) or (?('name')).
6720
6721 There is one unfortunate ambiguity, caused by history. 'R' can be the
6722 recursive thing or the name 'R' (and similarly for 'R' followed by
6723 digits). We look for a name first; if not found, we try the other case.
6724
6725 For compatibility with auto-callouts, we allow a callout to be
6726 specified before a condition that is an assertion. First, check for the
6727 syntax of a callout; if found, adjust the temporary pointer that is
6728 used to check for an assertion condition. That's all that is needed! */
6729
6730 if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C)
6731 {
6732 for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break;
6733 if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
6734 tempptr += i + 1;
6735
6736 /* tempptr should now be pointing to the opening parenthesis of the
6737 assertion condition. */
6738
6739 if (*tempptr != CHAR_LEFT_PARENTHESIS)
6740 {
6741 *errorcodeptr = ERR28;
6742 goto FAILED;
6743 }
6744 }
6745
6746 /* For conditions that are assertions, check the syntax, and then exit
6747 the switch. This will take control down to where bracketed groups,
6748 including assertions, are processed. */
6749
6750 if (tempptr[1] == CHAR_QUESTION_MARK &&
6751 (tempptr[2] == CHAR_EQUALS_SIGN ||
6752 tempptr[2] == CHAR_EXCLAMATION_MARK ||
6753 (tempptr[2] == CHAR_LESS_THAN_SIGN &&
6754 (tempptr[3] == CHAR_EQUALS_SIGN ||
6755 tempptr[3] == CHAR_EXCLAMATION_MARK))))
6756 {
6757 cd->iscondassert = TRUE;
6758 break;
6759 }
6760
6761 /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6762 need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6763
6764 code[1+LINK_SIZE] = OP_CREF;
6765 skipbytes = 1+IMM2_SIZE;
6766 refsign = -1; /* => not a number */
6767 namelen = -1; /* => not a name; must set to avoid warning */
6768 name = NULL; /* Always set to avoid warning */
6769 recno = 0; /* Always set to avoid warning */
6770
6771 /* Check for a test for recursion in a named group. */
6772
6773 ptr++;
6774 if (*ptr == CHAR_R && ptr[1] == CHAR_AMPERSAND)
6775 {
6776 terminator = -1;
6777 ptr += 2;
6778 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
6779 }
6780
6781 /* Check for a test for a named group's having been set, using the Perl
6782 syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6783 syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). */
6784
6785 else if (*ptr == CHAR_LESS_THAN_SIGN)
6786 {
6787 terminator = CHAR_GREATER_THAN_SIGN;
6788 ptr++;
6789 }
6790 else if (*ptr == CHAR_APOSTROPHE)
6791 {
6792 terminator = CHAR_APOSTROPHE;
6793 ptr++;
6794 }
6795 else
6796 {
6797 terminator = CHAR_NULL;
6798 if (*ptr == CHAR_MINUS || *ptr == CHAR_PLUS) refsign = *ptr++;
6799 else if (IS_DIGIT(*ptr)) refsign = 0;
6800 }
6801
6802 /* Handle a number */
6803
6804 if (refsign >= 0)
6805 {
6806 while (IS_DIGIT(*ptr))
6807 {
6808 if (recno > INT_MAX / 10 - 1) /* Integer overflow */
6809 {
6810 while (IS_DIGIT(*ptr)) ptr++;
6811 *errorcodeptr = ERR61;
6812 goto FAILED;
6813 }
6814 recno = recno * 10 + (int)(*ptr - CHAR_0);
6815 ptr++;
6816 }
6817 }
6818
6819 /* Otherwise we expect to read a name; anything else is an error. When
6820 a name is one of a number of duplicates, a different opcode is used and
6821 it needs more memory. Unfortunately we cannot tell whether a name is a
6822 duplicate in the first pass, so we have to allow for more memory. */
6823
6824 else
6825 {
6826 if (IS_DIGIT(*ptr))
6827 {
6828 *errorcodeptr = ERR84;
6829 goto FAILED;
6830 }
6831 if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_word) == 0)
6832 {
6833 *errorcodeptr = ERR28; /* Assertion expected */
6834 goto FAILED;
6835 }
6836 name = ptr++;
6837 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
6838 {
6839 ptr++;
6840 }
6841 namelen = (int)(ptr - name);
6842 if (lengthptr != NULL) skipbytes += IMM2_SIZE;
6843 }
6844
6845 /* Check the terminator */
6846
6847 if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
6848 *ptr++ != CHAR_RIGHT_PARENTHESIS)
6849 {
6850 ptr--; /* Error offset */
6851 *errorcodeptr = ERR26; /* Malformed number or name */
6852 goto FAILED;
6853 }
6854
6855 /* Do no further checking in the pre-compile phase. */
6856
6857 if (lengthptr != NULL) break;
6858
6859 /* In the real compile we do the work of looking for the actual
6860 reference. If refsign is not negative, it means we have a number in
6861 recno. */
6862
6863 if (refsign >= 0)
6864 {
6865 if (recno <= 0)
6866 {
6867 *errorcodeptr = ERR35;
6868 goto FAILED;
6869 }
6870 if (refsign != 0) recno = (refsign == CHAR_MINUS)?
6871 cd->bracount - recno + 1 : recno + cd->bracount;
6872 if (recno <= 0 || recno > cd->final_bracount)
6873 {
6874 *errorcodeptr = ERR15;
6875 goto FAILED;
6876 }
6877 PUT2(code, 2+LINK_SIZE, recno);
6878 if (recno > cd->top_backref) cd->top_backref = recno;
6879 break;
6880 }
6881
6882 /* Otherwise look for the name. */
6883
6884 slot = cd->name_table;
6885 for (i = 0; i < cd->names_found; i++)
6886 {
6887 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break;
6888 slot += cd->name_entry_size;
6889 }
6890
6891 /* Found the named subpattern. If the name is duplicated, add one to
6892 the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6893 appropriate data values. Otherwise, just insert the unique subpattern
6894 number. */
6895
6896 if (i < cd->names_found)
6897 {
6898 int offset = i++;
6899 int count = 1;
6900 recno = GET2(slot, 0); /* Number from first found */
6901 if (recno > cd->top_backref) cd->top_backref = recno;
6902 for (; i < cd->names_found; i++)
6903 {
6904 slot += cd->name_entry_size;
6905 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0 ||
6906 (slot+IMM2_SIZE)[namelen] != 0) break;
6907 count++;
6908 }
6909
6910 if (count > 1)
6911 {
6912 PUT2(code, 2+LINK_SIZE, offset);
6913 PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6914 skipbytes += IMM2_SIZE;
6915 code[1+LINK_SIZE]++;
6916 }
6917 else /* Not a duplicated name */
6918 {
6919 PUT2(code, 2+LINK_SIZE, recno);
6920 }
6921 }
6922
6923 /* If terminator == CHAR_NULL it means that the name followed directly
6924 after the opening parenthesis [e.g. (?(abc)...] and in this case there
6925 are some further alternatives to try. For the cases where terminator !=
6926 CHAR_NULL [things like (?(<name>... or (?('name')... or (?(R&name)... ]
6927 we have now checked all the possibilities, so give an error. */
6928
6929 else if (terminator != CHAR_NULL)
6930 {
6931 *errorcodeptr = ERR15;
6932 goto FAILED;
6933 }
6934
6935 /* Check for (?(R) for recursion. Allow digits after R to specify a
6936 specific group number. */
6937
6938 else if (*name == CHAR_R)
6939 {
6940 recno = 0;
6941 for (i = 1; i < namelen; i++)
6942 {
6943 if (!IS_DIGIT(name[i]))
6944 {
6945 *errorcodeptr = ERR15;
6946 goto FAILED;
6947 }
6948 if (recno > INT_MAX / 10 - 1) /* Integer overflow */
6949 {
6950 *errorcodeptr = ERR61;
6951 goto FAILED;
6952 }
6953 recno = recno * 10 + name[i] - CHAR_0;
6954 }
6955 if (recno == 0) recno = RREF_ANY;
6956 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
6957 PUT2(code, 2+LINK_SIZE, recno);
6958 }
6959
6960 /* Similarly, check for the (?(DEFINE) "condition", which is always
6961 false. */
6962
6963 else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
6964 {
6965 code[1+LINK_SIZE] = OP_DEF;
6966 skipbytes = 1;
6967 }
6968
6969 /* Reference to an unidentified subpattern. */
6970
6971 else
6972 {
6973 *errorcodeptr = ERR15;
6974 goto FAILED;
6975 }
6976 break;
6977
6978
6979 /* ------------------------------------------------------------ */
6980 case CHAR_EQUALS_SIGN: /* Positive lookahead */
6981 bravalue = OP_ASSERT;
6982 cd->assert_depth += 1;
6983 ptr++;
6984 break;
6985
6986 /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6987 thing to do, but Perl allows all assertions to be quantified, and when
6988 they contain capturing parentheses there may be a potential use for
6989 this feature. Not that that applies to a quantified (?!) but we allow
6990 it for uniformity. */
6991
6992 /* ------------------------------------------------------------ */
6993 case CHAR_EXCLAMATION_MARK: /* Negative lookahead */
6994 ptr++;
6995 if (*ptr == CHAR_RIGHT_PARENTHESIS && ptr[1] != CHAR_ASTERISK &&
6996 ptr[1] != CHAR_PLUS && ptr[1] != CHAR_QUESTION_MARK &&
6997 (ptr[1] != CHAR_LEFT_CURLY_BRACKET || !is_counted_repeat(ptr+2)))
6998 {
6999 *code++ = OP_FAIL;
7000 previous = NULL;
7001 continue;
7002 }
7003 bravalue = OP_ASSERT_NOT;
7004 cd->assert_depth += 1;
7005 break;
7006
7007
7008 /* ------------------------------------------------------------ */
7009 case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */
7010 switch (ptr[1])
7011 {
7012 case CHAR_EQUALS_SIGN: /* Positive lookbehind */
7013 bravalue = OP_ASSERTBACK;
7014 cd->assert_depth += 1;
7015 ptr += 2;
7016 break;
7017
7018 case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */
7019 bravalue = OP_ASSERTBACK_NOT;
7020 cd->assert_depth += 1;
7021 ptr += 2;
7022 break;
7023
7024 default: /* Could be name define, else bad */
7025 if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
7026 goto DEFINE_NAME;
7027 ptr++; /* Correct offset for error */
7028 *errorcodeptr = ERR24;
7029 goto FAILED;
7030 }
7031 break;
7032
7033
7034 /* ------------------------------------------------------------ */
7035 case CHAR_GREATER_THAN_SIGN: /* One-time brackets */
7036 bravalue = OP_ONCE;
7037 ptr++;
7038 break;
7039
7040
7041 /* ------------------------------------------------------------ */
7042 case CHAR_C: /* Callout - may be followed by digits; */
7043 previous_callout = code; /* Save for later completion */
7044 after_manual_callout = 1; /* Skip one item before completing */
7045 *code++ = OP_CALLOUT;
7046 {
7047 int n = 0;
7048 ptr++;
7049 while(IS_DIGIT(*ptr))
7050 n = n * 10 + *ptr++ - CHAR_0;
7051 if (*ptr != CHAR_RIGHT_PARENTHESIS)
7052 {
7053 *errorcodeptr = ERR39;
7054 goto FAILED;
7055 }
7056 if (n > 255)
7057 {
7058 *errorcodeptr = ERR38;
7059 goto FAILED;
7060 }
7061 *code++ = n;
7062 PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
7063 PUT(code, LINK_SIZE, 0); /* Default length */
7064 code += 2 * LINK_SIZE;
7065 }
7066 previous = NULL;
7067 continue;
7068
7069
7070 /* ------------------------------------------------------------ */
7071 case CHAR_P: /* Python-style named subpattern handling */
7072 if (*(++ptr) == CHAR_EQUALS_SIGN ||
7073 *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */
7074 {
7075 is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
7076 terminator = CHAR_RIGHT_PARENTHESIS;
7077 goto NAMED_REF_OR_RECURSE;
7078 }
7079 else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */
7080 {
7081 *errorcodeptr = ERR41;
7082 goto FAILED;
7083 }
7084 /* Fall through to handle (?P< as (?< is handled */
7085
7086
7087 /* ------------------------------------------------------------ */
7088 DEFINE_NAME: /* Come here from (?< handling */
7089 case CHAR_APOSTROPHE:
7090 terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
7091 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7092 name = ++ptr;
7093 if (IS_DIGIT(*ptr))
7094 {
7095 *errorcodeptr = ERR84; /* Group name must start with non-digit */
7096 goto FAILED;
7097 }
7098 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
7099 namelen = (int)(ptr - name);
7100
7101 /* In the pre-compile phase, do a syntax check, remember the longest
7102 name, and then remember the group in a vector, expanding it if
7103 necessary. Duplicates for the same number are skipped; other duplicates
7104 are checked for validity. In the actual compile, there is nothing to
7105 do. */
7106
7107 if (lengthptr != NULL)
7108 {
7109 named_group *ng;
7110 pcre_uint32 number = cd->bracount + 1;
7111
7112 if (*ptr != (pcre_uchar)terminator)
7113 {
7114 *errorcodeptr = ERR42;
7115 goto FAILED;
7116 }
7117
7118 if (cd->names_found >= MAX_NAME_COUNT)
7119 {
7120 *errorcodeptr = ERR49;
7121 goto FAILED;
7122 }
7123
7124 if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
7125 {
7126 cd->name_entry_size = namelen + IMM2_SIZE + 1;
7127 if (namelen > MAX_NAME_SIZE)
7128 {
7129 *errorcodeptr = ERR48;
7130 goto FAILED;
7131 }
7132 }
7133
7134 /* Scan the list to check for duplicates. For duplicate names, if the
7135 number is the same, break the loop, which causes the name to be
7136 discarded; otherwise, if DUPNAMES is not set, give an error.
7137 If it is set, allow the name with a different number, but continue
7138 scanning in case this is a duplicate with the same number. For
7139 non-duplicate names, give an error if the number is duplicated. */
7140
7141 ng = cd->named_groups;
7142 for (i = 0; i < cd->names_found; i++, ng++)
7143 {
7144 if (namelen == ng->length &&
7145 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
7146 {
7147 if (ng->number == number) break;
7148 if ((options & PCRE_DUPNAMES) == 0)
7149 {
7150 *errorcodeptr = ERR43;
7151 goto FAILED;
7152 }
7153 cd->dupnames = TRUE; /* Duplicate names exist */
7154 }
7155 else if (ng->number == number)
7156 {
7157 *errorcodeptr = ERR65;
7158 goto FAILED;
7159 }
7160 }
7161
7162 if (i >= cd->names_found) /* Not a duplicate with same number */
7163 {
7164 /* Increase the list size if necessary */
7165
7166 if (cd->names_found >= cd->named_group_list_size)
7167 {
7168 int newsize = cd->named_group_list_size * 2;
7169 named_group *newspace = (PUBL(malloc))
7170 (newsize * sizeof(named_group));
7171
7172 if (newspace == NULL)
7173 {
7174 *errorcodeptr = ERR21;
7175 goto FAILED;
7176 }
7177
7178 memcpy(newspace, cd->named_groups,
7179 cd->named_group_list_size * sizeof(named_group));
7180 if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
7181 (PUBL(free))((void *)cd->named_groups);
7182 cd->named_groups = newspace;
7183 cd->named_group_list_size = newsize;
7184 }
7185
7186 cd->named_groups[cd->names_found].name = name;
7187 cd->named_groups[cd->names_found].length = namelen;
7188 cd->named_groups[cd->names_found].number = number;
7189 cd->names_found++;
7190 }
7191 }
7192
7193 ptr++; /* Move past > or ' in both passes. */
7194 goto NUMBERED_GROUP;
7195
7196
7197 /* ------------------------------------------------------------ */
7198 case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */
7199 terminator = CHAR_RIGHT_PARENTHESIS;
7200 is_recurse = TRUE;
7201 /* Fall through */
7202
7203 /* We come here from the Python syntax above that handles both
7204 references (?P=name) and recursion (?P>name), as well as falling
7205 through from the Perl recursion syntax (?&name). We also come here from
7206 the Perl \k<name> or \k'name' back reference syntax and the \k{name}
7207 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
7208
7209 NAMED_REF_OR_RECURSE:
7210 name = ++ptr;
7211 if (IS_DIGIT(*ptr))
7212 {
7213 *errorcodeptr = ERR84; /* Group name must start with non-digit */
7214 goto FAILED;
7215 }
7216 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
7217 namelen = (int)(ptr - name);
7218
7219 /* In the pre-compile phase, do a syntax check. We used to just set
7220 a dummy reference number, because it was not used in the first pass.
7221 However, with the change of recursive back references to be atomic,
7222 we have to look for the number so that this state can be identified, as
7223 otherwise the incorrect length is computed. If it's not a backwards
7224 reference, the dummy number will do. */
7225
7226 if (lengthptr != NULL)
7227 {
7228 named_group *ng;
7229 recno = 0;
7230
7231 if (namelen == 0)
7232 {
7233 *errorcodeptr = ERR62;
7234 goto FAILED;
7235 }
7236 if (*ptr != (pcre_uchar)terminator)
7237 {
7238 *errorcodeptr = ERR42;
7239 goto FAILED;
7240 }
7241 if (namelen > MAX_NAME_SIZE)
7242 {
7243 *errorcodeptr = ERR48;
7244 goto FAILED;
7245 }
7246
7247 /* Count named back references. */
7248
7249 if (!is_recurse) cd->namedrefcount++;
7250
7251 /* We have to allow for a named reference to a duplicated name (this
7252 cannot be determined until the second pass). This needs an extra
7253 16-bit data item. */
7254
7255 *lengthptr += IMM2_SIZE;
7256
7257 /* If this is a forward reference and we are within a (?|...) group,
7258 the reference may end up as the number of a group which we are
7259 currently inside, that is, it could be a recursive reference. In the
7260 real compile this will be picked up and the reference wrapped with
7261 OP_ONCE to make it atomic, so we must allow space in case this occurs. */
7262
7263 /* In fact, this can happen for a non-forward reference because
7264 another group with the same number might be created later. This
7265 issue is fixed "properly" in PCRE2. As PCRE1 is now in maintenance
7266 only mode, we finesse the bug by allowing more memory always. */
7267
7268 *lengthptr += 2 + 2*LINK_SIZE;
7269
7270 /* It is even worse than that. The current reference may be to an
7271 existing named group with a different number (so apparently not
7272 recursive) but which later on is also attached to a group with the
7273 current number. This can only happen if (?| has been previously
7274 encountered. In that case, we allow yet more memory, just in case.
7275 (Again, this is fixed "properly" in PCRE2.) */
7276
7277 if (cd->dupgroups) *lengthptr += 4 + 4*LINK_SIZE;
7278
7279 /* Otherwise, check for recursion here. The name table does not exist
7280 in the first pass; instead we must scan the list of names encountered
7281 so far in order to get the number. If the name is not found, leave
7282 the value of recno as 0 for a forward reference. */
7283
7284 /* This patch (removing "else") fixes a problem when a reference is
7285 to multiple identically named nested groups from within the nest.
7286 Once again, it is not the "proper" fix, and it results in an
7287 over-allocation of memory. */
7288
7289 /* else */
7290 {
7291 ng = cd->named_groups;
7292 for (i = 0; i < cd->names_found; i++, ng++)
7293 {
7294 if (namelen == ng->length &&
7295 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
7296 {
7297 open_capitem *oc;
7298 recno = ng->number;
7299 if (is_recurse) break;
7300 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
7301 {
7302 if (oc->number == recno)
7303 {
7304 oc->flag = TRUE;
7305 break;
7306 }
7307 }
7308 }
7309 }
7310 }
7311 }
7312
7313 /* In the real compile, search the name table. We check the name
7314 first, and then check that we have reached the end of the name in the
7315 table. That way, if the name is longer than any in the table, the
7316 comparison will fail without reading beyond the table entry. */
7317
7318 else
7319 {
7320 slot = cd->name_table;
7321 for (i = 0; i < cd->names_found; i++)
7322 {
7323 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
7324 slot[IMM2_SIZE+namelen] == 0)
7325 break;
7326 slot += cd->name_entry_size;
7327 }
7328
7329 if (i < cd->names_found)
7330 {
7331 recno = GET2(slot, 0);
7332 }
7333 else
7334 {
7335 *errorcodeptr = ERR15;
7336 goto FAILED;
7337 }
7338 }
7339
7340 /* In both phases, for recursions, we can now go to the code that
7341 handles numerical recursion. */
7342
7343 if (is_recurse) goto HANDLE_RECURSION;
7344
7345 /* In the second pass we must see if the name is duplicated. If so, we
7346 generate a different opcode. */
7347
7348 if (lengthptr == NULL && cd->dupnames)
7349 {
7350 int count = 1;
7351 unsigned int index = i;
7352 pcre_uchar *cslot = slot + cd->name_entry_size;
7353
7354 for (i++; i < cd->names_found; i++)
7355 {
7356 if (STRCMP_UC_UC(slot + IMM2_SIZE, cslot + IMM2_SIZE) != 0) break;
7357 count++;
7358 cslot += cd->name_entry_size;
7359 }
7360
7361 if (count > 1)
7362 {
7363 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7364 previous = code;
7365 item_hwm_offset = cd->hwm - cd->start_workspace;
7366 *code++ = ((options & PCRE_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
7367 PUT2INC(code, 0, index);
7368 PUT2INC(code, 0, count);
7369
7370 /* Process each potentially referenced group. */
7371
7372 for (; slot < cslot; slot += cd->name_entry_size)
7373 {
7374 open_capitem *oc;
7375 recno = GET2(slot, 0);
7376 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
7377 if (recno > cd->top_backref) cd->top_backref = recno;
7378
7379 /* Check to see if this back reference is recursive, that is, it
7380 is inside the group that it references. A flag is set so that the
7381 group can be made atomic. */
7382
7383 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
7384 {
7385 if (oc->number == recno)
7386 {
7387 oc->flag = TRUE;
7388 break;
7389 }
7390 }
7391 }
7392
7393 continue; /* End of back ref handling */
7394 }
7395 }
7396
7397 /* First pass, or a non-duplicated name. */
7398
7399 goto HANDLE_REFERENCE;
7400
7401
7402 /* ------------------------------------------------------------ */
7403 case CHAR_R: /* Recursion, same as (?0) */
7404 recno = 0;
7405 if (*(++ptr) != CHAR_RIGHT_PARENTHESIS)
7406 {
7407 *errorcodeptr = ERR29;
7408 goto FAILED;
7409 }
7410 goto HANDLE_RECURSION;
7411
7412
7413 /* ------------------------------------------------------------ */
7414 case CHAR_MINUS: case CHAR_PLUS: /* Recursion or subroutine */
7415 case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
7416 case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
7417 {
7418 const pcre_uchar *called;
7419 terminator = CHAR_RIGHT_PARENTHESIS;
7420
7421 /* Come here from the \g<...> and \g'...' code (Oniguruma
7422 compatibility). However, the syntax has been checked to ensure that
7423 the ... are a (signed) number, so that neither ERR63 nor ERR29 will
7424 be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
7425 ever be taken. */
7426
7427 HANDLE_NUMERICAL_RECURSION:
7428
7429 if ((refsign = *ptr) == CHAR_PLUS)
7430 {
7431 ptr++;
7432 if (!IS_DIGIT(*ptr))
7433 {
7434 *errorcodeptr = ERR63;
7435 goto FAILED;
7436 }
7437 }
7438 else if (refsign == CHAR_MINUS)
7439 {
7440 if (!IS_DIGIT(ptr[1]))
7441 goto OTHER_CHAR_AFTER_QUERY;
7442 ptr++;
7443 }
7444
7445 recno = 0;
7446 while(IS_DIGIT(*ptr))
7447 {
7448 if (recno > INT_MAX / 10 - 1) /* Integer overflow */
7449 {
7450 while (IS_DIGIT(*ptr)) ptr++;
7451 *errorcodeptr = ERR61;
7452 goto FAILED;
7453 }
7454 recno = recno * 10 + *ptr++ - CHAR_0;
7455 }
7456
7457 if (*ptr != (pcre_uchar)terminator)
7458 {
7459 *errorcodeptr = ERR29;
7460 goto FAILED;
7461 }
7462
7463 if (refsign == CHAR_MINUS)
7464 {
7465 if (recno == 0)
7466 {
7467 *errorcodeptr = ERR58;
7468 goto FAILED;
7469 }
7470 recno = cd->bracount - recno + 1;
7471 if (recno <= 0)
7472 {
7473 *errorcodeptr = ERR15;
7474 goto FAILED;
7475 }
7476 }
7477 else if (refsign == CHAR_PLUS)
7478 {
7479 if (recno == 0)
7480 {
7481 *errorcodeptr = ERR58;
7482 goto FAILED;
7483 }
7484 recno += cd->bracount;
7485 }
7486
7487 /* Come here from code above that handles a named recursion */
7488
7489 HANDLE_RECURSION:
7490
7491 previous = code;
7492 item_hwm_offset = cd->hwm - cd->start_workspace;
7493 called = cd->start_code;
7494
7495 /* When we are actually compiling, find the bracket that is being
7496 referenced. Temporarily end the regex in case it doesn't exist before
7497 this point. If we end up with a forward reference, first check that
7498 the bracket does occur later so we can give the error (and position)
7499 now. Then remember this forward reference in the workspace so it can
7500 be filled in at the end. */
7501
7502 if (lengthptr == NULL)
7503 {
7504 *code = OP_END;
7505 if (recno != 0)
7506 called = PRIV(find_bracket)(cd->start_code, utf, recno);
7507
7508 /* Forward reference */
7509
7510 if (called == NULL)
7511 {
7512 if (recno > cd->final_bracount)
7513 {
7514 *errorcodeptr = ERR15;
7515 goto FAILED;
7516 }
7517
7518 /* Fudge the value of "called" so that when it is inserted as an
7519 offset below, what is actually inserted is the reference number
7520 of the group. Then remember the forward reference. */
7521
7522 called = cd->start_code + recno;
7523 if (cd->hwm >= cd->start_workspace + cd->workspace_size -
7524 WORK_SIZE_SAFETY_MARGIN)
7525 {
7526 *errorcodeptr = expand_workspace(cd);
7527 if (*errorcodeptr != 0) goto FAILED;
7528 }
7529 PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
7530 }
7531
7532 /* If not a forward reference, and the subpattern is still open,
7533 this is a recursive call. We check to see if this is a left
7534 recursion that could loop for ever, and diagnose that case. We
7535 must not, however, do this check if we are in a conditional
7536 subpattern because the condition might be testing for recursion in
7537 a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid.
7538 Forever loops are also detected at runtime, so those that occur in
7539 conditional subpatterns will be picked up then. */
7540
7541 else if (GET(called, 1) == 0 && cond_depth <= 0 &&
7542 could_be_empty(called, code, bcptr, utf, cd))
7543 {
7544 *errorcodeptr = ERR40;
7545 goto FAILED;
7546 }
7547 }
7548
7549 /* Insert the recursion/subroutine item. It does not have a set first
7550 character (relevant if it is repeated, because it will then be
7551 wrapped with ONCE brackets). */
7552
7553 *code = OP_RECURSE;
7554 PUT(code, 1, (int)(called - cd->start_code));
7555 code += 1 + LINK_SIZE;
7556 groupsetfirstchar = FALSE;
7557 }
7558
7559 /* Can't determine a first byte now */
7560
7561 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7562 continue;
7563
7564
7565 /* ------------------------------------------------------------ */
7566 default: /* Other characters: check option setting */
7567 OTHER_CHAR_AFTER_QUERY:
7568 set = unset = 0;
7569 optset = &set;
7570
7571 while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
7572 {
7573 switch (*ptr++)
7574 {
7575 case CHAR_MINUS: optset = &unset; break;
7576
7577 case CHAR_J: /* Record that it changed in the external options */
7578 *optset |= PCRE_DUPNAMES;
7579 cd->external_flags |= PCRE_JCHANGED;
7580 break;
7581
7582 case CHAR_i: *optset |= PCRE_CASELESS; break;
7583 case CHAR_m: *optset |= PCRE_MULTILINE; break;
7584 case CHAR_s: *optset |= PCRE_DOTALL; break;
7585 case CHAR_x: *optset |= PCRE_EXTENDED; break;
7586 case CHAR_U: *optset |= PCRE_UNGREEDY; break;
7587 case CHAR_X: *optset |= PCRE_EXTRA; break;
7588
7589 default: *errorcodeptr = ERR12;
7590 ptr--; /* Correct the offset */
7591 goto FAILED;
7592 }
7593 }
7594
7595 /* Set up the changed option bits, but don't change anything yet. */
7596
7597 newoptions = (options | set) & (~unset);
7598
7599 /* If the options ended with ')' this is not the start of a nested
7600 group with option changes, so the options change at this level. If this
7601 item is right at the start of the pattern, the options can be
7602 abstracted and made external in the pre-compile phase, and ignored in
7603 the compile phase. This can be helpful when matching -- for instance in
7604 caseless checking of required bytes.
7605
7606 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
7607 definitely *not* at the start of the pattern because something has been
7608 compiled. In the pre-compile phase, however, the code pointer can have
7609 that value after the start, because it gets reset as code is discarded
7610 during the pre-compile. However, this can happen only at top level - if
7611 we are within parentheses, the starting BRA will still be present. At
7612 any parenthesis level, the length value can be used to test if anything
7613 has been compiled at that level. Thus, a test for both these conditions
7614 is necessary to ensure we correctly detect the start of the pattern in
7615 both phases.
7616
7617 If we are not at the pattern start, reset the greedy defaults and the
7618 case value for firstchar and reqchar. */
7619
7620 if (*ptr == CHAR_RIGHT_PARENTHESIS)
7621 {
7622 if (code == cd->start_code + 1 + LINK_SIZE &&
7623 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
7624 {
7625 cd->external_options = newoptions;
7626 }
7627 else
7628 {
7629 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
7630 greedy_non_default = greedy_default ^ 1;
7631 req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
7632 }
7633
7634 /* Change options at this level, and pass them back for use
7635 in subsequent branches. */
7636
7637 *optionsptr = options = newoptions;
7638 previous = NULL; /* This item can't be repeated */
7639 continue; /* It is complete */
7640 }
7641
7642 /* If the options ended with ':' we are heading into a nested group
7643 with possible change of options. Such groups are non-capturing and are
7644 not assertions of any kind. All we need to do is skip over the ':';
7645 the newoptions value is handled below. */
7646
7647 bravalue = OP_BRA;
7648 ptr++;
7649 } /* End of switch for character following (? */
7650 } /* End of (? handling */
7651
7652 /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
7653 is set, all unadorned brackets become non-capturing and behave like (?:...)
7654 brackets. */
7655
7656 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
7657 {
7658 bravalue = OP_BRA;
7659 }
7660
7661 /* Else we have a capturing group. */
7662
7663 else
7664 {
7665 NUMBERED_GROUP:
7666 cd->bracount += 1;
7667 PUT2(code, 1+LINK_SIZE, cd->bracount);
7668 skipbytes = IMM2_SIZE;
7669 }
7670
7671 /* Process nested bracketed regex. First check for parentheses nested too
7672 deeply. */
7673
7674 if ((cd->parens_depth += 1) > PARENS_NEST_LIMIT)
7675 {
7676 *errorcodeptr = ERR82;
7677 goto FAILED;
7678 }
7679
7680 /* All assertions used not to be repeatable, but this was changed for Perl
7681 compatibility. All kinds can now be repeated except for assertions that are
7682 conditions (Perl also forbids these to be repeated). We copy code into a
7683 non-register variable (tempcode) in order to be able to pass its address
7684 because some compilers complain otherwise. At the start of a conditional
7685 group whose condition is an assertion, cd->iscondassert is set. We unset it
7686 here so as to allow assertions later in the group to be quantified. */
7687
7688 if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT &&
7689 cd->iscondassert)
7690 {
7691 previous = NULL;
7692 cd->iscondassert = FALSE;
7693 }
7694 else
7695 {
7696 previous = code;
7697 item_hwm_offset = cd->hwm - cd->start_workspace;
7698 }
7699
7700 *code = bravalue;
7701 tempcode = code;
7702 tempreqvary = cd->req_varyopt; /* Save value before bracket */
7703 tempbracount = cd->bracount; /* Save value before bracket */
7704 length_prevgroup = 0; /* Initialize for pre-compile phase */
7705
7706 if (!compile_regex(
7707 newoptions, /* The complete new option state */
7708 &tempcode, /* Where to put code (updated) */
7709 &ptr, /* Input pointer (updated) */
7710 errorcodeptr, /* Where to put an error message */
7711 (bravalue == OP_ASSERTBACK ||
7712 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
7713 reset_bracount, /* True if (?| group */
7714 skipbytes, /* Skip over bracket number */
7715 cond_depth +
7716 ((bravalue == OP_COND)?1:0), /* Depth of condition subpatterns */
7717 &subfirstchar, /* For possible first char */
7718 &subfirstcharflags,
7719 &subreqchar, /* For possible last char */
7720 &subreqcharflags,
7721 bcptr, /* Current branch chain */
7722 cd, /* Tables block */
7723 (lengthptr == NULL)? NULL : /* Actual compile phase */
7724 &length_prevgroup /* Pre-compile phase */
7725 ))
7726 goto FAILED;
7727
7728 cd->parens_depth -= 1;
7729
7730 /* If this was an atomic group and there are no capturing groups within it,
7731 generate OP_ONCE_NC instead of OP_ONCE. */
7732
7733 if (bravalue == OP_ONCE && cd->bracount <= tempbracount)
7734 *code = OP_ONCE_NC;
7735
7736 if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
7737 cd->assert_depth -= 1;
7738
7739 /* At the end of compiling, code is still pointing to the start of the
7740 group, while tempcode has been updated to point past the end of the group.
7741 The pattern pointer (ptr) is on the bracket.
7742
7743 If this is a conditional bracket, check that there are no more than
7744 two branches in the group, or just one if it's a DEFINE group. We do this
7745 in the real compile phase, not in the pre-pass, where the whole group may
7746 not be available. */
7747
7748 if (bravalue == OP_COND && lengthptr == NULL)
7749 {
7750 pcre_uchar *tc = code;
7751 int condcount = 0;
7752
7753 do {
7754 condcount++;
7755 tc += GET(tc,1);
7756 }
7757 while (*tc != OP_KET);
7758
7759 /* A DEFINE group is never obeyed inline (the "condition" is always
7760 false). It must have only one branch. */
7761
7762 if (code[LINK_SIZE+1] == OP_DEF)
7763 {
7764 if (condcount > 1)
7765 {
7766 *errorcodeptr = ERR54;
7767 goto FAILED;
7768 }
7769 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
7770 }
7771
7772 /* A "normal" conditional group. If there is just one branch, we must not
7773 make use of its firstchar or reqchar, because this is equivalent to an
7774 empty second branch. */
7775
7776 else
7777 {
7778 if (condcount > 2)
7779 {
7780 *errorcodeptr = ERR27;
7781 goto FAILED;
7782 }
7783 if (condcount == 1) subfirstcharflags = subreqcharflags = REQ_NONE;
7784 }
7785 }
7786
7787 /* Error if hit end of pattern */
7788
7789 if (*ptr != CHAR_RIGHT_PARENTHESIS)
7790 {
7791 *errorcodeptr = ERR14;
7792 goto FAILED;
7793 }
7794
7795 /* In the pre-compile phase, update the length by the length of the group,
7796 less the brackets at either end. Then reduce the compiled code to just a
7797 set of non-capturing brackets so that it doesn't use much memory if it is
7798 duplicated by a quantifier.*/
7799
7800 if (lengthptr != NULL)
7801 {
7802 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
7803 {
7804 *errorcodeptr = ERR20;
7805 goto FAILED;
7806 }
7807 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
7808 code++; /* This already contains bravalue */
7809 PUTINC(code, 0, 1 + LINK_SIZE);
7810 *code++ = OP_KET;
7811 PUTINC(code, 0, 1 + LINK_SIZE);
7812 break; /* No need to waste time with special character handling */
7813 }
7814
7815 /* Otherwise update the main code pointer to the end of the group. */
7816
7817 code = tempcode;
7818
7819 /* For a DEFINE group, required and first character settings are not
7820 relevant. */
7821
7822 if (bravalue == OP_DEF) break;
7823
7824 /* Handle updating of the required and first characters for other types of
7825 group. Update for normal brackets of all kinds, and conditions with two
7826 branches (see code above). If the bracket is followed by a quantifier with
7827 zero repeat, we have to back off. Hence the definition of zeroreqchar and
7828 zerofirstchar outside the main loop so that they can be accessed for the
7829 back off. */
7830
7831 zeroreqchar = reqchar;
7832 zeroreqcharflags = reqcharflags;
7833 zerofirstchar = firstchar;
7834 zerofirstcharflags = firstcharflags;
7835 groupsetfirstchar = FALSE;
7836
7837 if (bravalue >= OP_ONCE)
7838 {
7839 /* If we have not yet set a firstchar in this branch, take it from the
7840 subpattern, remembering that it was set here so that a repeat of more
7841 than one can replicate it as reqchar if necessary. If the subpattern has
7842 no firstchar, set "none" for the whole branch. In both cases, a zero
7843 repeat forces firstchar to "none". */
7844
7845 if (firstcharflags == REQ_UNSET)
7846 {
7847 if (subfirstcharflags >= 0)
7848 {
7849 firstchar = subfirstchar;
7850 firstcharflags = subfirstcharflags;
7851 groupsetfirstchar = TRUE;
7852 }
7853 else firstcharflags = REQ_NONE;
7854 zerofirstcharflags = REQ_NONE;
7855 }
7856
7857 /* If firstchar was previously set, convert the subpattern's firstchar
7858 into reqchar if there wasn't one, using the vary flag that was in
7859 existence beforehand. */
7860
7861 else if (subfirstcharflags >= 0 && subreqcharflags < 0)
7862 {
7863 subreqchar = subfirstchar;
7864 subreqcharflags = subfirstcharflags | tempreqvary;
7865 }
7866
7867 /* If the subpattern set a required byte (or set a first byte that isn't
7868 really the first byte - see above), set it. */
7869
7870 if (subreqcharflags >= 0)
7871 {
7872 reqchar = subreqchar;
7873 reqcharflags = subreqcharflags;
7874 }
7875 }
7876
7877 /* For a forward assertion, we take the reqchar, if set. This can be
7878 helpful if the pattern that follows the assertion doesn't set a different
7879 char. For example, it's useful for /(?=abcde).+/. We can't set firstchar
7880 for an assertion, however because it leads to incorrect effect for patterns
7881 such as /(?=a)a.+/ when the "real" "a" would then become a reqchar instead
7882 of a firstchar. This is overcome by a scan at the end if there's no
7883 firstchar, looking for an asserted first char. */
7884
7885 else if (bravalue == OP_ASSERT && subreqcharflags >= 0)
7886 {
7887 reqchar = subreqchar;
7888 reqcharflags = subreqcharflags;
7889 }
7890 break; /* End of processing '(' */
7891
7892
7893 /* ===================================================================*/
7894 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
7895 are arranged to be the negation of the corresponding OP_values in the
7896 default case when PCRE_UCP is not set. For the back references, the values
7897 are negative the reference number. Only back references and those types
7898 that consume a character may be repeated. We can test for values between
7899 ESC_b and ESC_Z for the latter; this may have to change if any new ones are
7900 ever created. */
7901
7902 case CHAR_BACKSLASH:
7903 tempptr = ptr;
7904 escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options, FALSE);
7905 if (*errorcodeptr != 0) goto FAILED;
7906
7907 if (escape == 0) /* The escape coded a single character */
7908 c = ec;
7909 else
7910 {
7911 if (escape == ESC_Q) /* Handle start of quoted string */
7912 {
7913 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
7914 ptr += 2; /* avoid empty string */
7915 else inescq = TRUE;
7916 continue;
7917 }
7918
7919 if (escape == ESC_E) continue; /* Perl ignores an orphan \E */
7920
7921 /* For metasequences that actually match a character, we disable the
7922 setting of a first character if it hasn't already been set. */
7923
7924 if (firstcharflags == REQ_UNSET && escape > ESC_b && escape < ESC_Z)
7925 firstcharflags = REQ_NONE;
7926
7927 /* Set values to reset to if this is followed by a zero repeat. */
7928
7929 zerofirstchar = firstchar;
7930 zerofirstcharflags = firstcharflags;
7931 zeroreqchar = reqchar;
7932 zeroreqcharflags = reqcharflags;
7933
7934 /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
7935 is a subroutine call by number (Oniguruma syntax). In fact, the value
7936 ESC_g is returned only for these cases. So we don't need to check for <
7937 or ' if the value is ESC_g. For the Perl syntax \g{n} the value is
7938 -n, and for the Perl syntax \g{name} the result is ESC_k (as
7939 that is a synonym for a named back reference). */
7940
7941 if (escape == ESC_g)
7942 {
7943 const pcre_uchar *p;
7944 pcre_uint32 cf;
7945
7946 item_hwm_offset = cd->hwm - cd->start_workspace; /* Normally this is set when '(' is read */
7947 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
7948 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7949
7950 /* These two statements stop the compiler from warning about possibly
7951 unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
7952 fact, because we do the check for a number below, the paths that
7953 would actually be in error are never taken. */
7954
7955 skipbytes = 0;
7956 reset_bracount = FALSE;
7957
7958 /* If it's not a signed or unsigned number, treat it as a name. */
7959
7960 cf = ptr[1];
7961 if (cf != CHAR_PLUS && cf != CHAR_MINUS && !IS_DIGIT(cf))
7962 {
7963 is_recurse = TRUE;
7964 goto NAMED_REF_OR_RECURSE;
7965 }
7966
7967 /* Signed or unsigned number (cf = ptr[1]) is known to be plus or minus
7968 or a digit. */
7969
7970 p = ptr + 2;
7971 while (IS_DIGIT(*p)) p++;
7972 if (*p != (pcre_uchar)terminator)
7973 {
7974 *errorcodeptr = ERR57;
7975 goto FAILED;
7976 }
7977 ptr++;
7978 goto HANDLE_NUMERICAL_RECURSION;
7979 }
7980
7981 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
7982 We also support \k{name} (.NET syntax). */
7983
7984 if (escape == ESC_k)
7985 {
7986 if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
7987 ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
7988 {
7989 *errorcodeptr = ERR69;
7990 goto FAILED;
7991 }
7992 is_recurse = FALSE;
7993 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
7994 CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
7995 CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
7996 goto NAMED_REF_OR_RECURSE;
7997 }
7998
7999 /* Back references are handled specially; must disable firstchar if
8000 not set to cope with cases like (?=(\w+))\1: which would otherwise set
8001 ':' later. */
8002
8003 if (escape < 0)
8004 {
8005 open_capitem *oc;
8006 recno = -escape;
8007
8008 /* Come here from named backref handling when the reference is to a
8009 single group (i.e. not to a duplicated name). */
8010
8011 HANDLE_REFERENCE:
8012 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
8013 previous = code;
8014 item_hwm_offset = cd->hwm - cd->start_workspace;
8015 *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
8016 PUT2INC(code, 0, recno);
8017 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
8018 if (recno > cd->top_backref) cd->top_backref = recno;
8019
8020 /* Check to see if this back reference is recursive, that is, it
8021 is inside the group that it references. A flag is set so that the
8022 group can be made atomic. */
8023
8024 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
8025 {
8026 if (oc->number == recno)
8027 {
8028 oc->flag = TRUE;
8029 break;
8030 }
8031 }
8032 }
8033
8034 /* So are Unicode property matches, if supported. */
8035
8036 #ifdef SUPPORT_UCP
8037 else if (escape == ESC_P || escape == ESC_p)
8038 {
8039 BOOL negated;
8040 unsigned int ptype = 0, pdata = 0;
8041 if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
8042 goto FAILED;
8043 previous = code;
8044 item_hwm_offset = cd->hwm - cd->start_workspace;
8045 *code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
8046 *code++ = ptype;
8047 *code++ = pdata;
8048 }
8049 #else
8050
8051 /* If Unicode properties are not supported, \X, \P, and \p are not
8052 allowed. */
8053
8054 else if (escape == ESC_X || escape == ESC_P || escape == ESC_p)
8055 {
8056 *errorcodeptr = ERR45;
8057 goto FAILED;
8058 }
8059 #endif
8060
8061 /* For the rest (including \X when Unicode properties are supported), we
8062 can obtain the OP value by negating the escape value in the default
8063 situation when PCRE_UCP is not set. When it *is* set, we substitute
8064 Unicode property tests. Note that \b and \B do a one-character
8065 lookbehind, and \A also behaves as if it does. */
8066
8067 else
8068 {
8069 if ((escape == ESC_b || escape == ESC_B || escape == ESC_A) &&
8070 cd->max_lookbehind == 0)
8071 cd->max_lookbehind = 1;
8072 #ifdef SUPPORT_UCP
8073 if (escape >= ESC_DU && escape <= ESC_wu)
8074 {
8075 nestptr = ptr + 1; /* Where to resume */
8076 ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */
8077 }
8078 else
8079 #endif
8080 /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
8081 so that it works in DFA mode and in lookbehinds. */
8082
8083 {
8084 previous = (escape > ESC_b && escape < ESC_Z)? code : NULL;
8085 item_hwm_offset = cd->hwm - cd->start_workspace;
8086 *code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape;
8087 }
8088 }
8089 continue;
8090 }
8091
8092 /* We have a data character whose value is in c. In UTF-8 mode it may have
8093 a value > 127. We set its representation in the length/buffer, and then
8094 handle it as a data character. */
8095
8096 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
8097 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
8098 mclength = PRIV(ord2utf)(c, mcbuffer);
8099 else
8100 #endif
8101
8102 {
8103 mcbuffer[0] = c;
8104 mclength = 1;
8105 }
8106 goto ONE_CHAR;
8107
8108
8109 /* ===================================================================*/
8110 /* Handle a literal character. It is guaranteed not to be whitespace or #
8111 when the extended flag is set. If we are in a UTF mode, it may be a
8112 multi-unit literal character. */
8113
8114 default:
8115 NORMAL_CHAR:
8116 mclength = 1;
8117 mcbuffer[0] = c;
8118
8119 #ifdef SUPPORT_UTF
8120 if (utf && HAS_EXTRALEN(c))
8121 ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
8122 #endif
8123
8124 /* At this point we have the character's bytes in mcbuffer, and the length
8125 in mclength. When not in UTF-8 mode, the length is always 1. */
8126
8127 ONE_CHAR:
8128 previous = code;
8129 item_hwm_offset = cd->hwm - cd->start_workspace;
8130
8131 /* For caseless UTF-8 mode when UCP support is available, check whether
8132 this character has more than one other case. If so, generate a special
8133 OP_PROP item instead of OP_CHARI. */
8134
8135 #ifdef SUPPORT_UCP
8136 if (utf && (options & PCRE_CASELESS) != 0)
8137 {
8138 GETCHAR(c, mcbuffer);
8139 if ((c = UCD_CASESET(c)) != 0)
8140 {
8141 *code++ = OP_PROP;
8142 *code++ = PT_CLIST;
8143 *code++ = c;
8144 if (firstcharflags == REQ_UNSET)
8145 firstcharflags = zerofirstcharflags = REQ_NONE;
8146 break;
8147 }
8148 }
8149 #endif
8150
8151 /* Caseful matches, or not one of the multicase characters. */
8152
8153 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARI : OP_CHAR;
8154 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
8155
8156 /* Remember if \r or \n were seen */
8157
8158 if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
8159 cd->external_flags |= PCRE_HASCRORLF;
8160
8161 /* Set the first and required bytes appropriately. If no previous first
8162 byte, set it from this character, but revert to none on a zero repeat.
8163 Otherwise, leave the firstchar value alone, and don't change it on a zero
8164 repeat. */
8165
8166 if (firstcharflags == REQ_UNSET)
8167 {
8168 zerofirstcharflags = REQ_NONE;
8169 zeroreqchar = reqchar;
8170 zeroreqcharflags = reqcharflags;
8171
8172 /* If the character is more than one byte long, we can set firstchar
8173 only if it is not to be matched caselessly. */
8174
8175 if (mclength == 1 || req_caseopt == 0)
8176 {
8177 firstchar = mcbuffer[0] | req_caseopt;
8178 firstchar = mcbuffer[0];
8179 firstcharflags = req_caseopt;
8180
8181 if (mclength != 1)
8182 {
8183 reqchar = code[-1];
8184 reqcharflags = cd->req_varyopt;
8185 }
8186 }
8187 else firstcharflags = reqcharflags = REQ_NONE;
8188 }
8189
8190 /* firstchar was previously set; we can set reqchar only if the length is
8191 1 or the matching is caseful. */
8192
8193 else
8194 {
8195 zerofirstchar = firstchar;
8196 zerofirstcharflags = firstcharflags;
8197 zeroreqchar = reqchar;
8198 zeroreqcharflags = reqcharflags;
8199 if (mclength == 1 || req_caseopt == 0)
8200 {
8201 reqchar = code[-1];
8202 reqcharflags = req_caseopt | cd->req_varyopt;
8203 }
8204 }
8205
8206 break; /* End of literal character handling */
8207 }
8208 } /* end of big loop */
8209
8210
8211 /* Control never reaches here by falling through, only by a goto for all the
8212 error states. Pass back the position in the pattern so that it can be displayed
8213 to the user for diagnosing the error. */
8214
8215 FAILED:
8216 *ptrptr = ptr;
8217 return FALSE;
8218 }
8219
8220
8221
8222 /*************************************************
8223 * Compile sequence of alternatives *
8224 *************************************************/
8225
8226 /* On entry, ptr is pointing past the bracket character, but on return it
8227 points to the closing bracket, or vertical bar, or end of string. The code
8228 variable is pointing at the byte into which the BRA operator has been stored.
8229 This function is used during the pre-compile phase when we are trying to find
8230 out the amount of memory needed, as well as during the real compile phase. The
8231 value of lengthptr distinguishes the two phases.
8232
8233 Arguments:
8234 options option bits, including any changes for this subpattern
8235 codeptr -> the address of the current code pointer
8236 ptrptr -> the address of the current pattern pointer
8237 errorcodeptr -> pointer to error code variable
8238 lookbehind TRUE if this is a lookbehind assertion
8239 reset_bracount TRUE to reset the count for each branch
8240 skipbytes skip this many bytes at start (for brackets and OP_COND)
8241 cond_depth depth of nesting for conditional subpatterns
8242 firstcharptr place to put the first required character
8243 firstcharflagsptr place to put the first character flags, or a negative number
8244 reqcharptr place to put the last required character
8245 reqcharflagsptr place to put the last required character flags, or a negative number
8246 bcptr pointer to the chain of currently open branches
8247 cd points to the data block with tables pointers etc.
8248 lengthptr NULL during the real compile phase
8249 points to length accumulator during pre-compile phase
8250
8251 Returns: TRUE on success
8252 */
8253
8254 static BOOL
compile_regex(int options,pcre_uchar ** codeptr,const pcre_uchar ** ptrptr,int * errorcodeptr,BOOL lookbehind,BOOL reset_bracount,int skipbytes,int cond_depth,pcre_uint32 * firstcharptr,pcre_int32 * firstcharflagsptr,pcre_uint32 * reqcharptr,pcre_int32 * reqcharflagsptr,branch_chain * bcptr,compile_data * cd,int * lengthptr)8255 compile_regex(int options, pcre_uchar **codeptr, const pcre_uchar **ptrptr,
8256 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
8257 int cond_depth,
8258 pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
8259 pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
8260 branch_chain *bcptr, compile_data *cd, int *lengthptr)
8261 {
8262 const pcre_uchar *ptr = *ptrptr;
8263 pcre_uchar *code = *codeptr;
8264 pcre_uchar *last_branch = code;
8265 pcre_uchar *start_bracket = code;
8266 pcre_uchar *reverse_count = NULL;
8267 open_capitem capitem;
8268 int capnumber = 0;
8269 pcre_uint32 firstchar, reqchar;
8270 pcre_int32 firstcharflags, reqcharflags;
8271 pcre_uint32 branchfirstchar, branchreqchar;
8272 pcre_int32 branchfirstcharflags, branchreqcharflags;
8273 int length;
8274 unsigned int orig_bracount;
8275 unsigned int max_bracount;
8276 branch_chain bc;
8277 size_t save_hwm_offset;
8278
8279 /* If set, call the external function that checks for stack availability. */
8280
8281 if (PUBL(stack_guard) != NULL && PUBL(stack_guard)())
8282 {
8283 *errorcodeptr= ERR85;
8284 return FALSE;
8285 }
8286
8287 /* Miscellaneous initialization */
8288
8289 bc.outer = bcptr;
8290 bc.current_branch = code;
8291
8292 firstchar = reqchar = 0;
8293 firstcharflags = reqcharflags = REQ_UNSET;
8294
8295 save_hwm_offset = cd->hwm - cd->start_workspace;
8296
8297 /* Accumulate the length for use in the pre-compile phase. Start with the
8298 length of the BRA and KET and any extra bytes that are required at the
8299 beginning. We accumulate in a local variable to save frequent testing of
8300 lenthptr for NULL. We cannot do this by looking at the value of code at the
8301 start and end of each alternative, because compiled items are discarded during
8302 the pre-compile phase so that the work space is not exceeded. */
8303
8304 length = 2 + 2*LINK_SIZE + skipbytes;
8305
8306 /* WARNING: If the above line is changed for any reason, you must also change
8307 the code that abstracts option settings at the start of the pattern and makes
8308 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
8309 pre-compile phase to find out whether anything has yet been compiled or not. */
8310
8311 /* If this is a capturing subpattern, add to the chain of open capturing items
8312 so that we can detect them if (*ACCEPT) is encountered. This is also used to
8313 detect groups that contain recursive back references to themselves. Note that
8314 only OP_CBRA need be tested here; changing this opcode to one of its variants,
8315 e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */
8316
8317 if (*code == OP_CBRA)
8318 {
8319 capnumber = GET2(code, 1 + LINK_SIZE);
8320 capitem.number = capnumber;
8321 capitem.next = cd->open_caps;
8322 capitem.flag = FALSE;
8323 cd->open_caps = &capitem;
8324 }
8325
8326 /* Offset is set zero to mark that this bracket is still open */
8327
8328 PUT(code, 1, 0);
8329 code += 1 + LINK_SIZE + skipbytes;
8330
8331 /* Loop for each alternative branch */
8332
8333 orig_bracount = max_bracount = cd->bracount;
8334 for (;;)
8335 {
8336 /* For a (?| group, reset the capturing bracket count so that each branch
8337 uses the same numbers. */
8338
8339 if (reset_bracount) cd->bracount = orig_bracount;
8340
8341 /* Set up dummy OP_REVERSE if lookbehind assertion */
8342
8343 if (lookbehind)
8344 {
8345 *code++ = OP_REVERSE;
8346 reverse_count = code;
8347 PUTINC(code, 0, 0);
8348 length += 1 + LINK_SIZE;
8349 }
8350
8351 /* Now compile the branch; in the pre-compile phase its length gets added
8352 into the length. */
8353
8354 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstchar,
8355 &branchfirstcharflags, &branchreqchar, &branchreqcharflags, &bc,
8356 cond_depth, cd, (lengthptr == NULL)? NULL : &length))
8357 {
8358 *ptrptr = ptr;
8359 return FALSE;
8360 }
8361
8362 /* Keep the highest bracket count in case (?| was used and some branch
8363 has fewer than the rest. */
8364
8365 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
8366
8367 /* In the real compile phase, there is some post-processing to be done. */
8368
8369 if (lengthptr == NULL)
8370 {
8371 /* If this is the first branch, the firstchar and reqchar values for the
8372 branch become the values for the regex. */
8373
8374 if (*last_branch != OP_ALT)
8375 {
8376 firstchar = branchfirstchar;
8377 firstcharflags = branchfirstcharflags;
8378 reqchar = branchreqchar;
8379 reqcharflags = branchreqcharflags;
8380 }
8381
8382 /* If this is not the first branch, the first char and reqchar have to
8383 match the values from all the previous branches, except that if the
8384 previous value for reqchar didn't have REQ_VARY set, it can still match,
8385 and we set REQ_VARY for the regex. */
8386
8387 else
8388 {
8389 /* If we previously had a firstchar, but it doesn't match the new branch,
8390 we have to abandon the firstchar for the regex, but if there was
8391 previously no reqchar, it takes on the value of the old firstchar. */
8392
8393 if (firstcharflags >= 0 &&
8394 (firstcharflags != branchfirstcharflags || firstchar != branchfirstchar))
8395 {
8396 if (reqcharflags < 0)
8397 {
8398 reqchar = firstchar;
8399 reqcharflags = firstcharflags;
8400 }
8401 firstcharflags = REQ_NONE;
8402 }
8403
8404 /* If we (now or from before) have no firstchar, a firstchar from the
8405 branch becomes a reqchar if there isn't a branch reqchar. */
8406
8407 if (firstcharflags < 0 && branchfirstcharflags >= 0 && branchreqcharflags < 0)
8408 {
8409 branchreqchar = branchfirstchar;
8410 branchreqcharflags = branchfirstcharflags;
8411 }
8412
8413 /* Now ensure that the reqchars match */
8414
8415 if (((reqcharflags & ~REQ_VARY) != (branchreqcharflags & ~REQ_VARY)) ||
8416 reqchar != branchreqchar)
8417 reqcharflags = REQ_NONE;
8418 else
8419 {
8420 reqchar = branchreqchar;
8421 reqcharflags |= branchreqcharflags; /* To "or" REQ_VARY */
8422 }
8423 }
8424
8425 /* If lookbehind, check that this branch matches a fixed-length string, and
8426 put the length into the OP_REVERSE item. Temporarily mark the end of the
8427 branch with OP_END. If the branch contains OP_RECURSE, the result is -3
8428 because there may be forward references that we can't check here. Set a
8429 flag to cause another lookbehind check at the end. Why not do it all at the
8430 end? Because common, erroneous checks are picked up here and the offset of
8431 the problem can be shown. */
8432
8433 if (lookbehind)
8434 {
8435 int fixed_length;
8436 *code = OP_END;
8437 fixed_length = find_fixedlength(last_branch, (options & PCRE_UTF8) != 0,
8438 FALSE, cd, NULL);
8439 DPRINTF(("fixed length = %d\n", fixed_length));
8440 if (fixed_length == -3)
8441 {
8442 cd->check_lookbehind = TRUE;
8443 }
8444 else if (fixed_length < 0)
8445 {
8446 *errorcodeptr = (fixed_length == -2)? ERR36 :
8447 (fixed_length == -4)? ERR70: ERR25;
8448 *ptrptr = ptr;
8449 return FALSE;
8450 }
8451 else
8452 {
8453 if (fixed_length > cd->max_lookbehind)
8454 cd->max_lookbehind = fixed_length;
8455 PUT(reverse_count, 0, fixed_length);
8456 }
8457 }
8458 }
8459
8460 /* Reached end of expression, either ')' or end of pattern. In the real
8461 compile phase, go back through the alternative branches and reverse the chain
8462 of offsets, with the field in the BRA item now becoming an offset to the
8463 first alternative. If there are no alternatives, it points to the end of the
8464 group. The length in the terminating ket is always the length of the whole
8465 bracketed item. Return leaving the pointer at the terminating char. */
8466
8467 if (*ptr != CHAR_VERTICAL_LINE)
8468 {
8469 if (lengthptr == NULL)
8470 {
8471 int branch_length = (int)(code - last_branch);
8472 do
8473 {
8474 int prev_length = GET(last_branch, 1);
8475 PUT(last_branch, 1, branch_length);
8476 branch_length = prev_length;
8477 last_branch -= branch_length;
8478 }
8479 while (branch_length > 0);
8480 }
8481
8482 /* Fill in the ket */
8483
8484 *code = OP_KET;
8485 PUT(code, 1, (int)(code - start_bracket));
8486 code += 1 + LINK_SIZE;
8487
8488 /* If it was a capturing subpattern, check to see if it contained any
8489 recursive back references. If so, we must wrap it in atomic brackets.
8490 Because we are moving code along, we must ensure that any pending recursive
8491 references are updated. In any event, remove the block from the chain. */
8492
8493 if (capnumber > 0)
8494 {
8495 if (cd->open_caps->flag)
8496 {
8497 *code = OP_END;
8498 adjust_recurse(start_bracket, 1 + LINK_SIZE,
8499 (options & PCRE_UTF8) != 0, cd, save_hwm_offset);
8500 memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
8501 IN_UCHARS(code - start_bracket));
8502 *start_bracket = OP_ONCE;
8503 code += 1 + LINK_SIZE;
8504 PUT(start_bracket, 1, (int)(code - start_bracket));
8505 *code = OP_KET;
8506 PUT(code, 1, (int)(code - start_bracket));
8507 code += 1 + LINK_SIZE;
8508 length += 2 + 2*LINK_SIZE;
8509 }
8510 cd->open_caps = cd->open_caps->next;
8511 }
8512
8513 /* Retain the highest bracket number, in case resetting was used. */
8514
8515 cd->bracount = max_bracount;
8516
8517 /* Set values to pass back */
8518
8519 *codeptr = code;
8520 *ptrptr = ptr;
8521 *firstcharptr = firstchar;
8522 *firstcharflagsptr = firstcharflags;
8523 *reqcharptr = reqchar;
8524 *reqcharflagsptr = reqcharflags;
8525 if (lengthptr != NULL)
8526 {
8527 if (OFLOW_MAX - *lengthptr < length)
8528 {
8529 *errorcodeptr = ERR20;
8530 return FALSE;
8531 }
8532 *lengthptr += length;
8533 }
8534 return TRUE;
8535 }
8536
8537 /* Another branch follows. In the pre-compile phase, we can move the code
8538 pointer back to where it was for the start of the first branch. (That is,
8539 pretend that each branch is the only one.)
8540
8541 In the real compile phase, insert an ALT node. Its length field points back
8542 to the previous branch while the bracket remains open. At the end the chain
8543 is reversed. It's done like this so that the start of the bracket has a
8544 zero offset until it is closed, making it possible to detect recursion. */
8545
8546 if (lengthptr != NULL)
8547 {
8548 code = *codeptr + 1 + LINK_SIZE + skipbytes;
8549 length += 1 + LINK_SIZE;
8550 }
8551 else
8552 {
8553 *code = OP_ALT;
8554 PUT(code, 1, (int)(code - last_branch));
8555 bc.current_branch = last_branch = code;
8556 code += 1 + LINK_SIZE;
8557 }
8558
8559 ptr++;
8560 }
8561 /* Control never reaches here */
8562 }
8563
8564
8565
8566
8567 /*************************************************
8568 * Check for anchored expression *
8569 *************************************************/
8570
8571 /* Try to find out if this is an anchored regular expression. Consider each
8572 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8573 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8574 it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8575 be found, because ^ generates OP_CIRCM in that mode.
8576
8577 We can also consider a regex to be anchored if OP_SOM starts all its branches.
8578 This is the code for \G, which means "match at start of match position, taking
8579 into account the match offset".
8580
8581 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8582 because that will try the rest of the pattern at all possible matching points,
8583 so there is no point trying again.... er ....
8584
8585 .... except when the .* appears inside capturing parentheses, and there is a
8586 subsequent back reference to those parentheses. We haven't enough information
8587 to catch that case precisely.
8588
8589 At first, the best we could do was to detect when .* was in capturing brackets
8590 and the highest back reference was greater than or equal to that level.
8591 However, by keeping a bitmap of the first 31 back references, we can catch some
8592 of the more common cases more precisely.
8593
8594 ... A second exception is when the .* appears inside an atomic group, because
8595 this prevents the number of characters it matches from being adjusted.
8596
8597 Arguments:
8598 code points to start of expression (the bracket)
8599 bracket_map a bitmap of which brackets we are inside while testing; this
8600 handles up to substring 31; after that we just have to take
8601 the less precise approach
8602 cd points to the compile data block
8603 atomcount atomic group level
8604
8605 Returns: TRUE or FALSE
8606 */
8607
8608 static BOOL
is_anchored(register const pcre_uchar * code,unsigned int bracket_map,compile_data * cd,int atomcount)8609 is_anchored(register const pcre_uchar *code, unsigned int bracket_map,
8610 compile_data *cd, int atomcount)
8611 {
8612 do {
8613 const pcre_uchar *scode = first_significant_code(
8614 code + PRIV(OP_lengths)[*code], FALSE);
8615 register int op = *scode;
8616
8617 /* Non-capturing brackets */
8618
8619 if (op == OP_BRA || op == OP_BRAPOS ||
8620 op == OP_SBRA || op == OP_SBRAPOS)
8621 {
8622 if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8623 }
8624
8625 /* Capturing brackets */
8626
8627 else if (op == OP_CBRA || op == OP_CBRAPOS ||
8628 op == OP_SCBRA || op == OP_SCBRAPOS)
8629 {
8630 int n = GET2(scode, 1+LINK_SIZE);
8631 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
8632 if (!is_anchored(scode, new_map, cd, atomcount)) return FALSE;
8633 }
8634
8635 /* Positive forward assertions and conditions */
8636
8637 else if (op == OP_ASSERT || op == OP_COND)
8638 {
8639 if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8640 }
8641
8642 /* Atomic groups */
8643
8644 else if (op == OP_ONCE || op == OP_ONCE_NC)
8645 {
8646 if (!is_anchored(scode, bracket_map, cd, atomcount + 1))
8647 return FALSE;
8648 }
8649
8650 /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8651 it isn't in brackets that are or may be referenced or inside an atomic
8652 group. */
8653
8654 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8655 op == OP_TYPEPOSSTAR))
8656 {
8657 if (scode[1] != OP_ALLANY || (bracket_map & cd->backref_map) != 0 ||
8658 atomcount > 0 || cd->had_pruneorskip)
8659 return FALSE;
8660 }
8661
8662 /* Check for explicit anchoring */
8663
8664 else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8665
8666 code += GET(code, 1);
8667 }
8668 while (*code == OP_ALT); /* Loop for each alternative */
8669 return TRUE;
8670 }
8671
8672
8673
8674 /*************************************************
8675 * Check for starting with ^ or .* *
8676 *************************************************/
8677
8678 /* This is called to find out if every branch starts with ^ or .* so that
8679 "first char" processing can be done to speed things up in multiline
8680 matching and for non-DOTALL patterns that start with .* (which must start at
8681 the beginning or after \n). As in the case of is_anchored() (see above), we
8682 have to take account of back references to capturing brackets that contain .*
8683 because in that case we can't make the assumption. Also, the appearance of .*
8684 inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not
8685 count, because once again the assumption no longer holds.
8686
8687 Arguments:
8688 code points to start of expression (the bracket)
8689 bracket_map a bitmap of which brackets we are inside while testing; this
8690 handles up to substring 31; after that we just have to take
8691 the less precise approach
8692 cd points to the compile data
8693 atomcount atomic group level
8694
8695 Returns: TRUE or FALSE
8696 */
8697
8698 static BOOL
is_startline(const pcre_uchar * code,unsigned int bracket_map,compile_data * cd,int atomcount)8699 is_startline(const pcre_uchar *code, unsigned int bracket_map,
8700 compile_data *cd, int atomcount)
8701 {
8702 do {
8703 const pcre_uchar *scode = first_significant_code(
8704 code + PRIV(OP_lengths)[*code], FALSE);
8705 register int op = *scode;
8706
8707 /* If we are at the start of a conditional assertion group, *both* the
8708 conditional assertion *and* what follows the condition must satisfy the test
8709 for start of line. Other kinds of condition fail. Note that there may be an
8710 auto-callout at the start of a condition. */
8711
8712 if (op == OP_COND)
8713 {
8714 scode += 1 + LINK_SIZE;
8715 if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8716 switch (*scode)
8717 {
8718 case OP_CREF:
8719 case OP_DNCREF:
8720 case OP_RREF:
8721 case OP_DNRREF:
8722 case OP_DEF:
8723 case OP_FAIL:
8724 return FALSE;
8725
8726 default: /* Assertion */
8727 if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
8728 do scode += GET(scode, 1); while (*scode == OP_ALT);
8729 scode += 1 + LINK_SIZE;
8730 break;
8731 }
8732 scode = first_significant_code(scode, FALSE);
8733 op = *scode;
8734 }
8735
8736 /* Non-capturing brackets */
8737
8738 if (op == OP_BRA || op == OP_BRAPOS ||
8739 op == OP_SBRA || op == OP_SBRAPOS)
8740 {
8741 if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
8742 }
8743
8744 /* Capturing brackets */
8745
8746 else if (op == OP_CBRA || op == OP_CBRAPOS ||
8747 op == OP_SCBRA || op == OP_SCBRAPOS)
8748 {
8749 int n = GET2(scode, 1+LINK_SIZE);
8750 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
8751 if (!is_startline(scode, new_map, cd, atomcount)) return FALSE;
8752 }
8753
8754 /* Positive forward assertions */
8755
8756 else if (op == OP_ASSERT)
8757 {
8758 if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
8759 }
8760
8761 /* Atomic brackets */
8762
8763 else if (op == OP_ONCE || op == OP_ONCE_NC)
8764 {
8765 if (!is_startline(scode, bracket_map, cd, atomcount + 1)) return FALSE;
8766 }
8767
8768 /* .* means "start at start or after \n" if it isn't in atomic brackets or
8769 brackets that may be referenced, as long as the pattern does not contain
8770 *PRUNE or *SKIP, because these break the feature. Consider, for example,
8771 /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the
8772 start of a line. */
8773
8774 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8775 {
8776 if (scode[1] != OP_ANY || (bracket_map & cd->backref_map) != 0 ||
8777 atomcount > 0 || cd->had_pruneorskip)
8778 return FALSE;
8779 }
8780
8781 /* Check for explicit circumflex; anything else gives a FALSE result. Note
8782 in particular that this includes atomic brackets OP_ONCE and OP_ONCE_NC
8783 because the number of characters matched by .* cannot be adjusted inside
8784 them. */
8785
8786 else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8787
8788 /* Move on to the next alternative */
8789
8790 code += GET(code, 1);
8791 }
8792 while (*code == OP_ALT); /* Loop for each alternative */
8793 return TRUE;
8794 }
8795
8796
8797
8798 /*************************************************
8799 * Check for asserted fixed first char *
8800 *************************************************/
8801
8802 /* During compilation, the "first char" settings from forward assertions are
8803 discarded, because they can cause conflicts with actual literals that follow.
8804 However, if we end up without a first char setting for an unanchored pattern,
8805 it is worth scanning the regex to see if there is an initial asserted first
8806 char. If all branches start with the same asserted char, or with a
8807 non-conditional bracket all of whose alternatives start with the same asserted
8808 char (recurse ad lib), then we return that char, with the flags set to zero or
8809 REQ_CASELESS; otherwise return zero with REQ_NONE in the flags.
8810
8811 Arguments:
8812 code points to start of expression (the bracket)
8813 flags points to the first char flags, or to REQ_NONE
8814 inassert TRUE if in an assertion
8815
8816 Returns: the fixed first char, or 0 with REQ_NONE in flags
8817 */
8818
8819 static pcre_uint32
find_firstassertedchar(const pcre_uchar * code,pcre_int32 * flags,BOOL inassert)8820 find_firstassertedchar(const pcre_uchar *code, pcre_int32 *flags,
8821 BOOL inassert)
8822 {
8823 register pcre_uint32 c = 0;
8824 int cflags = REQ_NONE;
8825
8826 *flags = REQ_NONE;
8827 do {
8828 pcre_uint32 d;
8829 int dflags;
8830 int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
8831 *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
8832 const pcre_uchar *scode = first_significant_code(code + 1+LINK_SIZE + xl,
8833 TRUE);
8834 register pcre_uchar op = *scode;
8835
8836 switch(op)
8837 {
8838 default:
8839 return 0;
8840
8841 case OP_BRA:
8842 case OP_BRAPOS:
8843 case OP_CBRA:
8844 case OP_SCBRA:
8845 case OP_CBRAPOS:
8846 case OP_SCBRAPOS:
8847 case OP_ASSERT:
8848 case OP_ONCE:
8849 case OP_ONCE_NC:
8850 d = find_firstassertedchar(scode, &dflags, op == OP_ASSERT);
8851 if (dflags < 0)
8852 return 0;
8853 if (cflags < 0) { c = d; cflags = dflags; } else if (c != d || cflags != dflags) return 0;
8854 break;
8855
8856 case OP_EXACT:
8857 scode += IMM2_SIZE;
8858 /* Fall through */
8859
8860 case OP_CHAR:
8861 case OP_PLUS:
8862 case OP_MINPLUS:
8863 case OP_POSPLUS:
8864 if (!inassert) return 0;
8865 if (cflags < 0) { c = scode[1]; cflags = 0; }
8866 else if (c != scode[1]) return 0;
8867 break;
8868
8869 case OP_EXACTI:
8870 scode += IMM2_SIZE;
8871 /* Fall through */
8872
8873 case OP_CHARI:
8874 case OP_PLUSI:
8875 case OP_MINPLUSI:
8876 case OP_POSPLUSI:
8877 if (!inassert) return 0;
8878 if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
8879 else if (c != scode[1]) return 0;
8880 break;
8881 }
8882
8883 code += GET(code, 1);
8884 }
8885 while (*code == OP_ALT);
8886
8887 *flags = cflags;
8888 return c;
8889 }
8890
8891
8892
8893 /*************************************************
8894 * Add an entry to the name/number table *
8895 *************************************************/
8896
8897 /* This function is called between compiling passes to add an entry to the
8898 name/number table, maintaining alphabetical order. Checking for permitted
8899 and forbidden duplicates has already been done.
8900
8901 Arguments:
8902 cd the compile data block
8903 name the name to add
8904 length the length of the name
8905 groupno the group number
8906
8907 Returns: nothing
8908 */
8909
8910 static void
add_name(compile_data * cd,const pcre_uchar * name,int length,unsigned int groupno)8911 add_name(compile_data *cd, const pcre_uchar *name, int length,
8912 unsigned int groupno)
8913 {
8914 int i;
8915 pcre_uchar *slot = cd->name_table;
8916
8917 for (i = 0; i < cd->names_found; i++)
8918 {
8919 int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(length));
8920 if (crc == 0 && slot[IMM2_SIZE+length] != 0)
8921 crc = -1; /* Current name is a substring */
8922
8923 /* Make space in the table and break the loop for an earlier name. For a
8924 duplicate or later name, carry on. We do this for duplicates so that in the
8925 simple case (when ?(| is not used) they are in order of their numbers. In all
8926 cases they are in the order in which they appear in the pattern. */
8927
8928 if (crc < 0)
8929 {
8930 memmove(slot + cd->name_entry_size, slot,
8931 IN_UCHARS((cd->names_found - i) * cd->name_entry_size));
8932 break;
8933 }
8934
8935 /* Continue the loop for a later or duplicate name */
8936
8937 slot += cd->name_entry_size;
8938 }
8939
8940 PUT2(slot, 0, groupno);
8941 memcpy(slot + IMM2_SIZE, name, IN_UCHARS(length));
8942 slot[IMM2_SIZE + length] = 0;
8943 cd->names_found++;
8944 }
8945
8946
8947
8948 /*************************************************
8949 * Compile a Regular Expression *
8950 *************************************************/
8951
8952 /* This function takes a string and returns a pointer to a block of store
8953 holding a compiled version of the expression. The original API for this
8954 function had no error code return variable; it is retained for backwards
8955 compatibility. The new function is given a new name.
8956
8957 Arguments:
8958 pattern the regular expression
8959 options various option bits
8960 errorcodeptr pointer to error code variable (pcre_compile2() only)
8961 can be NULL if you don't want a code value
8962 errorptr pointer to pointer to error text
8963 erroroffset ptr offset in pattern where error was detected
8964 tables pointer to character tables or NULL
8965
8966 Returns: pointer to compiled data block, or NULL on error,
8967 with errorptr and erroroffset set
8968 */
8969
8970 #if defined COMPILE_PCRE8
8971 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
pcre_compile(const char * pattern,int options,const char ** errorptr,int * erroroffset,const unsigned char * tables)8972 pcre_compile(const char *pattern, int options, const char **errorptr,
8973 int *erroroffset, const unsigned char *tables)
8974 #elif defined COMPILE_PCRE16
8975 PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
8976 pcre16_compile(PCRE_SPTR16 pattern, int options, const char **errorptr,
8977 int *erroroffset, const unsigned char *tables)
8978 #elif defined COMPILE_PCRE32
8979 PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
8980 pcre32_compile(PCRE_SPTR32 pattern, int options, const char **errorptr,
8981 int *erroroffset, const unsigned char *tables)
8982 #endif
8983 {
8984 #if defined COMPILE_PCRE8
8985 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
8986 #elif defined COMPILE_PCRE16
8987 return pcre16_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
8988 #elif defined COMPILE_PCRE32
8989 return pcre32_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
8990 #endif
8991 }
8992
8993
8994 #if defined COMPILE_PCRE8
8995 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
pcre_compile2(const char * pattern,int options,int * errorcodeptr,const char ** errorptr,int * erroroffset,const unsigned char * tables)8996 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
8997 const char **errorptr, int *erroroffset, const unsigned char *tables)
8998 #elif defined COMPILE_PCRE16
8999 PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
9000 pcre16_compile2(PCRE_SPTR16 pattern, int options, int *errorcodeptr,
9001 const char **errorptr, int *erroroffset, const unsigned char *tables)
9002 #elif defined COMPILE_PCRE32
9003 PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
9004 pcre32_compile2(PCRE_SPTR32 pattern, int options, int *errorcodeptr,
9005 const char **errorptr, int *erroroffset, const unsigned char *tables)
9006 #endif
9007 {
9008 REAL_PCRE *re;
9009 int length = 1; /* For final END opcode */
9010 pcre_int32 firstcharflags, reqcharflags;
9011 pcre_uint32 firstchar, reqchar;
9012 pcre_uint32 limit_match = PCRE_UINT32_MAX;
9013 pcre_uint32 limit_recursion = PCRE_UINT32_MAX;
9014 int newline;
9015 int errorcode = 0;
9016 int skipatstart = 0;
9017 BOOL utf;
9018 BOOL never_utf = FALSE;
9019 size_t size;
9020 pcre_uchar *code;
9021 const pcre_uchar *codestart;
9022 const pcre_uchar *ptr;
9023 compile_data compile_block;
9024 compile_data *cd = &compile_block;
9025
9026 /* This space is used for "compiling" into during the first phase, when we are
9027 computing the amount of memory that is needed. Compiled items are thrown away
9028 as soon as possible, so that a fairly large buffer should be sufficient for
9029 this purpose. The same space is used in the second phase for remembering where
9030 to fill in forward references to subpatterns. That may overflow, in which case
9031 new memory is obtained from malloc(). */
9032
9033 pcre_uchar cworkspace[COMPILE_WORK_SIZE];
9034
9035 /* This vector is used for remembering name groups during the pre-compile. In a
9036 similar way to cworkspace, it can be expanded using malloc() if necessary. */
9037
9038 named_group named_groups[NAMED_GROUP_LIST_SIZE];
9039
9040 /* Set this early so that early errors get offset 0. */
9041
9042 ptr = (const pcre_uchar *)pattern;
9043
9044 /* We can't pass back an error message if errorptr is NULL; I guess the best we
9045 can do is just return NULL, but we can set a code value if there is a code
9046 pointer. */
9047
9048 if (errorptr == NULL)
9049 {
9050 if (errorcodeptr != NULL) *errorcodeptr = 99;
9051 return NULL;
9052 }
9053
9054 *errorptr = NULL;
9055 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
9056
9057 /* However, we can give a message for this error */
9058
9059 if (erroroffset == NULL)
9060 {
9061 errorcode = ERR16;
9062 goto PCRE_EARLY_ERROR_RETURN2;
9063 }
9064
9065 *erroroffset = 0;
9066
9067 /* Set up pointers to the individual character tables */
9068
9069 if (tables == NULL) tables = PRIV(default_tables);
9070 cd->lcc = tables + lcc_offset;
9071 cd->fcc = tables + fcc_offset;
9072 cd->cbits = tables + cbits_offset;
9073 cd->ctypes = tables + ctypes_offset;
9074
9075 /* Check that all undefined public option bits are zero */
9076
9077 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
9078 {
9079 errorcode = ERR17;
9080 goto PCRE_EARLY_ERROR_RETURN;
9081 }
9082
9083 /* If PCRE_NEVER_UTF is set, remember it. */
9084
9085 if ((options & PCRE_NEVER_UTF) != 0) never_utf = TRUE;
9086
9087 /* Check for global one-time settings at the start of the pattern, and remember
9088 the offset for later. */
9089
9090 cd->external_flags = 0; /* Initialize here for LIMIT_MATCH/RECURSION */
9091
9092 while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
9093 ptr[skipatstart+1] == CHAR_ASTERISK)
9094 {
9095 int newnl = 0;
9096 int newbsr = 0;
9097
9098 /* For completeness and backward compatibility, (*UTFn) is supported in the
9099 relevant libraries, but (*UTF) is generic and always supported. Note that
9100 PCRE_UTF8 == PCRE_UTF16 == PCRE_UTF32. */
9101
9102 #ifdef COMPILE_PCRE8
9103 if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF8_RIGHTPAR, 5) == 0)
9104 { skipatstart += 7; options |= PCRE_UTF8; continue; }
9105 #endif
9106 #ifdef COMPILE_PCRE16
9107 if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF16_RIGHTPAR, 6) == 0)
9108 { skipatstart += 8; options |= PCRE_UTF16; continue; }
9109 #endif
9110 #ifdef COMPILE_PCRE32
9111 if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF32_RIGHTPAR, 6) == 0)
9112 { skipatstart += 8; options |= PCRE_UTF32; continue; }
9113 #endif
9114
9115 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 4) == 0)
9116 { skipatstart += 6; options |= PCRE_UTF8; continue; }
9117 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)
9118 { skipatstart += 6; options |= PCRE_UCP; continue; }
9119 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_AUTO_POSSESS_RIGHTPAR, 16) == 0)
9120 { skipatstart += 18; options |= PCRE_NO_AUTO_POSSESS; continue; }
9121 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
9122 { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
9123
9124 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_MATCH_EQ, 12) == 0)
9125 {
9126 pcre_uint32 c = 0;
9127 int p = skipatstart + 14;
9128 while (isdigit(ptr[p]))
9129 {
9130 if (c > PCRE_UINT32_MAX / 10 - 1) break; /* Integer overflow */
9131 c = c*10 + ptr[p++] - CHAR_0;
9132 }
9133 if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
9134 if (c < limit_match)
9135 {
9136 limit_match = c;
9137 cd->external_flags |= PCRE_MLSET;
9138 }
9139 skipatstart = p;
9140 continue;
9141 }
9142
9143 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_RECURSION_EQ, 16) == 0)
9144 {
9145 pcre_uint32 c = 0;
9146 int p = skipatstart + 18;
9147 while (isdigit(ptr[p]))
9148 {
9149 if (c > PCRE_UINT32_MAX / 10 - 1) break; /* Integer overflow check */
9150 c = c*10 + ptr[p++] - CHAR_0;
9151 }
9152 if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
9153 if (c < limit_recursion)
9154 {
9155 limit_recursion = c;
9156 cd->external_flags |= PCRE_RLSET;
9157 }
9158 skipatstart = p;
9159 continue;
9160 }
9161
9162 if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CR_RIGHTPAR, 3) == 0)
9163 { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
9164 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LF_RIGHTPAR, 3) == 0)
9165 { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
9166 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CRLF_RIGHTPAR, 5) == 0)
9167 { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
9168 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANY_RIGHTPAR, 4) == 0)
9169 { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
9170 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANYCRLF_RIGHTPAR, 8) == 0)
9171 { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
9172
9173 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
9174 { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
9175 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
9176 { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
9177
9178 if (newnl != 0)
9179 options = (options & ~PCRE_NEWLINE_BITS) | newnl;
9180 else if (newbsr != 0)
9181 options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
9182 else break;
9183 }
9184
9185 /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
9186 utf = (options & PCRE_UTF8) != 0;
9187 if (utf && never_utf)
9188 {
9189 errorcode = ERR78;
9190 goto PCRE_EARLY_ERROR_RETURN2;
9191 }
9192
9193 /* Can't support UTF unless PCRE has been compiled to include the code. The
9194 return of an error code from PRIV(valid_utf)() is a new feature, introduced in
9195 release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
9196 not used here. */
9197
9198 #ifdef SUPPORT_UTF
9199 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&
9200 (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
9201 {
9202 #if defined COMPILE_PCRE8
9203 errorcode = ERR44;
9204 #elif defined COMPILE_PCRE16
9205 errorcode = ERR74;
9206 #elif defined COMPILE_PCRE32
9207 errorcode = ERR77;
9208 #endif
9209 goto PCRE_EARLY_ERROR_RETURN2;
9210 }
9211 #else
9212 if (utf)
9213 {
9214 errorcode = ERR32;
9215 goto PCRE_EARLY_ERROR_RETURN;
9216 }
9217 #endif
9218
9219 /* Can't support UCP unless PCRE has been compiled to include the code. */
9220
9221 #ifndef SUPPORT_UCP
9222 if ((options & PCRE_UCP) != 0)
9223 {
9224 errorcode = ERR67;
9225 goto PCRE_EARLY_ERROR_RETURN;
9226 }
9227 #endif
9228
9229 /* Check validity of \R options. */
9230
9231 if ((options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) ==
9232 (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
9233 {
9234 errorcode = ERR56;
9235 goto PCRE_EARLY_ERROR_RETURN;
9236 }
9237
9238 /* Handle different types of newline. The three bits give seven cases. The
9239 current code allows for fixed one- or two-byte sequences, plus "any" and
9240 "anycrlf". */
9241
9242 switch (options & PCRE_NEWLINE_BITS)
9243 {
9244 case 0: newline = NEWLINE; break; /* Build-time default */
9245 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
9246 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
9247 case PCRE_NEWLINE_CR+
9248 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
9249 case PCRE_NEWLINE_ANY: newline = -1; break;
9250 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
9251 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
9252 }
9253
9254 if (newline == -2)
9255 {
9256 cd->nltype = NLTYPE_ANYCRLF;
9257 }
9258 else if (newline < 0)
9259 {
9260 cd->nltype = NLTYPE_ANY;
9261 }
9262 else
9263 {
9264 cd->nltype = NLTYPE_FIXED;
9265 if (newline > 255)
9266 {
9267 cd->nllen = 2;
9268 cd->nl[0] = (newline >> 8) & 255;
9269 cd->nl[1] = newline & 255;
9270 }
9271 else
9272 {
9273 cd->nllen = 1;
9274 cd->nl[0] = newline;
9275 }
9276 }
9277
9278 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
9279 references to help in deciding whether (.*) can be treated as anchored or not.
9280 */
9281
9282 cd->top_backref = 0;
9283 cd->backref_map = 0;
9284
9285 /* Reflect pattern for debugging output */
9286
9287 DPRINTF(("------------------------------------------------------------------\n"));
9288 #ifdef PCRE_DEBUG
9289 print_puchar(stdout, (PCRE_PUCHAR)pattern);
9290 #endif
9291 DPRINTF(("\n"));
9292
9293 /* Pretend to compile the pattern while actually just accumulating the length
9294 of memory required. This behaviour is triggered by passing a non-NULL final
9295 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
9296 to compile parts of the pattern into; the compiled code is discarded when it is
9297 no longer needed, so hopefully this workspace will never overflow, though there
9298 is a test for its doing so. */
9299
9300 cd->bracount = cd->final_bracount = 0;
9301 cd->names_found = 0;
9302 cd->name_entry_size = 0;
9303 cd->name_table = NULL;
9304 cd->dupnames = FALSE;
9305 cd->dupgroups = FALSE;
9306 cd->namedrefcount = 0;
9307 cd->start_code = cworkspace;
9308 cd->hwm = cworkspace;
9309 cd->iscondassert = FALSE;
9310 cd->start_workspace = cworkspace;
9311 cd->workspace_size = COMPILE_WORK_SIZE;
9312 cd->named_groups = named_groups;
9313 cd->named_group_list_size = NAMED_GROUP_LIST_SIZE;
9314 cd->start_pattern = (const pcre_uchar *)pattern;
9315 cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
9316 cd->req_varyopt = 0;
9317 cd->parens_depth = 0;
9318 cd->assert_depth = 0;
9319 cd->max_lookbehind = 0;
9320 cd->external_options = options;
9321 cd->open_caps = NULL;
9322
9323 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
9324 don't need to look at the result of the function here. The initial options have
9325 been put into the cd block so that they can be changed if an option setting is
9326 found within the regex right at the beginning. Bringing initial option settings
9327 outside can help speed up starting point checks. */
9328
9329 ptr += skipatstart;
9330 code = cworkspace;
9331 *code = OP_BRA;
9332
9333 (void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
9334 FALSE, 0, 0, &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL,
9335 cd, &length);
9336 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
9337
9338 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
9339 (int)(cd->hwm - cworkspace)));
9340
9341 if (length > MAX_PATTERN_SIZE)
9342 {
9343 errorcode = ERR20;
9344 goto PCRE_EARLY_ERROR_RETURN;
9345 }
9346
9347 /* Compute the size of the data block for storing the compiled pattern. Integer
9348 overflow should no longer be possible because nowadays we limit the maximum
9349 value of cd->names_found and cd->name_entry_size. */
9350
9351 size = sizeof(REAL_PCRE) +
9352 (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
9353
9354 /* Get the memory. */
9355
9356 re = (REAL_PCRE *)(PUBL(malloc))(size);
9357 if (re == NULL)
9358 {
9359 errorcode = ERR21;
9360 goto PCRE_EARLY_ERROR_RETURN;
9361 }
9362
9363 /* Put in the magic number, and save the sizes, initial options, internal
9364 flags, and character table pointer. NULL is used for the default character
9365 tables. The nullpad field is at the end; it's there to help in the case when a
9366 regex compiled on a system with 4-byte pointers is run on another with 8-byte
9367 pointers. */
9368
9369 re->magic_number = MAGIC_NUMBER;
9370 re->size = (int)size;
9371 re->options = cd->external_options;
9372 re->flags = cd->external_flags;
9373 re->limit_match = limit_match;
9374 re->limit_recursion = limit_recursion;
9375 re->first_char = 0;
9376 re->req_char = 0;
9377 re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar);
9378 re->name_entry_size = cd->name_entry_size;
9379 re->name_count = cd->names_found;
9380 re->ref_count = 0;
9381 re->tables = (tables == PRIV(default_tables))? NULL : tables;
9382 re->nullpad = NULL;
9383 #ifdef COMPILE_PCRE32
9384 re->dummy = 0;
9385 #else
9386 re->dummy1 = re->dummy2 = re->dummy3 = 0;
9387 #endif
9388
9389 /* The starting points of the name/number translation table and of the code are
9390 passed around in the compile data block. The start/end pattern and initial
9391 options are already set from the pre-compile phase, as is the name_entry_size
9392 field. Reset the bracket count and the names_found field. Also reset the hwm
9393 field; this time it's used for remembering forward references to subpatterns.
9394 */
9395
9396 cd->final_bracount = cd->bracount; /* Save for checking forward references */
9397 cd->parens_depth = 0;
9398 cd->assert_depth = 0;
9399 cd->bracount = 0;
9400 cd->max_lookbehind = 0;
9401 cd->name_table = (pcre_uchar *)re + re->name_table_offset;
9402 codestart = cd->name_table + re->name_entry_size * re->name_count;
9403 cd->start_code = codestart;
9404 cd->hwm = (pcre_uchar *)(cd->start_workspace);
9405 cd->iscondassert = FALSE;
9406 cd->req_varyopt = 0;
9407 cd->had_accept = FALSE;
9408 cd->had_pruneorskip = FALSE;
9409 cd->check_lookbehind = FALSE;
9410 cd->open_caps = NULL;
9411
9412 /* If any named groups were found, create the name/number table from the list
9413 created in the first pass. */
9414
9415 if (cd->names_found > 0)
9416 {
9417 int i = cd->names_found;
9418 named_group *ng = cd->named_groups;
9419 cd->names_found = 0;
9420 for (; i > 0; i--, ng++)
9421 add_name(cd, ng->name, ng->length, ng->number);
9422 if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
9423 (PUBL(free))((void *)cd->named_groups);
9424 }
9425
9426 /* Set up a starting, non-extracting bracket, then compile the expression. On
9427 error, errorcode will be set non-zero, so we don't need to look at the result
9428 of the function here. */
9429
9430 ptr = (const pcre_uchar *)pattern + skipatstart;
9431 code = (pcre_uchar *)codestart;
9432 *code = OP_BRA;
9433 (void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, 0, 0,
9434 &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL, cd, NULL);
9435 re->top_bracket = cd->bracount;
9436 re->top_backref = cd->top_backref;
9437 re->max_lookbehind = cd->max_lookbehind;
9438 re->flags = cd->external_flags | PCRE_MODE;
9439
9440 if (cd->had_accept)
9441 {
9442 reqchar = 0; /* Must disable after (*ACCEPT) */
9443 reqcharflags = REQ_NONE;
9444 }
9445
9446 /* If not reached end of pattern on success, there's an excess bracket. */
9447
9448 if (errorcode == 0 && *ptr != CHAR_NULL) errorcode = ERR22;
9449
9450 /* Fill in the terminating state and check for disastrous overflow, but
9451 if debugging, leave the test till after things are printed out. */
9452
9453 *code++ = OP_END;
9454
9455 #ifndef PCRE_DEBUG
9456 if (code - codestart > length) errorcode = ERR23;
9457 #endif
9458
9459 #ifdef SUPPORT_VALGRIND
9460 /* If the estimated length exceeds the really used length, mark the extra
9461 allocated memory as unaddressable, so that any out-of-bound reads can be
9462 detected. */
9463 VALGRIND_MAKE_MEM_NOACCESS(code, (length - (code - codestart)) * sizeof(pcre_uchar));
9464 #endif
9465
9466 /* Fill in any forward references that are required. There may be repeated
9467 references; optimize for them, as searching a large regex takes time. */
9468
9469 if (cd->hwm > cd->start_workspace)
9470 {
9471 int prev_recno = -1;
9472 const pcre_uchar *groupptr = NULL;
9473 while (errorcode == 0 && cd->hwm > cd->start_workspace)
9474 {
9475 int offset, recno;
9476 cd->hwm -= LINK_SIZE;
9477 offset = GET(cd->hwm, 0);
9478
9479 /* Check that the hwm handling hasn't gone wrong. This whole area is
9480 rewritten in PCRE2 because there are some obscure cases. */
9481
9482 if (offset == 0 || codestart[offset-1] != OP_RECURSE)
9483 {
9484 errorcode = ERR10;
9485 break;
9486 }
9487
9488 recno = GET(codestart, offset);
9489 if (recno != prev_recno)
9490 {
9491 groupptr = PRIV(find_bracket)(codestart, utf, recno);
9492 prev_recno = recno;
9493 }
9494 if (groupptr == NULL) errorcode = ERR53;
9495 else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));
9496 }
9497 }
9498
9499 /* If the workspace had to be expanded, free the new memory. Set the pointer to
9500 NULL to indicate that forward references have been filled in. */
9501
9502 if (cd->workspace_size > COMPILE_WORK_SIZE)
9503 (PUBL(free))((void *)cd->start_workspace);
9504 cd->start_workspace = NULL;
9505
9506 /* Give an error if there's back reference to a non-existent capturing
9507 subpattern. */
9508
9509 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
9510
9511 /* Unless disabled, check whether any single character iterators can be
9512 auto-possessified. The function overwrites the appropriate opcode values, so
9513 the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
9514 used in this code because at least one compiler gives a warning about loss of
9515 "const" attribute if the cast (pcre_uchar *)codestart is used directly in the
9516 function call. */
9517
9518 if (errorcode == 0 && (options & PCRE_NO_AUTO_POSSESS) == 0)
9519 {
9520 pcre_uchar *temp = (pcre_uchar *)codestart;
9521 auto_possessify(temp, utf, cd);
9522 }
9523
9524 /* If there were any lookbehind assertions that contained OP_RECURSE
9525 (recursions or subroutine calls), a flag is set for them to be checked here,
9526 because they may contain forward references. Actual recursions cannot be fixed
9527 length, but subroutine calls can. It is done like this so that those without
9528 OP_RECURSE that are not fixed length get a diagnosic with a useful offset. The
9529 exceptional ones forgo this. We scan the pattern to check that they are fixed
9530 length, and set their lengths. */
9531
9532 if (errorcode == 0 && cd->check_lookbehind)
9533 {
9534 pcre_uchar *cc = (pcre_uchar *)codestart;
9535
9536 /* Loop, searching for OP_REVERSE items, and process those that do not have
9537 their length set. (Actually, it will also re-process any that have a length
9538 of zero, but that is a pathological case, and it does no harm.) When we find
9539 one, we temporarily terminate the branch it is in while we scan it. */
9540
9541 for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf, -1);
9542 cc != NULL;
9543 cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf, -1))
9544 {
9545 if (GET(cc, 1) == 0)
9546 {
9547 int fixed_length;
9548 pcre_uchar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
9549 int end_op = *be;
9550 *be = OP_END;
9551 fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != 0, TRUE,
9552 cd, NULL);
9553 *be = end_op;
9554 DPRINTF(("fixed length = %d\n", fixed_length));
9555 if (fixed_length < 0)
9556 {
9557 errorcode = (fixed_length == -2)? ERR36 :
9558 (fixed_length == -4)? ERR70 : ERR25;
9559 break;
9560 }
9561 if (fixed_length > cd->max_lookbehind) cd->max_lookbehind = fixed_length;
9562 PUT(cc, 1, fixed_length);
9563 }
9564 cc += 1 + LINK_SIZE;
9565 }
9566 }
9567
9568 /* Failed to compile, or error while post-processing */
9569
9570 if (errorcode != 0)
9571 {
9572 (PUBL(free))(re);
9573 PCRE_EARLY_ERROR_RETURN:
9574 *erroroffset = (int)(ptr - (const pcre_uchar *)pattern);
9575 PCRE_EARLY_ERROR_RETURN2:
9576 *errorptr = find_error_text(errorcode);
9577 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
9578 return NULL;
9579 }
9580
9581 /* If the anchored option was not passed, set the flag if we can determine that
9582 the pattern is anchored by virtue of ^ characters or \A or anything else, such
9583 as starting with non-atomic .* when DOTALL is set and there are no occurrences
9584 of *PRUNE or *SKIP.
9585
9586 Otherwise, if we know what the first byte has to be, save it, because that
9587 speeds up unanchored matches no end. If not, see if we can set the
9588 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
9589 start with ^. and also when all branches start with non-atomic .* for
9590 non-DOTALL matches when *PRUNE and SKIP are not present. */
9591
9592 if ((re->options & PCRE_ANCHORED) == 0)
9593 {
9594 if (is_anchored(codestart, 0, cd, 0)) re->options |= PCRE_ANCHORED;
9595 else
9596 {
9597 if (firstcharflags < 0)
9598 firstchar = find_firstassertedchar(codestart, &firstcharflags, FALSE);
9599 if (firstcharflags >= 0) /* Remove caseless flag for non-caseable chars */
9600 {
9601 #if defined COMPILE_PCRE8
9602 re->first_char = firstchar & 0xff;
9603 #elif defined COMPILE_PCRE16
9604 re->first_char = firstchar & 0xffff;
9605 #elif defined COMPILE_PCRE32
9606 re->first_char = firstchar;
9607 #endif
9608 if ((firstcharflags & REQ_CASELESS) != 0)
9609 {
9610 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
9611 /* We ignore non-ASCII first chars in 8 bit mode. */
9612 if (utf)
9613 {
9614 if (re->first_char < 128)
9615 {
9616 if (cd->fcc[re->first_char] != re->first_char)
9617 re->flags |= PCRE_FCH_CASELESS;
9618 }
9619 else if (UCD_OTHERCASE(re->first_char) != re->first_char)
9620 re->flags |= PCRE_FCH_CASELESS;
9621 }
9622 else
9623 #endif
9624 if (MAX_255(re->first_char)
9625 && cd->fcc[re->first_char] != re->first_char)
9626 re->flags |= PCRE_FCH_CASELESS;
9627 }
9628
9629 re->flags |= PCRE_FIRSTSET;
9630 }
9631
9632 else if (is_startline(codestart, 0, cd, 0)) re->flags |= PCRE_STARTLINE;
9633 }
9634 }
9635
9636 /* For an anchored pattern, we use the "required byte" only if it follows a
9637 variable length item in the regex. Remove the caseless flag for non-caseable
9638 bytes. */
9639
9640 if (reqcharflags >= 0 &&
9641 ((re->options & PCRE_ANCHORED) == 0 || (reqcharflags & REQ_VARY) != 0))
9642 {
9643 #if defined COMPILE_PCRE8
9644 re->req_char = reqchar & 0xff;
9645 #elif defined COMPILE_PCRE16
9646 re->req_char = reqchar & 0xffff;
9647 #elif defined COMPILE_PCRE32
9648 re->req_char = reqchar;
9649 #endif
9650 if ((reqcharflags & REQ_CASELESS) != 0)
9651 {
9652 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
9653 /* We ignore non-ASCII first chars in 8 bit mode. */
9654 if (utf)
9655 {
9656 if (re->req_char < 128)
9657 {
9658 if (cd->fcc[re->req_char] != re->req_char)
9659 re->flags |= PCRE_RCH_CASELESS;
9660 }
9661 else if (UCD_OTHERCASE(re->req_char) != re->req_char)
9662 re->flags |= PCRE_RCH_CASELESS;
9663 }
9664 else
9665 #endif
9666 if (MAX_255(re->req_char) && cd->fcc[re->req_char] != re->req_char)
9667 re->flags |= PCRE_RCH_CASELESS;
9668 }
9669
9670 re->flags |= PCRE_REQCHSET;
9671 }
9672
9673 /* Print out the compiled data if debugging is enabled. This is never the
9674 case when building a production library. */
9675
9676 #ifdef PCRE_DEBUG
9677 printf("Length = %d top_bracket = %d top_backref = %d\n",
9678 length, re->top_bracket, re->top_backref);
9679
9680 printf("Options=%08x\n", re->options);
9681
9682 if ((re->flags & PCRE_FIRSTSET) != 0)
9683 {
9684 pcre_uchar ch = re->first_char;
9685 const char *caseless =
9686 ((re->flags & PCRE_FCH_CASELESS) == 0)? "" : " (caseless)";
9687 if (PRINTABLE(ch)) printf("First char = %c%s\n", ch, caseless);
9688 else printf("First char = \\x%02x%s\n", ch, caseless);
9689 }
9690
9691 if ((re->flags & PCRE_REQCHSET) != 0)
9692 {
9693 pcre_uchar ch = re->req_char;
9694 const char *caseless =
9695 ((re->flags & PCRE_RCH_CASELESS) == 0)? "" : " (caseless)";
9696 if (PRINTABLE(ch)) printf("Req char = %c%s\n", ch, caseless);
9697 else printf("Req char = \\x%02x%s\n", ch, caseless);
9698 }
9699
9700 #if defined COMPILE_PCRE8
9701 pcre_printint((pcre *)re, stdout, TRUE);
9702 #elif defined COMPILE_PCRE16
9703 pcre16_printint((pcre *)re, stdout, TRUE);
9704 #elif defined COMPILE_PCRE32
9705 pcre32_printint((pcre *)re, stdout, TRUE);
9706 #endif
9707
9708 /* This check is done here in the debugging case so that the code that
9709 was compiled can be seen. */
9710
9711 if (code - codestart > length)
9712 {
9713 (PUBL(free))(re);
9714 *errorptr = find_error_text(ERR23);
9715 *erroroffset = ptr - (pcre_uchar *)pattern;
9716 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
9717 return NULL;
9718 }
9719 #endif /* PCRE_DEBUG */
9720
9721 /* Check for a pattern than can match an empty string, so that this information
9722 can be provided to applications. */
9723
9724 do
9725 {
9726 if (could_be_empty_branch(codestart, code, utf, cd, NULL))
9727 {
9728 re->flags |= PCRE_MATCH_EMPTY;
9729 break;
9730 }
9731 codestart += GET(codestart, 1);
9732 }
9733 while (*codestart == OP_ALT);
9734
9735 #if defined COMPILE_PCRE8
9736 return (pcre *)re;
9737 #elif defined COMPILE_PCRE16
9738 return (pcre16 *)re;
9739 #elif defined COMPILE_PCRE32
9740 return (pcre32 *)re;
9741 #endif
9742 }
9743
9744 /* End of pcre_compile.c */
9745