1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016-2023 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41 /* This module contains functions for scanning a compiled pattern and
42 collecting data (e.g. minimum matching length). */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #include "pcre2_internal.h"
50
51 /* The maximum remembered capturing brackets minimum. */
52
53 #define MAX_CACHE_BACKREF 128
54
55 /* Set a bit in the starting code unit bit map. */
56
57 #define SET_BIT(c) re->start_bitmap[(c)/8] |= (1u << ((c)&7))
58
59 /* Returns from set_start_bits() */
60
61 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN, SSB_TOODEEP };
62
63
64 /*************************************************
65 * Find the minimum subject length for a group *
66 *************************************************/
67
68 /* Scan a parenthesized group and compute the minimum length of subject that
69 is needed to match it. This is a lower bound; it does not mean there is a
70 string of that length that matches. In UTF mode, the result is in characters
71 rather than code units. The field in a compiled pattern for storing the minimum
72 length is 16-bits long (on the grounds that anything longer than that is
73 pathological), so we give up when we reach that amount. This also means that
74 integer overflow for really crazy patterns cannot happen.
75
76 Backreference minimum lengths are cached to speed up multiple references. This
77 function is called only when the highest back reference in the pattern is less
78 than or equal to MAX_CACHE_BACKREF, which is one less than the size of the
79 caching vector. The zeroth element contains the number of the highest set
80 value.
81
82 Arguments:
83 re compiled pattern block
84 code pointer to start of group (the bracket)
85 startcode pointer to start of the whole pattern's code
86 utf UTF flag
87 recurses chain of recurse_check to catch mutual recursion
88 countptr pointer to call count (to catch over complexity)
89 backref_cache vector for caching back references.
90
91 This function is no longer called when the pattern contains (*ACCEPT); however,
92 the old code for returning -1 is retained, just in case.
93
94 Returns: the minimum length
95 -1 \C in UTF-8 mode
96 or (*ACCEPT)
97 or pattern too complicated
98 -2 internal error (missing capturing bracket)
99 -3 internal error (opcode not listed)
100 */
101
102 static int
find_minlength(const pcre2_real_code * re,PCRE2_SPTR code,PCRE2_SPTR startcode,BOOL utf,recurse_check * recurses,int * countptr,int * backref_cache)103 find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
104 PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr,
105 int *backref_cache)
106 {
107 int length = -1;
108 int branchlength = 0;
109 int prev_cap_recno = -1;
110 int prev_cap_d = 0;
111 int prev_recurse_recno = -1;
112 int prev_recurse_d = 0;
113 uint32_t once_fudge = 0;
114 BOOL had_recurse = FALSE;
115 BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0;
116 PCRE2_SPTR nextbranch = code + GET(code, 1);
117 PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
118 recurse_check this_recurse;
119
120 /* If this is a "could be empty" group, its minimum length is 0. */
121
122 if (*code >= OP_SBRA && *code <= OP_SCOND) return 0;
123
124 /* Skip over capturing bracket number */
125
126 if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE;
127
128 /* A large and/or complex regex can take too long to process. */
129
130 if ((*countptr)++ > 1000) return -1;
131
132 /* Scan along the opcodes for this branch. If we get to the end of the branch,
133 check the length against that of the other branches. If the accumulated length
134 passes 16-bits, reset to that value and skip the rest of the branch. */
135
136 for (;;)
137 {
138 int d, min, recno;
139 PCRE2_UCHAR op, *cs, *ce;
140
141 if (branchlength >= UINT16_MAX)
142 {
143 branchlength = UINT16_MAX;
144 cc = (PCRE2_UCHAR *)nextbranch;
145 }
146
147 op = *cc;
148 switch (op)
149 {
150 case OP_COND:
151 case OP_SCOND:
152
153 /* If there is only one branch in a condition, the implied branch has zero
154 length, so we don't add anything. This covers the DEFINE "condition"
155 automatically. If there are two branches we can treat it the same as any
156 other non-capturing subpattern. */
157
158 cs = cc + GET(cc, 1);
159 if (*cs != OP_ALT)
160 {
161 cc = cs + 1 + LINK_SIZE;
162 break;
163 }
164 goto PROCESS_NON_CAPTURE;
165
166 case OP_BRA:
167 /* There's a special case of OP_BRA, when it is wrapped round a repeated
168 OP_RECURSE. We'd like to process the latter at this level so that
169 remembering the value works for repeated cases. So we do nothing, but
170 set a fudge value to skip over the OP_KET after the recurse. */
171
172 if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
173 {
174 once_fudge = 1 + LINK_SIZE;
175 cc += 1 + LINK_SIZE;
176 break;
177 }
178 /* Fall through */
179
180 case OP_ONCE:
181 case OP_SCRIPT_RUN:
182 case OP_SBRA:
183 case OP_BRAPOS:
184 case OP_SBRAPOS:
185 PROCESS_NON_CAPTURE:
186 d = find_minlength(re, cc, startcode, utf, recurses, countptr,
187 backref_cache);
188 if (d < 0) return d;
189 branchlength += d;
190 do cc += GET(cc, 1); while (*cc == OP_ALT);
191 cc += 1 + LINK_SIZE;
192 break;
193
194 /* To save time for repeated capturing subpatterns, we remember the
195 length of the previous one. Unfortunately we can't do the same for
196 the unnumbered ones above. Nor can we do this if (?| is present in the
197 pattern because captures with the same number are not then identical. */
198
199 case OP_CBRA:
200 case OP_SCBRA:
201 case OP_CBRAPOS:
202 case OP_SCBRAPOS:
203 recno = (int)GET2(cc, 1+LINK_SIZE);
204 if (dupcapused || recno != prev_cap_recno)
205 {
206 prev_cap_recno = recno;
207 prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr,
208 backref_cache);
209 if (prev_cap_d < 0) return prev_cap_d;
210 }
211 branchlength += prev_cap_d;
212 do cc += GET(cc, 1); while (*cc == OP_ALT);
213 cc += 1 + LINK_SIZE;
214 break;
215
216 /* ACCEPT makes things far too complicated; we have to give up. In fact,
217 from 10.34 onwards, if a pattern contains (*ACCEPT), this function is not
218 used. However, leave the code in place, just in case. */
219
220 case OP_ACCEPT:
221 case OP_ASSERT_ACCEPT:
222 return -1;
223
224 /* Reached end of a branch; if it's a ket it is the end of a nested
225 call. If it's ALT it is an alternation in a nested call. If it is END it's
226 the end of the outer call. All can be handled by the same code. If the
227 length of any branch is zero, there is no need to scan any subsequent
228 branches. */
229
230 case OP_ALT:
231 case OP_KET:
232 case OP_KETRMAX:
233 case OP_KETRMIN:
234 case OP_KETRPOS:
235 case OP_END:
236 if (length < 0 || (!had_recurse && branchlength < length))
237 length = branchlength;
238 if (op != OP_ALT || length == 0) return length;
239 nextbranch = cc + GET(cc, 1);
240 cc += 1 + LINK_SIZE;
241 branchlength = 0;
242 had_recurse = FALSE;
243 break;
244
245 /* Skip over assertive subpatterns */
246
247 case OP_ASSERT:
248 case OP_ASSERT_NOT:
249 case OP_ASSERTBACK:
250 case OP_ASSERTBACK_NOT:
251 case OP_ASSERT_NA:
252 case OP_ASSERTBACK_NA:
253 do cc += GET(cc, 1); while (*cc == OP_ALT);
254 /* Fall through */
255
256 /* Skip over things that don't match chars */
257
258 case OP_REVERSE:
259 case OP_VREVERSE:
260 case OP_CREF:
261 case OP_DNCREF:
262 case OP_RREF:
263 case OP_DNRREF:
264 case OP_FALSE:
265 case OP_TRUE:
266 case OP_CALLOUT:
267 case OP_SOD:
268 case OP_SOM:
269 case OP_EOD:
270 case OP_EODN:
271 case OP_CIRC:
272 case OP_CIRCM:
273 case OP_DOLL:
274 case OP_DOLLM:
275 case OP_NOT_WORD_BOUNDARY:
276 case OP_WORD_BOUNDARY:
277 case OP_NOT_UCP_WORD_BOUNDARY:
278 case OP_UCP_WORD_BOUNDARY:
279 cc += PRIV(OP_lengths)[*cc];
280 break;
281
282 case OP_CALLOUT_STR:
283 cc += GET(cc, 1 + 2*LINK_SIZE);
284 break;
285
286 /* Skip over a subpattern that has a {0} or {0,x} quantifier */
287
288 case OP_BRAZERO:
289 case OP_BRAMINZERO:
290 case OP_BRAPOSZERO:
291 case OP_SKIPZERO:
292 cc += PRIV(OP_lengths)[*cc];
293 do cc += GET(cc, 1); while (*cc == OP_ALT);
294 cc += 1 + LINK_SIZE;
295 break;
296
297 /* Handle literal characters and + repetitions */
298
299 case OP_CHAR:
300 case OP_CHARI:
301 case OP_NOT:
302 case OP_NOTI:
303 case OP_PLUS:
304 case OP_PLUSI:
305 case OP_MINPLUS:
306 case OP_MINPLUSI:
307 case OP_POSPLUS:
308 case OP_POSPLUSI:
309 case OP_NOTPLUS:
310 case OP_NOTPLUSI:
311 case OP_NOTMINPLUS:
312 case OP_NOTMINPLUSI:
313 case OP_NOTPOSPLUS:
314 case OP_NOTPOSPLUSI:
315 branchlength++;
316 cc += 2;
317 #ifdef SUPPORT_UNICODE
318 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
319 #endif
320 break;
321
322 case OP_TYPEPLUS:
323 case OP_TYPEMINPLUS:
324 case OP_TYPEPOSPLUS:
325 branchlength++;
326 cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
327 break;
328
329 /* Handle exact repetitions. The count is already in characters, but we
330 may need to skip over a multibyte character in UTF mode. */
331
332 case OP_EXACT:
333 case OP_EXACTI:
334 case OP_NOTEXACT:
335 case OP_NOTEXACTI:
336 branchlength += GET2(cc,1);
337 cc += 2 + IMM2_SIZE;
338 #ifdef SUPPORT_UNICODE
339 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
340 #endif
341 break;
342
343 case OP_TYPEEXACT:
344 branchlength += GET2(cc,1);
345 cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
346 || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
347 break;
348
349 /* Handle single-char non-literal matchers */
350
351 case OP_PROP:
352 case OP_NOTPROP:
353 cc += 2;
354 /* Fall through */
355
356 case OP_NOT_DIGIT:
357 case OP_DIGIT:
358 case OP_NOT_WHITESPACE:
359 case OP_WHITESPACE:
360 case OP_NOT_WORDCHAR:
361 case OP_WORDCHAR:
362 case OP_ANY:
363 case OP_ALLANY:
364 case OP_EXTUNI:
365 case OP_HSPACE:
366 case OP_NOT_HSPACE:
367 case OP_VSPACE:
368 case OP_NOT_VSPACE:
369 branchlength++;
370 cc++;
371 break;
372
373 /* "Any newline" might match two characters, but it also might match just
374 one. */
375
376 case OP_ANYNL:
377 branchlength += 1;
378 cc++;
379 break;
380
381 /* The single-byte matcher means we can't proceed in UTF mode. (In
382 non-UTF mode \C will actually be turned into OP_ALLANY, so won't ever
383 appear, but leave the code, just in case.) */
384
385 case OP_ANYBYTE:
386 #ifdef SUPPORT_UNICODE
387 if (utf) return -1;
388 #endif
389 branchlength++;
390 cc++;
391 break;
392
393 /* For repeated character types, we have to test for \p and \P, which have
394 an extra two bytes of parameters. */
395
396 case OP_TYPESTAR:
397 case OP_TYPEMINSTAR:
398 case OP_TYPEQUERY:
399 case OP_TYPEMINQUERY:
400 case OP_TYPEPOSSTAR:
401 case OP_TYPEPOSQUERY:
402 if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
403 cc += PRIV(OP_lengths)[op];
404 break;
405
406 case OP_TYPEUPTO:
407 case OP_TYPEMINUPTO:
408 case OP_TYPEPOSUPTO:
409 if (cc[1 + IMM2_SIZE] == OP_PROP
410 || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
411 cc += PRIV(OP_lengths)[op];
412 break;
413
414 /* Check a class for variable quantification */
415
416 case OP_CLASS:
417 case OP_NCLASS:
418 #ifdef SUPPORT_WIDE_CHARS
419 case OP_XCLASS:
420 /* The original code caused an unsigned overflow in 64 bit systems,
421 so now we use a conditional statement. */
422 if (op == OP_XCLASS)
423 cc += GET(cc, 1);
424 else
425 cc += PRIV(OP_lengths)[OP_CLASS];
426 #else
427 cc += PRIV(OP_lengths)[OP_CLASS];
428 #endif
429
430 switch (*cc)
431 {
432 case OP_CRPLUS:
433 case OP_CRMINPLUS:
434 case OP_CRPOSPLUS:
435 branchlength++;
436 /* Fall through */
437
438 case OP_CRSTAR:
439 case OP_CRMINSTAR:
440 case OP_CRQUERY:
441 case OP_CRMINQUERY:
442 case OP_CRPOSSTAR:
443 case OP_CRPOSQUERY:
444 cc++;
445 break;
446
447 case OP_CRRANGE:
448 case OP_CRMINRANGE:
449 case OP_CRPOSRANGE:
450 branchlength += GET2(cc,1);
451 cc += 1 + 2 * IMM2_SIZE;
452 break;
453
454 default:
455 branchlength++;
456 break;
457 }
458 break;
459
460 /* Backreferences and subroutine calls (OP_RECURSE) are treated in the same
461 way: we find the minimum length for the subpattern. A recursion
462 (backreference or subroutine) causes an a flag to be set that causes the
463 length of this branch to be ignored. The logic is that a recursion can only
464 make sense if there is another alternative that stops the recursing. That
465 will provide the minimum length (when no recursion happens).
466
467 If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket
468 matches an empty string (by default it causes a matching failure), so in
469 that case we must set the minimum length to zero.
470
471 For backreferenes, if duplicate numbers are present in the pattern we check
472 for a reference to a duplicate. If it is, we don't know which version will
473 be referenced, so we have to set the minimum length to zero. */
474
475 /* Duplicate named pattern back reference. */
476
477 case OP_DNREF:
478 case OP_DNREFI:
479 if (!dupcapused && (re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
480 {
481 int count = GET2(cc, 1+IMM2_SIZE);
482 PCRE2_UCHAR *slot =
483 (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
484 GET2(cc, 1) * re->name_entry_size;
485
486 d = INT_MAX;
487
488 /* Scan all groups with the same name; find the shortest. */
489
490 while (count-- > 0)
491 {
492 int dd, i;
493 recno = GET2(slot, 0);
494
495 if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
496 dd = backref_cache[recno];
497 else
498 {
499 ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
500 if (cs == NULL) return -2;
501 do ce += GET(ce, 1); while (*ce == OP_ALT);
502
503 dd = 0;
504 if (!dupcapused ||
505 (PCRE2_UCHAR *)PRIV(find_bracket)(ce, utf, recno) == NULL)
506 {
507 if (cc > cs && cc < ce) /* Simple recursion */
508 {
509 had_recurse = TRUE;
510 }
511 else
512 {
513 recurse_check *r = recurses;
514 for (r = recurses; r != NULL; r = r->prev)
515 if (r->group == cs) break;
516 if (r != NULL) /* Mutual recursion */
517 {
518 had_recurse = TRUE;
519 }
520 else
521 {
522 this_recurse.prev = recurses; /* No recursion */
523 this_recurse.group = cs;
524 dd = find_minlength(re, cs, startcode, utf, &this_recurse,
525 countptr, backref_cache);
526 if (dd < 0) return dd;
527 }
528 }
529 }
530
531 backref_cache[recno] = dd;
532 for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
533 backref_cache[0] = recno;
534 }
535
536 if (dd < d) d = dd;
537 if (d <= 0) break; /* No point looking at any more */
538 slot += re->name_entry_size;
539 }
540 }
541 else d = 0;
542 cc += 1 + 2*IMM2_SIZE;
543 goto REPEAT_BACK_REFERENCE;
544
545 /* Single back reference by number. References by name are converted to by
546 number when there is no duplication. */
547
548 case OP_REF:
549 case OP_REFI:
550 recno = GET2(cc, 1);
551 if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
552 d = backref_cache[recno];
553 else
554 {
555 int i;
556 d = 0;
557
558 if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
559 {
560 ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
561 if (cs == NULL) return -2;
562 do ce += GET(ce, 1); while (*ce == OP_ALT);
563
564 if (!dupcapused ||
565 (PCRE2_UCHAR *)PRIV(find_bracket)(ce, utf, recno) == NULL)
566 {
567 if (cc > cs && cc < ce) /* Simple recursion */
568 {
569 had_recurse = TRUE;
570 }
571 else
572 {
573 recurse_check *r = recurses;
574 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
575 if (r != NULL) /* Mutual recursion */
576 {
577 had_recurse = TRUE;
578 }
579 else /* No recursion */
580 {
581 this_recurse.prev = recurses;
582 this_recurse.group = cs;
583 d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,
584 backref_cache);
585 if (d < 0) return d;
586 }
587 }
588 }
589 }
590
591 backref_cache[recno] = d;
592 for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
593 backref_cache[0] = recno;
594 }
595
596 cc += 1 + IMM2_SIZE;
597
598 /* Handle repeated back references */
599
600 REPEAT_BACK_REFERENCE:
601 switch (*cc)
602 {
603 case OP_CRSTAR:
604 case OP_CRMINSTAR:
605 case OP_CRQUERY:
606 case OP_CRMINQUERY:
607 case OP_CRPOSSTAR:
608 case OP_CRPOSQUERY:
609 min = 0;
610 cc++;
611 break;
612
613 case OP_CRPLUS:
614 case OP_CRMINPLUS:
615 case OP_CRPOSPLUS:
616 min = 1;
617 cc++;
618 break;
619
620 case OP_CRRANGE:
621 case OP_CRMINRANGE:
622 case OP_CRPOSRANGE:
623 min = GET2(cc, 1);
624 cc += 1 + 2 * IMM2_SIZE;
625 break;
626
627 default:
628 min = 1;
629 break;
630 }
631
632 /* Take care not to overflow: (1) min and d are ints, so check that their
633 product is not greater than INT_MAX. (2) branchlength is limited to
634 UINT16_MAX (checked at the top of the loop). */
635
636 if ((d > 0 && (INT_MAX/d) < min) || UINT16_MAX - branchlength < min*d)
637 branchlength = UINT16_MAX;
638 else branchlength += min * d;
639 break;
640
641 /* Recursion always refers to the first occurrence of a subpattern with a
642 given number. Therefore, we can always make use of caching, even when the
643 pattern contains multiple subpatterns with the same number. */
644
645 case OP_RECURSE:
646 cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1);
647 recno = GET2(cs, 1+LINK_SIZE);
648 if (recno == prev_recurse_recno)
649 {
650 branchlength += prev_recurse_d;
651 }
652 else
653 {
654 do ce += GET(ce, 1); while (*ce == OP_ALT);
655 if (cc > cs && cc < ce) /* Simple recursion */
656 had_recurse = TRUE;
657 else
658 {
659 recurse_check *r = recurses;
660 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
661 if (r != NULL) /* Mutual recursion */
662 had_recurse = TRUE;
663 else
664 {
665 this_recurse.prev = recurses;
666 this_recurse.group = cs;
667 prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,
668 countptr, backref_cache);
669 if (prev_recurse_d < 0) return prev_recurse_d;
670 prev_recurse_recno = recno;
671 branchlength += prev_recurse_d;
672 }
673 }
674 }
675 cc += 1 + LINK_SIZE + once_fudge;
676 once_fudge = 0;
677 break;
678
679 /* Anything else does not or need not match a character. We can get the
680 item's length from the table, but for those that can match zero occurrences
681 of a character, we must take special action for UTF-8 characters. As it
682 happens, the "NOT" versions of these opcodes are used at present only for
683 ASCII characters, so they could be omitted from this list. However, in
684 future that may change, so we include them here so as not to leave a
685 gotcha for a future maintainer. */
686
687 case OP_UPTO:
688 case OP_UPTOI:
689 case OP_NOTUPTO:
690 case OP_NOTUPTOI:
691 case OP_MINUPTO:
692 case OP_MINUPTOI:
693 case OP_NOTMINUPTO:
694 case OP_NOTMINUPTOI:
695 case OP_POSUPTO:
696 case OP_POSUPTOI:
697 case OP_NOTPOSUPTO:
698 case OP_NOTPOSUPTOI:
699
700 case OP_STAR:
701 case OP_STARI:
702 case OP_NOTSTAR:
703 case OP_NOTSTARI:
704 case OP_MINSTAR:
705 case OP_MINSTARI:
706 case OP_NOTMINSTAR:
707 case OP_NOTMINSTARI:
708 case OP_POSSTAR:
709 case OP_POSSTARI:
710 case OP_NOTPOSSTAR:
711 case OP_NOTPOSSTARI:
712
713 case OP_QUERY:
714 case OP_QUERYI:
715 case OP_NOTQUERY:
716 case OP_NOTQUERYI:
717 case OP_MINQUERY:
718 case OP_MINQUERYI:
719 case OP_NOTMINQUERY:
720 case OP_NOTMINQUERYI:
721 case OP_POSQUERY:
722 case OP_POSQUERYI:
723 case OP_NOTPOSQUERY:
724 case OP_NOTPOSQUERYI:
725
726 cc += PRIV(OP_lengths)[op];
727 #ifdef SUPPORT_UNICODE
728 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
729 #endif
730 break;
731
732 /* Skip these, but we need to add in the name length. */
733
734 case OP_MARK:
735 case OP_COMMIT_ARG:
736 case OP_PRUNE_ARG:
737 case OP_SKIP_ARG:
738 case OP_THEN_ARG:
739 cc += PRIV(OP_lengths)[op] + cc[1];
740 break;
741
742 /* The remaining opcodes are just skipped over. */
743
744 case OP_CLOSE:
745 case OP_COMMIT:
746 case OP_FAIL:
747 case OP_PRUNE:
748 case OP_SET_SOM:
749 case OP_SKIP:
750 case OP_THEN:
751 cc += PRIV(OP_lengths)[op];
752 break;
753
754 /* This should not occur: we list all opcodes explicitly so that when
755 new ones get added they are properly considered. */
756
757 default:
758 return -3;
759 }
760 }
761 /* Control never gets here */
762 }
763
764
765
766 /*************************************************
767 * Set a bit and maybe its alternate case *
768 *************************************************/
769
770 /* Given a character, set its first code unit's bit in the table, and also the
771 corresponding bit for the other version of a letter if we are caseless.
772
773 Arguments:
774 re points to the regex block
775 p points to the first code unit of the character
776 caseless TRUE if caseless
777 utf TRUE for UTF mode
778 ucp TRUE for UCP mode
779
780 Returns: pointer after the character
781 */
782
783 static PCRE2_SPTR
set_table_bit(pcre2_real_code * re,PCRE2_SPTR p,BOOL caseless,BOOL utf,BOOL ucp)784 set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf,
785 BOOL ucp)
786 {
787 uint32_t c = *p++; /* First code unit */
788
789 (void)utf; /* Stop compiler warnings when UTF not supported */
790 (void)ucp;
791
792 /* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
793 0xff. */
794
795 #if PCRE2_CODE_UNIT_WIDTH != 8
796 if (c > 0xff) SET_BIT(0xff); else
797 #endif
798
799 SET_BIT(c);
800
801 /* In UTF-8 or UTF-16 mode, pick up the remaining code units in order to find
802 the end of the character, even when caseless. */
803
804 #ifdef SUPPORT_UNICODE
805 if (utf)
806 {
807 #if PCRE2_CODE_UNIT_WIDTH == 8
808 if (c >= 0xc0) GETUTF8INC(c, p);
809 #elif PCRE2_CODE_UNIT_WIDTH == 16
810 if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, p);
811 #endif
812 }
813 #endif /* SUPPORT_UNICODE */
814
815 /* If caseless, handle the other case of the character. */
816
817 if (caseless)
818 {
819 #ifdef SUPPORT_UNICODE
820 if (utf || ucp)
821 {
822 c = UCD_OTHERCASE(c);
823 #if PCRE2_CODE_UNIT_WIDTH == 8
824 if (utf)
825 {
826 PCRE2_UCHAR buff[6];
827 (void)PRIV(ord2utf)(c, buff);
828 SET_BIT(buff[0]);
829 }
830 else if (c < 256) SET_BIT(c);
831 #else /* 16-bit or 32-bit mode */
832 if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
833 #endif
834 }
835
836 else
837 #endif /* SUPPORT_UNICODE */
838
839 /* Not UTF or UCP */
840
841 if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
842 }
843
844 return p;
845 }
846
847
848
849 /*************************************************
850 * Set bits for a positive character type *
851 *************************************************/
852
853 /* This function sets starting bits for a character type. In UTF-8 mode, we can
854 only do a direct setting for bytes less than 128, as otherwise there can be
855 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
856 environment, the tables will only recognize ASCII characters anyway, but in at
857 least one Windows environment, some higher bytes bits were set in the tables.
858 So we deal with that case by considering the UTF-8 encoding.
859
860 Arguments:
861 re the regex block
862 cbit type the type of character wanted
863 table_limit 32 for non-UTF-8; 16 for UTF-8
864
865 Returns: nothing
866 */
867
868 static void
set_type_bits(pcre2_real_code * re,int cbit_type,unsigned int table_limit)869 set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
870 {
871 uint32_t c;
872 for (c = 0; c < table_limit; c++)
873 re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type];
874 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
875 if (table_limit == 32) return;
876 for (c = 128; c < 256; c++)
877 {
878 if ((re->tables[cbits_offset + c/8] & (1u << (c&7))) != 0)
879 {
880 PCRE2_UCHAR buff[6];
881 (void)PRIV(ord2utf)(c, buff);
882 SET_BIT(buff[0]);
883 }
884 }
885 #endif /* UTF-8 */
886 }
887
888
889 /*************************************************
890 * Set bits for a negative character type *
891 *************************************************/
892
893 /* This function sets starting bits for a negative character type such as \D.
894 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
895 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
896 Unlike in the positive case, where we can set appropriate starting bits for
897 specific high-valued UTF-8 characters, in this case we have to set the bits for
898 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
899 0xc0 (192) for simplicity.
900
901 Arguments:
902 re the regex block
903 cbit type the type of character wanted
904 table_limit 32 for non-UTF-8; 16 for UTF-8
905
906 Returns: nothing
907 */
908
909 static void
set_nottype_bits(pcre2_real_code * re,int cbit_type,unsigned int table_limit)910 set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
911 {
912 uint32_t c;
913 for (c = 0; c < table_limit; c++)
914 re->start_bitmap[c] |= (uint8_t)(~(re->tables[c+cbits_offset+cbit_type]));
915 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
916 if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff;
917 #endif
918 }
919
920
921
922 /*************************************************
923 * Create bitmap of starting code units *
924 *************************************************/
925
926 /* This function scans a compiled unanchored expression recursively and
927 attempts to build a bitmap of the set of possible starting code units whose
928 values are less than 256. In 16-bit and 32-bit mode, values above 255 all cause
929 the 255 bit to be set. When calling set[_not]_type_bits() in UTF-8 (sic) mode
930 we pass a value of 16 rather than 32 as the final argument. (See comments in
931 those functions for the reason.)
932
933 The SSB_CONTINUE return is useful for parenthesized groups in patterns such as
934 (a*)b where the group provides some optional starting code units but scanning
935 must continue at the outer level to find at least one mandatory code unit. At
936 the outermost level, this function fails unless the result is SSB_DONE.
937
938 We restrict recursion (for nested groups) to 1000 to avoid stack overflow
939 issues.
940
941 Arguments:
942 re points to the compiled regex block
943 code points to an expression
944 utf TRUE if in UTF mode
945 ucp TRUE if in UCP mode
946 depthptr pointer to recurse depth
947
948 Returns: SSB_FAIL => Failed to find any starting code units
949 SSB_DONE => Found mandatory starting code units
950 SSB_CONTINUE => Found optional starting code units
951 SSB_UNKNOWN => Hit an unrecognized opcode
952 SSB_TOODEEP => Recursion is too deep
953 */
954
955 static int
set_start_bits(pcre2_real_code * re,PCRE2_SPTR code,BOOL utf,BOOL ucp,int * depthptr)956 set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, BOOL ucp,
957 int *depthptr)
958 {
959 uint32_t c;
960 int yield = SSB_DONE;
961
962 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
963 int table_limit = utf? 16:32;
964 #else
965 int table_limit = 32;
966 #endif
967
968 *depthptr += 1;
969 if (*depthptr > 1000) return SSB_TOODEEP;
970
971 do
972 {
973 BOOL try_next = TRUE;
974 PCRE2_SPTR tcode = code + 1 + LINK_SIZE;
975
976 if (*code == OP_CBRA || *code == OP_SCBRA ||
977 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
978
979 while (try_next) /* Loop for items in this branch */
980 {
981 int rc;
982 PCRE2_SPTR ncode;
983 uint8_t *classmap = NULL;
984 #ifdef SUPPORT_WIDE_CHARS
985 PCRE2_UCHAR xclassflags;
986 #endif
987
988 switch(*tcode)
989 {
990 /* If we reach something we don't understand, it means a new opcode has
991 been created that hasn't been added to this function. Hopefully this
992 problem will be discovered during testing. */
993
994 default:
995 return SSB_UNKNOWN;
996
997 /* Fail for a valid opcode that implies no starting bits. */
998
999 case OP_ACCEPT:
1000 case OP_ASSERT_ACCEPT:
1001 case OP_ALLANY:
1002 case OP_ANY:
1003 case OP_ANYBYTE:
1004 case OP_CIRCM:
1005 case OP_CLOSE:
1006 case OP_COMMIT:
1007 case OP_COMMIT_ARG:
1008 case OP_COND:
1009 case OP_CREF:
1010 case OP_FALSE:
1011 case OP_TRUE:
1012 case OP_DNCREF:
1013 case OP_DNREF:
1014 case OP_DNREFI:
1015 case OP_DNRREF:
1016 case OP_DOLL:
1017 case OP_DOLLM:
1018 case OP_END:
1019 case OP_EOD:
1020 case OP_EODN:
1021 case OP_EXTUNI:
1022 case OP_FAIL:
1023 case OP_MARK:
1024 case OP_NOT:
1025 case OP_NOTEXACT:
1026 case OP_NOTEXACTI:
1027 case OP_NOTI:
1028 case OP_NOTMINPLUS:
1029 case OP_NOTMINPLUSI:
1030 case OP_NOTMINQUERY:
1031 case OP_NOTMINQUERYI:
1032 case OP_NOTMINSTAR:
1033 case OP_NOTMINSTARI:
1034 case OP_NOTMINUPTO:
1035 case OP_NOTMINUPTOI:
1036 case OP_NOTPLUS:
1037 case OP_NOTPLUSI:
1038 case OP_NOTPOSPLUS:
1039 case OP_NOTPOSPLUSI:
1040 case OP_NOTPOSQUERY:
1041 case OP_NOTPOSQUERYI:
1042 case OP_NOTPOSSTAR:
1043 case OP_NOTPOSSTARI:
1044 case OP_NOTPOSUPTO:
1045 case OP_NOTPOSUPTOI:
1046 case OP_NOTPROP:
1047 case OP_NOTQUERY:
1048 case OP_NOTQUERYI:
1049 case OP_NOTSTAR:
1050 case OP_NOTSTARI:
1051 case OP_NOTUPTO:
1052 case OP_NOTUPTOI:
1053 case OP_NOT_HSPACE:
1054 case OP_NOT_VSPACE:
1055 case OP_PRUNE:
1056 case OP_PRUNE_ARG:
1057 case OP_RECURSE:
1058 case OP_REF:
1059 case OP_REFI:
1060 case OP_REVERSE:
1061 case OP_VREVERSE:
1062 case OP_RREF:
1063 case OP_SCOND:
1064 case OP_SET_SOM:
1065 case OP_SKIP:
1066 case OP_SKIP_ARG:
1067 case OP_SOD:
1068 case OP_SOM:
1069 case OP_THEN:
1070 case OP_THEN_ARG:
1071 return SSB_FAIL;
1072
1073 /* OP_CIRC happens only at the start of an anchored branch (multiline ^
1074 uses OP_CIRCM). Skip over it. */
1075
1076 case OP_CIRC:
1077 tcode += PRIV(OP_lengths)[OP_CIRC];
1078 break;
1079
1080 /* A "real" property test implies no starting bits, but the fake property
1081 PT_CLIST identifies a list of characters. These lists are short, as they
1082 are used for characters with more than one "other case", so there is no
1083 point in recognizing them for OP_NOTPROP. */
1084
1085 case OP_PROP:
1086 if (tcode[1] != PT_CLIST) return SSB_FAIL;
1087 {
1088 const uint32_t *p = PRIV(ucd_caseless_sets) + tcode[2];
1089 while ((c = *p++) < NOTACHAR)
1090 {
1091 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1092 if (utf)
1093 {
1094 PCRE2_UCHAR buff[6];
1095 (void)PRIV(ord2utf)(c, buff);
1096 c = buff[0];
1097 }
1098 #endif
1099 if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
1100 }
1101 }
1102 try_next = FALSE;
1103 break;
1104
1105 /* We can ignore word boundary tests. */
1106
1107 case OP_WORD_BOUNDARY:
1108 case OP_NOT_WORD_BOUNDARY:
1109 case OP_UCP_WORD_BOUNDARY:
1110 case OP_NOT_UCP_WORD_BOUNDARY:
1111 tcode++;
1112 break;
1113
1114 /* For a positive lookahead assertion, inspect what immediately follows,
1115 ignoring intermediate assertions and callouts. If the next item is one
1116 that sets a mandatory character, skip this assertion. Otherwise, treat it
1117 the same as other bracket groups. */
1118
1119 case OP_ASSERT:
1120 case OP_ASSERT_NA:
1121 ncode = tcode + GET(tcode, 1);
1122 while (*ncode == OP_ALT) ncode += GET(ncode, 1);
1123 ncode += 1 + LINK_SIZE;
1124
1125 /* Skip irrelevant items */
1126
1127 for (BOOL done = FALSE; !done;)
1128 {
1129 switch (*ncode)
1130 {
1131 case OP_ASSERT:
1132 case OP_ASSERT_NOT:
1133 case OP_ASSERTBACK:
1134 case OP_ASSERTBACK_NOT:
1135 case OP_ASSERT_NA:
1136 case OP_ASSERTBACK_NA:
1137 ncode += GET(ncode, 1);
1138 while (*ncode == OP_ALT) ncode += GET(ncode, 1);
1139 ncode += 1 + LINK_SIZE;
1140 break;
1141
1142 case OP_WORD_BOUNDARY:
1143 case OP_NOT_WORD_BOUNDARY:
1144 case OP_UCP_WORD_BOUNDARY:
1145 case OP_NOT_UCP_WORD_BOUNDARY:
1146 ncode++;
1147 break;
1148
1149 case OP_CALLOUT:
1150 ncode += PRIV(OP_lengths)[OP_CALLOUT];
1151 break;
1152
1153 case OP_CALLOUT_STR:
1154 ncode += GET(ncode, 1 + 2*LINK_SIZE);
1155 break;
1156
1157 default:
1158 done = TRUE;
1159 break;
1160 }
1161 }
1162
1163 /* Now check the next significant item. */
1164
1165 switch(*ncode)
1166 {
1167 default:
1168 break;
1169
1170 case OP_PROP:
1171 if (ncode[1] != PT_CLIST) break;
1172 /* Fall through */
1173 case OP_ANYNL:
1174 case OP_CHAR:
1175 case OP_CHARI:
1176 case OP_EXACT:
1177 case OP_EXACTI:
1178 case OP_HSPACE:
1179 case OP_MINPLUS:
1180 case OP_MINPLUSI:
1181 case OP_PLUS:
1182 case OP_PLUSI:
1183 case OP_POSPLUS:
1184 case OP_POSPLUSI:
1185 case OP_VSPACE:
1186 /* Note that these types will only be present in non-UCP mode. */
1187 case OP_DIGIT:
1188 case OP_NOT_DIGIT:
1189 case OP_WORDCHAR:
1190 case OP_NOT_WORDCHAR:
1191 case OP_WHITESPACE:
1192 case OP_NOT_WHITESPACE:
1193 tcode = ncode;
1194 continue; /* With the following significant opcode */
1195 }
1196 /* Fall through */
1197
1198 /* For a group bracket or a positive assertion without an immediately
1199 following mandatory setting, recurse to set bits from within the
1200 subpattern. If it can't find anything, we have to give up. If it finds
1201 some mandatory character(s), we are done for this branch. Otherwise,
1202 carry on scanning after the subpattern. */
1203
1204 case OP_BRA:
1205 case OP_SBRA:
1206 case OP_CBRA:
1207 case OP_SCBRA:
1208 case OP_BRAPOS:
1209 case OP_SBRAPOS:
1210 case OP_CBRAPOS:
1211 case OP_SCBRAPOS:
1212 case OP_ONCE:
1213 case OP_SCRIPT_RUN:
1214 rc = set_start_bits(re, tcode, utf, ucp, depthptr);
1215 if (rc == SSB_DONE)
1216 {
1217 try_next = FALSE;
1218 }
1219 else if (rc == SSB_CONTINUE)
1220 {
1221 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1222 tcode += 1 + LINK_SIZE;
1223 }
1224 else return rc; /* FAIL, UNKNOWN, or TOODEEP */
1225 break;
1226
1227 /* If we hit ALT or KET, it means we haven't found anything mandatory in
1228 this branch, though we might have found something optional. For ALT, we
1229 continue with the next alternative, but we have to arrange that the final
1230 result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
1231 return SSB_CONTINUE: if this is the top level, that indicates failure,
1232 but after a nested subpattern, it causes scanning to continue. */
1233
1234 case OP_ALT:
1235 yield = SSB_CONTINUE;
1236 try_next = FALSE;
1237 break;
1238
1239 case OP_KET:
1240 case OP_KETRMAX:
1241 case OP_KETRMIN:
1242 case OP_KETRPOS:
1243 return SSB_CONTINUE;
1244
1245 /* Skip over callout */
1246
1247 case OP_CALLOUT:
1248 tcode += PRIV(OP_lengths)[OP_CALLOUT];
1249 break;
1250
1251 case OP_CALLOUT_STR:
1252 tcode += GET(tcode, 1 + 2*LINK_SIZE);
1253 break;
1254
1255 /* Skip over lookbehind and negative lookahead assertions */
1256
1257 case OP_ASSERT_NOT:
1258 case OP_ASSERTBACK:
1259 case OP_ASSERTBACK_NOT:
1260 case OP_ASSERTBACK_NA:
1261 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1262 tcode += 1 + LINK_SIZE;
1263 break;
1264
1265 /* BRAZERO does the bracket, but carries on. */
1266
1267 case OP_BRAZERO:
1268 case OP_BRAMINZERO:
1269 case OP_BRAPOSZERO:
1270 rc = set_start_bits(re, ++tcode, utf, ucp, depthptr);
1271 if (rc == SSB_FAIL || rc == SSB_UNKNOWN || rc == SSB_TOODEEP) return rc;
1272 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1273 tcode += 1 + LINK_SIZE;
1274 break;
1275
1276 /* SKIPZERO skips the bracket. */
1277
1278 case OP_SKIPZERO:
1279 tcode++;
1280 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1281 tcode += 1 + LINK_SIZE;
1282 break;
1283
1284 /* Single-char * or ? sets the bit and tries the next item */
1285
1286 case OP_STAR:
1287 case OP_MINSTAR:
1288 case OP_POSSTAR:
1289 case OP_QUERY:
1290 case OP_MINQUERY:
1291 case OP_POSQUERY:
1292 tcode = set_table_bit(re, tcode + 1, FALSE, utf, ucp);
1293 break;
1294
1295 case OP_STARI:
1296 case OP_MINSTARI:
1297 case OP_POSSTARI:
1298 case OP_QUERYI:
1299 case OP_MINQUERYI:
1300 case OP_POSQUERYI:
1301 tcode = set_table_bit(re, tcode + 1, TRUE, utf, ucp);
1302 break;
1303
1304 /* Single-char upto sets the bit and tries the next */
1305
1306 case OP_UPTO:
1307 case OP_MINUPTO:
1308 case OP_POSUPTO:
1309 tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf, ucp);
1310 break;
1311
1312 case OP_UPTOI:
1313 case OP_MINUPTOI:
1314 case OP_POSUPTOI:
1315 tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf, ucp);
1316 break;
1317
1318 /* At least one single char sets the bit and stops */
1319
1320 case OP_EXACT:
1321 tcode += IMM2_SIZE;
1322 /* Fall through */
1323 case OP_CHAR:
1324 case OP_PLUS:
1325 case OP_MINPLUS:
1326 case OP_POSPLUS:
1327 (void)set_table_bit(re, tcode + 1, FALSE, utf, ucp);
1328 try_next = FALSE;
1329 break;
1330
1331 case OP_EXACTI:
1332 tcode += IMM2_SIZE;
1333 /* Fall through */
1334 case OP_CHARI:
1335 case OP_PLUSI:
1336 case OP_MINPLUSI:
1337 case OP_POSPLUSI:
1338 (void)set_table_bit(re, tcode + 1, TRUE, utf, ucp);
1339 try_next = FALSE;
1340 break;
1341
1342 /* Special spacing and line-terminating items. These recognize specific
1343 lists of characters. The difference between VSPACE and ANYNL is that the
1344 latter can match the two-character CRLF sequence, but that is not
1345 relevant for finding the first character, so their code here is
1346 identical. */
1347
1348 case OP_HSPACE:
1349 SET_BIT(CHAR_HT);
1350 SET_BIT(CHAR_SPACE);
1351
1352 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1353 the bits for 0xA0 and for code units >= 255, independently of UTF. */
1354
1355 #if PCRE2_CODE_UNIT_WIDTH != 8
1356 SET_BIT(0xA0);
1357 SET_BIT(0xFF);
1358 #else
1359 /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1360 units of horizontal space characters. */
1361
1362 #ifdef SUPPORT_UNICODE
1363 if (utf)
1364 {
1365 SET_BIT(0xC2); /* For U+00A0 */
1366 SET_BIT(0xE1); /* For U+1680, U+180E */
1367 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1368 SET_BIT(0xE3); /* For U+3000 */
1369 }
1370 else
1371 #endif
1372 /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1373 the code is EBCDIC. */
1374 {
1375 #ifndef EBCDIC
1376 SET_BIT(0xA0);
1377 #endif /* Not EBCDIC */
1378 }
1379 #endif /* 8-bit support */
1380
1381 try_next = FALSE;
1382 break;
1383
1384 case OP_ANYNL:
1385 case OP_VSPACE:
1386 SET_BIT(CHAR_LF);
1387 SET_BIT(CHAR_VT);
1388 SET_BIT(CHAR_FF);
1389 SET_BIT(CHAR_CR);
1390
1391 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1392 the bits for NEL and for code units >= 255, independently of UTF. */
1393
1394 #if PCRE2_CODE_UNIT_WIDTH != 8
1395 SET_BIT(CHAR_NEL);
1396 SET_BIT(0xFF);
1397 #else
1398 /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1399 units of vertical space characters. */
1400
1401 #ifdef SUPPORT_UNICODE
1402 if (utf)
1403 {
1404 SET_BIT(0xC2); /* For U+0085 (NEL) */
1405 SET_BIT(0xE2); /* For U+2028, U+2029 */
1406 }
1407 else
1408 #endif
1409 /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1410 {
1411 SET_BIT(CHAR_NEL);
1412 }
1413 #endif /* 8-bit support */
1414
1415 try_next = FALSE;
1416 break;
1417
1418 /* Single character types set the bits and stop. Note that if PCRE2_UCP
1419 is set, we do not see these opcodes because \d etc are converted to
1420 properties. Therefore, these apply in the case when only characters less
1421 than 256 are recognized to match the types. */
1422
1423 case OP_NOT_DIGIT:
1424 set_nottype_bits(re, cbit_digit, table_limit);
1425 try_next = FALSE;
1426 break;
1427
1428 case OP_DIGIT:
1429 set_type_bits(re, cbit_digit, table_limit);
1430 try_next = FALSE;
1431 break;
1432
1433 case OP_NOT_WHITESPACE:
1434 set_nottype_bits(re, cbit_space, table_limit);
1435 try_next = FALSE;
1436 break;
1437
1438 case OP_WHITESPACE:
1439 set_type_bits(re, cbit_space, table_limit);
1440 try_next = FALSE;
1441 break;
1442
1443 case OP_NOT_WORDCHAR:
1444 set_nottype_bits(re, cbit_word, table_limit);
1445 try_next = FALSE;
1446 break;
1447
1448 case OP_WORDCHAR:
1449 set_type_bits(re, cbit_word, table_limit);
1450 try_next = FALSE;
1451 break;
1452
1453 /* One or more character type fudges the pointer and restarts, knowing
1454 it will hit a single character type and stop there. */
1455
1456 case OP_TYPEPLUS:
1457 case OP_TYPEMINPLUS:
1458 case OP_TYPEPOSPLUS:
1459 tcode++;
1460 break;
1461
1462 case OP_TYPEEXACT:
1463 tcode += 1 + IMM2_SIZE;
1464 break;
1465
1466 /* Zero or more repeats of character types set the bits and then
1467 try again. */
1468
1469 case OP_TYPEUPTO:
1470 case OP_TYPEMINUPTO:
1471 case OP_TYPEPOSUPTO:
1472 tcode += IMM2_SIZE; /* Fall through */
1473
1474 case OP_TYPESTAR:
1475 case OP_TYPEMINSTAR:
1476 case OP_TYPEPOSSTAR:
1477 case OP_TYPEQUERY:
1478 case OP_TYPEMINQUERY:
1479 case OP_TYPEPOSQUERY:
1480 switch(tcode[1])
1481 {
1482 default:
1483 case OP_ANY:
1484 case OP_ALLANY:
1485 return SSB_FAIL;
1486
1487 case OP_HSPACE:
1488 SET_BIT(CHAR_HT);
1489 SET_BIT(CHAR_SPACE);
1490
1491 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1492 the bits for 0xA0 and for code units >= 255, independently of UTF. */
1493
1494 #if PCRE2_CODE_UNIT_WIDTH != 8
1495 SET_BIT(0xA0);
1496 SET_BIT(0xFF);
1497 #else
1498 /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1499 units of horizontal space characters. */
1500
1501 #ifdef SUPPORT_UNICODE
1502 if (utf)
1503 {
1504 SET_BIT(0xC2); /* For U+00A0 */
1505 SET_BIT(0xE1); /* For U+1680, U+180E */
1506 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1507 SET_BIT(0xE3); /* For U+3000 */
1508 }
1509 else
1510 #endif
1511 /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1512 the code is EBCDIC. */
1513 {
1514 #ifndef EBCDIC
1515 SET_BIT(0xA0);
1516 #endif /* Not EBCDIC */
1517 }
1518 #endif /* 8-bit support */
1519 break;
1520
1521 case OP_ANYNL:
1522 case OP_VSPACE:
1523 SET_BIT(CHAR_LF);
1524 SET_BIT(CHAR_VT);
1525 SET_BIT(CHAR_FF);
1526 SET_BIT(CHAR_CR);
1527
1528 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1529 the bits for NEL and for code units >= 255, independently of UTF. */
1530
1531 #if PCRE2_CODE_UNIT_WIDTH != 8
1532 SET_BIT(CHAR_NEL);
1533 SET_BIT(0xFF);
1534 #else
1535 /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1536 units of vertical space characters. */
1537
1538 #ifdef SUPPORT_UNICODE
1539 if (utf)
1540 {
1541 SET_BIT(0xC2); /* For U+0085 (NEL) */
1542 SET_BIT(0xE2); /* For U+2028, U+2029 */
1543 }
1544 else
1545 #endif
1546 /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1547 {
1548 SET_BIT(CHAR_NEL);
1549 }
1550 #endif /* 8-bit support */
1551 break;
1552
1553 case OP_NOT_DIGIT:
1554 set_nottype_bits(re, cbit_digit, table_limit);
1555 break;
1556
1557 case OP_DIGIT:
1558 set_type_bits(re, cbit_digit, table_limit);
1559 break;
1560
1561 case OP_NOT_WHITESPACE:
1562 set_nottype_bits(re, cbit_space, table_limit);
1563 break;
1564
1565 case OP_WHITESPACE:
1566 set_type_bits(re, cbit_space, table_limit);
1567 break;
1568
1569 case OP_NOT_WORDCHAR:
1570 set_nottype_bits(re, cbit_word, table_limit);
1571 break;
1572
1573 case OP_WORDCHAR:
1574 set_type_bits(re, cbit_word, table_limit);
1575 break;
1576 }
1577
1578 tcode += 2;
1579 break;
1580
1581 /* Extended class: if there are any property checks, or if this is a
1582 negative XCLASS without a map, give up. If there are no property checks,
1583 there must be wide characters on the XCLASS list, because otherwise an
1584 XCLASS would not have been created. This means that code points >= 255
1585 are potential starters. In the UTF-8 case we can scan them and set bits
1586 for the relevant leading bytes. */
1587
1588 #ifdef SUPPORT_WIDE_CHARS
1589 case OP_XCLASS:
1590 xclassflags = tcode[1 + LINK_SIZE];
1591 if ((xclassflags & XCL_HASPROP) != 0 ||
1592 (xclassflags & (XCL_MAP|XCL_NOT)) == XCL_NOT)
1593 return SSB_FAIL;
1594
1595 /* We have a positive XCLASS or a negative one without a map. Set up the
1596 map pointer if there is one, and fall through. */
1597
1598 classmap = ((xclassflags & XCL_MAP) == 0)? NULL :
1599 (uint8_t *)(tcode + 1 + LINK_SIZE + 1);
1600
1601 /* In UTF-8 mode, scan the character list and set bits for leading bytes,
1602 then jump to handle the map. */
1603
1604 #if PCRE2_CODE_UNIT_WIDTH == 8
1605 if (utf && (xclassflags & XCL_NOT) == 0)
1606 {
1607 PCRE2_UCHAR b, e;
1608 PCRE2_SPTR p = tcode + 1 + LINK_SIZE + 1 + ((classmap == NULL)? 0:32);
1609 tcode += GET(tcode, 1);
1610
1611 for (;;) switch (*p++)
1612 {
1613 case XCL_SINGLE:
1614 b = *p++;
1615 while ((*p & 0xc0) == 0x80) p++;
1616 re->start_bitmap[b/8] |= (1u << (b&7));
1617 break;
1618
1619 case XCL_RANGE:
1620 b = *p++;
1621 while ((*p & 0xc0) == 0x80) p++;
1622 e = *p++;
1623 while ((*p & 0xc0) == 0x80) p++;
1624 for (; b <= e; b++)
1625 re->start_bitmap[b/8] |= (1u << (b&7));
1626 break;
1627
1628 case XCL_END:
1629 goto HANDLE_CLASSMAP;
1630
1631 default:
1632 return SSB_UNKNOWN; /* Internal error, should not occur */
1633 }
1634 }
1635 #endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */
1636 #endif /* SUPPORT_WIDE_CHARS */
1637
1638 /* It seems that the fall through comment must be outside the #ifdef if
1639 it is to avoid the gcc compiler warning. */
1640
1641 /* Fall through */
1642
1643 /* Enter here for a negative non-XCLASS. In the 8-bit library, if we are
1644 in UTF mode, any byte with a value >= 0xc4 is a potentially valid starter
1645 because it starts a character with a value > 255. In 8-bit non-UTF mode,
1646 there is no difference between CLASS and NCLASS. In all other wide
1647 character modes, set the 0xFF bit to indicate code units >= 255. */
1648
1649 case OP_NCLASS:
1650 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1651 if (utf)
1652 {
1653 re->start_bitmap[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
1654 memset(re->start_bitmap+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
1655 }
1656 #elif PCRE2_CODE_UNIT_WIDTH != 8
1657 SET_BIT(0xFF); /* For characters >= 255 */
1658 #endif
1659 /* Fall through */
1660
1661 /* Enter here for a positive non-XCLASS. If we have fallen through from
1662 an XCLASS, classmap will already be set; just advance the code pointer.
1663 Otherwise, set up classmap for a a non-XCLASS and advance past it. */
1664
1665 case OP_CLASS:
1666 if (*tcode == OP_XCLASS) tcode += GET(tcode, 1); else
1667 {
1668 classmap = (uint8_t *)(++tcode);
1669 tcode += 32 / sizeof(PCRE2_UCHAR);
1670 }
1671
1672 /* When wide characters are supported, classmap may be NULL. In UTF-8
1673 (sic) mode, the bits in a class bit map correspond to character values,
1674 not to byte values. However, the bit map we are constructing is for byte
1675 values. So we have to do a conversion for characters whose code point is
1676 greater than 127. In fact, there are only two possible starting bytes for
1677 characters in the range 128 - 255. */
1678
1679 #if defined SUPPORT_WIDE_CHARS && PCRE2_CODE_UNIT_WIDTH == 8
1680 HANDLE_CLASSMAP:
1681 #endif
1682 if (classmap != NULL)
1683 {
1684 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1685 if (utf)
1686 {
1687 for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c];
1688 for (c = 128; c < 256; c++)
1689 {
1690 if ((classmap[c/8] & (1u << (c&7))) != 0)
1691 {
1692 int d = (c >> 6) | 0xc0; /* Set bit for this starter */
1693 re->start_bitmap[d/8] |= (1u << (d&7)); /* and then skip on to the */
1694 c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
1695 }
1696 }
1697 }
1698 else
1699 #endif
1700 /* In all modes except UTF-8, the two bit maps are compatible. */
1701
1702 {
1703 for (c = 0; c < 32; c++) re->start_bitmap[c] |= classmap[c];
1704 }
1705 }
1706
1707 /* Act on what follows the class. For a zero minimum repeat, continue;
1708 otherwise stop processing. */
1709
1710 switch (*tcode)
1711 {
1712 case OP_CRSTAR:
1713 case OP_CRMINSTAR:
1714 case OP_CRQUERY:
1715 case OP_CRMINQUERY:
1716 case OP_CRPOSSTAR:
1717 case OP_CRPOSQUERY:
1718 tcode++;
1719 break;
1720
1721 case OP_CRRANGE:
1722 case OP_CRMINRANGE:
1723 case OP_CRPOSRANGE:
1724 if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1725 else try_next = FALSE;
1726 break;
1727
1728 default:
1729 try_next = FALSE;
1730 break;
1731 }
1732 break; /* End of class handling case */
1733 } /* End of switch for opcodes */
1734 } /* End of try_next loop */
1735
1736 code += GET(code, 1); /* Advance to next branch */
1737 }
1738 while (*code == OP_ALT);
1739
1740 return yield;
1741 }
1742
1743
1744
1745 /*************************************************
1746 * Study a compiled expression *
1747 *************************************************/
1748
1749 /* This function is handed a compiled expression that it must study to produce
1750 information that will speed up the matching.
1751
1752 Argument:
1753 re points to the compiled expression
1754
1755 Returns: 0 normally; non-zero should never normally occur
1756 1 unknown opcode in set_start_bits
1757 2 missing capturing bracket
1758 3 unknown opcode in find_minlength
1759 */
1760
1761 int
PRIV(study)1762 PRIV(study)(pcre2_real_code *re)
1763 {
1764 int count = 0;
1765 PCRE2_UCHAR *code;
1766 BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
1767 BOOL ucp = (re->overall_options & PCRE2_UCP) != 0;
1768
1769 /* Find start of compiled code */
1770
1771 code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
1772 re->name_entry_size * re->name_count;
1773
1774 /* For a pattern that has a first code unit, or a multiline pattern that
1775 matches only at "line start", there is no point in seeking a list of starting
1776 code units. */
1777
1778 if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
1779 {
1780 int depth = 0;
1781 int rc = set_start_bits(re, code, utf, ucp, &depth);
1782 if (rc == SSB_UNKNOWN) return 1;
1783
1784 /* If a list of starting code units was set up, scan the list to see if only
1785 one or two were listed. Having only one listed is rare because usually a
1786 single starting code unit will have been recognized and PCRE2_FIRSTSET set.
1787 If two are listed, see if they are caseless versions of the same character;
1788 if so we can replace the list with a caseless first code unit. This gives
1789 better performance and is plausibly worth doing for patterns such as [Ww]ord
1790 or (word|WORD). */
1791
1792 if (rc == SSB_DONE)
1793 {
1794 int i;
1795 int a = -1;
1796 int b = -1;
1797 uint8_t *p = re->start_bitmap;
1798 uint32_t flags = PCRE2_FIRSTMAPSET;
1799
1800 for (i = 0; i < 256; p++, i += 8)
1801 {
1802 uint8_t x = *p;
1803 if (x != 0)
1804 {
1805 int c;
1806 uint8_t y = x & (~x + 1); /* Least significant bit */
1807 if (y != x) goto DONE; /* More than one bit set */
1808
1809 /* In the 16-bit and 32-bit libraries, the bit for 0xff means "0xff and
1810 all wide characters", so we cannot use it here. */
1811
1812 #if PCRE2_CODE_UNIT_WIDTH != 8
1813 if (i == 248 && x == 0x80) goto DONE;
1814 #endif
1815
1816 /* Compute the character value */
1817
1818 c = i;
1819 switch (x)
1820 {
1821 case 1: break;
1822 case 2: c += 1; break; case 4: c += 2; break;
1823 case 8: c += 3; break; case 16: c += 4; break;
1824 case 32: c += 5; break; case 64: c += 6; break;
1825 case 128: c += 7; break;
1826 }
1827
1828 /* c contains the code unit value, in the range 0-255. In 8-bit UTF
1829 mode, only values < 128 can be used. In all the other cases, c is a
1830 character value. */
1831
1832 #if PCRE2_CODE_UNIT_WIDTH == 8
1833 if (utf && c > 127) goto DONE;
1834 #endif
1835 if (a < 0) a = c; /* First one found, save in a */
1836 else if (b < 0) /* Second one found */
1837 {
1838 int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c);
1839
1840 #ifdef SUPPORT_UNICODE
1841 if (utf || ucp)
1842 {
1843 if (UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */
1844 if (c > 127) d = UCD_OTHERCASE(c);
1845 }
1846 #endif /* SUPPORT_UNICODE */
1847
1848 if (d != a) goto DONE; /* Not the other case of a */
1849 b = c; /* Save second in b */
1850 }
1851 else goto DONE; /* More than two characters found */
1852 }
1853 }
1854
1855 /* Replace the start code unit bits with a first code unit, but only if it
1856 is not the same as a required later code unit. This is because a search for
1857 a required code unit starts after an explicit first code unit, but at a
1858 code unit found from the bitmap. Patterns such as /a*a/ don't work
1859 if both the start unit and required unit are the same. */
1860
1861 if (a >= 0 &&
1862 (
1863 (re->flags & PCRE2_LASTSET) == 0 ||
1864 (
1865 re->last_codeunit != (uint32_t)a &&
1866 (b < 0 || re->last_codeunit != (uint32_t)b)
1867 )
1868 ))
1869 {
1870 re->first_codeunit = a;
1871 flags = PCRE2_FIRSTSET;
1872 if (b >= 0) flags |= PCRE2_FIRSTCASELESS;
1873 }
1874
1875 DONE:
1876 re->flags |= flags;
1877 }
1878 }
1879
1880 /* Find the minimum length of subject string. If the pattern can match an empty
1881 string, the minimum length is already known. If the pattern contains (*ACCEPT)
1882 all bets are off, and we don't even try to find a minimum length. If there are
1883 more back references than the size of the vector we are going to cache them in,
1884 do nothing. A pattern that complicated will probably take a long time to
1885 analyze and may in any case turn out to be too complicated. Note that back
1886 reference minima are held as 16-bit numbers. */
1887
1888 if ((re->flags & (PCRE2_MATCH_EMPTY|PCRE2_HASACCEPT)) == 0 &&
1889 re->top_backref <= MAX_CACHE_BACKREF)
1890 {
1891 int min;
1892 int backref_cache[MAX_CACHE_BACKREF+1];
1893 backref_cache[0] = 0; /* Highest one that is set */
1894 min = find_minlength(re, code, code, utf, NULL, &count, backref_cache);
1895 switch(min)
1896 {
1897 case -1: /* \C in UTF mode or over-complex regex */
1898 break; /* Leave minlength unchanged (will be zero) */
1899
1900 case -2:
1901 return 2; /* missing capturing bracket */
1902
1903 case -3:
1904 return 3; /* unrecognized opcode */
1905
1906 default:
1907 re->minlength = (min > UINT16_MAX)? UINT16_MAX : min;
1908 break;
1909 }
1910 }
1911
1912 return 0;
1913 }
1914
1915 /* End of pcre2_study.c */
1916