1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016-2020 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41 /* This module contains functions for scanning a compiled pattern and
42 collecting data (e.g. minimum matching length). */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #include "pcre2_internal.h"
50
51 /* The maximum remembered capturing brackets minimum. */
52
53 #define MAX_CACHE_BACKREF 128
54
55 /* Set a bit in the starting code unit bit map. */
56
57 #define SET_BIT(c) re->start_bitmap[(c)/8] |= (1u << ((c)&7))
58
59 /* Returns from set_start_bits() */
60
61 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN, SSB_TOODEEP };
62
63
64 /*************************************************
65 * Find the minimum subject length for a group *
66 *************************************************/
67
68 /* Scan a parenthesized group and compute the minimum length of subject that
69 is needed to match it. This is a lower bound; it does not mean there is a
70 string of that length that matches. In UTF mode, the result is in characters
71 rather than code units. The field in a compiled pattern for storing the minimum
72 length is 16-bits long (on the grounds that anything longer than that is
73 pathological), so we give up when we reach that amount. This also means that
74 integer overflow for really crazy patterns cannot happen.
75
76 Backreference minimum lengths are cached to speed up multiple references. This
77 function is called only when the highest back reference in the pattern is less
78 than or equal to MAX_CACHE_BACKREF, which is one less than the size of the
79 caching vector. The zeroth element contains the number of the highest set
80 value.
81
82 Arguments:
83 re compiled pattern block
84 code pointer to start of group (the bracket)
85 startcode pointer to start of the whole pattern's code
86 utf UTF flag
87 recurses chain of recurse_check to catch mutual recursion
88 countptr pointer to call count (to catch over complexity)
89 backref_cache vector for caching back references.
90
91 This function is no longer called when the pattern contains (*ACCEPT); however,
92 the old code for returning -1 is retained, just in case.
93
94 Returns: the minimum length
95 -1 \C in UTF-8 mode
96 or (*ACCEPT)
97 or pattern too complicated
98 -2 internal error (missing capturing bracket)
99 -3 internal error (opcode not listed)
100 */
101
102 static int
find_minlength(const pcre2_real_code * re,PCRE2_SPTR code,PCRE2_SPTR startcode,BOOL utf,recurse_check * recurses,int * countptr,int * backref_cache)103 find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
104 PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr,
105 int *backref_cache)
106 {
107 int length = -1;
108 int branchlength = 0;
109 int prev_cap_recno = -1;
110 int prev_cap_d = 0;
111 int prev_recurse_recno = -1;
112 int prev_recurse_d = 0;
113 uint32_t once_fudge = 0;
114 BOOL had_recurse = FALSE;
115 BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0;
116 PCRE2_SPTR nextbranch = code + GET(code, 1);
117 PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
118 recurse_check this_recurse;
119
120 /* If this is a "could be empty" group, its minimum length is 0. */
121
122 if (*code >= OP_SBRA && *code <= OP_SCOND) return 0;
123
124 /* Skip over capturing bracket number */
125
126 if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE;
127
128 /* A large and/or complex regex can take too long to process. */
129
130 if ((*countptr)++ > 1000) return -1;
131
132 /* Scan along the opcodes for this branch. If we get to the end of the branch,
133 check the length against that of the other branches. If the accumulated length
134 passes 16-bits, reset to that value and skip the rest of the branch. */
135
136 for (;;)
137 {
138 int d, min, recno;
139 PCRE2_UCHAR op, *cs, *ce;
140
141 if (branchlength >= UINT16_MAX)
142 {
143 branchlength = UINT16_MAX;
144 cc = (PCRE2_UCHAR *)nextbranch;
145 }
146
147 op = *cc;
148 switch (op)
149 {
150 case OP_COND:
151 case OP_SCOND:
152
153 /* If there is only one branch in a condition, the implied branch has zero
154 length, so we don't add anything. This covers the DEFINE "condition"
155 automatically. If there are two branches we can treat it the same as any
156 other non-capturing subpattern. */
157
158 cs = cc + GET(cc, 1);
159 if (*cs != OP_ALT)
160 {
161 cc = cs + 1 + LINK_SIZE;
162 break;
163 }
164 goto PROCESS_NON_CAPTURE;
165
166 case OP_BRA:
167 /* There's a special case of OP_BRA, when it is wrapped round a repeated
168 OP_RECURSE. We'd like to process the latter at this level so that
169 remembering the value works for repeated cases. So we do nothing, but
170 set a fudge value to skip over the OP_KET after the recurse. */
171
172 if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
173 {
174 once_fudge = 1 + LINK_SIZE;
175 cc += 1 + LINK_SIZE;
176 break;
177 }
178 /* Fall through */
179
180 case OP_ONCE:
181 case OP_SCRIPT_RUN:
182 case OP_SBRA:
183 case OP_BRAPOS:
184 case OP_SBRAPOS:
185 PROCESS_NON_CAPTURE:
186 d = find_minlength(re, cc, startcode, utf, recurses, countptr,
187 backref_cache);
188 if (d < 0) return d;
189 branchlength += d;
190 do cc += GET(cc, 1); while (*cc == OP_ALT);
191 cc += 1 + LINK_SIZE;
192 break;
193
194 /* To save time for repeated capturing subpatterns, we remember the
195 length of the previous one. Unfortunately we can't do the same for
196 the unnumbered ones above. Nor can we do this if (?| is present in the
197 pattern because captures with the same number are not then identical. */
198
199 case OP_CBRA:
200 case OP_SCBRA:
201 case OP_CBRAPOS:
202 case OP_SCBRAPOS:
203 recno = (int)GET2(cc, 1+LINK_SIZE);
204 if (dupcapused || recno != prev_cap_recno)
205 {
206 prev_cap_recno = recno;
207 prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr,
208 backref_cache);
209 if (prev_cap_d < 0) return prev_cap_d;
210 }
211 branchlength += prev_cap_d;
212 do cc += GET(cc, 1); while (*cc == OP_ALT);
213 cc += 1 + LINK_SIZE;
214 break;
215
216 /* ACCEPT makes things far too complicated; we have to give up. In fact,
217 from 10.34 onwards, if a pattern contains (*ACCEPT), this function is not
218 used. However, leave the code in place, just in case. */
219
220 case OP_ACCEPT:
221 case OP_ASSERT_ACCEPT:
222 return -1;
223
224 /* Reached end of a branch; if it's a ket it is the end of a nested
225 call. If it's ALT it is an alternation in a nested call. If it is END it's
226 the end of the outer call. All can be handled by the same code. If the
227 length of any branch is zero, there is no need to scan any subsequent
228 branches. */
229
230 case OP_ALT:
231 case OP_KET:
232 case OP_KETRMAX:
233 case OP_KETRMIN:
234 case OP_KETRPOS:
235 case OP_END:
236 if (length < 0 || (!had_recurse && branchlength < length))
237 length = branchlength;
238 if (op != OP_ALT || length == 0) return length;
239 nextbranch = cc + GET(cc, 1);
240 cc += 1 + LINK_SIZE;
241 branchlength = 0;
242 had_recurse = FALSE;
243 break;
244
245 /* Skip over assertive subpatterns */
246
247 case OP_ASSERT:
248 case OP_ASSERT_NOT:
249 case OP_ASSERTBACK:
250 case OP_ASSERTBACK_NOT:
251 case OP_ASSERT_NA:
252 case OP_ASSERTBACK_NA:
253 do cc += GET(cc, 1); while (*cc == OP_ALT);
254 /* Fall through */
255
256 /* Skip over things that don't match chars */
257
258 case OP_REVERSE:
259 case OP_CREF:
260 case OP_DNCREF:
261 case OP_RREF:
262 case OP_DNRREF:
263 case OP_FALSE:
264 case OP_TRUE:
265 case OP_CALLOUT:
266 case OP_SOD:
267 case OP_SOM:
268 case OP_EOD:
269 case OP_EODN:
270 case OP_CIRC:
271 case OP_CIRCM:
272 case OP_DOLL:
273 case OP_DOLLM:
274 case OP_NOT_WORD_BOUNDARY:
275 case OP_WORD_BOUNDARY:
276 cc += PRIV(OP_lengths)[*cc];
277 break;
278
279 case OP_CALLOUT_STR:
280 cc += GET(cc, 1 + 2*LINK_SIZE);
281 break;
282
283 /* Skip over a subpattern that has a {0} or {0,x} quantifier */
284
285 case OP_BRAZERO:
286 case OP_BRAMINZERO:
287 case OP_BRAPOSZERO:
288 case OP_SKIPZERO:
289 cc += PRIV(OP_lengths)[*cc];
290 do cc += GET(cc, 1); while (*cc == OP_ALT);
291 cc += 1 + LINK_SIZE;
292 break;
293
294 /* Handle literal characters and + repetitions */
295
296 case OP_CHAR:
297 case OP_CHARI:
298 case OP_NOT:
299 case OP_NOTI:
300 case OP_PLUS:
301 case OP_PLUSI:
302 case OP_MINPLUS:
303 case OP_MINPLUSI:
304 case OP_POSPLUS:
305 case OP_POSPLUSI:
306 case OP_NOTPLUS:
307 case OP_NOTPLUSI:
308 case OP_NOTMINPLUS:
309 case OP_NOTMINPLUSI:
310 case OP_NOTPOSPLUS:
311 case OP_NOTPOSPLUSI:
312 branchlength++;
313 cc += 2;
314 #ifdef SUPPORT_UNICODE
315 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
316 #endif
317 break;
318
319 case OP_TYPEPLUS:
320 case OP_TYPEMINPLUS:
321 case OP_TYPEPOSPLUS:
322 branchlength++;
323 cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
324 break;
325
326 /* Handle exact repetitions. The count is already in characters, but we
327 may need to skip over a multibyte character in UTF mode. */
328
329 case OP_EXACT:
330 case OP_EXACTI:
331 case OP_NOTEXACT:
332 case OP_NOTEXACTI:
333 branchlength += GET2(cc,1);
334 cc += 2 + IMM2_SIZE;
335 #ifdef SUPPORT_UNICODE
336 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
337 #endif
338 break;
339
340 case OP_TYPEEXACT:
341 branchlength += GET2(cc,1);
342 cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
343 || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
344 break;
345
346 /* Handle single-char non-literal matchers */
347
348 case OP_PROP:
349 case OP_NOTPROP:
350 cc += 2;
351 /* Fall through */
352
353 case OP_NOT_DIGIT:
354 case OP_DIGIT:
355 case OP_NOT_WHITESPACE:
356 case OP_WHITESPACE:
357 case OP_NOT_WORDCHAR:
358 case OP_WORDCHAR:
359 case OP_ANY:
360 case OP_ALLANY:
361 case OP_EXTUNI:
362 case OP_HSPACE:
363 case OP_NOT_HSPACE:
364 case OP_VSPACE:
365 case OP_NOT_VSPACE:
366 branchlength++;
367 cc++;
368 break;
369
370 /* "Any newline" might match two characters, but it also might match just
371 one. */
372
373 case OP_ANYNL:
374 branchlength += 1;
375 cc++;
376 break;
377
378 /* The single-byte matcher means we can't proceed in UTF mode. (In
379 non-UTF mode \C will actually be turned into OP_ALLANY, so won't ever
380 appear, but leave the code, just in case.) */
381
382 case OP_ANYBYTE:
383 #ifdef SUPPORT_UNICODE
384 if (utf) return -1;
385 #endif
386 branchlength++;
387 cc++;
388 break;
389
390 /* For repeated character types, we have to test for \p and \P, which have
391 an extra two bytes of parameters. */
392
393 case OP_TYPESTAR:
394 case OP_TYPEMINSTAR:
395 case OP_TYPEQUERY:
396 case OP_TYPEMINQUERY:
397 case OP_TYPEPOSSTAR:
398 case OP_TYPEPOSQUERY:
399 if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
400 cc += PRIV(OP_lengths)[op];
401 break;
402
403 case OP_TYPEUPTO:
404 case OP_TYPEMINUPTO:
405 case OP_TYPEPOSUPTO:
406 if (cc[1 + IMM2_SIZE] == OP_PROP
407 || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
408 cc += PRIV(OP_lengths)[op];
409 break;
410
411 /* Check a class for variable quantification */
412
413 case OP_CLASS:
414 case OP_NCLASS:
415 #ifdef SUPPORT_WIDE_CHARS
416 case OP_XCLASS:
417 /* The original code caused an unsigned overflow in 64 bit systems,
418 so now we use a conditional statement. */
419 if (op == OP_XCLASS)
420 cc += GET(cc, 1);
421 else
422 cc += PRIV(OP_lengths)[OP_CLASS];
423 #else
424 cc += PRIV(OP_lengths)[OP_CLASS];
425 #endif
426
427 switch (*cc)
428 {
429 case OP_CRPLUS:
430 case OP_CRMINPLUS:
431 case OP_CRPOSPLUS:
432 branchlength++;
433 /* Fall through */
434
435 case OP_CRSTAR:
436 case OP_CRMINSTAR:
437 case OP_CRQUERY:
438 case OP_CRMINQUERY:
439 case OP_CRPOSSTAR:
440 case OP_CRPOSQUERY:
441 cc++;
442 break;
443
444 case OP_CRRANGE:
445 case OP_CRMINRANGE:
446 case OP_CRPOSRANGE:
447 branchlength += GET2(cc,1);
448 cc += 1 + 2 * IMM2_SIZE;
449 break;
450
451 default:
452 branchlength++;
453 break;
454 }
455 break;
456
457 /* Backreferences and subroutine calls (OP_RECURSE) are treated in the same
458 way: we find the minimum length for the subpattern. A recursion
459 (backreference or subroutine) causes an a flag to be set that causes the
460 length of this branch to be ignored. The logic is that a recursion can only
461 make sense if there is another alternative that stops the recursing. That
462 will provide the minimum length (when no recursion happens).
463
464 If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket
465 matches an empty string (by default it causes a matching failure), so in
466 that case we must set the minimum length to zero.
467
468 For backreferenes, if duplicate numbers are present in the pattern we check
469 for a reference to a duplicate. If it is, we don't know which version will
470 be referenced, so we have to set the minimum length to zero. */
471
472 /* Duplicate named pattern back reference. */
473
474 case OP_DNREF:
475 case OP_DNREFI:
476 if (!dupcapused && (re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
477 {
478 int count = GET2(cc, 1+IMM2_SIZE);
479 PCRE2_UCHAR *slot =
480 (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
481 GET2(cc, 1) * re->name_entry_size;
482
483 d = INT_MAX;
484
485 /* Scan all groups with the same name; find the shortest. */
486
487 while (count-- > 0)
488 {
489 int dd, i;
490 recno = GET2(slot, 0);
491
492 if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
493 dd = backref_cache[recno];
494 else
495 {
496 ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
497 if (cs == NULL) return -2;
498 do ce += GET(ce, 1); while (*ce == OP_ALT);
499
500 dd = 0;
501 if (!dupcapused ||
502 (PCRE2_UCHAR *)PRIV(find_bracket)(ce, utf, recno) == NULL)
503 {
504 if (cc > cs && cc < ce) /* Simple recursion */
505 {
506 had_recurse = TRUE;
507 }
508 else
509 {
510 recurse_check *r = recurses;
511 for (r = recurses; r != NULL; r = r->prev)
512 if (r->group == cs) break;
513 if (r != NULL) /* Mutual recursion */
514 {
515 had_recurse = TRUE;
516 }
517 else
518 {
519 this_recurse.prev = recurses; /* No recursion */
520 this_recurse.group = cs;
521 dd = find_minlength(re, cs, startcode, utf, &this_recurse,
522 countptr, backref_cache);
523 if (dd < 0) return dd;
524 }
525 }
526 }
527
528 backref_cache[recno] = dd;
529 for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
530 backref_cache[0] = recno;
531 }
532
533 if (dd < d) d = dd;
534 if (d <= 0) break; /* No point looking at any more */
535 slot += re->name_entry_size;
536 }
537 }
538 else d = 0;
539 cc += 1 + 2*IMM2_SIZE;
540 goto REPEAT_BACK_REFERENCE;
541
542 /* Single back reference by number. References by name are converted to by
543 number when there is no duplication. */
544
545 case OP_REF:
546 case OP_REFI:
547 recno = GET2(cc, 1);
548 if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
549 d = backref_cache[recno];
550 else
551 {
552 int i;
553 d = 0;
554
555 if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
556 {
557 ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
558 if (cs == NULL) return -2;
559 do ce += GET(ce, 1); while (*ce == OP_ALT);
560
561 if (!dupcapused ||
562 (PCRE2_UCHAR *)PRIV(find_bracket)(ce, utf, recno) == NULL)
563 {
564 if (cc > cs && cc < ce) /* Simple recursion */
565 {
566 had_recurse = TRUE;
567 }
568 else
569 {
570 recurse_check *r = recurses;
571 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
572 if (r != NULL) /* Mutual recursion */
573 {
574 had_recurse = TRUE;
575 }
576 else /* No recursion */
577 {
578 this_recurse.prev = recurses;
579 this_recurse.group = cs;
580 d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,
581 backref_cache);
582 if (d < 0) return d;
583 }
584 }
585 }
586 }
587
588 backref_cache[recno] = d;
589 for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
590 backref_cache[0] = recno;
591 }
592
593 cc += 1 + IMM2_SIZE;
594
595 /* Handle repeated back references */
596
597 REPEAT_BACK_REFERENCE:
598 switch (*cc)
599 {
600 case OP_CRSTAR:
601 case OP_CRMINSTAR:
602 case OP_CRQUERY:
603 case OP_CRMINQUERY:
604 case OP_CRPOSSTAR:
605 case OP_CRPOSQUERY:
606 min = 0;
607 cc++;
608 break;
609
610 case OP_CRPLUS:
611 case OP_CRMINPLUS:
612 case OP_CRPOSPLUS:
613 min = 1;
614 cc++;
615 break;
616
617 case OP_CRRANGE:
618 case OP_CRMINRANGE:
619 case OP_CRPOSRANGE:
620 min = GET2(cc, 1);
621 cc += 1 + 2 * IMM2_SIZE;
622 break;
623
624 default:
625 min = 1;
626 break;
627 }
628
629 /* Take care not to overflow: (1) min and d are ints, so check that their
630 product is not greater than INT_MAX. (2) branchlength is limited to
631 UINT16_MAX (checked at the top of the loop). */
632
633 if ((d > 0 && (INT_MAX/d) < min) || UINT16_MAX - branchlength < min*d)
634 branchlength = UINT16_MAX;
635 else branchlength += min * d;
636 break;
637
638 /* Recursion always refers to the first occurrence of a subpattern with a
639 given number. Therefore, we can always make use of caching, even when the
640 pattern contains multiple subpatterns with the same number. */
641
642 case OP_RECURSE:
643 cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1);
644 recno = GET2(cs, 1+LINK_SIZE);
645 if (recno == prev_recurse_recno)
646 {
647 branchlength += prev_recurse_d;
648 }
649 else
650 {
651 do ce += GET(ce, 1); while (*ce == OP_ALT);
652 if (cc > cs && cc < ce) /* Simple recursion */
653 had_recurse = TRUE;
654 else
655 {
656 recurse_check *r = recurses;
657 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
658 if (r != NULL) /* Mutual recursion */
659 had_recurse = TRUE;
660 else
661 {
662 this_recurse.prev = recurses;
663 this_recurse.group = cs;
664 prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,
665 countptr, backref_cache);
666 if (prev_recurse_d < 0) return prev_recurse_d;
667 prev_recurse_recno = recno;
668 branchlength += prev_recurse_d;
669 }
670 }
671 }
672 cc += 1 + LINK_SIZE + once_fudge;
673 once_fudge = 0;
674 break;
675
676 /* Anything else does not or need not match a character. We can get the
677 item's length from the table, but for those that can match zero occurrences
678 of a character, we must take special action for UTF-8 characters. As it
679 happens, the "NOT" versions of these opcodes are used at present only for
680 ASCII characters, so they could be omitted from this list. However, in
681 future that may change, so we include them here so as not to leave a
682 gotcha for a future maintainer. */
683
684 case OP_UPTO:
685 case OP_UPTOI:
686 case OP_NOTUPTO:
687 case OP_NOTUPTOI:
688 case OP_MINUPTO:
689 case OP_MINUPTOI:
690 case OP_NOTMINUPTO:
691 case OP_NOTMINUPTOI:
692 case OP_POSUPTO:
693 case OP_POSUPTOI:
694 case OP_NOTPOSUPTO:
695 case OP_NOTPOSUPTOI:
696
697 case OP_STAR:
698 case OP_STARI:
699 case OP_NOTSTAR:
700 case OP_NOTSTARI:
701 case OP_MINSTAR:
702 case OP_MINSTARI:
703 case OP_NOTMINSTAR:
704 case OP_NOTMINSTARI:
705 case OP_POSSTAR:
706 case OP_POSSTARI:
707 case OP_NOTPOSSTAR:
708 case OP_NOTPOSSTARI:
709
710 case OP_QUERY:
711 case OP_QUERYI:
712 case OP_NOTQUERY:
713 case OP_NOTQUERYI:
714 case OP_MINQUERY:
715 case OP_MINQUERYI:
716 case OP_NOTMINQUERY:
717 case OP_NOTMINQUERYI:
718 case OP_POSQUERY:
719 case OP_POSQUERYI:
720 case OP_NOTPOSQUERY:
721 case OP_NOTPOSQUERYI:
722
723 cc += PRIV(OP_lengths)[op];
724 #ifdef SUPPORT_UNICODE
725 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
726 #endif
727 break;
728
729 /* Skip these, but we need to add in the name length. */
730
731 case OP_MARK:
732 case OP_COMMIT_ARG:
733 case OP_PRUNE_ARG:
734 case OP_SKIP_ARG:
735 case OP_THEN_ARG:
736 cc += PRIV(OP_lengths)[op] + cc[1];
737 break;
738
739 /* The remaining opcodes are just skipped over. */
740
741 case OP_CLOSE:
742 case OP_COMMIT:
743 case OP_FAIL:
744 case OP_PRUNE:
745 case OP_SET_SOM:
746 case OP_SKIP:
747 case OP_THEN:
748 cc += PRIV(OP_lengths)[op];
749 break;
750
751 /* This should not occur: we list all opcodes explicitly so that when
752 new ones get added they are properly considered. */
753
754 default:
755 return -3;
756 }
757 }
758 /* Control never gets here */
759 }
760
761
762
763 /*************************************************
764 * Set a bit and maybe its alternate case *
765 *************************************************/
766
767 /* Given a character, set its first code unit's bit in the table, and also the
768 corresponding bit for the other version of a letter if we are caseless.
769
770 Arguments:
771 re points to the regex block
772 p points to the first code unit of the character
773 caseless TRUE if caseless
774 utf TRUE for UTF mode
775 ucp TRUE for UCP mode
776
777 Returns: pointer after the character
778 */
779
780 static PCRE2_SPTR
set_table_bit(pcre2_real_code * re,PCRE2_SPTR p,BOOL caseless,BOOL utf,BOOL ucp)781 set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf,
782 BOOL ucp)
783 {
784 uint32_t c = *p++; /* First code unit */
785
786 (void)utf; /* Stop compiler warnings when UTF not supported */
787 (void)ucp;
788
789 /* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
790 0xff. */
791
792 #if PCRE2_CODE_UNIT_WIDTH != 8
793 if (c > 0xff) SET_BIT(0xff); else
794 #endif
795
796 SET_BIT(c);
797
798 /* In UTF-8 or UTF-16 mode, pick up the remaining code units in order to find
799 the end of the character, even when caseless. */
800
801 #ifdef SUPPORT_UNICODE
802 if (utf)
803 {
804 #if PCRE2_CODE_UNIT_WIDTH == 8
805 if (c >= 0xc0) GETUTF8INC(c, p);
806 #elif PCRE2_CODE_UNIT_WIDTH == 16
807 if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, p);
808 #endif
809 }
810 #endif /* SUPPORT_UNICODE */
811
812 /* If caseless, handle the other case of the character. */
813
814 if (caseless)
815 {
816 #ifdef SUPPORT_UNICODE
817 if (utf || ucp)
818 {
819 c = UCD_OTHERCASE(c);
820 #if PCRE2_CODE_UNIT_WIDTH == 8
821 if (utf)
822 {
823 PCRE2_UCHAR buff[6];
824 (void)PRIV(ord2utf)(c, buff);
825 SET_BIT(buff[0]);
826 }
827 else if (c < 256) SET_BIT(c);
828 #else /* 16-bit or 32-bit mode */
829 if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
830 #endif
831 }
832
833 else
834 #endif /* SUPPORT_UNICODE */
835
836 /* Not UTF or UCP */
837
838 if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
839 }
840
841 return p;
842 }
843
844
845
846 /*************************************************
847 * Set bits for a positive character type *
848 *************************************************/
849
850 /* This function sets starting bits for a character type. In UTF-8 mode, we can
851 only do a direct setting for bytes less than 128, as otherwise there can be
852 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
853 environment, the tables will only recognize ASCII characters anyway, but in at
854 least one Windows environment, some higher bytes bits were set in the tables.
855 So we deal with that case by considering the UTF-8 encoding.
856
857 Arguments:
858 re the regex block
859 cbit type the type of character wanted
860 table_limit 32 for non-UTF-8; 16 for UTF-8
861
862 Returns: nothing
863 */
864
865 static void
set_type_bits(pcre2_real_code * re,int cbit_type,unsigned int table_limit)866 set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
867 {
868 uint32_t c;
869 for (c = 0; c < table_limit; c++)
870 re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type];
871 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
872 if (table_limit == 32) return;
873 for (c = 128; c < 256; c++)
874 {
875 if ((re->tables[cbits_offset + c/8] & (1u << (c&7))) != 0)
876 {
877 PCRE2_UCHAR buff[6];
878 (void)PRIV(ord2utf)(c, buff);
879 SET_BIT(buff[0]);
880 }
881 }
882 #endif /* UTF-8 */
883 }
884
885
886 /*************************************************
887 * Set bits for a negative character type *
888 *************************************************/
889
890 /* This function sets starting bits for a negative character type such as \D.
891 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
892 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
893 Unlike in the positive case, where we can set appropriate starting bits for
894 specific high-valued UTF-8 characters, in this case we have to set the bits for
895 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
896 0xc0 (192) for simplicity.
897
898 Arguments:
899 re the regex block
900 cbit type the type of character wanted
901 table_limit 32 for non-UTF-8; 16 for UTF-8
902
903 Returns: nothing
904 */
905
906 static void
set_nottype_bits(pcre2_real_code * re,int cbit_type,unsigned int table_limit)907 set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
908 {
909 uint32_t c;
910 for (c = 0; c < table_limit; c++)
911 re->start_bitmap[c] |= ~(re->tables[c+cbits_offset+cbit_type]);
912 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
913 if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff;
914 #endif
915 }
916
917
918
919 /*************************************************
920 * Create bitmap of starting code units *
921 *************************************************/
922
923 /* This function scans a compiled unanchored expression recursively and
924 attempts to build a bitmap of the set of possible starting code units whose
925 values are less than 256. In 16-bit and 32-bit mode, values above 255 all cause
926 the 255 bit to be set. When calling set[_not]_type_bits() in UTF-8 (sic) mode
927 we pass a value of 16 rather than 32 as the final argument. (See comments in
928 those functions for the reason.)
929
930 The SSB_CONTINUE return is useful for parenthesized groups in patterns such as
931 (a*)b where the group provides some optional starting code units but scanning
932 must continue at the outer level to find at least one mandatory code unit. At
933 the outermost level, this function fails unless the result is SSB_DONE.
934
935 We restrict recursion (for nested groups) to 1000 to avoid stack overflow
936 issues.
937
938 Arguments:
939 re points to the compiled regex block
940 code points to an expression
941 utf TRUE if in UTF mode
942 ucp TRUE if in UCP mode
943 depthptr pointer to recurse depth
944
945 Returns: SSB_FAIL => Failed to find any starting code units
946 SSB_DONE => Found mandatory starting code units
947 SSB_CONTINUE => Found optional starting code units
948 SSB_UNKNOWN => Hit an unrecognized opcode
949 SSB_TOODEEP => Recursion is too deep
950 */
951
952 static int
set_start_bits(pcre2_real_code * re,PCRE2_SPTR code,BOOL utf,BOOL ucp,int * depthptr)953 set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, BOOL ucp,
954 int *depthptr)
955 {
956 uint32_t c;
957 int yield = SSB_DONE;
958
959 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
960 int table_limit = utf? 16:32;
961 #else
962 int table_limit = 32;
963 #endif
964
965 *depthptr += 1;
966 if (*depthptr > 1000) return SSB_TOODEEP;
967
968 do
969 {
970 BOOL try_next = TRUE;
971 PCRE2_SPTR tcode = code + 1 + LINK_SIZE;
972
973 if (*code == OP_CBRA || *code == OP_SCBRA ||
974 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
975
976 while (try_next) /* Loop for items in this branch */
977 {
978 int rc;
979 uint8_t *classmap = NULL;
980 #ifdef SUPPORT_WIDE_CHARS
981 PCRE2_UCHAR xclassflags;
982 #endif
983
984 switch(*tcode)
985 {
986 /* If we reach something we don't understand, it means a new opcode has
987 been created that hasn't been added to this function. Hopefully this
988 problem will be discovered during testing. */
989
990 default:
991 return SSB_UNKNOWN;
992
993 /* Fail for a valid opcode that implies no starting bits. */
994
995 case OP_ACCEPT:
996 case OP_ASSERT_ACCEPT:
997 case OP_ALLANY:
998 case OP_ANY:
999 case OP_ANYBYTE:
1000 case OP_CIRCM:
1001 case OP_CLOSE:
1002 case OP_COMMIT:
1003 case OP_COMMIT_ARG:
1004 case OP_COND:
1005 case OP_CREF:
1006 case OP_FALSE:
1007 case OP_TRUE:
1008 case OP_DNCREF:
1009 case OP_DNREF:
1010 case OP_DNREFI:
1011 case OP_DNRREF:
1012 case OP_DOLL:
1013 case OP_DOLLM:
1014 case OP_END:
1015 case OP_EOD:
1016 case OP_EODN:
1017 case OP_EXTUNI:
1018 case OP_FAIL:
1019 case OP_MARK:
1020 case OP_NOT:
1021 case OP_NOTEXACT:
1022 case OP_NOTEXACTI:
1023 case OP_NOTI:
1024 case OP_NOTMINPLUS:
1025 case OP_NOTMINPLUSI:
1026 case OP_NOTMINQUERY:
1027 case OP_NOTMINQUERYI:
1028 case OP_NOTMINSTAR:
1029 case OP_NOTMINSTARI:
1030 case OP_NOTMINUPTO:
1031 case OP_NOTMINUPTOI:
1032 case OP_NOTPLUS:
1033 case OP_NOTPLUSI:
1034 case OP_NOTPOSPLUS:
1035 case OP_NOTPOSPLUSI:
1036 case OP_NOTPOSQUERY:
1037 case OP_NOTPOSQUERYI:
1038 case OP_NOTPOSSTAR:
1039 case OP_NOTPOSSTARI:
1040 case OP_NOTPOSUPTO:
1041 case OP_NOTPOSUPTOI:
1042 case OP_NOTPROP:
1043 case OP_NOTQUERY:
1044 case OP_NOTQUERYI:
1045 case OP_NOTSTAR:
1046 case OP_NOTSTARI:
1047 case OP_NOTUPTO:
1048 case OP_NOTUPTOI:
1049 case OP_NOT_HSPACE:
1050 case OP_NOT_VSPACE:
1051 case OP_PRUNE:
1052 case OP_PRUNE_ARG:
1053 case OP_RECURSE:
1054 case OP_REF:
1055 case OP_REFI:
1056 case OP_REVERSE:
1057 case OP_RREF:
1058 case OP_SCOND:
1059 case OP_SET_SOM:
1060 case OP_SKIP:
1061 case OP_SKIP_ARG:
1062 case OP_SOD:
1063 case OP_SOM:
1064 case OP_THEN:
1065 case OP_THEN_ARG:
1066 return SSB_FAIL;
1067
1068 /* OP_CIRC happens only at the start of an anchored branch (multiline ^
1069 uses OP_CIRCM). Skip over it. */
1070
1071 case OP_CIRC:
1072 tcode += PRIV(OP_lengths)[OP_CIRC];
1073 break;
1074
1075 /* A "real" property test implies no starting bits, but the fake property
1076 PT_CLIST identifies a list of characters. These lists are short, as they
1077 are used for characters with more than one "other case", so there is no
1078 point in recognizing them for OP_NOTPROP. */
1079
1080 case OP_PROP:
1081 if (tcode[1] != PT_CLIST) return SSB_FAIL;
1082 {
1083 const uint32_t *p = PRIV(ucd_caseless_sets) + tcode[2];
1084 while ((c = *p++) < NOTACHAR)
1085 {
1086 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1087 if (utf)
1088 {
1089 PCRE2_UCHAR buff[6];
1090 (void)PRIV(ord2utf)(c, buff);
1091 c = buff[0];
1092 }
1093 #endif
1094 if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
1095 }
1096 }
1097 try_next = FALSE;
1098 break;
1099
1100 /* We can ignore word boundary tests. */
1101
1102 case OP_WORD_BOUNDARY:
1103 case OP_NOT_WORD_BOUNDARY:
1104 tcode++;
1105 break;
1106
1107 /* If we hit a bracket or a positive lookahead assertion, recurse to set
1108 bits from within the subpattern. If it can't find anything, we have to
1109 give up. If it finds some mandatory character(s), we are done for this
1110 branch. Otherwise, carry on scanning after the subpattern. */
1111
1112 case OP_BRA:
1113 case OP_SBRA:
1114 case OP_CBRA:
1115 case OP_SCBRA:
1116 case OP_BRAPOS:
1117 case OP_SBRAPOS:
1118 case OP_CBRAPOS:
1119 case OP_SCBRAPOS:
1120 case OP_ONCE:
1121 case OP_SCRIPT_RUN:
1122 case OP_ASSERT:
1123 case OP_ASSERT_NA:
1124 rc = set_start_bits(re, tcode, utf, ucp, depthptr);
1125 if (rc == SSB_DONE)
1126 {
1127 try_next = FALSE;
1128 }
1129 else if (rc == SSB_CONTINUE)
1130 {
1131 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1132 tcode += 1 + LINK_SIZE;
1133 }
1134 else return rc; /* FAIL, UNKNOWN, or TOODEEP */
1135 break;
1136
1137 /* If we hit ALT or KET, it means we haven't found anything mandatory in
1138 this branch, though we might have found something optional. For ALT, we
1139 continue with the next alternative, but we have to arrange that the final
1140 result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
1141 return SSB_CONTINUE: if this is the top level, that indicates failure,
1142 but after a nested subpattern, it causes scanning to continue. */
1143
1144 case OP_ALT:
1145 yield = SSB_CONTINUE;
1146 try_next = FALSE;
1147 break;
1148
1149 case OP_KET:
1150 case OP_KETRMAX:
1151 case OP_KETRMIN:
1152 case OP_KETRPOS:
1153 return SSB_CONTINUE;
1154
1155 /* Skip over callout */
1156
1157 case OP_CALLOUT:
1158 tcode += PRIV(OP_lengths)[OP_CALLOUT];
1159 break;
1160
1161 case OP_CALLOUT_STR:
1162 tcode += GET(tcode, 1 + 2*LINK_SIZE);
1163 break;
1164
1165 /* Skip over lookbehind and negative lookahead assertions */
1166
1167 case OP_ASSERT_NOT:
1168 case OP_ASSERTBACK:
1169 case OP_ASSERTBACK_NOT:
1170 case OP_ASSERTBACK_NA:
1171 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1172 tcode += 1 + LINK_SIZE;
1173 break;
1174
1175 /* BRAZERO does the bracket, but carries on. */
1176
1177 case OP_BRAZERO:
1178 case OP_BRAMINZERO:
1179 case OP_BRAPOSZERO:
1180 rc = set_start_bits(re, ++tcode, utf, ucp, depthptr);
1181 if (rc == SSB_FAIL || rc == SSB_UNKNOWN || rc == SSB_TOODEEP) return rc;
1182 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1183 tcode += 1 + LINK_SIZE;
1184 break;
1185
1186 /* SKIPZERO skips the bracket. */
1187
1188 case OP_SKIPZERO:
1189 tcode++;
1190 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1191 tcode += 1 + LINK_SIZE;
1192 break;
1193
1194 /* Single-char * or ? sets the bit and tries the next item */
1195
1196 case OP_STAR:
1197 case OP_MINSTAR:
1198 case OP_POSSTAR:
1199 case OP_QUERY:
1200 case OP_MINQUERY:
1201 case OP_POSQUERY:
1202 tcode = set_table_bit(re, tcode + 1, FALSE, utf, ucp);
1203 break;
1204
1205 case OP_STARI:
1206 case OP_MINSTARI:
1207 case OP_POSSTARI:
1208 case OP_QUERYI:
1209 case OP_MINQUERYI:
1210 case OP_POSQUERYI:
1211 tcode = set_table_bit(re, tcode + 1, TRUE, utf, ucp);
1212 break;
1213
1214 /* Single-char upto sets the bit and tries the next */
1215
1216 case OP_UPTO:
1217 case OP_MINUPTO:
1218 case OP_POSUPTO:
1219 tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf, ucp);
1220 break;
1221
1222 case OP_UPTOI:
1223 case OP_MINUPTOI:
1224 case OP_POSUPTOI:
1225 tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf, ucp);
1226 break;
1227
1228 /* At least one single char sets the bit and stops */
1229
1230 case OP_EXACT:
1231 tcode += IMM2_SIZE;
1232 /* Fall through */
1233 case OP_CHAR:
1234 case OP_PLUS:
1235 case OP_MINPLUS:
1236 case OP_POSPLUS:
1237 (void)set_table_bit(re, tcode + 1, FALSE, utf, ucp);
1238 try_next = FALSE;
1239 break;
1240
1241 case OP_EXACTI:
1242 tcode += IMM2_SIZE;
1243 /* Fall through */
1244 case OP_CHARI:
1245 case OP_PLUSI:
1246 case OP_MINPLUSI:
1247 case OP_POSPLUSI:
1248 (void)set_table_bit(re, tcode + 1, TRUE, utf, ucp);
1249 try_next = FALSE;
1250 break;
1251
1252 /* Special spacing and line-terminating items. These recognize specific
1253 lists of characters. The difference between VSPACE and ANYNL is that the
1254 latter can match the two-character CRLF sequence, but that is not
1255 relevant for finding the first character, so their code here is
1256 identical. */
1257
1258 case OP_HSPACE:
1259 SET_BIT(CHAR_HT);
1260 SET_BIT(CHAR_SPACE);
1261
1262 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1263 the bits for 0xA0 and for code units >= 255, independently of UTF. */
1264
1265 #if PCRE2_CODE_UNIT_WIDTH != 8
1266 SET_BIT(0xA0);
1267 SET_BIT(0xFF);
1268 #else
1269 /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1270 units of horizontal space characters. */
1271
1272 #ifdef SUPPORT_UNICODE
1273 if (utf)
1274 {
1275 SET_BIT(0xC2); /* For U+00A0 */
1276 SET_BIT(0xE1); /* For U+1680, U+180E */
1277 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1278 SET_BIT(0xE3); /* For U+3000 */
1279 }
1280 else
1281 #endif
1282 /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1283 the code is EBCDIC. */
1284 {
1285 #ifndef EBCDIC
1286 SET_BIT(0xA0);
1287 #endif /* Not EBCDIC */
1288 }
1289 #endif /* 8-bit support */
1290
1291 try_next = FALSE;
1292 break;
1293
1294 case OP_ANYNL:
1295 case OP_VSPACE:
1296 SET_BIT(CHAR_LF);
1297 SET_BIT(CHAR_VT);
1298 SET_BIT(CHAR_FF);
1299 SET_BIT(CHAR_CR);
1300
1301 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1302 the bits for NEL and for code units >= 255, independently of UTF. */
1303
1304 #if PCRE2_CODE_UNIT_WIDTH != 8
1305 SET_BIT(CHAR_NEL);
1306 SET_BIT(0xFF);
1307 #else
1308 /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1309 units of vertical space characters. */
1310
1311 #ifdef SUPPORT_UNICODE
1312 if (utf)
1313 {
1314 SET_BIT(0xC2); /* For U+0085 (NEL) */
1315 SET_BIT(0xE2); /* For U+2028, U+2029 */
1316 }
1317 else
1318 #endif
1319 /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1320 {
1321 SET_BIT(CHAR_NEL);
1322 }
1323 #endif /* 8-bit support */
1324
1325 try_next = FALSE;
1326 break;
1327
1328 /* Single character types set the bits and stop. Note that if PCRE2_UCP
1329 is set, we do not see these opcodes because \d etc are converted to
1330 properties. Therefore, these apply in the case when only characters less
1331 than 256 are recognized to match the types. */
1332
1333 case OP_NOT_DIGIT:
1334 set_nottype_bits(re, cbit_digit, table_limit);
1335 try_next = FALSE;
1336 break;
1337
1338 case OP_DIGIT:
1339 set_type_bits(re, cbit_digit, table_limit);
1340 try_next = FALSE;
1341 break;
1342
1343 case OP_NOT_WHITESPACE:
1344 set_nottype_bits(re, cbit_space, table_limit);
1345 try_next = FALSE;
1346 break;
1347
1348 case OP_WHITESPACE:
1349 set_type_bits(re, cbit_space, table_limit);
1350 try_next = FALSE;
1351 break;
1352
1353 case OP_NOT_WORDCHAR:
1354 set_nottype_bits(re, cbit_word, table_limit);
1355 try_next = FALSE;
1356 break;
1357
1358 case OP_WORDCHAR:
1359 set_type_bits(re, cbit_word, table_limit);
1360 try_next = FALSE;
1361 break;
1362
1363 /* One or more character type fudges the pointer and restarts, knowing
1364 it will hit a single character type and stop there. */
1365
1366 case OP_TYPEPLUS:
1367 case OP_TYPEMINPLUS:
1368 case OP_TYPEPOSPLUS:
1369 tcode++;
1370 break;
1371
1372 case OP_TYPEEXACT:
1373 tcode += 1 + IMM2_SIZE;
1374 break;
1375
1376 /* Zero or more repeats of character types set the bits and then
1377 try again. */
1378
1379 case OP_TYPEUPTO:
1380 case OP_TYPEMINUPTO:
1381 case OP_TYPEPOSUPTO:
1382 tcode += IMM2_SIZE; /* Fall through */
1383
1384 case OP_TYPESTAR:
1385 case OP_TYPEMINSTAR:
1386 case OP_TYPEPOSSTAR:
1387 case OP_TYPEQUERY:
1388 case OP_TYPEMINQUERY:
1389 case OP_TYPEPOSQUERY:
1390 switch(tcode[1])
1391 {
1392 default:
1393 case OP_ANY:
1394 case OP_ALLANY:
1395 return SSB_FAIL;
1396
1397 case OP_HSPACE:
1398 SET_BIT(CHAR_HT);
1399 SET_BIT(CHAR_SPACE);
1400
1401 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1402 the bits for 0xA0 and for code units >= 255, independently of UTF. */
1403
1404 #if PCRE2_CODE_UNIT_WIDTH != 8
1405 SET_BIT(0xA0);
1406 SET_BIT(0xFF);
1407 #else
1408 /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1409 units of horizontal space characters. */
1410
1411 #ifdef SUPPORT_UNICODE
1412 if (utf)
1413 {
1414 SET_BIT(0xC2); /* For U+00A0 */
1415 SET_BIT(0xE1); /* For U+1680, U+180E */
1416 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1417 SET_BIT(0xE3); /* For U+3000 */
1418 }
1419 else
1420 #endif
1421 /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1422 the code is EBCDIC. */
1423 {
1424 #ifndef EBCDIC
1425 SET_BIT(0xA0);
1426 #endif /* Not EBCDIC */
1427 }
1428 #endif /* 8-bit support */
1429 break;
1430
1431 case OP_ANYNL:
1432 case OP_VSPACE:
1433 SET_BIT(CHAR_LF);
1434 SET_BIT(CHAR_VT);
1435 SET_BIT(CHAR_FF);
1436 SET_BIT(CHAR_CR);
1437
1438 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1439 the bits for NEL and for code units >= 255, independently of UTF. */
1440
1441 #if PCRE2_CODE_UNIT_WIDTH != 8
1442 SET_BIT(CHAR_NEL);
1443 SET_BIT(0xFF);
1444 #else
1445 /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1446 units of vertical space characters. */
1447
1448 #ifdef SUPPORT_UNICODE
1449 if (utf)
1450 {
1451 SET_BIT(0xC2); /* For U+0085 (NEL) */
1452 SET_BIT(0xE2); /* For U+2028, U+2029 */
1453 }
1454 else
1455 #endif
1456 /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1457 {
1458 SET_BIT(CHAR_NEL);
1459 }
1460 #endif /* 8-bit support */
1461 break;
1462
1463 case OP_NOT_DIGIT:
1464 set_nottype_bits(re, cbit_digit, table_limit);
1465 break;
1466
1467 case OP_DIGIT:
1468 set_type_bits(re, cbit_digit, table_limit);
1469 break;
1470
1471 case OP_NOT_WHITESPACE:
1472 set_nottype_bits(re, cbit_space, table_limit);
1473 break;
1474
1475 case OP_WHITESPACE:
1476 set_type_bits(re, cbit_space, table_limit);
1477 break;
1478
1479 case OP_NOT_WORDCHAR:
1480 set_nottype_bits(re, cbit_word, table_limit);
1481 break;
1482
1483 case OP_WORDCHAR:
1484 set_type_bits(re, cbit_word, table_limit);
1485 break;
1486 }
1487
1488 tcode += 2;
1489 break;
1490
1491 /* Extended class: if there are any property checks, or if this is a
1492 negative XCLASS without a map, give up. If there are no property checks,
1493 there must be wide characters on the XCLASS list, because otherwise an
1494 XCLASS would not have been created. This means that code points >= 255
1495 are potential starters. In the UTF-8 case we can scan them and set bits
1496 for the relevant leading bytes. */
1497
1498 #ifdef SUPPORT_WIDE_CHARS
1499 case OP_XCLASS:
1500 xclassflags = tcode[1 + LINK_SIZE];
1501 if ((xclassflags & XCL_HASPROP) != 0 ||
1502 (xclassflags & (XCL_MAP|XCL_NOT)) == XCL_NOT)
1503 return SSB_FAIL;
1504
1505 /* We have a positive XCLASS or a negative one without a map. Set up the
1506 map pointer if there is one, and fall through. */
1507
1508 classmap = ((xclassflags & XCL_MAP) == 0)? NULL :
1509 (uint8_t *)(tcode + 1 + LINK_SIZE + 1);
1510
1511 /* In UTF-8 mode, scan the character list and set bits for leading bytes,
1512 then jump to handle the map. */
1513
1514 #if PCRE2_CODE_UNIT_WIDTH == 8
1515 if (utf && (xclassflags & XCL_NOT) == 0)
1516 {
1517 PCRE2_UCHAR b, e;
1518 PCRE2_SPTR p = tcode + 1 + LINK_SIZE + 1 + ((classmap == NULL)? 0:32);
1519 tcode += GET(tcode, 1);
1520
1521 for (;;) switch (*p++)
1522 {
1523 case XCL_SINGLE:
1524 b = *p++;
1525 while ((*p & 0xc0) == 0x80) p++;
1526 re->start_bitmap[b/8] |= (1u << (b&7));
1527 break;
1528
1529 case XCL_RANGE:
1530 b = *p++;
1531 while ((*p & 0xc0) == 0x80) p++;
1532 e = *p++;
1533 while ((*p & 0xc0) == 0x80) p++;
1534 for (; b <= e; b++)
1535 re->start_bitmap[b/8] |= (1u << (b&7));
1536 break;
1537
1538 case XCL_END:
1539 goto HANDLE_CLASSMAP;
1540
1541 default:
1542 return SSB_UNKNOWN; /* Internal error, should not occur */
1543 }
1544 }
1545 #endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */
1546 #endif /* SUPPORT_WIDE_CHARS */
1547
1548 /* It seems that the fall through comment must be outside the #ifdef if
1549 it is to avoid the gcc compiler warning. */
1550
1551 /* Fall through */
1552
1553 /* Enter here for a negative non-XCLASS. In the 8-bit library, if we are
1554 in UTF mode, any byte with a value >= 0xc4 is a potentially valid starter
1555 because it starts a character with a value > 255. In 8-bit non-UTF mode,
1556 there is no difference between CLASS and NCLASS. In all other wide
1557 character modes, set the 0xFF bit to indicate code units >= 255. */
1558
1559 case OP_NCLASS:
1560 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1561 if (utf)
1562 {
1563 re->start_bitmap[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
1564 memset(re->start_bitmap+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
1565 }
1566 #elif PCRE2_CODE_UNIT_WIDTH != 8
1567 SET_BIT(0xFF); /* For characters >= 255 */
1568 #endif
1569 /* Fall through */
1570
1571 /* Enter here for a positive non-XCLASS. If we have fallen through from
1572 an XCLASS, classmap will already be set; just advance the code pointer.
1573 Otherwise, set up classmap for a a non-XCLASS and advance past it. */
1574
1575 case OP_CLASS:
1576 if (*tcode == OP_XCLASS) tcode += GET(tcode, 1); else
1577 {
1578 classmap = (uint8_t *)(++tcode);
1579 tcode += 32 / sizeof(PCRE2_UCHAR);
1580 }
1581
1582 /* When wide characters are supported, classmap may be NULL. In UTF-8
1583 (sic) mode, the bits in a class bit map correspond to character values,
1584 not to byte values. However, the bit map we are constructing is for byte
1585 values. So we have to do a conversion for characters whose code point is
1586 greater than 127. In fact, there are only two possible starting bytes for
1587 characters in the range 128 - 255. */
1588
1589 #if defined SUPPORT_WIDE_CHARS && PCRE2_CODE_UNIT_WIDTH == 8
1590 HANDLE_CLASSMAP:
1591 #endif
1592 if (classmap != NULL)
1593 {
1594 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1595 if (utf)
1596 {
1597 for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c];
1598 for (c = 128; c < 256; c++)
1599 {
1600 if ((classmap[c/8] & (1u << (c&7))) != 0)
1601 {
1602 int d = (c >> 6) | 0xc0; /* Set bit for this starter */
1603 re->start_bitmap[d/8] |= (1u << (d&7)); /* and then skip on to the */
1604 c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
1605 }
1606 }
1607 }
1608 else
1609 #endif
1610 /* In all modes except UTF-8, the two bit maps are compatible. */
1611
1612 {
1613 for (c = 0; c < 32; c++) re->start_bitmap[c] |= classmap[c];
1614 }
1615 }
1616
1617 /* Act on what follows the class. For a zero minimum repeat, continue;
1618 otherwise stop processing. */
1619
1620 switch (*tcode)
1621 {
1622 case OP_CRSTAR:
1623 case OP_CRMINSTAR:
1624 case OP_CRQUERY:
1625 case OP_CRMINQUERY:
1626 case OP_CRPOSSTAR:
1627 case OP_CRPOSQUERY:
1628 tcode++;
1629 break;
1630
1631 case OP_CRRANGE:
1632 case OP_CRMINRANGE:
1633 case OP_CRPOSRANGE:
1634 if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1635 else try_next = FALSE;
1636 break;
1637
1638 default:
1639 try_next = FALSE;
1640 break;
1641 }
1642 break; /* End of class handling case */
1643 } /* End of switch for opcodes */
1644 } /* End of try_next loop */
1645
1646 code += GET(code, 1); /* Advance to next branch */
1647 }
1648 while (*code == OP_ALT);
1649
1650 return yield;
1651 }
1652
1653
1654
1655 /*************************************************
1656 * Study a compiled expression *
1657 *************************************************/
1658
1659 /* This function is handed a compiled expression that it must study to produce
1660 information that will speed up the matching.
1661
1662 Argument:
1663 re points to the compiled expression
1664
1665 Returns: 0 normally; non-zero should never normally occur
1666 1 unknown opcode in set_start_bits
1667 2 missing capturing bracket
1668 3 unknown opcode in find_minlength
1669 */
1670
1671 int
PRIV(study)1672 PRIV(study)(pcre2_real_code *re)
1673 {
1674 int count = 0;
1675 PCRE2_UCHAR *code;
1676 BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
1677 BOOL ucp = (re->overall_options & PCRE2_UCP) != 0;
1678
1679 /* Find start of compiled code */
1680
1681 code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
1682 re->name_entry_size * re->name_count;
1683
1684 /* For a pattern that has a first code unit, or a multiline pattern that
1685 matches only at "line start", there is no point in seeking a list of starting
1686 code units. */
1687
1688 if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
1689 {
1690 int depth = 0;
1691 int rc = set_start_bits(re, code, utf, ucp, &depth);
1692 if (rc == SSB_UNKNOWN) return 1;
1693
1694 /* If a list of starting code units was set up, scan the list to see if only
1695 one or two were listed. Having only one listed is rare because usually a
1696 single starting code unit will have been recognized and PCRE2_FIRSTSET set.
1697 If two are listed, see if they are caseless versions of the same character;
1698 if so we can replace the list with a caseless first code unit. This gives
1699 better performance and is plausibly worth doing for patterns such as [Ww]ord
1700 or (word|WORD). */
1701
1702 if (rc == SSB_DONE)
1703 {
1704 int i;
1705 int a = -1;
1706 int b = -1;
1707 uint8_t *p = re->start_bitmap;
1708 uint32_t flags = PCRE2_FIRSTMAPSET;
1709
1710 for (i = 0; i < 256; p++, i += 8)
1711 {
1712 uint8_t x = *p;
1713 if (x != 0)
1714 {
1715 int c;
1716 uint8_t y = x & (~x + 1); /* Least significant bit */
1717 if (y != x) goto DONE; /* More than one bit set */
1718
1719 /* In the 16-bit and 32-bit libraries, the bit for 0xff means "0xff and
1720 all wide characters", so we cannot use it here. */
1721
1722 #if PCRE2_CODE_UNIT_WIDTH != 8
1723 if (i == 248 && x == 0x80) goto DONE;
1724 #endif
1725
1726 /* Compute the character value */
1727
1728 c = i;
1729 switch (x)
1730 {
1731 case 1: break;
1732 case 2: c += 1; break; case 4: c += 2; break;
1733 case 8: c += 3; break; case 16: c += 4; break;
1734 case 32: c += 5; break; case 64: c += 6; break;
1735 case 128: c += 7; break;
1736 }
1737
1738 /* c contains the code unit value, in the range 0-255. In 8-bit UTF
1739 mode, only values < 128 can be used. In all the other cases, c is a
1740 character value. */
1741
1742 #if PCRE2_CODE_UNIT_WIDTH == 8
1743 if (utf && c > 127) goto DONE;
1744 #endif
1745 if (a < 0) a = c; /* First one found, save in a */
1746 else if (b < 0) /* Second one found */
1747 {
1748 int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c);
1749
1750 #ifdef SUPPORT_UNICODE
1751 if (utf || ucp)
1752 {
1753 if (UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */
1754 if (c > 127) d = UCD_OTHERCASE(c);
1755 }
1756 #endif /* SUPPORT_UNICODE */
1757
1758 if (d != a) goto DONE; /* Not the other case of a */
1759 b = c; /* Save second in b */
1760 }
1761 else goto DONE; /* More than two characters found */
1762 }
1763 }
1764
1765 /* Replace the start code unit bits with a first code unit, but only if it
1766 is not the same as a required later code unit. This is because a search for
1767 a required code unit starts after an explicit first code unit, but at a
1768 code unit found from the bitmap. Patterns such as /a*a/ don't work
1769 if both the start unit and required unit are the same. */
1770
1771 if (a >= 0 &&
1772 (
1773 (re->flags & PCRE2_LASTSET) == 0 ||
1774 (
1775 re->last_codeunit != (uint32_t)a &&
1776 (b < 0 || re->last_codeunit != (uint32_t)b)
1777 )
1778 ))
1779 {
1780 re->first_codeunit = a;
1781 flags = PCRE2_FIRSTSET;
1782 if (b >= 0) flags |= PCRE2_FIRSTCASELESS;
1783 }
1784
1785 DONE:
1786 re->flags |= flags;
1787 }
1788 }
1789
1790 /* Find the minimum length of subject string. If the pattern can match an empty
1791 string, the minimum length is already known. If the pattern contains (*ACCEPT)
1792 all bets are off, and we don't even try to find a minimum length. If there are
1793 more back references than the size of the vector we are going to cache them in,
1794 do nothing. A pattern that complicated will probably take a long time to
1795 analyze and may in any case turn out to be too complicated. Note that back
1796 reference minima are held as 16-bit numbers. */
1797
1798 if ((re->flags & (PCRE2_MATCH_EMPTY|PCRE2_HASACCEPT)) == 0 &&
1799 re->top_backref <= MAX_CACHE_BACKREF)
1800 {
1801 int min;
1802 int backref_cache[MAX_CACHE_BACKREF+1];
1803 backref_cache[0] = 0; /* Highest one that is set */
1804 min = find_minlength(re, code, code, utf, NULL, &count, backref_cache);
1805 switch(min)
1806 {
1807 case -1: /* \C in UTF mode or over-complex regex */
1808 break; /* Leave minlength unchanged (will be zero) */
1809
1810 case -2:
1811 return 2; /* missing capturing bracket */
1812
1813 case -3:
1814 return 3; /* unrecognized opcode */
1815
1816 default:
1817 re->minlength = (min > UINT16_MAX)? UINT16_MAX : min;
1818 break;
1819 }
1820 }
1821
1822 return 0;
1823 }
1824
1825 /* End of pcre2_study.c */
1826