xref: /PHP-7.4/ext/pcre/pcre2lib/pcre2_study.c (revision 9f2d0395)
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10           New API code Copyright (c) 2016-2020 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 /* This module contains functions for scanning a compiled pattern and
42 collecting data (e.g. minimum matching length). */
43 
44 
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48 
49 #include "pcre2_internal.h"
50 
51 /* The maximum remembered capturing brackets minimum. */
52 
53 #define MAX_CACHE_BACKREF 128
54 
55 /* Set a bit in the starting code unit bit map. */
56 
57 #define SET_BIT(c) re->start_bitmap[(c)/8] |= (1u << ((c)&7))
58 
59 /* Returns from set_start_bits() */
60 
61 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN, SSB_TOODEEP };
62 
63 
64 /*************************************************
65 *   Find the minimum subject length for a group  *
66 *************************************************/
67 
68 /* Scan a parenthesized group and compute the minimum length of subject that
69 is needed to match it. This is a lower bound; it does not mean there is a
70 string of that length that matches. In UTF mode, the result is in characters
71 rather than code units. The field in a compiled pattern for storing the minimum
72 length is 16-bits long (on the grounds that anything longer than that is
73 pathological), so we give up when we reach that amount. This also means that
74 integer overflow for really crazy patterns cannot happen.
75 
76 Backreference minimum lengths are cached to speed up multiple references. This
77 function is called only when the highest back reference in the pattern is less
78 than or equal to MAX_CACHE_BACKREF, which is one less than the size of the
79 caching vector. The zeroth element contains the number of the highest set
80 value.
81 
82 Arguments:
83   re              compiled pattern block
84   code            pointer to start of group (the bracket)
85   startcode       pointer to start of the whole pattern's code
86   utf             UTF flag
87   recurses        chain of recurse_check to catch mutual recursion
88   countptr        pointer to call count (to catch over complexity)
89   backref_cache   vector for caching back references.
90 
91 This function is no longer called when the pattern contains (*ACCEPT); however,
92 the old code for returning -1 is retained, just in case.
93 
94 Returns:   the minimum length
95            -1 \C in UTF-8 mode
96               or (*ACCEPT)
97               or pattern too complicated
98            -2 internal error (missing capturing bracket)
99            -3 internal error (opcode not listed)
100 */
101 
102 static int
find_minlength(const pcre2_real_code * re,PCRE2_SPTR code,PCRE2_SPTR startcode,BOOL utf,recurse_check * recurses,int * countptr,int * backref_cache)103 find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
104   PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr,
105   int *backref_cache)
106 {
107 int length = -1;
108 int branchlength = 0;
109 int prev_cap_recno = -1;
110 int prev_cap_d = 0;
111 int prev_recurse_recno = -1;
112 int prev_recurse_d = 0;
113 uint32_t once_fudge = 0;
114 BOOL had_recurse = FALSE;
115 BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0;
116 PCRE2_SPTR nextbranch = code + GET(code, 1);
117 PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
118 recurse_check this_recurse;
119 
120 /* If this is a "could be empty" group, its minimum length is 0. */
121 
122 if (*code >= OP_SBRA && *code <= OP_SCOND) return 0;
123 
124 /* Skip over capturing bracket number */
125 
126 if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE;
127 
128 /* A large and/or complex regex can take too long to process. */
129 
130 if ((*countptr)++ > 1000) return -1;
131 
132 /* Scan along the opcodes for this branch. If we get to the end of the branch,
133 check the length against that of the other branches. If the accumulated length
134 passes 16-bits, reset to that value and skip the rest of the branch. */
135 
136 for (;;)
137   {
138   int d, min, recno;
139   PCRE2_UCHAR op, *cs, *ce;
140 
141   if (branchlength >= UINT16_MAX)
142     {
143     branchlength = UINT16_MAX;
144     cc = (PCRE2_UCHAR *)nextbranch;
145     }
146 
147   op = *cc;
148   switch (op)
149     {
150     case OP_COND:
151     case OP_SCOND:
152 
153     /* If there is only one branch in a condition, the implied branch has zero
154     length, so we don't add anything. This covers the DEFINE "condition"
155     automatically. If there are two branches we can treat it the same as any
156     other non-capturing subpattern. */
157 
158     cs = cc + GET(cc, 1);
159     if (*cs != OP_ALT)
160       {
161       cc = cs + 1 + LINK_SIZE;
162       break;
163       }
164     goto PROCESS_NON_CAPTURE;
165 
166     case OP_BRA:
167     /* There's a special case of OP_BRA, when it is wrapped round a repeated
168     OP_RECURSE. We'd like to process the latter at this level so that
169     remembering the value works for repeated cases. So we do nothing, but
170     set a fudge value to skip over the OP_KET after the recurse. */
171 
172     if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
173       {
174       once_fudge = 1 + LINK_SIZE;
175       cc += 1 + LINK_SIZE;
176       break;
177       }
178     /* Fall through */
179 
180     case OP_ONCE:
181     case OP_SCRIPT_RUN:
182     case OP_SBRA:
183     case OP_BRAPOS:
184     case OP_SBRAPOS:
185     PROCESS_NON_CAPTURE:
186     d = find_minlength(re, cc, startcode, utf, recurses, countptr,
187       backref_cache);
188     if (d < 0) return d;
189     branchlength += d;
190     do cc += GET(cc, 1); while (*cc == OP_ALT);
191     cc += 1 + LINK_SIZE;
192     break;
193 
194     /* To save time for repeated capturing subpatterns, we remember the
195     length of the previous one. Unfortunately we can't do the same for
196     the unnumbered ones above. Nor can we do this if (?| is present in the
197     pattern because captures with the same number are not then identical. */
198 
199     case OP_CBRA:
200     case OP_SCBRA:
201     case OP_CBRAPOS:
202     case OP_SCBRAPOS:
203     recno = (int)GET2(cc, 1+LINK_SIZE);
204     if (dupcapused || recno != prev_cap_recno)
205       {
206       prev_cap_recno = recno;
207       prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr,
208         backref_cache);
209       if (prev_cap_d < 0) return prev_cap_d;
210       }
211     branchlength += prev_cap_d;
212     do cc += GET(cc, 1); while (*cc == OP_ALT);
213     cc += 1 + LINK_SIZE;
214     break;
215 
216     /* ACCEPT makes things far too complicated; we have to give up. In fact,
217     from 10.34 onwards, if a pattern contains (*ACCEPT), this function is not
218     used. However, leave the code in place, just in case. */
219 
220     case OP_ACCEPT:
221     case OP_ASSERT_ACCEPT:
222     return -1;
223 
224     /* Reached end of a branch; if it's a ket it is the end of a nested
225     call. If it's ALT it is an alternation in a nested call. If it is END it's
226     the end of the outer call. All can be handled by the same code. If the
227     length of any branch is zero, there is no need to scan any subsequent
228     branches. */
229 
230     case OP_ALT:
231     case OP_KET:
232     case OP_KETRMAX:
233     case OP_KETRMIN:
234     case OP_KETRPOS:
235     case OP_END:
236     if (length < 0 || (!had_recurse && branchlength < length))
237       length = branchlength;
238     if (op != OP_ALT || length == 0) return length;
239     nextbranch = cc + GET(cc, 1);
240     cc += 1 + LINK_SIZE;
241     branchlength = 0;
242     had_recurse = FALSE;
243     break;
244 
245     /* Skip over assertive subpatterns */
246 
247     case OP_ASSERT:
248     case OP_ASSERT_NOT:
249     case OP_ASSERTBACK:
250     case OP_ASSERTBACK_NOT:
251     case OP_ASSERT_NA:
252     case OP_ASSERTBACK_NA:
253     do cc += GET(cc, 1); while (*cc == OP_ALT);
254     /* Fall through */
255 
256     /* Skip over things that don't match chars */
257 
258     case OP_REVERSE:
259     case OP_CREF:
260     case OP_DNCREF:
261     case OP_RREF:
262     case OP_DNRREF:
263     case OP_FALSE:
264     case OP_TRUE:
265     case OP_CALLOUT:
266     case OP_SOD:
267     case OP_SOM:
268     case OP_EOD:
269     case OP_EODN:
270     case OP_CIRC:
271     case OP_CIRCM:
272     case OP_DOLL:
273     case OP_DOLLM:
274     case OP_NOT_WORD_BOUNDARY:
275     case OP_WORD_BOUNDARY:
276     cc += PRIV(OP_lengths)[*cc];
277     break;
278 
279     case OP_CALLOUT_STR:
280     cc += GET(cc, 1 + 2*LINK_SIZE);
281     break;
282 
283     /* Skip over a subpattern that has a {0} or {0,x} quantifier */
284 
285     case OP_BRAZERO:
286     case OP_BRAMINZERO:
287     case OP_BRAPOSZERO:
288     case OP_SKIPZERO:
289     cc += PRIV(OP_lengths)[*cc];
290     do cc += GET(cc, 1); while (*cc == OP_ALT);
291     cc += 1 + LINK_SIZE;
292     break;
293 
294     /* Handle literal characters and + repetitions */
295 
296     case OP_CHAR:
297     case OP_CHARI:
298     case OP_NOT:
299     case OP_NOTI:
300     case OP_PLUS:
301     case OP_PLUSI:
302     case OP_MINPLUS:
303     case OP_MINPLUSI:
304     case OP_POSPLUS:
305     case OP_POSPLUSI:
306     case OP_NOTPLUS:
307     case OP_NOTPLUSI:
308     case OP_NOTMINPLUS:
309     case OP_NOTMINPLUSI:
310     case OP_NOTPOSPLUS:
311     case OP_NOTPOSPLUSI:
312     branchlength++;
313     cc += 2;
314 #ifdef SUPPORT_UNICODE
315     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
316 #endif
317     break;
318 
319     case OP_TYPEPLUS:
320     case OP_TYPEMINPLUS:
321     case OP_TYPEPOSPLUS:
322     branchlength++;
323     cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
324     break;
325 
326     /* Handle exact repetitions. The count is already in characters, but we
327     may need to skip over a multibyte character in UTF mode.  */
328 
329     case OP_EXACT:
330     case OP_EXACTI:
331     case OP_NOTEXACT:
332     case OP_NOTEXACTI:
333     branchlength += GET2(cc,1);
334     cc += 2 + IMM2_SIZE;
335 #ifdef SUPPORT_UNICODE
336     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
337 #endif
338     break;
339 
340     case OP_TYPEEXACT:
341     branchlength += GET2(cc,1);
342     cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
343       || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
344     break;
345 
346     /* Handle single-char non-literal matchers */
347 
348     case OP_PROP:
349     case OP_NOTPROP:
350     cc += 2;
351     /* Fall through */
352 
353     case OP_NOT_DIGIT:
354     case OP_DIGIT:
355     case OP_NOT_WHITESPACE:
356     case OP_WHITESPACE:
357     case OP_NOT_WORDCHAR:
358     case OP_WORDCHAR:
359     case OP_ANY:
360     case OP_ALLANY:
361     case OP_EXTUNI:
362     case OP_HSPACE:
363     case OP_NOT_HSPACE:
364     case OP_VSPACE:
365     case OP_NOT_VSPACE:
366     branchlength++;
367     cc++;
368     break;
369 
370     /* "Any newline" might match two characters, but it also might match just
371     one. */
372 
373     case OP_ANYNL:
374     branchlength += 1;
375     cc++;
376     break;
377 
378     /* The single-byte matcher means we can't proceed in UTF mode. (In
379     non-UTF mode \C will actually be turned into OP_ALLANY, so won't ever
380     appear, but leave the code, just in case.) */
381 
382     case OP_ANYBYTE:
383 #ifdef SUPPORT_UNICODE
384     if (utf) return -1;
385 #endif
386     branchlength++;
387     cc++;
388     break;
389 
390     /* For repeated character types, we have to test for \p and \P, which have
391     an extra two bytes of parameters. */
392 
393     case OP_TYPESTAR:
394     case OP_TYPEMINSTAR:
395     case OP_TYPEQUERY:
396     case OP_TYPEMINQUERY:
397     case OP_TYPEPOSSTAR:
398     case OP_TYPEPOSQUERY:
399     if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
400     cc += PRIV(OP_lengths)[op];
401     break;
402 
403     case OP_TYPEUPTO:
404     case OP_TYPEMINUPTO:
405     case OP_TYPEPOSUPTO:
406     if (cc[1 + IMM2_SIZE] == OP_PROP
407       || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
408     cc += PRIV(OP_lengths)[op];
409     break;
410 
411     /* Check a class for variable quantification */
412 
413     case OP_CLASS:
414     case OP_NCLASS:
415 #ifdef SUPPORT_WIDE_CHARS
416     case OP_XCLASS:
417     /* The original code caused an unsigned overflow in 64 bit systems,
418     so now we use a conditional statement. */
419     if (op == OP_XCLASS)
420       cc += GET(cc, 1);
421     else
422       cc += PRIV(OP_lengths)[OP_CLASS];
423 #else
424     cc += PRIV(OP_lengths)[OP_CLASS];
425 #endif
426 
427     switch (*cc)
428       {
429       case OP_CRPLUS:
430       case OP_CRMINPLUS:
431       case OP_CRPOSPLUS:
432       branchlength++;
433       /* Fall through */
434 
435       case OP_CRSTAR:
436       case OP_CRMINSTAR:
437       case OP_CRQUERY:
438       case OP_CRMINQUERY:
439       case OP_CRPOSSTAR:
440       case OP_CRPOSQUERY:
441       cc++;
442       break;
443 
444       case OP_CRRANGE:
445       case OP_CRMINRANGE:
446       case OP_CRPOSRANGE:
447       branchlength += GET2(cc,1);
448       cc += 1 + 2 * IMM2_SIZE;
449       break;
450 
451       default:
452       branchlength++;
453       break;
454       }
455     break;
456 
457     /* Backreferences and subroutine calls (OP_RECURSE) are treated in the same
458     way: we find the minimum length for the subpattern. A recursion
459     (backreference or subroutine) causes an a flag to be set that causes the
460     length of this branch to be ignored. The logic is that a recursion can only
461     make sense if there is another alternative that stops the recursing. That
462     will provide the minimum length (when no recursion happens).
463 
464     If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket
465     matches an empty string (by default it causes a matching failure), so in
466     that case we must set the minimum length to zero.
467 
468     For backreferenes, if duplicate numbers are present in the pattern we check
469     for a reference to a duplicate. If it is, we don't know which version will
470     be referenced, so we have to set the minimum length to zero. */
471 
472     /* Duplicate named pattern back reference. */
473 
474     case OP_DNREF:
475     case OP_DNREFI:
476     if (!dupcapused && (re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
477       {
478       int count = GET2(cc, 1+IMM2_SIZE);
479       PCRE2_UCHAR *slot =
480         (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
481           GET2(cc, 1) * re->name_entry_size;
482 
483       d = INT_MAX;
484 
485       /* Scan all groups with the same name; find the shortest. */
486 
487       while (count-- > 0)
488         {
489         int dd, i;
490         recno = GET2(slot, 0);
491 
492         if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
493           dd = backref_cache[recno];
494         else
495           {
496           ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
497           if (cs == NULL) return -2;
498           do ce += GET(ce, 1); while (*ce == OP_ALT);
499 
500           dd = 0;
501           if (!dupcapused ||
502               (PCRE2_UCHAR *)PRIV(find_bracket)(ce, utf, recno) == NULL)
503             {
504             if (cc > cs && cc < ce)    /* Simple recursion */
505               {
506               had_recurse = TRUE;
507               }
508             else
509               {
510               recurse_check *r = recurses;
511               for (r = recurses; r != NULL; r = r->prev)
512                 if (r->group == cs) break;
513               if (r != NULL)           /* Mutual recursion */
514                 {
515                 had_recurse = TRUE;
516                 }
517               else
518                 {
519                 this_recurse.prev = recurses;  /* No recursion */
520                 this_recurse.group = cs;
521                 dd = find_minlength(re, cs, startcode, utf, &this_recurse,
522                   countptr, backref_cache);
523                 if (dd < 0) return dd;
524                 }
525               }
526             }
527 
528           backref_cache[recno] = dd;
529           for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
530           backref_cache[0] = recno;
531           }
532 
533         if (dd < d) d = dd;
534         if (d <= 0) break;    /* No point looking at any more */
535         slot += re->name_entry_size;
536         }
537       }
538     else d = 0;
539     cc += 1 + 2*IMM2_SIZE;
540     goto REPEAT_BACK_REFERENCE;
541 
542     /* Single back reference by number. References by name are converted to by
543     number when there is no duplication. */
544 
545     case OP_REF:
546     case OP_REFI:
547     recno = GET2(cc, 1);
548     if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
549       d = backref_cache[recno];
550     else
551       {
552       int i;
553       d = 0;
554 
555       if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
556         {
557         ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
558         if (cs == NULL) return -2;
559         do ce += GET(ce, 1); while (*ce == OP_ALT);
560 
561         if (!dupcapused ||
562             (PCRE2_UCHAR *)PRIV(find_bracket)(ce, utf, recno) == NULL)
563           {
564           if (cc > cs && cc < ce)    /* Simple recursion */
565             {
566             had_recurse = TRUE;
567             }
568           else
569             {
570             recurse_check *r = recurses;
571             for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
572             if (r != NULL)           /* Mutual recursion */
573               {
574               had_recurse = TRUE;
575               }
576             else                     /* No recursion */
577               {
578               this_recurse.prev = recurses;
579               this_recurse.group = cs;
580               d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,
581                 backref_cache);
582               if (d < 0) return d;
583               }
584             }
585           }
586         }
587 
588       backref_cache[recno] = d;
589       for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
590       backref_cache[0] = recno;
591       }
592 
593     cc += 1 + IMM2_SIZE;
594 
595     /* Handle repeated back references */
596 
597     REPEAT_BACK_REFERENCE:
598     switch (*cc)
599       {
600       case OP_CRSTAR:
601       case OP_CRMINSTAR:
602       case OP_CRQUERY:
603       case OP_CRMINQUERY:
604       case OP_CRPOSSTAR:
605       case OP_CRPOSQUERY:
606       min = 0;
607       cc++;
608       break;
609 
610       case OP_CRPLUS:
611       case OP_CRMINPLUS:
612       case OP_CRPOSPLUS:
613       min = 1;
614       cc++;
615       break;
616 
617       case OP_CRRANGE:
618       case OP_CRMINRANGE:
619       case OP_CRPOSRANGE:
620       min = GET2(cc, 1);
621       cc += 1 + 2 * IMM2_SIZE;
622       break;
623 
624       default:
625       min = 1;
626       break;
627       }
628 
629      /* Take care not to overflow: (1) min and d are ints, so check that their
630      product is not greater than INT_MAX. (2) branchlength is limited to
631      UINT16_MAX (checked at the top of the loop). */
632 
633     if ((d > 0 && (INT_MAX/d) < min) || UINT16_MAX - branchlength < min*d)
634       branchlength = UINT16_MAX;
635     else branchlength += min * d;
636     break;
637 
638     /* Recursion always refers to the first occurrence of a subpattern with a
639     given number. Therefore, we can always make use of caching, even when the
640     pattern contains multiple subpatterns with the same number. */
641 
642     case OP_RECURSE:
643     cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1);
644     recno = GET2(cs, 1+LINK_SIZE);
645     if (recno == prev_recurse_recno)
646       {
647       branchlength += prev_recurse_d;
648       }
649     else
650       {
651       do ce += GET(ce, 1); while (*ce == OP_ALT);
652       if (cc > cs && cc < ce)    /* Simple recursion */
653         had_recurse = TRUE;
654       else
655         {
656         recurse_check *r = recurses;
657         for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
658         if (r != NULL)          /* Mutual recursion */
659           had_recurse = TRUE;
660         else
661           {
662           this_recurse.prev = recurses;
663           this_recurse.group = cs;
664           prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,
665             countptr, backref_cache);
666           if (prev_recurse_d < 0) return prev_recurse_d;
667           prev_recurse_recno = recno;
668           branchlength += prev_recurse_d;
669           }
670         }
671       }
672     cc += 1 + LINK_SIZE + once_fudge;
673     once_fudge = 0;
674     break;
675 
676     /* Anything else does not or need not match a character. We can get the
677     item's length from the table, but for those that can match zero occurrences
678     of a character, we must take special action for UTF-8 characters. As it
679     happens, the "NOT" versions of these opcodes are used at present only for
680     ASCII characters, so they could be omitted from this list. However, in
681     future that may change, so we include them here so as not to leave a
682     gotcha for a future maintainer. */
683 
684     case OP_UPTO:
685     case OP_UPTOI:
686     case OP_NOTUPTO:
687     case OP_NOTUPTOI:
688     case OP_MINUPTO:
689     case OP_MINUPTOI:
690     case OP_NOTMINUPTO:
691     case OP_NOTMINUPTOI:
692     case OP_POSUPTO:
693     case OP_POSUPTOI:
694     case OP_NOTPOSUPTO:
695     case OP_NOTPOSUPTOI:
696 
697     case OP_STAR:
698     case OP_STARI:
699     case OP_NOTSTAR:
700     case OP_NOTSTARI:
701     case OP_MINSTAR:
702     case OP_MINSTARI:
703     case OP_NOTMINSTAR:
704     case OP_NOTMINSTARI:
705     case OP_POSSTAR:
706     case OP_POSSTARI:
707     case OP_NOTPOSSTAR:
708     case OP_NOTPOSSTARI:
709 
710     case OP_QUERY:
711     case OP_QUERYI:
712     case OP_NOTQUERY:
713     case OP_NOTQUERYI:
714     case OP_MINQUERY:
715     case OP_MINQUERYI:
716     case OP_NOTMINQUERY:
717     case OP_NOTMINQUERYI:
718     case OP_POSQUERY:
719     case OP_POSQUERYI:
720     case OP_NOTPOSQUERY:
721     case OP_NOTPOSQUERYI:
722 
723     cc += PRIV(OP_lengths)[op];
724 #ifdef SUPPORT_UNICODE
725     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
726 #endif
727     break;
728 
729     /* Skip these, but we need to add in the name length. */
730 
731     case OP_MARK:
732     case OP_COMMIT_ARG:
733     case OP_PRUNE_ARG:
734     case OP_SKIP_ARG:
735     case OP_THEN_ARG:
736     cc += PRIV(OP_lengths)[op] + cc[1];
737     break;
738 
739     /* The remaining opcodes are just skipped over. */
740 
741     case OP_CLOSE:
742     case OP_COMMIT:
743     case OP_FAIL:
744     case OP_PRUNE:
745     case OP_SET_SOM:
746     case OP_SKIP:
747     case OP_THEN:
748     cc += PRIV(OP_lengths)[op];
749     break;
750 
751     /* This should not occur: we list all opcodes explicitly so that when
752     new ones get added they are properly considered. */
753 
754     default:
755     return -3;
756     }
757   }
758 /* Control never gets here */
759 }
760 
761 
762 
763 /*************************************************
764 *      Set a bit and maybe its alternate case    *
765 *************************************************/
766 
767 /* Given a character, set its first code unit's bit in the table, and also the
768 corresponding bit for the other version of a letter if we are caseless.
769 
770 Arguments:
771   re            points to the regex block
772   p             points to the first code unit of the character
773   caseless      TRUE if caseless
774   utf           TRUE for UTF mode
775   ucp           TRUE for UCP mode
776 
777 Returns:        pointer after the character
778 */
779 
780 static PCRE2_SPTR
set_table_bit(pcre2_real_code * re,PCRE2_SPTR p,BOOL caseless,BOOL utf,BOOL ucp)781 set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf,
782   BOOL ucp)
783 {
784 uint32_t c = *p++;   /* First code unit */
785 
786 (void)utf;           /* Stop compiler warnings when UTF not supported */
787 (void)ucp;
788 
789 /* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
790 0xff. */
791 
792 #if PCRE2_CODE_UNIT_WIDTH != 8
793 if (c > 0xff) SET_BIT(0xff); else
794 #endif
795 
796 SET_BIT(c);
797 
798 /* In UTF-8 or UTF-16 mode, pick up the remaining code units in order to find
799 the end of the character, even when caseless. */
800 
801 #ifdef SUPPORT_UNICODE
802 if (utf)
803   {
804 #if PCRE2_CODE_UNIT_WIDTH == 8
805   if (c >= 0xc0) GETUTF8INC(c, p);
806 #elif PCRE2_CODE_UNIT_WIDTH == 16
807   if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, p);
808 #endif
809   }
810 #endif  /* SUPPORT_UNICODE */
811 
812 /* If caseless, handle the other case of the character. */
813 
814 if (caseless)
815   {
816 #ifdef SUPPORT_UNICODE
817   if (utf || ucp)
818     {
819     c = UCD_OTHERCASE(c);
820 #if PCRE2_CODE_UNIT_WIDTH == 8
821     if (utf)
822       {
823       PCRE2_UCHAR buff[6];
824       (void)PRIV(ord2utf)(c, buff);
825       SET_BIT(buff[0]);
826       }
827     else if (c < 256) SET_BIT(c);
828 #else  /* 16-bit or 32-bit mode */
829     if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
830 #endif
831     }
832 
833   else
834 #endif  /* SUPPORT_UNICODE */
835 
836   /* Not UTF or UCP */
837 
838   if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
839   }
840 
841 return p;
842 }
843 
844 
845 
846 /*************************************************
847 *     Set bits for a positive character type     *
848 *************************************************/
849 
850 /* This function sets starting bits for a character type. In UTF-8 mode, we can
851 only do a direct setting for bytes less than 128, as otherwise there can be
852 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
853 environment, the tables will only recognize ASCII characters anyway, but in at
854 least one Windows environment, some higher bytes bits were set in the tables.
855 So we deal with that case by considering the UTF-8 encoding.
856 
857 Arguments:
858   re             the regex block
859   cbit type      the type of character wanted
860   table_limit    32 for non-UTF-8; 16 for UTF-8
861 
862 Returns:         nothing
863 */
864 
865 static void
set_type_bits(pcre2_real_code * re,int cbit_type,unsigned int table_limit)866 set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
867 {
868 uint32_t c;
869 for (c = 0; c < table_limit; c++)
870   re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type];
871 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
872 if (table_limit == 32) return;
873 for (c = 128; c < 256; c++)
874   {
875   if ((re->tables[cbits_offset + c/8] & (1u << (c&7))) != 0)
876     {
877     PCRE2_UCHAR buff[6];
878     (void)PRIV(ord2utf)(c, buff);
879     SET_BIT(buff[0]);
880     }
881   }
882 #endif  /* UTF-8 */
883 }
884 
885 
886 /*************************************************
887 *     Set bits for a negative character type     *
888 *************************************************/
889 
890 /* This function sets starting bits for a negative character type such as \D.
891 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
892 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
893 Unlike in the positive case, where we can set appropriate starting bits for
894 specific high-valued UTF-8 characters, in this case we have to set the bits for
895 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
896 0xc0 (192) for simplicity.
897 
898 Arguments:
899   re             the regex block
900   cbit type      the type of character wanted
901   table_limit    32 for non-UTF-8; 16 for UTF-8
902 
903 Returns:         nothing
904 */
905 
906 static void
set_nottype_bits(pcre2_real_code * re,int cbit_type,unsigned int table_limit)907 set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
908 {
909 uint32_t c;
910 for (c = 0; c < table_limit; c++)
911   re->start_bitmap[c] |= ~(re->tables[c+cbits_offset+cbit_type]);
912 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
913 if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff;
914 #endif
915 }
916 
917 
918 
919 /*************************************************
920 *      Create bitmap of starting code units      *
921 *************************************************/
922 
923 /* This function scans a compiled unanchored expression recursively and
924 attempts to build a bitmap of the set of possible starting code units whose
925 values are less than 256. In 16-bit and 32-bit mode, values above 255 all cause
926 the 255 bit to be set. When calling set[_not]_type_bits() in UTF-8 (sic) mode
927 we pass a value of 16 rather than 32 as the final argument. (See comments in
928 those functions for the reason.)
929 
930 The SSB_CONTINUE return is useful for parenthesized groups in patterns such as
931 (a*)b where the group provides some optional starting code units but scanning
932 must continue at the outer level to find at least one mandatory code unit. At
933 the outermost level, this function fails unless the result is SSB_DONE.
934 
935 We restrict recursion (for nested groups) to 1000 to avoid stack overflow
936 issues.
937 
938 Arguments:
939   re           points to the compiled regex block
940   code         points to an expression
941   utf          TRUE if in UTF mode
942   ucp          TRUE if in UCP mode
943   depthptr     pointer to recurse depth
944 
945 Returns:       SSB_FAIL     => Failed to find any starting code units
946                SSB_DONE     => Found mandatory starting code units
947                SSB_CONTINUE => Found optional starting code units
948                SSB_UNKNOWN  => Hit an unrecognized opcode
949                SSB_TOODEEP  => Recursion is too deep
950 */
951 
952 static int
set_start_bits(pcre2_real_code * re,PCRE2_SPTR code,BOOL utf,BOOL ucp,int * depthptr)953 set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, BOOL ucp,
954   int *depthptr)
955 {
956 uint32_t c;
957 int yield = SSB_DONE;
958 
959 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
960 int table_limit = utf? 16:32;
961 #else
962 int table_limit = 32;
963 #endif
964 
965 *depthptr += 1;
966 if (*depthptr > 1000) return SSB_TOODEEP;
967 
968 do
969   {
970   BOOL try_next = TRUE;
971   PCRE2_SPTR tcode = code + 1 + LINK_SIZE;
972 
973   if (*code == OP_CBRA || *code == OP_SCBRA ||
974       *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
975 
976   while (try_next)    /* Loop for items in this branch */
977     {
978     int rc;
979     uint8_t *classmap = NULL;
980 #ifdef SUPPORT_WIDE_CHARS
981     PCRE2_UCHAR xclassflags;
982 #endif
983 
984     switch(*tcode)
985       {
986       /* If we reach something we don't understand, it means a new opcode has
987       been created that hasn't been added to this function. Hopefully this
988       problem will be discovered during testing. */
989 
990       default:
991       return SSB_UNKNOWN;
992 
993       /* Fail for a valid opcode that implies no starting bits. */
994 
995       case OP_ACCEPT:
996       case OP_ASSERT_ACCEPT:
997       case OP_ALLANY:
998       case OP_ANY:
999       case OP_ANYBYTE:
1000       case OP_CIRCM:
1001       case OP_CLOSE:
1002       case OP_COMMIT:
1003       case OP_COMMIT_ARG:
1004       case OP_COND:
1005       case OP_CREF:
1006       case OP_FALSE:
1007       case OP_TRUE:
1008       case OP_DNCREF:
1009       case OP_DNREF:
1010       case OP_DNREFI:
1011       case OP_DNRREF:
1012       case OP_DOLL:
1013       case OP_DOLLM:
1014       case OP_END:
1015       case OP_EOD:
1016       case OP_EODN:
1017       case OP_EXTUNI:
1018       case OP_FAIL:
1019       case OP_MARK:
1020       case OP_NOT:
1021       case OP_NOTEXACT:
1022       case OP_NOTEXACTI:
1023       case OP_NOTI:
1024       case OP_NOTMINPLUS:
1025       case OP_NOTMINPLUSI:
1026       case OP_NOTMINQUERY:
1027       case OP_NOTMINQUERYI:
1028       case OP_NOTMINSTAR:
1029       case OP_NOTMINSTARI:
1030       case OP_NOTMINUPTO:
1031       case OP_NOTMINUPTOI:
1032       case OP_NOTPLUS:
1033       case OP_NOTPLUSI:
1034       case OP_NOTPOSPLUS:
1035       case OP_NOTPOSPLUSI:
1036       case OP_NOTPOSQUERY:
1037       case OP_NOTPOSQUERYI:
1038       case OP_NOTPOSSTAR:
1039       case OP_NOTPOSSTARI:
1040       case OP_NOTPOSUPTO:
1041       case OP_NOTPOSUPTOI:
1042       case OP_NOTPROP:
1043       case OP_NOTQUERY:
1044       case OP_NOTQUERYI:
1045       case OP_NOTSTAR:
1046       case OP_NOTSTARI:
1047       case OP_NOTUPTO:
1048       case OP_NOTUPTOI:
1049       case OP_NOT_HSPACE:
1050       case OP_NOT_VSPACE:
1051       case OP_PRUNE:
1052       case OP_PRUNE_ARG:
1053       case OP_RECURSE:
1054       case OP_REF:
1055       case OP_REFI:
1056       case OP_REVERSE:
1057       case OP_RREF:
1058       case OP_SCOND:
1059       case OP_SET_SOM:
1060       case OP_SKIP:
1061       case OP_SKIP_ARG:
1062       case OP_SOD:
1063       case OP_SOM:
1064       case OP_THEN:
1065       case OP_THEN_ARG:
1066       return SSB_FAIL;
1067 
1068       /* OP_CIRC happens only at the start of an anchored branch (multiline ^
1069       uses OP_CIRCM). Skip over it. */
1070 
1071       case OP_CIRC:
1072       tcode += PRIV(OP_lengths)[OP_CIRC];
1073       break;
1074 
1075       /* A "real" property test implies no starting bits, but the fake property
1076       PT_CLIST identifies a list of characters. These lists are short, as they
1077       are used for characters with more than one "other case", so there is no
1078       point in recognizing them for OP_NOTPROP. */
1079 
1080       case OP_PROP:
1081       if (tcode[1] != PT_CLIST) return SSB_FAIL;
1082         {
1083         const uint32_t *p = PRIV(ucd_caseless_sets) + tcode[2];
1084         while ((c = *p++) < NOTACHAR)
1085           {
1086 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1087           if (utf)
1088             {
1089             PCRE2_UCHAR buff[6];
1090             (void)PRIV(ord2utf)(c, buff);
1091             c = buff[0];
1092             }
1093 #endif
1094           if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
1095           }
1096         }
1097       try_next = FALSE;
1098       break;
1099 
1100       /* We can ignore word boundary tests. */
1101 
1102       case OP_WORD_BOUNDARY:
1103       case OP_NOT_WORD_BOUNDARY:
1104       tcode++;
1105       break;
1106 
1107       /* If we hit a bracket or a positive lookahead assertion, recurse to set
1108       bits from within the subpattern. If it can't find anything, we have to
1109       give up. If it finds some mandatory character(s), we are done for this
1110       branch. Otherwise, carry on scanning after the subpattern. */
1111 
1112       case OP_BRA:
1113       case OP_SBRA:
1114       case OP_CBRA:
1115       case OP_SCBRA:
1116       case OP_BRAPOS:
1117       case OP_SBRAPOS:
1118       case OP_CBRAPOS:
1119       case OP_SCBRAPOS:
1120       case OP_ONCE:
1121       case OP_SCRIPT_RUN:
1122       case OP_ASSERT:
1123       case OP_ASSERT_NA:
1124       rc = set_start_bits(re, tcode, utf, ucp, depthptr);
1125       if (rc == SSB_DONE)
1126         {
1127         try_next = FALSE;
1128         }
1129       else if (rc == SSB_CONTINUE)
1130         {
1131         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1132         tcode += 1 + LINK_SIZE;
1133         }
1134       else return rc;   /* FAIL, UNKNOWN, or TOODEEP */
1135       break;
1136 
1137       /* If we hit ALT or KET, it means we haven't found anything mandatory in
1138       this branch, though we might have found something optional. For ALT, we
1139       continue with the next alternative, but we have to arrange that the final
1140       result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
1141       return SSB_CONTINUE: if this is the top level, that indicates failure,
1142       but after a nested subpattern, it causes scanning to continue. */
1143 
1144       case OP_ALT:
1145       yield = SSB_CONTINUE;
1146       try_next = FALSE;
1147       break;
1148 
1149       case OP_KET:
1150       case OP_KETRMAX:
1151       case OP_KETRMIN:
1152       case OP_KETRPOS:
1153       return SSB_CONTINUE;
1154 
1155       /* Skip over callout */
1156 
1157       case OP_CALLOUT:
1158       tcode += PRIV(OP_lengths)[OP_CALLOUT];
1159       break;
1160 
1161       case OP_CALLOUT_STR:
1162       tcode += GET(tcode, 1 + 2*LINK_SIZE);
1163       break;
1164 
1165       /* Skip over lookbehind and negative lookahead assertions */
1166 
1167       case OP_ASSERT_NOT:
1168       case OP_ASSERTBACK:
1169       case OP_ASSERTBACK_NOT:
1170       case OP_ASSERTBACK_NA:
1171       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1172       tcode += 1 + LINK_SIZE;
1173       break;
1174 
1175       /* BRAZERO does the bracket, but carries on. */
1176 
1177       case OP_BRAZERO:
1178       case OP_BRAMINZERO:
1179       case OP_BRAPOSZERO:
1180       rc = set_start_bits(re, ++tcode, utf, ucp, depthptr);
1181       if (rc == SSB_FAIL || rc == SSB_UNKNOWN || rc == SSB_TOODEEP) return rc;
1182       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1183       tcode += 1 + LINK_SIZE;
1184       break;
1185 
1186       /* SKIPZERO skips the bracket. */
1187 
1188       case OP_SKIPZERO:
1189       tcode++;
1190       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1191       tcode += 1 + LINK_SIZE;
1192       break;
1193 
1194       /* Single-char * or ? sets the bit and tries the next item */
1195 
1196       case OP_STAR:
1197       case OP_MINSTAR:
1198       case OP_POSSTAR:
1199       case OP_QUERY:
1200       case OP_MINQUERY:
1201       case OP_POSQUERY:
1202       tcode = set_table_bit(re, tcode + 1, FALSE, utf, ucp);
1203       break;
1204 
1205       case OP_STARI:
1206       case OP_MINSTARI:
1207       case OP_POSSTARI:
1208       case OP_QUERYI:
1209       case OP_MINQUERYI:
1210       case OP_POSQUERYI:
1211       tcode = set_table_bit(re, tcode + 1, TRUE, utf, ucp);
1212       break;
1213 
1214       /* Single-char upto sets the bit and tries the next */
1215 
1216       case OP_UPTO:
1217       case OP_MINUPTO:
1218       case OP_POSUPTO:
1219       tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf, ucp);
1220       break;
1221 
1222       case OP_UPTOI:
1223       case OP_MINUPTOI:
1224       case OP_POSUPTOI:
1225       tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf, ucp);
1226       break;
1227 
1228       /* At least one single char sets the bit and stops */
1229 
1230       case OP_EXACT:
1231       tcode += IMM2_SIZE;
1232       /* Fall through */
1233       case OP_CHAR:
1234       case OP_PLUS:
1235       case OP_MINPLUS:
1236       case OP_POSPLUS:
1237       (void)set_table_bit(re, tcode + 1, FALSE, utf, ucp);
1238       try_next = FALSE;
1239       break;
1240 
1241       case OP_EXACTI:
1242       tcode += IMM2_SIZE;
1243       /* Fall through */
1244       case OP_CHARI:
1245       case OP_PLUSI:
1246       case OP_MINPLUSI:
1247       case OP_POSPLUSI:
1248       (void)set_table_bit(re, tcode + 1, TRUE, utf, ucp);
1249       try_next = FALSE;
1250       break;
1251 
1252       /* Special spacing and line-terminating items. These recognize specific
1253       lists of characters. The difference between VSPACE and ANYNL is that the
1254       latter can match the two-character CRLF sequence, but that is not
1255       relevant for finding the first character, so their code here is
1256       identical. */
1257 
1258       case OP_HSPACE:
1259       SET_BIT(CHAR_HT);
1260       SET_BIT(CHAR_SPACE);
1261 
1262       /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1263       the bits for 0xA0 and for code units >= 255, independently of UTF. */
1264 
1265 #if PCRE2_CODE_UNIT_WIDTH != 8
1266       SET_BIT(0xA0);
1267       SET_BIT(0xFF);
1268 #else
1269       /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1270       units of horizontal space characters. */
1271 
1272 #ifdef SUPPORT_UNICODE
1273       if (utf)
1274         {
1275         SET_BIT(0xC2);  /* For U+00A0 */
1276         SET_BIT(0xE1);  /* For U+1680, U+180E */
1277         SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1278         SET_BIT(0xE3);  /* For U+3000 */
1279         }
1280       else
1281 #endif
1282       /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1283       the code is EBCDIC. */
1284         {
1285 #ifndef EBCDIC
1286         SET_BIT(0xA0);
1287 #endif  /* Not EBCDIC */
1288         }
1289 #endif  /* 8-bit support */
1290 
1291       try_next = FALSE;
1292       break;
1293 
1294       case OP_ANYNL:
1295       case OP_VSPACE:
1296       SET_BIT(CHAR_LF);
1297       SET_BIT(CHAR_VT);
1298       SET_BIT(CHAR_FF);
1299       SET_BIT(CHAR_CR);
1300 
1301       /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1302       the bits for NEL and for code units >= 255, independently of UTF. */
1303 
1304 #if PCRE2_CODE_UNIT_WIDTH != 8
1305       SET_BIT(CHAR_NEL);
1306       SET_BIT(0xFF);
1307 #else
1308       /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1309       units of vertical space characters. */
1310 
1311 #ifdef SUPPORT_UNICODE
1312       if (utf)
1313         {
1314         SET_BIT(0xC2);  /* For U+0085 (NEL) */
1315         SET_BIT(0xE2);  /* For U+2028, U+2029 */
1316         }
1317       else
1318 #endif
1319       /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1320         {
1321         SET_BIT(CHAR_NEL);
1322         }
1323 #endif  /* 8-bit support */
1324 
1325       try_next = FALSE;
1326       break;
1327 
1328       /* Single character types set the bits and stop. Note that if PCRE2_UCP
1329       is set, we do not see these opcodes because \d etc are converted to
1330       properties. Therefore, these apply in the case when only characters less
1331       than 256 are recognized to match the types. */
1332 
1333       case OP_NOT_DIGIT:
1334       set_nottype_bits(re, cbit_digit, table_limit);
1335       try_next = FALSE;
1336       break;
1337 
1338       case OP_DIGIT:
1339       set_type_bits(re, cbit_digit, table_limit);
1340       try_next = FALSE;
1341       break;
1342 
1343       case OP_NOT_WHITESPACE:
1344       set_nottype_bits(re, cbit_space, table_limit);
1345       try_next = FALSE;
1346       break;
1347 
1348       case OP_WHITESPACE:
1349       set_type_bits(re, cbit_space, table_limit);
1350       try_next = FALSE;
1351       break;
1352 
1353       case OP_NOT_WORDCHAR:
1354       set_nottype_bits(re, cbit_word, table_limit);
1355       try_next = FALSE;
1356       break;
1357 
1358       case OP_WORDCHAR:
1359       set_type_bits(re, cbit_word, table_limit);
1360       try_next = FALSE;
1361       break;
1362 
1363       /* One or more character type fudges the pointer and restarts, knowing
1364       it will hit a single character type and stop there. */
1365 
1366       case OP_TYPEPLUS:
1367       case OP_TYPEMINPLUS:
1368       case OP_TYPEPOSPLUS:
1369       tcode++;
1370       break;
1371 
1372       case OP_TYPEEXACT:
1373       tcode += 1 + IMM2_SIZE;
1374       break;
1375 
1376       /* Zero or more repeats of character types set the bits and then
1377       try again. */
1378 
1379       case OP_TYPEUPTO:
1380       case OP_TYPEMINUPTO:
1381       case OP_TYPEPOSUPTO:
1382       tcode += IMM2_SIZE;  /* Fall through */
1383 
1384       case OP_TYPESTAR:
1385       case OP_TYPEMINSTAR:
1386       case OP_TYPEPOSSTAR:
1387       case OP_TYPEQUERY:
1388       case OP_TYPEMINQUERY:
1389       case OP_TYPEPOSQUERY:
1390       switch(tcode[1])
1391         {
1392         default:
1393         case OP_ANY:
1394         case OP_ALLANY:
1395         return SSB_FAIL;
1396 
1397         case OP_HSPACE:
1398         SET_BIT(CHAR_HT);
1399         SET_BIT(CHAR_SPACE);
1400 
1401         /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1402         the bits for 0xA0 and for code units >= 255, independently of UTF. */
1403 
1404 #if PCRE2_CODE_UNIT_WIDTH != 8
1405         SET_BIT(0xA0);
1406         SET_BIT(0xFF);
1407 #else
1408         /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1409         units of horizontal space characters. */
1410 
1411 #ifdef SUPPORT_UNICODE
1412         if (utf)
1413           {
1414           SET_BIT(0xC2);  /* For U+00A0 */
1415           SET_BIT(0xE1);  /* For U+1680, U+180E */
1416           SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1417           SET_BIT(0xE3);  /* For U+3000 */
1418           }
1419         else
1420 #endif
1421         /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1422         the code is EBCDIC. */
1423           {
1424 #ifndef EBCDIC
1425           SET_BIT(0xA0);
1426 #endif  /* Not EBCDIC */
1427           }
1428 #endif  /* 8-bit support */
1429         break;
1430 
1431         case OP_ANYNL:
1432         case OP_VSPACE:
1433         SET_BIT(CHAR_LF);
1434         SET_BIT(CHAR_VT);
1435         SET_BIT(CHAR_FF);
1436         SET_BIT(CHAR_CR);
1437 
1438         /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1439         the bits for NEL and for code units >= 255, independently of UTF. */
1440 
1441 #if PCRE2_CODE_UNIT_WIDTH != 8
1442         SET_BIT(CHAR_NEL);
1443         SET_BIT(0xFF);
1444 #else
1445         /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1446         units of vertical space characters. */
1447 
1448 #ifdef SUPPORT_UNICODE
1449         if (utf)
1450           {
1451           SET_BIT(0xC2);  /* For U+0085 (NEL) */
1452           SET_BIT(0xE2);  /* For U+2028, U+2029 */
1453           }
1454         else
1455 #endif
1456         /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1457           {
1458           SET_BIT(CHAR_NEL);
1459           }
1460 #endif  /* 8-bit support */
1461         break;
1462 
1463         case OP_NOT_DIGIT:
1464         set_nottype_bits(re, cbit_digit, table_limit);
1465         break;
1466 
1467         case OP_DIGIT:
1468         set_type_bits(re, cbit_digit, table_limit);
1469         break;
1470 
1471         case OP_NOT_WHITESPACE:
1472         set_nottype_bits(re, cbit_space, table_limit);
1473         break;
1474 
1475         case OP_WHITESPACE:
1476         set_type_bits(re, cbit_space, table_limit);
1477         break;
1478 
1479         case OP_NOT_WORDCHAR:
1480         set_nottype_bits(re, cbit_word, table_limit);
1481         break;
1482 
1483         case OP_WORDCHAR:
1484         set_type_bits(re, cbit_word, table_limit);
1485         break;
1486         }
1487 
1488       tcode += 2;
1489       break;
1490 
1491       /* Extended class: if there are any property checks, or if this is a
1492       negative XCLASS without a map, give up. If there are no property checks,
1493       there must be wide characters on the XCLASS list, because otherwise an
1494       XCLASS would not have been created. This means that code points >= 255
1495       are potential starters. In the UTF-8 case we can scan them and set bits
1496       for the relevant leading bytes. */
1497 
1498 #ifdef SUPPORT_WIDE_CHARS
1499       case OP_XCLASS:
1500       xclassflags = tcode[1 + LINK_SIZE];
1501       if ((xclassflags & XCL_HASPROP) != 0 ||
1502           (xclassflags & (XCL_MAP|XCL_NOT)) == XCL_NOT)
1503         return SSB_FAIL;
1504 
1505       /* We have a positive XCLASS or a negative one without a map. Set up the
1506       map pointer if there is one, and fall through. */
1507 
1508       classmap = ((xclassflags & XCL_MAP) == 0)? NULL :
1509         (uint8_t *)(tcode + 1 + LINK_SIZE + 1);
1510 
1511       /* In UTF-8 mode, scan the character list and set bits for leading bytes,
1512       then jump to handle the map. */
1513 
1514 #if PCRE2_CODE_UNIT_WIDTH == 8
1515       if (utf && (xclassflags & XCL_NOT) == 0)
1516         {
1517         PCRE2_UCHAR b, e;
1518         PCRE2_SPTR p = tcode + 1 + LINK_SIZE + 1 + ((classmap == NULL)? 0:32);
1519         tcode += GET(tcode, 1);
1520 
1521         for (;;) switch (*p++)
1522           {
1523           case XCL_SINGLE:
1524           b = *p++;
1525           while ((*p & 0xc0) == 0x80) p++;
1526           re->start_bitmap[b/8] |= (1u << (b&7));
1527           break;
1528 
1529           case XCL_RANGE:
1530           b = *p++;
1531           while ((*p & 0xc0) == 0x80) p++;
1532           e = *p++;
1533           while ((*p & 0xc0) == 0x80) p++;
1534           for (; b <= e; b++)
1535             re->start_bitmap[b/8] |= (1u << (b&7));
1536           break;
1537 
1538           case XCL_END:
1539           goto HANDLE_CLASSMAP;
1540 
1541           default:
1542           return SSB_UNKNOWN;   /* Internal error, should not occur */
1543           }
1544         }
1545 #endif  /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */
1546 #endif  /* SUPPORT_WIDE_CHARS */
1547 
1548       /* It seems that the fall through comment must be outside the #ifdef if
1549       it is to avoid the gcc compiler warning. */
1550 
1551       /* Fall through */
1552 
1553       /* Enter here for a negative non-XCLASS. In the 8-bit library, if we are
1554       in UTF mode, any byte with a value >= 0xc4 is a potentially valid starter
1555       because it starts a character with a value > 255. In 8-bit non-UTF mode,
1556       there is no difference between CLASS and NCLASS. In all other wide
1557       character modes, set the 0xFF bit to indicate code units >= 255. */
1558 
1559       case OP_NCLASS:
1560 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1561       if (utf)
1562         {
1563         re->start_bitmap[24] |= 0xf0;            /* Bits for 0xc4 - 0xc8 */
1564         memset(re->start_bitmap+25, 0xff, 7);    /* Bits for 0xc9 - 0xff */
1565         }
1566 #elif PCRE2_CODE_UNIT_WIDTH != 8
1567       SET_BIT(0xFF);                             /* For characters >= 255 */
1568 #endif
1569       /* Fall through */
1570 
1571       /* Enter here for a positive non-XCLASS. If we have fallen through from
1572       an XCLASS, classmap will already be set; just advance the code pointer.
1573       Otherwise, set up classmap for a a non-XCLASS and advance past it. */
1574 
1575       case OP_CLASS:
1576       if (*tcode == OP_XCLASS) tcode += GET(tcode, 1); else
1577         {
1578         classmap = (uint8_t *)(++tcode);
1579         tcode += 32 / sizeof(PCRE2_UCHAR);
1580         }
1581 
1582       /* When wide characters are supported, classmap may be NULL. In UTF-8
1583       (sic) mode, the bits in a class bit map correspond to character values,
1584       not to byte values. However, the bit map we are constructing is for byte
1585       values. So we have to do a conversion for characters whose code point is
1586       greater than 127. In fact, there are only two possible starting bytes for
1587       characters in the range 128 - 255. */
1588 
1589 #if defined SUPPORT_WIDE_CHARS && PCRE2_CODE_UNIT_WIDTH == 8
1590       HANDLE_CLASSMAP:
1591 #endif
1592       if (classmap != NULL)
1593         {
1594 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1595         if (utf)
1596           {
1597           for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c];
1598           for (c = 128; c < 256; c++)
1599             {
1600             if ((classmap[c/8] & (1u << (c&7))) != 0)
1601               {
1602               int d = (c >> 6) | 0xc0;                 /* Set bit for this starter */
1603               re->start_bitmap[d/8] |= (1u << (d&7));  /* and then skip on to the */
1604               c = (c & 0xc0) + 0x40 - 1;               /* next relevant character. */
1605               }
1606             }
1607           }
1608         else
1609 #endif
1610         /* In all modes except UTF-8, the two bit maps are compatible. */
1611 
1612           {
1613           for (c = 0; c < 32; c++) re->start_bitmap[c] |= classmap[c];
1614           }
1615         }
1616 
1617       /* Act on what follows the class. For a zero minimum repeat, continue;
1618       otherwise stop processing. */
1619 
1620       switch (*tcode)
1621         {
1622         case OP_CRSTAR:
1623         case OP_CRMINSTAR:
1624         case OP_CRQUERY:
1625         case OP_CRMINQUERY:
1626         case OP_CRPOSSTAR:
1627         case OP_CRPOSQUERY:
1628         tcode++;
1629         break;
1630 
1631         case OP_CRRANGE:
1632         case OP_CRMINRANGE:
1633         case OP_CRPOSRANGE:
1634         if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1635           else try_next = FALSE;
1636         break;
1637 
1638         default:
1639         try_next = FALSE;
1640         break;
1641         }
1642       break; /* End of class handling case */
1643       }      /* End of switch for opcodes */
1644     }        /* End of try_next loop */
1645 
1646   code += GET(code, 1);   /* Advance to next branch */
1647   }
1648 while (*code == OP_ALT);
1649 
1650 return yield;
1651 }
1652 
1653 
1654 
1655 /*************************************************
1656 *          Study a compiled expression           *
1657 *************************************************/
1658 
1659 /* This function is handed a compiled expression that it must study to produce
1660 information that will speed up the matching.
1661 
1662 Argument:
1663   re       points to the compiled expression
1664 
1665 Returns:   0 normally; non-zero should never normally occur
1666            1 unknown opcode in set_start_bits
1667            2 missing capturing bracket
1668            3 unknown opcode in find_minlength
1669 */
1670 
1671 int
PRIV(study)1672 PRIV(study)(pcre2_real_code *re)
1673 {
1674 int count = 0;
1675 PCRE2_UCHAR *code;
1676 BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
1677 BOOL ucp = (re->overall_options & PCRE2_UCP) != 0;
1678 
1679 /* Find start of compiled code */
1680 
1681 code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
1682   re->name_entry_size * re->name_count;
1683 
1684 /* For a pattern that has a first code unit, or a multiline pattern that
1685 matches only at "line start", there is no point in seeking a list of starting
1686 code units. */
1687 
1688 if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
1689   {
1690   int depth = 0;
1691   int rc = set_start_bits(re, code, utf, ucp, &depth);
1692   if (rc == SSB_UNKNOWN) return 1;
1693 
1694   /* If a list of starting code units was set up, scan the list to see if only
1695   one or two were listed. Having only one listed is rare because usually a
1696   single starting code unit will have been recognized and PCRE2_FIRSTSET set.
1697   If two are listed, see if they are caseless versions of the same character;
1698   if so we can replace the list with a caseless first code unit. This gives
1699   better performance and is plausibly worth doing for patterns such as [Ww]ord
1700   or (word|WORD). */
1701 
1702   if (rc == SSB_DONE)
1703     {
1704     int i;
1705     int a = -1;
1706     int b = -1;
1707     uint8_t *p = re->start_bitmap;
1708     uint32_t flags = PCRE2_FIRSTMAPSET;
1709 
1710     for (i = 0; i < 256; p++, i += 8)
1711       {
1712       uint8_t x = *p;
1713       if (x != 0)
1714         {
1715         int c;
1716         uint8_t y = x & (~x + 1);   /* Least significant bit */
1717         if (y != x) goto DONE;      /* More than one bit set */
1718 
1719         /* In the 16-bit and 32-bit libraries, the bit for 0xff means "0xff and
1720         all wide characters", so we cannot use it here. */
1721 
1722 #if PCRE2_CODE_UNIT_WIDTH != 8
1723         if (i == 248 && x == 0x80) goto DONE;
1724 #endif
1725 
1726         /* Compute the character value */
1727 
1728         c = i;
1729         switch (x)
1730           {
1731           case 1:   break;
1732           case 2:   c += 1; break;  case 4:  c += 2; break;
1733           case 8:   c += 3; break;  case 16: c += 4; break;
1734           case 32:  c += 5; break;  case 64: c += 6; break;
1735           case 128: c += 7; break;
1736           }
1737 
1738         /* c contains the code unit value, in the range 0-255. In 8-bit UTF
1739         mode, only values < 128 can be used. In all the other cases, c is a
1740         character value. */
1741 
1742 #if PCRE2_CODE_UNIT_WIDTH == 8
1743         if (utf && c > 127) goto DONE;
1744 #endif
1745         if (a < 0) a = c;   /* First one found, save in a */
1746         else if (b < 0)     /* Second one found */
1747           {
1748           int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c);
1749 
1750 #ifdef SUPPORT_UNICODE
1751           if (utf || ucp)
1752             {
1753             if (UCD_CASESET(c) != 0) goto DONE;     /* Multiple case set */
1754             if (c > 127) d = UCD_OTHERCASE(c);
1755             }
1756 #endif  /* SUPPORT_UNICODE */
1757 
1758           if (d != a) goto DONE;   /* Not the other case of a */
1759           b = c;                   /* Save second in b */
1760           }
1761         else goto DONE;   /* More than two characters found */
1762         }
1763       }
1764 
1765     /* Replace the start code unit bits with a first code unit, but only if it
1766     is not the same as a required later code unit. This is because a search for
1767     a required code unit starts after an explicit first code unit, but at a
1768     code unit found from the bitmap. Patterns such as /a*a/ don't work
1769     if both the start unit and required unit are the same. */
1770 
1771     if (a >= 0 &&
1772         (
1773         (re->flags & PCRE2_LASTSET) == 0 ||
1774           (
1775           re->last_codeunit != (uint32_t)a &&
1776           (b < 0 || re->last_codeunit != (uint32_t)b)
1777           )
1778         ))
1779       {
1780       re->first_codeunit = a;
1781       flags = PCRE2_FIRSTSET;
1782       if (b >= 0) flags |= PCRE2_FIRSTCASELESS;
1783       }
1784 
1785     DONE:
1786     re->flags |= flags;
1787     }
1788   }
1789 
1790 /* Find the minimum length of subject string. If the pattern can match an empty
1791 string, the minimum length is already known. If the pattern contains (*ACCEPT)
1792 all bets are off, and we don't even try to find a minimum length. If there are
1793 more back references than the size of the vector we are going to cache them in,
1794 do nothing. A pattern that complicated will probably take a long time to
1795 analyze and may in any case turn out to be too complicated. Note that back
1796 reference minima are held as 16-bit numbers. */
1797 
1798 if ((re->flags & (PCRE2_MATCH_EMPTY|PCRE2_HASACCEPT)) == 0 &&
1799      re->top_backref <= MAX_CACHE_BACKREF)
1800   {
1801   int min;
1802   int backref_cache[MAX_CACHE_BACKREF+1];
1803   backref_cache[0] = 0;    /* Highest one that is set */
1804   min = find_minlength(re, code, code, utf, NULL, &count, backref_cache);
1805   switch(min)
1806     {
1807     case -1:  /* \C in UTF mode or over-complex regex */
1808     break;    /* Leave minlength unchanged (will be zero) */
1809 
1810     case -2:
1811     return 2; /* missing capturing bracket */
1812 
1813     case -3:
1814     return 3; /* unrecognized opcode */
1815 
1816     default:
1817     re->minlength = (min > UINT16_MAX)? UINT16_MAX : min;
1818     break;
1819     }
1820   }
1821 
1822 return 0;
1823 }
1824 
1825 /* End of pcre2_study.c */
1826