xref: /php-src/ext/pcre/pcre2lib/pcre2_study.c (revision ae5beff6)
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10           New API code Copyright (c) 2016-2023 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 /* This module contains functions for scanning a compiled pattern and
42 collecting data (e.g. minimum matching length). */
43 
44 
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48 
49 #include "pcre2_internal.h"
50 
51 /* The maximum remembered capturing brackets minimum. */
52 
53 #define MAX_CACHE_BACKREF 128
54 
55 /* Set a bit in the starting code unit bit map. */
56 
57 #define SET_BIT(c) re->start_bitmap[(c)/8] |= (1u << ((c)&7))
58 
59 /* Returns from set_start_bits() */
60 
61 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN, SSB_TOODEEP };
62 
63 
64 /*************************************************
65 *   Find the minimum subject length for a group  *
66 *************************************************/
67 
68 /* Scan a parenthesized group and compute the minimum length of subject that
69 is needed to match it. This is a lower bound; it does not mean there is a
70 string of that length that matches. In UTF mode, the result is in characters
71 rather than code units. The field in a compiled pattern for storing the minimum
72 length is 16-bits long (on the grounds that anything longer than that is
73 pathological), so we give up when we reach that amount. This also means that
74 integer overflow for really crazy patterns cannot happen.
75 
76 Backreference minimum lengths are cached to speed up multiple references. This
77 function is called only when the highest back reference in the pattern is less
78 than or equal to MAX_CACHE_BACKREF, which is one less than the size of the
79 caching vector. The zeroth element contains the number of the highest set
80 value.
81 
82 Arguments:
83   re              compiled pattern block
84   code            pointer to start of group (the bracket)
85   startcode       pointer to start of the whole pattern's code
86   utf             UTF flag
87   recurses        chain of recurse_check to catch mutual recursion
88   countptr        pointer to call count (to catch over complexity)
89   backref_cache   vector for caching back references.
90 
91 This function is no longer called when the pattern contains (*ACCEPT); however,
92 the old code for returning -1 is retained, just in case.
93 
94 Returns:   the minimum length
95            -1 \C in UTF-8 mode
96               or (*ACCEPT)
97               or pattern too complicated
98            -2 internal error (missing capturing bracket)
99            -3 internal error (opcode not listed)
100 */
101 
102 static int
find_minlength(const pcre2_real_code * re,PCRE2_SPTR code,PCRE2_SPTR startcode,BOOL utf,recurse_check * recurses,int * countptr,int * backref_cache)103 find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
104   PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr,
105   int *backref_cache)
106 {
107 int length = -1;
108 int branchlength = 0;
109 int prev_cap_recno = -1;
110 int prev_cap_d = 0;
111 int prev_recurse_recno = -1;
112 int prev_recurse_d = 0;
113 uint32_t once_fudge = 0;
114 BOOL had_recurse = FALSE;
115 BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0;
116 PCRE2_SPTR nextbranch = code + GET(code, 1);
117 PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
118 recurse_check this_recurse;
119 
120 /* If this is a "could be empty" group, its minimum length is 0. */
121 
122 if (*code >= OP_SBRA && *code <= OP_SCOND) return 0;
123 
124 /* Skip over capturing bracket number */
125 
126 if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE;
127 
128 /* A large and/or complex regex can take too long to process. */
129 
130 if ((*countptr)++ > 1000) return -1;
131 
132 /* Scan along the opcodes for this branch. If we get to the end of the branch,
133 check the length against that of the other branches. If the accumulated length
134 passes 16-bits, reset to that value and skip the rest of the branch. */
135 
136 for (;;)
137   {
138   int d, min, recno;
139   PCRE2_UCHAR op, *cs, *ce;
140 
141   if (branchlength >= UINT16_MAX)
142     {
143     branchlength = UINT16_MAX;
144     cc = (PCRE2_UCHAR *)nextbranch;
145     }
146 
147   op = *cc;
148   switch (op)
149     {
150     case OP_COND:
151     case OP_SCOND:
152 
153     /* If there is only one branch in a condition, the implied branch has zero
154     length, so we don't add anything. This covers the DEFINE "condition"
155     automatically. If there are two branches we can treat it the same as any
156     other non-capturing subpattern. */
157 
158     cs = cc + GET(cc, 1);
159     if (*cs != OP_ALT)
160       {
161       cc = cs + 1 + LINK_SIZE;
162       break;
163       }
164     goto PROCESS_NON_CAPTURE;
165 
166     case OP_BRA:
167     /* There's a special case of OP_BRA, when it is wrapped round a repeated
168     OP_RECURSE. We'd like to process the latter at this level so that
169     remembering the value works for repeated cases. So we do nothing, but
170     set a fudge value to skip over the OP_KET after the recurse. */
171 
172     if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
173       {
174       once_fudge = 1 + LINK_SIZE;
175       cc += 1 + LINK_SIZE;
176       break;
177       }
178     /* Fall through */
179 
180     case OP_ONCE:
181     case OP_SCRIPT_RUN:
182     case OP_SBRA:
183     case OP_BRAPOS:
184     case OP_SBRAPOS:
185     PROCESS_NON_CAPTURE:
186     d = find_minlength(re, cc, startcode, utf, recurses, countptr,
187       backref_cache);
188     if (d < 0) return d;
189     branchlength += d;
190     do cc += GET(cc, 1); while (*cc == OP_ALT);
191     cc += 1 + LINK_SIZE;
192     break;
193 
194     /* To save time for repeated capturing subpatterns, we remember the
195     length of the previous one. Unfortunately we can't do the same for
196     the unnumbered ones above. Nor can we do this if (?| is present in the
197     pattern because captures with the same number are not then identical. */
198 
199     case OP_CBRA:
200     case OP_SCBRA:
201     case OP_CBRAPOS:
202     case OP_SCBRAPOS:
203     recno = (int)GET2(cc, 1+LINK_SIZE);
204     if (dupcapused || recno != prev_cap_recno)
205       {
206       prev_cap_recno = recno;
207       prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr,
208         backref_cache);
209       if (prev_cap_d < 0) return prev_cap_d;
210       }
211     branchlength += prev_cap_d;
212     do cc += GET(cc, 1); while (*cc == OP_ALT);
213     cc += 1 + LINK_SIZE;
214     break;
215 
216     /* ACCEPT makes things far too complicated; we have to give up. In fact,
217     from 10.34 onwards, if a pattern contains (*ACCEPT), this function is not
218     used. However, leave the code in place, just in case. */
219 
220     case OP_ACCEPT:
221     case OP_ASSERT_ACCEPT:
222     return -1;
223 
224     /* Reached end of a branch; if it's a ket it is the end of a nested
225     call. If it's ALT it is an alternation in a nested call. If it is END it's
226     the end of the outer call. All can be handled by the same code. If the
227     length of any branch is zero, there is no need to scan any subsequent
228     branches. */
229 
230     case OP_ALT:
231     case OP_KET:
232     case OP_KETRMAX:
233     case OP_KETRMIN:
234     case OP_KETRPOS:
235     case OP_END:
236     if (length < 0 || (!had_recurse && branchlength < length))
237       length = branchlength;
238     if (op != OP_ALT || length == 0) return length;
239     nextbranch = cc + GET(cc, 1);
240     cc += 1 + LINK_SIZE;
241     branchlength = 0;
242     had_recurse = FALSE;
243     break;
244 
245     /* Skip over assertive subpatterns */
246 
247     case OP_ASSERT:
248     case OP_ASSERT_NOT:
249     case OP_ASSERTBACK:
250     case OP_ASSERTBACK_NOT:
251     case OP_ASSERT_NA:
252     case OP_ASSERTBACK_NA:
253     do cc += GET(cc, 1); while (*cc == OP_ALT);
254     /* Fall through */
255 
256     /* Skip over things that don't match chars */
257 
258     case OP_REVERSE:
259     case OP_VREVERSE:
260     case OP_CREF:
261     case OP_DNCREF:
262     case OP_RREF:
263     case OP_DNRREF:
264     case OP_FALSE:
265     case OP_TRUE:
266     case OP_CALLOUT:
267     case OP_SOD:
268     case OP_SOM:
269     case OP_EOD:
270     case OP_EODN:
271     case OP_CIRC:
272     case OP_CIRCM:
273     case OP_DOLL:
274     case OP_DOLLM:
275     case OP_NOT_WORD_BOUNDARY:
276     case OP_WORD_BOUNDARY:
277     case OP_NOT_UCP_WORD_BOUNDARY:
278     case OP_UCP_WORD_BOUNDARY:
279     cc += PRIV(OP_lengths)[*cc];
280     break;
281 
282     case OP_CALLOUT_STR:
283     cc += GET(cc, 1 + 2*LINK_SIZE);
284     break;
285 
286     /* Skip over a subpattern that has a {0} or {0,x} quantifier */
287 
288     case OP_BRAZERO:
289     case OP_BRAMINZERO:
290     case OP_BRAPOSZERO:
291     case OP_SKIPZERO:
292     cc += PRIV(OP_lengths)[*cc];
293     do cc += GET(cc, 1); while (*cc == OP_ALT);
294     cc += 1 + LINK_SIZE;
295     break;
296 
297     /* Handle literal characters and + repetitions */
298 
299     case OP_CHAR:
300     case OP_CHARI:
301     case OP_NOT:
302     case OP_NOTI:
303     case OP_PLUS:
304     case OP_PLUSI:
305     case OP_MINPLUS:
306     case OP_MINPLUSI:
307     case OP_POSPLUS:
308     case OP_POSPLUSI:
309     case OP_NOTPLUS:
310     case OP_NOTPLUSI:
311     case OP_NOTMINPLUS:
312     case OP_NOTMINPLUSI:
313     case OP_NOTPOSPLUS:
314     case OP_NOTPOSPLUSI:
315     branchlength++;
316     cc += 2;
317 #ifdef SUPPORT_UNICODE
318     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
319 #endif
320     break;
321 
322     case OP_TYPEPLUS:
323     case OP_TYPEMINPLUS:
324     case OP_TYPEPOSPLUS:
325     branchlength++;
326     cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
327     break;
328 
329     /* Handle exact repetitions. The count is already in characters, but we
330     may need to skip over a multibyte character in UTF mode.  */
331 
332     case OP_EXACT:
333     case OP_EXACTI:
334     case OP_NOTEXACT:
335     case OP_NOTEXACTI:
336     branchlength += GET2(cc,1);
337     cc += 2 + IMM2_SIZE;
338 #ifdef SUPPORT_UNICODE
339     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
340 #endif
341     break;
342 
343     case OP_TYPEEXACT:
344     branchlength += GET2(cc,1);
345     cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
346       || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
347     break;
348 
349     /* Handle single-char non-literal matchers */
350 
351     case OP_PROP:
352     case OP_NOTPROP:
353     cc += 2;
354     /* Fall through */
355 
356     case OP_NOT_DIGIT:
357     case OP_DIGIT:
358     case OP_NOT_WHITESPACE:
359     case OP_WHITESPACE:
360     case OP_NOT_WORDCHAR:
361     case OP_WORDCHAR:
362     case OP_ANY:
363     case OP_ALLANY:
364     case OP_EXTUNI:
365     case OP_HSPACE:
366     case OP_NOT_HSPACE:
367     case OP_VSPACE:
368     case OP_NOT_VSPACE:
369     branchlength++;
370     cc++;
371     break;
372 
373     /* "Any newline" might match two characters, but it also might match just
374     one. */
375 
376     case OP_ANYNL:
377     branchlength += 1;
378     cc++;
379     break;
380 
381     /* The single-byte matcher means we can't proceed in UTF mode. (In
382     non-UTF mode \C will actually be turned into OP_ALLANY, so won't ever
383     appear, but leave the code, just in case.) */
384 
385     case OP_ANYBYTE:
386 #ifdef SUPPORT_UNICODE
387     if (utf) return -1;
388 #endif
389     branchlength++;
390     cc++;
391     break;
392 
393     /* For repeated character types, we have to test for \p and \P, which have
394     an extra two bytes of parameters. */
395 
396     case OP_TYPESTAR:
397     case OP_TYPEMINSTAR:
398     case OP_TYPEQUERY:
399     case OP_TYPEMINQUERY:
400     case OP_TYPEPOSSTAR:
401     case OP_TYPEPOSQUERY:
402     if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
403     cc += PRIV(OP_lengths)[op];
404     break;
405 
406     case OP_TYPEUPTO:
407     case OP_TYPEMINUPTO:
408     case OP_TYPEPOSUPTO:
409     if (cc[1 + IMM2_SIZE] == OP_PROP
410       || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
411     cc += PRIV(OP_lengths)[op];
412     break;
413 
414     /* Check a class for variable quantification */
415 
416     case OP_CLASS:
417     case OP_NCLASS:
418 #ifdef SUPPORT_WIDE_CHARS
419     case OP_XCLASS:
420     /* The original code caused an unsigned overflow in 64 bit systems,
421     so now we use a conditional statement. */
422     if (op == OP_XCLASS)
423       cc += GET(cc, 1);
424     else
425       cc += PRIV(OP_lengths)[OP_CLASS];
426 #else
427     cc += PRIV(OP_lengths)[OP_CLASS];
428 #endif
429 
430     switch (*cc)
431       {
432       case OP_CRPLUS:
433       case OP_CRMINPLUS:
434       case OP_CRPOSPLUS:
435       branchlength++;
436       /* Fall through */
437 
438       case OP_CRSTAR:
439       case OP_CRMINSTAR:
440       case OP_CRQUERY:
441       case OP_CRMINQUERY:
442       case OP_CRPOSSTAR:
443       case OP_CRPOSQUERY:
444       cc++;
445       break;
446 
447       case OP_CRRANGE:
448       case OP_CRMINRANGE:
449       case OP_CRPOSRANGE:
450       branchlength += GET2(cc,1);
451       cc += 1 + 2 * IMM2_SIZE;
452       break;
453 
454       default:
455       branchlength++;
456       break;
457       }
458     break;
459 
460     /* Backreferences and subroutine calls (OP_RECURSE) are treated in the same
461     way: we find the minimum length for the subpattern. A recursion
462     (backreference or subroutine) causes an a flag to be set that causes the
463     length of this branch to be ignored. The logic is that a recursion can only
464     make sense if there is another alternative that stops the recursing. That
465     will provide the minimum length (when no recursion happens).
466 
467     If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket
468     matches an empty string (by default it causes a matching failure), so in
469     that case we must set the minimum length to zero.
470 
471     For backreferenes, if duplicate numbers are present in the pattern we check
472     for a reference to a duplicate. If it is, we don't know which version will
473     be referenced, so we have to set the minimum length to zero. */
474 
475     /* Duplicate named pattern back reference. */
476 
477     case OP_DNREF:
478     case OP_DNREFI:
479     if (!dupcapused && (re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
480       {
481       int count = GET2(cc, 1+IMM2_SIZE);
482       PCRE2_UCHAR *slot =
483         (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
484           GET2(cc, 1) * re->name_entry_size;
485 
486       d = INT_MAX;
487 
488       /* Scan all groups with the same name; find the shortest. */
489 
490       while (count-- > 0)
491         {
492         int dd, i;
493         recno = GET2(slot, 0);
494 
495         if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
496           dd = backref_cache[recno];
497         else
498           {
499           ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
500           if (cs == NULL) return -2;
501           do ce += GET(ce, 1); while (*ce == OP_ALT);
502 
503           dd = 0;
504           if (!dupcapused ||
505               (PCRE2_UCHAR *)PRIV(find_bracket)(ce, utf, recno) == NULL)
506             {
507             if (cc > cs && cc < ce)    /* Simple recursion */
508               {
509               had_recurse = TRUE;
510               }
511             else
512               {
513               recurse_check *r = recurses;
514               for (r = recurses; r != NULL; r = r->prev)
515                 if (r->group == cs) break;
516               if (r != NULL)           /* Mutual recursion */
517                 {
518                 had_recurse = TRUE;
519                 }
520               else
521                 {
522                 this_recurse.prev = recurses;  /* No recursion */
523                 this_recurse.group = cs;
524                 dd = find_minlength(re, cs, startcode, utf, &this_recurse,
525                   countptr, backref_cache);
526                 if (dd < 0) return dd;
527                 }
528               }
529             }
530 
531           backref_cache[recno] = dd;
532           for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
533           backref_cache[0] = recno;
534           }
535 
536         if (dd < d) d = dd;
537         if (d <= 0) break;    /* No point looking at any more */
538         slot += re->name_entry_size;
539         }
540       }
541     else d = 0;
542     cc += 1 + 2*IMM2_SIZE;
543     goto REPEAT_BACK_REFERENCE;
544 
545     /* Single back reference by number. References by name are converted to by
546     number when there is no duplication. */
547 
548     case OP_REF:
549     case OP_REFI:
550     recno = GET2(cc, 1);
551     if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
552       d = backref_cache[recno];
553     else
554       {
555       int i;
556       d = 0;
557 
558       if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
559         {
560         ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
561         if (cs == NULL) return -2;
562         do ce += GET(ce, 1); while (*ce == OP_ALT);
563 
564         if (!dupcapused ||
565             (PCRE2_UCHAR *)PRIV(find_bracket)(ce, utf, recno) == NULL)
566           {
567           if (cc > cs && cc < ce)    /* Simple recursion */
568             {
569             had_recurse = TRUE;
570             }
571           else
572             {
573             recurse_check *r = recurses;
574             for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
575             if (r != NULL)           /* Mutual recursion */
576               {
577               had_recurse = TRUE;
578               }
579             else                     /* No recursion */
580               {
581               this_recurse.prev = recurses;
582               this_recurse.group = cs;
583               d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,
584                 backref_cache);
585               if (d < 0) return d;
586               }
587             }
588           }
589         }
590 
591       backref_cache[recno] = d;
592       for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
593       backref_cache[0] = recno;
594       }
595 
596     cc += 1 + IMM2_SIZE;
597 
598     /* Handle repeated back references */
599 
600     REPEAT_BACK_REFERENCE:
601     switch (*cc)
602       {
603       case OP_CRSTAR:
604       case OP_CRMINSTAR:
605       case OP_CRQUERY:
606       case OP_CRMINQUERY:
607       case OP_CRPOSSTAR:
608       case OP_CRPOSQUERY:
609       min = 0;
610       cc++;
611       break;
612 
613       case OP_CRPLUS:
614       case OP_CRMINPLUS:
615       case OP_CRPOSPLUS:
616       min = 1;
617       cc++;
618       break;
619 
620       case OP_CRRANGE:
621       case OP_CRMINRANGE:
622       case OP_CRPOSRANGE:
623       min = GET2(cc, 1);
624       cc += 1 + 2 * IMM2_SIZE;
625       break;
626 
627       default:
628       min = 1;
629       break;
630       }
631 
632      /* Take care not to overflow: (1) min and d are ints, so check that their
633      product is not greater than INT_MAX. (2) branchlength is limited to
634      UINT16_MAX (checked at the top of the loop). */
635 
636     if ((d > 0 && (INT_MAX/d) < min) || UINT16_MAX - branchlength < min*d)
637       branchlength = UINT16_MAX;
638     else branchlength += min * d;
639     break;
640 
641     /* Recursion always refers to the first occurrence of a subpattern with a
642     given number. Therefore, we can always make use of caching, even when the
643     pattern contains multiple subpatterns with the same number. */
644 
645     case OP_RECURSE:
646     cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1);
647     recno = GET2(cs, 1+LINK_SIZE);
648     if (recno == prev_recurse_recno)
649       {
650       branchlength += prev_recurse_d;
651       }
652     else
653       {
654       do ce += GET(ce, 1); while (*ce == OP_ALT);
655       if (cc > cs && cc < ce)    /* Simple recursion */
656         had_recurse = TRUE;
657       else
658         {
659         recurse_check *r = recurses;
660         for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
661         if (r != NULL)          /* Mutual recursion */
662           had_recurse = TRUE;
663         else
664           {
665           this_recurse.prev = recurses;
666           this_recurse.group = cs;
667           prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,
668             countptr, backref_cache);
669           if (prev_recurse_d < 0) return prev_recurse_d;
670           prev_recurse_recno = recno;
671           branchlength += prev_recurse_d;
672           }
673         }
674       }
675     cc += 1 + LINK_SIZE + once_fudge;
676     once_fudge = 0;
677     break;
678 
679     /* Anything else does not or need not match a character. We can get the
680     item's length from the table, but for those that can match zero occurrences
681     of a character, we must take special action for UTF-8 characters. As it
682     happens, the "NOT" versions of these opcodes are used at present only for
683     ASCII characters, so they could be omitted from this list. However, in
684     future that may change, so we include them here so as not to leave a
685     gotcha for a future maintainer. */
686 
687     case OP_UPTO:
688     case OP_UPTOI:
689     case OP_NOTUPTO:
690     case OP_NOTUPTOI:
691     case OP_MINUPTO:
692     case OP_MINUPTOI:
693     case OP_NOTMINUPTO:
694     case OP_NOTMINUPTOI:
695     case OP_POSUPTO:
696     case OP_POSUPTOI:
697     case OP_NOTPOSUPTO:
698     case OP_NOTPOSUPTOI:
699 
700     case OP_STAR:
701     case OP_STARI:
702     case OP_NOTSTAR:
703     case OP_NOTSTARI:
704     case OP_MINSTAR:
705     case OP_MINSTARI:
706     case OP_NOTMINSTAR:
707     case OP_NOTMINSTARI:
708     case OP_POSSTAR:
709     case OP_POSSTARI:
710     case OP_NOTPOSSTAR:
711     case OP_NOTPOSSTARI:
712 
713     case OP_QUERY:
714     case OP_QUERYI:
715     case OP_NOTQUERY:
716     case OP_NOTQUERYI:
717     case OP_MINQUERY:
718     case OP_MINQUERYI:
719     case OP_NOTMINQUERY:
720     case OP_NOTMINQUERYI:
721     case OP_POSQUERY:
722     case OP_POSQUERYI:
723     case OP_NOTPOSQUERY:
724     case OP_NOTPOSQUERYI:
725 
726     cc += PRIV(OP_lengths)[op];
727 #ifdef SUPPORT_UNICODE
728     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
729 #endif
730     break;
731 
732     /* Skip these, but we need to add in the name length. */
733 
734     case OP_MARK:
735     case OP_COMMIT_ARG:
736     case OP_PRUNE_ARG:
737     case OP_SKIP_ARG:
738     case OP_THEN_ARG:
739     cc += PRIV(OP_lengths)[op] + cc[1];
740     break;
741 
742     /* The remaining opcodes are just skipped over. */
743 
744     case OP_CLOSE:
745     case OP_COMMIT:
746     case OP_FAIL:
747     case OP_PRUNE:
748     case OP_SET_SOM:
749     case OP_SKIP:
750     case OP_THEN:
751     cc += PRIV(OP_lengths)[op];
752     break;
753 
754     /* This should not occur: we list all opcodes explicitly so that when
755     new ones get added they are properly considered. */
756 
757     default:
758     return -3;
759     }
760   }
761 /* Control never gets here */
762 }
763 
764 
765 
766 /*************************************************
767 *      Set a bit and maybe its alternate case    *
768 *************************************************/
769 
770 /* Given a character, set its first code unit's bit in the table, and also the
771 corresponding bit for the other version of a letter if we are caseless.
772 
773 Arguments:
774   re            points to the regex block
775   p             points to the first code unit of the character
776   caseless      TRUE if caseless
777   utf           TRUE for UTF mode
778   ucp           TRUE for UCP mode
779 
780 Returns:        pointer after the character
781 */
782 
783 static PCRE2_SPTR
set_table_bit(pcre2_real_code * re,PCRE2_SPTR p,BOOL caseless,BOOL utf,BOOL ucp)784 set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf,
785   BOOL ucp)
786 {
787 uint32_t c = *p++;   /* First code unit */
788 
789 (void)utf;           /* Stop compiler warnings when UTF not supported */
790 (void)ucp;
791 
792 /* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
793 0xff. */
794 
795 #if PCRE2_CODE_UNIT_WIDTH != 8
796 if (c > 0xff) SET_BIT(0xff); else
797 #endif
798 
799 SET_BIT(c);
800 
801 /* In UTF-8 or UTF-16 mode, pick up the remaining code units in order to find
802 the end of the character, even when caseless. */
803 
804 #ifdef SUPPORT_UNICODE
805 if (utf)
806   {
807 #if PCRE2_CODE_UNIT_WIDTH == 8
808   if (c >= 0xc0) GETUTF8INC(c, p);
809 #elif PCRE2_CODE_UNIT_WIDTH == 16
810   if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, p);
811 #endif
812   }
813 #endif  /* SUPPORT_UNICODE */
814 
815 /* If caseless, handle the other case of the character. */
816 
817 if (caseless)
818   {
819 #ifdef SUPPORT_UNICODE
820   if (utf || ucp)
821     {
822     c = UCD_OTHERCASE(c);
823 #if PCRE2_CODE_UNIT_WIDTH == 8
824     if (utf)
825       {
826       PCRE2_UCHAR buff[6];
827       (void)PRIV(ord2utf)(c, buff);
828       SET_BIT(buff[0]);
829       }
830     else if (c < 256) SET_BIT(c);
831 #else  /* 16-bit or 32-bit mode */
832     if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
833 #endif
834     }
835 
836   else
837 #endif  /* SUPPORT_UNICODE */
838 
839   /* Not UTF or UCP */
840 
841   if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
842   }
843 
844 return p;
845 }
846 
847 
848 
849 /*************************************************
850 *     Set bits for a positive character type     *
851 *************************************************/
852 
853 /* This function sets starting bits for a character type. In UTF-8 mode, we can
854 only do a direct setting for bytes less than 128, as otherwise there can be
855 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
856 environment, the tables will only recognize ASCII characters anyway, but in at
857 least one Windows environment, some higher bytes bits were set in the tables.
858 So we deal with that case by considering the UTF-8 encoding.
859 
860 Arguments:
861   re             the regex block
862   cbit type      the type of character wanted
863   table_limit    32 for non-UTF-8; 16 for UTF-8
864 
865 Returns:         nothing
866 */
867 
868 static void
set_type_bits(pcre2_real_code * re,int cbit_type,unsigned int table_limit)869 set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
870 {
871 uint32_t c;
872 for (c = 0; c < table_limit; c++)
873   re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type];
874 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
875 if (table_limit == 32) return;
876 for (c = 128; c < 256; c++)
877   {
878   if ((re->tables[cbits_offset + c/8] & (1u << (c&7))) != 0)
879     {
880     PCRE2_UCHAR buff[6];
881     (void)PRIV(ord2utf)(c, buff);
882     SET_BIT(buff[0]);
883     }
884   }
885 #endif  /* UTF-8 */
886 }
887 
888 
889 /*************************************************
890 *     Set bits for a negative character type     *
891 *************************************************/
892 
893 /* This function sets starting bits for a negative character type such as \D.
894 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
895 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
896 Unlike in the positive case, where we can set appropriate starting bits for
897 specific high-valued UTF-8 characters, in this case we have to set the bits for
898 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
899 0xc0 (192) for simplicity.
900 
901 Arguments:
902   re             the regex block
903   cbit type      the type of character wanted
904   table_limit    32 for non-UTF-8; 16 for UTF-8
905 
906 Returns:         nothing
907 */
908 
909 static void
set_nottype_bits(pcre2_real_code * re,int cbit_type,unsigned int table_limit)910 set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
911 {
912 uint32_t c;
913 for (c = 0; c < table_limit; c++)
914   re->start_bitmap[c] |= (uint8_t)(~(re->tables[c+cbits_offset+cbit_type]));
915 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
916 if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff;
917 #endif
918 }
919 
920 
921 
922 /*************************************************
923 *      Create bitmap of starting code units      *
924 *************************************************/
925 
926 /* This function scans a compiled unanchored expression recursively and
927 attempts to build a bitmap of the set of possible starting code units whose
928 values are less than 256. In 16-bit and 32-bit mode, values above 255 all cause
929 the 255 bit to be set. When calling set[_not]_type_bits() in UTF-8 (sic) mode
930 we pass a value of 16 rather than 32 as the final argument. (See comments in
931 those functions for the reason.)
932 
933 The SSB_CONTINUE return is useful for parenthesized groups in patterns such as
934 (a*)b where the group provides some optional starting code units but scanning
935 must continue at the outer level to find at least one mandatory code unit. At
936 the outermost level, this function fails unless the result is SSB_DONE.
937 
938 We restrict recursion (for nested groups) to 1000 to avoid stack overflow
939 issues.
940 
941 Arguments:
942   re           points to the compiled regex block
943   code         points to an expression
944   utf          TRUE if in UTF mode
945   ucp          TRUE if in UCP mode
946   depthptr     pointer to recurse depth
947 
948 Returns:       SSB_FAIL     => Failed to find any starting code units
949                SSB_DONE     => Found mandatory starting code units
950                SSB_CONTINUE => Found optional starting code units
951                SSB_UNKNOWN  => Hit an unrecognized opcode
952                SSB_TOODEEP  => Recursion is too deep
953 */
954 
955 static int
set_start_bits(pcre2_real_code * re,PCRE2_SPTR code,BOOL utf,BOOL ucp,int * depthptr)956 set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, BOOL ucp,
957   int *depthptr)
958 {
959 uint32_t c;
960 int yield = SSB_DONE;
961 
962 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
963 int table_limit = utf? 16:32;
964 #else
965 int table_limit = 32;
966 #endif
967 
968 *depthptr += 1;
969 if (*depthptr > 1000) return SSB_TOODEEP;
970 
971 do
972   {
973   BOOL try_next = TRUE;
974   PCRE2_SPTR tcode = code + 1 + LINK_SIZE;
975 
976   if (*code == OP_CBRA || *code == OP_SCBRA ||
977       *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
978 
979   while (try_next)    /* Loop for items in this branch */
980     {
981     int rc;
982     PCRE2_SPTR ncode;
983     uint8_t *classmap = NULL;
984 #ifdef SUPPORT_WIDE_CHARS
985     PCRE2_UCHAR xclassflags;
986 #endif
987 
988     switch(*tcode)
989       {
990       /* If we reach something we don't understand, it means a new opcode has
991       been created that hasn't been added to this function. Hopefully this
992       problem will be discovered during testing. */
993 
994       default:
995       return SSB_UNKNOWN;
996 
997       /* Fail for a valid opcode that implies no starting bits. */
998 
999       case OP_ACCEPT:
1000       case OP_ASSERT_ACCEPT:
1001       case OP_ALLANY:
1002       case OP_ANY:
1003       case OP_ANYBYTE:
1004       case OP_CIRCM:
1005       case OP_CLOSE:
1006       case OP_COMMIT:
1007       case OP_COMMIT_ARG:
1008       case OP_COND:
1009       case OP_CREF:
1010       case OP_FALSE:
1011       case OP_TRUE:
1012       case OP_DNCREF:
1013       case OP_DNREF:
1014       case OP_DNREFI:
1015       case OP_DNRREF:
1016       case OP_DOLL:
1017       case OP_DOLLM:
1018       case OP_END:
1019       case OP_EOD:
1020       case OP_EODN:
1021       case OP_EXTUNI:
1022       case OP_FAIL:
1023       case OP_MARK:
1024       case OP_NOT:
1025       case OP_NOTEXACT:
1026       case OP_NOTEXACTI:
1027       case OP_NOTI:
1028       case OP_NOTMINPLUS:
1029       case OP_NOTMINPLUSI:
1030       case OP_NOTMINQUERY:
1031       case OP_NOTMINQUERYI:
1032       case OP_NOTMINSTAR:
1033       case OP_NOTMINSTARI:
1034       case OP_NOTMINUPTO:
1035       case OP_NOTMINUPTOI:
1036       case OP_NOTPLUS:
1037       case OP_NOTPLUSI:
1038       case OP_NOTPOSPLUS:
1039       case OP_NOTPOSPLUSI:
1040       case OP_NOTPOSQUERY:
1041       case OP_NOTPOSQUERYI:
1042       case OP_NOTPOSSTAR:
1043       case OP_NOTPOSSTARI:
1044       case OP_NOTPOSUPTO:
1045       case OP_NOTPOSUPTOI:
1046       case OP_NOTPROP:
1047       case OP_NOTQUERY:
1048       case OP_NOTQUERYI:
1049       case OP_NOTSTAR:
1050       case OP_NOTSTARI:
1051       case OP_NOTUPTO:
1052       case OP_NOTUPTOI:
1053       case OP_NOT_HSPACE:
1054       case OP_NOT_VSPACE:
1055       case OP_PRUNE:
1056       case OP_PRUNE_ARG:
1057       case OP_RECURSE:
1058       case OP_REF:
1059       case OP_REFI:
1060       case OP_REVERSE:
1061       case OP_VREVERSE:
1062       case OP_RREF:
1063       case OP_SCOND:
1064       case OP_SET_SOM:
1065       case OP_SKIP:
1066       case OP_SKIP_ARG:
1067       case OP_SOD:
1068       case OP_SOM:
1069       case OP_THEN:
1070       case OP_THEN_ARG:
1071       return SSB_FAIL;
1072 
1073       /* OP_CIRC happens only at the start of an anchored branch (multiline ^
1074       uses OP_CIRCM). Skip over it. */
1075 
1076       case OP_CIRC:
1077       tcode += PRIV(OP_lengths)[OP_CIRC];
1078       break;
1079 
1080       /* A "real" property test implies no starting bits, but the fake property
1081       PT_CLIST identifies a list of characters. These lists are short, as they
1082       are used for characters with more than one "other case", so there is no
1083       point in recognizing them for OP_NOTPROP. */
1084 
1085       case OP_PROP:
1086       if (tcode[1] != PT_CLIST) return SSB_FAIL;
1087         {
1088         const uint32_t *p = PRIV(ucd_caseless_sets) + tcode[2];
1089         while ((c = *p++) < NOTACHAR)
1090           {
1091 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1092           if (utf)
1093             {
1094             PCRE2_UCHAR buff[6];
1095             (void)PRIV(ord2utf)(c, buff);
1096             c = buff[0];
1097             }
1098 #endif
1099           if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
1100           }
1101         }
1102       try_next = FALSE;
1103       break;
1104 
1105       /* We can ignore word boundary tests. */
1106 
1107       case OP_WORD_BOUNDARY:
1108       case OP_NOT_WORD_BOUNDARY:
1109       case OP_UCP_WORD_BOUNDARY:
1110       case OP_NOT_UCP_WORD_BOUNDARY:
1111       tcode++;
1112       break;
1113 
1114       /* For a positive lookahead assertion, inspect what immediately follows,
1115       ignoring intermediate assertions and callouts. If the next item is one
1116       that sets a mandatory character, skip this assertion. Otherwise, treat it
1117       the same as other bracket groups. */
1118 
1119       case OP_ASSERT:
1120       case OP_ASSERT_NA:
1121       ncode = tcode + GET(tcode, 1);
1122       while (*ncode == OP_ALT) ncode += GET(ncode, 1);
1123       ncode += 1 + LINK_SIZE;
1124 
1125       /* Skip irrelevant items */
1126 
1127       for (BOOL done = FALSE; !done;)
1128         {
1129         switch (*ncode)
1130           {
1131           case OP_ASSERT:
1132           case OP_ASSERT_NOT:
1133           case OP_ASSERTBACK:
1134           case OP_ASSERTBACK_NOT:
1135           case OP_ASSERT_NA:
1136           case OP_ASSERTBACK_NA:
1137           ncode += GET(ncode, 1);
1138           while (*ncode == OP_ALT) ncode += GET(ncode, 1);
1139           ncode += 1 + LINK_SIZE;
1140           break;
1141 
1142           case OP_WORD_BOUNDARY:
1143           case OP_NOT_WORD_BOUNDARY:
1144           case OP_UCP_WORD_BOUNDARY:
1145           case OP_NOT_UCP_WORD_BOUNDARY:
1146           ncode++;
1147           break;
1148 
1149           case OP_CALLOUT:
1150           ncode += PRIV(OP_lengths)[OP_CALLOUT];
1151           break;
1152 
1153           case OP_CALLOUT_STR:
1154           ncode += GET(ncode, 1 + 2*LINK_SIZE);
1155           break;
1156 
1157           default:
1158           done = TRUE;
1159           break;
1160           }
1161         }
1162 
1163       /* Now check the next significant item. */
1164 
1165       switch(*ncode)
1166         {
1167         default:
1168         break;
1169 
1170         case OP_PROP:
1171         if (ncode[1] != PT_CLIST) break;
1172         /* Fall through */
1173         case OP_ANYNL:
1174         case OP_CHAR:
1175         case OP_CHARI:
1176         case OP_EXACT:
1177         case OP_EXACTI:
1178         case OP_HSPACE:
1179         case OP_MINPLUS:
1180         case OP_MINPLUSI:
1181         case OP_PLUS:
1182         case OP_PLUSI:
1183         case OP_POSPLUS:
1184         case OP_POSPLUSI:
1185         case OP_VSPACE:
1186         /* Note that these types will only be present in non-UCP mode. */
1187         case OP_DIGIT:
1188         case OP_NOT_DIGIT:
1189         case OP_WORDCHAR:
1190         case OP_NOT_WORDCHAR:
1191         case OP_WHITESPACE:
1192         case OP_NOT_WHITESPACE:
1193         tcode = ncode;
1194         continue;   /* With the following significant opcode */
1195         }
1196       /* Fall through */
1197 
1198       /* For a group bracket or a positive assertion without an immediately
1199       following mandatory setting, recurse to set bits from within the
1200       subpattern. If it can't find anything, we have to give up. If it finds
1201       some mandatory character(s), we are done for this branch. Otherwise,
1202       carry on scanning after the subpattern. */
1203 
1204       case OP_BRA:
1205       case OP_SBRA:
1206       case OP_CBRA:
1207       case OP_SCBRA:
1208       case OP_BRAPOS:
1209       case OP_SBRAPOS:
1210       case OP_CBRAPOS:
1211       case OP_SCBRAPOS:
1212       case OP_ONCE:
1213       case OP_SCRIPT_RUN:
1214       rc = set_start_bits(re, tcode, utf, ucp, depthptr);
1215       if (rc == SSB_DONE)
1216         {
1217         try_next = FALSE;
1218         }
1219       else if (rc == SSB_CONTINUE)
1220         {
1221         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1222         tcode += 1 + LINK_SIZE;
1223         }
1224       else return rc;   /* FAIL, UNKNOWN, or TOODEEP */
1225       break;
1226 
1227       /* If we hit ALT or KET, it means we haven't found anything mandatory in
1228       this branch, though we might have found something optional. For ALT, we
1229       continue with the next alternative, but we have to arrange that the final
1230       result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
1231       return SSB_CONTINUE: if this is the top level, that indicates failure,
1232       but after a nested subpattern, it causes scanning to continue. */
1233 
1234       case OP_ALT:
1235       yield = SSB_CONTINUE;
1236       try_next = FALSE;
1237       break;
1238 
1239       case OP_KET:
1240       case OP_KETRMAX:
1241       case OP_KETRMIN:
1242       case OP_KETRPOS:
1243       return SSB_CONTINUE;
1244 
1245       /* Skip over callout */
1246 
1247       case OP_CALLOUT:
1248       tcode += PRIV(OP_lengths)[OP_CALLOUT];
1249       break;
1250 
1251       case OP_CALLOUT_STR:
1252       tcode += GET(tcode, 1 + 2*LINK_SIZE);
1253       break;
1254 
1255       /* Skip over lookbehind and negative lookahead assertions */
1256 
1257       case OP_ASSERT_NOT:
1258       case OP_ASSERTBACK:
1259       case OP_ASSERTBACK_NOT:
1260       case OP_ASSERTBACK_NA:
1261       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1262       tcode += 1 + LINK_SIZE;
1263       break;
1264 
1265       /* BRAZERO does the bracket, but carries on. */
1266 
1267       case OP_BRAZERO:
1268       case OP_BRAMINZERO:
1269       case OP_BRAPOSZERO:
1270       rc = set_start_bits(re, ++tcode, utf, ucp, depthptr);
1271       if (rc == SSB_FAIL || rc == SSB_UNKNOWN || rc == SSB_TOODEEP) return rc;
1272       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1273       tcode += 1 + LINK_SIZE;
1274       break;
1275 
1276       /* SKIPZERO skips the bracket. */
1277 
1278       case OP_SKIPZERO:
1279       tcode++;
1280       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1281       tcode += 1 + LINK_SIZE;
1282       break;
1283 
1284       /* Single-char * or ? sets the bit and tries the next item */
1285 
1286       case OP_STAR:
1287       case OP_MINSTAR:
1288       case OP_POSSTAR:
1289       case OP_QUERY:
1290       case OP_MINQUERY:
1291       case OP_POSQUERY:
1292       tcode = set_table_bit(re, tcode + 1, FALSE, utf, ucp);
1293       break;
1294 
1295       case OP_STARI:
1296       case OP_MINSTARI:
1297       case OP_POSSTARI:
1298       case OP_QUERYI:
1299       case OP_MINQUERYI:
1300       case OP_POSQUERYI:
1301       tcode = set_table_bit(re, tcode + 1, TRUE, utf, ucp);
1302       break;
1303 
1304       /* Single-char upto sets the bit and tries the next */
1305 
1306       case OP_UPTO:
1307       case OP_MINUPTO:
1308       case OP_POSUPTO:
1309       tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf, ucp);
1310       break;
1311 
1312       case OP_UPTOI:
1313       case OP_MINUPTOI:
1314       case OP_POSUPTOI:
1315       tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf, ucp);
1316       break;
1317 
1318       /* At least one single char sets the bit and stops */
1319 
1320       case OP_EXACT:
1321       tcode += IMM2_SIZE;
1322       /* Fall through */
1323       case OP_CHAR:
1324       case OP_PLUS:
1325       case OP_MINPLUS:
1326       case OP_POSPLUS:
1327       (void)set_table_bit(re, tcode + 1, FALSE, utf, ucp);
1328       try_next = FALSE;
1329       break;
1330 
1331       case OP_EXACTI:
1332       tcode += IMM2_SIZE;
1333       /* Fall through */
1334       case OP_CHARI:
1335       case OP_PLUSI:
1336       case OP_MINPLUSI:
1337       case OP_POSPLUSI:
1338       (void)set_table_bit(re, tcode + 1, TRUE, utf, ucp);
1339       try_next = FALSE;
1340       break;
1341 
1342       /* Special spacing and line-terminating items. These recognize specific
1343       lists of characters. The difference between VSPACE and ANYNL is that the
1344       latter can match the two-character CRLF sequence, but that is not
1345       relevant for finding the first character, so their code here is
1346       identical. */
1347 
1348       case OP_HSPACE:
1349       SET_BIT(CHAR_HT);
1350       SET_BIT(CHAR_SPACE);
1351 
1352       /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1353       the bits for 0xA0 and for code units >= 255, independently of UTF. */
1354 
1355 #if PCRE2_CODE_UNIT_WIDTH != 8
1356       SET_BIT(0xA0);
1357       SET_BIT(0xFF);
1358 #else
1359       /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1360       units of horizontal space characters. */
1361 
1362 #ifdef SUPPORT_UNICODE
1363       if (utf)
1364         {
1365         SET_BIT(0xC2);  /* For U+00A0 */
1366         SET_BIT(0xE1);  /* For U+1680, U+180E */
1367         SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1368         SET_BIT(0xE3);  /* For U+3000 */
1369         }
1370       else
1371 #endif
1372       /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1373       the code is EBCDIC. */
1374         {
1375 #ifndef EBCDIC
1376         SET_BIT(0xA0);
1377 #endif  /* Not EBCDIC */
1378         }
1379 #endif  /* 8-bit support */
1380 
1381       try_next = FALSE;
1382       break;
1383 
1384       case OP_ANYNL:
1385       case OP_VSPACE:
1386       SET_BIT(CHAR_LF);
1387       SET_BIT(CHAR_VT);
1388       SET_BIT(CHAR_FF);
1389       SET_BIT(CHAR_CR);
1390 
1391       /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1392       the bits for NEL and for code units >= 255, independently of UTF. */
1393 
1394 #if PCRE2_CODE_UNIT_WIDTH != 8
1395       SET_BIT(CHAR_NEL);
1396       SET_BIT(0xFF);
1397 #else
1398       /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1399       units of vertical space characters. */
1400 
1401 #ifdef SUPPORT_UNICODE
1402       if (utf)
1403         {
1404         SET_BIT(0xC2);  /* For U+0085 (NEL) */
1405         SET_BIT(0xE2);  /* For U+2028, U+2029 */
1406         }
1407       else
1408 #endif
1409       /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1410         {
1411         SET_BIT(CHAR_NEL);
1412         }
1413 #endif  /* 8-bit support */
1414 
1415       try_next = FALSE;
1416       break;
1417 
1418       /* Single character types set the bits and stop. Note that if PCRE2_UCP
1419       is set, we do not see these opcodes because \d etc are converted to
1420       properties. Therefore, these apply in the case when only characters less
1421       than 256 are recognized to match the types. */
1422 
1423       case OP_NOT_DIGIT:
1424       set_nottype_bits(re, cbit_digit, table_limit);
1425       try_next = FALSE;
1426       break;
1427 
1428       case OP_DIGIT:
1429       set_type_bits(re, cbit_digit, table_limit);
1430       try_next = FALSE;
1431       break;
1432 
1433       case OP_NOT_WHITESPACE:
1434       set_nottype_bits(re, cbit_space, table_limit);
1435       try_next = FALSE;
1436       break;
1437 
1438       case OP_WHITESPACE:
1439       set_type_bits(re, cbit_space, table_limit);
1440       try_next = FALSE;
1441       break;
1442 
1443       case OP_NOT_WORDCHAR:
1444       set_nottype_bits(re, cbit_word, table_limit);
1445       try_next = FALSE;
1446       break;
1447 
1448       case OP_WORDCHAR:
1449       set_type_bits(re, cbit_word, table_limit);
1450       try_next = FALSE;
1451       break;
1452 
1453       /* One or more character type fudges the pointer and restarts, knowing
1454       it will hit a single character type and stop there. */
1455 
1456       case OP_TYPEPLUS:
1457       case OP_TYPEMINPLUS:
1458       case OP_TYPEPOSPLUS:
1459       tcode++;
1460       break;
1461 
1462       case OP_TYPEEXACT:
1463       tcode += 1 + IMM2_SIZE;
1464       break;
1465 
1466       /* Zero or more repeats of character types set the bits and then
1467       try again. */
1468 
1469       case OP_TYPEUPTO:
1470       case OP_TYPEMINUPTO:
1471       case OP_TYPEPOSUPTO:
1472       tcode += IMM2_SIZE;  /* Fall through */
1473 
1474       case OP_TYPESTAR:
1475       case OP_TYPEMINSTAR:
1476       case OP_TYPEPOSSTAR:
1477       case OP_TYPEQUERY:
1478       case OP_TYPEMINQUERY:
1479       case OP_TYPEPOSQUERY:
1480       switch(tcode[1])
1481         {
1482         default:
1483         case OP_ANY:
1484         case OP_ALLANY:
1485         return SSB_FAIL;
1486 
1487         case OP_HSPACE:
1488         SET_BIT(CHAR_HT);
1489         SET_BIT(CHAR_SPACE);
1490 
1491         /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1492         the bits for 0xA0 and for code units >= 255, independently of UTF. */
1493 
1494 #if PCRE2_CODE_UNIT_WIDTH != 8
1495         SET_BIT(0xA0);
1496         SET_BIT(0xFF);
1497 #else
1498         /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1499         units of horizontal space characters. */
1500 
1501 #ifdef SUPPORT_UNICODE
1502         if (utf)
1503           {
1504           SET_BIT(0xC2);  /* For U+00A0 */
1505           SET_BIT(0xE1);  /* For U+1680, U+180E */
1506           SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1507           SET_BIT(0xE3);  /* For U+3000 */
1508           }
1509         else
1510 #endif
1511         /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1512         the code is EBCDIC. */
1513           {
1514 #ifndef EBCDIC
1515           SET_BIT(0xA0);
1516 #endif  /* Not EBCDIC */
1517           }
1518 #endif  /* 8-bit support */
1519         break;
1520 
1521         case OP_ANYNL:
1522         case OP_VSPACE:
1523         SET_BIT(CHAR_LF);
1524         SET_BIT(CHAR_VT);
1525         SET_BIT(CHAR_FF);
1526         SET_BIT(CHAR_CR);
1527 
1528         /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1529         the bits for NEL and for code units >= 255, independently of UTF. */
1530 
1531 #if PCRE2_CODE_UNIT_WIDTH != 8
1532         SET_BIT(CHAR_NEL);
1533         SET_BIT(0xFF);
1534 #else
1535         /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1536         units of vertical space characters. */
1537 
1538 #ifdef SUPPORT_UNICODE
1539         if (utf)
1540           {
1541           SET_BIT(0xC2);  /* For U+0085 (NEL) */
1542           SET_BIT(0xE2);  /* For U+2028, U+2029 */
1543           }
1544         else
1545 #endif
1546         /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1547           {
1548           SET_BIT(CHAR_NEL);
1549           }
1550 #endif  /* 8-bit support */
1551         break;
1552 
1553         case OP_NOT_DIGIT:
1554         set_nottype_bits(re, cbit_digit, table_limit);
1555         break;
1556 
1557         case OP_DIGIT:
1558         set_type_bits(re, cbit_digit, table_limit);
1559         break;
1560 
1561         case OP_NOT_WHITESPACE:
1562         set_nottype_bits(re, cbit_space, table_limit);
1563         break;
1564 
1565         case OP_WHITESPACE:
1566         set_type_bits(re, cbit_space, table_limit);
1567         break;
1568 
1569         case OP_NOT_WORDCHAR:
1570         set_nottype_bits(re, cbit_word, table_limit);
1571         break;
1572 
1573         case OP_WORDCHAR:
1574         set_type_bits(re, cbit_word, table_limit);
1575         break;
1576         }
1577 
1578       tcode += 2;
1579       break;
1580 
1581       /* Extended class: if there are any property checks, or if this is a
1582       negative XCLASS without a map, give up. If there are no property checks,
1583       there must be wide characters on the XCLASS list, because otherwise an
1584       XCLASS would not have been created. This means that code points >= 255
1585       are potential starters. In the UTF-8 case we can scan them and set bits
1586       for the relevant leading bytes. */
1587 
1588 #ifdef SUPPORT_WIDE_CHARS
1589       case OP_XCLASS:
1590       xclassflags = tcode[1 + LINK_SIZE];
1591       if ((xclassflags & XCL_HASPROP) != 0 ||
1592           (xclassflags & (XCL_MAP|XCL_NOT)) == XCL_NOT)
1593         return SSB_FAIL;
1594 
1595       /* We have a positive XCLASS or a negative one without a map. Set up the
1596       map pointer if there is one, and fall through. */
1597 
1598       classmap = ((xclassflags & XCL_MAP) == 0)? NULL :
1599         (uint8_t *)(tcode + 1 + LINK_SIZE + 1);
1600 
1601       /* In UTF-8 mode, scan the character list and set bits for leading bytes,
1602       then jump to handle the map. */
1603 
1604 #if PCRE2_CODE_UNIT_WIDTH == 8
1605       if (utf && (xclassflags & XCL_NOT) == 0)
1606         {
1607         PCRE2_UCHAR b, e;
1608         PCRE2_SPTR p = tcode + 1 + LINK_SIZE + 1 + ((classmap == NULL)? 0:32);
1609         tcode += GET(tcode, 1);
1610 
1611         for (;;) switch (*p++)
1612           {
1613           case XCL_SINGLE:
1614           b = *p++;
1615           while ((*p & 0xc0) == 0x80) p++;
1616           re->start_bitmap[b/8] |= (1u << (b&7));
1617           break;
1618 
1619           case XCL_RANGE:
1620           b = *p++;
1621           while ((*p & 0xc0) == 0x80) p++;
1622           e = *p++;
1623           while ((*p & 0xc0) == 0x80) p++;
1624           for (; b <= e; b++)
1625             re->start_bitmap[b/8] |= (1u << (b&7));
1626           break;
1627 
1628           case XCL_END:
1629           goto HANDLE_CLASSMAP;
1630 
1631           default:
1632           return SSB_UNKNOWN;   /* Internal error, should not occur */
1633           }
1634         }
1635 #endif  /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */
1636 #endif  /* SUPPORT_WIDE_CHARS */
1637 
1638       /* It seems that the fall through comment must be outside the #ifdef if
1639       it is to avoid the gcc compiler warning. */
1640 
1641       /* Fall through */
1642 
1643       /* Enter here for a negative non-XCLASS. In the 8-bit library, if we are
1644       in UTF mode, any byte with a value >= 0xc4 is a potentially valid starter
1645       because it starts a character with a value > 255. In 8-bit non-UTF mode,
1646       there is no difference between CLASS and NCLASS. In all other wide
1647       character modes, set the 0xFF bit to indicate code units >= 255. */
1648 
1649       case OP_NCLASS:
1650 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1651       if (utf)
1652         {
1653         re->start_bitmap[24] |= 0xf0;            /* Bits for 0xc4 - 0xc8 */
1654         memset(re->start_bitmap+25, 0xff, 7);    /* Bits for 0xc9 - 0xff */
1655         }
1656 #elif PCRE2_CODE_UNIT_WIDTH != 8
1657       SET_BIT(0xFF);                             /* For characters >= 255 */
1658 #endif
1659       /* Fall through */
1660 
1661       /* Enter here for a positive non-XCLASS. If we have fallen through from
1662       an XCLASS, classmap will already be set; just advance the code pointer.
1663       Otherwise, set up classmap for a a non-XCLASS and advance past it. */
1664 
1665       case OP_CLASS:
1666       if (*tcode == OP_XCLASS) tcode += GET(tcode, 1); else
1667         {
1668         classmap = (uint8_t *)(++tcode);
1669         tcode += 32 / sizeof(PCRE2_UCHAR);
1670         }
1671 
1672       /* When wide characters are supported, classmap may be NULL. In UTF-8
1673       (sic) mode, the bits in a class bit map correspond to character values,
1674       not to byte values. However, the bit map we are constructing is for byte
1675       values. So we have to do a conversion for characters whose code point is
1676       greater than 127. In fact, there are only two possible starting bytes for
1677       characters in the range 128 - 255. */
1678 
1679 #if defined SUPPORT_WIDE_CHARS && PCRE2_CODE_UNIT_WIDTH == 8
1680       HANDLE_CLASSMAP:
1681 #endif
1682       if (classmap != NULL)
1683         {
1684 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1685         if (utf)
1686           {
1687           for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c];
1688           for (c = 128; c < 256; c++)
1689             {
1690             if ((classmap[c/8] & (1u << (c&7))) != 0)
1691               {
1692               int d = (c >> 6) | 0xc0;                 /* Set bit for this starter */
1693               re->start_bitmap[d/8] |= (1u << (d&7));  /* and then skip on to the */
1694               c = (c & 0xc0) + 0x40 - 1;               /* next relevant character. */
1695               }
1696             }
1697           }
1698         else
1699 #endif
1700         /* In all modes except UTF-8, the two bit maps are compatible. */
1701 
1702           {
1703           for (c = 0; c < 32; c++) re->start_bitmap[c] |= classmap[c];
1704           }
1705         }
1706 
1707       /* Act on what follows the class. For a zero minimum repeat, continue;
1708       otherwise stop processing. */
1709 
1710       switch (*tcode)
1711         {
1712         case OP_CRSTAR:
1713         case OP_CRMINSTAR:
1714         case OP_CRQUERY:
1715         case OP_CRMINQUERY:
1716         case OP_CRPOSSTAR:
1717         case OP_CRPOSQUERY:
1718         tcode++;
1719         break;
1720 
1721         case OP_CRRANGE:
1722         case OP_CRMINRANGE:
1723         case OP_CRPOSRANGE:
1724         if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1725           else try_next = FALSE;
1726         break;
1727 
1728         default:
1729         try_next = FALSE;
1730         break;
1731         }
1732       break; /* End of class handling case */
1733       }      /* End of switch for opcodes */
1734     }        /* End of try_next loop */
1735 
1736   code += GET(code, 1);   /* Advance to next branch */
1737   }
1738 while (*code == OP_ALT);
1739 
1740 return yield;
1741 }
1742 
1743 
1744 
1745 /*************************************************
1746 *          Study a compiled expression           *
1747 *************************************************/
1748 
1749 /* This function is handed a compiled expression that it must study to produce
1750 information that will speed up the matching.
1751 
1752 Argument:
1753   re       points to the compiled expression
1754 
1755 Returns:   0 normally; non-zero should never normally occur
1756            1 unknown opcode in set_start_bits
1757            2 missing capturing bracket
1758            3 unknown opcode in find_minlength
1759 */
1760 
1761 int
PRIV(study)1762 PRIV(study)(pcre2_real_code *re)
1763 {
1764 int count = 0;
1765 PCRE2_UCHAR *code;
1766 BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
1767 BOOL ucp = (re->overall_options & PCRE2_UCP) != 0;
1768 
1769 /* Find start of compiled code */
1770 
1771 code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
1772   re->name_entry_size * re->name_count;
1773 
1774 /* For a pattern that has a first code unit, or a multiline pattern that
1775 matches only at "line start", there is no point in seeking a list of starting
1776 code units. */
1777 
1778 if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
1779   {
1780   int depth = 0;
1781   int rc = set_start_bits(re, code, utf, ucp, &depth);
1782   if (rc == SSB_UNKNOWN) return 1;
1783 
1784   /* If a list of starting code units was set up, scan the list to see if only
1785   one or two were listed. Having only one listed is rare because usually a
1786   single starting code unit will have been recognized and PCRE2_FIRSTSET set.
1787   If two are listed, see if they are caseless versions of the same character;
1788   if so we can replace the list with a caseless first code unit. This gives
1789   better performance and is plausibly worth doing for patterns such as [Ww]ord
1790   or (word|WORD). */
1791 
1792   if (rc == SSB_DONE)
1793     {
1794     int i;
1795     int a = -1;
1796     int b = -1;
1797     uint8_t *p = re->start_bitmap;
1798     uint32_t flags = PCRE2_FIRSTMAPSET;
1799 
1800     for (i = 0; i < 256; p++, i += 8)
1801       {
1802       uint8_t x = *p;
1803       if (x != 0)
1804         {
1805         int c;
1806         uint8_t y = x & (~x + 1);   /* Least significant bit */
1807         if (y != x) goto DONE;      /* More than one bit set */
1808 
1809         /* In the 16-bit and 32-bit libraries, the bit for 0xff means "0xff and
1810         all wide characters", so we cannot use it here. */
1811 
1812 #if PCRE2_CODE_UNIT_WIDTH != 8
1813         if (i == 248 && x == 0x80) goto DONE;
1814 #endif
1815 
1816         /* Compute the character value */
1817 
1818         c = i;
1819         switch (x)
1820           {
1821           case 1:   break;
1822           case 2:   c += 1; break;  case 4:  c += 2; break;
1823           case 8:   c += 3; break;  case 16: c += 4; break;
1824           case 32:  c += 5; break;  case 64: c += 6; break;
1825           case 128: c += 7; break;
1826           }
1827 
1828         /* c contains the code unit value, in the range 0-255. In 8-bit UTF
1829         mode, only values < 128 can be used. In all the other cases, c is a
1830         character value. */
1831 
1832 #if PCRE2_CODE_UNIT_WIDTH == 8
1833         if (utf && c > 127) goto DONE;
1834 #endif
1835         if (a < 0) a = c;   /* First one found, save in a */
1836         else if (b < 0)     /* Second one found */
1837           {
1838           int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c);
1839 
1840 #ifdef SUPPORT_UNICODE
1841           if (utf || ucp)
1842             {
1843             if (UCD_CASESET(c) != 0) goto DONE;     /* Multiple case set */
1844             if (c > 127) d = UCD_OTHERCASE(c);
1845             }
1846 #endif  /* SUPPORT_UNICODE */
1847 
1848           if (d != a) goto DONE;   /* Not the other case of a */
1849           b = c;                   /* Save second in b */
1850           }
1851         else goto DONE;   /* More than two characters found */
1852         }
1853       }
1854 
1855     /* Replace the start code unit bits with a first code unit, but only if it
1856     is not the same as a required later code unit. This is because a search for
1857     a required code unit starts after an explicit first code unit, but at a
1858     code unit found from the bitmap. Patterns such as /a*a/ don't work
1859     if both the start unit and required unit are the same. */
1860 
1861     if (a >= 0 &&
1862         (
1863         (re->flags & PCRE2_LASTSET) == 0 ||
1864           (
1865           re->last_codeunit != (uint32_t)a &&
1866           (b < 0 || re->last_codeunit != (uint32_t)b)
1867           )
1868         ))
1869       {
1870       re->first_codeunit = a;
1871       flags = PCRE2_FIRSTSET;
1872       if (b >= 0) flags |= PCRE2_FIRSTCASELESS;
1873       }
1874 
1875     DONE:
1876     re->flags |= flags;
1877     }
1878   }
1879 
1880 /* Find the minimum length of subject string. If the pattern can match an empty
1881 string, the minimum length is already known. If the pattern contains (*ACCEPT)
1882 all bets are off, and we don't even try to find a minimum length. If there are
1883 more back references than the size of the vector we are going to cache them in,
1884 do nothing. A pattern that complicated will probably take a long time to
1885 analyze and may in any case turn out to be too complicated. Note that back
1886 reference minima are held as 16-bit numbers. */
1887 
1888 if ((re->flags & (PCRE2_MATCH_EMPTY|PCRE2_HASACCEPT)) == 0 &&
1889      re->top_backref <= MAX_CACHE_BACKREF)
1890   {
1891   int min;
1892   int backref_cache[MAX_CACHE_BACKREF+1];
1893   backref_cache[0] = 0;    /* Highest one that is set */
1894   min = find_minlength(re, code, code, utf, NULL, &count, backref_cache);
1895   switch(min)
1896     {
1897     case -1:  /* \C in UTF mode or over-complex regex */
1898     break;    /* Leave minlength unchanged (will be zero) */
1899 
1900     case -2:
1901     return 2; /* missing capturing bracket */
1902 
1903     case -3:
1904     return 3; /* unrecognized opcode */
1905 
1906     default:
1907     re->minlength = (min > UINT16_MAX)? UINT16_MAX : min;
1908     break;
1909     }
1910   }
1911 
1912 return 0;
1913 }
1914 
1915 /* End of pcre2_study.c */
1916