xref: /PHP-7.2/ext/pcre/pcrelib/pcre_study.c (revision 8a287c0e)
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9            Copyright (c) 1997-2012 University of Cambridge
10 
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14 
15     * Redistributions of source code must retain the above copyright notice,
16       this list of conditions and the following disclaimer.
17 
18     * Redistributions in binary form must reproduce the above copyright
19       notice, this list of conditions and the following disclaimer in the
20       documentation and/or other materials provided with the distribution.
21 
22     * Neither the name of the University of Cambridge nor the names of its
23       contributors may be used to endorse or promote products derived from
24       this software without specific prior written permission.
25 
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39 
40 
41 /* This module contains the external function pcre_study(), along with local
42 supporting functions. */
43 
44 
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48 
49 #include "pcre_internal.h"
50 
51 #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
52 
53 /* Returns from set_start_bits() */
54 
55 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
56 
57 
58 
59 /*************************************************
60 *   Find the minimum subject length for a group  *
61 *************************************************/
62 
63 /* Scan a parenthesized group and compute the minimum length of subject that
64 is needed to match it. This is a lower bound; it does not mean there is a
65 string of that length that matches. In UTF8 mode, the result is in characters
66 rather than bytes.
67 
68 Arguments:
69   re              compiled pattern block
70   code            pointer to start of group (the bracket)
71   startcode       pointer to start of the whole pattern's code
72   options         the compiling options
73   recurses        chain of recurse_check to catch mutual recursion
74   countptr        pointer to call count (to catch over complexity)
75 
76 Returns:   the minimum length
77            -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
78            -2 internal error (missing capturing bracket)
79            -3 internal error (opcode not listed)
80 */
81 
82 static int
find_minlength(const REAL_PCRE * re,const pcre_uchar * code,const pcre_uchar * startcode,int options,recurse_check * recurses,int * countptr)83 find_minlength(const REAL_PCRE *re, const pcre_uchar *code,
84   const pcre_uchar *startcode, int options, recurse_check *recurses,
85   int *countptr)
86 {
87 int length = -1;
88 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
89 BOOL utf = (options & PCRE_UTF8) != 0;
90 BOOL had_recurse = FALSE;
91 recurse_check this_recurse;
92 register int branchlength = 0;
93 register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
94 
95 if ((*countptr)++ > 1000) return -1;   /* too complex */
96 
97 if (*code == OP_CBRA || *code == OP_SCBRA ||
98     *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
99 
100 /* Scan along the opcodes for this branch. If we get to the end of the
101 branch, check the length against that of the other branches. */
102 
103 for (;;)
104   {
105   int d, min;
106   pcre_uchar *cs, *ce;
107   register pcre_uchar op = *cc;
108 
109   switch (op)
110     {
111     case OP_COND:
112     case OP_SCOND:
113 
114     /* If there is only one branch in a condition, the implied branch has zero
115     length, so we don't add anything. This covers the DEFINE "condition"
116     automatically. */
117 
118     cs = cc + GET(cc, 1);
119     if (*cs != OP_ALT)
120       {
121       cc = cs + 1 + LINK_SIZE;
122       break;
123       }
124 
125     /* Otherwise we can fall through and treat it the same as any other
126     subpattern. */
127 
128     case OP_CBRA:
129     case OP_SCBRA:
130     case OP_BRA:
131     case OP_SBRA:
132     case OP_CBRAPOS:
133     case OP_SCBRAPOS:
134     case OP_BRAPOS:
135     case OP_SBRAPOS:
136     case OP_ONCE:
137     case OP_ONCE_NC:
138     d = find_minlength(re, cc, startcode, options, recurses, countptr);
139     if (d < 0) return d;
140     branchlength += d;
141     do cc += GET(cc, 1); while (*cc == OP_ALT);
142     cc += 1 + LINK_SIZE;
143     break;
144 
145     /* ACCEPT makes things far too complicated; we have to give up. */
146 
147     case OP_ACCEPT:
148     case OP_ASSERT_ACCEPT:
149     return -1;
150 
151     /* Reached end of a branch; if it's a ket it is the end of a nested
152     call. If it's ALT it is an alternation in a nested call. If it is END it's
153     the end of the outer call. All can be handled by the same code. If an
154     ACCEPT was previously encountered, use the length that was in force at that
155     time, and pass back the shortest ACCEPT length. */
156 
157     case OP_ALT:
158     case OP_KET:
159     case OP_KETRMAX:
160     case OP_KETRMIN:
161     case OP_KETRPOS:
162     case OP_END:
163     if (length < 0 || (!had_recurse && branchlength < length))
164       length = branchlength;
165     if (op != OP_ALT) return length;
166     cc += 1 + LINK_SIZE;
167     branchlength = 0;
168     had_recurse = FALSE;
169     break;
170 
171     /* Skip over assertive subpatterns */
172 
173     case OP_ASSERT:
174     case OP_ASSERT_NOT:
175     case OP_ASSERTBACK:
176     case OP_ASSERTBACK_NOT:
177     do cc += GET(cc, 1); while (*cc == OP_ALT);
178     /* Fall through */
179 
180     /* Skip over things that don't match chars */
181 
182     case OP_REVERSE:
183     case OP_CREF:
184     case OP_DNCREF:
185     case OP_RREF:
186     case OP_DNRREF:
187     case OP_DEF:
188     case OP_CALLOUT:
189     case OP_SOD:
190     case OP_SOM:
191     case OP_EOD:
192     case OP_EODN:
193     case OP_CIRC:
194     case OP_CIRCM:
195     case OP_DOLL:
196     case OP_DOLLM:
197     case OP_NOT_WORD_BOUNDARY:
198     case OP_WORD_BOUNDARY:
199     cc += PRIV(OP_lengths)[*cc];
200     break;
201 
202     /* Skip over a subpattern that has a {0} or {0,x} quantifier */
203 
204     case OP_BRAZERO:
205     case OP_BRAMINZERO:
206     case OP_BRAPOSZERO:
207     case OP_SKIPZERO:
208     cc += PRIV(OP_lengths)[*cc];
209     do cc += GET(cc, 1); while (*cc == OP_ALT);
210     cc += 1 + LINK_SIZE;
211     break;
212 
213     /* Handle literal characters and + repetitions */
214 
215     case OP_CHAR:
216     case OP_CHARI:
217     case OP_NOT:
218     case OP_NOTI:
219     case OP_PLUS:
220     case OP_PLUSI:
221     case OP_MINPLUS:
222     case OP_MINPLUSI:
223     case OP_POSPLUS:
224     case OP_POSPLUSI:
225     case OP_NOTPLUS:
226     case OP_NOTPLUSI:
227     case OP_NOTMINPLUS:
228     case OP_NOTMINPLUSI:
229     case OP_NOTPOSPLUS:
230     case OP_NOTPOSPLUSI:
231     branchlength++;
232     cc += 2;
233 #ifdef SUPPORT_UTF
234     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
235 #endif
236     break;
237 
238     case OP_TYPEPLUS:
239     case OP_TYPEMINPLUS:
240     case OP_TYPEPOSPLUS:
241     branchlength++;
242     cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
243     break;
244 
245     /* Handle exact repetitions. The count is already in characters, but we
246     need to skip over a multibyte character in UTF8 mode.  */
247 
248     case OP_EXACT:
249     case OP_EXACTI:
250     case OP_NOTEXACT:
251     case OP_NOTEXACTI:
252     branchlength += GET2(cc,1);
253     cc += 2 + IMM2_SIZE;
254 #ifdef SUPPORT_UTF
255     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
256 #endif
257     break;
258 
259     case OP_TYPEEXACT:
260     branchlength += GET2(cc,1);
261     cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
262       || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
263     break;
264 
265     /* Handle single-char non-literal matchers */
266 
267     case OP_PROP:
268     case OP_NOTPROP:
269     cc += 2;
270     /* Fall through */
271 
272     case OP_NOT_DIGIT:
273     case OP_DIGIT:
274     case OP_NOT_WHITESPACE:
275     case OP_WHITESPACE:
276     case OP_NOT_WORDCHAR:
277     case OP_WORDCHAR:
278     case OP_ANY:
279     case OP_ALLANY:
280     case OP_EXTUNI:
281     case OP_HSPACE:
282     case OP_NOT_HSPACE:
283     case OP_VSPACE:
284     case OP_NOT_VSPACE:
285     branchlength++;
286     cc++;
287     break;
288 
289     /* "Any newline" might match two characters, but it also might match just
290     one. */
291 
292     case OP_ANYNL:
293     branchlength += 1;
294     cc++;
295     break;
296 
297     /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
298     non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
299     appear, but leave the code, just in case.) */
300 
301     case OP_ANYBYTE:
302 #ifdef SUPPORT_UTF
303     if (utf) return -1;
304 #endif
305     branchlength++;
306     cc++;
307     break;
308 
309     /* For repeated character types, we have to test for \p and \P, which have
310     an extra two bytes of parameters. */
311 
312     case OP_TYPESTAR:
313     case OP_TYPEMINSTAR:
314     case OP_TYPEQUERY:
315     case OP_TYPEMINQUERY:
316     case OP_TYPEPOSSTAR:
317     case OP_TYPEPOSQUERY:
318     if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
319     cc += PRIV(OP_lengths)[op];
320     break;
321 
322     case OP_TYPEUPTO:
323     case OP_TYPEMINUPTO:
324     case OP_TYPEPOSUPTO:
325     if (cc[1 + IMM2_SIZE] == OP_PROP
326       || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
327     cc += PRIV(OP_lengths)[op];
328     break;
329 
330     /* Check a class for variable quantification */
331 
332     case OP_CLASS:
333     case OP_NCLASS:
334 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
335     case OP_XCLASS:
336     /* The original code caused an unsigned overflow in 64 bit systems,
337     so now we use a conditional statement. */
338     if (op == OP_XCLASS)
339       cc += GET(cc, 1);
340     else
341       cc += PRIV(OP_lengths)[OP_CLASS];
342 #else
343     cc += PRIV(OP_lengths)[OP_CLASS];
344 #endif
345 
346     switch (*cc)
347       {
348       case OP_CRPLUS:
349       case OP_CRMINPLUS:
350       case OP_CRPOSPLUS:
351       branchlength++;
352       /* Fall through */
353 
354       case OP_CRSTAR:
355       case OP_CRMINSTAR:
356       case OP_CRQUERY:
357       case OP_CRMINQUERY:
358       case OP_CRPOSSTAR:
359       case OP_CRPOSQUERY:
360       cc++;
361       break;
362 
363       case OP_CRRANGE:
364       case OP_CRMINRANGE:
365       case OP_CRPOSRANGE:
366       branchlength += GET2(cc,1);
367       cc += 1 + 2 * IMM2_SIZE;
368       break;
369 
370       default:
371       branchlength++;
372       break;
373       }
374     break;
375 
376     /* Backreferences and subroutine calls are treated in the same way: we find
377     the minimum length for the subpattern. A recursion, however, causes an
378     a flag to be set that causes the length of this branch to be ignored. The
379     logic is that a recursion can only make sense if there is another
380     alternation that stops the recursing. That will provide the minimum length
381     (when no recursion happens). A backreference within the group that it is
382     referencing behaves in the same way.
383 
384     If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
385     matches an empty string (by default it causes a matching failure), so in
386     that case we must set the minimum length to zero. */
387 
388     case OP_DNREF:     /* Duplicate named pattern back reference */
389     case OP_DNREFI:
390     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
391       {
392       int count = GET2(cc, 1+IMM2_SIZE);
393       pcre_uchar *slot = (pcre_uchar *)re +
394         re->name_table_offset + GET2(cc, 1) * re->name_entry_size;
395       d = INT_MAX;
396       while (count-- > 0)
397         {
398         ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
399         if (cs == NULL) return -2;
400         do ce += GET(ce, 1); while (*ce == OP_ALT);
401         if (cc > cs && cc < ce)     /* Simple recursion */
402           {
403           d = 0;
404           had_recurse = TRUE;
405           break;
406           }
407         else
408           {
409           recurse_check *r = recurses;
410           for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
411           if (r != NULL)           /* Mutual recursion */
412             {
413             d = 0;
414             had_recurse = TRUE;
415             break;
416             }
417           else
418             {
419             int dd;
420             this_recurse.prev = recurses;
421             this_recurse.group = cs;
422             dd = find_minlength(re, cs, startcode, options, &this_recurse,
423               countptr);
424             if (dd < d) d = dd;
425             }
426           }
427         slot += re->name_entry_size;
428         }
429       }
430     else d = 0;
431     cc += 1 + 2*IMM2_SIZE;
432     goto REPEAT_BACK_REFERENCE;
433 
434     case OP_REF:      /* Single back reference */
435     case OP_REFI:
436     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
437       {
438       ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
439       if (cs == NULL) return -2;
440       do ce += GET(ce, 1); while (*ce == OP_ALT);
441       if (cc > cs && cc < ce)    /* Simple recursion */
442         {
443         d = 0;
444         had_recurse = TRUE;
445         }
446       else
447         {
448         recurse_check *r = recurses;
449         for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
450         if (r != NULL)           /* Mutual recursion */
451           {
452           d = 0;
453           had_recurse = TRUE;
454           }
455         else
456           {
457           this_recurse.prev = recurses;
458           this_recurse.group = cs;
459           d = find_minlength(re, cs, startcode, options, &this_recurse,
460             countptr);
461           }
462         }
463       }
464     else d = 0;
465     cc += 1 + IMM2_SIZE;
466 
467     /* Handle repeated back references */
468 
469     REPEAT_BACK_REFERENCE:
470     switch (*cc)
471       {
472       case OP_CRSTAR:
473       case OP_CRMINSTAR:
474       case OP_CRQUERY:
475       case OP_CRMINQUERY:
476       case OP_CRPOSSTAR:
477       case OP_CRPOSQUERY:
478       min = 0;
479       cc++;
480       break;
481 
482       case OP_CRPLUS:
483       case OP_CRMINPLUS:
484       case OP_CRPOSPLUS:
485       min = 1;
486       cc++;
487       break;
488 
489       case OP_CRRANGE:
490       case OP_CRMINRANGE:
491       case OP_CRPOSRANGE:
492       min = GET2(cc, 1);
493       cc += 1 + 2 * IMM2_SIZE;
494       break;
495 
496       default:
497       min = 1;
498       break;
499       }
500 
501     branchlength += min * d;
502     break;
503 
504     /* We can easily detect direct recursion, but not mutual recursion. This is
505     caught by a recursion depth count. */
506 
507     case OP_RECURSE:
508     cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
509     do ce += GET(ce, 1); while (*ce == OP_ALT);
510     if (cc > cs && cc < ce)    /* Simple recursion */
511       had_recurse = TRUE;
512     else
513       {
514       recurse_check *r = recurses;
515       for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
516       if (r != NULL)           /* Mutual recursion */
517         had_recurse = TRUE;
518       else
519         {
520         this_recurse.prev = recurses;
521         this_recurse.group = cs;
522         branchlength += find_minlength(re, cs, startcode, options,
523           &this_recurse, countptr);
524         }
525       }
526     cc += 1 + LINK_SIZE;
527     break;
528 
529     /* Anything else does not or need not match a character. We can get the
530     item's length from the table, but for those that can match zero occurrences
531     of a character, we must take special action for UTF-8 characters. As it
532     happens, the "NOT" versions of these opcodes are used at present only for
533     ASCII characters, so they could be omitted from this list. However, in
534     future that may change, so we include them here so as not to leave a
535     gotcha for a future maintainer. */
536 
537     case OP_UPTO:
538     case OP_UPTOI:
539     case OP_NOTUPTO:
540     case OP_NOTUPTOI:
541     case OP_MINUPTO:
542     case OP_MINUPTOI:
543     case OP_NOTMINUPTO:
544     case OP_NOTMINUPTOI:
545     case OP_POSUPTO:
546     case OP_POSUPTOI:
547     case OP_NOTPOSUPTO:
548     case OP_NOTPOSUPTOI:
549 
550     case OP_STAR:
551     case OP_STARI:
552     case OP_NOTSTAR:
553     case OP_NOTSTARI:
554     case OP_MINSTAR:
555     case OP_MINSTARI:
556     case OP_NOTMINSTAR:
557     case OP_NOTMINSTARI:
558     case OP_POSSTAR:
559     case OP_POSSTARI:
560     case OP_NOTPOSSTAR:
561     case OP_NOTPOSSTARI:
562 
563     case OP_QUERY:
564     case OP_QUERYI:
565     case OP_NOTQUERY:
566     case OP_NOTQUERYI:
567     case OP_MINQUERY:
568     case OP_MINQUERYI:
569     case OP_NOTMINQUERY:
570     case OP_NOTMINQUERYI:
571     case OP_POSQUERY:
572     case OP_POSQUERYI:
573     case OP_NOTPOSQUERY:
574     case OP_NOTPOSQUERYI:
575 
576     cc += PRIV(OP_lengths)[op];
577 #ifdef SUPPORT_UTF
578     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
579 #endif
580     break;
581 
582     /* Skip these, but we need to add in the name length. */
583 
584     case OP_MARK:
585     case OP_PRUNE_ARG:
586     case OP_SKIP_ARG:
587     case OP_THEN_ARG:
588     cc += PRIV(OP_lengths)[op] + cc[1];
589     break;
590 
591     /* The remaining opcodes are just skipped over. */
592 
593     case OP_CLOSE:
594     case OP_COMMIT:
595     case OP_FAIL:
596     case OP_PRUNE:
597     case OP_SET_SOM:
598     case OP_SKIP:
599     case OP_THEN:
600     cc += PRIV(OP_lengths)[op];
601     break;
602 
603     /* This should not occur: we list all opcodes explicitly so that when
604     new ones get added they are properly considered. */
605 
606     default:
607     return -3;
608     }
609   }
610 /* Control never gets here */
611 }
612 
613 
614 
615 /*************************************************
616 *      Set a bit and maybe its alternate case    *
617 *************************************************/
618 
619 /* Given a character, set its first byte's bit in the table, and also the
620 corresponding bit for the other version of a letter if we are caseless. In
621 UTF-8 mode, for characters greater than 127, we can only do the caseless thing
622 when Unicode property support is available.
623 
624 Arguments:
625   start_bits    points to the bit map
626   p             points to the character
627   caseless      the caseless flag
628   cd            the block with char table pointers
629   utf           TRUE for UTF-8 / UTF-16 / UTF-32 mode
630 
631 Returns:        pointer after the character
632 */
633 
634 static const pcre_uchar *
set_table_bit(pcre_uint8 * start_bits,const pcre_uchar * p,BOOL caseless,compile_data * cd,BOOL utf)635 set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
636   compile_data *cd, BOOL utf)
637 {
638 pcre_uint32 c = *p;
639 
640 #ifdef COMPILE_PCRE8
641 SET_BIT(c);
642 
643 #ifdef SUPPORT_UTF
644 if (utf && c > 127)
645   {
646   GETCHARINC(c, p);
647 #ifdef SUPPORT_UCP
648   if (caseless)
649     {
650     pcre_uchar buff[6];
651     c = UCD_OTHERCASE(c);
652     (void)PRIV(ord2utf)(c, buff);
653     SET_BIT(buff[0]);
654     }
655 #endif  /* Not SUPPORT_UCP */
656   return p;
657   }
658 #else   /* Not SUPPORT_UTF */
659 (void)(utf);   /* Stops warning for unused parameter */
660 #endif  /* SUPPORT_UTF */
661 
662 /* Not UTF-8 mode, or character is less than 127. */
663 
664 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
665 return p + 1;
666 #endif  /* COMPILE_PCRE8 */
667 
668 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
669 if (c > 0xff)
670   {
671   c = 0xff;
672   caseless = FALSE;
673   }
674 SET_BIT(c);
675 
676 #ifdef SUPPORT_UTF
677 if (utf && c > 127)
678   {
679   GETCHARINC(c, p);
680 #ifdef SUPPORT_UCP
681   if (caseless)
682     {
683     c = UCD_OTHERCASE(c);
684     if (c > 0xff)
685       c = 0xff;
686     SET_BIT(c);
687     }
688 #endif  /* SUPPORT_UCP */
689   return p;
690   }
691 #else   /* Not SUPPORT_UTF */
692 (void)(utf);   /* Stops warning for unused parameter */
693 #endif  /* SUPPORT_UTF */
694 
695 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
696 return p + 1;
697 #endif
698 }
699 
700 
701 
702 /*************************************************
703 *     Set bits for a positive character type     *
704 *************************************************/
705 
706 /* This function sets starting bits for a character type. In UTF-8 mode, we can
707 only do a direct setting for bytes less than 128, as otherwise there can be
708 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
709 environment, the tables will only recognize ASCII characters anyway, but in at
710 least one Windows environment, some higher bytes bits were set in the tables.
711 So we deal with that case by considering the UTF-8 encoding.
712 
713 Arguments:
714   start_bits     the starting bitmap
715   cbit type      the type of character wanted
716   table_limit    32 for non-UTF-8; 16 for UTF-8
717   cd             the block with char table pointers
718 
719 Returns:         nothing
720 */
721 
722 static void
set_type_bits(pcre_uint8 * start_bits,int cbit_type,unsigned int table_limit,compile_data * cd)723 set_type_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
724   compile_data *cd)
725 {
726 register pcre_uint32 c;
727 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
728 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
729 if (table_limit == 32) return;
730 for (c = 128; c < 256; c++)
731   {
732   if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
733     {
734     pcre_uchar buff[6];
735     (void)PRIV(ord2utf)(c, buff);
736     SET_BIT(buff[0]);
737     }
738   }
739 #endif
740 }
741 
742 
743 /*************************************************
744 *     Set bits for a negative character type     *
745 *************************************************/
746 
747 /* This function sets starting bits for a negative character type such as \D.
748 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
749 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
750 Unlike in the positive case, where we can set appropriate starting bits for
751 specific high-valued UTF-8 characters, in this case we have to set the bits for
752 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
753 0xc0 (192) for simplicity.
754 
755 Arguments:
756   start_bits     the starting bitmap
757   cbit type      the type of character wanted
758   table_limit    32 for non-UTF-8; 16 for UTF-8
759   cd             the block with char table pointers
760 
761 Returns:         nothing
762 */
763 
764 static void
set_nottype_bits(pcre_uint8 * start_bits,int cbit_type,unsigned int table_limit,compile_data * cd)765 set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
766   compile_data *cd)
767 {
768 register pcre_uint32 c;
769 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
770 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
771 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
772 #endif
773 }
774 
775 
776 
777 /*************************************************
778 *          Create bitmap of starting bytes       *
779 *************************************************/
780 
781 /* This function scans a compiled unanchored expression recursively and
782 attempts to build a bitmap of the set of possible starting bytes. As time goes
783 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
784 useful for parenthesized groups in patterns such as (a*)b where the group
785 provides some optional starting bytes but scanning must continue at the outer
786 level to find at least one mandatory byte. At the outermost level, this
787 function fails unless the result is SSB_DONE.
788 
789 Arguments:
790   code         points to an expression
791   start_bits   points to a 32-byte table, initialized to 0
792   utf          TRUE if in UTF-8 / UTF-16 / UTF-32 mode
793   cd           the block with char table pointers
794 
795 Returns:       SSB_FAIL     => Failed to find any starting bytes
796                SSB_DONE     => Found mandatory starting bytes
797                SSB_CONTINUE => Found optional starting bytes
798                SSB_UNKNOWN  => Hit an unrecognized opcode
799 */
800 
801 static int
set_start_bits(const pcre_uchar * code,pcre_uint8 * start_bits,BOOL utf,compile_data * cd)802 set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
803   compile_data *cd)
804 {
805 register pcre_uint32 c;
806 int yield = SSB_DONE;
807 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
808 int table_limit = utf? 16:32;
809 #else
810 int table_limit = 32;
811 #endif
812 
813 #if 0
814 /* ========================================================================= */
815 /* The following comment and code was inserted in January 1999. In May 2006,
816 when it was observed to cause compiler warnings about unused values, I took it
817 out again. If anybody is still using OS/2, they will have to put it back
818 manually. */
819 
820 /* This next statement and the later reference to dummy are here in order to
821 trick the optimizer of the IBM C compiler for OS/2 into generating correct
822 code. Apparently IBM isn't going to fix the problem, and we would rather not
823 disable optimization (in this module it actually makes a big difference, and
824 the pcre module can use all the optimization it can get). */
825 
826 volatile int dummy;
827 /* ========================================================================= */
828 #endif
829 
830 do
831   {
832   BOOL try_next = TRUE;
833   const pcre_uchar *tcode = code + 1 + LINK_SIZE;
834 
835   if (*code == OP_CBRA || *code == OP_SCBRA ||
836       *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
837 
838   while (try_next)    /* Loop for items in this branch */
839     {
840     int rc;
841 
842     switch(*tcode)
843       {
844       /* If we reach something we don't understand, it means a new opcode has
845       been created that hasn't been added to this code. Hopefully this problem
846       will be discovered during testing. */
847 
848       default:
849       return SSB_UNKNOWN;
850 
851       /* Fail for a valid opcode that implies no starting bits. */
852 
853       case OP_ACCEPT:
854       case OP_ASSERT_ACCEPT:
855       case OP_ALLANY:
856       case OP_ANY:
857       case OP_ANYBYTE:
858       case OP_CIRC:
859       case OP_CIRCM:
860       case OP_CLOSE:
861       case OP_COMMIT:
862       case OP_COND:
863       case OP_CREF:
864       case OP_DEF:
865       case OP_DNCREF:
866       case OP_DNREF:
867       case OP_DNREFI:
868       case OP_DNRREF:
869       case OP_DOLL:
870       case OP_DOLLM:
871       case OP_END:
872       case OP_EOD:
873       case OP_EODN:
874       case OP_EXTUNI:
875       case OP_FAIL:
876       case OP_MARK:
877       case OP_NOT:
878       case OP_NOTEXACT:
879       case OP_NOTEXACTI:
880       case OP_NOTI:
881       case OP_NOTMINPLUS:
882       case OP_NOTMINPLUSI:
883       case OP_NOTMINQUERY:
884       case OP_NOTMINQUERYI:
885       case OP_NOTMINSTAR:
886       case OP_NOTMINSTARI:
887       case OP_NOTMINUPTO:
888       case OP_NOTMINUPTOI:
889       case OP_NOTPLUS:
890       case OP_NOTPLUSI:
891       case OP_NOTPOSPLUS:
892       case OP_NOTPOSPLUSI:
893       case OP_NOTPOSQUERY:
894       case OP_NOTPOSQUERYI:
895       case OP_NOTPOSSTAR:
896       case OP_NOTPOSSTARI:
897       case OP_NOTPOSUPTO:
898       case OP_NOTPOSUPTOI:
899       case OP_NOTPROP:
900       case OP_NOTQUERY:
901       case OP_NOTQUERYI:
902       case OP_NOTSTAR:
903       case OP_NOTSTARI:
904       case OP_NOTUPTO:
905       case OP_NOTUPTOI:
906       case OP_NOT_HSPACE:
907       case OP_NOT_VSPACE:
908       case OP_PRUNE:
909       case OP_PRUNE_ARG:
910       case OP_RECURSE:
911       case OP_REF:
912       case OP_REFI:
913       case OP_REVERSE:
914       case OP_RREF:
915       case OP_SCOND:
916       case OP_SET_SOM:
917       case OP_SKIP:
918       case OP_SKIP_ARG:
919       case OP_SOD:
920       case OP_SOM:
921       case OP_THEN:
922       case OP_THEN_ARG:
923       return SSB_FAIL;
924 
925       /* A "real" property test implies no starting bits, but the fake property
926       PT_CLIST identifies a list of characters. These lists are short, as they
927       are used for characters with more than one "other case", so there is no
928       point in recognizing them for OP_NOTPROP. */
929 
930       case OP_PROP:
931       if (tcode[1] != PT_CLIST) return SSB_FAIL;
932         {
933         const pcre_uint32 *p = PRIV(ucd_caseless_sets) + tcode[2];
934         while ((c = *p++) < NOTACHAR)
935           {
936 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
937           if (utf)
938             {
939             pcre_uchar buff[6];
940             (void)PRIV(ord2utf)(c, buff);
941             c = buff[0];
942             }
943 #endif
944           if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
945           }
946         }
947       try_next = FALSE;
948       break;
949 
950       /* We can ignore word boundary tests. */
951 
952       case OP_WORD_BOUNDARY:
953       case OP_NOT_WORD_BOUNDARY:
954       tcode++;
955       break;
956 
957       /* If we hit a bracket or a positive lookahead assertion, recurse to set
958       bits from within the subpattern. If it can't find anything, we have to
959       give up. If it finds some mandatory character(s), we are done for this
960       branch. Otherwise, carry on scanning after the subpattern. */
961 
962       case OP_BRA:
963       case OP_SBRA:
964       case OP_CBRA:
965       case OP_SCBRA:
966       case OP_BRAPOS:
967       case OP_SBRAPOS:
968       case OP_CBRAPOS:
969       case OP_SCBRAPOS:
970       case OP_ONCE:
971       case OP_ONCE_NC:
972       case OP_ASSERT:
973       rc = set_start_bits(tcode, start_bits, utf, cd);
974       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
975       if (rc == SSB_DONE) try_next = FALSE; else
976         {
977         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
978         tcode += 1 + LINK_SIZE;
979         }
980       break;
981 
982       /* If we hit ALT or KET, it means we haven't found anything mandatory in
983       this branch, though we might have found something optional. For ALT, we
984       continue with the next alternative, but we have to arrange that the final
985       result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
986       return SSB_CONTINUE: if this is the top level, that indicates failure,
987       but after a nested subpattern, it causes scanning to continue. */
988 
989       case OP_ALT:
990       yield = SSB_CONTINUE;
991       try_next = FALSE;
992       break;
993 
994       case OP_KET:
995       case OP_KETRMAX:
996       case OP_KETRMIN:
997       case OP_KETRPOS:
998       return SSB_CONTINUE;
999 
1000       /* Skip over callout */
1001 
1002       case OP_CALLOUT:
1003       tcode += 2 + 2*LINK_SIZE;
1004       break;
1005 
1006       /* Skip over lookbehind and negative lookahead assertions */
1007 
1008       case OP_ASSERT_NOT:
1009       case OP_ASSERTBACK:
1010       case OP_ASSERTBACK_NOT:
1011       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1012       tcode += 1 + LINK_SIZE;
1013       break;
1014 
1015       /* BRAZERO does the bracket, but carries on. */
1016 
1017       case OP_BRAZERO:
1018       case OP_BRAMINZERO:
1019       case OP_BRAPOSZERO:
1020       rc = set_start_bits(++tcode, start_bits, utf, cd);
1021       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
1022 /* =========================================================================
1023       See the comment at the head of this function concerning the next line,
1024       which was an old fudge for the benefit of OS/2.
1025       dummy = 1;
1026   ========================================================================= */
1027       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1028       tcode += 1 + LINK_SIZE;
1029       break;
1030 
1031       /* SKIPZERO skips the bracket. */
1032 
1033       case OP_SKIPZERO:
1034       tcode++;
1035       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1036       tcode += 1 + LINK_SIZE;
1037       break;
1038 
1039       /* Single-char * or ? sets the bit and tries the next item */
1040 
1041       case OP_STAR:
1042       case OP_MINSTAR:
1043       case OP_POSSTAR:
1044       case OP_QUERY:
1045       case OP_MINQUERY:
1046       case OP_POSQUERY:
1047       tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
1048       break;
1049 
1050       case OP_STARI:
1051       case OP_MINSTARI:
1052       case OP_POSSTARI:
1053       case OP_QUERYI:
1054       case OP_MINQUERYI:
1055       case OP_POSQUERYI:
1056       tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
1057       break;
1058 
1059       /* Single-char upto sets the bit and tries the next */
1060 
1061       case OP_UPTO:
1062       case OP_MINUPTO:
1063       case OP_POSUPTO:
1064       tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
1065       break;
1066 
1067       case OP_UPTOI:
1068       case OP_MINUPTOI:
1069       case OP_POSUPTOI:
1070       tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
1071       break;
1072 
1073       /* At least one single char sets the bit and stops */
1074 
1075       case OP_EXACT:
1076       tcode += IMM2_SIZE;
1077       /* Fall through */
1078       case OP_CHAR:
1079       case OP_PLUS:
1080       case OP_MINPLUS:
1081       case OP_POSPLUS:
1082       (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
1083       try_next = FALSE;
1084       break;
1085 
1086       case OP_EXACTI:
1087       tcode += IMM2_SIZE;
1088       /* Fall through */
1089       case OP_CHARI:
1090       case OP_PLUSI:
1091       case OP_MINPLUSI:
1092       case OP_POSPLUSI:
1093       (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
1094       try_next = FALSE;
1095       break;
1096 
1097       /* Special spacing and line-terminating items. These recognize specific
1098       lists of characters. The difference between VSPACE and ANYNL is that the
1099       latter can match the two-character CRLF sequence, but that is not
1100       relevant for finding the first character, so their code here is
1101       identical. */
1102 
1103       case OP_HSPACE:
1104       SET_BIT(CHAR_HT);
1105       SET_BIT(CHAR_SPACE);
1106 #ifdef SUPPORT_UTF
1107       if (utf)
1108         {
1109 #ifdef COMPILE_PCRE8
1110         SET_BIT(0xC2);  /* For U+00A0 */
1111         SET_BIT(0xE1);  /* For U+1680, U+180E */
1112         SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1113         SET_BIT(0xE3);  /* For U+3000 */
1114 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1115         SET_BIT(0xA0);
1116         SET_BIT(0xFF);  /* For characters > 255 */
1117 #endif  /* COMPILE_PCRE[8|16|32] */
1118         }
1119       else
1120 #endif /* SUPPORT_UTF */
1121         {
1122 #ifndef EBCDIC
1123         SET_BIT(0xA0);
1124 #endif  /* Not EBCDIC */
1125 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1126         SET_BIT(0xFF);  /* For characters > 255 */
1127 #endif  /* COMPILE_PCRE[16|32] */
1128         }
1129       try_next = FALSE;
1130       break;
1131 
1132       case OP_ANYNL:
1133       case OP_VSPACE:
1134       SET_BIT(CHAR_LF);
1135       SET_BIT(CHAR_VT);
1136       SET_BIT(CHAR_FF);
1137       SET_BIT(CHAR_CR);
1138 #ifdef SUPPORT_UTF
1139       if (utf)
1140         {
1141 #ifdef COMPILE_PCRE8
1142         SET_BIT(0xC2);  /* For U+0085 */
1143         SET_BIT(0xE2);  /* For U+2028, U+2029 */
1144 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1145         SET_BIT(CHAR_NEL);
1146         SET_BIT(0xFF);  /* For characters > 255 */
1147 #endif  /* COMPILE_PCRE[8|16|32] */
1148         }
1149       else
1150 #endif /* SUPPORT_UTF */
1151         {
1152         SET_BIT(CHAR_NEL);
1153 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1154         SET_BIT(0xFF);  /* For characters > 255 */
1155 #endif
1156         }
1157       try_next = FALSE;
1158       break;
1159 
1160       /* Single character types set the bits and stop. Note that if PCRE_UCP
1161       is set, we do not see these op codes because \d etc are converted to
1162       properties. Therefore, these apply in the case when only characters less
1163       than 256 are recognized to match the types. */
1164 
1165       case OP_NOT_DIGIT:
1166       set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1167       try_next = FALSE;
1168       break;
1169 
1170       case OP_DIGIT:
1171       set_type_bits(start_bits, cbit_digit, table_limit, cd);
1172       try_next = FALSE;
1173       break;
1174 
1175       /* The cbit_space table has vertical tab as whitespace; we no longer
1176       have to play fancy tricks because Perl added VT to its whitespace at
1177       release 5.18. PCRE added it at release 8.34. */
1178 
1179       case OP_NOT_WHITESPACE:
1180       set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1181       try_next = FALSE;
1182       break;
1183 
1184       case OP_WHITESPACE:
1185       set_type_bits(start_bits, cbit_space, table_limit, cd);
1186       try_next = FALSE;
1187       break;
1188 
1189       case OP_NOT_WORDCHAR:
1190       set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1191       try_next = FALSE;
1192       break;
1193 
1194       case OP_WORDCHAR:
1195       set_type_bits(start_bits, cbit_word, table_limit, cd);
1196       try_next = FALSE;
1197       break;
1198 
1199       /* One or more character type fudges the pointer and restarts, knowing
1200       it will hit a single character type and stop there. */
1201 
1202       case OP_TYPEPLUS:
1203       case OP_TYPEMINPLUS:
1204       case OP_TYPEPOSPLUS:
1205       tcode++;
1206       break;
1207 
1208       case OP_TYPEEXACT:
1209       tcode += 1 + IMM2_SIZE;
1210       break;
1211 
1212       /* Zero or more repeats of character types set the bits and then
1213       try again. */
1214 
1215       case OP_TYPEUPTO:
1216       case OP_TYPEMINUPTO:
1217       case OP_TYPEPOSUPTO:
1218       tcode += IMM2_SIZE;  /* Fall through */
1219 
1220       case OP_TYPESTAR:
1221       case OP_TYPEMINSTAR:
1222       case OP_TYPEPOSSTAR:
1223       case OP_TYPEQUERY:
1224       case OP_TYPEMINQUERY:
1225       case OP_TYPEPOSQUERY:
1226       switch(tcode[1])
1227         {
1228         default:
1229         case OP_ANY:
1230         case OP_ALLANY:
1231         return SSB_FAIL;
1232 
1233         case OP_HSPACE:
1234         SET_BIT(CHAR_HT);
1235         SET_BIT(CHAR_SPACE);
1236 #ifdef SUPPORT_UTF
1237         if (utf)
1238           {
1239 #ifdef COMPILE_PCRE8
1240           SET_BIT(0xC2);  /* For U+00A0 */
1241           SET_BIT(0xE1);  /* For U+1680, U+180E */
1242           SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1243           SET_BIT(0xE3);  /* For U+3000 */
1244 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1245           SET_BIT(0xA0);
1246           SET_BIT(0xFF);  /* For characters > 255 */
1247 #endif  /* COMPILE_PCRE[8|16|32] */
1248           }
1249         else
1250 #endif /* SUPPORT_UTF */
1251 #ifndef EBCDIC
1252           SET_BIT(0xA0);
1253 #endif  /* Not EBCDIC */
1254         break;
1255 
1256         case OP_ANYNL:
1257         case OP_VSPACE:
1258         SET_BIT(CHAR_LF);
1259         SET_BIT(CHAR_VT);
1260         SET_BIT(CHAR_FF);
1261         SET_BIT(CHAR_CR);
1262 #ifdef SUPPORT_UTF
1263         if (utf)
1264           {
1265 #ifdef COMPILE_PCRE8
1266           SET_BIT(0xC2);  /* For U+0085 */
1267           SET_BIT(0xE2);  /* For U+2028, U+2029 */
1268 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1269           SET_BIT(CHAR_NEL);
1270           SET_BIT(0xFF);  /* For characters > 255 */
1271 #endif  /* COMPILE_PCRE16 */
1272           }
1273         else
1274 #endif /* SUPPORT_UTF */
1275           SET_BIT(CHAR_NEL);
1276         break;
1277 
1278         case OP_NOT_DIGIT:
1279         set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1280         break;
1281 
1282         case OP_DIGIT:
1283         set_type_bits(start_bits, cbit_digit, table_limit, cd);
1284         break;
1285 
1286         /* The cbit_space table has vertical tab as whitespace; we no longer
1287         have to play fancy tricks because Perl added VT to its whitespace at
1288         release 5.18. PCRE added it at release 8.34. */
1289 
1290         case OP_NOT_WHITESPACE:
1291         set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1292         break;
1293 
1294         case OP_WHITESPACE:
1295         set_type_bits(start_bits, cbit_space, table_limit, cd);
1296         break;
1297 
1298         case OP_NOT_WORDCHAR:
1299         set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1300         break;
1301 
1302         case OP_WORDCHAR:
1303         set_type_bits(start_bits, cbit_word, table_limit, cd);
1304         break;
1305         }
1306 
1307       tcode += 2;
1308       break;
1309 
1310       /* Character class where all the information is in a bit map: set the
1311       bits and either carry on or not, according to the repeat count. If it was
1312       a negative class, and we are operating with UTF-8 characters, any byte
1313       with a value >= 0xc4 is a potentially valid starter because it starts a
1314       character with a value > 255. */
1315 
1316 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1317       case OP_XCLASS:
1318       if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0)
1319         return SSB_FAIL;
1320       /* All bits are set. */
1321       if ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0 && (tcode[1 + LINK_SIZE] & XCL_NOT) != 0)
1322         return SSB_FAIL;
1323 #endif
1324       /* Fall through */
1325 
1326       case OP_NCLASS:
1327 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1328       if (utf)
1329         {
1330         start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
1331         memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
1332         }
1333 #endif
1334 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1335       SET_BIT(0xFF);                         /* For characters > 255 */
1336 #endif
1337       /* Fall through */
1338 
1339       case OP_CLASS:
1340         {
1341         pcre_uint8 *map;
1342 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1343         map = NULL;
1344         if (*tcode == OP_XCLASS)
1345           {
1346           if ((tcode[1 + LINK_SIZE] & XCL_MAP) != 0)
1347             map = (pcre_uint8 *)(tcode + 1 + LINK_SIZE + 1);
1348           tcode += GET(tcode, 1);
1349           }
1350         else
1351 #endif
1352           {
1353           tcode++;
1354           map = (pcre_uint8 *)tcode;
1355           tcode += 32 / sizeof(pcre_uchar);
1356           }
1357 
1358         /* In UTF-8 mode, the bits in a bit map correspond to character
1359         values, not to byte values. However, the bit map we are constructing is
1360         for byte values. So we have to do a conversion for characters whose
1361         value is > 127. In fact, there are only two possible starting bytes for
1362         characters in the range 128 - 255. */
1363 
1364 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1365         if (map != NULL)
1366 #endif
1367           {
1368 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1369           if (utf)
1370             {
1371             for (c = 0; c < 16; c++) start_bits[c] |= map[c];
1372             for (c = 128; c < 256; c++)
1373               {
1374               if ((map[c/8] & (1 << (c&7))) != 0)
1375                 {
1376                 int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
1377                 start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
1378                 c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
1379                 }
1380               }
1381             }
1382           else
1383 #endif
1384             {
1385             /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1386             for (c = 0; c < 32; c++) start_bits[c] |= map[c];
1387             }
1388           }
1389 
1390         /* Advance past the bit map, and act on what follows. For a zero
1391         minimum repeat, continue; otherwise stop processing. */
1392 
1393         switch (*tcode)
1394           {
1395           case OP_CRSTAR:
1396           case OP_CRMINSTAR:
1397           case OP_CRQUERY:
1398           case OP_CRMINQUERY:
1399           case OP_CRPOSSTAR:
1400           case OP_CRPOSQUERY:
1401           tcode++;
1402           break;
1403 
1404           case OP_CRRANGE:
1405           case OP_CRMINRANGE:
1406           case OP_CRPOSRANGE:
1407           if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1408             else try_next = FALSE;
1409           break;
1410 
1411           default:
1412           try_next = FALSE;
1413           break;
1414           }
1415         }
1416       break; /* End of bitmap class handling */
1417 
1418       }      /* End of switch */
1419     }        /* End of try_next loop */
1420 
1421   code += GET(code, 1);   /* Advance to next branch */
1422   }
1423 while (*code == OP_ALT);
1424 return yield;
1425 }
1426 
1427 
1428 
1429 
1430 
1431 /*************************************************
1432 *          Study a compiled expression           *
1433 *************************************************/
1434 
1435 /* This function is handed a compiled expression that it must study to produce
1436 information that will speed up the matching. It returns a pcre[16]_extra block
1437 which then gets handed back to pcre_exec().
1438 
1439 Arguments:
1440   re        points to the compiled expression
1441   options   contains option bits
1442   errorptr  points to where to place error messages;
1443             set NULL unless error
1444 
1445 Returns:    pointer to a pcre[16]_extra block, with study_data filled in and
1446               the appropriate flags set;
1447             NULL on error or if no optimization possible
1448 */
1449 
1450 #if defined COMPILE_PCRE8
1451 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
pcre_study(const pcre * external_re,int options,const char ** errorptr)1452 pcre_study(const pcre *external_re, int options, const char **errorptr)
1453 #elif defined COMPILE_PCRE16
1454 PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
1455 pcre16_study(const pcre16 *external_re, int options, const char **errorptr)
1456 #elif defined COMPILE_PCRE32
1457 PCRE_EXP_DEFN pcre32_extra * PCRE_CALL_CONVENTION
1458 pcre32_study(const pcre32 *external_re, int options, const char **errorptr)
1459 #endif
1460 {
1461 int min;
1462 int count = 0;
1463 BOOL bits_set = FALSE;
1464 pcre_uint8 start_bits[32];
1465 PUBL(extra) *extra = NULL;
1466 pcre_study_data *study;
1467 const pcre_uint8 *tables;
1468 pcre_uchar *code;
1469 compile_data compile_block;
1470 const REAL_PCRE *re = (const REAL_PCRE *)external_re;
1471 
1472 
1473 *errorptr = NULL;
1474 
1475 if (re == NULL || re->magic_number != MAGIC_NUMBER)
1476   {
1477   *errorptr = "argument is not a compiled regular expression";
1478   return NULL;
1479   }
1480 
1481 if ((re->flags & PCRE_MODE) == 0)
1482   {
1483 #if defined COMPILE_PCRE8
1484   *errorptr = "argument not compiled in 8 bit mode";
1485 #elif defined COMPILE_PCRE16
1486   *errorptr = "argument not compiled in 16 bit mode";
1487 #elif defined COMPILE_PCRE32
1488   *errorptr = "argument not compiled in 32 bit mode";
1489 #endif
1490   return NULL;
1491   }
1492 
1493 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1494   {
1495   *errorptr = "unknown or incorrect option bit(s) set";
1496   return NULL;
1497   }
1498 
1499 code = (pcre_uchar *)re + re->name_table_offset +
1500   (re->name_count * re->name_entry_size);
1501 
1502 /* For an anchored pattern, or an unanchored pattern that has a first char, or
1503 a multiline pattern that matches only at "line starts", there is no point in
1504 seeking a list of starting bytes. */
1505 
1506 if ((re->options & PCRE_ANCHORED) == 0 &&
1507     (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1508   {
1509   int rc;
1510 
1511   /* Set the character tables in the block that is passed around */
1512 
1513   tables = re->tables;
1514 
1515 #if defined COMPILE_PCRE8
1516   if (tables == NULL)
1517     (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1518     (void *)(&tables));
1519 #elif defined COMPILE_PCRE16
1520   if (tables == NULL)
1521     (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1522     (void *)(&tables));
1523 #elif defined COMPILE_PCRE32
1524   if (tables == NULL)
1525     (void)pcre32_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1526     (void *)(&tables));
1527 #endif
1528 
1529   compile_block.lcc = tables + lcc_offset;
1530   compile_block.fcc = tables + fcc_offset;
1531   compile_block.cbits = tables + cbits_offset;
1532   compile_block.ctypes = tables + ctypes_offset;
1533 
1534   /* See if we can find a fixed set of initial characters for the pattern. */
1535 
1536   memset(start_bits, 0, 32 * sizeof(pcre_uint8));
1537   rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1538     &compile_block);
1539   bits_set = rc == SSB_DONE;
1540   if (rc == SSB_UNKNOWN)
1541     {
1542     *errorptr = "internal error: opcode not recognized";
1543     return NULL;
1544     }
1545   }
1546 
1547 /* Find the minimum length of subject string. */
1548 
1549 switch(min = find_minlength(re, code, code, re->options, NULL, &count))
1550   {
1551   case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
1552   case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
1553   default: break;
1554   }
1555 
1556 /* If a set of starting bytes has been identified, or if the minimum length is
1557 greater than zero, or if JIT optimization has been requested, or if
1558 PCRE_STUDY_EXTRA_NEEDED is set, get a pcre[16]_extra block and a
1559 pcre_study_data block. The study data is put in the latter, which is pointed to
1560 by the former, which may also get additional data set later by the calling
1561 program. At the moment, the size of pcre_study_data is fixed. We nevertheless
1562 save it in a field for returning via the pcre_fullinfo() function so that if it
1563 becomes variable in the future, we don't have to change that code. */
1564 
1565 if (bits_set || min > 0 || (options & (
1566 #ifdef SUPPORT_JIT
1567     PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE |
1568     PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE |
1569 #endif
1570     PCRE_STUDY_EXTRA_NEEDED)) != 0)
1571   {
1572   extra = (PUBL(extra) *)(PUBL(malloc))
1573     (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
1574   if (extra == NULL)
1575     {
1576     *errorptr = "failed to get memory";
1577     return NULL;
1578     }
1579 
1580   study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
1581   extra->flags = PCRE_EXTRA_STUDY_DATA;
1582   extra->study_data = study;
1583 
1584   study->size = sizeof(pcre_study_data);
1585   study->flags = 0;
1586 
1587   /* Set the start bits always, to avoid unset memory errors if the
1588   study data is written to a file, but set the flag only if any of the bits
1589   are set, to save time looking when none are. */
1590 
1591   if (bits_set)
1592     {
1593     study->flags |= PCRE_STUDY_MAPPED;
1594     memcpy(study->start_bits, start_bits, sizeof(start_bits));
1595     }
1596   else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
1597 
1598 #ifdef PCRE_DEBUG
1599   if (bits_set)
1600     {
1601     pcre_uint8 *ptr = start_bits;
1602     int i;
1603 
1604     printf("Start bits:\n");
1605     for (i = 0; i < 32; i++)
1606       printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
1607     }
1608 #endif
1609 
1610   /* Always set the minlength value in the block, because the JIT compiler
1611   makes use of it. However, don't set the bit unless the length is greater than
1612   zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
1613   checking the zero case. */
1614 
1615   if (min > 0)
1616     {
1617     study->flags |= PCRE_STUDY_MINLEN;
1618     study->minlength = min;
1619     }
1620   else study->minlength = 0;
1621 
1622   /* If JIT support was compiled and requested, attempt the JIT compilation.
1623   If no starting bytes were found, and the minimum length is zero, and JIT
1624   compilation fails, abandon the extra block and return NULL, unless
1625   PCRE_STUDY_EXTRA_NEEDED is set. */
1626 
1627 #ifdef SUPPORT_JIT
1628   extra->executable_jit = NULL;
1629   if ((options & PCRE_STUDY_JIT_COMPILE) != 0)
1630     PRIV(jit_compile)(re, extra, JIT_COMPILE);
1631   if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0)
1632     PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE);
1633   if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0)
1634     PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE);
1635 
1636   if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0 &&
1637       (options & PCRE_STUDY_EXTRA_NEEDED) == 0)
1638     {
1639 #if defined COMPILE_PCRE8
1640     pcre_free_study(extra);
1641 #elif defined COMPILE_PCRE16
1642     pcre16_free_study(extra);
1643 #elif defined COMPILE_PCRE32
1644     pcre32_free_study(extra);
1645 #endif
1646     extra = NULL;
1647     }
1648 #endif
1649   }
1650 
1651 return extra;
1652 }
1653 
1654 
1655 /*************************************************
1656 *          Free the study data                   *
1657 *************************************************/
1658 
1659 /* This function frees the memory that was obtained by pcre_study().
1660 
1661 Argument:   a pointer to the pcre[16]_extra block
1662 Returns:    nothing
1663 */
1664 
1665 #if defined COMPILE_PCRE8
1666 PCRE_EXP_DEFN void
pcre_free_study(pcre_extra * extra)1667 pcre_free_study(pcre_extra *extra)
1668 #elif defined COMPILE_PCRE16
1669 PCRE_EXP_DEFN void
1670 pcre16_free_study(pcre16_extra *extra)
1671 #elif defined COMPILE_PCRE32
1672 PCRE_EXP_DEFN void
1673 pcre32_free_study(pcre32_extra *extra)
1674 #endif
1675 {
1676 if (extra == NULL)
1677   return;
1678 #ifdef SUPPORT_JIT
1679 if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
1680      extra->executable_jit != NULL)
1681   PRIV(jit_free)(extra->executable_jit);
1682 #endif
1683 PUBL(free)(extra);
1684 }
1685 
1686 /* End of pcre_study.c */
1687