xref: /PHP-5.4/ext/pcre/pcrelib/pcre_study.c (revision 95fa7279)
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9            Copyright (c) 1997-2012 University of Cambridge
10 
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14 
15     * Redistributions of source code must retain the above copyright notice,
16       this list of conditions and the following disclaimer.
17 
18     * Redistributions in binary form must reproduce the above copyright
19       notice, this list of conditions and the following disclaimer in the
20       documentation and/or other materials provided with the distribution.
21 
22     * Neither the name of the University of Cambridge nor the names of its
23       contributors may be used to endorse or promote products derived from
24       this software without specific prior written permission.
25 
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39 
40 
41 /* This module contains the external function pcre_study(), along with local
42 supporting functions. */
43 
44 
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48 
49 #include "pcre_internal.h"
50 
51 #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
52 
53 /* Returns from set_start_bits() */
54 
55 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
56 
57 
58 
59 /*************************************************
60 *   Find the minimum subject length for a group  *
61 *************************************************/
62 
63 /* Scan a parenthesized group and compute the minimum length of subject that
64 is needed to match it. This is a lower bound; it does not mean there is a
65 string of that length that matches. In UTF8 mode, the result is in characters
66 rather than bytes.
67 
68 Arguments:
69   re              compiled pattern block
70   code            pointer to start of group (the bracket)
71   startcode       pointer to start of the whole pattern's code
72   options         the compiling options
73   recurses        chain of recurse_check to catch mutual recursion
74 
75 Returns:   the minimum length
76            -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
77            -2 internal error (missing capturing bracket)
78            -3 internal error (opcode not listed)
79 */
80 
81 static int
find_minlength(const REAL_PCRE * re,const pcre_uchar * code,const pcre_uchar * startcode,int options,recurse_check * recurses)82 find_minlength(const REAL_PCRE *re, const pcre_uchar *code,
83   const pcre_uchar *startcode, int options, recurse_check *recurses)
84 {
85 int length = -1;
86 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
87 BOOL utf = (options & PCRE_UTF8) != 0;
88 BOOL had_recurse = FALSE;
89 recurse_check this_recurse;
90 register int branchlength = 0;
91 register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
92 
93 if (*code == OP_CBRA || *code == OP_SCBRA ||
94     *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
95 
96 /* Scan along the opcodes for this branch. If we get to the end of the
97 branch, check the length against that of the other branches. */
98 
99 for (;;)
100   {
101   int d, min;
102   pcre_uchar *cs, *ce;
103   register pcre_uchar op = *cc;
104 
105   switch (op)
106     {
107     case OP_COND:
108     case OP_SCOND:
109 
110     /* If there is only one branch in a condition, the implied branch has zero
111     length, so we don't add anything. This covers the DEFINE "condition"
112     automatically. */
113 
114     cs = cc + GET(cc, 1);
115     if (*cs != OP_ALT)
116       {
117       cc = cs + 1 + LINK_SIZE;
118       break;
119       }
120 
121     /* Otherwise we can fall through and treat it the same as any other
122     subpattern. */
123 
124     case OP_CBRA:
125     case OP_SCBRA:
126     case OP_BRA:
127     case OP_SBRA:
128     case OP_CBRAPOS:
129     case OP_SCBRAPOS:
130     case OP_BRAPOS:
131     case OP_SBRAPOS:
132     case OP_ONCE:
133     case OP_ONCE_NC:
134     d = find_minlength(re, cc, startcode, options, recurses);
135     if (d < 0) return d;
136     branchlength += d;
137     do cc += GET(cc, 1); while (*cc == OP_ALT);
138     cc += 1 + LINK_SIZE;
139     break;
140 
141     /* ACCEPT makes things far too complicated; we have to give up. */
142 
143     case OP_ACCEPT:
144     case OP_ASSERT_ACCEPT:
145     return -1;
146 
147     /* Reached end of a branch; if it's a ket it is the end of a nested
148     call. If it's ALT it is an alternation in a nested call. If it is END it's
149     the end of the outer call. All can be handled by the same code. If an
150     ACCEPT was previously encountered, use the length that was in force at that
151     time, and pass back the shortest ACCEPT length. */
152 
153     case OP_ALT:
154     case OP_KET:
155     case OP_KETRMAX:
156     case OP_KETRMIN:
157     case OP_KETRPOS:
158     case OP_END:
159     if (length < 0 || (!had_recurse && branchlength < length))
160       length = branchlength;
161     if (op != OP_ALT) return length;
162     cc += 1 + LINK_SIZE;
163     branchlength = 0;
164     had_recurse = FALSE;
165     break;
166 
167     /* Skip over assertive subpatterns */
168 
169     case OP_ASSERT:
170     case OP_ASSERT_NOT:
171     case OP_ASSERTBACK:
172     case OP_ASSERTBACK_NOT:
173     do cc += GET(cc, 1); while (*cc == OP_ALT);
174     /* Fall through */
175 
176     /* Skip over things that don't match chars */
177 
178     case OP_REVERSE:
179     case OP_CREF:
180     case OP_DNCREF:
181     case OP_RREF:
182     case OP_DNRREF:
183     case OP_DEF:
184     case OP_CALLOUT:
185     case OP_SOD:
186     case OP_SOM:
187     case OP_EOD:
188     case OP_EODN:
189     case OP_CIRC:
190     case OP_CIRCM:
191     case OP_DOLL:
192     case OP_DOLLM:
193     case OP_NOT_WORD_BOUNDARY:
194     case OP_WORD_BOUNDARY:
195     cc += PRIV(OP_lengths)[*cc];
196     break;
197 
198     /* Skip over a subpattern that has a {0} or {0,x} quantifier */
199 
200     case OP_BRAZERO:
201     case OP_BRAMINZERO:
202     case OP_BRAPOSZERO:
203     case OP_SKIPZERO:
204     cc += PRIV(OP_lengths)[*cc];
205     do cc += GET(cc, 1); while (*cc == OP_ALT);
206     cc += 1 + LINK_SIZE;
207     break;
208 
209     /* Handle literal characters and + repetitions */
210 
211     case OP_CHAR:
212     case OP_CHARI:
213     case OP_NOT:
214     case OP_NOTI:
215     case OP_PLUS:
216     case OP_PLUSI:
217     case OP_MINPLUS:
218     case OP_MINPLUSI:
219     case OP_POSPLUS:
220     case OP_POSPLUSI:
221     case OP_NOTPLUS:
222     case OP_NOTPLUSI:
223     case OP_NOTMINPLUS:
224     case OP_NOTMINPLUSI:
225     case OP_NOTPOSPLUS:
226     case OP_NOTPOSPLUSI:
227     branchlength++;
228     cc += 2;
229 #ifdef SUPPORT_UTF
230     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
231 #endif
232     break;
233 
234     case OP_TYPEPLUS:
235     case OP_TYPEMINPLUS:
236     case OP_TYPEPOSPLUS:
237     branchlength++;
238     cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
239     break;
240 
241     /* Handle exact repetitions. The count is already in characters, but we
242     need to skip over a multibyte character in UTF8 mode.  */
243 
244     case OP_EXACT:
245     case OP_EXACTI:
246     case OP_NOTEXACT:
247     case OP_NOTEXACTI:
248     branchlength += GET2(cc,1);
249     cc += 2 + IMM2_SIZE;
250 #ifdef SUPPORT_UTF
251     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
252 #endif
253     break;
254 
255     case OP_TYPEEXACT:
256     branchlength += GET2(cc,1);
257     cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
258       || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
259     break;
260 
261     /* Handle single-char non-literal matchers */
262 
263     case OP_PROP:
264     case OP_NOTPROP:
265     cc += 2;
266     /* Fall through */
267 
268     case OP_NOT_DIGIT:
269     case OP_DIGIT:
270     case OP_NOT_WHITESPACE:
271     case OP_WHITESPACE:
272     case OP_NOT_WORDCHAR:
273     case OP_WORDCHAR:
274     case OP_ANY:
275     case OP_ALLANY:
276     case OP_EXTUNI:
277     case OP_HSPACE:
278     case OP_NOT_HSPACE:
279     case OP_VSPACE:
280     case OP_NOT_VSPACE:
281     branchlength++;
282     cc++;
283     break;
284 
285     /* "Any newline" might match two characters, but it also might match just
286     one. */
287 
288     case OP_ANYNL:
289     branchlength += 1;
290     cc++;
291     break;
292 
293     /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
294     non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
295     appear, but leave the code, just in case.) */
296 
297     case OP_ANYBYTE:
298 #ifdef SUPPORT_UTF
299     if (utf) return -1;
300 #endif
301     branchlength++;
302     cc++;
303     break;
304 
305     /* For repeated character types, we have to test for \p and \P, which have
306     an extra two bytes of parameters. */
307 
308     case OP_TYPESTAR:
309     case OP_TYPEMINSTAR:
310     case OP_TYPEQUERY:
311     case OP_TYPEMINQUERY:
312     case OP_TYPEPOSSTAR:
313     case OP_TYPEPOSQUERY:
314     if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
315     cc += PRIV(OP_lengths)[op];
316     break;
317 
318     case OP_TYPEUPTO:
319     case OP_TYPEMINUPTO:
320     case OP_TYPEPOSUPTO:
321     if (cc[1 + IMM2_SIZE] == OP_PROP
322       || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
323     cc += PRIV(OP_lengths)[op];
324     break;
325 
326     /* Check a class for variable quantification */
327 
328     case OP_CLASS:
329     case OP_NCLASS:
330 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
331     case OP_XCLASS:
332     /* The original code caused an unsigned overflow in 64 bit systems,
333     so now we use a conditional statement. */
334     if (op == OP_XCLASS)
335       cc += GET(cc, 1);
336     else
337       cc += PRIV(OP_lengths)[OP_CLASS];
338 #else
339     cc += PRIV(OP_lengths)[OP_CLASS];
340 #endif
341 
342     switch (*cc)
343       {
344       case OP_CRPLUS:
345       case OP_CRMINPLUS:
346       case OP_CRPOSPLUS:
347       branchlength++;
348       /* Fall through */
349 
350       case OP_CRSTAR:
351       case OP_CRMINSTAR:
352       case OP_CRQUERY:
353       case OP_CRMINQUERY:
354       case OP_CRPOSSTAR:
355       case OP_CRPOSQUERY:
356       cc++;
357       break;
358 
359       case OP_CRRANGE:
360       case OP_CRMINRANGE:
361       case OP_CRPOSRANGE:
362       branchlength += GET2(cc,1);
363       cc += 1 + 2 * IMM2_SIZE;
364       break;
365 
366       default:
367       branchlength++;
368       break;
369       }
370     break;
371 
372     /* Backreferences and subroutine calls are treated in the same way: we find
373     the minimum length for the subpattern. A recursion, however, causes an
374     a flag to be set that causes the length of this branch to be ignored. The
375     logic is that a recursion can only make sense if there is another
376     alternation that stops the recursing. That will provide the minimum length
377     (when no recursion happens). A backreference within the group that it is
378     referencing behaves in the same way.
379 
380     If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
381     matches an empty string (by default it causes a matching failure), so in
382     that case we must set the minimum length to zero. */
383 
384     case OP_DNREF:     /* Duplicate named pattern back reference */
385     case OP_DNREFI:
386     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
387       {
388       int count = GET2(cc, 1+IMM2_SIZE);
389       pcre_uchar *slot = (pcre_uchar *)re +
390         re->name_table_offset + GET2(cc, 1) * re->name_entry_size;
391       d = INT_MAX;
392       while (count-- > 0)
393         {
394         ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
395         if (cs == NULL) return -2;
396         do ce += GET(ce, 1); while (*ce == OP_ALT);
397         if (cc > cs && cc < ce)     /* Simple recursion */
398           {
399           d = 0;
400           had_recurse = TRUE;
401           break;
402           }
403         else
404           {
405           recurse_check *r = recurses;
406           for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
407           if (r != NULL)           /* Mutual recursion */
408             {
409             d = 0;
410             had_recurse = TRUE;
411             break;
412             }
413           else
414             {
415             int dd;
416             this_recurse.prev = recurses;
417             this_recurse.group = cs;
418             dd = find_minlength(re, cs, startcode, options, &this_recurse);
419             if (dd < d) d = dd;
420             }
421           }
422         slot += re->name_entry_size;
423         }
424       }
425     else d = 0;
426     cc += 1 + 2*IMM2_SIZE;
427     goto REPEAT_BACK_REFERENCE;
428 
429     case OP_REF:      /* Single back reference */
430     case OP_REFI:
431     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
432       {
433       ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
434       if (cs == NULL) return -2;
435       do ce += GET(ce, 1); while (*ce == OP_ALT);
436       if (cc > cs && cc < ce)    /* Simple recursion */
437         {
438         d = 0;
439         had_recurse = TRUE;
440         }
441       else
442         {
443         recurse_check *r = recurses;
444         for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
445         if (r != NULL)           /* Mutual recursion */
446           {
447           d = 0;
448           had_recurse = TRUE;
449           }
450         else
451           {
452           this_recurse.prev = recurses;
453           this_recurse.group = cs;
454           d = find_minlength(re, cs, startcode, options, &this_recurse);
455           }
456         }
457       }
458     else d = 0;
459     cc += 1 + IMM2_SIZE;
460 
461     /* Handle repeated back references */
462 
463     REPEAT_BACK_REFERENCE:
464     switch (*cc)
465       {
466       case OP_CRSTAR:
467       case OP_CRMINSTAR:
468       case OP_CRQUERY:
469       case OP_CRMINQUERY:
470       case OP_CRPOSSTAR:
471       case OP_CRPOSQUERY:
472       min = 0;
473       cc++;
474       break;
475 
476       case OP_CRPLUS:
477       case OP_CRMINPLUS:
478       case OP_CRPOSPLUS:
479       min = 1;
480       cc++;
481       break;
482 
483       case OP_CRRANGE:
484       case OP_CRMINRANGE:
485       case OP_CRPOSRANGE:
486       min = GET2(cc, 1);
487       cc += 1 + 2 * IMM2_SIZE;
488       break;
489 
490       default:
491       min = 1;
492       break;
493       }
494 
495     branchlength += min * d;
496     break;
497 
498     /* We can easily detect direct recursion, but not mutual recursion. This is
499     caught by a recursion depth count. */
500 
501     case OP_RECURSE:
502     cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
503     do ce += GET(ce, 1); while (*ce == OP_ALT);
504     if (cc > cs && cc < ce)    /* Simple recursion */
505       had_recurse = TRUE;
506     else
507       {
508       recurse_check *r = recurses;
509       for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
510       if (r != NULL)           /* Mutual recursion */
511         had_recurse = TRUE;
512       else
513         {
514         this_recurse.prev = recurses;
515         this_recurse.group = cs;
516         branchlength += find_minlength(re, cs, startcode, options,
517           &this_recurse);
518         }
519       }
520     cc += 1 + LINK_SIZE;
521     break;
522 
523     /* Anything else does not or need not match a character. We can get the
524     item's length from the table, but for those that can match zero occurrences
525     of a character, we must take special action for UTF-8 characters. As it
526     happens, the "NOT" versions of these opcodes are used at present only for
527     ASCII characters, so they could be omitted from this list. However, in
528     future that may change, so we include them here so as not to leave a
529     gotcha for a future maintainer. */
530 
531     case OP_UPTO:
532     case OP_UPTOI:
533     case OP_NOTUPTO:
534     case OP_NOTUPTOI:
535     case OP_MINUPTO:
536     case OP_MINUPTOI:
537     case OP_NOTMINUPTO:
538     case OP_NOTMINUPTOI:
539     case OP_POSUPTO:
540     case OP_POSUPTOI:
541     case OP_NOTPOSUPTO:
542     case OP_NOTPOSUPTOI:
543 
544     case OP_STAR:
545     case OP_STARI:
546     case OP_NOTSTAR:
547     case OP_NOTSTARI:
548     case OP_MINSTAR:
549     case OP_MINSTARI:
550     case OP_NOTMINSTAR:
551     case OP_NOTMINSTARI:
552     case OP_POSSTAR:
553     case OP_POSSTARI:
554     case OP_NOTPOSSTAR:
555     case OP_NOTPOSSTARI:
556 
557     case OP_QUERY:
558     case OP_QUERYI:
559     case OP_NOTQUERY:
560     case OP_NOTQUERYI:
561     case OP_MINQUERY:
562     case OP_MINQUERYI:
563     case OP_NOTMINQUERY:
564     case OP_NOTMINQUERYI:
565     case OP_POSQUERY:
566     case OP_POSQUERYI:
567     case OP_NOTPOSQUERY:
568     case OP_NOTPOSQUERYI:
569 
570     cc += PRIV(OP_lengths)[op];
571 #ifdef SUPPORT_UTF
572     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
573 #endif
574     break;
575 
576     /* Skip these, but we need to add in the name length. */
577 
578     case OP_MARK:
579     case OP_PRUNE_ARG:
580     case OP_SKIP_ARG:
581     case OP_THEN_ARG:
582     cc += PRIV(OP_lengths)[op] + cc[1];
583     break;
584 
585     /* The remaining opcodes are just skipped over. */
586 
587     case OP_CLOSE:
588     case OP_COMMIT:
589     case OP_FAIL:
590     case OP_PRUNE:
591     case OP_SET_SOM:
592     case OP_SKIP:
593     case OP_THEN:
594     cc += PRIV(OP_lengths)[op];
595     break;
596 
597     /* This should not occur: we list all opcodes explicitly so that when
598     new ones get added they are properly considered. */
599 
600     default:
601     return -3;
602     }
603   }
604 /* Control never gets here */
605 }
606 
607 
608 
609 /*************************************************
610 *      Set a bit and maybe its alternate case    *
611 *************************************************/
612 
613 /* Given a character, set its first byte's bit in the table, and also the
614 corresponding bit for the other version of a letter if we are caseless. In
615 UTF-8 mode, for characters greater than 127, we can only do the caseless thing
616 when Unicode property support is available.
617 
618 Arguments:
619   start_bits    points to the bit map
620   p             points to the character
621   caseless      the caseless flag
622   cd            the block with char table pointers
623   utf           TRUE for UTF-8 / UTF-16 / UTF-32 mode
624 
625 Returns:        pointer after the character
626 */
627 
628 static const pcre_uchar *
set_table_bit(pcre_uint8 * start_bits,const pcre_uchar * p,BOOL caseless,compile_data * cd,BOOL utf)629 set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
630   compile_data *cd, BOOL utf)
631 {
632 pcre_uint32 c = *p;
633 
634 #ifdef COMPILE_PCRE8
635 SET_BIT(c);
636 
637 #ifdef SUPPORT_UTF
638 if (utf && c > 127)
639   {
640   GETCHARINC(c, p);
641 #ifdef SUPPORT_UCP
642   if (caseless)
643     {
644     pcre_uchar buff[6];
645     c = UCD_OTHERCASE(c);
646     (void)PRIV(ord2utf)(c, buff);
647     SET_BIT(buff[0]);
648     }
649 #endif  /* Not SUPPORT_UCP */
650   return p;
651   }
652 #else   /* Not SUPPORT_UTF */
653 (void)(utf);   /* Stops warning for unused parameter */
654 #endif  /* SUPPORT_UTF */
655 
656 /* Not UTF-8 mode, or character is less than 127. */
657 
658 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
659 return p + 1;
660 #endif  /* COMPILE_PCRE8 */
661 
662 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
663 if (c > 0xff)
664   {
665   c = 0xff;
666   caseless = FALSE;
667   }
668 SET_BIT(c);
669 
670 #ifdef SUPPORT_UTF
671 if (utf && c > 127)
672   {
673   GETCHARINC(c, p);
674 #ifdef SUPPORT_UCP
675   if (caseless)
676     {
677     c = UCD_OTHERCASE(c);
678     if (c > 0xff)
679       c = 0xff;
680     SET_BIT(c);
681     }
682 #endif  /* SUPPORT_UCP */
683   return p;
684   }
685 #else   /* Not SUPPORT_UTF */
686 (void)(utf);   /* Stops warning for unused parameter */
687 #endif  /* SUPPORT_UTF */
688 
689 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
690 return p + 1;
691 #endif
692 }
693 
694 
695 
696 /*************************************************
697 *     Set bits for a positive character type     *
698 *************************************************/
699 
700 /* This function sets starting bits for a character type. In UTF-8 mode, we can
701 only do a direct setting for bytes less than 128, as otherwise there can be
702 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
703 environment, the tables will only recognize ASCII characters anyway, but in at
704 least one Windows environment, some higher bytes bits were set in the tables.
705 So we deal with that case by considering the UTF-8 encoding.
706 
707 Arguments:
708   start_bits     the starting bitmap
709   cbit type      the type of character wanted
710   table_limit    32 for non-UTF-8; 16 for UTF-8
711   cd             the block with char table pointers
712 
713 Returns:         nothing
714 */
715 
716 static void
set_type_bits(pcre_uint8 * start_bits,int cbit_type,unsigned int table_limit,compile_data * cd)717 set_type_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
718   compile_data *cd)
719 {
720 register pcre_uint32 c;
721 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
722 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
723 if (table_limit == 32) return;
724 for (c = 128; c < 256; c++)
725   {
726   if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
727     {
728     pcre_uchar buff[6];
729     (void)PRIV(ord2utf)(c, buff);
730     SET_BIT(buff[0]);
731     }
732   }
733 #endif
734 }
735 
736 
737 /*************************************************
738 *     Set bits for a negative character type     *
739 *************************************************/
740 
741 /* This function sets starting bits for a negative character type such as \D.
742 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
743 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
744 Unlike in the positive case, where we can set appropriate starting bits for
745 specific high-valued UTF-8 characters, in this case we have to set the bits for
746 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
747 0xc0 (192) for simplicity.
748 
749 Arguments:
750   start_bits     the starting bitmap
751   cbit type      the type of character wanted
752   table_limit    32 for non-UTF-8; 16 for UTF-8
753   cd             the block with char table pointers
754 
755 Returns:         nothing
756 */
757 
758 static void
set_nottype_bits(pcre_uint8 * start_bits,int cbit_type,unsigned int table_limit,compile_data * cd)759 set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
760   compile_data *cd)
761 {
762 register pcre_uint32 c;
763 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
764 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
765 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
766 #endif
767 }
768 
769 
770 
771 /*************************************************
772 *          Create bitmap of starting bytes       *
773 *************************************************/
774 
775 /* This function scans a compiled unanchored expression recursively and
776 attempts to build a bitmap of the set of possible starting bytes. As time goes
777 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
778 useful for parenthesized groups in patterns such as (a*)b where the group
779 provides some optional starting bytes but scanning must continue at the outer
780 level to find at least one mandatory byte. At the outermost level, this
781 function fails unless the result is SSB_DONE.
782 
783 Arguments:
784   code         points to an expression
785   start_bits   points to a 32-byte table, initialized to 0
786   utf          TRUE if in UTF-8 / UTF-16 / UTF-32 mode
787   cd           the block with char table pointers
788 
789 Returns:       SSB_FAIL     => Failed to find any starting bytes
790                SSB_DONE     => Found mandatory starting bytes
791                SSB_CONTINUE => Found optional starting bytes
792                SSB_UNKNOWN  => Hit an unrecognized opcode
793 */
794 
795 static int
set_start_bits(const pcre_uchar * code,pcre_uint8 * start_bits,BOOL utf,compile_data * cd)796 set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
797   compile_data *cd)
798 {
799 register pcre_uint32 c;
800 int yield = SSB_DONE;
801 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
802 int table_limit = utf? 16:32;
803 #else
804 int table_limit = 32;
805 #endif
806 
807 #if 0
808 /* ========================================================================= */
809 /* The following comment and code was inserted in January 1999. In May 2006,
810 when it was observed to cause compiler warnings about unused values, I took it
811 out again. If anybody is still using OS/2, they will have to put it back
812 manually. */
813 
814 /* This next statement and the later reference to dummy are here in order to
815 trick the optimizer of the IBM C compiler for OS/2 into generating correct
816 code. Apparently IBM isn't going to fix the problem, and we would rather not
817 disable optimization (in this module it actually makes a big difference, and
818 the pcre module can use all the optimization it can get). */
819 
820 volatile int dummy;
821 /* ========================================================================= */
822 #endif
823 
824 do
825   {
826   BOOL try_next = TRUE;
827   const pcre_uchar *tcode = code + 1 + LINK_SIZE;
828 
829   if (*code == OP_CBRA || *code == OP_SCBRA ||
830       *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
831 
832   while (try_next)    /* Loop for items in this branch */
833     {
834     int rc;
835 
836     switch(*tcode)
837       {
838       /* If we reach something we don't understand, it means a new opcode has
839       been created that hasn't been added to this code. Hopefully this problem
840       will be discovered during testing. */
841 
842       default:
843       return SSB_UNKNOWN;
844 
845       /* Fail for a valid opcode that implies no starting bits. */
846 
847       case OP_ACCEPT:
848       case OP_ASSERT_ACCEPT:
849       case OP_ALLANY:
850       case OP_ANY:
851       case OP_ANYBYTE:
852       case OP_CIRC:
853       case OP_CIRCM:
854       case OP_CLOSE:
855       case OP_COMMIT:
856       case OP_COND:
857       case OP_CREF:
858       case OP_DEF:
859       case OP_DNCREF:
860       case OP_DNREF:
861       case OP_DNREFI:
862       case OP_DNRREF:
863       case OP_DOLL:
864       case OP_DOLLM:
865       case OP_END:
866       case OP_EOD:
867       case OP_EODN:
868       case OP_EXTUNI:
869       case OP_FAIL:
870       case OP_MARK:
871       case OP_NOT:
872       case OP_NOTEXACT:
873       case OP_NOTEXACTI:
874       case OP_NOTI:
875       case OP_NOTMINPLUS:
876       case OP_NOTMINPLUSI:
877       case OP_NOTMINQUERY:
878       case OP_NOTMINQUERYI:
879       case OP_NOTMINSTAR:
880       case OP_NOTMINSTARI:
881       case OP_NOTMINUPTO:
882       case OP_NOTMINUPTOI:
883       case OP_NOTPLUS:
884       case OP_NOTPLUSI:
885       case OP_NOTPOSPLUS:
886       case OP_NOTPOSPLUSI:
887       case OP_NOTPOSQUERY:
888       case OP_NOTPOSQUERYI:
889       case OP_NOTPOSSTAR:
890       case OP_NOTPOSSTARI:
891       case OP_NOTPOSUPTO:
892       case OP_NOTPOSUPTOI:
893       case OP_NOTPROP:
894       case OP_NOTQUERY:
895       case OP_NOTQUERYI:
896       case OP_NOTSTAR:
897       case OP_NOTSTARI:
898       case OP_NOTUPTO:
899       case OP_NOTUPTOI:
900       case OP_NOT_HSPACE:
901       case OP_NOT_VSPACE:
902       case OP_PRUNE:
903       case OP_PRUNE_ARG:
904       case OP_RECURSE:
905       case OP_REF:
906       case OP_REFI:
907       case OP_REVERSE:
908       case OP_RREF:
909       case OP_SCOND:
910       case OP_SET_SOM:
911       case OP_SKIP:
912       case OP_SKIP_ARG:
913       case OP_SOD:
914       case OP_SOM:
915       case OP_THEN:
916       case OP_THEN_ARG:
917       return SSB_FAIL;
918 
919       /* A "real" property test implies no starting bits, but the fake property
920       PT_CLIST identifies a list of characters. These lists are short, as they
921       are used for characters with more than one "other case", so there is no
922       point in recognizing them for OP_NOTPROP. */
923 
924       case OP_PROP:
925       if (tcode[1] != PT_CLIST) return SSB_FAIL;
926         {
927         const pcre_uint32 *p = PRIV(ucd_caseless_sets) + tcode[2];
928         while ((c = *p++) < NOTACHAR)
929           {
930 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
931           if (utf)
932             {
933             pcre_uchar buff[6];
934             (void)PRIV(ord2utf)(c, buff);
935             c = buff[0];
936             }
937 #endif
938           if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
939           }
940         }
941       try_next = FALSE;
942       break;
943 
944       /* We can ignore word boundary tests. */
945 
946       case OP_WORD_BOUNDARY:
947       case OP_NOT_WORD_BOUNDARY:
948       tcode++;
949       break;
950 
951       /* If we hit a bracket or a positive lookahead assertion, recurse to set
952       bits from within the subpattern. If it can't find anything, we have to
953       give up. If it finds some mandatory character(s), we are done for this
954       branch. Otherwise, carry on scanning after the subpattern. */
955 
956       case OP_BRA:
957       case OP_SBRA:
958       case OP_CBRA:
959       case OP_SCBRA:
960       case OP_BRAPOS:
961       case OP_SBRAPOS:
962       case OP_CBRAPOS:
963       case OP_SCBRAPOS:
964       case OP_ONCE:
965       case OP_ONCE_NC:
966       case OP_ASSERT:
967       rc = set_start_bits(tcode, start_bits, utf, cd);
968       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
969       if (rc == SSB_DONE) try_next = FALSE; else
970         {
971         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
972         tcode += 1 + LINK_SIZE;
973         }
974       break;
975 
976       /* If we hit ALT or KET, it means we haven't found anything mandatory in
977       this branch, though we might have found something optional. For ALT, we
978       continue with the next alternative, but we have to arrange that the final
979       result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
980       return SSB_CONTINUE: if this is the top level, that indicates failure,
981       but after a nested subpattern, it causes scanning to continue. */
982 
983       case OP_ALT:
984       yield = SSB_CONTINUE;
985       try_next = FALSE;
986       break;
987 
988       case OP_KET:
989       case OP_KETRMAX:
990       case OP_KETRMIN:
991       case OP_KETRPOS:
992       return SSB_CONTINUE;
993 
994       /* Skip over callout */
995 
996       case OP_CALLOUT:
997       tcode += 2 + 2*LINK_SIZE;
998       break;
999 
1000       /* Skip over lookbehind and negative lookahead assertions */
1001 
1002       case OP_ASSERT_NOT:
1003       case OP_ASSERTBACK:
1004       case OP_ASSERTBACK_NOT:
1005       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1006       tcode += 1 + LINK_SIZE;
1007       break;
1008 
1009       /* BRAZERO does the bracket, but carries on. */
1010 
1011       case OP_BRAZERO:
1012       case OP_BRAMINZERO:
1013       case OP_BRAPOSZERO:
1014       rc = set_start_bits(++tcode, start_bits, utf, cd);
1015       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
1016 /* =========================================================================
1017       See the comment at the head of this function concerning the next line,
1018       which was an old fudge for the benefit of OS/2.
1019       dummy = 1;
1020   ========================================================================= */
1021       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1022       tcode += 1 + LINK_SIZE;
1023       break;
1024 
1025       /* SKIPZERO skips the bracket. */
1026 
1027       case OP_SKIPZERO:
1028       tcode++;
1029       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1030       tcode += 1 + LINK_SIZE;
1031       break;
1032 
1033       /* Single-char * or ? sets the bit and tries the next item */
1034 
1035       case OP_STAR:
1036       case OP_MINSTAR:
1037       case OP_POSSTAR:
1038       case OP_QUERY:
1039       case OP_MINQUERY:
1040       case OP_POSQUERY:
1041       tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
1042       break;
1043 
1044       case OP_STARI:
1045       case OP_MINSTARI:
1046       case OP_POSSTARI:
1047       case OP_QUERYI:
1048       case OP_MINQUERYI:
1049       case OP_POSQUERYI:
1050       tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
1051       break;
1052 
1053       /* Single-char upto sets the bit and tries the next */
1054 
1055       case OP_UPTO:
1056       case OP_MINUPTO:
1057       case OP_POSUPTO:
1058       tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
1059       break;
1060 
1061       case OP_UPTOI:
1062       case OP_MINUPTOI:
1063       case OP_POSUPTOI:
1064       tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
1065       break;
1066 
1067       /* At least one single char sets the bit and stops */
1068 
1069       case OP_EXACT:
1070       tcode += IMM2_SIZE;
1071       /* Fall through */
1072       case OP_CHAR:
1073       case OP_PLUS:
1074       case OP_MINPLUS:
1075       case OP_POSPLUS:
1076       (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
1077       try_next = FALSE;
1078       break;
1079 
1080       case OP_EXACTI:
1081       tcode += IMM2_SIZE;
1082       /* Fall through */
1083       case OP_CHARI:
1084       case OP_PLUSI:
1085       case OP_MINPLUSI:
1086       case OP_POSPLUSI:
1087       (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
1088       try_next = FALSE;
1089       break;
1090 
1091       /* Special spacing and line-terminating items. These recognize specific
1092       lists of characters. The difference between VSPACE and ANYNL is that the
1093       latter can match the two-character CRLF sequence, but that is not
1094       relevant for finding the first character, so their code here is
1095       identical. */
1096 
1097       case OP_HSPACE:
1098       SET_BIT(CHAR_HT);
1099       SET_BIT(CHAR_SPACE);
1100 #ifdef SUPPORT_UTF
1101       if (utf)
1102         {
1103 #ifdef COMPILE_PCRE8
1104         SET_BIT(0xC2);  /* For U+00A0 */
1105         SET_BIT(0xE1);  /* For U+1680, U+180E */
1106         SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1107         SET_BIT(0xE3);  /* For U+3000 */
1108 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1109         SET_BIT(0xA0);
1110         SET_BIT(0xFF);  /* For characters > 255 */
1111 #endif  /* COMPILE_PCRE[8|16|32] */
1112         }
1113       else
1114 #endif /* SUPPORT_UTF */
1115         {
1116 #ifndef EBCDIC
1117         SET_BIT(0xA0);
1118 #endif  /* Not EBCDIC */
1119 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1120         SET_BIT(0xFF);  /* For characters > 255 */
1121 #endif  /* COMPILE_PCRE[16|32] */
1122         }
1123       try_next = FALSE;
1124       break;
1125 
1126       case OP_ANYNL:
1127       case OP_VSPACE:
1128       SET_BIT(CHAR_LF);
1129       SET_BIT(CHAR_VT);
1130       SET_BIT(CHAR_FF);
1131       SET_BIT(CHAR_CR);
1132 #ifdef SUPPORT_UTF
1133       if (utf)
1134         {
1135 #ifdef COMPILE_PCRE8
1136         SET_BIT(0xC2);  /* For U+0085 */
1137         SET_BIT(0xE2);  /* For U+2028, U+2029 */
1138 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1139         SET_BIT(CHAR_NEL);
1140         SET_BIT(0xFF);  /* For characters > 255 */
1141 #endif  /* COMPILE_PCRE[8|16|32] */
1142         }
1143       else
1144 #endif /* SUPPORT_UTF */
1145         {
1146         SET_BIT(CHAR_NEL);
1147 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1148         SET_BIT(0xFF);  /* For characters > 255 */
1149 #endif
1150         }
1151       try_next = FALSE;
1152       break;
1153 
1154       /* Single character types set the bits and stop. Note that if PCRE_UCP
1155       is set, we do not see these op codes because \d etc are converted to
1156       properties. Therefore, these apply in the case when only characters less
1157       than 256 are recognized to match the types. */
1158 
1159       case OP_NOT_DIGIT:
1160       set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1161       try_next = FALSE;
1162       break;
1163 
1164       case OP_DIGIT:
1165       set_type_bits(start_bits, cbit_digit, table_limit, cd);
1166       try_next = FALSE;
1167       break;
1168 
1169       /* The cbit_space table has vertical tab as whitespace; we no longer
1170       have to play fancy tricks because Perl added VT to its whitespace at
1171       release 5.18. PCRE added it at release 8.34. */
1172 
1173       case OP_NOT_WHITESPACE:
1174       set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1175       try_next = FALSE;
1176       break;
1177 
1178       case OP_WHITESPACE:
1179       set_type_bits(start_bits, cbit_space, table_limit, cd);
1180       try_next = FALSE;
1181       break;
1182 
1183       case OP_NOT_WORDCHAR:
1184       set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1185       try_next = FALSE;
1186       break;
1187 
1188       case OP_WORDCHAR:
1189       set_type_bits(start_bits, cbit_word, table_limit, cd);
1190       try_next = FALSE;
1191       break;
1192 
1193       /* One or more character type fudges the pointer and restarts, knowing
1194       it will hit a single character type and stop there. */
1195 
1196       case OP_TYPEPLUS:
1197       case OP_TYPEMINPLUS:
1198       case OP_TYPEPOSPLUS:
1199       tcode++;
1200       break;
1201 
1202       case OP_TYPEEXACT:
1203       tcode += 1 + IMM2_SIZE;
1204       break;
1205 
1206       /* Zero or more repeats of character types set the bits and then
1207       try again. */
1208 
1209       case OP_TYPEUPTO:
1210       case OP_TYPEMINUPTO:
1211       case OP_TYPEPOSUPTO:
1212       tcode += IMM2_SIZE;  /* Fall through */
1213 
1214       case OP_TYPESTAR:
1215       case OP_TYPEMINSTAR:
1216       case OP_TYPEPOSSTAR:
1217       case OP_TYPEQUERY:
1218       case OP_TYPEMINQUERY:
1219       case OP_TYPEPOSQUERY:
1220       switch(tcode[1])
1221         {
1222         default:
1223         case OP_ANY:
1224         case OP_ALLANY:
1225         return SSB_FAIL;
1226 
1227         case OP_HSPACE:
1228         SET_BIT(CHAR_HT);
1229         SET_BIT(CHAR_SPACE);
1230 #ifdef SUPPORT_UTF
1231         if (utf)
1232           {
1233 #ifdef COMPILE_PCRE8
1234           SET_BIT(0xC2);  /* For U+00A0 */
1235           SET_BIT(0xE1);  /* For U+1680, U+180E */
1236           SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1237           SET_BIT(0xE3);  /* For U+3000 */
1238 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1239           SET_BIT(0xA0);
1240           SET_BIT(0xFF);  /* For characters > 255 */
1241 #endif  /* COMPILE_PCRE[8|16|32] */
1242           }
1243         else
1244 #endif /* SUPPORT_UTF */
1245 #ifndef EBCDIC
1246           SET_BIT(0xA0);
1247 #endif  /* Not EBCDIC */
1248         break;
1249 
1250         case OP_ANYNL:
1251         case OP_VSPACE:
1252         SET_BIT(CHAR_LF);
1253         SET_BIT(CHAR_VT);
1254         SET_BIT(CHAR_FF);
1255         SET_BIT(CHAR_CR);
1256 #ifdef SUPPORT_UTF
1257         if (utf)
1258           {
1259 #ifdef COMPILE_PCRE8
1260           SET_BIT(0xC2);  /* For U+0085 */
1261           SET_BIT(0xE2);  /* For U+2028, U+2029 */
1262 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1263           SET_BIT(CHAR_NEL);
1264           SET_BIT(0xFF);  /* For characters > 255 */
1265 #endif  /* COMPILE_PCRE16 */
1266           }
1267         else
1268 #endif /* SUPPORT_UTF */
1269           SET_BIT(CHAR_NEL);
1270         break;
1271 
1272         case OP_NOT_DIGIT:
1273         set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1274         break;
1275 
1276         case OP_DIGIT:
1277         set_type_bits(start_bits, cbit_digit, table_limit, cd);
1278         break;
1279 
1280         /* The cbit_space table has vertical tab as whitespace; we no longer
1281         have to play fancy tricks because Perl added VT to its whitespace at
1282         release 5.18. PCRE added it at release 8.34. */
1283 
1284         case OP_NOT_WHITESPACE:
1285         set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1286         break;
1287 
1288         case OP_WHITESPACE:
1289         set_type_bits(start_bits, cbit_space, table_limit, cd);
1290         break;
1291 
1292         case OP_NOT_WORDCHAR:
1293         set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1294         break;
1295 
1296         case OP_WORDCHAR:
1297         set_type_bits(start_bits, cbit_word, table_limit, cd);
1298         break;
1299         }
1300 
1301       tcode += 2;
1302       break;
1303 
1304       /* Character class where all the information is in a bit map: set the
1305       bits and either carry on or not, according to the repeat count. If it was
1306       a negative class, and we are operating with UTF-8 characters, any byte
1307       with a value >= 0xc4 is a potentially valid starter because it starts a
1308       character with a value > 255. */
1309 
1310 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1311       case OP_XCLASS:
1312       if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0)
1313         return SSB_FAIL;
1314       /* All bits are set. */
1315       if ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0 && (tcode[1 + LINK_SIZE] & XCL_NOT) != 0)
1316         return SSB_FAIL;
1317 #endif
1318       /* Fall through */
1319 
1320       case OP_NCLASS:
1321 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1322       if (utf)
1323         {
1324         start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
1325         memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
1326         }
1327 #endif
1328 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1329       SET_BIT(0xFF);                         /* For characters > 255 */
1330 #endif
1331       /* Fall through */
1332 
1333       case OP_CLASS:
1334         {
1335         pcre_uint8 *map;
1336 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1337         map = NULL;
1338         if (*tcode == OP_XCLASS)
1339           {
1340           if ((tcode[1 + LINK_SIZE] & XCL_MAP) != 0)
1341             map = (pcre_uint8 *)(tcode + 1 + LINK_SIZE + 1);
1342           tcode += GET(tcode, 1);
1343           }
1344         else
1345 #endif
1346           {
1347           tcode++;
1348           map = (pcre_uint8 *)tcode;
1349           tcode += 32 / sizeof(pcre_uchar);
1350           }
1351 
1352         /* In UTF-8 mode, the bits in a bit map correspond to character
1353         values, not to byte values. However, the bit map we are constructing is
1354         for byte values. So we have to do a conversion for characters whose
1355         value is > 127. In fact, there are only two possible starting bytes for
1356         characters in the range 128 - 255. */
1357 
1358 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1359         if (map != NULL)
1360 #endif
1361           {
1362 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1363           if (utf)
1364             {
1365             for (c = 0; c < 16; c++) start_bits[c] |= map[c];
1366             for (c = 128; c < 256; c++)
1367               {
1368               if ((map[c/8] && (1 << (c&7))) != 0)
1369                 {
1370                 int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
1371                 start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
1372                 c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
1373                 }
1374               }
1375             }
1376           else
1377 #endif
1378             {
1379             /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1380             for (c = 0; c < 32; c++) start_bits[c] |= map[c];
1381             }
1382           }
1383 
1384         /* Advance past the bit map, and act on what follows. For a zero
1385         minimum repeat, continue; otherwise stop processing. */
1386 
1387         switch (*tcode)
1388           {
1389           case OP_CRSTAR:
1390           case OP_CRMINSTAR:
1391           case OP_CRQUERY:
1392           case OP_CRMINQUERY:
1393           case OP_CRPOSSTAR:
1394           case OP_CRPOSQUERY:
1395           tcode++;
1396           break;
1397 
1398           case OP_CRRANGE:
1399           case OP_CRMINRANGE:
1400           case OP_CRPOSRANGE:
1401           if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1402             else try_next = FALSE;
1403           break;
1404 
1405           default:
1406           try_next = FALSE;
1407           break;
1408           }
1409         }
1410       break; /* End of bitmap class handling */
1411 
1412       }      /* End of switch */
1413     }        /* End of try_next loop */
1414 
1415   code += GET(code, 1);   /* Advance to next branch */
1416   }
1417 while (*code == OP_ALT);
1418 return yield;
1419 }
1420 
1421 
1422 
1423 
1424 
1425 /*************************************************
1426 *          Study a compiled expression           *
1427 *************************************************/
1428 
1429 /* This function is handed a compiled expression that it must study to produce
1430 information that will speed up the matching. It returns a pcre[16]_extra block
1431 which then gets handed back to pcre_exec().
1432 
1433 Arguments:
1434   re        points to the compiled expression
1435   options   contains option bits
1436   errorptr  points to where to place error messages;
1437             set NULL unless error
1438 
1439 Returns:    pointer to a pcre[16]_extra block, with study_data filled in and
1440               the appropriate flags set;
1441             NULL on error or if no optimization possible
1442 */
1443 
1444 #if defined COMPILE_PCRE8
1445 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
pcre_study(const pcre * external_re,int options,const char ** errorptr)1446 pcre_study(const pcre *external_re, int options, const char **errorptr)
1447 #elif defined COMPILE_PCRE16
1448 PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
1449 pcre16_study(const pcre16 *external_re, int options, const char **errorptr)
1450 #elif defined COMPILE_PCRE32
1451 PCRE_EXP_DEFN pcre32_extra * PCRE_CALL_CONVENTION
1452 pcre32_study(const pcre32 *external_re, int options, const char **errorptr)
1453 #endif
1454 {
1455 int min;
1456 BOOL bits_set = FALSE;
1457 pcre_uint8 start_bits[32];
1458 PUBL(extra) *extra = NULL;
1459 pcre_study_data *study;
1460 const pcre_uint8 *tables;
1461 pcre_uchar *code;
1462 compile_data compile_block;
1463 const REAL_PCRE *re = (const REAL_PCRE *)external_re;
1464 
1465 
1466 *errorptr = NULL;
1467 
1468 if (re == NULL || re->magic_number != MAGIC_NUMBER)
1469   {
1470   *errorptr = "argument is not a compiled regular expression";
1471   return NULL;
1472   }
1473 
1474 if ((re->flags & PCRE_MODE) == 0)
1475   {
1476 #if defined COMPILE_PCRE8
1477   *errorptr = "argument not compiled in 8 bit mode";
1478 #elif defined COMPILE_PCRE16
1479   *errorptr = "argument not compiled in 16 bit mode";
1480 #elif defined COMPILE_PCRE32
1481   *errorptr = "argument not compiled in 32 bit mode";
1482 #endif
1483   return NULL;
1484   }
1485 
1486 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1487   {
1488   *errorptr = "unknown or incorrect option bit(s) set";
1489   return NULL;
1490   }
1491 
1492 code = (pcre_uchar *)re + re->name_table_offset +
1493   (re->name_count * re->name_entry_size);
1494 
1495 /* For an anchored pattern, or an unanchored pattern that has a first char, or
1496 a multiline pattern that matches only at "line starts", there is no point in
1497 seeking a list of starting bytes. */
1498 
1499 if ((re->options & PCRE_ANCHORED) == 0 &&
1500     (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1501   {
1502   int rc;
1503 
1504   /* Set the character tables in the block that is passed around */
1505 
1506   tables = re->tables;
1507 
1508 #if defined COMPILE_PCRE8
1509   if (tables == NULL)
1510     (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1511     (void *)(&tables));
1512 #elif defined COMPILE_PCRE16
1513   if (tables == NULL)
1514     (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1515     (void *)(&tables));
1516 #elif defined COMPILE_PCRE32
1517   if (tables == NULL)
1518     (void)pcre32_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1519     (void *)(&tables));
1520 #endif
1521 
1522   compile_block.lcc = tables + lcc_offset;
1523   compile_block.fcc = tables + fcc_offset;
1524   compile_block.cbits = tables + cbits_offset;
1525   compile_block.ctypes = tables + ctypes_offset;
1526 
1527   /* See if we can find a fixed set of initial characters for the pattern. */
1528 
1529   memset(start_bits, 0, 32 * sizeof(pcre_uint8));
1530   rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1531     &compile_block);
1532   bits_set = rc == SSB_DONE;
1533   if (rc == SSB_UNKNOWN)
1534     {
1535     *errorptr = "internal error: opcode not recognized";
1536     return NULL;
1537     }
1538   }
1539 
1540 /* Find the minimum length of subject string. */
1541 
1542 switch(min = find_minlength(re, code, code, re->options, NULL))
1543   {
1544   case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
1545   case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
1546   default: break;
1547   }
1548 
1549 /* If a set of starting bytes has been identified, or if the minimum length is
1550 greater than zero, or if JIT optimization has been requested, or if
1551 PCRE_STUDY_EXTRA_NEEDED is set, get a pcre[16]_extra block and a
1552 pcre_study_data block. The study data is put in the latter, which is pointed to
1553 by the former, which may also get additional data set later by the calling
1554 program. At the moment, the size of pcre_study_data is fixed. We nevertheless
1555 save it in a field for returning via the pcre_fullinfo() function so that if it
1556 becomes variable in the future, we don't have to change that code. */
1557 
1558 if (bits_set || min > 0 || (options & (
1559 #ifdef SUPPORT_JIT
1560     PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE |
1561     PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE |
1562 #endif
1563     PCRE_STUDY_EXTRA_NEEDED)) != 0)
1564   {
1565   extra = (PUBL(extra) *)(PUBL(malloc))
1566     (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
1567   if (extra == NULL)
1568     {
1569     *errorptr = "failed to get memory";
1570     return NULL;
1571     }
1572 
1573   study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
1574   extra->flags = PCRE_EXTRA_STUDY_DATA;
1575   extra->study_data = study;
1576 
1577   study->size = sizeof(pcre_study_data);
1578   study->flags = 0;
1579 
1580   /* Set the start bits always, to avoid unset memory errors if the
1581   study data is written to a file, but set the flag only if any of the bits
1582   are set, to save time looking when none are. */
1583 
1584   if (bits_set)
1585     {
1586     study->flags |= PCRE_STUDY_MAPPED;
1587     memcpy(study->start_bits, start_bits, sizeof(start_bits));
1588     }
1589   else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
1590 
1591 #ifdef PCRE_DEBUG
1592   if (bits_set)
1593     {
1594     pcre_uint8 *ptr = start_bits;
1595     int i;
1596 
1597     printf("Start bits:\n");
1598     for (i = 0; i < 32; i++)
1599       printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
1600     }
1601 #endif
1602 
1603   /* Always set the minlength value in the block, because the JIT compiler
1604   makes use of it. However, don't set the bit unless the length is greater than
1605   zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
1606   checking the zero case. */
1607 
1608   if (min > 0)
1609     {
1610     study->flags |= PCRE_STUDY_MINLEN;
1611     study->minlength = min;
1612     }
1613   else study->minlength = 0;
1614 
1615   /* If JIT support was compiled and requested, attempt the JIT compilation.
1616   If no starting bytes were found, and the minimum length is zero, and JIT
1617   compilation fails, abandon the extra block and return NULL, unless
1618   PCRE_STUDY_EXTRA_NEEDED is set. */
1619 
1620 #ifdef SUPPORT_JIT
1621   extra->executable_jit = NULL;
1622   if ((options & PCRE_STUDY_JIT_COMPILE) != 0)
1623     PRIV(jit_compile)(re, extra, JIT_COMPILE);
1624   if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0)
1625     PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE);
1626   if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0)
1627     PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE);
1628 
1629   if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0 &&
1630       (options & PCRE_STUDY_EXTRA_NEEDED) == 0)
1631     {
1632 #if defined COMPILE_PCRE8
1633     pcre_free_study(extra);
1634 #elif defined COMPILE_PCRE16
1635     pcre16_free_study(extra);
1636 #elif defined COMPILE_PCRE32
1637     pcre32_free_study(extra);
1638 #endif
1639     extra = NULL;
1640     }
1641 #endif
1642   }
1643 
1644 return extra;
1645 }
1646 
1647 
1648 /*************************************************
1649 *          Free the study data                   *
1650 *************************************************/
1651 
1652 /* This function frees the memory that was obtained by pcre_study().
1653 
1654 Argument:   a pointer to the pcre[16]_extra block
1655 Returns:    nothing
1656 */
1657 
1658 #if defined COMPILE_PCRE8
1659 PCRE_EXP_DEFN void
pcre_free_study(pcre_extra * extra)1660 pcre_free_study(pcre_extra *extra)
1661 #elif defined COMPILE_PCRE16
1662 PCRE_EXP_DEFN void
1663 pcre16_free_study(pcre16_extra *extra)
1664 #elif defined COMPILE_PCRE32
1665 PCRE_EXP_DEFN void
1666 pcre32_free_study(pcre32_extra *extra)
1667 #endif
1668 {
1669 if (extra == NULL)
1670   return;
1671 #ifdef SUPPORT_JIT
1672 if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
1673      extra->executable_jit != NULL)
1674   PRIV(jit_free)(extra->executable_jit);
1675 #endif
1676 PUBL(free)(extra);
1677 }
1678 
1679 /* End of pcre_study.c */
1680