xref: /PHP-7.1/ext/pcre/pcrelib/pcre_study.c (revision 5f9df47e)
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9            Copyright (c) 1997-2012 University of Cambridge
10 
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14 
15     * Redistributions of source code must retain the above copyright notice,
16       this list of conditions and the following disclaimer.
17 
18     * Redistributions in binary form must reproduce the above copyright
19       notice, this list of conditions and the following disclaimer in the
20       documentation and/or other materials provided with the distribution.
21 
22     * Neither the name of the University of Cambridge nor the names of its
23       contributors may be used to endorse or promote products derived from
24       this software without specific prior written permission.
25 
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39 
40 
41 /* This module contains the external function pcre_study(), along with local
42 supporting functions. */
43 
44 
45 #include "config.h"
46 
47 #include "pcre_internal.h"
48 
49 #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
50 
51 /* Returns from set_start_bits() */
52 
53 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
54 
55 
56 
57 /*************************************************
58 *   Find the minimum subject length for a group  *
59 *************************************************/
60 
61 /* Scan a parenthesized group and compute the minimum length of subject that
62 is needed to match it. This is a lower bound; it does not mean there is a
63 string of that length that matches. In UTF8 mode, the result is in characters
64 rather than bytes.
65 
66 Arguments:
67   re              compiled pattern block
68   code            pointer to start of group (the bracket)
69   startcode       pointer to start of the whole pattern's code
70   options         the compiling options
71   recurses        chain of recurse_check to catch mutual recursion
72   countptr        pointer to call count (to catch over complexity)
73 
74 Returns:   the minimum length
75            -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
76            -2 internal error (missing capturing bracket)
77            -3 internal error (opcode not listed)
78 */
79 
80 static int
find_minlength(const REAL_PCRE * re,const pcre_uchar * code,const pcre_uchar * startcode,int options,recurse_check * recurses,int * countptr)81 find_minlength(const REAL_PCRE *re, const pcre_uchar *code,
82   const pcre_uchar *startcode, int options, recurse_check *recurses,
83   int *countptr)
84 {
85 int length = -1;
86 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
87 BOOL utf = (options & PCRE_UTF8) != 0;
88 BOOL had_recurse = FALSE;
89 recurse_check this_recurse;
90 register int branchlength = 0;
91 register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
92 
93 if ((*countptr)++ > 1000) return -1;   /* too complex */
94 
95 if (*code == OP_CBRA || *code == OP_SCBRA ||
96     *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
97 
98 /* Scan along the opcodes for this branch. If we get to the end of the
99 branch, check the length against that of the other branches. */
100 
101 for (;;)
102   {
103   int d, min;
104   pcre_uchar *cs, *ce;
105   register pcre_uchar op = *cc;
106 
107   switch (op)
108     {
109     case OP_COND:
110     case OP_SCOND:
111 
112     /* If there is only one branch in a condition, the implied branch has zero
113     length, so we don't add anything. This covers the DEFINE "condition"
114     automatically. */
115 
116     cs = cc + GET(cc, 1);
117     if (*cs != OP_ALT)
118       {
119       cc = cs + 1 + LINK_SIZE;
120       break;
121       }
122 
123     /* Otherwise we can fall through and treat it the same as any other
124     subpattern. */
125 
126     case OP_CBRA:
127     case OP_SCBRA:
128     case OP_BRA:
129     case OP_SBRA:
130     case OP_CBRAPOS:
131     case OP_SCBRAPOS:
132     case OP_BRAPOS:
133     case OP_SBRAPOS:
134     case OP_ONCE:
135     case OP_ONCE_NC:
136     d = find_minlength(re, cc, startcode, options, recurses, countptr);
137     if (d < 0) return d;
138     branchlength += d;
139     do cc += GET(cc, 1); while (*cc == OP_ALT);
140     cc += 1 + LINK_SIZE;
141     break;
142 
143     /* ACCEPT makes things far too complicated; we have to give up. */
144 
145     case OP_ACCEPT:
146     case OP_ASSERT_ACCEPT:
147     return -1;
148 
149     /* Reached end of a branch; if it's a ket it is the end of a nested
150     call. If it's ALT it is an alternation in a nested call. If it is END it's
151     the end of the outer call. All can be handled by the same code. If an
152     ACCEPT was previously encountered, use the length that was in force at that
153     time, and pass back the shortest ACCEPT length. */
154 
155     case OP_ALT:
156     case OP_KET:
157     case OP_KETRMAX:
158     case OP_KETRMIN:
159     case OP_KETRPOS:
160     case OP_END:
161     if (length < 0 || (!had_recurse && branchlength < length))
162       length = branchlength;
163     if (op != OP_ALT) return length;
164     cc += 1 + LINK_SIZE;
165     branchlength = 0;
166     had_recurse = FALSE;
167     break;
168 
169     /* Skip over assertive subpatterns */
170 
171     case OP_ASSERT:
172     case OP_ASSERT_NOT:
173     case OP_ASSERTBACK:
174     case OP_ASSERTBACK_NOT:
175     do cc += GET(cc, 1); while (*cc == OP_ALT);
176     /* Fall through */
177 
178     /* Skip over things that don't match chars */
179 
180     case OP_REVERSE:
181     case OP_CREF:
182     case OP_DNCREF:
183     case OP_RREF:
184     case OP_DNRREF:
185     case OP_DEF:
186     case OP_CALLOUT:
187     case OP_SOD:
188     case OP_SOM:
189     case OP_EOD:
190     case OP_EODN:
191     case OP_CIRC:
192     case OP_CIRCM:
193     case OP_DOLL:
194     case OP_DOLLM:
195     case OP_NOT_WORD_BOUNDARY:
196     case OP_WORD_BOUNDARY:
197     cc += PRIV(OP_lengths)[*cc];
198     break;
199 
200     /* Skip over a subpattern that has a {0} or {0,x} quantifier */
201 
202     case OP_BRAZERO:
203     case OP_BRAMINZERO:
204     case OP_BRAPOSZERO:
205     case OP_SKIPZERO:
206     cc += PRIV(OP_lengths)[*cc];
207     do cc += GET(cc, 1); while (*cc == OP_ALT);
208     cc += 1 + LINK_SIZE;
209     break;
210 
211     /* Handle literal characters and + repetitions */
212 
213     case OP_CHAR:
214     case OP_CHARI:
215     case OP_NOT:
216     case OP_NOTI:
217     case OP_PLUS:
218     case OP_PLUSI:
219     case OP_MINPLUS:
220     case OP_MINPLUSI:
221     case OP_POSPLUS:
222     case OP_POSPLUSI:
223     case OP_NOTPLUS:
224     case OP_NOTPLUSI:
225     case OP_NOTMINPLUS:
226     case OP_NOTMINPLUSI:
227     case OP_NOTPOSPLUS:
228     case OP_NOTPOSPLUSI:
229     branchlength++;
230     cc += 2;
231 #ifdef SUPPORT_UTF
232     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
233 #endif
234     break;
235 
236     case OP_TYPEPLUS:
237     case OP_TYPEMINPLUS:
238     case OP_TYPEPOSPLUS:
239     branchlength++;
240     cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
241     break;
242 
243     /* Handle exact repetitions. The count is already in characters, but we
244     need to skip over a multibyte character in UTF8 mode.  */
245 
246     case OP_EXACT:
247     case OP_EXACTI:
248     case OP_NOTEXACT:
249     case OP_NOTEXACTI:
250     branchlength += GET2(cc,1);
251     cc += 2 + IMM2_SIZE;
252 #ifdef SUPPORT_UTF
253     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
254 #endif
255     break;
256 
257     case OP_TYPEEXACT:
258     branchlength += GET2(cc,1);
259     cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
260       || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
261     break;
262 
263     /* Handle single-char non-literal matchers */
264 
265     case OP_PROP:
266     case OP_NOTPROP:
267     cc += 2;
268     /* Fall through */
269 
270     case OP_NOT_DIGIT:
271     case OP_DIGIT:
272     case OP_NOT_WHITESPACE:
273     case OP_WHITESPACE:
274     case OP_NOT_WORDCHAR:
275     case OP_WORDCHAR:
276     case OP_ANY:
277     case OP_ALLANY:
278     case OP_EXTUNI:
279     case OP_HSPACE:
280     case OP_NOT_HSPACE:
281     case OP_VSPACE:
282     case OP_NOT_VSPACE:
283     branchlength++;
284     cc++;
285     break;
286 
287     /* "Any newline" might match two characters, but it also might match just
288     one. */
289 
290     case OP_ANYNL:
291     branchlength += 1;
292     cc++;
293     break;
294 
295     /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
296     non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
297     appear, but leave the code, just in case.) */
298 
299     case OP_ANYBYTE:
300 #ifdef SUPPORT_UTF
301     if (utf) return -1;
302 #endif
303     branchlength++;
304     cc++;
305     break;
306 
307     /* For repeated character types, we have to test for \p and \P, which have
308     an extra two bytes of parameters. */
309 
310     case OP_TYPESTAR:
311     case OP_TYPEMINSTAR:
312     case OP_TYPEQUERY:
313     case OP_TYPEMINQUERY:
314     case OP_TYPEPOSSTAR:
315     case OP_TYPEPOSQUERY:
316     if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
317     cc += PRIV(OP_lengths)[op];
318     break;
319 
320     case OP_TYPEUPTO:
321     case OP_TYPEMINUPTO:
322     case OP_TYPEPOSUPTO:
323     if (cc[1 + IMM2_SIZE] == OP_PROP
324       || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
325     cc += PRIV(OP_lengths)[op];
326     break;
327 
328     /* Check a class for variable quantification */
329 
330     case OP_CLASS:
331     case OP_NCLASS:
332 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
333     case OP_XCLASS:
334     /* The original code caused an unsigned overflow in 64 bit systems,
335     so now we use a conditional statement. */
336     if (op == OP_XCLASS)
337       cc += GET(cc, 1);
338     else
339       cc += PRIV(OP_lengths)[OP_CLASS];
340 #else
341     cc += PRIV(OP_lengths)[OP_CLASS];
342 #endif
343 
344     switch (*cc)
345       {
346       case OP_CRPLUS:
347       case OP_CRMINPLUS:
348       case OP_CRPOSPLUS:
349       branchlength++;
350       /* Fall through */
351 
352       case OP_CRSTAR:
353       case OP_CRMINSTAR:
354       case OP_CRQUERY:
355       case OP_CRMINQUERY:
356       case OP_CRPOSSTAR:
357       case OP_CRPOSQUERY:
358       cc++;
359       break;
360 
361       case OP_CRRANGE:
362       case OP_CRMINRANGE:
363       case OP_CRPOSRANGE:
364       branchlength += GET2(cc,1);
365       cc += 1 + 2 * IMM2_SIZE;
366       break;
367 
368       default:
369       branchlength++;
370       break;
371       }
372     break;
373 
374     /* Backreferences and subroutine calls are treated in the same way: we find
375     the minimum length for the subpattern. A recursion, however, causes an
376     a flag to be set that causes the length of this branch to be ignored. The
377     logic is that a recursion can only make sense if there is another
378     alternation that stops the recursing. That will provide the minimum length
379     (when no recursion happens). A backreference within the group that it is
380     referencing behaves in the same way.
381 
382     If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
383     matches an empty string (by default it causes a matching failure), so in
384     that case we must set the minimum length to zero. */
385 
386     case OP_DNREF:     /* Duplicate named pattern back reference */
387     case OP_DNREFI:
388     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
389       {
390       int count = GET2(cc, 1+IMM2_SIZE);
391       pcre_uchar *slot = (pcre_uchar *)re +
392         re->name_table_offset + GET2(cc, 1) * re->name_entry_size;
393       d = INT_MAX;
394       while (count-- > 0)
395         {
396         ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
397         if (cs == NULL) return -2;
398         do ce += GET(ce, 1); while (*ce == OP_ALT);
399         if (cc > cs && cc < ce)     /* Simple recursion */
400           {
401           d = 0;
402           had_recurse = TRUE;
403           break;
404           }
405         else
406           {
407           recurse_check *r = recurses;
408           for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
409           if (r != NULL)           /* Mutual recursion */
410             {
411             d = 0;
412             had_recurse = TRUE;
413             break;
414             }
415           else
416             {
417             int dd;
418             this_recurse.prev = recurses;
419             this_recurse.group = cs;
420             dd = find_minlength(re, cs, startcode, options, &this_recurse,
421               countptr);
422             if (dd < d) d = dd;
423             }
424           }
425         slot += re->name_entry_size;
426         }
427       }
428     else d = 0;
429     cc += 1 + 2*IMM2_SIZE;
430     goto REPEAT_BACK_REFERENCE;
431 
432     case OP_REF:      /* Single back reference */
433     case OP_REFI:
434     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
435       {
436       ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
437       if (cs == NULL) return -2;
438       do ce += GET(ce, 1); while (*ce == OP_ALT);
439       if (cc > cs && cc < ce)    /* Simple recursion */
440         {
441         d = 0;
442         had_recurse = TRUE;
443         }
444       else
445         {
446         recurse_check *r = recurses;
447         for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
448         if (r != NULL)           /* Mutual recursion */
449           {
450           d = 0;
451           had_recurse = TRUE;
452           }
453         else
454           {
455           this_recurse.prev = recurses;
456           this_recurse.group = cs;
457           d = find_minlength(re, cs, startcode, options, &this_recurse,
458             countptr);
459           }
460         }
461       }
462     else d = 0;
463     cc += 1 + IMM2_SIZE;
464 
465     /* Handle repeated back references */
466 
467     REPEAT_BACK_REFERENCE:
468     switch (*cc)
469       {
470       case OP_CRSTAR:
471       case OP_CRMINSTAR:
472       case OP_CRQUERY:
473       case OP_CRMINQUERY:
474       case OP_CRPOSSTAR:
475       case OP_CRPOSQUERY:
476       min = 0;
477       cc++;
478       break;
479 
480       case OP_CRPLUS:
481       case OP_CRMINPLUS:
482       case OP_CRPOSPLUS:
483       min = 1;
484       cc++;
485       break;
486 
487       case OP_CRRANGE:
488       case OP_CRMINRANGE:
489       case OP_CRPOSRANGE:
490       min = GET2(cc, 1);
491       cc += 1 + 2 * IMM2_SIZE;
492       break;
493 
494       default:
495       min = 1;
496       break;
497       }
498 
499     branchlength += min * d;
500     break;
501 
502     /* We can easily detect direct recursion, but not mutual recursion. This is
503     caught by a recursion depth count. */
504 
505     case OP_RECURSE:
506     cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
507     do ce += GET(ce, 1); while (*ce == OP_ALT);
508     if (cc > cs && cc < ce)    /* Simple recursion */
509       had_recurse = TRUE;
510     else
511       {
512       recurse_check *r = recurses;
513       for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
514       if (r != NULL)           /* Mutual recursion */
515         had_recurse = TRUE;
516       else
517         {
518         this_recurse.prev = recurses;
519         this_recurse.group = cs;
520         branchlength += find_minlength(re, cs, startcode, options,
521           &this_recurse, countptr);
522         }
523       }
524     cc += 1 + LINK_SIZE;
525     break;
526 
527     /* Anything else does not or need not match a character. We can get the
528     item's length from the table, but for those that can match zero occurrences
529     of a character, we must take special action for UTF-8 characters. As it
530     happens, the "NOT" versions of these opcodes are used at present only for
531     ASCII characters, so they could be omitted from this list. However, in
532     future that may change, so we include them here so as not to leave a
533     gotcha for a future maintainer. */
534 
535     case OP_UPTO:
536     case OP_UPTOI:
537     case OP_NOTUPTO:
538     case OP_NOTUPTOI:
539     case OP_MINUPTO:
540     case OP_MINUPTOI:
541     case OP_NOTMINUPTO:
542     case OP_NOTMINUPTOI:
543     case OP_POSUPTO:
544     case OP_POSUPTOI:
545     case OP_NOTPOSUPTO:
546     case OP_NOTPOSUPTOI:
547 
548     case OP_STAR:
549     case OP_STARI:
550     case OP_NOTSTAR:
551     case OP_NOTSTARI:
552     case OP_MINSTAR:
553     case OP_MINSTARI:
554     case OP_NOTMINSTAR:
555     case OP_NOTMINSTARI:
556     case OP_POSSTAR:
557     case OP_POSSTARI:
558     case OP_NOTPOSSTAR:
559     case OP_NOTPOSSTARI:
560 
561     case OP_QUERY:
562     case OP_QUERYI:
563     case OP_NOTQUERY:
564     case OP_NOTQUERYI:
565     case OP_MINQUERY:
566     case OP_MINQUERYI:
567     case OP_NOTMINQUERY:
568     case OP_NOTMINQUERYI:
569     case OP_POSQUERY:
570     case OP_POSQUERYI:
571     case OP_NOTPOSQUERY:
572     case OP_NOTPOSQUERYI:
573 
574     cc += PRIV(OP_lengths)[op];
575 #ifdef SUPPORT_UTF
576     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
577 #endif
578     break;
579 
580     /* Skip these, but we need to add in the name length. */
581 
582     case OP_MARK:
583     case OP_PRUNE_ARG:
584     case OP_SKIP_ARG:
585     case OP_THEN_ARG:
586     cc += PRIV(OP_lengths)[op] + cc[1];
587     break;
588 
589     /* The remaining opcodes are just skipped over. */
590 
591     case OP_CLOSE:
592     case OP_COMMIT:
593     case OP_FAIL:
594     case OP_PRUNE:
595     case OP_SET_SOM:
596     case OP_SKIP:
597     case OP_THEN:
598     cc += PRIV(OP_lengths)[op];
599     break;
600 
601     /* This should not occur: we list all opcodes explicitly so that when
602     new ones get added they are properly considered. */
603 
604     default:
605     return -3;
606     }
607   }
608 /* Control never gets here */
609 }
610 
611 
612 
613 /*************************************************
614 *      Set a bit and maybe its alternate case    *
615 *************************************************/
616 
617 /* Given a character, set its first byte's bit in the table, and also the
618 corresponding bit for the other version of a letter if we are caseless. In
619 UTF-8 mode, for characters greater than 127, we can only do the caseless thing
620 when Unicode property support is available.
621 
622 Arguments:
623   start_bits    points to the bit map
624   p             points to the character
625   caseless      the caseless flag
626   cd            the block with char table pointers
627   utf           TRUE for UTF-8 / UTF-16 / UTF-32 mode
628 
629 Returns:        pointer after the character
630 */
631 
632 static const pcre_uchar *
set_table_bit(pcre_uint8 * start_bits,const pcre_uchar * p,BOOL caseless,compile_data * cd,BOOL utf)633 set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
634   compile_data *cd, BOOL utf)
635 {
636 pcre_uint32 c = *p;
637 
638 #ifdef COMPILE_PCRE8
639 SET_BIT(c);
640 
641 #ifdef SUPPORT_UTF
642 if (utf && c > 127)
643   {
644   GETCHARINC(c, p);
645 #ifdef SUPPORT_UCP
646   if (caseless)
647     {
648     pcre_uchar buff[6];
649     c = UCD_OTHERCASE(c);
650     (void)PRIV(ord2utf)(c, buff);
651     SET_BIT(buff[0]);
652     }
653 #endif  /* Not SUPPORT_UCP */
654   return p;
655   }
656 #else   /* Not SUPPORT_UTF */
657 (void)(utf);   /* Stops warning for unused parameter */
658 #endif  /* SUPPORT_UTF */
659 
660 /* Not UTF-8 mode, or character is less than 127. */
661 
662 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
663 return p + 1;
664 #endif  /* COMPILE_PCRE8 */
665 
666 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
667 if (c > 0xff)
668   {
669   c = 0xff;
670   caseless = FALSE;
671   }
672 SET_BIT(c);
673 
674 #ifdef SUPPORT_UTF
675 if (utf && c > 127)
676   {
677   GETCHARINC(c, p);
678 #ifdef SUPPORT_UCP
679   if (caseless)
680     {
681     c = UCD_OTHERCASE(c);
682     if (c > 0xff)
683       c = 0xff;
684     SET_BIT(c);
685     }
686 #endif  /* SUPPORT_UCP */
687   return p;
688   }
689 #else   /* Not SUPPORT_UTF */
690 (void)(utf);   /* Stops warning for unused parameter */
691 #endif  /* SUPPORT_UTF */
692 
693 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
694 return p + 1;
695 #endif
696 }
697 
698 
699 
700 /*************************************************
701 *     Set bits for a positive character type     *
702 *************************************************/
703 
704 /* This function sets starting bits for a character type. In UTF-8 mode, we can
705 only do a direct setting for bytes less than 128, as otherwise there can be
706 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
707 environment, the tables will only recognize ASCII characters anyway, but in at
708 least one Windows environment, some higher bytes bits were set in the tables.
709 So we deal with that case by considering the UTF-8 encoding.
710 
711 Arguments:
712   start_bits     the starting bitmap
713   cbit type      the type of character wanted
714   table_limit    32 for non-UTF-8; 16 for UTF-8
715   cd             the block with char table pointers
716 
717 Returns:         nothing
718 */
719 
720 static void
set_type_bits(pcre_uint8 * start_bits,int cbit_type,unsigned int table_limit,compile_data * cd)721 set_type_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
722   compile_data *cd)
723 {
724 register pcre_uint32 c;
725 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
726 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
727 if (table_limit == 32) return;
728 for (c = 128; c < 256; c++)
729   {
730   if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
731     {
732     pcre_uchar buff[6];
733     (void)PRIV(ord2utf)(c, buff);
734     SET_BIT(buff[0]);
735     }
736   }
737 #endif
738 }
739 
740 
741 /*************************************************
742 *     Set bits for a negative character type     *
743 *************************************************/
744 
745 /* This function sets starting bits for a negative character type such as \D.
746 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
747 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
748 Unlike in the positive case, where we can set appropriate starting bits for
749 specific high-valued UTF-8 characters, in this case we have to set the bits for
750 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
751 0xc0 (192) for simplicity.
752 
753 Arguments:
754   start_bits     the starting bitmap
755   cbit type      the type of character wanted
756   table_limit    32 for non-UTF-8; 16 for UTF-8
757   cd             the block with char table pointers
758 
759 Returns:         nothing
760 */
761 
762 static void
set_nottype_bits(pcre_uint8 * start_bits,int cbit_type,unsigned int table_limit,compile_data * cd)763 set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
764   compile_data *cd)
765 {
766 register pcre_uint32 c;
767 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
768 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
769 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
770 #endif
771 }
772 
773 
774 
775 /*************************************************
776 *          Create bitmap of starting bytes       *
777 *************************************************/
778 
779 /* This function scans a compiled unanchored expression recursively and
780 attempts to build a bitmap of the set of possible starting bytes. As time goes
781 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
782 useful for parenthesized groups in patterns such as (a*)b where the group
783 provides some optional starting bytes but scanning must continue at the outer
784 level to find at least one mandatory byte. At the outermost level, this
785 function fails unless the result is SSB_DONE.
786 
787 Arguments:
788   code         points to an expression
789   start_bits   points to a 32-byte table, initialized to 0
790   utf          TRUE if in UTF-8 / UTF-16 / UTF-32 mode
791   cd           the block with char table pointers
792 
793 Returns:       SSB_FAIL     => Failed to find any starting bytes
794                SSB_DONE     => Found mandatory starting bytes
795                SSB_CONTINUE => Found optional starting bytes
796                SSB_UNKNOWN  => Hit an unrecognized opcode
797 */
798 
799 static int
set_start_bits(const pcre_uchar * code,pcre_uint8 * start_bits,BOOL utf,compile_data * cd)800 set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
801   compile_data *cd)
802 {
803 register pcre_uint32 c;
804 int yield = SSB_DONE;
805 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
806 int table_limit = utf? 16:32;
807 #else
808 int table_limit = 32;
809 #endif
810 
811 #if 0
812 /* ========================================================================= */
813 /* The following comment and code was inserted in January 1999. In May 2006,
814 when it was observed to cause compiler warnings about unused values, I took it
815 out again. If anybody is still using OS/2, they will have to put it back
816 manually. */
817 
818 /* This next statement and the later reference to dummy are here in order to
819 trick the optimizer of the IBM C compiler for OS/2 into generating correct
820 code. Apparently IBM isn't going to fix the problem, and we would rather not
821 disable optimization (in this module it actually makes a big difference, and
822 the pcre module can use all the optimization it can get). */
823 
824 volatile int dummy;
825 /* ========================================================================= */
826 #endif
827 
828 do
829   {
830   BOOL try_next = TRUE;
831   const pcre_uchar *tcode = code + 1 + LINK_SIZE;
832 
833   if (*code == OP_CBRA || *code == OP_SCBRA ||
834       *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
835 
836   while (try_next)    /* Loop for items in this branch */
837     {
838     int rc;
839 
840     switch(*tcode)
841       {
842       /* If we reach something we don't understand, it means a new opcode has
843       been created that hasn't been added to this code. Hopefully this problem
844       will be discovered during testing. */
845 
846       default:
847       return SSB_UNKNOWN;
848 
849       /* Fail for a valid opcode that implies no starting bits. */
850 
851       case OP_ACCEPT:
852       case OP_ASSERT_ACCEPT:
853       case OP_ALLANY:
854       case OP_ANY:
855       case OP_ANYBYTE:
856       case OP_CIRC:
857       case OP_CIRCM:
858       case OP_CLOSE:
859       case OP_COMMIT:
860       case OP_COND:
861       case OP_CREF:
862       case OP_DEF:
863       case OP_DNCREF:
864       case OP_DNREF:
865       case OP_DNREFI:
866       case OP_DNRREF:
867       case OP_DOLL:
868       case OP_DOLLM:
869       case OP_END:
870       case OP_EOD:
871       case OP_EODN:
872       case OP_EXTUNI:
873       case OP_FAIL:
874       case OP_MARK:
875       case OP_NOT:
876       case OP_NOTEXACT:
877       case OP_NOTEXACTI:
878       case OP_NOTI:
879       case OP_NOTMINPLUS:
880       case OP_NOTMINPLUSI:
881       case OP_NOTMINQUERY:
882       case OP_NOTMINQUERYI:
883       case OP_NOTMINSTAR:
884       case OP_NOTMINSTARI:
885       case OP_NOTMINUPTO:
886       case OP_NOTMINUPTOI:
887       case OP_NOTPLUS:
888       case OP_NOTPLUSI:
889       case OP_NOTPOSPLUS:
890       case OP_NOTPOSPLUSI:
891       case OP_NOTPOSQUERY:
892       case OP_NOTPOSQUERYI:
893       case OP_NOTPOSSTAR:
894       case OP_NOTPOSSTARI:
895       case OP_NOTPOSUPTO:
896       case OP_NOTPOSUPTOI:
897       case OP_NOTPROP:
898       case OP_NOTQUERY:
899       case OP_NOTQUERYI:
900       case OP_NOTSTAR:
901       case OP_NOTSTARI:
902       case OP_NOTUPTO:
903       case OP_NOTUPTOI:
904       case OP_NOT_HSPACE:
905       case OP_NOT_VSPACE:
906       case OP_PRUNE:
907       case OP_PRUNE_ARG:
908       case OP_RECURSE:
909       case OP_REF:
910       case OP_REFI:
911       case OP_REVERSE:
912       case OP_RREF:
913       case OP_SCOND:
914       case OP_SET_SOM:
915       case OP_SKIP:
916       case OP_SKIP_ARG:
917       case OP_SOD:
918       case OP_SOM:
919       case OP_THEN:
920       case OP_THEN_ARG:
921       return SSB_FAIL;
922 
923       /* A "real" property test implies no starting bits, but the fake property
924       PT_CLIST identifies a list of characters. These lists are short, as they
925       are used for characters with more than one "other case", so there is no
926       point in recognizing them for OP_NOTPROP. */
927 
928       case OP_PROP:
929       if (tcode[1] != PT_CLIST) return SSB_FAIL;
930         {
931         const pcre_uint32 *p = PRIV(ucd_caseless_sets) + tcode[2];
932         while ((c = *p++) < NOTACHAR)
933           {
934 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
935           if (utf)
936             {
937             pcre_uchar buff[6];
938             (void)PRIV(ord2utf)(c, buff);
939             c = buff[0];
940             }
941 #endif
942           if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
943           }
944         }
945       try_next = FALSE;
946       break;
947 
948       /* We can ignore word boundary tests. */
949 
950       case OP_WORD_BOUNDARY:
951       case OP_NOT_WORD_BOUNDARY:
952       tcode++;
953       break;
954 
955       /* If we hit a bracket or a positive lookahead assertion, recurse to set
956       bits from within the subpattern. If it can't find anything, we have to
957       give up. If it finds some mandatory character(s), we are done for this
958       branch. Otherwise, carry on scanning after the subpattern. */
959 
960       case OP_BRA:
961       case OP_SBRA:
962       case OP_CBRA:
963       case OP_SCBRA:
964       case OP_BRAPOS:
965       case OP_SBRAPOS:
966       case OP_CBRAPOS:
967       case OP_SCBRAPOS:
968       case OP_ONCE:
969       case OP_ONCE_NC:
970       case OP_ASSERT:
971       rc = set_start_bits(tcode, start_bits, utf, cd);
972       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
973       if (rc == SSB_DONE) try_next = FALSE; else
974         {
975         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
976         tcode += 1 + LINK_SIZE;
977         }
978       break;
979 
980       /* If we hit ALT or KET, it means we haven't found anything mandatory in
981       this branch, though we might have found something optional. For ALT, we
982       continue with the next alternative, but we have to arrange that the final
983       result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
984       return SSB_CONTINUE: if this is the top level, that indicates failure,
985       but after a nested subpattern, it causes scanning to continue. */
986 
987       case OP_ALT:
988       yield = SSB_CONTINUE;
989       try_next = FALSE;
990       break;
991 
992       case OP_KET:
993       case OP_KETRMAX:
994       case OP_KETRMIN:
995       case OP_KETRPOS:
996       return SSB_CONTINUE;
997 
998       /* Skip over callout */
999 
1000       case OP_CALLOUT:
1001       tcode += 2 + 2*LINK_SIZE;
1002       break;
1003 
1004       /* Skip over lookbehind and negative lookahead assertions */
1005 
1006       case OP_ASSERT_NOT:
1007       case OP_ASSERTBACK:
1008       case OP_ASSERTBACK_NOT:
1009       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1010       tcode += 1 + LINK_SIZE;
1011       break;
1012 
1013       /* BRAZERO does the bracket, but carries on. */
1014 
1015       case OP_BRAZERO:
1016       case OP_BRAMINZERO:
1017       case OP_BRAPOSZERO:
1018       rc = set_start_bits(++tcode, start_bits, utf, cd);
1019       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
1020 /* =========================================================================
1021       See the comment at the head of this function concerning the next line,
1022       which was an old fudge for the benefit of OS/2.
1023       dummy = 1;
1024   ========================================================================= */
1025       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1026       tcode += 1 + LINK_SIZE;
1027       break;
1028 
1029       /* SKIPZERO skips the bracket. */
1030 
1031       case OP_SKIPZERO:
1032       tcode++;
1033       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1034       tcode += 1 + LINK_SIZE;
1035       break;
1036 
1037       /* Single-char * or ? sets the bit and tries the next item */
1038 
1039       case OP_STAR:
1040       case OP_MINSTAR:
1041       case OP_POSSTAR:
1042       case OP_QUERY:
1043       case OP_MINQUERY:
1044       case OP_POSQUERY:
1045       tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
1046       break;
1047 
1048       case OP_STARI:
1049       case OP_MINSTARI:
1050       case OP_POSSTARI:
1051       case OP_QUERYI:
1052       case OP_MINQUERYI:
1053       case OP_POSQUERYI:
1054       tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
1055       break;
1056 
1057       /* Single-char upto sets the bit and tries the next */
1058 
1059       case OP_UPTO:
1060       case OP_MINUPTO:
1061       case OP_POSUPTO:
1062       tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
1063       break;
1064 
1065       case OP_UPTOI:
1066       case OP_MINUPTOI:
1067       case OP_POSUPTOI:
1068       tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
1069       break;
1070 
1071       /* At least one single char sets the bit and stops */
1072 
1073       case OP_EXACT:
1074       tcode += IMM2_SIZE;
1075       /* Fall through */
1076       case OP_CHAR:
1077       case OP_PLUS:
1078       case OP_MINPLUS:
1079       case OP_POSPLUS:
1080       (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
1081       try_next = FALSE;
1082       break;
1083 
1084       case OP_EXACTI:
1085       tcode += IMM2_SIZE;
1086       /* Fall through */
1087       case OP_CHARI:
1088       case OP_PLUSI:
1089       case OP_MINPLUSI:
1090       case OP_POSPLUSI:
1091       (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
1092       try_next = FALSE;
1093       break;
1094 
1095       /* Special spacing and line-terminating items. These recognize specific
1096       lists of characters. The difference between VSPACE and ANYNL is that the
1097       latter can match the two-character CRLF sequence, but that is not
1098       relevant for finding the first character, so their code here is
1099       identical. */
1100 
1101       case OP_HSPACE:
1102       SET_BIT(CHAR_HT);
1103       SET_BIT(CHAR_SPACE);
1104 #ifdef SUPPORT_UTF
1105       if (utf)
1106         {
1107 #ifdef COMPILE_PCRE8
1108         SET_BIT(0xC2);  /* For U+00A0 */
1109         SET_BIT(0xE1);  /* For U+1680, U+180E */
1110         SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1111         SET_BIT(0xE3);  /* For U+3000 */
1112 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1113         SET_BIT(0xA0);
1114         SET_BIT(0xFF);  /* For characters > 255 */
1115 #endif  /* COMPILE_PCRE[8|16|32] */
1116         }
1117       else
1118 #endif /* SUPPORT_UTF */
1119         {
1120 #ifndef EBCDIC
1121         SET_BIT(0xA0);
1122 #endif  /* Not EBCDIC */
1123 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1124         SET_BIT(0xFF);  /* For characters > 255 */
1125 #endif  /* COMPILE_PCRE[16|32] */
1126         }
1127       try_next = FALSE;
1128       break;
1129 
1130       case OP_ANYNL:
1131       case OP_VSPACE:
1132       SET_BIT(CHAR_LF);
1133       SET_BIT(CHAR_VT);
1134       SET_BIT(CHAR_FF);
1135       SET_BIT(CHAR_CR);
1136 #ifdef SUPPORT_UTF
1137       if (utf)
1138         {
1139 #ifdef COMPILE_PCRE8
1140         SET_BIT(0xC2);  /* For U+0085 */
1141         SET_BIT(0xE2);  /* For U+2028, U+2029 */
1142 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1143         SET_BIT(CHAR_NEL);
1144         SET_BIT(0xFF);  /* For characters > 255 */
1145 #endif  /* COMPILE_PCRE[8|16|32] */
1146         }
1147       else
1148 #endif /* SUPPORT_UTF */
1149         {
1150         SET_BIT(CHAR_NEL);
1151 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1152         SET_BIT(0xFF);  /* For characters > 255 */
1153 #endif
1154         }
1155       try_next = FALSE;
1156       break;
1157 
1158       /* Single character types set the bits and stop. Note that if PCRE_UCP
1159       is set, we do not see these op codes because \d etc are converted to
1160       properties. Therefore, these apply in the case when only characters less
1161       than 256 are recognized to match the types. */
1162 
1163       case OP_NOT_DIGIT:
1164       set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1165       try_next = FALSE;
1166       break;
1167 
1168       case OP_DIGIT:
1169       set_type_bits(start_bits, cbit_digit, table_limit, cd);
1170       try_next = FALSE;
1171       break;
1172 
1173       /* The cbit_space table has vertical tab as whitespace; we no longer
1174       have to play fancy tricks because Perl added VT to its whitespace at
1175       release 5.18. PCRE added it at release 8.34. */
1176 
1177       case OP_NOT_WHITESPACE:
1178       set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1179       try_next = FALSE;
1180       break;
1181 
1182       case OP_WHITESPACE:
1183       set_type_bits(start_bits, cbit_space, table_limit, cd);
1184       try_next = FALSE;
1185       break;
1186 
1187       case OP_NOT_WORDCHAR:
1188       set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1189       try_next = FALSE;
1190       break;
1191 
1192       case OP_WORDCHAR:
1193       set_type_bits(start_bits, cbit_word, table_limit, cd);
1194       try_next = FALSE;
1195       break;
1196 
1197       /* One or more character type fudges the pointer and restarts, knowing
1198       it will hit a single character type and stop there. */
1199 
1200       case OP_TYPEPLUS:
1201       case OP_TYPEMINPLUS:
1202       case OP_TYPEPOSPLUS:
1203       tcode++;
1204       break;
1205 
1206       case OP_TYPEEXACT:
1207       tcode += 1 + IMM2_SIZE;
1208       break;
1209 
1210       /* Zero or more repeats of character types set the bits and then
1211       try again. */
1212 
1213       case OP_TYPEUPTO:
1214       case OP_TYPEMINUPTO:
1215       case OP_TYPEPOSUPTO:
1216       tcode += IMM2_SIZE;  /* Fall through */
1217 
1218       case OP_TYPESTAR:
1219       case OP_TYPEMINSTAR:
1220       case OP_TYPEPOSSTAR:
1221       case OP_TYPEQUERY:
1222       case OP_TYPEMINQUERY:
1223       case OP_TYPEPOSQUERY:
1224       switch(tcode[1])
1225         {
1226         default:
1227         case OP_ANY:
1228         case OP_ALLANY:
1229         return SSB_FAIL;
1230 
1231         case OP_HSPACE:
1232         SET_BIT(CHAR_HT);
1233         SET_BIT(CHAR_SPACE);
1234 #ifdef SUPPORT_UTF
1235         if (utf)
1236           {
1237 #ifdef COMPILE_PCRE8
1238           SET_BIT(0xC2);  /* For U+00A0 */
1239           SET_BIT(0xE1);  /* For U+1680, U+180E */
1240           SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1241           SET_BIT(0xE3);  /* For U+3000 */
1242 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1243           SET_BIT(0xA0);
1244           SET_BIT(0xFF);  /* For characters > 255 */
1245 #endif  /* COMPILE_PCRE[8|16|32] */
1246           }
1247         else
1248 #endif /* SUPPORT_UTF */
1249 #ifndef EBCDIC
1250           SET_BIT(0xA0);
1251 #endif  /* Not EBCDIC */
1252         break;
1253 
1254         case OP_ANYNL:
1255         case OP_VSPACE:
1256         SET_BIT(CHAR_LF);
1257         SET_BIT(CHAR_VT);
1258         SET_BIT(CHAR_FF);
1259         SET_BIT(CHAR_CR);
1260 #ifdef SUPPORT_UTF
1261         if (utf)
1262           {
1263 #ifdef COMPILE_PCRE8
1264           SET_BIT(0xC2);  /* For U+0085 */
1265           SET_BIT(0xE2);  /* For U+2028, U+2029 */
1266 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1267           SET_BIT(CHAR_NEL);
1268           SET_BIT(0xFF);  /* For characters > 255 */
1269 #endif  /* COMPILE_PCRE16 */
1270           }
1271         else
1272 #endif /* SUPPORT_UTF */
1273           SET_BIT(CHAR_NEL);
1274         break;
1275 
1276         case OP_NOT_DIGIT:
1277         set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1278         break;
1279 
1280         case OP_DIGIT:
1281         set_type_bits(start_bits, cbit_digit, table_limit, cd);
1282         break;
1283 
1284         /* The cbit_space table has vertical tab as whitespace; we no longer
1285         have to play fancy tricks because Perl added VT to its whitespace at
1286         release 5.18. PCRE added it at release 8.34. */
1287 
1288         case OP_NOT_WHITESPACE:
1289         set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1290         break;
1291 
1292         case OP_WHITESPACE:
1293         set_type_bits(start_bits, cbit_space, table_limit, cd);
1294         break;
1295 
1296         case OP_NOT_WORDCHAR:
1297         set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1298         break;
1299 
1300         case OP_WORDCHAR:
1301         set_type_bits(start_bits, cbit_word, table_limit, cd);
1302         break;
1303         }
1304 
1305       tcode += 2;
1306       break;
1307 
1308       /* Character class where all the information is in a bit map: set the
1309       bits and either carry on or not, according to the repeat count. If it was
1310       a negative class, and we are operating with UTF-8 characters, any byte
1311       with a value >= 0xc4 is a potentially valid starter because it starts a
1312       character with a value > 255. */
1313 
1314 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1315       case OP_XCLASS:
1316       if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0)
1317         return SSB_FAIL;
1318       /* All bits are set. */
1319       if ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0 && (tcode[1 + LINK_SIZE] & XCL_NOT) != 0)
1320         return SSB_FAIL;
1321 #endif
1322       /* Fall through */
1323 
1324       case OP_NCLASS:
1325 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1326       if (utf)
1327         {
1328         start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
1329         memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
1330         }
1331 #endif
1332 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1333       SET_BIT(0xFF);                         /* For characters > 255 */
1334 #endif
1335       /* Fall through */
1336 
1337       case OP_CLASS:
1338         {
1339         pcre_uint8 *map;
1340 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1341         map = NULL;
1342         if (*tcode == OP_XCLASS)
1343           {
1344           if ((tcode[1 + LINK_SIZE] & XCL_MAP) != 0)
1345             map = (pcre_uint8 *)(tcode + 1 + LINK_SIZE + 1);
1346           tcode += GET(tcode, 1);
1347           }
1348         else
1349 #endif
1350           {
1351           tcode++;
1352           map = (pcre_uint8 *)tcode;
1353           tcode += 32 / sizeof(pcre_uchar);
1354           }
1355 
1356         /* In UTF-8 mode, the bits in a bit map correspond to character
1357         values, not to byte values. However, the bit map we are constructing is
1358         for byte values. So we have to do a conversion for characters whose
1359         value is > 127. In fact, there are only two possible starting bytes for
1360         characters in the range 128 - 255. */
1361 
1362 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1363         if (map != NULL)
1364 #endif
1365           {
1366 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1367           if (utf)
1368             {
1369             for (c = 0; c < 16; c++) start_bits[c] |= map[c];
1370             for (c = 128; c < 256; c++)
1371               {
1372               if ((map[c/8] && (1 << (c&7))) != 0)
1373                 {
1374                 int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
1375                 start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
1376                 c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
1377                 }
1378               }
1379             }
1380           else
1381 #endif
1382             {
1383             /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1384             for (c = 0; c < 32; c++) start_bits[c] |= map[c];
1385             }
1386           }
1387 
1388         /* Advance past the bit map, and act on what follows. For a zero
1389         minimum repeat, continue; otherwise stop processing. */
1390 
1391         switch (*tcode)
1392           {
1393           case OP_CRSTAR:
1394           case OP_CRMINSTAR:
1395           case OP_CRQUERY:
1396           case OP_CRMINQUERY:
1397           case OP_CRPOSSTAR:
1398           case OP_CRPOSQUERY:
1399           tcode++;
1400           break;
1401 
1402           case OP_CRRANGE:
1403           case OP_CRMINRANGE:
1404           case OP_CRPOSRANGE:
1405           if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1406             else try_next = FALSE;
1407           break;
1408 
1409           default:
1410           try_next = FALSE;
1411           break;
1412           }
1413         }
1414       break; /* End of bitmap class handling */
1415 
1416       }      /* End of switch */
1417     }        /* End of try_next loop */
1418 
1419   code += GET(code, 1);   /* Advance to next branch */
1420   }
1421 while (*code == OP_ALT);
1422 return yield;
1423 }
1424 
1425 
1426 
1427 
1428 
1429 /*************************************************
1430 *          Study a compiled expression           *
1431 *************************************************/
1432 
1433 /* This function is handed a compiled expression that it must study to produce
1434 information that will speed up the matching. It returns a pcre[16]_extra block
1435 which then gets handed back to pcre_exec().
1436 
1437 Arguments:
1438   re        points to the compiled expression
1439   options   contains option bits
1440   errorptr  points to where to place error messages;
1441             set NULL unless error
1442 
1443 Returns:    pointer to a pcre[16]_extra block, with study_data filled in and
1444               the appropriate flags set;
1445             NULL on error or if no optimization possible
1446 */
1447 
1448 #if defined COMPILE_PCRE8
1449 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
pcre_study(const pcre * external_re,int options,const char ** errorptr)1450 pcre_study(const pcre *external_re, int options, const char **errorptr)
1451 #elif defined COMPILE_PCRE16
1452 PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
1453 pcre16_study(const pcre16 *external_re, int options, const char **errorptr)
1454 #elif defined COMPILE_PCRE32
1455 PCRE_EXP_DEFN pcre32_extra * PCRE_CALL_CONVENTION
1456 pcre32_study(const pcre32 *external_re, int options, const char **errorptr)
1457 #endif
1458 {
1459 int min;
1460 int count = 0;
1461 BOOL bits_set = FALSE;
1462 pcre_uint8 start_bits[32];
1463 PUBL(extra) *extra = NULL;
1464 pcre_study_data *study;
1465 const pcre_uint8 *tables;
1466 pcre_uchar *code;
1467 compile_data compile_block;
1468 const REAL_PCRE *re = (const REAL_PCRE *)external_re;
1469 
1470 
1471 *errorptr = NULL;
1472 
1473 if (re == NULL || re->magic_number != MAGIC_NUMBER)
1474   {
1475   *errorptr = "argument is not a compiled regular expression";
1476   return NULL;
1477   }
1478 
1479 if ((re->flags & PCRE_MODE) == 0)
1480   {
1481 #if defined COMPILE_PCRE8
1482   *errorptr = "argument not compiled in 8 bit mode";
1483 #elif defined COMPILE_PCRE16
1484   *errorptr = "argument not compiled in 16 bit mode";
1485 #elif defined COMPILE_PCRE32
1486   *errorptr = "argument not compiled in 32 bit mode";
1487 #endif
1488   return NULL;
1489   }
1490 
1491 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1492   {
1493   *errorptr = "unknown or incorrect option bit(s) set";
1494   return NULL;
1495   }
1496 
1497 code = (pcre_uchar *)re + re->name_table_offset +
1498   (re->name_count * re->name_entry_size);
1499 
1500 /* For an anchored pattern, or an unanchored pattern that has a first char, or
1501 a multiline pattern that matches only at "line starts", there is no point in
1502 seeking a list of starting bytes. */
1503 
1504 if ((re->options & PCRE_ANCHORED) == 0 &&
1505     (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1506   {
1507   int rc;
1508 
1509   /* Set the character tables in the block that is passed around */
1510 
1511   tables = re->tables;
1512 
1513 #if defined COMPILE_PCRE8
1514   if (tables == NULL)
1515     (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1516     (void *)(&tables));
1517 #elif defined COMPILE_PCRE16
1518   if (tables == NULL)
1519     (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1520     (void *)(&tables));
1521 #elif defined COMPILE_PCRE32
1522   if (tables == NULL)
1523     (void)pcre32_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1524     (void *)(&tables));
1525 #endif
1526 
1527   compile_block.lcc = tables + lcc_offset;
1528   compile_block.fcc = tables + fcc_offset;
1529   compile_block.cbits = tables + cbits_offset;
1530   compile_block.ctypes = tables + ctypes_offset;
1531 
1532   /* See if we can find a fixed set of initial characters for the pattern. */
1533 
1534   memset(start_bits, 0, 32 * sizeof(pcre_uint8));
1535   rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1536     &compile_block);
1537   bits_set = rc == SSB_DONE;
1538   if (rc == SSB_UNKNOWN)
1539     {
1540     *errorptr = "internal error: opcode not recognized";
1541     return NULL;
1542     }
1543   }
1544 
1545 /* Find the minimum length of subject string. */
1546 
1547 switch(min = find_minlength(re, code, code, re->options, NULL, &count))
1548   {
1549   case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
1550   case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
1551   default: break;
1552   }
1553 
1554 /* If a set of starting bytes has been identified, or if the minimum length is
1555 greater than zero, or if JIT optimization has been requested, or if
1556 PCRE_STUDY_EXTRA_NEEDED is set, get a pcre[16]_extra block and a
1557 pcre_study_data block. The study data is put in the latter, which is pointed to
1558 by the former, which may also get additional data set later by the calling
1559 program. At the moment, the size of pcre_study_data is fixed. We nevertheless
1560 save it in a field for returning via the pcre_fullinfo() function so that if it
1561 becomes variable in the future, we don't have to change that code. */
1562 
1563 if (bits_set || min > 0 || (options & (
1564 #ifdef SUPPORT_JIT
1565     PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE |
1566     PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE |
1567 #endif
1568     PCRE_STUDY_EXTRA_NEEDED)) != 0)
1569   {
1570   extra = (PUBL(extra) *)(PUBL(malloc))
1571     (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
1572   if (extra == NULL)
1573     {
1574     *errorptr = "failed to get memory";
1575     return NULL;
1576     }
1577 
1578   study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
1579   extra->flags = PCRE_EXTRA_STUDY_DATA;
1580   extra->study_data = study;
1581 
1582   study->size = sizeof(pcre_study_data);
1583   study->flags = 0;
1584 
1585   /* Set the start bits always, to avoid unset memory errors if the
1586   study data is written to a file, but set the flag only if any of the bits
1587   are set, to save time looking when none are. */
1588 
1589   if (bits_set)
1590     {
1591     study->flags |= PCRE_STUDY_MAPPED;
1592     memcpy(study->start_bits, start_bits, sizeof(start_bits));
1593     }
1594   else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
1595 
1596 #ifdef PCRE_DEBUG
1597   if (bits_set)
1598     {
1599     pcre_uint8 *ptr = start_bits;
1600     int i;
1601 
1602     printf("Start bits:\n");
1603     for (i = 0; i < 32; i++)
1604       printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
1605     }
1606 #endif
1607 
1608   /* Always set the minlength value in the block, because the JIT compiler
1609   makes use of it. However, don't set the bit unless the length is greater than
1610   zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
1611   checking the zero case. */
1612 
1613   if (min > 0)
1614     {
1615     study->flags |= PCRE_STUDY_MINLEN;
1616     study->minlength = min;
1617     }
1618   else study->minlength = 0;
1619 
1620   /* If JIT support was compiled and requested, attempt the JIT compilation.
1621   If no starting bytes were found, and the minimum length is zero, and JIT
1622   compilation fails, abandon the extra block and return NULL, unless
1623   PCRE_STUDY_EXTRA_NEEDED is set. */
1624 
1625 #ifdef SUPPORT_JIT
1626   extra->executable_jit = NULL;
1627   if ((options & PCRE_STUDY_JIT_COMPILE) != 0)
1628     PRIV(jit_compile)(re, extra, JIT_COMPILE);
1629   if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0)
1630     PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE);
1631   if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0)
1632     PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE);
1633 
1634   if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0 &&
1635       (options & PCRE_STUDY_EXTRA_NEEDED) == 0)
1636     {
1637 #if defined COMPILE_PCRE8
1638     pcre_free_study(extra);
1639 #elif defined COMPILE_PCRE16
1640     pcre16_free_study(extra);
1641 #elif defined COMPILE_PCRE32
1642     pcre32_free_study(extra);
1643 #endif
1644     extra = NULL;
1645     }
1646 #endif
1647   }
1648 
1649 return extra;
1650 }
1651 
1652 
1653 /*************************************************
1654 *          Free the study data                   *
1655 *************************************************/
1656 
1657 /* This function frees the memory that was obtained by pcre_study().
1658 
1659 Argument:   a pointer to the pcre[16]_extra block
1660 Returns:    nothing
1661 */
1662 
1663 #if defined COMPILE_PCRE8
1664 PCRE_EXP_DEFN void
pcre_free_study(pcre_extra * extra)1665 pcre_free_study(pcre_extra *extra)
1666 #elif defined COMPILE_PCRE16
1667 PCRE_EXP_DEFN void
1668 pcre16_free_study(pcre16_extra *extra)
1669 #elif defined COMPILE_PCRE32
1670 PCRE_EXP_DEFN void
1671 pcre32_free_study(pcre32_extra *extra)
1672 #endif
1673 {
1674 if (extra == NULL)
1675   return;
1676 #ifdef SUPPORT_JIT
1677 if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
1678      extra->executable_jit != NULL)
1679   PRIV(jit_free)(extra->executable_jit);
1680 #endif
1681 PUBL(free)(extra);
1682 }
1683 
1684 /* End of pcre_study.c */
1685