xref: /PHP-5.3/ext/pcre/pcrelib/pcre_study.c (revision 357ab3cb)
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9            Copyright (c) 1997-2012 University of Cambridge
10 
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14 
15     * Redistributions of source code must retain the above copyright notice,
16       this list of conditions and the following disclaimer.
17 
18     * Redistributions in binary form must reproduce the above copyright
19       notice, this list of conditions and the following disclaimer in the
20       documentation and/or other materials provided with the distribution.
21 
22     * Neither the name of the University of Cambridge nor the names of its
23       contributors may be used to endorse or promote products derived from
24       this software without specific prior written permission.
25 
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39 
40 
41 /* This module contains the external function pcre_study(), along with local
42 supporting functions. */
43 
44 
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48 
49 #include "pcre_internal.h"
50 
51 #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
52 
53 /* Returns from set_start_bits() */
54 
55 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
56 
57 
58 
59 /*************************************************
60 *   Find the minimum subject length for a group  *
61 *************************************************/
62 
63 /* Scan a parenthesized group and compute the minimum length of subject that
64 is needed to match it. This is a lower bound; it does not mean there is a
65 string of that length that matches. In UTF8 mode, the result is in characters
66 rather than bytes.
67 
68 Arguments:
69   code            pointer to start of group (the bracket)
70   startcode       pointer to start of the whole pattern
71   options         the compiling options
72   int             RECURSE depth
73 
74 Returns:   the minimum length
75            -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
76            -2 internal error (missing capturing bracket)
77            -3 internal error (opcode not listed)
78 */
79 
80 static int
find_minlength(const pcre_uchar * code,const pcre_uchar * startcode,int options,int recurse_depth)81 find_minlength(const pcre_uchar *code, const pcre_uchar *startcode, int options,
82   int recurse_depth)
83 {
84 int length = -1;
85 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
86 BOOL utf = (options & PCRE_UTF8) != 0;
87 BOOL had_recurse = FALSE;
88 register int branchlength = 0;
89 register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
90 
91 if (*code == OP_CBRA || *code == OP_SCBRA ||
92     *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
93 
94 /* Scan along the opcodes for this branch. If we get to the end of the
95 branch, check the length against that of the other branches. */
96 
97 for (;;)
98   {
99   int d, min;
100   pcre_uchar *cs, *ce;
101   register pcre_uchar op = *cc;
102 
103   switch (op)
104     {
105     case OP_COND:
106     case OP_SCOND:
107 
108     /* If there is only one branch in a condition, the implied branch has zero
109     length, so we don't add anything. This covers the DEFINE "condition"
110     automatically. */
111 
112     cs = cc + GET(cc, 1);
113     if (*cs != OP_ALT)
114       {
115       cc = cs + 1 + LINK_SIZE;
116       break;
117       }
118 
119     /* Otherwise we can fall through and treat it the same as any other
120     subpattern. */
121 
122     case OP_CBRA:
123     case OP_SCBRA:
124     case OP_BRA:
125     case OP_SBRA:
126     case OP_CBRAPOS:
127     case OP_SCBRAPOS:
128     case OP_BRAPOS:
129     case OP_SBRAPOS:
130     case OP_ONCE:
131     case OP_ONCE_NC:
132     d = find_minlength(cc, startcode, options, recurse_depth);
133     if (d < 0) return d;
134     branchlength += d;
135     do cc += GET(cc, 1); while (*cc == OP_ALT);
136     cc += 1 + LINK_SIZE;
137     break;
138 
139     /* ACCEPT makes things far too complicated; we have to give up. */
140 
141     case OP_ACCEPT:
142     case OP_ASSERT_ACCEPT:
143     return -1;
144 
145     /* Reached end of a branch; if it's a ket it is the end of a nested
146     call. If it's ALT it is an alternation in a nested call. If it is END it's
147     the end of the outer call. All can be handled by the same code. If an
148     ACCEPT was previously encountered, use the length that was in force at that
149     time, and pass back the shortest ACCEPT length. */
150 
151     case OP_ALT:
152     case OP_KET:
153     case OP_KETRMAX:
154     case OP_KETRMIN:
155     case OP_KETRPOS:
156     case OP_END:
157     if (length < 0 || (!had_recurse && branchlength < length))
158       length = branchlength;
159     if (op != OP_ALT) return length;
160     cc += 1 + LINK_SIZE;
161     branchlength = 0;
162     had_recurse = FALSE;
163     break;
164 
165     /* Skip over assertive subpatterns */
166 
167     case OP_ASSERT:
168     case OP_ASSERT_NOT:
169     case OP_ASSERTBACK:
170     case OP_ASSERTBACK_NOT:
171     do cc += GET(cc, 1); while (*cc == OP_ALT);
172     /* Fall through */
173 
174     /* Skip over things that don't match chars */
175 
176     case OP_REVERSE:
177     case OP_CREF:
178     case OP_NCREF:
179     case OP_RREF:
180     case OP_NRREF:
181     case OP_DEF:
182     case OP_CALLOUT:
183     case OP_SOD:
184     case OP_SOM:
185     case OP_EOD:
186     case OP_EODN:
187     case OP_CIRC:
188     case OP_CIRCM:
189     case OP_DOLL:
190     case OP_DOLLM:
191     case OP_NOT_WORD_BOUNDARY:
192     case OP_WORD_BOUNDARY:
193     cc += PRIV(OP_lengths)[*cc];
194     break;
195 
196     /* Skip over a subpattern that has a {0} or {0,x} quantifier */
197 
198     case OP_BRAZERO:
199     case OP_BRAMINZERO:
200     case OP_BRAPOSZERO:
201     case OP_SKIPZERO:
202     cc += PRIV(OP_lengths)[*cc];
203     do cc += GET(cc, 1); while (*cc == OP_ALT);
204     cc += 1 + LINK_SIZE;
205     break;
206 
207     /* Handle literal characters and + repetitions */
208 
209     case OP_CHAR:
210     case OP_CHARI:
211     case OP_NOT:
212     case OP_NOTI:
213     case OP_PLUS:
214     case OP_PLUSI:
215     case OP_MINPLUS:
216     case OP_MINPLUSI:
217     case OP_POSPLUS:
218     case OP_POSPLUSI:
219     case OP_NOTPLUS:
220     case OP_NOTPLUSI:
221     case OP_NOTMINPLUS:
222     case OP_NOTMINPLUSI:
223     case OP_NOTPOSPLUS:
224     case OP_NOTPOSPLUSI:
225     branchlength++;
226     cc += 2;
227 #ifdef SUPPORT_UTF
228     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
229 #endif
230     break;
231 
232     case OP_TYPEPLUS:
233     case OP_TYPEMINPLUS:
234     case OP_TYPEPOSPLUS:
235     branchlength++;
236     cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
237     break;
238 
239     /* Handle exact repetitions. The count is already in characters, but we
240     need to skip over a multibyte character in UTF8 mode.  */
241 
242     case OP_EXACT:
243     case OP_EXACTI:
244     case OP_NOTEXACT:
245     case OP_NOTEXACTI:
246     branchlength += GET2(cc,1);
247     cc += 2 + IMM2_SIZE;
248 #ifdef SUPPORT_UTF
249     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
250 #endif
251     break;
252 
253     case OP_TYPEEXACT:
254     branchlength += GET2(cc,1);
255     cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
256       || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
257     break;
258 
259     /* Handle single-char non-literal matchers */
260 
261     case OP_PROP:
262     case OP_NOTPROP:
263     cc += 2;
264     /* Fall through */
265 
266     case OP_NOT_DIGIT:
267     case OP_DIGIT:
268     case OP_NOT_WHITESPACE:
269     case OP_WHITESPACE:
270     case OP_NOT_WORDCHAR:
271     case OP_WORDCHAR:
272     case OP_ANY:
273     case OP_ALLANY:
274     case OP_EXTUNI:
275     case OP_HSPACE:
276     case OP_NOT_HSPACE:
277     case OP_VSPACE:
278     case OP_NOT_VSPACE:
279     branchlength++;
280     cc++;
281     break;
282 
283     /* "Any newline" might match two characters, but it also might match just
284     one. */
285 
286     case OP_ANYNL:
287     branchlength += 1;
288     cc++;
289     break;
290 
291     /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
292     non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
293     appear, but leave the code, just in case.) */
294 
295     case OP_ANYBYTE:
296 #ifdef SUPPORT_UTF
297     if (utf) return -1;
298 #endif
299     branchlength++;
300     cc++;
301     break;
302 
303     /* For repeated character types, we have to test for \p and \P, which have
304     an extra two bytes of parameters. */
305 
306     case OP_TYPESTAR:
307     case OP_TYPEMINSTAR:
308     case OP_TYPEQUERY:
309     case OP_TYPEMINQUERY:
310     case OP_TYPEPOSSTAR:
311     case OP_TYPEPOSQUERY:
312     if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
313     cc += PRIV(OP_lengths)[op];
314     break;
315 
316     case OP_TYPEUPTO:
317     case OP_TYPEMINUPTO:
318     case OP_TYPEPOSUPTO:
319     if (cc[1 + IMM2_SIZE] == OP_PROP
320       || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
321     cc += PRIV(OP_lengths)[op];
322     break;
323 
324     /* Check a class for variable quantification */
325 
326     case OP_CLASS:
327     case OP_NCLASS:
328 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
329     case OP_XCLASS:
330     /* The original code caused an unsigned overflow in 64 bit systems,
331     so now we use a conditional statement. */
332     if (op == OP_XCLASS)
333       cc += GET(cc, 1);
334     else
335       cc += PRIV(OP_lengths)[OP_CLASS];
336 #else
337     cc += PRIV(OP_lengths)[OP_CLASS];
338 #endif
339 
340     switch (*cc)
341       {
342       case OP_CRPLUS:
343       case OP_CRMINPLUS:
344       branchlength++;
345       /* Fall through */
346 
347       case OP_CRSTAR:
348       case OP_CRMINSTAR:
349       case OP_CRQUERY:
350       case OP_CRMINQUERY:
351       cc++;
352       break;
353 
354       case OP_CRRANGE:
355       case OP_CRMINRANGE:
356       branchlength += GET2(cc,1);
357       cc += 1 + 2 * IMM2_SIZE;
358       break;
359 
360       default:
361       branchlength++;
362       break;
363       }
364     break;
365 
366     /* Backreferences and subroutine calls are treated in the same way: we find
367     the minimum length for the subpattern. A recursion, however, causes an
368     a flag to be set that causes the length of this branch to be ignored. The
369     logic is that a recursion can only make sense if there is another
370     alternation that stops the recursing. That will provide the minimum length
371     (when no recursion happens). A backreference within the group that it is
372     referencing behaves in the same way.
373 
374     If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
375     matches an empty string (by default it causes a matching failure), so in
376     that case we must set the minimum length to zero. */
377 
378     case OP_REF:
379     case OP_REFI:
380     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
381       {
382       ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
383       if (cs == NULL) return -2;
384       do ce += GET(ce, 1); while (*ce == OP_ALT);
385       if (cc > cs && cc < ce)
386         {
387         d = 0;
388         had_recurse = TRUE;
389         }
390       else
391         {
392         d = find_minlength(cs, startcode, options, recurse_depth);
393         }
394       }
395     else d = 0;
396     cc += 1 + IMM2_SIZE;
397 
398     /* Handle repeated back references */
399 
400     switch (*cc)
401       {
402       case OP_CRSTAR:
403       case OP_CRMINSTAR:
404       case OP_CRQUERY:
405       case OP_CRMINQUERY:
406       min = 0;
407       cc++;
408       break;
409 
410       case OP_CRPLUS:
411       case OP_CRMINPLUS:
412       min = 1;
413       cc++;
414       break;
415 
416       case OP_CRRANGE:
417       case OP_CRMINRANGE:
418       min = GET2(cc, 1);
419       cc += 1 + 2 * IMM2_SIZE;
420       break;
421 
422       default:
423       min = 1;
424       break;
425       }
426 
427     branchlength += min * d;
428     break;
429 
430     /* We can easily detect direct recursion, but not mutual recursion. This is
431     caught by a recursion depth count. */
432 
433     case OP_RECURSE:
434     cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
435     do ce += GET(ce, 1); while (*ce == OP_ALT);
436     if ((cc > cs && cc < ce) || recurse_depth > 10)
437       had_recurse = TRUE;
438     else
439       {
440       branchlength += find_minlength(cs, startcode, options, recurse_depth + 1);
441       }
442     cc += 1 + LINK_SIZE;
443     break;
444 
445     /* Anything else does not or need not match a character. We can get the
446     item's length from the table, but for those that can match zero occurrences
447     of a character, we must take special action for UTF-8 characters. As it
448     happens, the "NOT" versions of these opcodes are used at present only for
449     ASCII characters, so they could be omitted from this list. However, in
450     future that may change, so we include them here so as not to leave a
451     gotcha for a future maintainer. */
452 
453     case OP_UPTO:
454     case OP_UPTOI:
455     case OP_NOTUPTO:
456     case OP_NOTUPTOI:
457     case OP_MINUPTO:
458     case OP_MINUPTOI:
459     case OP_NOTMINUPTO:
460     case OP_NOTMINUPTOI:
461     case OP_POSUPTO:
462     case OP_POSUPTOI:
463     case OP_NOTPOSUPTO:
464     case OP_NOTPOSUPTOI:
465 
466     case OP_STAR:
467     case OP_STARI:
468     case OP_NOTSTAR:
469     case OP_NOTSTARI:
470     case OP_MINSTAR:
471     case OP_MINSTARI:
472     case OP_NOTMINSTAR:
473     case OP_NOTMINSTARI:
474     case OP_POSSTAR:
475     case OP_POSSTARI:
476     case OP_NOTPOSSTAR:
477     case OP_NOTPOSSTARI:
478 
479     case OP_QUERY:
480     case OP_QUERYI:
481     case OP_NOTQUERY:
482     case OP_NOTQUERYI:
483     case OP_MINQUERY:
484     case OP_MINQUERYI:
485     case OP_NOTMINQUERY:
486     case OP_NOTMINQUERYI:
487     case OP_POSQUERY:
488     case OP_POSQUERYI:
489     case OP_NOTPOSQUERY:
490     case OP_NOTPOSQUERYI:
491 
492     cc += PRIV(OP_lengths)[op];
493 #ifdef SUPPORT_UTF
494     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
495 #endif
496     break;
497 
498     /* Skip these, but we need to add in the name length. */
499 
500     case OP_MARK:
501     case OP_PRUNE_ARG:
502     case OP_SKIP_ARG:
503     case OP_THEN_ARG:
504     cc += PRIV(OP_lengths)[op] + cc[1];
505     break;
506 
507     /* The remaining opcodes are just skipped over. */
508 
509     case OP_CLOSE:
510     case OP_COMMIT:
511     case OP_FAIL:
512     case OP_PRUNE:
513     case OP_SET_SOM:
514     case OP_SKIP:
515     case OP_THEN:
516     cc += PRIV(OP_lengths)[op];
517     break;
518 
519     /* This should not occur: we list all opcodes explicitly so that when
520     new ones get added they are properly considered. */
521 
522     default:
523     return -3;
524     }
525   }
526 /* Control never gets here */
527 }
528 
529 
530 
531 /*************************************************
532 *      Set a bit and maybe its alternate case    *
533 *************************************************/
534 
535 /* Given a character, set its first byte's bit in the table, and also the
536 corresponding bit for the other version of a letter if we are caseless. In
537 UTF-8 mode, for characters greater than 127, we can only do the caseless thing
538 when Unicode property support is available.
539 
540 Arguments:
541   start_bits    points to the bit map
542   p             points to the character
543   caseless      the caseless flag
544   cd            the block with char table pointers
545   utf           TRUE for UTF-8 / UTF-16 / UTF-32 mode
546 
547 Returns:        pointer after the character
548 */
549 
550 static const pcre_uchar *
set_table_bit(pcre_uint8 * start_bits,const pcre_uchar * p,BOOL caseless,compile_data * cd,BOOL utf)551 set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
552   compile_data *cd, BOOL utf)
553 {
554 pcre_uint32 c = *p;
555 
556 #ifdef COMPILE_PCRE8
557 SET_BIT(c);
558 
559 #ifdef SUPPORT_UTF
560 if (utf && c > 127)
561   {
562   GETCHARINC(c, p);
563 #ifdef SUPPORT_UCP
564   if (caseless)
565     {
566     pcre_uchar buff[6];
567     c = UCD_OTHERCASE(c);
568     (void)PRIV(ord2utf)(c, buff);
569     SET_BIT(buff[0]);
570     }
571 #endif  /* Not SUPPORT_UCP */
572   return p;
573   }
574 #else   /* Not SUPPORT_UTF */
575 (void)(utf);   /* Stops warning for unused parameter */
576 #endif  /* SUPPORT_UTF */
577 
578 /* Not UTF-8 mode, or character is less than 127. */
579 
580 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
581 return p + 1;
582 #endif  /* COMPILE_PCRE8 */
583 
584 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
585 if (c > 0xff)
586   {
587   c = 0xff;
588   caseless = FALSE;
589   }
590 SET_BIT(c);
591 
592 #ifdef SUPPORT_UTF
593 if (utf && c > 127)
594   {
595   GETCHARINC(c, p);
596 #ifdef SUPPORT_UCP
597   if (caseless)
598     {
599     c = UCD_OTHERCASE(c);
600     if (c > 0xff)
601       c = 0xff;
602     SET_BIT(c);
603     }
604 #endif  /* SUPPORT_UCP */
605   return p;
606   }
607 #else   /* Not SUPPORT_UTF */
608 (void)(utf);   /* Stops warning for unused parameter */
609 #endif  /* SUPPORT_UTF */
610 
611 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
612 return p + 1;
613 #endif
614 }
615 
616 
617 
618 /*************************************************
619 *     Set bits for a positive character type     *
620 *************************************************/
621 
622 /* This function sets starting bits for a character type. In UTF-8 mode, we can
623 only do a direct setting for bytes less than 128, as otherwise there can be
624 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
625 environment, the tables will only recognize ASCII characters anyway, but in at
626 least one Windows environment, some higher bytes bits were set in the tables.
627 So we deal with that case by considering the UTF-8 encoding.
628 
629 Arguments:
630   start_bits     the starting bitmap
631   cbit type      the type of character wanted
632   table_limit    32 for non-UTF-8; 16 for UTF-8
633   cd             the block with char table pointers
634 
635 Returns:         nothing
636 */
637 
638 static void
set_type_bits(pcre_uint8 * start_bits,int cbit_type,unsigned int table_limit,compile_data * cd)639 set_type_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
640   compile_data *cd)
641 {
642 register pcre_uint32 c;
643 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
644 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
645 if (table_limit == 32) return;
646 for (c = 128; c < 256; c++)
647   {
648   if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
649     {
650     pcre_uchar buff[6];
651     (void)PRIV(ord2utf)(c, buff);
652     SET_BIT(buff[0]);
653     }
654   }
655 #endif
656 }
657 
658 
659 /*************************************************
660 *     Set bits for a negative character type     *
661 *************************************************/
662 
663 /* This function sets starting bits for a negative character type such as \D.
664 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
665 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
666 Unlike in the positive case, where we can set appropriate starting bits for
667 specific high-valued UTF-8 characters, in this case we have to set the bits for
668 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
669 0xc0 (192) for simplicity.
670 
671 Arguments:
672   start_bits     the starting bitmap
673   cbit type      the type of character wanted
674   table_limit    32 for non-UTF-8; 16 for UTF-8
675   cd             the block with char table pointers
676 
677 Returns:         nothing
678 */
679 
680 static void
set_nottype_bits(pcre_uint8 * start_bits,int cbit_type,unsigned int table_limit,compile_data * cd)681 set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
682   compile_data *cd)
683 {
684 register pcre_uint32 c;
685 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
686 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
687 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
688 #endif
689 }
690 
691 
692 
693 /*************************************************
694 *          Create bitmap of starting bytes       *
695 *************************************************/
696 
697 /* This function scans a compiled unanchored expression recursively and
698 attempts to build a bitmap of the set of possible starting bytes. As time goes
699 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
700 useful for parenthesized groups in patterns such as (a*)b where the group
701 provides some optional starting bytes but scanning must continue at the outer
702 level to find at least one mandatory byte. At the outermost level, this
703 function fails unless the result is SSB_DONE.
704 
705 Arguments:
706   code         points to an expression
707   start_bits   points to a 32-byte table, initialized to 0
708   utf          TRUE if in UTF-8 / UTF-16 / UTF-32 mode
709   cd           the block with char table pointers
710 
711 Returns:       SSB_FAIL     => Failed to find any starting bytes
712                SSB_DONE     => Found mandatory starting bytes
713                SSB_CONTINUE => Found optional starting bytes
714                SSB_UNKNOWN  => Hit an unrecognized opcode
715 */
716 
717 static int
set_start_bits(const pcre_uchar * code,pcre_uint8 * start_bits,BOOL utf,compile_data * cd)718 set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
719   compile_data *cd)
720 {
721 register pcre_uint32 c;
722 int yield = SSB_DONE;
723 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
724 int table_limit = utf? 16:32;
725 #else
726 int table_limit = 32;
727 #endif
728 
729 #if 0
730 /* ========================================================================= */
731 /* The following comment and code was inserted in January 1999. In May 2006,
732 when it was observed to cause compiler warnings about unused values, I took it
733 out again. If anybody is still using OS/2, they will have to put it back
734 manually. */
735 
736 /* This next statement and the later reference to dummy are here in order to
737 trick the optimizer of the IBM C compiler for OS/2 into generating correct
738 code. Apparently IBM isn't going to fix the problem, and we would rather not
739 disable optimization (in this module it actually makes a big difference, and
740 the pcre module can use all the optimization it can get). */
741 
742 volatile int dummy;
743 /* ========================================================================= */
744 #endif
745 
746 do
747   {
748   BOOL try_next = TRUE;
749   const pcre_uchar *tcode = code + 1 + LINK_SIZE;
750 
751   if (*code == OP_CBRA || *code == OP_SCBRA ||
752       *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
753 
754   while (try_next)    /* Loop for items in this branch */
755     {
756     int rc;
757 
758     switch(*tcode)
759       {
760       /* If we reach something we don't understand, it means a new opcode has
761       been created that hasn't been added to this code. Hopefully this problem
762       will be discovered during testing. */
763 
764       default:
765       return SSB_UNKNOWN;
766 
767       /* Fail for a valid opcode that implies no starting bits. */
768 
769       case OP_ACCEPT:
770       case OP_ASSERT_ACCEPT:
771       case OP_ALLANY:
772       case OP_ANY:
773       case OP_ANYBYTE:
774       case OP_CIRC:
775       case OP_CIRCM:
776       case OP_CLOSE:
777       case OP_COMMIT:
778       case OP_COND:
779       case OP_CREF:
780       case OP_DEF:
781       case OP_DOLL:
782       case OP_DOLLM:
783       case OP_END:
784       case OP_EOD:
785       case OP_EODN:
786       case OP_EXTUNI:
787       case OP_FAIL:
788       case OP_MARK:
789       case OP_NCREF:
790       case OP_NOT:
791       case OP_NOTEXACT:
792       case OP_NOTEXACTI:
793       case OP_NOTI:
794       case OP_NOTMINPLUS:
795       case OP_NOTMINPLUSI:
796       case OP_NOTMINQUERY:
797       case OP_NOTMINQUERYI:
798       case OP_NOTMINSTAR:
799       case OP_NOTMINSTARI:
800       case OP_NOTMINUPTO:
801       case OP_NOTMINUPTOI:
802       case OP_NOTPLUS:
803       case OP_NOTPLUSI:
804       case OP_NOTPOSPLUS:
805       case OP_NOTPOSPLUSI:
806       case OP_NOTPOSQUERY:
807       case OP_NOTPOSQUERYI:
808       case OP_NOTPOSSTAR:
809       case OP_NOTPOSSTARI:
810       case OP_NOTPOSUPTO:
811       case OP_NOTPOSUPTOI:
812       case OP_NOTPROP:
813       case OP_NOTQUERY:
814       case OP_NOTQUERYI:
815       case OP_NOTSTAR:
816       case OP_NOTSTARI:
817       case OP_NOTUPTO:
818       case OP_NOTUPTOI:
819       case OP_NOT_HSPACE:
820       case OP_NOT_VSPACE:
821       case OP_NRREF:
822       case OP_PROP:
823       case OP_PRUNE:
824       case OP_PRUNE_ARG:
825       case OP_RECURSE:
826       case OP_REF:
827       case OP_REFI:
828       case OP_REVERSE:
829       case OP_RREF:
830       case OP_SCOND:
831       case OP_SET_SOM:
832       case OP_SKIP:
833       case OP_SKIP_ARG:
834       case OP_SOD:
835       case OP_SOM:
836       case OP_THEN:
837       case OP_THEN_ARG:
838 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
839       case OP_XCLASS:
840 #endif
841       return SSB_FAIL;
842 
843       /* We can ignore word boundary tests. */
844 
845       case OP_WORD_BOUNDARY:
846       case OP_NOT_WORD_BOUNDARY:
847       tcode++;
848       break;
849 
850       /* If we hit a bracket or a positive lookahead assertion, recurse to set
851       bits from within the subpattern. If it can't find anything, we have to
852       give up. If it finds some mandatory character(s), we are done for this
853       branch. Otherwise, carry on scanning after the subpattern. */
854 
855       case OP_BRA:
856       case OP_SBRA:
857       case OP_CBRA:
858       case OP_SCBRA:
859       case OP_BRAPOS:
860       case OP_SBRAPOS:
861       case OP_CBRAPOS:
862       case OP_SCBRAPOS:
863       case OP_ONCE:
864       case OP_ONCE_NC:
865       case OP_ASSERT:
866       rc = set_start_bits(tcode, start_bits, utf, cd);
867       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
868       if (rc == SSB_DONE) try_next = FALSE; else
869         {
870         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
871         tcode += 1 + LINK_SIZE;
872         }
873       break;
874 
875       /* If we hit ALT or KET, it means we haven't found anything mandatory in
876       this branch, though we might have found something optional. For ALT, we
877       continue with the next alternative, but we have to arrange that the final
878       result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
879       return SSB_CONTINUE: if this is the top level, that indicates failure,
880       but after a nested subpattern, it causes scanning to continue. */
881 
882       case OP_ALT:
883       yield = SSB_CONTINUE;
884       try_next = FALSE;
885       break;
886 
887       case OP_KET:
888       case OP_KETRMAX:
889       case OP_KETRMIN:
890       case OP_KETRPOS:
891       return SSB_CONTINUE;
892 
893       /* Skip over callout */
894 
895       case OP_CALLOUT:
896       tcode += 2 + 2*LINK_SIZE;
897       break;
898 
899       /* Skip over lookbehind and negative lookahead assertions */
900 
901       case OP_ASSERT_NOT:
902       case OP_ASSERTBACK:
903       case OP_ASSERTBACK_NOT:
904       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
905       tcode += 1 + LINK_SIZE;
906       break;
907 
908       /* BRAZERO does the bracket, but carries on. */
909 
910       case OP_BRAZERO:
911       case OP_BRAMINZERO:
912       case OP_BRAPOSZERO:
913       rc = set_start_bits(++tcode, start_bits, utf, cd);
914       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
915 /* =========================================================================
916       See the comment at the head of this function concerning the next line,
917       which was an old fudge for the benefit of OS/2.
918       dummy = 1;
919   ========================================================================= */
920       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
921       tcode += 1 + LINK_SIZE;
922       break;
923 
924       /* SKIPZERO skips the bracket. */
925 
926       case OP_SKIPZERO:
927       tcode++;
928       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
929       tcode += 1 + LINK_SIZE;
930       break;
931 
932       /* Single-char * or ? sets the bit and tries the next item */
933 
934       case OP_STAR:
935       case OP_MINSTAR:
936       case OP_POSSTAR:
937       case OP_QUERY:
938       case OP_MINQUERY:
939       case OP_POSQUERY:
940       tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
941       break;
942 
943       case OP_STARI:
944       case OP_MINSTARI:
945       case OP_POSSTARI:
946       case OP_QUERYI:
947       case OP_MINQUERYI:
948       case OP_POSQUERYI:
949       tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
950       break;
951 
952       /* Single-char upto sets the bit and tries the next */
953 
954       case OP_UPTO:
955       case OP_MINUPTO:
956       case OP_POSUPTO:
957       tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
958       break;
959 
960       case OP_UPTOI:
961       case OP_MINUPTOI:
962       case OP_POSUPTOI:
963       tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
964       break;
965 
966       /* At least one single char sets the bit and stops */
967 
968       case OP_EXACT:
969       tcode += IMM2_SIZE;
970       /* Fall through */
971       case OP_CHAR:
972       case OP_PLUS:
973       case OP_MINPLUS:
974       case OP_POSPLUS:
975       (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
976       try_next = FALSE;
977       break;
978 
979       case OP_EXACTI:
980       tcode += IMM2_SIZE;
981       /* Fall through */
982       case OP_CHARI:
983       case OP_PLUSI:
984       case OP_MINPLUSI:
985       case OP_POSPLUSI:
986       (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
987       try_next = FALSE;
988       break;
989 
990       /* Special spacing and line-terminating items. These recognize specific
991       lists of characters. The difference between VSPACE and ANYNL is that the
992       latter can match the two-character CRLF sequence, but that is not
993       relevant for finding the first character, so their code here is
994       identical. */
995 
996       case OP_HSPACE:
997       SET_BIT(CHAR_HT);
998       SET_BIT(CHAR_SPACE);
999 #ifdef SUPPORT_UTF
1000       if (utf)
1001         {
1002 #ifdef COMPILE_PCRE8
1003         SET_BIT(0xC2);  /* For U+00A0 */
1004         SET_BIT(0xE1);  /* For U+1680, U+180E */
1005         SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1006         SET_BIT(0xE3);  /* For U+3000 */
1007 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1008         SET_BIT(0xA0);
1009         SET_BIT(0xFF);  /* For characters > 255 */
1010 #endif  /* COMPILE_PCRE[8|16|32] */
1011         }
1012       else
1013 #endif /* SUPPORT_UTF */
1014         {
1015 #ifndef EBCDIC
1016         SET_BIT(0xA0);
1017 #endif  /* Not EBCDIC */
1018 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1019         SET_BIT(0xFF);  /* For characters > 255 */
1020 #endif  /* COMPILE_PCRE[16|32] */
1021         }
1022       try_next = FALSE;
1023       break;
1024 
1025       case OP_ANYNL:
1026       case OP_VSPACE:
1027       SET_BIT(CHAR_LF);
1028       SET_BIT(CHAR_VT);
1029       SET_BIT(CHAR_FF);
1030       SET_BIT(CHAR_CR);
1031 #ifdef SUPPORT_UTF
1032       if (utf)
1033         {
1034 #ifdef COMPILE_PCRE8
1035         SET_BIT(0xC2);  /* For U+0085 */
1036         SET_BIT(0xE2);  /* For U+2028, U+2029 */
1037 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1038         SET_BIT(CHAR_NEL);
1039         SET_BIT(0xFF);  /* For characters > 255 */
1040 #endif  /* COMPILE_PCRE[8|16|32] */
1041         }
1042       else
1043 #endif /* SUPPORT_UTF */
1044         {
1045         SET_BIT(CHAR_NEL);
1046 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1047         SET_BIT(0xFF);  /* For characters > 255 */
1048 #endif
1049         }
1050       try_next = FALSE;
1051       break;
1052 
1053       /* Single character types set the bits and stop. Note that if PCRE_UCP
1054       is set, we do not see these op codes because \d etc are converted to
1055       properties. Therefore, these apply in the case when only characters less
1056       than 256 are recognized to match the types. */
1057 
1058       case OP_NOT_DIGIT:
1059       set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1060       try_next = FALSE;
1061       break;
1062 
1063       case OP_DIGIT:
1064       set_type_bits(start_bits, cbit_digit, table_limit, cd);
1065       try_next = FALSE;
1066       break;
1067 
1068       /* The cbit_space table has vertical tab as whitespace; we have to
1069       ensure it is set as not whitespace. Luckily, the code value is the same
1070       (0x0b) in ASCII and EBCDIC, so we can just adjust the appropriate bit. */
1071 
1072       case OP_NOT_WHITESPACE:
1073       set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1074       start_bits[1] |= 0x08;
1075       try_next = FALSE;
1076       break;
1077 
1078       /* The cbit_space table has vertical tab as whitespace; we have to not
1079       set it from the table. Luckily, the code value is the same (0x0b) in
1080       ASCII and EBCDIC, so we can just adjust the appropriate bit. */
1081 
1082       case OP_WHITESPACE:
1083       c = start_bits[1];    /* Save in case it was already set */
1084       set_type_bits(start_bits, cbit_space, table_limit, cd);
1085       start_bits[1] = (start_bits[1] & ~0x08) | c;
1086       try_next = FALSE;
1087       break;
1088 
1089       case OP_NOT_WORDCHAR:
1090       set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1091       try_next = FALSE;
1092       break;
1093 
1094       case OP_WORDCHAR:
1095       set_type_bits(start_bits, cbit_word, table_limit, cd);
1096       try_next = FALSE;
1097       break;
1098 
1099       /* One or more character type fudges the pointer and restarts, knowing
1100       it will hit a single character type and stop there. */
1101 
1102       case OP_TYPEPLUS:
1103       case OP_TYPEMINPLUS:
1104       case OP_TYPEPOSPLUS:
1105       tcode++;
1106       break;
1107 
1108       case OP_TYPEEXACT:
1109       tcode += 1 + IMM2_SIZE;
1110       break;
1111 
1112       /* Zero or more repeats of character types set the bits and then
1113       try again. */
1114 
1115       case OP_TYPEUPTO:
1116       case OP_TYPEMINUPTO:
1117       case OP_TYPEPOSUPTO:
1118       tcode += IMM2_SIZE;  /* Fall through */
1119 
1120       case OP_TYPESTAR:
1121       case OP_TYPEMINSTAR:
1122       case OP_TYPEPOSSTAR:
1123       case OP_TYPEQUERY:
1124       case OP_TYPEMINQUERY:
1125       case OP_TYPEPOSQUERY:
1126       switch(tcode[1])
1127         {
1128         default:
1129         case OP_ANY:
1130         case OP_ALLANY:
1131         return SSB_FAIL;
1132 
1133         case OP_HSPACE:
1134         SET_BIT(CHAR_HT);
1135         SET_BIT(CHAR_SPACE);
1136 #ifdef SUPPORT_UTF
1137         if (utf)
1138           {
1139 #ifdef COMPILE_PCRE8
1140           SET_BIT(0xC2);  /* For U+00A0 */
1141           SET_BIT(0xE1);  /* For U+1680, U+180E */
1142           SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1143           SET_BIT(0xE3);  /* For U+3000 */
1144 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1145           SET_BIT(0xA0);
1146           SET_BIT(0xFF);  /* For characters > 255 */
1147 #endif  /* COMPILE_PCRE[8|16|32] */
1148           }
1149         else
1150 #endif /* SUPPORT_UTF */
1151 #ifndef EBCDIC
1152           SET_BIT(0xA0);
1153 #endif  /* Not EBCDIC */
1154         break;
1155 
1156         case OP_ANYNL:
1157         case OP_VSPACE:
1158         SET_BIT(CHAR_LF);
1159         SET_BIT(CHAR_VT);
1160         SET_BIT(CHAR_FF);
1161         SET_BIT(CHAR_CR);
1162 #ifdef SUPPORT_UTF
1163         if (utf)
1164           {
1165 #ifdef COMPILE_PCRE8
1166           SET_BIT(0xC2);  /* For U+0085 */
1167           SET_BIT(0xE2);  /* For U+2028, U+2029 */
1168 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1169           SET_BIT(CHAR_NEL);
1170           SET_BIT(0xFF);  /* For characters > 255 */
1171 #endif  /* COMPILE_PCRE16 */
1172           }
1173         else
1174 #endif /* SUPPORT_UTF */
1175           SET_BIT(CHAR_NEL);
1176         break;
1177 
1178         case OP_NOT_DIGIT:
1179         set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1180         break;
1181 
1182         case OP_DIGIT:
1183         set_type_bits(start_bits, cbit_digit, table_limit, cd);
1184         break;
1185 
1186         /* The cbit_space table has vertical tab as whitespace; we have to
1187         ensure it gets set as not whitespace. Luckily, the code value is the
1188         same (0x0b) in ASCII and EBCDIC, so we can just adjust the appropriate
1189         bit. */
1190 
1191         case OP_NOT_WHITESPACE:
1192         set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1193         start_bits[1] |= 0x08;
1194         break;
1195 
1196         /* The cbit_space table has vertical tab as whitespace; we have to
1197         avoid setting it. Luckily, the code value is the same (0x0b) in ASCII
1198         and EBCDIC, so we can just adjust the appropriate bit. */
1199 
1200         case OP_WHITESPACE:
1201         c = start_bits[1];    /* Save in case it was already set */
1202         set_type_bits(start_bits, cbit_space, table_limit, cd);
1203         start_bits[1] = (start_bits[1] & ~0x08) | c;
1204         break;
1205 
1206         case OP_NOT_WORDCHAR:
1207         set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1208         break;
1209 
1210         case OP_WORDCHAR:
1211         set_type_bits(start_bits, cbit_word, table_limit, cd);
1212         break;
1213         }
1214 
1215       tcode += 2;
1216       break;
1217 
1218       /* Character class where all the information is in a bit map: set the
1219       bits and either carry on or not, according to the repeat count. If it was
1220       a negative class, and we are operating with UTF-8 characters, any byte
1221       with a value >= 0xc4 is a potentially valid starter because it starts a
1222       character with a value > 255. */
1223 
1224       case OP_NCLASS:
1225 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1226       if (utf)
1227         {
1228         start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
1229         memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
1230         }
1231 #endif
1232 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1233       SET_BIT(0xFF);                         /* For characters > 255 */
1234 #endif
1235       /* Fall through */
1236 
1237       case OP_CLASS:
1238         {
1239         pcre_uint8 *map;
1240         tcode++;
1241         map = (pcre_uint8 *)tcode;
1242 
1243         /* In UTF-8 mode, the bits in a bit map correspond to character
1244         values, not to byte values. However, the bit map we are constructing is
1245         for byte values. So we have to do a conversion for characters whose
1246         value is > 127. In fact, there are only two possible starting bytes for
1247         characters in the range 128 - 255. */
1248 
1249 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1250         if (utf)
1251           {
1252           for (c = 0; c < 16; c++) start_bits[c] |= map[c];
1253           for (c = 128; c < 256; c++)
1254             {
1255             if ((map[c/8] && (1 << (c&7))) != 0)
1256               {
1257               int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
1258               start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
1259               c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
1260               }
1261             }
1262           }
1263         else
1264 #endif
1265           {
1266           /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1267           for (c = 0; c < 32; c++) start_bits[c] |= map[c];
1268           }
1269 
1270         /* Advance past the bit map, and act on what follows. For a zero
1271         minimum repeat, continue; otherwise stop processing. */
1272 
1273         tcode += 32 / sizeof(pcre_uchar);
1274         switch (*tcode)
1275           {
1276           case OP_CRSTAR:
1277           case OP_CRMINSTAR:
1278           case OP_CRQUERY:
1279           case OP_CRMINQUERY:
1280           tcode++;
1281           break;
1282 
1283           case OP_CRRANGE:
1284           case OP_CRMINRANGE:
1285           if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1286             else try_next = FALSE;
1287           break;
1288 
1289           default:
1290           try_next = FALSE;
1291           break;
1292           }
1293         }
1294       break; /* End of bitmap class handling */
1295 
1296       }      /* End of switch */
1297     }        /* End of try_next loop */
1298 
1299   code += GET(code, 1);   /* Advance to next branch */
1300   }
1301 while (*code == OP_ALT);
1302 return yield;
1303 }
1304 
1305 
1306 
1307 
1308 
1309 /*************************************************
1310 *          Study a compiled expression           *
1311 *************************************************/
1312 
1313 /* This function is handed a compiled expression that it must study to produce
1314 information that will speed up the matching. It returns a pcre[16]_extra block
1315 which then gets handed back to pcre_exec().
1316 
1317 Arguments:
1318   re        points to the compiled expression
1319   options   contains option bits
1320   errorptr  points to where to place error messages;
1321             set NULL unless error
1322 
1323 Returns:    pointer to a pcre[16]_extra block, with study_data filled in and
1324               the appropriate flags set;
1325             NULL on error or if no optimization possible
1326 */
1327 
1328 #if defined COMPILE_PCRE8
1329 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
pcre_study(const pcre * external_re,int options,const char ** errorptr)1330 pcre_study(const pcre *external_re, int options, const char **errorptr)
1331 #elif defined COMPILE_PCRE16
1332 PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
1333 pcre16_study(const pcre16 *external_re, int options, const char **errorptr)
1334 #elif defined COMPILE_PCRE32
1335 PCRE_EXP_DEFN pcre32_extra * PCRE_CALL_CONVENTION
1336 pcre32_study(const pcre32 *external_re, int options, const char **errorptr)
1337 #endif
1338 {
1339 int min;
1340 BOOL bits_set = FALSE;
1341 pcre_uint8 start_bits[32];
1342 PUBL(extra) *extra = NULL;
1343 pcre_study_data *study;
1344 const pcre_uint8 *tables;
1345 pcre_uchar *code;
1346 compile_data compile_block;
1347 const REAL_PCRE *re = (const REAL_PCRE *)external_re;
1348 
1349 *errorptr = NULL;
1350 
1351 if (re == NULL || re->magic_number != MAGIC_NUMBER)
1352   {
1353   *errorptr = "argument is not a compiled regular expression";
1354   return NULL;
1355   }
1356 
1357 if ((re->flags & PCRE_MODE) == 0)
1358   {
1359 #if defined COMPILE_PCRE8
1360   *errorptr = "argument not compiled in 8 bit mode";
1361 #elif defined COMPILE_PCRE16
1362   *errorptr = "argument not compiled in 16 bit mode";
1363 #elif defined COMPILE_PCRE32
1364   *errorptr = "argument not compiled in 32 bit mode";
1365 #endif
1366   return NULL;
1367   }
1368 
1369 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1370   {
1371   *errorptr = "unknown or incorrect option bit(s) set";
1372   return NULL;
1373   }
1374 
1375 code = (pcre_uchar *)re + re->name_table_offset +
1376   (re->name_count * re->name_entry_size);
1377 
1378 /* For an anchored pattern, or an unanchored pattern that has a first char, or
1379 a multiline pattern that matches only at "line starts", there is no point in
1380 seeking a list of starting bytes. */
1381 
1382 if ((re->options & PCRE_ANCHORED) == 0 &&
1383     (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1384   {
1385   int rc;
1386 
1387   /* Set the character tables in the block that is passed around */
1388 
1389   tables = re->tables;
1390 
1391 #if defined COMPILE_PCRE8
1392   if (tables == NULL)
1393     (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1394     (void *)(&tables));
1395 #elif defined COMPILE_PCRE16
1396   if (tables == NULL)
1397     (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1398     (void *)(&tables));
1399 #elif defined COMPILE_PCRE32
1400   if (tables == NULL)
1401     (void)pcre32_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1402     (void *)(&tables));
1403 #endif
1404 
1405   compile_block.lcc = tables + lcc_offset;
1406   compile_block.fcc = tables + fcc_offset;
1407   compile_block.cbits = tables + cbits_offset;
1408   compile_block.ctypes = tables + ctypes_offset;
1409 
1410   /* See if we can find a fixed set of initial characters for the pattern. */
1411 
1412   memset(start_bits, 0, 32 * sizeof(pcre_uint8));
1413   rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1414     &compile_block);
1415   bits_set = rc == SSB_DONE;
1416   if (rc == SSB_UNKNOWN)
1417     {
1418     *errorptr = "internal error: opcode not recognized";
1419     return NULL;
1420     }
1421   }
1422 
1423 /* Find the minimum length of subject string. */
1424 
1425 switch(min = find_minlength(code, code, re->options, 0))
1426   {
1427   case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
1428   case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
1429   default: break;
1430   }
1431 
1432 /* If a set of starting bytes has been identified, or if the minimum length is
1433 greater than zero, or if JIT optimization has been requested, or if
1434 PCRE_STUDY_EXTRA_NEEDED is set, get a pcre[16]_extra block and a
1435 pcre_study_data block. The study data is put in the latter, which is pointed to
1436 by the former, which may also get additional data set later by the calling
1437 program. At the moment, the size of pcre_study_data is fixed. We nevertheless
1438 save it in a field for returning via the pcre_fullinfo() function so that if it
1439 becomes variable in the future, we don't have to change that code. */
1440 
1441 if (bits_set || min > 0 || (options & (
1442 #ifdef SUPPORT_JIT
1443     PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE |
1444     PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE |
1445 #endif
1446     PCRE_STUDY_EXTRA_NEEDED)) != 0)
1447   {
1448   extra = (PUBL(extra) *)(PUBL(malloc))
1449     (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
1450   if (extra == NULL)
1451     {
1452     *errorptr = "failed to get memory";
1453     return NULL;
1454     }
1455 
1456   study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
1457   extra->flags = PCRE_EXTRA_STUDY_DATA;
1458   extra->study_data = study;
1459 
1460   study->size = sizeof(pcre_study_data);
1461   study->flags = 0;
1462 
1463   /* Set the start bits always, to avoid unset memory errors if the
1464   study data is written to a file, but set the flag only if any of the bits
1465   are set, to save time looking when none are. */
1466 
1467   if (bits_set)
1468     {
1469     study->flags |= PCRE_STUDY_MAPPED;
1470     memcpy(study->start_bits, start_bits, sizeof(start_bits));
1471     }
1472   else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
1473 
1474 #ifdef PCRE_DEBUG
1475   if (bits_set)
1476     {
1477     pcre_uint8 *ptr = start_bits;
1478     int i;
1479 
1480     printf("Start bits:\n");
1481     for (i = 0; i < 32; i++)
1482       printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
1483     }
1484 #endif
1485 
1486   /* Always set the minlength value in the block, because the JIT compiler
1487   makes use of it. However, don't set the bit unless the length is greater than
1488   zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
1489   checking the zero case. */
1490 
1491   if (min > 0)
1492     {
1493     study->flags |= PCRE_STUDY_MINLEN;
1494     study->minlength = min;
1495     }
1496   else study->minlength = 0;
1497 
1498   /* If JIT support was compiled and requested, attempt the JIT compilation.
1499   If no starting bytes were found, and the minimum length is zero, and JIT
1500   compilation fails, abandon the extra block and return NULL, unless
1501   PCRE_STUDY_EXTRA_NEEDED is set. */
1502 
1503 #ifdef SUPPORT_JIT
1504   extra->executable_jit = NULL;
1505   if ((options & PCRE_STUDY_JIT_COMPILE) != 0)
1506     PRIV(jit_compile)(re, extra, JIT_COMPILE);
1507   if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0)
1508     PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE);
1509   if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0)
1510     PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE);
1511 
1512   if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0 &&
1513       (options & PCRE_STUDY_EXTRA_NEEDED) == 0)
1514     {
1515 #if defined COMPILE_PCRE8
1516     pcre_free_study(extra);
1517 #elif defined COMPILE_PCRE16
1518     pcre16_free_study(extra);
1519 #elif defined COMPILE_PCRE32
1520     pcre32_free_study(extra);
1521 #endif
1522     extra = NULL;
1523     }
1524 #endif
1525   }
1526 
1527 return extra;
1528 }
1529 
1530 
1531 /*************************************************
1532 *          Free the study data                   *
1533 *************************************************/
1534 
1535 /* This function frees the memory that was obtained by pcre_study().
1536 
1537 Argument:   a pointer to the pcre[16]_extra block
1538 Returns:    nothing
1539 */
1540 
1541 #if defined COMPILE_PCRE8
1542 PCRE_EXP_DEFN void
pcre_free_study(pcre_extra * extra)1543 pcre_free_study(pcre_extra *extra)
1544 #elif defined COMPILE_PCRE16
1545 PCRE_EXP_DEFN void
1546 pcre16_free_study(pcre16_extra *extra)
1547 #elif defined COMPILE_PCRE32
1548 PCRE_EXP_DEFN void
1549 pcre32_free_study(pcre32_extra *extra)
1550 #endif
1551 {
1552 if (extra == NULL)
1553   return;
1554 #ifdef SUPPORT_JIT
1555 if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
1556      extra->executable_jit != NULL)
1557   PRIV(jit_free)(extra->executable_jit);
1558 #endif
1559 PUBL(free)(extra);
1560 }
1561 
1562 /* End of pcre_study.c */
1563