1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2012 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_study(), along with local
42 supporting functions. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #include "pcre_internal.h"
50
51 #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
52
53 /* Returns from set_start_bits() */
54
55 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
56
57
58
59 /*************************************************
60 * Find the minimum subject length for a group *
61 *************************************************/
62
63 /* Scan a parenthesized group and compute the minimum length of subject that
64 is needed to match it. This is a lower bound; it does not mean there is a
65 string of that length that matches. In UTF8 mode, the result is in characters
66 rather than bytes.
67
68 Arguments:
69 re compiled pattern block
70 code pointer to start of group (the bracket)
71 startcode pointer to start of the whole pattern's code
72 options the compiling options
73 recurses chain of recurse_check to catch mutual recursion
74
75 Returns: the minimum length
76 -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
77 -2 internal error (missing capturing bracket)
78 -3 internal error (opcode not listed)
79 */
80
81 static int
find_minlength(const REAL_PCRE * re,const pcre_uchar * code,const pcre_uchar * startcode,int options,recurse_check * recurses)82 find_minlength(const REAL_PCRE *re, const pcre_uchar *code,
83 const pcre_uchar *startcode, int options, recurse_check *recurses)
84 {
85 int length = -1;
86 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
87 BOOL utf = (options & PCRE_UTF8) != 0;
88 BOOL had_recurse = FALSE;
89 recurse_check this_recurse;
90 register int branchlength = 0;
91 register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
92
93 if (*code == OP_CBRA || *code == OP_SCBRA ||
94 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
95
96 /* Scan along the opcodes for this branch. If we get to the end of the
97 branch, check the length against that of the other branches. */
98
99 for (;;)
100 {
101 int d, min;
102 pcre_uchar *cs, *ce;
103 register pcre_uchar op = *cc;
104
105 switch (op)
106 {
107 case OP_COND:
108 case OP_SCOND:
109
110 /* If there is only one branch in a condition, the implied branch has zero
111 length, so we don't add anything. This covers the DEFINE "condition"
112 automatically. */
113
114 cs = cc + GET(cc, 1);
115 if (*cs != OP_ALT)
116 {
117 cc = cs + 1 + LINK_SIZE;
118 break;
119 }
120
121 /* Otherwise we can fall through and treat it the same as any other
122 subpattern. */
123
124 case OP_CBRA:
125 case OP_SCBRA:
126 case OP_BRA:
127 case OP_SBRA:
128 case OP_CBRAPOS:
129 case OP_SCBRAPOS:
130 case OP_BRAPOS:
131 case OP_SBRAPOS:
132 case OP_ONCE:
133 case OP_ONCE_NC:
134 d = find_minlength(re, cc, startcode, options, recurses);
135 if (d < 0) return d;
136 branchlength += d;
137 do cc += GET(cc, 1); while (*cc == OP_ALT);
138 cc += 1 + LINK_SIZE;
139 break;
140
141 /* ACCEPT makes things far too complicated; we have to give up. */
142
143 case OP_ACCEPT:
144 case OP_ASSERT_ACCEPT:
145 return -1;
146
147 /* Reached end of a branch; if it's a ket it is the end of a nested
148 call. If it's ALT it is an alternation in a nested call. If it is END it's
149 the end of the outer call. All can be handled by the same code. If an
150 ACCEPT was previously encountered, use the length that was in force at that
151 time, and pass back the shortest ACCEPT length. */
152
153 case OP_ALT:
154 case OP_KET:
155 case OP_KETRMAX:
156 case OP_KETRMIN:
157 case OP_KETRPOS:
158 case OP_END:
159 if (length < 0 || (!had_recurse && branchlength < length))
160 length = branchlength;
161 if (op != OP_ALT) return length;
162 cc += 1 + LINK_SIZE;
163 branchlength = 0;
164 had_recurse = FALSE;
165 break;
166
167 /* Skip over assertive subpatterns */
168
169 case OP_ASSERT:
170 case OP_ASSERT_NOT:
171 case OP_ASSERTBACK:
172 case OP_ASSERTBACK_NOT:
173 do cc += GET(cc, 1); while (*cc == OP_ALT);
174 /* Fall through */
175
176 /* Skip over things that don't match chars */
177
178 case OP_REVERSE:
179 case OP_CREF:
180 case OP_DNCREF:
181 case OP_RREF:
182 case OP_DNRREF:
183 case OP_DEF:
184 case OP_CALLOUT:
185 case OP_SOD:
186 case OP_SOM:
187 case OP_EOD:
188 case OP_EODN:
189 case OP_CIRC:
190 case OP_CIRCM:
191 case OP_DOLL:
192 case OP_DOLLM:
193 case OP_NOT_WORD_BOUNDARY:
194 case OP_WORD_BOUNDARY:
195 cc += PRIV(OP_lengths)[*cc];
196 break;
197
198 /* Skip over a subpattern that has a {0} or {0,x} quantifier */
199
200 case OP_BRAZERO:
201 case OP_BRAMINZERO:
202 case OP_BRAPOSZERO:
203 case OP_SKIPZERO:
204 cc += PRIV(OP_lengths)[*cc];
205 do cc += GET(cc, 1); while (*cc == OP_ALT);
206 cc += 1 + LINK_SIZE;
207 break;
208
209 /* Handle literal characters and + repetitions */
210
211 case OP_CHAR:
212 case OP_CHARI:
213 case OP_NOT:
214 case OP_NOTI:
215 case OP_PLUS:
216 case OP_PLUSI:
217 case OP_MINPLUS:
218 case OP_MINPLUSI:
219 case OP_POSPLUS:
220 case OP_POSPLUSI:
221 case OP_NOTPLUS:
222 case OP_NOTPLUSI:
223 case OP_NOTMINPLUS:
224 case OP_NOTMINPLUSI:
225 case OP_NOTPOSPLUS:
226 case OP_NOTPOSPLUSI:
227 branchlength++;
228 cc += 2;
229 #ifdef SUPPORT_UTF
230 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
231 #endif
232 break;
233
234 case OP_TYPEPLUS:
235 case OP_TYPEMINPLUS:
236 case OP_TYPEPOSPLUS:
237 branchlength++;
238 cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
239 break;
240
241 /* Handle exact repetitions. The count is already in characters, but we
242 need to skip over a multibyte character in UTF8 mode. */
243
244 case OP_EXACT:
245 case OP_EXACTI:
246 case OP_NOTEXACT:
247 case OP_NOTEXACTI:
248 branchlength += GET2(cc,1);
249 cc += 2 + IMM2_SIZE;
250 #ifdef SUPPORT_UTF
251 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
252 #endif
253 break;
254
255 case OP_TYPEEXACT:
256 branchlength += GET2(cc,1);
257 cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
258 || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
259 break;
260
261 /* Handle single-char non-literal matchers */
262
263 case OP_PROP:
264 case OP_NOTPROP:
265 cc += 2;
266 /* Fall through */
267
268 case OP_NOT_DIGIT:
269 case OP_DIGIT:
270 case OP_NOT_WHITESPACE:
271 case OP_WHITESPACE:
272 case OP_NOT_WORDCHAR:
273 case OP_WORDCHAR:
274 case OP_ANY:
275 case OP_ALLANY:
276 case OP_EXTUNI:
277 case OP_HSPACE:
278 case OP_NOT_HSPACE:
279 case OP_VSPACE:
280 case OP_NOT_VSPACE:
281 branchlength++;
282 cc++;
283 break;
284
285 /* "Any newline" might match two characters, but it also might match just
286 one. */
287
288 case OP_ANYNL:
289 branchlength += 1;
290 cc++;
291 break;
292
293 /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
294 non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
295 appear, but leave the code, just in case.) */
296
297 case OP_ANYBYTE:
298 #ifdef SUPPORT_UTF
299 if (utf) return -1;
300 #endif
301 branchlength++;
302 cc++;
303 break;
304
305 /* For repeated character types, we have to test for \p and \P, which have
306 an extra two bytes of parameters. */
307
308 case OP_TYPESTAR:
309 case OP_TYPEMINSTAR:
310 case OP_TYPEQUERY:
311 case OP_TYPEMINQUERY:
312 case OP_TYPEPOSSTAR:
313 case OP_TYPEPOSQUERY:
314 if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
315 cc += PRIV(OP_lengths)[op];
316 break;
317
318 case OP_TYPEUPTO:
319 case OP_TYPEMINUPTO:
320 case OP_TYPEPOSUPTO:
321 if (cc[1 + IMM2_SIZE] == OP_PROP
322 || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
323 cc += PRIV(OP_lengths)[op];
324 break;
325
326 /* Check a class for variable quantification */
327
328 case OP_CLASS:
329 case OP_NCLASS:
330 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
331 case OP_XCLASS:
332 /* The original code caused an unsigned overflow in 64 bit systems,
333 so now we use a conditional statement. */
334 if (op == OP_XCLASS)
335 cc += GET(cc, 1);
336 else
337 cc += PRIV(OP_lengths)[OP_CLASS];
338 #else
339 cc += PRIV(OP_lengths)[OP_CLASS];
340 #endif
341
342 switch (*cc)
343 {
344 case OP_CRPLUS:
345 case OP_CRMINPLUS:
346 case OP_CRPOSPLUS:
347 branchlength++;
348 /* Fall through */
349
350 case OP_CRSTAR:
351 case OP_CRMINSTAR:
352 case OP_CRQUERY:
353 case OP_CRMINQUERY:
354 case OP_CRPOSSTAR:
355 case OP_CRPOSQUERY:
356 cc++;
357 break;
358
359 case OP_CRRANGE:
360 case OP_CRMINRANGE:
361 case OP_CRPOSRANGE:
362 branchlength += GET2(cc,1);
363 cc += 1 + 2 * IMM2_SIZE;
364 break;
365
366 default:
367 branchlength++;
368 break;
369 }
370 break;
371
372 /* Backreferences and subroutine calls are treated in the same way: we find
373 the minimum length for the subpattern. A recursion, however, causes an
374 a flag to be set that causes the length of this branch to be ignored. The
375 logic is that a recursion can only make sense if there is another
376 alternation that stops the recursing. That will provide the minimum length
377 (when no recursion happens). A backreference within the group that it is
378 referencing behaves in the same way.
379
380 If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
381 matches an empty string (by default it causes a matching failure), so in
382 that case we must set the minimum length to zero. */
383
384 case OP_DNREF: /* Duplicate named pattern back reference */
385 case OP_DNREFI:
386 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
387 {
388 int count = GET2(cc, 1+IMM2_SIZE);
389 pcre_uchar *slot = (pcre_uchar *)re +
390 re->name_table_offset + GET2(cc, 1) * re->name_entry_size;
391 d = INT_MAX;
392 while (count-- > 0)
393 {
394 ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
395 if (cs == NULL) return -2;
396 do ce += GET(ce, 1); while (*ce == OP_ALT);
397 if (cc > cs && cc < ce) /* Simple recursion */
398 {
399 d = 0;
400 had_recurse = TRUE;
401 break;
402 }
403 else
404 {
405 recurse_check *r = recurses;
406 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
407 if (r != NULL) /* Mutual recursion */
408 {
409 d = 0;
410 had_recurse = TRUE;
411 break;
412 }
413 else
414 {
415 int dd;
416 this_recurse.prev = recurses;
417 this_recurse.group = cs;
418 dd = find_minlength(re, cs, startcode, options, &this_recurse);
419 if (dd < d) d = dd;
420 }
421 }
422 slot += re->name_entry_size;
423 }
424 }
425 else d = 0;
426 cc += 1 + 2*IMM2_SIZE;
427 goto REPEAT_BACK_REFERENCE;
428
429 case OP_REF: /* Single back reference */
430 case OP_REFI:
431 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
432 {
433 ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
434 if (cs == NULL) return -2;
435 do ce += GET(ce, 1); while (*ce == OP_ALT);
436 if (cc > cs && cc < ce) /* Simple recursion */
437 {
438 d = 0;
439 had_recurse = TRUE;
440 }
441 else
442 {
443 recurse_check *r = recurses;
444 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
445 if (r != NULL) /* Mutual recursion */
446 {
447 d = 0;
448 had_recurse = TRUE;
449 }
450 else
451 {
452 this_recurse.prev = recurses;
453 this_recurse.group = cs;
454 d = find_minlength(re, cs, startcode, options, &this_recurse);
455 }
456 }
457 }
458 else d = 0;
459 cc += 1 + IMM2_SIZE;
460
461 /* Handle repeated back references */
462
463 REPEAT_BACK_REFERENCE:
464 switch (*cc)
465 {
466 case OP_CRSTAR:
467 case OP_CRMINSTAR:
468 case OP_CRQUERY:
469 case OP_CRMINQUERY:
470 case OP_CRPOSSTAR:
471 case OP_CRPOSQUERY:
472 min = 0;
473 cc++;
474 break;
475
476 case OP_CRPLUS:
477 case OP_CRMINPLUS:
478 case OP_CRPOSPLUS:
479 min = 1;
480 cc++;
481 break;
482
483 case OP_CRRANGE:
484 case OP_CRMINRANGE:
485 case OP_CRPOSRANGE:
486 min = GET2(cc, 1);
487 cc += 1 + 2 * IMM2_SIZE;
488 break;
489
490 default:
491 min = 1;
492 break;
493 }
494
495 branchlength += min * d;
496 break;
497
498 /* We can easily detect direct recursion, but not mutual recursion. This is
499 caught by a recursion depth count. */
500
501 case OP_RECURSE:
502 cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
503 do ce += GET(ce, 1); while (*ce == OP_ALT);
504 if (cc > cs && cc < ce) /* Simple recursion */
505 had_recurse = TRUE;
506 else
507 {
508 recurse_check *r = recurses;
509 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
510 if (r != NULL) /* Mutual recursion */
511 had_recurse = TRUE;
512 else
513 {
514 this_recurse.prev = recurses;
515 this_recurse.group = cs;
516 branchlength += find_minlength(re, cs, startcode, options,
517 &this_recurse);
518 }
519 }
520 cc += 1 + LINK_SIZE;
521 break;
522
523 /* Anything else does not or need not match a character. We can get the
524 item's length from the table, but for those that can match zero occurrences
525 of a character, we must take special action for UTF-8 characters. As it
526 happens, the "NOT" versions of these opcodes are used at present only for
527 ASCII characters, so they could be omitted from this list. However, in
528 future that may change, so we include them here so as not to leave a
529 gotcha for a future maintainer. */
530
531 case OP_UPTO:
532 case OP_UPTOI:
533 case OP_NOTUPTO:
534 case OP_NOTUPTOI:
535 case OP_MINUPTO:
536 case OP_MINUPTOI:
537 case OP_NOTMINUPTO:
538 case OP_NOTMINUPTOI:
539 case OP_POSUPTO:
540 case OP_POSUPTOI:
541 case OP_NOTPOSUPTO:
542 case OP_NOTPOSUPTOI:
543
544 case OP_STAR:
545 case OP_STARI:
546 case OP_NOTSTAR:
547 case OP_NOTSTARI:
548 case OP_MINSTAR:
549 case OP_MINSTARI:
550 case OP_NOTMINSTAR:
551 case OP_NOTMINSTARI:
552 case OP_POSSTAR:
553 case OP_POSSTARI:
554 case OP_NOTPOSSTAR:
555 case OP_NOTPOSSTARI:
556
557 case OP_QUERY:
558 case OP_QUERYI:
559 case OP_NOTQUERY:
560 case OP_NOTQUERYI:
561 case OP_MINQUERY:
562 case OP_MINQUERYI:
563 case OP_NOTMINQUERY:
564 case OP_NOTMINQUERYI:
565 case OP_POSQUERY:
566 case OP_POSQUERYI:
567 case OP_NOTPOSQUERY:
568 case OP_NOTPOSQUERYI:
569
570 cc += PRIV(OP_lengths)[op];
571 #ifdef SUPPORT_UTF
572 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
573 #endif
574 break;
575
576 /* Skip these, but we need to add in the name length. */
577
578 case OP_MARK:
579 case OP_PRUNE_ARG:
580 case OP_SKIP_ARG:
581 case OP_THEN_ARG:
582 cc += PRIV(OP_lengths)[op] + cc[1];
583 break;
584
585 /* The remaining opcodes are just skipped over. */
586
587 case OP_CLOSE:
588 case OP_COMMIT:
589 case OP_FAIL:
590 case OP_PRUNE:
591 case OP_SET_SOM:
592 case OP_SKIP:
593 case OP_THEN:
594 cc += PRIV(OP_lengths)[op];
595 break;
596
597 /* This should not occur: we list all opcodes explicitly so that when
598 new ones get added they are properly considered. */
599
600 default:
601 return -3;
602 }
603 }
604 /* Control never gets here */
605 }
606
607
608
609 /*************************************************
610 * Set a bit and maybe its alternate case *
611 *************************************************/
612
613 /* Given a character, set its first byte's bit in the table, and also the
614 corresponding bit for the other version of a letter if we are caseless. In
615 UTF-8 mode, for characters greater than 127, we can only do the caseless thing
616 when Unicode property support is available.
617
618 Arguments:
619 start_bits points to the bit map
620 p points to the character
621 caseless the caseless flag
622 cd the block with char table pointers
623 utf TRUE for UTF-8 / UTF-16 / UTF-32 mode
624
625 Returns: pointer after the character
626 */
627
628 static const pcre_uchar *
set_table_bit(pcre_uint8 * start_bits,const pcre_uchar * p,BOOL caseless,compile_data * cd,BOOL utf)629 set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
630 compile_data *cd, BOOL utf)
631 {
632 pcre_uint32 c = *p;
633
634 #ifdef COMPILE_PCRE8
635 SET_BIT(c);
636
637 #ifdef SUPPORT_UTF
638 if (utf && c > 127)
639 {
640 GETCHARINC(c, p);
641 #ifdef SUPPORT_UCP
642 if (caseless)
643 {
644 pcre_uchar buff[6];
645 c = UCD_OTHERCASE(c);
646 (void)PRIV(ord2utf)(c, buff);
647 SET_BIT(buff[0]);
648 }
649 #endif /* Not SUPPORT_UCP */
650 return p;
651 }
652 #else /* Not SUPPORT_UTF */
653 (void)(utf); /* Stops warning for unused parameter */
654 #endif /* SUPPORT_UTF */
655
656 /* Not UTF-8 mode, or character is less than 127. */
657
658 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
659 return p + 1;
660 #endif /* COMPILE_PCRE8 */
661
662 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
663 if (c > 0xff)
664 {
665 c = 0xff;
666 caseless = FALSE;
667 }
668 SET_BIT(c);
669
670 #ifdef SUPPORT_UTF
671 if (utf && c > 127)
672 {
673 GETCHARINC(c, p);
674 #ifdef SUPPORT_UCP
675 if (caseless)
676 {
677 c = UCD_OTHERCASE(c);
678 if (c > 0xff)
679 c = 0xff;
680 SET_BIT(c);
681 }
682 #endif /* SUPPORT_UCP */
683 return p;
684 }
685 #else /* Not SUPPORT_UTF */
686 (void)(utf); /* Stops warning for unused parameter */
687 #endif /* SUPPORT_UTF */
688
689 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
690 return p + 1;
691 #endif
692 }
693
694
695
696 /*************************************************
697 * Set bits for a positive character type *
698 *************************************************/
699
700 /* This function sets starting bits for a character type. In UTF-8 mode, we can
701 only do a direct setting for bytes less than 128, as otherwise there can be
702 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
703 environment, the tables will only recognize ASCII characters anyway, but in at
704 least one Windows environment, some higher bytes bits were set in the tables.
705 So we deal with that case by considering the UTF-8 encoding.
706
707 Arguments:
708 start_bits the starting bitmap
709 cbit type the type of character wanted
710 table_limit 32 for non-UTF-8; 16 for UTF-8
711 cd the block with char table pointers
712
713 Returns: nothing
714 */
715
716 static void
set_type_bits(pcre_uint8 * start_bits,int cbit_type,unsigned int table_limit,compile_data * cd)717 set_type_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
718 compile_data *cd)
719 {
720 register pcre_uint32 c;
721 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
722 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
723 if (table_limit == 32) return;
724 for (c = 128; c < 256; c++)
725 {
726 if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
727 {
728 pcre_uchar buff[6];
729 (void)PRIV(ord2utf)(c, buff);
730 SET_BIT(buff[0]);
731 }
732 }
733 #endif
734 }
735
736
737 /*************************************************
738 * Set bits for a negative character type *
739 *************************************************/
740
741 /* This function sets starting bits for a negative character type such as \D.
742 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
743 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
744 Unlike in the positive case, where we can set appropriate starting bits for
745 specific high-valued UTF-8 characters, in this case we have to set the bits for
746 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
747 0xc0 (192) for simplicity.
748
749 Arguments:
750 start_bits the starting bitmap
751 cbit type the type of character wanted
752 table_limit 32 for non-UTF-8; 16 for UTF-8
753 cd the block with char table pointers
754
755 Returns: nothing
756 */
757
758 static void
set_nottype_bits(pcre_uint8 * start_bits,int cbit_type,unsigned int table_limit,compile_data * cd)759 set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
760 compile_data *cd)
761 {
762 register pcre_uint32 c;
763 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
764 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
765 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
766 #endif
767 }
768
769
770
771 /*************************************************
772 * Create bitmap of starting bytes *
773 *************************************************/
774
775 /* This function scans a compiled unanchored expression recursively and
776 attempts to build a bitmap of the set of possible starting bytes. As time goes
777 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
778 useful for parenthesized groups in patterns such as (a*)b where the group
779 provides some optional starting bytes but scanning must continue at the outer
780 level to find at least one mandatory byte. At the outermost level, this
781 function fails unless the result is SSB_DONE.
782
783 Arguments:
784 code points to an expression
785 start_bits points to a 32-byte table, initialized to 0
786 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
787 cd the block with char table pointers
788
789 Returns: SSB_FAIL => Failed to find any starting bytes
790 SSB_DONE => Found mandatory starting bytes
791 SSB_CONTINUE => Found optional starting bytes
792 SSB_UNKNOWN => Hit an unrecognized opcode
793 */
794
795 static int
set_start_bits(const pcre_uchar * code,pcre_uint8 * start_bits,BOOL utf,compile_data * cd)796 set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
797 compile_data *cd)
798 {
799 register pcre_uint32 c;
800 int yield = SSB_DONE;
801 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
802 int table_limit = utf? 16:32;
803 #else
804 int table_limit = 32;
805 #endif
806
807 #if 0
808 /* ========================================================================= */
809 /* The following comment and code was inserted in January 1999. In May 2006,
810 when it was observed to cause compiler warnings about unused values, I took it
811 out again. If anybody is still using OS/2, they will have to put it back
812 manually. */
813
814 /* This next statement and the later reference to dummy are here in order to
815 trick the optimizer of the IBM C compiler for OS/2 into generating correct
816 code. Apparently IBM isn't going to fix the problem, and we would rather not
817 disable optimization (in this module it actually makes a big difference, and
818 the pcre module can use all the optimization it can get). */
819
820 volatile int dummy;
821 /* ========================================================================= */
822 #endif
823
824 do
825 {
826 BOOL try_next = TRUE;
827 const pcre_uchar *tcode = code + 1 + LINK_SIZE;
828
829 if (*code == OP_CBRA || *code == OP_SCBRA ||
830 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
831
832 while (try_next) /* Loop for items in this branch */
833 {
834 int rc;
835
836 switch(*tcode)
837 {
838 /* If we reach something we don't understand, it means a new opcode has
839 been created that hasn't been added to this code. Hopefully this problem
840 will be discovered during testing. */
841
842 default:
843 return SSB_UNKNOWN;
844
845 /* Fail for a valid opcode that implies no starting bits. */
846
847 case OP_ACCEPT:
848 case OP_ASSERT_ACCEPT:
849 case OP_ALLANY:
850 case OP_ANY:
851 case OP_ANYBYTE:
852 case OP_CIRC:
853 case OP_CIRCM:
854 case OP_CLOSE:
855 case OP_COMMIT:
856 case OP_COND:
857 case OP_CREF:
858 case OP_DEF:
859 case OP_DNCREF:
860 case OP_DNREF:
861 case OP_DNREFI:
862 case OP_DNRREF:
863 case OP_DOLL:
864 case OP_DOLLM:
865 case OP_END:
866 case OP_EOD:
867 case OP_EODN:
868 case OP_EXTUNI:
869 case OP_FAIL:
870 case OP_MARK:
871 case OP_NOT:
872 case OP_NOTEXACT:
873 case OP_NOTEXACTI:
874 case OP_NOTI:
875 case OP_NOTMINPLUS:
876 case OP_NOTMINPLUSI:
877 case OP_NOTMINQUERY:
878 case OP_NOTMINQUERYI:
879 case OP_NOTMINSTAR:
880 case OP_NOTMINSTARI:
881 case OP_NOTMINUPTO:
882 case OP_NOTMINUPTOI:
883 case OP_NOTPLUS:
884 case OP_NOTPLUSI:
885 case OP_NOTPOSPLUS:
886 case OP_NOTPOSPLUSI:
887 case OP_NOTPOSQUERY:
888 case OP_NOTPOSQUERYI:
889 case OP_NOTPOSSTAR:
890 case OP_NOTPOSSTARI:
891 case OP_NOTPOSUPTO:
892 case OP_NOTPOSUPTOI:
893 case OP_NOTPROP:
894 case OP_NOTQUERY:
895 case OP_NOTQUERYI:
896 case OP_NOTSTAR:
897 case OP_NOTSTARI:
898 case OP_NOTUPTO:
899 case OP_NOTUPTOI:
900 case OP_NOT_HSPACE:
901 case OP_NOT_VSPACE:
902 case OP_PRUNE:
903 case OP_PRUNE_ARG:
904 case OP_RECURSE:
905 case OP_REF:
906 case OP_REFI:
907 case OP_REVERSE:
908 case OP_RREF:
909 case OP_SCOND:
910 case OP_SET_SOM:
911 case OP_SKIP:
912 case OP_SKIP_ARG:
913 case OP_SOD:
914 case OP_SOM:
915 case OP_THEN:
916 case OP_THEN_ARG:
917 return SSB_FAIL;
918
919 /* A "real" property test implies no starting bits, but the fake property
920 PT_CLIST identifies a list of characters. These lists are short, as they
921 are used for characters with more than one "other case", so there is no
922 point in recognizing them for OP_NOTPROP. */
923
924 case OP_PROP:
925 if (tcode[1] != PT_CLIST) return SSB_FAIL;
926 {
927 const pcre_uint32 *p = PRIV(ucd_caseless_sets) + tcode[2];
928 while ((c = *p++) < NOTACHAR)
929 {
930 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
931 if (utf)
932 {
933 pcre_uchar buff[6];
934 (void)PRIV(ord2utf)(c, buff);
935 c = buff[0];
936 }
937 #endif
938 if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
939 }
940 }
941 try_next = FALSE;
942 break;
943
944 /* We can ignore word boundary tests. */
945
946 case OP_WORD_BOUNDARY:
947 case OP_NOT_WORD_BOUNDARY:
948 tcode++;
949 break;
950
951 /* If we hit a bracket or a positive lookahead assertion, recurse to set
952 bits from within the subpattern. If it can't find anything, we have to
953 give up. If it finds some mandatory character(s), we are done for this
954 branch. Otherwise, carry on scanning after the subpattern. */
955
956 case OP_BRA:
957 case OP_SBRA:
958 case OP_CBRA:
959 case OP_SCBRA:
960 case OP_BRAPOS:
961 case OP_SBRAPOS:
962 case OP_CBRAPOS:
963 case OP_SCBRAPOS:
964 case OP_ONCE:
965 case OP_ONCE_NC:
966 case OP_ASSERT:
967 rc = set_start_bits(tcode, start_bits, utf, cd);
968 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
969 if (rc == SSB_DONE) try_next = FALSE; else
970 {
971 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
972 tcode += 1 + LINK_SIZE;
973 }
974 break;
975
976 /* If we hit ALT or KET, it means we haven't found anything mandatory in
977 this branch, though we might have found something optional. For ALT, we
978 continue with the next alternative, but we have to arrange that the final
979 result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
980 return SSB_CONTINUE: if this is the top level, that indicates failure,
981 but after a nested subpattern, it causes scanning to continue. */
982
983 case OP_ALT:
984 yield = SSB_CONTINUE;
985 try_next = FALSE;
986 break;
987
988 case OP_KET:
989 case OP_KETRMAX:
990 case OP_KETRMIN:
991 case OP_KETRPOS:
992 return SSB_CONTINUE;
993
994 /* Skip over callout */
995
996 case OP_CALLOUT:
997 tcode += 2 + 2*LINK_SIZE;
998 break;
999
1000 /* Skip over lookbehind and negative lookahead assertions */
1001
1002 case OP_ASSERT_NOT:
1003 case OP_ASSERTBACK:
1004 case OP_ASSERTBACK_NOT:
1005 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1006 tcode += 1 + LINK_SIZE;
1007 break;
1008
1009 /* BRAZERO does the bracket, but carries on. */
1010
1011 case OP_BRAZERO:
1012 case OP_BRAMINZERO:
1013 case OP_BRAPOSZERO:
1014 rc = set_start_bits(++tcode, start_bits, utf, cd);
1015 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
1016 /* =========================================================================
1017 See the comment at the head of this function concerning the next line,
1018 which was an old fudge for the benefit of OS/2.
1019 dummy = 1;
1020 ========================================================================= */
1021 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1022 tcode += 1 + LINK_SIZE;
1023 break;
1024
1025 /* SKIPZERO skips the bracket. */
1026
1027 case OP_SKIPZERO:
1028 tcode++;
1029 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1030 tcode += 1 + LINK_SIZE;
1031 break;
1032
1033 /* Single-char * or ? sets the bit and tries the next item */
1034
1035 case OP_STAR:
1036 case OP_MINSTAR:
1037 case OP_POSSTAR:
1038 case OP_QUERY:
1039 case OP_MINQUERY:
1040 case OP_POSQUERY:
1041 tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
1042 break;
1043
1044 case OP_STARI:
1045 case OP_MINSTARI:
1046 case OP_POSSTARI:
1047 case OP_QUERYI:
1048 case OP_MINQUERYI:
1049 case OP_POSQUERYI:
1050 tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
1051 break;
1052
1053 /* Single-char upto sets the bit and tries the next */
1054
1055 case OP_UPTO:
1056 case OP_MINUPTO:
1057 case OP_POSUPTO:
1058 tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
1059 break;
1060
1061 case OP_UPTOI:
1062 case OP_MINUPTOI:
1063 case OP_POSUPTOI:
1064 tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
1065 break;
1066
1067 /* At least one single char sets the bit and stops */
1068
1069 case OP_EXACT:
1070 tcode += IMM2_SIZE;
1071 /* Fall through */
1072 case OP_CHAR:
1073 case OP_PLUS:
1074 case OP_MINPLUS:
1075 case OP_POSPLUS:
1076 (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
1077 try_next = FALSE;
1078 break;
1079
1080 case OP_EXACTI:
1081 tcode += IMM2_SIZE;
1082 /* Fall through */
1083 case OP_CHARI:
1084 case OP_PLUSI:
1085 case OP_MINPLUSI:
1086 case OP_POSPLUSI:
1087 (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
1088 try_next = FALSE;
1089 break;
1090
1091 /* Special spacing and line-terminating items. These recognize specific
1092 lists of characters. The difference between VSPACE and ANYNL is that the
1093 latter can match the two-character CRLF sequence, but that is not
1094 relevant for finding the first character, so their code here is
1095 identical. */
1096
1097 case OP_HSPACE:
1098 SET_BIT(CHAR_HT);
1099 SET_BIT(CHAR_SPACE);
1100 #ifdef SUPPORT_UTF
1101 if (utf)
1102 {
1103 #ifdef COMPILE_PCRE8
1104 SET_BIT(0xC2); /* For U+00A0 */
1105 SET_BIT(0xE1); /* For U+1680, U+180E */
1106 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1107 SET_BIT(0xE3); /* For U+3000 */
1108 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1109 SET_BIT(0xA0);
1110 SET_BIT(0xFF); /* For characters > 255 */
1111 #endif /* COMPILE_PCRE[8|16|32] */
1112 }
1113 else
1114 #endif /* SUPPORT_UTF */
1115 {
1116 #ifndef EBCDIC
1117 SET_BIT(0xA0);
1118 #endif /* Not EBCDIC */
1119 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1120 SET_BIT(0xFF); /* For characters > 255 */
1121 #endif /* COMPILE_PCRE[16|32] */
1122 }
1123 try_next = FALSE;
1124 break;
1125
1126 case OP_ANYNL:
1127 case OP_VSPACE:
1128 SET_BIT(CHAR_LF);
1129 SET_BIT(CHAR_VT);
1130 SET_BIT(CHAR_FF);
1131 SET_BIT(CHAR_CR);
1132 #ifdef SUPPORT_UTF
1133 if (utf)
1134 {
1135 #ifdef COMPILE_PCRE8
1136 SET_BIT(0xC2); /* For U+0085 */
1137 SET_BIT(0xE2); /* For U+2028, U+2029 */
1138 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1139 SET_BIT(CHAR_NEL);
1140 SET_BIT(0xFF); /* For characters > 255 */
1141 #endif /* COMPILE_PCRE[8|16|32] */
1142 }
1143 else
1144 #endif /* SUPPORT_UTF */
1145 {
1146 SET_BIT(CHAR_NEL);
1147 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1148 SET_BIT(0xFF); /* For characters > 255 */
1149 #endif
1150 }
1151 try_next = FALSE;
1152 break;
1153
1154 /* Single character types set the bits and stop. Note that if PCRE_UCP
1155 is set, we do not see these op codes because \d etc are converted to
1156 properties. Therefore, these apply in the case when only characters less
1157 than 256 are recognized to match the types. */
1158
1159 case OP_NOT_DIGIT:
1160 set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1161 try_next = FALSE;
1162 break;
1163
1164 case OP_DIGIT:
1165 set_type_bits(start_bits, cbit_digit, table_limit, cd);
1166 try_next = FALSE;
1167 break;
1168
1169 /* The cbit_space table has vertical tab as whitespace; we no longer
1170 have to play fancy tricks because Perl added VT to its whitespace at
1171 release 5.18. PCRE added it at release 8.34. */
1172
1173 case OP_NOT_WHITESPACE:
1174 set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1175 try_next = FALSE;
1176 break;
1177
1178 case OP_WHITESPACE:
1179 set_type_bits(start_bits, cbit_space, table_limit, cd);
1180 try_next = FALSE;
1181 break;
1182
1183 case OP_NOT_WORDCHAR:
1184 set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1185 try_next = FALSE;
1186 break;
1187
1188 case OP_WORDCHAR:
1189 set_type_bits(start_bits, cbit_word, table_limit, cd);
1190 try_next = FALSE;
1191 break;
1192
1193 /* One or more character type fudges the pointer and restarts, knowing
1194 it will hit a single character type and stop there. */
1195
1196 case OP_TYPEPLUS:
1197 case OP_TYPEMINPLUS:
1198 case OP_TYPEPOSPLUS:
1199 tcode++;
1200 break;
1201
1202 case OP_TYPEEXACT:
1203 tcode += 1 + IMM2_SIZE;
1204 break;
1205
1206 /* Zero or more repeats of character types set the bits and then
1207 try again. */
1208
1209 case OP_TYPEUPTO:
1210 case OP_TYPEMINUPTO:
1211 case OP_TYPEPOSUPTO:
1212 tcode += IMM2_SIZE; /* Fall through */
1213
1214 case OP_TYPESTAR:
1215 case OP_TYPEMINSTAR:
1216 case OP_TYPEPOSSTAR:
1217 case OP_TYPEQUERY:
1218 case OP_TYPEMINQUERY:
1219 case OP_TYPEPOSQUERY:
1220 switch(tcode[1])
1221 {
1222 default:
1223 case OP_ANY:
1224 case OP_ALLANY:
1225 return SSB_FAIL;
1226
1227 case OP_HSPACE:
1228 SET_BIT(CHAR_HT);
1229 SET_BIT(CHAR_SPACE);
1230 #ifdef SUPPORT_UTF
1231 if (utf)
1232 {
1233 #ifdef COMPILE_PCRE8
1234 SET_BIT(0xC2); /* For U+00A0 */
1235 SET_BIT(0xE1); /* For U+1680, U+180E */
1236 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1237 SET_BIT(0xE3); /* For U+3000 */
1238 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1239 SET_BIT(0xA0);
1240 SET_BIT(0xFF); /* For characters > 255 */
1241 #endif /* COMPILE_PCRE[8|16|32] */
1242 }
1243 else
1244 #endif /* SUPPORT_UTF */
1245 #ifndef EBCDIC
1246 SET_BIT(0xA0);
1247 #endif /* Not EBCDIC */
1248 break;
1249
1250 case OP_ANYNL:
1251 case OP_VSPACE:
1252 SET_BIT(CHAR_LF);
1253 SET_BIT(CHAR_VT);
1254 SET_BIT(CHAR_FF);
1255 SET_BIT(CHAR_CR);
1256 #ifdef SUPPORT_UTF
1257 if (utf)
1258 {
1259 #ifdef COMPILE_PCRE8
1260 SET_BIT(0xC2); /* For U+0085 */
1261 SET_BIT(0xE2); /* For U+2028, U+2029 */
1262 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1263 SET_BIT(CHAR_NEL);
1264 SET_BIT(0xFF); /* For characters > 255 */
1265 #endif /* COMPILE_PCRE16 */
1266 }
1267 else
1268 #endif /* SUPPORT_UTF */
1269 SET_BIT(CHAR_NEL);
1270 break;
1271
1272 case OP_NOT_DIGIT:
1273 set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1274 break;
1275
1276 case OP_DIGIT:
1277 set_type_bits(start_bits, cbit_digit, table_limit, cd);
1278 break;
1279
1280 /* The cbit_space table has vertical tab as whitespace; we no longer
1281 have to play fancy tricks because Perl added VT to its whitespace at
1282 release 5.18. PCRE added it at release 8.34. */
1283
1284 case OP_NOT_WHITESPACE:
1285 set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1286 break;
1287
1288 case OP_WHITESPACE:
1289 set_type_bits(start_bits, cbit_space, table_limit, cd);
1290 break;
1291
1292 case OP_NOT_WORDCHAR:
1293 set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1294 break;
1295
1296 case OP_WORDCHAR:
1297 set_type_bits(start_bits, cbit_word, table_limit, cd);
1298 break;
1299 }
1300
1301 tcode += 2;
1302 break;
1303
1304 /* Character class where all the information is in a bit map: set the
1305 bits and either carry on or not, according to the repeat count. If it was
1306 a negative class, and we are operating with UTF-8 characters, any byte
1307 with a value >= 0xc4 is a potentially valid starter because it starts a
1308 character with a value > 255. */
1309
1310 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1311 case OP_XCLASS:
1312 if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0)
1313 return SSB_FAIL;
1314 /* All bits are set. */
1315 if ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0 && (tcode[1 + LINK_SIZE] & XCL_NOT) != 0)
1316 return SSB_FAIL;
1317 #endif
1318 /* Fall through */
1319
1320 case OP_NCLASS:
1321 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1322 if (utf)
1323 {
1324 start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
1325 memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
1326 }
1327 #endif
1328 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1329 SET_BIT(0xFF); /* For characters > 255 */
1330 #endif
1331 /* Fall through */
1332
1333 case OP_CLASS:
1334 {
1335 pcre_uint8 *map;
1336 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1337 map = NULL;
1338 if (*tcode == OP_XCLASS)
1339 {
1340 if ((tcode[1 + LINK_SIZE] & XCL_MAP) != 0)
1341 map = (pcre_uint8 *)(tcode + 1 + LINK_SIZE + 1);
1342 tcode += GET(tcode, 1);
1343 }
1344 else
1345 #endif
1346 {
1347 tcode++;
1348 map = (pcre_uint8 *)tcode;
1349 tcode += 32 / sizeof(pcre_uchar);
1350 }
1351
1352 /* In UTF-8 mode, the bits in a bit map correspond to character
1353 values, not to byte values. However, the bit map we are constructing is
1354 for byte values. So we have to do a conversion for characters whose
1355 value is > 127. In fact, there are only two possible starting bytes for
1356 characters in the range 128 - 255. */
1357
1358 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1359 if (map != NULL)
1360 #endif
1361 {
1362 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1363 if (utf)
1364 {
1365 for (c = 0; c < 16; c++) start_bits[c] |= map[c];
1366 for (c = 128; c < 256; c++)
1367 {
1368 if ((map[c/8] && (1 << (c&7))) != 0)
1369 {
1370 int d = (c >> 6) | 0xc0; /* Set bit for this starter */
1371 start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
1372 c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
1373 }
1374 }
1375 }
1376 else
1377 #endif
1378 {
1379 /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1380 for (c = 0; c < 32; c++) start_bits[c] |= map[c];
1381 }
1382 }
1383
1384 /* Advance past the bit map, and act on what follows. For a zero
1385 minimum repeat, continue; otherwise stop processing. */
1386
1387 switch (*tcode)
1388 {
1389 case OP_CRSTAR:
1390 case OP_CRMINSTAR:
1391 case OP_CRQUERY:
1392 case OP_CRMINQUERY:
1393 case OP_CRPOSSTAR:
1394 case OP_CRPOSQUERY:
1395 tcode++;
1396 break;
1397
1398 case OP_CRRANGE:
1399 case OP_CRMINRANGE:
1400 case OP_CRPOSRANGE:
1401 if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1402 else try_next = FALSE;
1403 break;
1404
1405 default:
1406 try_next = FALSE;
1407 break;
1408 }
1409 }
1410 break; /* End of bitmap class handling */
1411
1412 } /* End of switch */
1413 } /* End of try_next loop */
1414
1415 code += GET(code, 1); /* Advance to next branch */
1416 }
1417 while (*code == OP_ALT);
1418 return yield;
1419 }
1420
1421
1422
1423
1424
1425 /*************************************************
1426 * Study a compiled expression *
1427 *************************************************/
1428
1429 /* This function is handed a compiled expression that it must study to produce
1430 information that will speed up the matching. It returns a pcre[16]_extra block
1431 which then gets handed back to pcre_exec().
1432
1433 Arguments:
1434 re points to the compiled expression
1435 options contains option bits
1436 errorptr points to where to place error messages;
1437 set NULL unless error
1438
1439 Returns: pointer to a pcre[16]_extra block, with study_data filled in and
1440 the appropriate flags set;
1441 NULL on error or if no optimization possible
1442 */
1443
1444 #if defined COMPILE_PCRE8
1445 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
pcre_study(const pcre * external_re,int options,const char ** errorptr)1446 pcre_study(const pcre *external_re, int options, const char **errorptr)
1447 #elif defined COMPILE_PCRE16
1448 PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
1449 pcre16_study(const pcre16 *external_re, int options, const char **errorptr)
1450 #elif defined COMPILE_PCRE32
1451 PCRE_EXP_DEFN pcre32_extra * PCRE_CALL_CONVENTION
1452 pcre32_study(const pcre32 *external_re, int options, const char **errorptr)
1453 #endif
1454 {
1455 int min;
1456 BOOL bits_set = FALSE;
1457 pcre_uint8 start_bits[32];
1458 PUBL(extra) *extra = NULL;
1459 pcre_study_data *study;
1460 const pcre_uint8 *tables;
1461 pcre_uchar *code;
1462 compile_data compile_block;
1463 const REAL_PCRE *re = (const REAL_PCRE *)external_re;
1464
1465
1466 *errorptr = NULL;
1467
1468 if (re == NULL || re->magic_number != MAGIC_NUMBER)
1469 {
1470 *errorptr = "argument is not a compiled regular expression";
1471 return NULL;
1472 }
1473
1474 if ((re->flags & PCRE_MODE) == 0)
1475 {
1476 #if defined COMPILE_PCRE8
1477 *errorptr = "argument not compiled in 8 bit mode";
1478 #elif defined COMPILE_PCRE16
1479 *errorptr = "argument not compiled in 16 bit mode";
1480 #elif defined COMPILE_PCRE32
1481 *errorptr = "argument not compiled in 32 bit mode";
1482 #endif
1483 return NULL;
1484 }
1485
1486 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1487 {
1488 *errorptr = "unknown or incorrect option bit(s) set";
1489 return NULL;
1490 }
1491
1492 code = (pcre_uchar *)re + re->name_table_offset +
1493 (re->name_count * re->name_entry_size);
1494
1495 /* For an anchored pattern, or an unanchored pattern that has a first char, or
1496 a multiline pattern that matches only at "line starts", there is no point in
1497 seeking a list of starting bytes. */
1498
1499 if ((re->options & PCRE_ANCHORED) == 0 &&
1500 (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1501 {
1502 int rc;
1503
1504 /* Set the character tables in the block that is passed around */
1505
1506 tables = re->tables;
1507
1508 #if defined COMPILE_PCRE8
1509 if (tables == NULL)
1510 (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1511 (void *)(&tables));
1512 #elif defined COMPILE_PCRE16
1513 if (tables == NULL)
1514 (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1515 (void *)(&tables));
1516 #elif defined COMPILE_PCRE32
1517 if (tables == NULL)
1518 (void)pcre32_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1519 (void *)(&tables));
1520 #endif
1521
1522 compile_block.lcc = tables + lcc_offset;
1523 compile_block.fcc = tables + fcc_offset;
1524 compile_block.cbits = tables + cbits_offset;
1525 compile_block.ctypes = tables + ctypes_offset;
1526
1527 /* See if we can find a fixed set of initial characters for the pattern. */
1528
1529 memset(start_bits, 0, 32 * sizeof(pcre_uint8));
1530 rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1531 &compile_block);
1532 bits_set = rc == SSB_DONE;
1533 if (rc == SSB_UNKNOWN)
1534 {
1535 *errorptr = "internal error: opcode not recognized";
1536 return NULL;
1537 }
1538 }
1539
1540 /* Find the minimum length of subject string. */
1541
1542 switch(min = find_minlength(re, code, code, re->options, NULL))
1543 {
1544 case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
1545 case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
1546 default: break;
1547 }
1548
1549 /* If a set of starting bytes has been identified, or if the minimum length is
1550 greater than zero, or if JIT optimization has been requested, or if
1551 PCRE_STUDY_EXTRA_NEEDED is set, get a pcre[16]_extra block and a
1552 pcre_study_data block. The study data is put in the latter, which is pointed to
1553 by the former, which may also get additional data set later by the calling
1554 program. At the moment, the size of pcre_study_data is fixed. We nevertheless
1555 save it in a field for returning via the pcre_fullinfo() function so that if it
1556 becomes variable in the future, we don't have to change that code. */
1557
1558 if (bits_set || min > 0 || (options & (
1559 #ifdef SUPPORT_JIT
1560 PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE |
1561 PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE |
1562 #endif
1563 PCRE_STUDY_EXTRA_NEEDED)) != 0)
1564 {
1565 extra = (PUBL(extra) *)(PUBL(malloc))
1566 (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
1567 if (extra == NULL)
1568 {
1569 *errorptr = "failed to get memory";
1570 return NULL;
1571 }
1572
1573 study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
1574 extra->flags = PCRE_EXTRA_STUDY_DATA;
1575 extra->study_data = study;
1576
1577 study->size = sizeof(pcre_study_data);
1578 study->flags = 0;
1579
1580 /* Set the start bits always, to avoid unset memory errors if the
1581 study data is written to a file, but set the flag only if any of the bits
1582 are set, to save time looking when none are. */
1583
1584 if (bits_set)
1585 {
1586 study->flags |= PCRE_STUDY_MAPPED;
1587 memcpy(study->start_bits, start_bits, sizeof(start_bits));
1588 }
1589 else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
1590
1591 #ifdef PCRE_DEBUG
1592 if (bits_set)
1593 {
1594 pcre_uint8 *ptr = start_bits;
1595 int i;
1596
1597 printf("Start bits:\n");
1598 for (i = 0; i < 32; i++)
1599 printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
1600 }
1601 #endif
1602
1603 /* Always set the minlength value in the block, because the JIT compiler
1604 makes use of it. However, don't set the bit unless the length is greater than
1605 zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
1606 checking the zero case. */
1607
1608 if (min > 0)
1609 {
1610 study->flags |= PCRE_STUDY_MINLEN;
1611 study->minlength = min;
1612 }
1613 else study->minlength = 0;
1614
1615 /* If JIT support was compiled and requested, attempt the JIT compilation.
1616 If no starting bytes were found, and the minimum length is zero, and JIT
1617 compilation fails, abandon the extra block and return NULL, unless
1618 PCRE_STUDY_EXTRA_NEEDED is set. */
1619
1620 #ifdef SUPPORT_JIT
1621 extra->executable_jit = NULL;
1622 if ((options & PCRE_STUDY_JIT_COMPILE) != 0)
1623 PRIV(jit_compile)(re, extra, JIT_COMPILE);
1624 if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0)
1625 PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE);
1626 if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0)
1627 PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE);
1628
1629 if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0 &&
1630 (options & PCRE_STUDY_EXTRA_NEEDED) == 0)
1631 {
1632 #if defined COMPILE_PCRE8
1633 pcre_free_study(extra);
1634 #elif defined COMPILE_PCRE16
1635 pcre16_free_study(extra);
1636 #elif defined COMPILE_PCRE32
1637 pcre32_free_study(extra);
1638 #endif
1639 extra = NULL;
1640 }
1641 #endif
1642 }
1643
1644 return extra;
1645 }
1646
1647
1648 /*************************************************
1649 * Free the study data *
1650 *************************************************/
1651
1652 /* This function frees the memory that was obtained by pcre_study().
1653
1654 Argument: a pointer to the pcre[16]_extra block
1655 Returns: nothing
1656 */
1657
1658 #if defined COMPILE_PCRE8
1659 PCRE_EXP_DEFN void
pcre_free_study(pcre_extra * extra)1660 pcre_free_study(pcre_extra *extra)
1661 #elif defined COMPILE_PCRE16
1662 PCRE_EXP_DEFN void
1663 pcre16_free_study(pcre16_extra *extra)
1664 #elif defined COMPILE_PCRE32
1665 PCRE_EXP_DEFN void
1666 pcre32_free_study(pcre32_extra *extra)
1667 #endif
1668 {
1669 if (extra == NULL)
1670 return;
1671 #ifdef SUPPORT_JIT
1672 if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
1673 extra->executable_jit != NULL)
1674 PRIV(jit_free)(extra->executable_jit);
1675 #endif
1676 PUBL(free)(extra);
1677 }
1678
1679 /* End of pcre_study.c */
1680