1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2012 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_study(), along with local
42 supporting functions. */
43
44
45 #include "config.h"
46
47 #include "pcre_internal.h"
48
49 #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
50
51 /* Returns from set_start_bits() */
52
53 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
54
55
56
57 /*************************************************
58 * Find the minimum subject length for a group *
59 *************************************************/
60
61 /* Scan a parenthesized group and compute the minimum length of subject that
62 is needed to match it. This is a lower bound; it does not mean there is a
63 string of that length that matches. In UTF8 mode, the result is in characters
64 rather than bytes.
65
66 Arguments:
67 re compiled pattern block
68 code pointer to start of group (the bracket)
69 startcode pointer to start of the whole pattern's code
70 options the compiling options
71 recurses chain of recurse_check to catch mutual recursion
72 countptr pointer to call count (to catch over complexity)
73
74 Returns: the minimum length
75 -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
76 -2 internal error (missing capturing bracket)
77 -3 internal error (opcode not listed)
78 */
79
80 static int
find_minlength(const REAL_PCRE * re,const pcre_uchar * code,const pcre_uchar * startcode,int options,recurse_check * recurses,int * countptr)81 find_minlength(const REAL_PCRE *re, const pcre_uchar *code,
82 const pcre_uchar *startcode, int options, recurse_check *recurses,
83 int *countptr)
84 {
85 int length = -1;
86 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
87 BOOL utf = (options & PCRE_UTF8) != 0;
88 BOOL had_recurse = FALSE;
89 recurse_check this_recurse;
90 register int branchlength = 0;
91 register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
92
93 if ((*countptr)++ > 1000) return -1; /* too complex */
94
95 if (*code == OP_CBRA || *code == OP_SCBRA ||
96 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
97
98 /* Scan along the opcodes for this branch. If we get to the end of the
99 branch, check the length against that of the other branches. */
100
101 for (;;)
102 {
103 int d, min;
104 pcre_uchar *cs, *ce;
105 register pcre_uchar op = *cc;
106
107 switch (op)
108 {
109 case OP_COND:
110 case OP_SCOND:
111
112 /* If there is only one branch in a condition, the implied branch has zero
113 length, so we don't add anything. This covers the DEFINE "condition"
114 automatically. */
115
116 cs = cc + GET(cc, 1);
117 if (*cs != OP_ALT)
118 {
119 cc = cs + 1 + LINK_SIZE;
120 break;
121 }
122
123 /* Otherwise we can fall through and treat it the same as any other
124 subpattern. */
125
126 case OP_CBRA:
127 case OP_SCBRA:
128 case OP_BRA:
129 case OP_SBRA:
130 case OP_CBRAPOS:
131 case OP_SCBRAPOS:
132 case OP_BRAPOS:
133 case OP_SBRAPOS:
134 case OP_ONCE:
135 case OP_ONCE_NC:
136 d = find_minlength(re, cc, startcode, options, recurses, countptr);
137 if (d < 0) return d;
138 branchlength += d;
139 do cc += GET(cc, 1); while (*cc == OP_ALT);
140 cc += 1 + LINK_SIZE;
141 break;
142
143 /* ACCEPT makes things far too complicated; we have to give up. */
144
145 case OP_ACCEPT:
146 case OP_ASSERT_ACCEPT:
147 return -1;
148
149 /* Reached end of a branch; if it's a ket it is the end of a nested
150 call. If it's ALT it is an alternation in a nested call. If it is END it's
151 the end of the outer call. All can be handled by the same code. If an
152 ACCEPT was previously encountered, use the length that was in force at that
153 time, and pass back the shortest ACCEPT length. */
154
155 case OP_ALT:
156 case OP_KET:
157 case OP_KETRMAX:
158 case OP_KETRMIN:
159 case OP_KETRPOS:
160 case OP_END:
161 if (length < 0 || (!had_recurse && branchlength < length))
162 length = branchlength;
163 if (op != OP_ALT) return length;
164 cc += 1 + LINK_SIZE;
165 branchlength = 0;
166 had_recurse = FALSE;
167 break;
168
169 /* Skip over assertive subpatterns */
170
171 case OP_ASSERT:
172 case OP_ASSERT_NOT:
173 case OP_ASSERTBACK:
174 case OP_ASSERTBACK_NOT:
175 do cc += GET(cc, 1); while (*cc == OP_ALT);
176 /* Fall through */
177
178 /* Skip over things that don't match chars */
179
180 case OP_REVERSE:
181 case OP_CREF:
182 case OP_DNCREF:
183 case OP_RREF:
184 case OP_DNRREF:
185 case OP_DEF:
186 case OP_CALLOUT:
187 case OP_SOD:
188 case OP_SOM:
189 case OP_EOD:
190 case OP_EODN:
191 case OP_CIRC:
192 case OP_CIRCM:
193 case OP_DOLL:
194 case OP_DOLLM:
195 case OP_NOT_WORD_BOUNDARY:
196 case OP_WORD_BOUNDARY:
197 cc += PRIV(OP_lengths)[*cc];
198 break;
199
200 /* Skip over a subpattern that has a {0} or {0,x} quantifier */
201
202 case OP_BRAZERO:
203 case OP_BRAMINZERO:
204 case OP_BRAPOSZERO:
205 case OP_SKIPZERO:
206 cc += PRIV(OP_lengths)[*cc];
207 do cc += GET(cc, 1); while (*cc == OP_ALT);
208 cc += 1 + LINK_SIZE;
209 break;
210
211 /* Handle literal characters and + repetitions */
212
213 case OP_CHAR:
214 case OP_CHARI:
215 case OP_NOT:
216 case OP_NOTI:
217 case OP_PLUS:
218 case OP_PLUSI:
219 case OP_MINPLUS:
220 case OP_MINPLUSI:
221 case OP_POSPLUS:
222 case OP_POSPLUSI:
223 case OP_NOTPLUS:
224 case OP_NOTPLUSI:
225 case OP_NOTMINPLUS:
226 case OP_NOTMINPLUSI:
227 case OP_NOTPOSPLUS:
228 case OP_NOTPOSPLUSI:
229 branchlength++;
230 cc += 2;
231 #ifdef SUPPORT_UTF
232 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
233 #endif
234 break;
235
236 case OP_TYPEPLUS:
237 case OP_TYPEMINPLUS:
238 case OP_TYPEPOSPLUS:
239 branchlength++;
240 cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
241 break;
242
243 /* Handle exact repetitions. The count is already in characters, but we
244 need to skip over a multibyte character in UTF8 mode. */
245
246 case OP_EXACT:
247 case OP_EXACTI:
248 case OP_NOTEXACT:
249 case OP_NOTEXACTI:
250 branchlength += GET2(cc,1);
251 cc += 2 + IMM2_SIZE;
252 #ifdef SUPPORT_UTF
253 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
254 #endif
255 break;
256
257 case OP_TYPEEXACT:
258 branchlength += GET2(cc,1);
259 cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
260 || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
261 break;
262
263 /* Handle single-char non-literal matchers */
264
265 case OP_PROP:
266 case OP_NOTPROP:
267 cc += 2;
268 /* Fall through */
269
270 case OP_NOT_DIGIT:
271 case OP_DIGIT:
272 case OP_NOT_WHITESPACE:
273 case OP_WHITESPACE:
274 case OP_NOT_WORDCHAR:
275 case OP_WORDCHAR:
276 case OP_ANY:
277 case OP_ALLANY:
278 case OP_EXTUNI:
279 case OP_HSPACE:
280 case OP_NOT_HSPACE:
281 case OP_VSPACE:
282 case OP_NOT_VSPACE:
283 branchlength++;
284 cc++;
285 break;
286
287 /* "Any newline" might match two characters, but it also might match just
288 one. */
289
290 case OP_ANYNL:
291 branchlength += 1;
292 cc++;
293 break;
294
295 /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
296 non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
297 appear, but leave the code, just in case.) */
298
299 case OP_ANYBYTE:
300 #ifdef SUPPORT_UTF
301 if (utf) return -1;
302 #endif
303 branchlength++;
304 cc++;
305 break;
306
307 /* For repeated character types, we have to test for \p and \P, which have
308 an extra two bytes of parameters. */
309
310 case OP_TYPESTAR:
311 case OP_TYPEMINSTAR:
312 case OP_TYPEQUERY:
313 case OP_TYPEMINQUERY:
314 case OP_TYPEPOSSTAR:
315 case OP_TYPEPOSQUERY:
316 if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
317 cc += PRIV(OP_lengths)[op];
318 break;
319
320 case OP_TYPEUPTO:
321 case OP_TYPEMINUPTO:
322 case OP_TYPEPOSUPTO:
323 if (cc[1 + IMM2_SIZE] == OP_PROP
324 || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
325 cc += PRIV(OP_lengths)[op];
326 break;
327
328 /* Check a class for variable quantification */
329
330 case OP_CLASS:
331 case OP_NCLASS:
332 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
333 case OP_XCLASS:
334 /* The original code caused an unsigned overflow in 64 bit systems,
335 so now we use a conditional statement. */
336 if (op == OP_XCLASS)
337 cc += GET(cc, 1);
338 else
339 cc += PRIV(OP_lengths)[OP_CLASS];
340 #else
341 cc += PRIV(OP_lengths)[OP_CLASS];
342 #endif
343
344 switch (*cc)
345 {
346 case OP_CRPLUS:
347 case OP_CRMINPLUS:
348 case OP_CRPOSPLUS:
349 branchlength++;
350 /* Fall through */
351
352 case OP_CRSTAR:
353 case OP_CRMINSTAR:
354 case OP_CRQUERY:
355 case OP_CRMINQUERY:
356 case OP_CRPOSSTAR:
357 case OP_CRPOSQUERY:
358 cc++;
359 break;
360
361 case OP_CRRANGE:
362 case OP_CRMINRANGE:
363 case OP_CRPOSRANGE:
364 branchlength += GET2(cc,1);
365 cc += 1 + 2 * IMM2_SIZE;
366 break;
367
368 default:
369 branchlength++;
370 break;
371 }
372 break;
373
374 /* Backreferences and subroutine calls are treated in the same way: we find
375 the minimum length for the subpattern. A recursion, however, causes an
376 a flag to be set that causes the length of this branch to be ignored. The
377 logic is that a recursion can only make sense if there is another
378 alternation that stops the recursing. That will provide the minimum length
379 (when no recursion happens). A backreference within the group that it is
380 referencing behaves in the same way.
381
382 If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
383 matches an empty string (by default it causes a matching failure), so in
384 that case we must set the minimum length to zero. */
385
386 case OP_DNREF: /* Duplicate named pattern back reference */
387 case OP_DNREFI:
388 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
389 {
390 int count = GET2(cc, 1+IMM2_SIZE);
391 pcre_uchar *slot = (pcre_uchar *)re +
392 re->name_table_offset + GET2(cc, 1) * re->name_entry_size;
393 d = INT_MAX;
394 while (count-- > 0)
395 {
396 ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
397 if (cs == NULL) return -2;
398 do ce += GET(ce, 1); while (*ce == OP_ALT);
399 if (cc > cs && cc < ce) /* Simple recursion */
400 {
401 d = 0;
402 had_recurse = TRUE;
403 break;
404 }
405 else
406 {
407 recurse_check *r = recurses;
408 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
409 if (r != NULL) /* Mutual recursion */
410 {
411 d = 0;
412 had_recurse = TRUE;
413 break;
414 }
415 else
416 {
417 int dd;
418 this_recurse.prev = recurses;
419 this_recurse.group = cs;
420 dd = find_minlength(re, cs, startcode, options, &this_recurse,
421 countptr);
422 if (dd < d) d = dd;
423 }
424 }
425 slot += re->name_entry_size;
426 }
427 }
428 else d = 0;
429 cc += 1 + 2*IMM2_SIZE;
430 goto REPEAT_BACK_REFERENCE;
431
432 case OP_REF: /* Single back reference */
433 case OP_REFI:
434 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
435 {
436 ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
437 if (cs == NULL) return -2;
438 do ce += GET(ce, 1); while (*ce == OP_ALT);
439 if (cc > cs && cc < ce) /* Simple recursion */
440 {
441 d = 0;
442 had_recurse = TRUE;
443 }
444 else
445 {
446 recurse_check *r = recurses;
447 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
448 if (r != NULL) /* Mutual recursion */
449 {
450 d = 0;
451 had_recurse = TRUE;
452 }
453 else
454 {
455 this_recurse.prev = recurses;
456 this_recurse.group = cs;
457 d = find_minlength(re, cs, startcode, options, &this_recurse,
458 countptr);
459 }
460 }
461 }
462 else d = 0;
463 cc += 1 + IMM2_SIZE;
464
465 /* Handle repeated back references */
466
467 REPEAT_BACK_REFERENCE:
468 switch (*cc)
469 {
470 case OP_CRSTAR:
471 case OP_CRMINSTAR:
472 case OP_CRQUERY:
473 case OP_CRMINQUERY:
474 case OP_CRPOSSTAR:
475 case OP_CRPOSQUERY:
476 min = 0;
477 cc++;
478 break;
479
480 case OP_CRPLUS:
481 case OP_CRMINPLUS:
482 case OP_CRPOSPLUS:
483 min = 1;
484 cc++;
485 break;
486
487 case OP_CRRANGE:
488 case OP_CRMINRANGE:
489 case OP_CRPOSRANGE:
490 min = GET2(cc, 1);
491 cc += 1 + 2 * IMM2_SIZE;
492 break;
493
494 default:
495 min = 1;
496 break;
497 }
498
499 branchlength += min * d;
500 break;
501
502 /* We can easily detect direct recursion, but not mutual recursion. This is
503 caught by a recursion depth count. */
504
505 case OP_RECURSE:
506 cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
507 do ce += GET(ce, 1); while (*ce == OP_ALT);
508 if (cc > cs && cc < ce) /* Simple recursion */
509 had_recurse = TRUE;
510 else
511 {
512 recurse_check *r = recurses;
513 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
514 if (r != NULL) /* Mutual recursion */
515 had_recurse = TRUE;
516 else
517 {
518 this_recurse.prev = recurses;
519 this_recurse.group = cs;
520 branchlength += find_minlength(re, cs, startcode, options,
521 &this_recurse, countptr);
522 }
523 }
524 cc += 1 + LINK_SIZE;
525 break;
526
527 /* Anything else does not or need not match a character. We can get the
528 item's length from the table, but for those that can match zero occurrences
529 of a character, we must take special action for UTF-8 characters. As it
530 happens, the "NOT" versions of these opcodes are used at present only for
531 ASCII characters, so they could be omitted from this list. However, in
532 future that may change, so we include them here so as not to leave a
533 gotcha for a future maintainer. */
534
535 case OP_UPTO:
536 case OP_UPTOI:
537 case OP_NOTUPTO:
538 case OP_NOTUPTOI:
539 case OP_MINUPTO:
540 case OP_MINUPTOI:
541 case OP_NOTMINUPTO:
542 case OP_NOTMINUPTOI:
543 case OP_POSUPTO:
544 case OP_POSUPTOI:
545 case OP_NOTPOSUPTO:
546 case OP_NOTPOSUPTOI:
547
548 case OP_STAR:
549 case OP_STARI:
550 case OP_NOTSTAR:
551 case OP_NOTSTARI:
552 case OP_MINSTAR:
553 case OP_MINSTARI:
554 case OP_NOTMINSTAR:
555 case OP_NOTMINSTARI:
556 case OP_POSSTAR:
557 case OP_POSSTARI:
558 case OP_NOTPOSSTAR:
559 case OP_NOTPOSSTARI:
560
561 case OP_QUERY:
562 case OP_QUERYI:
563 case OP_NOTQUERY:
564 case OP_NOTQUERYI:
565 case OP_MINQUERY:
566 case OP_MINQUERYI:
567 case OP_NOTMINQUERY:
568 case OP_NOTMINQUERYI:
569 case OP_POSQUERY:
570 case OP_POSQUERYI:
571 case OP_NOTPOSQUERY:
572 case OP_NOTPOSQUERYI:
573
574 cc += PRIV(OP_lengths)[op];
575 #ifdef SUPPORT_UTF
576 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
577 #endif
578 break;
579
580 /* Skip these, but we need to add in the name length. */
581
582 case OP_MARK:
583 case OP_PRUNE_ARG:
584 case OP_SKIP_ARG:
585 case OP_THEN_ARG:
586 cc += PRIV(OP_lengths)[op] + cc[1];
587 break;
588
589 /* The remaining opcodes are just skipped over. */
590
591 case OP_CLOSE:
592 case OP_COMMIT:
593 case OP_FAIL:
594 case OP_PRUNE:
595 case OP_SET_SOM:
596 case OP_SKIP:
597 case OP_THEN:
598 cc += PRIV(OP_lengths)[op];
599 break;
600
601 /* This should not occur: we list all opcodes explicitly so that when
602 new ones get added they are properly considered. */
603
604 default:
605 return -3;
606 }
607 }
608 /* Control never gets here */
609 }
610
611
612
613 /*************************************************
614 * Set a bit and maybe its alternate case *
615 *************************************************/
616
617 /* Given a character, set its first byte's bit in the table, and also the
618 corresponding bit for the other version of a letter if we are caseless. In
619 UTF-8 mode, for characters greater than 127, we can only do the caseless thing
620 when Unicode property support is available.
621
622 Arguments:
623 start_bits points to the bit map
624 p points to the character
625 caseless the caseless flag
626 cd the block with char table pointers
627 utf TRUE for UTF-8 / UTF-16 / UTF-32 mode
628
629 Returns: pointer after the character
630 */
631
632 static const pcre_uchar *
set_table_bit(pcre_uint8 * start_bits,const pcre_uchar * p,BOOL caseless,compile_data * cd,BOOL utf)633 set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
634 compile_data *cd, BOOL utf)
635 {
636 pcre_uint32 c = *p;
637
638 #ifdef COMPILE_PCRE8
639 SET_BIT(c);
640
641 #ifdef SUPPORT_UTF
642 if (utf && c > 127)
643 {
644 GETCHARINC(c, p);
645 #ifdef SUPPORT_UCP
646 if (caseless)
647 {
648 pcre_uchar buff[6];
649 c = UCD_OTHERCASE(c);
650 (void)PRIV(ord2utf)(c, buff);
651 SET_BIT(buff[0]);
652 }
653 #endif /* Not SUPPORT_UCP */
654 return p;
655 }
656 #else /* Not SUPPORT_UTF */
657 (void)(utf); /* Stops warning for unused parameter */
658 #endif /* SUPPORT_UTF */
659
660 /* Not UTF-8 mode, or character is less than 127. */
661
662 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
663 return p + 1;
664 #endif /* COMPILE_PCRE8 */
665
666 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
667 if (c > 0xff)
668 {
669 c = 0xff;
670 caseless = FALSE;
671 }
672 SET_BIT(c);
673
674 #ifdef SUPPORT_UTF
675 if (utf && c > 127)
676 {
677 GETCHARINC(c, p);
678 #ifdef SUPPORT_UCP
679 if (caseless)
680 {
681 c = UCD_OTHERCASE(c);
682 if (c > 0xff)
683 c = 0xff;
684 SET_BIT(c);
685 }
686 #endif /* SUPPORT_UCP */
687 return p;
688 }
689 #else /* Not SUPPORT_UTF */
690 (void)(utf); /* Stops warning for unused parameter */
691 #endif /* SUPPORT_UTF */
692
693 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
694 return p + 1;
695 #endif
696 }
697
698
699
700 /*************************************************
701 * Set bits for a positive character type *
702 *************************************************/
703
704 /* This function sets starting bits for a character type. In UTF-8 mode, we can
705 only do a direct setting for bytes less than 128, as otherwise there can be
706 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
707 environment, the tables will only recognize ASCII characters anyway, but in at
708 least one Windows environment, some higher bytes bits were set in the tables.
709 So we deal with that case by considering the UTF-8 encoding.
710
711 Arguments:
712 start_bits the starting bitmap
713 cbit type the type of character wanted
714 table_limit 32 for non-UTF-8; 16 for UTF-8
715 cd the block with char table pointers
716
717 Returns: nothing
718 */
719
720 static void
set_type_bits(pcre_uint8 * start_bits,int cbit_type,unsigned int table_limit,compile_data * cd)721 set_type_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
722 compile_data *cd)
723 {
724 register pcre_uint32 c;
725 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
726 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
727 if (table_limit == 32) return;
728 for (c = 128; c < 256; c++)
729 {
730 if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
731 {
732 pcre_uchar buff[6];
733 (void)PRIV(ord2utf)(c, buff);
734 SET_BIT(buff[0]);
735 }
736 }
737 #endif
738 }
739
740
741 /*************************************************
742 * Set bits for a negative character type *
743 *************************************************/
744
745 /* This function sets starting bits for a negative character type such as \D.
746 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
747 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
748 Unlike in the positive case, where we can set appropriate starting bits for
749 specific high-valued UTF-8 characters, in this case we have to set the bits for
750 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
751 0xc0 (192) for simplicity.
752
753 Arguments:
754 start_bits the starting bitmap
755 cbit type the type of character wanted
756 table_limit 32 for non-UTF-8; 16 for UTF-8
757 cd the block with char table pointers
758
759 Returns: nothing
760 */
761
762 static void
set_nottype_bits(pcre_uint8 * start_bits,int cbit_type,unsigned int table_limit,compile_data * cd)763 set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
764 compile_data *cd)
765 {
766 register pcre_uint32 c;
767 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
768 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
769 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
770 #endif
771 }
772
773
774
775 /*************************************************
776 * Create bitmap of starting bytes *
777 *************************************************/
778
779 /* This function scans a compiled unanchored expression recursively and
780 attempts to build a bitmap of the set of possible starting bytes. As time goes
781 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
782 useful for parenthesized groups in patterns such as (a*)b where the group
783 provides some optional starting bytes but scanning must continue at the outer
784 level to find at least one mandatory byte. At the outermost level, this
785 function fails unless the result is SSB_DONE.
786
787 Arguments:
788 code points to an expression
789 start_bits points to a 32-byte table, initialized to 0
790 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
791 cd the block with char table pointers
792
793 Returns: SSB_FAIL => Failed to find any starting bytes
794 SSB_DONE => Found mandatory starting bytes
795 SSB_CONTINUE => Found optional starting bytes
796 SSB_UNKNOWN => Hit an unrecognized opcode
797 */
798
799 static int
set_start_bits(const pcre_uchar * code,pcre_uint8 * start_bits,BOOL utf,compile_data * cd)800 set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
801 compile_data *cd)
802 {
803 register pcre_uint32 c;
804 int yield = SSB_DONE;
805 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
806 int table_limit = utf? 16:32;
807 #else
808 int table_limit = 32;
809 #endif
810
811 #if 0
812 /* ========================================================================= */
813 /* The following comment and code was inserted in January 1999. In May 2006,
814 when it was observed to cause compiler warnings about unused values, I took it
815 out again. If anybody is still using OS/2, they will have to put it back
816 manually. */
817
818 /* This next statement and the later reference to dummy are here in order to
819 trick the optimizer of the IBM C compiler for OS/2 into generating correct
820 code. Apparently IBM isn't going to fix the problem, and we would rather not
821 disable optimization (in this module it actually makes a big difference, and
822 the pcre module can use all the optimization it can get). */
823
824 volatile int dummy;
825 /* ========================================================================= */
826 #endif
827
828 do
829 {
830 BOOL try_next = TRUE;
831 const pcre_uchar *tcode = code + 1 + LINK_SIZE;
832
833 if (*code == OP_CBRA || *code == OP_SCBRA ||
834 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
835
836 while (try_next) /* Loop for items in this branch */
837 {
838 int rc;
839
840 switch(*tcode)
841 {
842 /* If we reach something we don't understand, it means a new opcode has
843 been created that hasn't been added to this code. Hopefully this problem
844 will be discovered during testing. */
845
846 default:
847 return SSB_UNKNOWN;
848
849 /* Fail for a valid opcode that implies no starting bits. */
850
851 case OP_ACCEPT:
852 case OP_ASSERT_ACCEPT:
853 case OP_ALLANY:
854 case OP_ANY:
855 case OP_ANYBYTE:
856 case OP_CIRC:
857 case OP_CIRCM:
858 case OP_CLOSE:
859 case OP_COMMIT:
860 case OP_COND:
861 case OP_CREF:
862 case OP_DEF:
863 case OP_DNCREF:
864 case OP_DNREF:
865 case OP_DNREFI:
866 case OP_DNRREF:
867 case OP_DOLL:
868 case OP_DOLLM:
869 case OP_END:
870 case OP_EOD:
871 case OP_EODN:
872 case OP_EXTUNI:
873 case OP_FAIL:
874 case OP_MARK:
875 case OP_NOT:
876 case OP_NOTEXACT:
877 case OP_NOTEXACTI:
878 case OP_NOTI:
879 case OP_NOTMINPLUS:
880 case OP_NOTMINPLUSI:
881 case OP_NOTMINQUERY:
882 case OP_NOTMINQUERYI:
883 case OP_NOTMINSTAR:
884 case OP_NOTMINSTARI:
885 case OP_NOTMINUPTO:
886 case OP_NOTMINUPTOI:
887 case OP_NOTPLUS:
888 case OP_NOTPLUSI:
889 case OP_NOTPOSPLUS:
890 case OP_NOTPOSPLUSI:
891 case OP_NOTPOSQUERY:
892 case OP_NOTPOSQUERYI:
893 case OP_NOTPOSSTAR:
894 case OP_NOTPOSSTARI:
895 case OP_NOTPOSUPTO:
896 case OP_NOTPOSUPTOI:
897 case OP_NOTPROP:
898 case OP_NOTQUERY:
899 case OP_NOTQUERYI:
900 case OP_NOTSTAR:
901 case OP_NOTSTARI:
902 case OP_NOTUPTO:
903 case OP_NOTUPTOI:
904 case OP_NOT_HSPACE:
905 case OP_NOT_VSPACE:
906 case OP_PRUNE:
907 case OP_PRUNE_ARG:
908 case OP_RECURSE:
909 case OP_REF:
910 case OP_REFI:
911 case OP_REVERSE:
912 case OP_RREF:
913 case OP_SCOND:
914 case OP_SET_SOM:
915 case OP_SKIP:
916 case OP_SKIP_ARG:
917 case OP_SOD:
918 case OP_SOM:
919 case OP_THEN:
920 case OP_THEN_ARG:
921 return SSB_FAIL;
922
923 /* A "real" property test implies no starting bits, but the fake property
924 PT_CLIST identifies a list of characters. These lists are short, as they
925 are used for characters with more than one "other case", so there is no
926 point in recognizing them for OP_NOTPROP. */
927
928 case OP_PROP:
929 if (tcode[1] != PT_CLIST) return SSB_FAIL;
930 {
931 const pcre_uint32 *p = PRIV(ucd_caseless_sets) + tcode[2];
932 while ((c = *p++) < NOTACHAR)
933 {
934 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
935 if (utf)
936 {
937 pcre_uchar buff[6];
938 (void)PRIV(ord2utf)(c, buff);
939 c = buff[0];
940 }
941 #endif
942 if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
943 }
944 }
945 try_next = FALSE;
946 break;
947
948 /* We can ignore word boundary tests. */
949
950 case OP_WORD_BOUNDARY:
951 case OP_NOT_WORD_BOUNDARY:
952 tcode++;
953 break;
954
955 /* If we hit a bracket or a positive lookahead assertion, recurse to set
956 bits from within the subpattern. If it can't find anything, we have to
957 give up. If it finds some mandatory character(s), we are done for this
958 branch. Otherwise, carry on scanning after the subpattern. */
959
960 case OP_BRA:
961 case OP_SBRA:
962 case OP_CBRA:
963 case OP_SCBRA:
964 case OP_BRAPOS:
965 case OP_SBRAPOS:
966 case OP_CBRAPOS:
967 case OP_SCBRAPOS:
968 case OP_ONCE:
969 case OP_ONCE_NC:
970 case OP_ASSERT:
971 rc = set_start_bits(tcode, start_bits, utf, cd);
972 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
973 if (rc == SSB_DONE) try_next = FALSE; else
974 {
975 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
976 tcode += 1 + LINK_SIZE;
977 }
978 break;
979
980 /* If we hit ALT or KET, it means we haven't found anything mandatory in
981 this branch, though we might have found something optional. For ALT, we
982 continue with the next alternative, but we have to arrange that the final
983 result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
984 return SSB_CONTINUE: if this is the top level, that indicates failure,
985 but after a nested subpattern, it causes scanning to continue. */
986
987 case OP_ALT:
988 yield = SSB_CONTINUE;
989 try_next = FALSE;
990 break;
991
992 case OP_KET:
993 case OP_KETRMAX:
994 case OP_KETRMIN:
995 case OP_KETRPOS:
996 return SSB_CONTINUE;
997
998 /* Skip over callout */
999
1000 case OP_CALLOUT:
1001 tcode += 2 + 2*LINK_SIZE;
1002 break;
1003
1004 /* Skip over lookbehind and negative lookahead assertions */
1005
1006 case OP_ASSERT_NOT:
1007 case OP_ASSERTBACK:
1008 case OP_ASSERTBACK_NOT:
1009 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1010 tcode += 1 + LINK_SIZE;
1011 break;
1012
1013 /* BRAZERO does the bracket, but carries on. */
1014
1015 case OP_BRAZERO:
1016 case OP_BRAMINZERO:
1017 case OP_BRAPOSZERO:
1018 rc = set_start_bits(++tcode, start_bits, utf, cd);
1019 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
1020 /* =========================================================================
1021 See the comment at the head of this function concerning the next line,
1022 which was an old fudge for the benefit of OS/2.
1023 dummy = 1;
1024 ========================================================================= */
1025 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1026 tcode += 1 + LINK_SIZE;
1027 break;
1028
1029 /* SKIPZERO skips the bracket. */
1030
1031 case OP_SKIPZERO:
1032 tcode++;
1033 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1034 tcode += 1 + LINK_SIZE;
1035 break;
1036
1037 /* Single-char * or ? sets the bit and tries the next item */
1038
1039 case OP_STAR:
1040 case OP_MINSTAR:
1041 case OP_POSSTAR:
1042 case OP_QUERY:
1043 case OP_MINQUERY:
1044 case OP_POSQUERY:
1045 tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
1046 break;
1047
1048 case OP_STARI:
1049 case OP_MINSTARI:
1050 case OP_POSSTARI:
1051 case OP_QUERYI:
1052 case OP_MINQUERYI:
1053 case OP_POSQUERYI:
1054 tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
1055 break;
1056
1057 /* Single-char upto sets the bit and tries the next */
1058
1059 case OP_UPTO:
1060 case OP_MINUPTO:
1061 case OP_POSUPTO:
1062 tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
1063 break;
1064
1065 case OP_UPTOI:
1066 case OP_MINUPTOI:
1067 case OP_POSUPTOI:
1068 tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
1069 break;
1070
1071 /* At least one single char sets the bit and stops */
1072
1073 case OP_EXACT:
1074 tcode += IMM2_SIZE;
1075 /* Fall through */
1076 case OP_CHAR:
1077 case OP_PLUS:
1078 case OP_MINPLUS:
1079 case OP_POSPLUS:
1080 (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
1081 try_next = FALSE;
1082 break;
1083
1084 case OP_EXACTI:
1085 tcode += IMM2_SIZE;
1086 /* Fall through */
1087 case OP_CHARI:
1088 case OP_PLUSI:
1089 case OP_MINPLUSI:
1090 case OP_POSPLUSI:
1091 (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
1092 try_next = FALSE;
1093 break;
1094
1095 /* Special spacing and line-terminating items. These recognize specific
1096 lists of characters. The difference between VSPACE and ANYNL is that the
1097 latter can match the two-character CRLF sequence, but that is not
1098 relevant for finding the first character, so their code here is
1099 identical. */
1100
1101 case OP_HSPACE:
1102 SET_BIT(CHAR_HT);
1103 SET_BIT(CHAR_SPACE);
1104 #ifdef SUPPORT_UTF
1105 if (utf)
1106 {
1107 #ifdef COMPILE_PCRE8
1108 SET_BIT(0xC2); /* For U+00A0 */
1109 SET_BIT(0xE1); /* For U+1680, U+180E */
1110 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1111 SET_BIT(0xE3); /* For U+3000 */
1112 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1113 SET_BIT(0xA0);
1114 SET_BIT(0xFF); /* For characters > 255 */
1115 #endif /* COMPILE_PCRE[8|16|32] */
1116 }
1117 else
1118 #endif /* SUPPORT_UTF */
1119 {
1120 #ifndef EBCDIC
1121 SET_BIT(0xA0);
1122 #endif /* Not EBCDIC */
1123 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1124 SET_BIT(0xFF); /* For characters > 255 */
1125 #endif /* COMPILE_PCRE[16|32] */
1126 }
1127 try_next = FALSE;
1128 break;
1129
1130 case OP_ANYNL:
1131 case OP_VSPACE:
1132 SET_BIT(CHAR_LF);
1133 SET_BIT(CHAR_VT);
1134 SET_BIT(CHAR_FF);
1135 SET_BIT(CHAR_CR);
1136 #ifdef SUPPORT_UTF
1137 if (utf)
1138 {
1139 #ifdef COMPILE_PCRE8
1140 SET_BIT(0xC2); /* For U+0085 */
1141 SET_BIT(0xE2); /* For U+2028, U+2029 */
1142 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1143 SET_BIT(CHAR_NEL);
1144 SET_BIT(0xFF); /* For characters > 255 */
1145 #endif /* COMPILE_PCRE[8|16|32] */
1146 }
1147 else
1148 #endif /* SUPPORT_UTF */
1149 {
1150 SET_BIT(CHAR_NEL);
1151 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1152 SET_BIT(0xFF); /* For characters > 255 */
1153 #endif
1154 }
1155 try_next = FALSE;
1156 break;
1157
1158 /* Single character types set the bits and stop. Note that if PCRE_UCP
1159 is set, we do not see these op codes because \d etc are converted to
1160 properties. Therefore, these apply in the case when only characters less
1161 than 256 are recognized to match the types. */
1162
1163 case OP_NOT_DIGIT:
1164 set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1165 try_next = FALSE;
1166 break;
1167
1168 case OP_DIGIT:
1169 set_type_bits(start_bits, cbit_digit, table_limit, cd);
1170 try_next = FALSE;
1171 break;
1172
1173 /* The cbit_space table has vertical tab as whitespace; we no longer
1174 have to play fancy tricks because Perl added VT to its whitespace at
1175 release 5.18. PCRE added it at release 8.34. */
1176
1177 case OP_NOT_WHITESPACE:
1178 set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1179 try_next = FALSE;
1180 break;
1181
1182 case OP_WHITESPACE:
1183 set_type_bits(start_bits, cbit_space, table_limit, cd);
1184 try_next = FALSE;
1185 break;
1186
1187 case OP_NOT_WORDCHAR:
1188 set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1189 try_next = FALSE;
1190 break;
1191
1192 case OP_WORDCHAR:
1193 set_type_bits(start_bits, cbit_word, table_limit, cd);
1194 try_next = FALSE;
1195 break;
1196
1197 /* One or more character type fudges the pointer and restarts, knowing
1198 it will hit a single character type and stop there. */
1199
1200 case OP_TYPEPLUS:
1201 case OP_TYPEMINPLUS:
1202 case OP_TYPEPOSPLUS:
1203 tcode++;
1204 break;
1205
1206 case OP_TYPEEXACT:
1207 tcode += 1 + IMM2_SIZE;
1208 break;
1209
1210 /* Zero or more repeats of character types set the bits and then
1211 try again. */
1212
1213 case OP_TYPEUPTO:
1214 case OP_TYPEMINUPTO:
1215 case OP_TYPEPOSUPTO:
1216 tcode += IMM2_SIZE; /* Fall through */
1217
1218 case OP_TYPESTAR:
1219 case OP_TYPEMINSTAR:
1220 case OP_TYPEPOSSTAR:
1221 case OP_TYPEQUERY:
1222 case OP_TYPEMINQUERY:
1223 case OP_TYPEPOSQUERY:
1224 switch(tcode[1])
1225 {
1226 default:
1227 case OP_ANY:
1228 case OP_ALLANY:
1229 return SSB_FAIL;
1230
1231 case OP_HSPACE:
1232 SET_BIT(CHAR_HT);
1233 SET_BIT(CHAR_SPACE);
1234 #ifdef SUPPORT_UTF
1235 if (utf)
1236 {
1237 #ifdef COMPILE_PCRE8
1238 SET_BIT(0xC2); /* For U+00A0 */
1239 SET_BIT(0xE1); /* For U+1680, U+180E */
1240 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1241 SET_BIT(0xE3); /* For U+3000 */
1242 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1243 SET_BIT(0xA0);
1244 SET_BIT(0xFF); /* For characters > 255 */
1245 #endif /* COMPILE_PCRE[8|16|32] */
1246 }
1247 else
1248 #endif /* SUPPORT_UTF */
1249 #ifndef EBCDIC
1250 SET_BIT(0xA0);
1251 #endif /* Not EBCDIC */
1252 break;
1253
1254 case OP_ANYNL:
1255 case OP_VSPACE:
1256 SET_BIT(CHAR_LF);
1257 SET_BIT(CHAR_VT);
1258 SET_BIT(CHAR_FF);
1259 SET_BIT(CHAR_CR);
1260 #ifdef SUPPORT_UTF
1261 if (utf)
1262 {
1263 #ifdef COMPILE_PCRE8
1264 SET_BIT(0xC2); /* For U+0085 */
1265 SET_BIT(0xE2); /* For U+2028, U+2029 */
1266 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1267 SET_BIT(CHAR_NEL);
1268 SET_BIT(0xFF); /* For characters > 255 */
1269 #endif /* COMPILE_PCRE16 */
1270 }
1271 else
1272 #endif /* SUPPORT_UTF */
1273 SET_BIT(CHAR_NEL);
1274 break;
1275
1276 case OP_NOT_DIGIT:
1277 set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1278 break;
1279
1280 case OP_DIGIT:
1281 set_type_bits(start_bits, cbit_digit, table_limit, cd);
1282 break;
1283
1284 /* The cbit_space table has vertical tab as whitespace; we no longer
1285 have to play fancy tricks because Perl added VT to its whitespace at
1286 release 5.18. PCRE added it at release 8.34. */
1287
1288 case OP_NOT_WHITESPACE:
1289 set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1290 break;
1291
1292 case OP_WHITESPACE:
1293 set_type_bits(start_bits, cbit_space, table_limit, cd);
1294 break;
1295
1296 case OP_NOT_WORDCHAR:
1297 set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1298 break;
1299
1300 case OP_WORDCHAR:
1301 set_type_bits(start_bits, cbit_word, table_limit, cd);
1302 break;
1303 }
1304
1305 tcode += 2;
1306 break;
1307
1308 /* Character class where all the information is in a bit map: set the
1309 bits and either carry on or not, according to the repeat count. If it was
1310 a negative class, and we are operating with UTF-8 characters, any byte
1311 with a value >= 0xc4 is a potentially valid starter because it starts a
1312 character with a value > 255. */
1313
1314 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1315 case OP_XCLASS:
1316 if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0)
1317 return SSB_FAIL;
1318 /* All bits are set. */
1319 if ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0 && (tcode[1 + LINK_SIZE] & XCL_NOT) != 0)
1320 return SSB_FAIL;
1321 #endif
1322 /* Fall through */
1323
1324 case OP_NCLASS:
1325 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1326 if (utf)
1327 {
1328 start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
1329 memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
1330 }
1331 #endif
1332 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1333 SET_BIT(0xFF); /* For characters > 255 */
1334 #endif
1335 /* Fall through */
1336
1337 case OP_CLASS:
1338 {
1339 pcre_uint8 *map;
1340 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1341 map = NULL;
1342 if (*tcode == OP_XCLASS)
1343 {
1344 if ((tcode[1 + LINK_SIZE] & XCL_MAP) != 0)
1345 map = (pcre_uint8 *)(tcode + 1 + LINK_SIZE + 1);
1346 tcode += GET(tcode, 1);
1347 }
1348 else
1349 #endif
1350 {
1351 tcode++;
1352 map = (pcre_uint8 *)tcode;
1353 tcode += 32 / sizeof(pcre_uchar);
1354 }
1355
1356 /* In UTF-8 mode, the bits in a bit map correspond to character
1357 values, not to byte values. However, the bit map we are constructing is
1358 for byte values. So we have to do a conversion for characters whose
1359 value is > 127. In fact, there are only two possible starting bytes for
1360 characters in the range 128 - 255. */
1361
1362 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1363 if (map != NULL)
1364 #endif
1365 {
1366 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1367 if (utf)
1368 {
1369 for (c = 0; c < 16; c++) start_bits[c] |= map[c];
1370 for (c = 128; c < 256; c++)
1371 {
1372 if ((map[c/8] && (1 << (c&7))) != 0)
1373 {
1374 int d = (c >> 6) | 0xc0; /* Set bit for this starter */
1375 start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
1376 c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
1377 }
1378 }
1379 }
1380 else
1381 #endif
1382 {
1383 /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1384 for (c = 0; c < 32; c++) start_bits[c] |= map[c];
1385 }
1386 }
1387
1388 /* Advance past the bit map, and act on what follows. For a zero
1389 minimum repeat, continue; otherwise stop processing. */
1390
1391 switch (*tcode)
1392 {
1393 case OP_CRSTAR:
1394 case OP_CRMINSTAR:
1395 case OP_CRQUERY:
1396 case OP_CRMINQUERY:
1397 case OP_CRPOSSTAR:
1398 case OP_CRPOSQUERY:
1399 tcode++;
1400 break;
1401
1402 case OP_CRRANGE:
1403 case OP_CRMINRANGE:
1404 case OP_CRPOSRANGE:
1405 if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1406 else try_next = FALSE;
1407 break;
1408
1409 default:
1410 try_next = FALSE;
1411 break;
1412 }
1413 }
1414 break; /* End of bitmap class handling */
1415
1416 } /* End of switch */
1417 } /* End of try_next loop */
1418
1419 code += GET(code, 1); /* Advance to next branch */
1420 }
1421 while (*code == OP_ALT);
1422 return yield;
1423 }
1424
1425
1426
1427
1428
1429 /*************************************************
1430 * Study a compiled expression *
1431 *************************************************/
1432
1433 /* This function is handed a compiled expression that it must study to produce
1434 information that will speed up the matching. It returns a pcre[16]_extra block
1435 which then gets handed back to pcre_exec().
1436
1437 Arguments:
1438 re points to the compiled expression
1439 options contains option bits
1440 errorptr points to where to place error messages;
1441 set NULL unless error
1442
1443 Returns: pointer to a pcre[16]_extra block, with study_data filled in and
1444 the appropriate flags set;
1445 NULL on error or if no optimization possible
1446 */
1447
1448 #if defined COMPILE_PCRE8
1449 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
pcre_study(const pcre * external_re,int options,const char ** errorptr)1450 pcre_study(const pcre *external_re, int options, const char **errorptr)
1451 #elif defined COMPILE_PCRE16
1452 PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
1453 pcre16_study(const pcre16 *external_re, int options, const char **errorptr)
1454 #elif defined COMPILE_PCRE32
1455 PCRE_EXP_DEFN pcre32_extra * PCRE_CALL_CONVENTION
1456 pcre32_study(const pcre32 *external_re, int options, const char **errorptr)
1457 #endif
1458 {
1459 int min;
1460 int count = 0;
1461 BOOL bits_set = FALSE;
1462 pcre_uint8 start_bits[32];
1463 PUBL(extra) *extra = NULL;
1464 pcre_study_data *study;
1465 const pcre_uint8 *tables;
1466 pcre_uchar *code;
1467 compile_data compile_block;
1468 const REAL_PCRE *re = (const REAL_PCRE *)external_re;
1469
1470
1471 *errorptr = NULL;
1472
1473 if (re == NULL || re->magic_number != MAGIC_NUMBER)
1474 {
1475 *errorptr = "argument is not a compiled regular expression";
1476 return NULL;
1477 }
1478
1479 if ((re->flags & PCRE_MODE) == 0)
1480 {
1481 #if defined COMPILE_PCRE8
1482 *errorptr = "argument not compiled in 8 bit mode";
1483 #elif defined COMPILE_PCRE16
1484 *errorptr = "argument not compiled in 16 bit mode";
1485 #elif defined COMPILE_PCRE32
1486 *errorptr = "argument not compiled in 32 bit mode";
1487 #endif
1488 return NULL;
1489 }
1490
1491 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1492 {
1493 *errorptr = "unknown or incorrect option bit(s) set";
1494 return NULL;
1495 }
1496
1497 code = (pcre_uchar *)re + re->name_table_offset +
1498 (re->name_count * re->name_entry_size);
1499
1500 /* For an anchored pattern, or an unanchored pattern that has a first char, or
1501 a multiline pattern that matches only at "line starts", there is no point in
1502 seeking a list of starting bytes. */
1503
1504 if ((re->options & PCRE_ANCHORED) == 0 &&
1505 (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1506 {
1507 int rc;
1508
1509 /* Set the character tables in the block that is passed around */
1510
1511 tables = re->tables;
1512
1513 #if defined COMPILE_PCRE8
1514 if (tables == NULL)
1515 (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1516 (void *)(&tables));
1517 #elif defined COMPILE_PCRE16
1518 if (tables == NULL)
1519 (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1520 (void *)(&tables));
1521 #elif defined COMPILE_PCRE32
1522 if (tables == NULL)
1523 (void)pcre32_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1524 (void *)(&tables));
1525 #endif
1526
1527 compile_block.lcc = tables + lcc_offset;
1528 compile_block.fcc = tables + fcc_offset;
1529 compile_block.cbits = tables + cbits_offset;
1530 compile_block.ctypes = tables + ctypes_offset;
1531
1532 /* See if we can find a fixed set of initial characters for the pattern. */
1533
1534 memset(start_bits, 0, 32 * sizeof(pcre_uint8));
1535 rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1536 &compile_block);
1537 bits_set = rc == SSB_DONE;
1538 if (rc == SSB_UNKNOWN)
1539 {
1540 *errorptr = "internal error: opcode not recognized";
1541 return NULL;
1542 }
1543 }
1544
1545 /* Find the minimum length of subject string. */
1546
1547 switch(min = find_minlength(re, code, code, re->options, NULL, &count))
1548 {
1549 case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
1550 case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
1551 default: break;
1552 }
1553
1554 /* If a set of starting bytes has been identified, or if the minimum length is
1555 greater than zero, or if JIT optimization has been requested, or if
1556 PCRE_STUDY_EXTRA_NEEDED is set, get a pcre[16]_extra block and a
1557 pcre_study_data block. The study data is put in the latter, which is pointed to
1558 by the former, which may also get additional data set later by the calling
1559 program. At the moment, the size of pcre_study_data is fixed. We nevertheless
1560 save it in a field for returning via the pcre_fullinfo() function so that if it
1561 becomes variable in the future, we don't have to change that code. */
1562
1563 if (bits_set || min > 0 || (options & (
1564 #ifdef SUPPORT_JIT
1565 PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE |
1566 PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE |
1567 #endif
1568 PCRE_STUDY_EXTRA_NEEDED)) != 0)
1569 {
1570 extra = (PUBL(extra) *)(PUBL(malloc))
1571 (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
1572 if (extra == NULL)
1573 {
1574 *errorptr = "failed to get memory";
1575 return NULL;
1576 }
1577
1578 study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
1579 extra->flags = PCRE_EXTRA_STUDY_DATA;
1580 extra->study_data = study;
1581
1582 study->size = sizeof(pcre_study_data);
1583 study->flags = 0;
1584
1585 /* Set the start bits always, to avoid unset memory errors if the
1586 study data is written to a file, but set the flag only if any of the bits
1587 are set, to save time looking when none are. */
1588
1589 if (bits_set)
1590 {
1591 study->flags |= PCRE_STUDY_MAPPED;
1592 memcpy(study->start_bits, start_bits, sizeof(start_bits));
1593 }
1594 else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
1595
1596 #ifdef PCRE_DEBUG
1597 if (bits_set)
1598 {
1599 pcre_uint8 *ptr = start_bits;
1600 int i;
1601
1602 printf("Start bits:\n");
1603 for (i = 0; i < 32; i++)
1604 printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
1605 }
1606 #endif
1607
1608 /* Always set the minlength value in the block, because the JIT compiler
1609 makes use of it. However, don't set the bit unless the length is greater than
1610 zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
1611 checking the zero case. */
1612
1613 if (min > 0)
1614 {
1615 study->flags |= PCRE_STUDY_MINLEN;
1616 study->minlength = min;
1617 }
1618 else study->minlength = 0;
1619
1620 /* If JIT support was compiled and requested, attempt the JIT compilation.
1621 If no starting bytes were found, and the minimum length is zero, and JIT
1622 compilation fails, abandon the extra block and return NULL, unless
1623 PCRE_STUDY_EXTRA_NEEDED is set. */
1624
1625 #ifdef SUPPORT_JIT
1626 extra->executable_jit = NULL;
1627 if ((options & PCRE_STUDY_JIT_COMPILE) != 0)
1628 PRIV(jit_compile)(re, extra, JIT_COMPILE);
1629 if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0)
1630 PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE);
1631 if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0)
1632 PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE);
1633
1634 if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0 &&
1635 (options & PCRE_STUDY_EXTRA_NEEDED) == 0)
1636 {
1637 #if defined COMPILE_PCRE8
1638 pcre_free_study(extra);
1639 #elif defined COMPILE_PCRE16
1640 pcre16_free_study(extra);
1641 #elif defined COMPILE_PCRE32
1642 pcre32_free_study(extra);
1643 #endif
1644 extra = NULL;
1645 }
1646 #endif
1647 }
1648
1649 return extra;
1650 }
1651
1652
1653 /*************************************************
1654 * Free the study data *
1655 *************************************************/
1656
1657 /* This function frees the memory that was obtained by pcre_study().
1658
1659 Argument: a pointer to the pcre[16]_extra block
1660 Returns: nothing
1661 */
1662
1663 #if defined COMPILE_PCRE8
1664 PCRE_EXP_DEFN void
pcre_free_study(pcre_extra * extra)1665 pcre_free_study(pcre_extra *extra)
1666 #elif defined COMPILE_PCRE16
1667 PCRE_EXP_DEFN void
1668 pcre16_free_study(pcre16_extra *extra)
1669 #elif defined COMPILE_PCRE32
1670 PCRE_EXP_DEFN void
1671 pcre32_free_study(pcre32_extra *extra)
1672 #endif
1673 {
1674 if (extra == NULL)
1675 return;
1676 #ifdef SUPPORT_JIT
1677 if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
1678 extra->executable_jit != NULL)
1679 PRIV(jit_free)(extra->executable_jit);
1680 #endif
1681 PUBL(free)(extra);
1682 }
1683
1684 /* End of pcre_study.c */
1685