1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2014 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
43
44 #include "config.h"
45
46 #define NLBLOCK md /* Block containing newline information */
47 #define PSSTART start_subject /* Field containing processed string start */
48 #define PSEND end_subject /* Field containing processed string end */
49
50 #include "pcre_internal.h"
51
52 /* Undefine some potentially clashing cpp symbols */
53
54 #undef min
55 #undef max
56
57 /* The md->capture_last field uses the lower 16 bits for the last captured
58 substring (which can never be greater than 65535) and a bit in the top half
59 to mean "capture vector overflowed". This odd way of doing things was
60 implemented when it was realized that preserving and restoring the overflow bit
61 whenever the last capture number was saved/restored made for a neater
62 interface, and doing it this way saved on (a) another variable, which would
63 have increased the stack frame size (a big NO-NO in PCRE) and (b) another
64 separate set of save/restore instructions. The following defines are used in
65 implementing this. */
66
67 #define CAPLMASK 0x0000ffff /* The bits used for last_capture */
68 #define OVFLMASK 0xffff0000 /* The bits used for the overflow flag */
69 #define OVFLBIT 0x00010000 /* The bit that is set for overflow */
70
71 /* Values for setting in md->match_function_type to indicate two special types
72 of call to match(). We do it this way to save on using another stack variable,
73 as stack usage is to be discouraged. */
74
75 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
76 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
77
78 /* Non-error returns from the match() function. Error returns are externally
79 defined PCRE_ERROR_xxx codes, which are all negative. */
80
81 #define MATCH_MATCH 1
82 #define MATCH_NOMATCH 0
83
/* Special internal returns from the match() function. Make them sufficiently
negative to avoid the external error codes (the PCRE_ERROR_xxx values, which
are also negative). */

#define MATCH_ACCEPT (-999)
#define MATCH_KETRPOS (-998)
#define MATCH_ONCE (-997)

/* The next 5 must be kept together and in sequence so that a test that checks
for any one of them can use a range, namely MATCH_BACKTRACK_MIN up to
MATCH_BACKTRACK_MAX inclusive. Each is handed back by the corresponding
backtracking-verb opcode in match() when the rest of the pattern, tried from
that verb onwards, gives MATCH_NOMATCH. */

/* From OP_COMMIT */
#define MATCH_COMMIT (-996)
/* From OP_PRUNE and OP_PRUNE_ARG */
#define MATCH_PRUNE (-995)
/* From OP_SKIP, with md->start_match_ptr set to the subject position; also
from OP_MARK when it catches a MATCH_SKIP_ARG naming that mark */
#define MATCH_SKIP (-994)
/* From OP_SKIP_ARG, with md->start_match_ptr overloaded to hold the name */
#define MATCH_SKIP_ARG (-993)
/* From OP_THEN and OP_THEN_ARG, with md->start_match_ptr overloaded to hold
the address of the opcode */
#define MATCH_THEN (-992)
#define MATCH_BACKTRACK_MAX MATCH_THEN
#define MATCH_BACKTRACK_MIN MATCH_COMMIT
99
100 /* Maximum number of ints of offset to save on the stack for recursive calls.
101 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
102 because the offset vector is always a multiple of 3 long. */
103
104 #define REC_STACK_SAVE_MAX 30
105
106 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
107
/* The two tables are parallel: entry n of rep_min and entry n of rep_max
describe the same repeat. Each table has 11 entries. */

static const char rep_min[] = {
  0, 0,
  1, 1,
  0, 0,
  0, 0,
  0, 1, 0
};

static const char rep_max[] = {
  0, 0,
  0, 0,
  1, 1,
  0, 0,
  0, 0, 1
};
110
111 #ifdef PCRE_DEBUG
112 /*************************************************
113 * Debugging function to print chars *
114 *************************************************/
115
116 /* Print a sequence of chars in printable format, stopping at the end of the
117 subject if requested.
118
119 Arguments:
120 p points to characters
121 length number to print
122 is_subject TRUE if printing from within md->start_subject
123 md pointer to matching data block, if is_subject is TRUE
124
125 Returns: nothing
126 */
127
128 static void
pchars(const pcre_uchar * p,int length,BOOL is_subject,match_data * md)129 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
130 {
131 pcre_uint32 c;
132 BOOL utf = md->utf;
133 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
134 while (length-- > 0)
135 if (isprint(c = UCHAR21INCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c);
136 }
137 #endif
138
139
140
141 /*************************************************
142 * Match a back-reference *
143 *************************************************/
144
145 /* Normally, if a back reference hasn't been set, the length that is passed is
146 negative, so the match always fails. However, in JavaScript compatibility mode,
147 the length passed is zero. Note that in caseless UTF-8 mode, the number of
148 subject bytes matched may be different to the number of reference bytes.
149
150 Arguments:
151 offset index into the offset vector
152 eptr pointer into the subject
153 length length of reference to be matched (number of bytes)
154 md points to match data block
155 caseless TRUE if caseless
156
157 Returns: >= 0 the number of subject bytes matched
158 -1 no match
159 -2 partial match; always given if at end subject
160 */
161
162 static int
match_ref(int offset,register PCRE_PUCHAR eptr,int length,match_data * md,BOOL caseless)163 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
164 BOOL caseless)
165 {
166 PCRE_PUCHAR eptr_start = eptr;
167 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
168 #if defined SUPPORT_UTF && defined SUPPORT_UCP
169 BOOL utf = md->utf;
170 #endif
171
172 #ifdef PCRE_DEBUG
173 if (eptr >= md->end_subject)
174 printf("matching subject <null>");
175 else
176 {
177 printf("matching subject ");
178 pchars(eptr, length, TRUE, md);
179 }
180 printf(" against backref ");
181 pchars(p, length, FALSE, md);
182 printf("\n");
183 #endif
184
185 /* Always fail if reference not set (and not JavaScript compatible - in that
186 case the length is passed as zero). */
187
188 if (length < 0) return -1;
189
190 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
191 properly if Unicode properties are supported. Otherwise, we can check only
192 ASCII characters. */
193
194 if (caseless)
195 {
196 #if defined SUPPORT_UTF && defined SUPPORT_UCP
197 if (utf)
198 {
199 /* Match characters up to the end of the reference. NOTE: the number of
200 data units matched may differ, because in UTF-8 there are some characters
201 whose upper and lower case versions code have different numbers of bytes.
202 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
203 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
204 sequence of two of the latter. It is important, therefore, to check the
205 length along the reference, not along the subject (earlier code did this
206 wrong). */
207
208 PCRE_PUCHAR endptr = p + length;
209 while (p < endptr)
210 {
211 pcre_uint32 c, d;
212 const ucd_record *ur;
213 if (eptr >= md->end_subject) return -2; /* Partial match */
214 GETCHARINC(c, eptr);
215 GETCHARINC(d, p);
216 ur = GET_UCD(d);
217 if (c != d && c != d + ur->other_case)
218 {
219 const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
220 for (;;)
221 {
222 if (c < *pp) return -1;
223 if (c == *pp++) break;
224 }
225 }
226 }
227 }
228 else
229 #endif
230
231 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
232 is no UCP support. */
233 {
234 while (length-- > 0)
235 {
236 pcre_uint32 cc, cp;
237 if (eptr >= md->end_subject) return -2; /* Partial match */
238 cc = UCHAR21TEST(eptr);
239 cp = UCHAR21TEST(p);
240 if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1;
241 p++;
242 eptr++;
243 }
244 }
245 }
246
247 /* In the caseful case, we can just compare the bytes, whether or not we
248 are in UTF-8 mode. */
249
250 else
251 {
252 while (length-- > 0)
253 {
254 if (eptr >= md->end_subject) return -2; /* Partial match */
255 if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1;
256 }
257 }
258
259 return (int)(eptr - eptr_start);
260 }
261
262
263
264 /***************************************************************************
265 ****************************************************************************
266 RECURSION IN THE match() FUNCTION
267
268 The match() function is highly recursive, though not every recursive call
269 increases the recursive depth. Nevertheless, some regular expressions can cause
270 it to recurse to a great depth. I was writing for Unix, so I just let it call
271 itself recursively. This uses the stack for saving everything that has to be
272 saved for a recursive call. On Unix, the stack can be large, and this works
273 fine.
274
275 It turns out that on some non-Unix-like systems there are problems with
276 programs that use a lot of stack. (This despite the fact that every last chip
277 has oodles of memory these days, and techniques for extending the stack have
278 been known for decades.) So....
279
280 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
281 calls by keeping local variables that need to be preserved in blocks of memory
282 obtained from malloc() instead of on the stack. Macros are used to
283 achieve this so that the actual code doesn't look very different to what it
284 always used to.
285
286 The original heap-recursive code used longjmp(). However, it seems that this
287 can be very slow on some operating systems. Following a suggestion from Stan
288 Switzer, the use of longjmp() has been abolished, at the cost of having to
289 provide a unique number for each call to RMATCH. There is no way of generating
290 a sequence of numbers at compile time in C. I have given them names, to make
291 them stand out more clearly.
292
293 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
294 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
295 tests. Furthermore, not using longjmp() means that local dynamic variables
296 don't have indeterminate values; this has meant that the frame size can be
297 reduced because the result can be "passed back" by straight setting of the
298 variable instead of being passed in the frame.
299 ****************************************************************************
300 ***************************************************************************/
301
302 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
303 below must be updated in sync. */
304
/* One identifier per RMATCH call site, numbered consecutively from 1. The
explicit value at the start of each row is only a reading aid; it equals the
value that implicit numbering would give. */

enum {
  RM1 = 1,   RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
  RM11 = 11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
  RM21 = 21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
  RM31 = 31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
  RM41 = 41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
  RM51 = 51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
  RM61 = 61, RM62, RM63, RM64, RM65, RM66, RM67
};
312
/* These versions of the macros use the stack, as normal. There are debugging
versions and production versions. Note that the "rw" argument of RMATCH isn't
actually used in this definition.

RMATCH(ra,rb,rc,rd,re,rw) makes a genuine recursive call of match(), passing
ra as eptr, rb as ecode, rc as offset_top, rd as md and re as eptrb. The
mstart and rdepth values are taken from the scope in which the macro is
expanded, with the depth passed on as rdepth+1. The result is left in the
variable rrc of the expanding scope. RRETURN(ra) is an ordinary return. */

#ifndef NO_RECURSE
#define REGISTER register

/* Debugging versions: each call and each return is traced with printf(). */

#ifdef PCRE_DEBUG
#define RMATCH(ra,rb,rc,rd,re,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d\n", ra, __LINE__); \
  return ra; \
  }

/* Production versions. */

#else
#define RMATCH(ra,rb,rc,rd,re,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
#define RRETURN(ra) return ra
#endif
337
338 #else
339
340
/* These versions of the macros manage a private stack on the heap. Note that
the "rd" argument of RMATCH isn't actually used in this definition. It's the md
argument of match(), which never changes. */

/* In this configuration REGISTER expands to nothing. */

#define REGISTER

/* RMATCH simulates a recursive call. It takes the frame that follows the
current one, obtaining it from PUBL(stack_malloc) and linking it in if there is
not one already; if that allocation fails, the macro does
RRETURN(PCRE_ERROR_NOMEMORY). The resume point rw is recorded in the current
frame's Xwhere field, and the "arguments" are stored in the new frame, which
then becomes the current frame before control jumps to HEAP_RECURSE. The label
L_<rw> marks the place at which execution resumes after the simulated call
returns; the jump to it is made by the code at HEAP_RETURN, which is not part
of these macros. The macros rely on the variables frame, mstart and rrc, and
on those two labels, being present wherever they are expanded. */

#define RMATCH(ra,rb,rc,rd,re,rw)\
  {\
  heapframe *newframe = frame->Xnextframe;\
  if (newframe == NULL)\
    {\
    newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
    if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
    newframe->Xnextframe = NULL;\
    frame->Xnextframe = newframe;\
    }\
  frame->Xwhere = rw;\
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xeptrb = re;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* RRETURN simulates a return. It steps back to the previous frame. If there
is one, the value ra is put into rrc and control goes to HEAP_RETURN; if there
is not, this is the outermost level and a real return from the function is
done. The frame that is being left is not freed here. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  frame = oldframe->Xprevframe;\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
383
384
/* Structure for remembering the local variables in a private frame. One of
these stands in for a stack frame of match() when NO_RECURSE is defined. Each
field whose name is X<name> is reached inside match() through a macro called
<name>, so the set of fields must be kept in step with those macros. */

typedef struct heapframe {
  struct heapframe *Xprevframe;   /* Frame to go back to on RRETURN */
  struct heapframe *Xnextframe;   /* Frame to use for the next RMATCH; NULL if
                                     one has not been obtained yet */

  /* Function arguments that may change */

  PCRE_PUCHAR Xeptr;
  const pcre_uchar *Xecode;
  PCRE_PUCHAR Xmstart;
  int Xoffset_top;
  eptrblock *Xeptrb;
  unsigned int Xrdepth;           /* RMATCH sets this to the caller's + 1 */

  /* Function local variables */

  PCRE_PUCHAR Xcallpat;
#ifdef SUPPORT_UTF
  PCRE_PUCHAR Xcharptr;
#endif
  PCRE_PUCHAR Xdata;
  PCRE_PUCHAR Xnext;
  PCRE_PUCHAR Xpp;
  PCRE_PUCHAR Xprev;
  PCRE_PUCHAR Xsaved_eptr;

  recursion_info Xnew_recursive;

  BOOL Xcur_is_word;
  BOOL Xcondition;
  BOOL Xprev_is_word;

#ifdef SUPPORT_UCP
  int Xprop_type;
  unsigned int Xprop_value;
  int Xprop_fail_result;
  int Xoclength;
  pcre_uchar Xocchars[6];
#endif

  int Xcodelink;
  int Xctype;
  unsigned int Xfc;               /* fc and fi have fields of their own here; */
  int Xfi;                        /* in the recursive build they alias c and i */
  int Xlength;
  int Xmax;
  int Xmin;
  unsigned int Xnumber;
  int Xoffset;
  unsigned int Xop;
  pcre_int32 Xsave_capture_last;
  int Xsave_offset1, Xsave_offset2, Xsave_offset3;
  int Xstacksave[REC_STACK_SAVE_MAX];

  eptrblock Xnewptrb;

  /* Where to jump back to. RMATCH stores its "rw" argument (one of the RMn
  values) here before starting the simulated call. */

  int Xwhere;

} heapframe;
447
448 #endif
449
450
451 /***************************************************************************
452 ***************************************************************************/
453
454
455
456 /*************************************************
457 * Match from current position *
458 *************************************************/
459
460 /* This function is called recursively in many circumstances. Whenever it
461 returns a negative (error) response, the outer incarnation must also return the
462 same response. */
463
/* These macros pack up tests that are used for partial matching, and which
appear several times in the code. We set the "hit end" flag if the pointer is
at the end of the subject and also past the start of the subject (i.e.
something has been matched). For hard partial matching (md->partial greater
than 1), we then return PCRE_ERROR_PARTIAL immediately. The second one is used
when we already know we are past the end of the subject, so it omits the
comparison with md->end_subject.

Both macros expand to a bare "if" statement and refer to the variables md and
eptr of the scope in which they are used. Because the expansion is a bare "if",
a use that is immediately followed by "else" would attach that "else" to the
macro's own "if". */

/* Test for being at the end of the subject, then note the partial match. */

#define CHECK_PARTIAL()\
  if (md->partial != 0 && eptr >= md->end_subject && \
      eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }

/* As above, but for use when the end of the subject is known to have been
reached. */

#define SCHECK_PARTIAL()\
  if (md->partial != 0 && eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }
485
486
487 /* Performance note: It might be tempting to extract commonly used fields from
488 the md structure (e.g. utf, end_subject) into individual variables to improve
489 performance. Tests using gcc on a SPARC disproved this; in the first case, it
490 made performance worse.
491
492 Arguments:
493 eptr pointer to current character in subject
494 ecode pointer to current position in compiled code
495 mstart pointer to the current match start position (can be modified
496 by encountering \K)
497 offset_top current top pointer
498 md pointer to "static" info for the match
499 eptrb pointer to chain of blocks containing eptr at start of
500 brackets - for testing for empty matches
501 rdepth the recursion depth
502
503 Returns: MATCH_MATCH if matched ) these values are >= 0
504 MATCH_NOMATCH if failed to match )
505 a negative MATCH_xxx value for PRUNE, SKIP, etc
506 a negative PCRE_ERROR_xxx value if aborted by an error condition
507 (e.g. stopped by repeated call or recursion limit)
508 */
509
510 static int
match(REGISTER PCRE_PUCHAR eptr,REGISTER const pcre_uchar * ecode,PCRE_PUCHAR mstart,int offset_top,match_data * md,eptrblock * eptrb,unsigned int rdepth)511 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
512 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
513 unsigned int rdepth)
514 {
515 /* These variables do not need to be preserved over recursion in this function,
516 so they can be ordinary variables in all cases. Mark some of them with
517 "register" because they are used a lot in loops. */
518
519 register int rrc; /* Returns from recursive calls */
520 register int i; /* Used for loops not involving calls to RMATCH() */
521 register pcre_uint32 c; /* Character values not kept over RMATCH() calls */
522 register BOOL utf; /* Local copy of UTF flag for speed */
523
524 BOOL minimize, possessive; /* Quantifier options */
525 BOOL caseless;
526 int condcode;
527
528 /* When recursion is not being used, all "local" variables that have to be
529 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
530 frame on the stack here; subsequent instantiations are obtained from the heap
531 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
532 the top-level on the stack rather than malloc-ing them all gives a performance
533 boost in many cases where there is not much "recursion". */
534
535 #ifdef NO_RECURSE
536 heapframe *frame = (heapframe *)md->match_frames_base;
537
538 /* Copy in the original argument variables */
539
540 frame->Xeptr = eptr;
541 frame->Xecode = ecode;
542 frame->Xmstart = mstart;
543 frame->Xoffset_top = offset_top;
544 frame->Xeptrb = eptrb;
545 frame->Xrdepth = rdepth;
546
547 /* This is where control jumps back to to effect "recursion" */
548
549 HEAP_RECURSE:
550
551 /* Macros make the argument variables come from the current frame */
552
553 #define eptr frame->Xeptr
554 #define ecode frame->Xecode
555 #define mstart frame->Xmstart
556 #define offset_top frame->Xoffset_top
557 #define eptrb frame->Xeptrb
558 #define rdepth frame->Xrdepth
559
560 /* Ditto for the local variables */
561
562 #ifdef SUPPORT_UTF
563 #define charptr frame->Xcharptr
564 #endif
565 #define callpat frame->Xcallpat
566 #define codelink frame->Xcodelink
567 #define data frame->Xdata
568 #define next frame->Xnext
569 #define pp frame->Xpp
570 #define prev frame->Xprev
571 #define saved_eptr frame->Xsaved_eptr
572
573 #define new_recursive frame->Xnew_recursive
574
575 #define cur_is_word frame->Xcur_is_word
576 #define condition frame->Xcondition
577 #define prev_is_word frame->Xprev_is_word
578
579 #ifdef SUPPORT_UCP
580 #define prop_type frame->Xprop_type
581 #define prop_value frame->Xprop_value
582 #define prop_fail_result frame->Xprop_fail_result
583 #define oclength frame->Xoclength
584 #define occhars frame->Xocchars
585 #endif
586
587 #define ctype frame->Xctype
588 #define fc frame->Xfc
589 #define fi frame->Xfi
590 #define length frame->Xlength
591 #define max frame->Xmax
592 #define min frame->Xmin
593 #define number frame->Xnumber
594 #define offset frame->Xoffset
595 #define op frame->Xop
596 #define save_capture_last frame->Xsave_capture_last
597 #define save_offset1 frame->Xsave_offset1
598 #define save_offset2 frame->Xsave_offset2
599 #define save_offset3 frame->Xsave_offset3
600 #define stacksave frame->Xstacksave
601
602 #define newptrb frame->Xnewptrb
603
604 /* When recursion is being used, local variables are allocated on the stack and
605 get preserved during recursion in the normal way. In this environment, fi and
606 i, and fc and c, can be the same variables. */
607
608 #else /* NO_RECURSE not defined */
609 #define fi i
610 #define fc c
611
612 /* Many of the following variables are used only in small blocks of the code.
613 My normal style of coding would have declared them within each of those blocks.
614 However, in order to accommodate the version of this code that uses an external
615 "stack" implemented on the heap, it is easier to declare them all here, so the
616 declarations can be cut out in a block. The only declarations within blocks
617 below are for variables that do not have to be preserved over a recursive call
618 to RMATCH(). */
619
620 #ifdef SUPPORT_UTF
621 const pcre_uchar *charptr;
622 #endif
623 const pcre_uchar *callpat;
624 const pcre_uchar *data;
625 const pcre_uchar *next;
626 PCRE_PUCHAR pp;
627 const pcre_uchar *prev;
628 PCRE_PUCHAR saved_eptr;
629
630 recursion_info new_recursive;
631
632 BOOL cur_is_word;
633 BOOL condition;
634 BOOL prev_is_word;
635
636 #ifdef SUPPORT_UCP
637 int prop_type;
638 unsigned int prop_value;
639 int prop_fail_result;
640 int oclength;
641 pcre_uchar occhars[6];
642 #endif
643
644 int codelink;
645 int ctype;
646 int length;
647 int max;
648 int min;
649 unsigned int number;
650 int offset;
651 unsigned int op;
652 pcre_int32 save_capture_last;
653 int save_offset1, save_offset2, save_offset3;
654 int stacksave[REC_STACK_SAVE_MAX];
655
656 eptrblock newptrb;
657
658 /* There is a special fudge for calling match() in a way that causes it to
659 measure the size of its basic stack frame when the stack is being used for
660 recursion. The second argument (ecode) being NULL triggers this behaviour. It
661 cannot normally ever be NULL. The return is the negated value of the frame
662 size. */
663
664 if (ecode == NULL)
665 {
666 if (rdepth == 0)
667 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
668 else
669 {
670 int len = (char *)&rdepth - (char *)eptr;
671 return (len > 0)? -len : len;
672 }
673 }
674 #endif /* NO_RECURSE */
675
676 /* To save space on the stack and in the heap frame, I have doubled up on some
677 of the local variables that are used only in localised parts of the code, but
678 still need to be preserved over recursive calls of match(). These macros define
679 the alternative names that are used. */
680
681 #define allow_zero cur_is_word
682 #define cbegroup condition
683 #define code_offset codelink
684 #define condassert condition
685 #define matched_once prev_is_word
686 #define foc number
687 #define save_mark data
688
689 /* These statements are here to stop the compiler complaining about uninitialized
690 variables. */
691
692 #ifdef SUPPORT_UCP
693 prop_value = 0;
694 prop_fail_result = 0;
695 #endif
696
697
698 /* This label is used for tail recursion, which is used in a few cases even
699 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
700 used. Thanks to Ian Taylor for noticing this possibility and sending the
701 original patch. */
702
703 TAIL_RECURSE:
704
705 /* OK, now we can get on with the real code of the function. Recursive calls
706 are specified by the macro RMATCH and RRETURN is used to return. When
707 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
708 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
709 defined). However, RMATCH isn't like a function call because it's quite a
710 complicated macro. It has to be used in one particular way. This shouldn't,
711 however, impact performance when true recursion is being used. */
712
713 #ifdef SUPPORT_UTF
714 utf = md->utf; /* Local copy of the flag */
715 #else
716 utf = FALSE;
717 #endif
718
719 /* First check that we haven't called match() too many times, or that we
720 haven't exceeded the recursive call limit. */
721
722 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
723 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
724
725 /* At the start of a group with an unlimited repeat that may match an empty
726 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
727 done this way to save having to use another function argument, which would take
728 up space on the stack. See also MATCH_CONDASSERT below.
729
730 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
731 such remembered pointers, to be checked when we hit the closing ket, in order
732 to break infinite loops that match no characters. When match() is called in
733 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
734 NOT be used with tail recursion, because the memory block that is used is on
735 the stack, so a new one may be required for each match(). */
736
737 if (md->match_function_type == MATCH_CBEGROUP)
738 {
739 newptrb.epb_saved_eptr = eptr;
740 newptrb.epb_prev = eptrb;
741 eptrb = &newptrb;
742 md->match_function_type = 0;
743 }
744
745 /* Now start processing the opcodes. */
746
747 for (;;)
748 {
749 minimize = possessive = FALSE;
750 op = *ecode;
751
752 switch(op)
753 {
754 case OP_MARK:
755 md->nomatch_mark = ecode + 2;
756 md->mark = NULL; /* In case previously set by assertion */
757 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
758 eptrb, RM55);
759 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
760 md->mark == NULL) md->mark = ecode + 2;
761
762 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
763 argument, and we must check whether that argument matches this MARK's
764 argument. It is passed back in md->start_match_ptr (an overloading of that
765 variable). If it does match, we reset that variable to the current subject
766 position and return MATCH_SKIP. Otherwise, pass back the return code
767 unaltered. */
768
769 else if (rrc == MATCH_SKIP_ARG &&
770 STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0)
771 {
772 md->start_match_ptr = eptr;
773 RRETURN(MATCH_SKIP);
774 }
775 RRETURN(rrc);
776
777 case OP_FAIL:
778 RRETURN(MATCH_NOMATCH);
779
780 case OP_COMMIT:
781 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
782 eptrb, RM52);
783 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
784 RRETURN(MATCH_COMMIT);
785
786 case OP_PRUNE:
787 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
788 eptrb, RM51);
789 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
790 RRETURN(MATCH_PRUNE);
791
792 case OP_PRUNE_ARG:
793 md->nomatch_mark = ecode + 2;
794 md->mark = NULL; /* In case previously set by assertion */
795 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
796 eptrb, RM56);
797 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
798 md->mark == NULL) md->mark = ecode + 2;
799 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
800 RRETURN(MATCH_PRUNE);
801
802 case OP_SKIP:
803 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
804 eptrb, RM53);
805 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
806 md->start_match_ptr = eptr; /* Pass back current position */
807 RRETURN(MATCH_SKIP);
808
809 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
810 nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
811 not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
812 that failed and any that precede it (either they also failed, or were not
813 triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
814 SKIP_ARG gets to top level, the match is re-run with md->ignore_skip_arg
815 set to the count of the one that failed. */
816
817 case OP_SKIP_ARG:
818 md->skip_arg_count++;
819 if (md->skip_arg_count <= md->ignore_skip_arg)
820 {
821 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
822 break;
823 }
824 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
825 eptrb, RM57);
826 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
827
828 /* Pass back the current skip name by overloading md->start_match_ptr and
829 returning the special MATCH_SKIP_ARG return code. This will either be
830 caught by a matching MARK, or get to the top, where it causes a rematch
831 with md->ignore_skip_arg set to the value of md->skip_arg_count. */
832
833 md->start_match_ptr = ecode + 2;
834 RRETURN(MATCH_SKIP_ARG);
835
836 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
837 the branch in which it occurs can be determined. Overload the start of
838 match pointer to do this. */
839
840 case OP_THEN:
841 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
842 eptrb, RM54);
843 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
844 md->start_match_ptr = ecode;
845 RRETURN(MATCH_THEN);
846
847 case OP_THEN_ARG:
848 md->nomatch_mark = ecode + 2;
849 md->mark = NULL; /* In case previously set by assertion */
850 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
851 md, eptrb, RM58);
852 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
853 md->mark == NULL) md->mark = ecode + 2;
854 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
855 md->start_match_ptr = ecode;
856 RRETURN(MATCH_THEN);
857
858 /* Handle an atomic group that does not contain any capturing parentheses.
859 This can be handled like an assertion. Prior to 8.13, all atomic groups
860 were handled this way. In 8.13, the code was changed as below for ONCE, so
861 that backups pass through the group and thereby reset captured values.
862 However, this uses a lot more stack, so in 8.20, atomic groups that do not
863 contain any captures generate OP_ONCE_NC, which can be handled in the old,
864 less stack intensive way.
865
866 Check the alternative branches in turn - the matching won't pass the KET
867 for this kind of subpattern. If any one branch matches, we carry on as at
868 the end of a normal bracket, leaving the subject pointer, but resetting
869 the start-of-match value in case it was changed by \K. */
870
871 case OP_ONCE_NC:
872 prev = ecode;
873 saved_eptr = eptr;
874 save_mark = md->mark;
875 do
876 {
877 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
878 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
879 {
880 mstart = md->start_match_ptr;
881 break;
882 }
883 if (rrc == MATCH_THEN)
884 {
885 next = ecode + GET(ecode,1);
886 if (md->start_match_ptr < next &&
887 (*ecode == OP_ALT || *next == OP_ALT))
888 rrc = MATCH_NOMATCH;
889 }
890
891 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
892 ecode += GET(ecode,1);
893 md->mark = save_mark;
894 }
895 while (*ecode == OP_ALT);
896
897 /* If we hit the end of the group (which could be repeated), fail */
898
899 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
900
901 /* Continue as from after the group, updating the offsets high water
902 mark, since extracts may have been taken. */
903
904 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
905
906 offset_top = md->end_offset_top;
907 eptr = md->end_match_ptr;
908
909 /* For a non-repeating ket, just continue at this level. This also
910 happens for a repeating ket if no characters were matched in the group.
911 This is the forcible breaking of infinite loops as implemented in Perl
912 5.005. */
913
914 if (*ecode == OP_KET || eptr == saved_eptr)
915 {
916 ecode += 1+LINK_SIZE;
917 break;
918 }
919
920 /* The repeating kets try the rest of the pattern or restart from the
921 preceding bracket, in the appropriate order. The second "call" of match()
922 uses tail recursion, to avoid using another stack frame. */
923
924 if (*ecode == OP_KETRMIN)
925 {
926 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
927 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
928 ecode = prev;
929 goto TAIL_RECURSE;
930 }
931 else /* OP_KETRMAX */
932 {
933 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
934 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
935 ecode += 1 + LINK_SIZE;
936 goto TAIL_RECURSE;
937 }
938 /* Control never gets here */
939
940 /* Handle a capturing bracket, other than those that are possessive with an
941 unlimited repeat. If there is space in the offset vector, save the current
942 subject position in the working slot at the top of the vector. We mustn't
943 change the current values of the data slot, because they may be set from a
944 previous iteration of this group, and be referred to by a reference inside
945 the group. A failure to match might occur after the group has succeeded,
946 if something later on doesn't match. For this reason, we need to restore
947 the working value and also the values of the final offsets, in case they
948 were set by a previous iteration of the same bracket.
949
950 If there isn't enough space in the offset vector, treat this as if it were
951 a non-capturing bracket. Don't worry about setting the flag for the error
952 case here; that is handled in the code for KET. */
953
954 case OP_CBRA:
955 case OP_SCBRA:
956 number = GET2(ecode, 1+LINK_SIZE);
957 offset = number << 1;
958
959 #ifdef PCRE_DEBUG
960 printf("start bracket %d\n", number);
961 printf("subject=");
962 pchars(eptr, 16, TRUE, md);
963 printf("\n");
964 #endif
965
966 if (offset < md->offset_max)
967 {
968 save_offset1 = md->offset_vector[offset];
969 save_offset2 = md->offset_vector[offset+1];
970 save_offset3 = md->offset_vector[md->offset_end - number];
971 save_capture_last = md->capture_last;
972 save_mark = md->mark;
973
974 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
975 md->offset_vector[md->offset_end - number] =
976 (int)(eptr - md->start_subject);
977
978 for (;;)
979 {
980 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
981 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
982 eptrb, RM1);
983 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
984
985 /* If we backed up to a THEN, check whether it is within the current
986 branch by comparing the address of the THEN that is passed back with
987 the end of the branch. If it is within the current branch, and the
988 branch is one of two or more alternatives (it either starts or ends
989 with OP_ALT), we have reached the limit of THEN's action, so convert
990 the return code to NOMATCH, which will cause normal backtracking to
991 happen from now on. Otherwise, THEN is passed back to an outer
992 alternative. This implements Perl's treatment of parenthesized groups,
993 where a group not containing | does not affect the current alternative,
994 that is, (X) is NOT the same as (X|(*F)). */
995
996 if (rrc == MATCH_THEN)
997 {
998 next = ecode + GET(ecode,1);
999 if (md->start_match_ptr < next &&
1000 (*ecode == OP_ALT || *next == OP_ALT))
1001 rrc = MATCH_NOMATCH;
1002 }
1003
1004 /* Anything other than NOMATCH is passed back. */
1005
1006 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1007 md->capture_last = save_capture_last;
1008 ecode += GET(ecode, 1);
1009 md->mark = save_mark;
1010 if (*ecode != OP_ALT) break;
1011 }
1012
1013 DPRINTF(("bracket %d failed\n", number));
1014 md->offset_vector[offset] = save_offset1;
1015 md->offset_vector[offset+1] = save_offset2;
1016 md->offset_vector[md->offset_end - number] = save_offset3;
1017
1018 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1019
1020 RRETURN(rrc);
1021 }
1022
1023 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1024 as a non-capturing bracket. */
1025
1026 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1027 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1028
1029 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1030
1031 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1032 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1033
1034 /* Non-capturing or atomic group, except for possessive with unlimited
1035 repeat and ONCE group with no captures. Loop for all the alternatives.
1036
1037 When we get to the final alternative within the brackets, we used to return
1038 the result of a recursive call to match() whatever happened so it was
1039 possible to reduce stack usage by turning this into a tail recursion,
1040 except in the case of a possibly empty group. However, now that there is
1041 the possibility of (*THEN) occurring in the final alternative, this
1042 optimization is no longer always possible.
1043
1044 We can optimize if we know there are no (*THEN)s in the pattern; at present
1045 this is the best that can be done.
1046
1047 MATCH_ONCE is returned when the end of an atomic group is successfully
1048 reached, but subsequent matching fails. It passes back up the tree (causing
1049 captured values to be reset) until the original atomic group level is
1050 reached. This is tested by comparing md->once_target with the start of the
1051 group. At this point, the return is converted into MATCH_NOMATCH so that
1052 previous backup points can be taken. */
1053
1054 case OP_ONCE:
1055 case OP_BRA:
1056 case OP_SBRA:
1057 DPRINTF(("start non-capturing bracket\n"));
1058
1059 for (;;)
1060 {
1061 if (op >= OP_SBRA || op == OP_ONCE)
1062 md->match_function_type = MATCH_CBEGROUP;
1063
1064 /* If this is not a possibly empty group, and there are no (*THEN)s in
1065 the pattern, and this is the final alternative, optimize as described
1066 above. */
1067
1068 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1069 {
1070 ecode += PRIV(OP_lengths)[*ecode];
1071 goto TAIL_RECURSE;
1072 }
1073
1074 /* In all other cases, we have to make another call to match(). */
1075
1076 save_mark = md->mark;
1077 save_capture_last = md->capture_last;
1078 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1079 RM2);
1080
1081 /* See comment in the code for capturing groups above about handling
1082 THEN. */
1083
1084 if (rrc == MATCH_THEN)
1085 {
1086 next = ecode + GET(ecode,1);
1087 if (md->start_match_ptr < next &&
1088 (*ecode == OP_ALT || *next == OP_ALT))
1089 rrc = MATCH_NOMATCH;
1090 }
1091
1092 if (rrc != MATCH_NOMATCH)
1093 {
1094 if (rrc == MATCH_ONCE)
1095 {
1096 const pcre_uchar *scode = ecode;
1097 if (*scode != OP_ONCE) /* If not at start, find it */
1098 {
1099 while (*scode == OP_ALT) scode += GET(scode, 1);
1100 scode -= GET(scode, 1);
1101 }
1102 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1103 }
1104 RRETURN(rrc);
1105 }
1106 ecode += GET(ecode, 1);
1107 md->mark = save_mark;
1108 if (*ecode != OP_ALT) break;
1109 md->capture_last = save_capture_last;
1110 }
1111
1112 RRETURN(MATCH_NOMATCH);
1113
1114 /* Handle possessive capturing brackets with an unlimited repeat. We come
1115 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1116 handled similarly to the normal case above. However, the matching is
1117 different. The end of these brackets will always be OP_KETRPOS, which
1118 returns MATCH_KETRPOS without going further in the pattern. By this means
1119 we can handle the group by iteration rather than recursion, thereby
1120 reducing the amount of stack needed. */
1121
1122 case OP_CBRAPOS:
1123 case OP_SCBRAPOS:
1124 allow_zero = FALSE;
1125
1126 POSSESSIVE_CAPTURE:
1127 number = GET2(ecode, 1+LINK_SIZE);
1128 offset = number << 1;
1129
1130 #ifdef PCRE_DEBUG
1131 printf("start possessive bracket %d\n", number);
1132 printf("subject=");
1133 pchars(eptr, 16, TRUE, md);
1134 printf("\n");
1135 #endif
1136
1137 if (offset >= md->offset_max) goto POSSESSIVE_NON_CAPTURE;
1138
1139 matched_once = FALSE;
1140 code_offset = (int)(ecode - md->start_code);
1141
1142 save_offset1 = md->offset_vector[offset];
1143 save_offset2 = md->offset_vector[offset+1];
1144 save_offset3 = md->offset_vector[md->offset_end - number];
1145 save_capture_last = md->capture_last;
1146
1147 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1148
1149 /* Each time round the loop, save the current subject position for use
1150 when the group matches. For MATCH_MATCH, the group has matched, so we
1151 restart it with a new subject starting position, remembering that we had
1152 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1153 usual. If we haven't matched any alternatives in any iteration, check to
1154 see if a previous iteration matched. If so, the group has matched;
1155 continue from afterwards. Otherwise it has failed; restore the previous
1156 capture values before returning NOMATCH. */
1157
1158 for (;;)
1159 {
1160 md->offset_vector[md->offset_end - number] =
1161 (int)(eptr - md->start_subject);
1162 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1163 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1164 eptrb, RM63);
1165 if (rrc == MATCH_KETRPOS)
1166 {
1167 offset_top = md->end_offset_top;
1168 ecode = md->start_code + code_offset;
1169 save_capture_last = md->capture_last;
1170 matched_once = TRUE;
1171 mstart = md->start_match_ptr; /* In case \K changed it */
1172 if (eptr == md->end_match_ptr) /* Matched an empty string */
1173 {
1174 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1175 break;
1176 }
1177 eptr = md->end_match_ptr;
1178 continue;
1179 }
1180
1181 /* See comment in the code for capturing groups above about handling
1182 THEN. */
1183
1184 if (rrc == MATCH_THEN)
1185 {
1186 next = ecode + GET(ecode,1);
1187 if (md->start_match_ptr < next &&
1188 (*ecode == OP_ALT || *next == OP_ALT))
1189 rrc = MATCH_NOMATCH;
1190 }
1191
1192 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1193 md->capture_last = save_capture_last;
1194 ecode += GET(ecode, 1);
1195 if (*ecode != OP_ALT) break;
1196 }
1197
1198 if (!matched_once)
1199 {
1200 md->offset_vector[offset] = save_offset1;
1201 md->offset_vector[offset+1] = save_offset2;
1202 md->offset_vector[md->offset_end - number] = save_offset3;
1203 }
1204
1205 if (allow_zero || matched_once)
1206 {
1207 ecode += 1 + LINK_SIZE;
1208 break;
1209 }
1210
1211 RRETURN(MATCH_NOMATCH);
1212
1213 /* Non-capturing possessive bracket with unlimited repeat. We come here
1214 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1215 without the capturing complication. It is written out separately for speed
1216 and cleanliness. */
1217
1218 case OP_BRAPOS:
1219 case OP_SBRAPOS:
1220 allow_zero = FALSE;
1221
1222 POSSESSIVE_NON_CAPTURE:
1223 matched_once = FALSE;
1224 code_offset = (int)(ecode - md->start_code);
1225 save_capture_last = md->capture_last;
1226
1227 for (;;)
1228 {
1229 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1230 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1231 eptrb, RM48);
1232 if (rrc == MATCH_KETRPOS)
1233 {
1234 offset_top = md->end_offset_top;
1235 ecode = md->start_code + code_offset;
1236 matched_once = TRUE;
1237 mstart = md->start_match_ptr; /* In case \K reset it */
1238 if (eptr == md->end_match_ptr) /* Matched an empty string */
1239 {
1240 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1241 break;
1242 }
1243 eptr = md->end_match_ptr;
1244 continue;
1245 }
1246
1247 /* See comment in the code for capturing groups above about handling
1248 THEN. */
1249
1250 if (rrc == MATCH_THEN)
1251 {
1252 next = ecode + GET(ecode,1);
1253 if (md->start_match_ptr < next &&
1254 (*ecode == OP_ALT || *next == OP_ALT))
1255 rrc = MATCH_NOMATCH;
1256 }
1257
1258 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1259 ecode += GET(ecode, 1);
1260 if (*ecode != OP_ALT) break;
1261 md->capture_last = save_capture_last;
1262 }
1263
1264 if (matched_once || allow_zero)
1265 {
1266 ecode += 1 + LINK_SIZE;
1267 break;
1268 }
1269 RRETURN(MATCH_NOMATCH);
1270
1271 /* Control never reaches here. */
1272
1273 /* Conditional group: compilation checked that there are no more than two
1274 branches. If the condition is false, skipping the first branch takes us
1275 past the end of the item if there is only one branch, but that's exactly
1276 what we want. */
1277
1278 case OP_COND:
1279 case OP_SCOND:
1280
1281 /* The variable codelink will be added to ecode when the condition is
1282 false, to get to the second branch. Setting it to the offset to the ALT
1283 or KET, then incrementing ecode achieves this effect. We now have ecode
1284 pointing to the condition or callout. */
1285
1286 codelink = GET(ecode, 1); /* Offset to the second branch */
1287 ecode += 1 + LINK_SIZE; /* From this opcode */
1288
1289 /* Because of the way auto-callout works during compile, a callout item is
1290 inserted between OP_COND and an assertion condition. */
1291
1292 if (*ecode == OP_CALLOUT)
1293 {
1294 if (PUBL(callout) != NULL)
1295 {
1296 PUBL(callout_block) cb;
1297 cb.version = 2; /* Version 1 of the callout block */
1298 cb.callout_number = ecode[1];
1299 cb.offset_vector = md->offset_vector;
1300 #if defined COMPILE_PCRE8
1301 cb.subject = (PCRE_SPTR)md->start_subject;
1302 #elif defined COMPILE_PCRE16
1303 cb.subject = (PCRE_SPTR16)md->start_subject;
1304 #elif defined COMPILE_PCRE32
1305 cb.subject = (PCRE_SPTR32)md->start_subject;
1306 #endif
1307 cb.subject_length = (int)(md->end_subject - md->start_subject);
1308 cb.start_match = (int)(mstart - md->start_subject);
1309 cb.current_position = (int)(eptr - md->start_subject);
1310 cb.pattern_position = GET(ecode, 2);
1311 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1312 cb.capture_top = offset_top/2;
1313 cb.capture_last = md->capture_last & CAPLMASK;
1314 /* Internal change requires this for API compatibility. */
1315 if (cb.capture_last == 0) cb.capture_last = -1;
1316 cb.callout_data = md->callout_data;
1317 cb.mark = md->nomatch_mark;
1318 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1319 if (rrc < 0) RRETURN(rrc);
1320 }
1321
1322 /* Advance ecode past the callout, so it now points to the condition. We
1323 must adjust codelink so that the value of ecode+codelink is unchanged. */
1324
1325 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1326 codelink -= PRIV(OP_lengths)[OP_CALLOUT];
1327 }
1328
1329 /* Test the various possible conditions */
1330
1331 condition = FALSE;
1332 switch(condcode = *ecode)
1333 {
1334 case OP_RREF: /* Numbered group recursion test */
1335 if (md->recursive != NULL) /* Not recursing => FALSE */
1336 {
1337 unsigned int recno = GET2(ecode, 1); /* Recursion group number*/
1338 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1339 }
1340 break;
1341
1342 case OP_DNRREF: /* Duplicate named group recursion test */
1343 if (md->recursive != NULL)
1344 {
1345 int count = GET2(ecode, 1 + IMM2_SIZE);
1346 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
1347 while (count-- > 0)
1348 {
1349 unsigned int recno = GET2(slot, 0);
1350 condition = recno == md->recursive->group_num;
1351 if (condition) break;
1352 slot += md->name_entry_size;
1353 }
1354 }
1355 break;
1356
1357 case OP_CREF: /* Numbered group used test */
1358 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1359 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1360 break;
1361
1362 case OP_DNCREF: /* Duplicate named group used test */
1363 {
1364 int count = GET2(ecode, 1 + IMM2_SIZE);
1365 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
1366 while (count-- > 0)
1367 {
1368 offset = GET2(slot, 0) << 1;
1369 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1370 if (condition) break;
1371 slot += md->name_entry_size;
1372 }
1373 }
1374 break;
1375
1376 case OP_DEF: /* DEFINE - always false */
1377 case OP_FAIL: /* From optimized (?!) condition */
1378 break;
1379
1380 /* The condition is an assertion. Call match() to evaluate it - setting
1381 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end
1382 of an assertion. */
1383
1384 default:
1385 md->match_function_type = MATCH_CONDASSERT;
1386 RMATCH(eptr, ecode, offset_top, md, NULL, RM3);
1387 if (rrc == MATCH_MATCH)
1388 {
1389 if (md->end_offset_top > offset_top)
1390 offset_top = md->end_offset_top; /* Captures may have happened */
1391 condition = TRUE;
1392
1393 /* Advance ecode past the assertion to the start of the first branch,
1394 but adjust it so that the general choosing code below works. If the
1395 assertion has a quantifier that allows zero repeats we must skip over
1396 the BRAZERO. This is a lunatic thing to do, but somebody did! */
1397
1398 if (*ecode == OP_BRAZERO) ecode++;
1399 ecode += GET(ecode, 1);
1400 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1401 ecode += 1 + LINK_SIZE - PRIV(OP_lengths)[condcode];
1402 }
1403
1404 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1405 assertion; it is therefore treated as NOMATCH. Any other return is an
1406 error. */
1407
1408 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1409 {
1410 RRETURN(rrc); /* Need braces because of following else */
1411 }
1412 break;
1413 }
1414
1415 /* Choose branch according to the condition */
1416
1417 ecode += condition? PRIV(OP_lengths)[condcode] : codelink;
1418
1419 /* We are now at the branch that is to be obeyed. As there is only one, we
1420 can use tail recursion to avoid using another stack frame, except when
1421 there is unlimited repeat of a possibly empty group. In the latter case, a
1422 recursive call to match() is always required, unless the second alternative
1423 doesn't exist, in which case we can just plough on. Note that, for
1424 compatibility with Perl, the | in a conditional group is NOT treated as
1425 creating two alternatives. If a THEN is encountered in the branch, it
1426 propagates out to the enclosing alternative (unless nested in a deeper set
1427 of alternatives, of course). */
1428
1429 if (condition || ecode[-(1+LINK_SIZE)] == OP_ALT)
1430 {
1431 if (op != OP_SCOND)
1432 {
1433 goto TAIL_RECURSE;
1434 }
1435
1436 md->match_function_type = MATCH_CBEGROUP;
1437 RMATCH(eptr, ecode, offset_top, md, eptrb, RM49);
1438 RRETURN(rrc);
1439 }
1440
1441 /* Condition false & no alternative; continue after the group. */
1442
1443 else
1444 {
1445 }
1446 break;
1447
1448
1449 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1450 to close any currently open capturing brackets. */
1451
1452 case OP_CLOSE:
1453 number = GET2(ecode, 1); /* Must be less than 65536 */
1454 offset = number << 1;
1455
1456 #ifdef PCRE_DEBUG
1457 printf("end bracket %d at *ACCEPT", number);
1458 printf("\n");
1459 #endif
1460
1461 md->capture_last = (md->capture_last & OVFLMASK) | number;
1462 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1463 {
1464 md->offset_vector[offset] =
1465 md->offset_vector[md->offset_end - number];
1466 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1467
1468 /* If this group is at or above the current highwater mark, ensure that
1469 any groups between the current high water mark and this group are marked
1470 unset and then update the high water mark. */
1471
1472 if (offset >= offset_top)
1473 {
1474 register int *iptr = md->offset_vector + offset_top;
1475 register int *iend = md->offset_vector + offset;
1476 while (iptr < iend) *iptr++ = -1;
1477 offset_top = offset + 2;
1478 }
1479 }
1480 ecode += 1 + IMM2_SIZE;
1481 break;
1482
1483
1484 /* End of the pattern, either real or forced. */
1485
1486 case OP_END:
1487 case OP_ACCEPT:
1488 case OP_ASSERT_ACCEPT:
1489
1490 /* If we have matched an empty string, fail if not in an assertion and not
1491 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1492 is set and we have matched at the start of the subject. In both cases,
1493 backtracking will then try other alternatives, if any. */
1494
1495 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1496 md->recursive == NULL &&
1497 (md->notempty ||
1498 (md->notempty_atstart &&
1499 mstart == md->start_subject + md->start_offset)))
1500 RRETURN(MATCH_NOMATCH);
1501
1502 /* Otherwise, we have a match. */
1503
1504 md->end_match_ptr = eptr; /* Record where we ended */
1505 md->end_offset_top = offset_top; /* and how many extracts were taken */
1506 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1507
1508 /* For some reason, the macros don't work properly if an expression is
1509 given as the argument to RRETURN when the heap is in use. */
1510
1511 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1512 RRETURN(rrc);
1513
1514 /* Assertion brackets. Check the alternative branches in turn - the
1515 matching won't pass the KET for an assertion. If any one branch matches,
1516 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1517 start of each branch to move the current point backwards, so the code at
1518 this level is identical to the lookahead case. When the assertion is part
1519 of a condition, we want to return immediately afterwards. The caller of
1520 this incarnation of the match() function will have set MATCH_CONDASSERT in
1521 md->match_function_type, and one of these opcodes will be the first opcode
1522 that is processed. We use a local variable that is preserved over calls to
1523 match() to remember this case. */
1524
1525 case OP_ASSERT:
1526 case OP_ASSERTBACK:
1527 save_mark = md->mark;
1528 if (md->match_function_type == MATCH_CONDASSERT)
1529 {
1530 condassert = TRUE;
1531 md->match_function_type = 0;
1532 }
1533 else condassert = FALSE;
1534
1535 /* Loop for each branch */
1536
1537 do
1538 {
1539 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1540
1541 /* A match means that the assertion is true; break out of the loop
1542 that matches its alternatives. */
1543
1544 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1545 {
1546 mstart = md->start_match_ptr; /* In case \K reset it */
1547 break;
1548 }
1549
1550 /* If not matched, restore the previous mark setting. */
1551
1552 md->mark = save_mark;
1553
1554 /* See comment in the code for capturing groups above about handling
1555 THEN. */
1556
1557 if (rrc == MATCH_THEN)
1558 {
1559 next = ecode + GET(ecode,1);
1560 if (md->start_match_ptr < next &&
1561 (*ecode == OP_ALT || *next == OP_ALT))
1562 rrc = MATCH_NOMATCH;
1563 }
1564
1565 /* Anything other than NOMATCH causes the entire assertion to fail,
1566 passing back the return code. This includes COMMIT, SKIP, PRUNE and an
1567 uncaptured THEN, which means they take their normal effect. This
1568 consistent approach does not always have exactly the same effect as in
1569 Perl. */
1570
1571 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1572 ecode += GET(ecode, 1);
1573 }
1574 while (*ecode == OP_ALT); /* Continue for next alternative */
1575
1576 /* If we have tried all the alternative branches, the assertion has
1577 failed. If not, we broke out after a match. */
1578
1579 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1580
1581 /* If checking an assertion for a condition, return MATCH_MATCH. */
1582
1583 if (condassert) RRETURN(MATCH_MATCH);
1584
1585 /* Continue from after a successful assertion, updating the offsets high
1586 water mark, since extracts may have been taken during the assertion. */
1587
1588 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1589 ecode += 1 + LINK_SIZE;
1590 offset_top = md->end_offset_top;
1591 continue;
1592
1593 /* Negative assertion: all branches must fail to match for the assertion to
1594 succeed. */
1595
1596 case OP_ASSERT_NOT:
1597 case OP_ASSERTBACK_NOT:
1598 save_mark = md->mark;
1599 if (md->match_function_type == MATCH_CONDASSERT)
1600 {
1601 condassert = TRUE;
1602 md->match_function_type = 0;
1603 }
1604 else condassert = FALSE;
1605
1606 /* Loop for each alternative branch. */
1607
1608 do
1609 {
1610 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1611 md->mark = save_mark; /* Always restore the mark setting */
1612
1613 switch(rrc)
1614 {
1615 case MATCH_MATCH: /* A successful match means */
1616 case MATCH_ACCEPT: /* the assertion has failed. */
1617 RRETURN(MATCH_NOMATCH);
1618
1619 case MATCH_NOMATCH: /* Carry on with next branch */
1620 break;
1621
1622 /* See comment in the code for capturing groups above about handling
1623 THEN. */
1624
1625 case MATCH_THEN:
1626 next = ecode + GET(ecode,1);
1627 if (md->start_match_ptr < next &&
1628 (*ecode == OP_ALT || *next == OP_ALT))
1629 {
1630 rrc = MATCH_NOMATCH;
1631 break;
1632 }
1633 /* Otherwise fall through. */
1634
1635 /* COMMIT, SKIP, PRUNE, and an uncaptured THEN cause the whole
1636 assertion to fail to match, without considering any more alternatives.
1637 Failing to match means the assertion is true. This is a consistent
1638 approach, but does not always have the same effect as in Perl. */
1639
1640 case MATCH_COMMIT:
1641 case MATCH_SKIP:
1642 case MATCH_SKIP_ARG:
1643 case MATCH_PRUNE:
1644 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1645 goto NEG_ASSERT_TRUE; /* Break out of alternation loop */
1646
1647 /* Anything else is an error */
1648
1649 default:
1650 RRETURN(rrc);
1651 }
1652
1653 /* Continue with next branch */
1654
1655 ecode += GET(ecode,1);
1656 }
1657 while (*ecode == OP_ALT);
1658
1659 /* All branches in the assertion failed to match. */
1660
1661 NEG_ASSERT_TRUE:
1662 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1663 ecode += 1 + LINK_SIZE; /* Continue with current branch */
1664 continue;
1665
1666 /* Move the subject pointer back. This occurs only at the start of
1667 each branch of a lookbehind assertion. If we are too close to the start to
1668 move back, this match function fails. When working with UTF-8 we move
1669 back a number of characters, not bytes. */
1670
1671 case OP_REVERSE:
1672 #ifdef SUPPORT_UTF
1673 if (utf)
1674 {
1675 i = GET(ecode, 1);
1676 while (i-- > 0)
1677 {
1678 eptr--;
1679 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1680 BACKCHAR(eptr);
1681 }
1682 }
1683 else
1684 #endif
1685
1686 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1687
1688 {
1689 eptr -= GET(ecode, 1);
1690 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1691 }
1692
1693 /* Save the earliest consulted character, then skip to next op code */
1694
1695 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1696 ecode += 1 + LINK_SIZE;
1697 break;
1698
1699 /* The callout item calls an external function, if one is provided, passing
1700 details of the match so far. This is mainly for debugging, though the
1701 function is able to force a failure. */
1702
1703 case OP_CALLOUT:
1704 if (PUBL(callout) != NULL)
1705 {
1706 PUBL(callout_block) cb;
1707 cb.version = 2; /* Version 1 of the callout block */
1708 cb.callout_number = ecode[1];
1709 cb.offset_vector = md->offset_vector;
1710 #if defined COMPILE_PCRE8
1711 cb.subject = (PCRE_SPTR)md->start_subject;
1712 #elif defined COMPILE_PCRE16
1713 cb.subject = (PCRE_SPTR16)md->start_subject;
1714 #elif defined COMPILE_PCRE32
1715 cb.subject = (PCRE_SPTR32)md->start_subject;
1716 #endif
1717 cb.subject_length = (int)(md->end_subject - md->start_subject);
1718 cb.start_match = (int)(mstart - md->start_subject);
1719 cb.current_position = (int)(eptr - md->start_subject);
1720 cb.pattern_position = GET(ecode, 2);
1721 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1722 cb.capture_top = offset_top/2;
1723 cb.capture_last = md->capture_last & CAPLMASK;
1724 /* Internal change requires this for API compatibility. */
1725 if (cb.capture_last == 0) cb.capture_last = -1;
1726 cb.callout_data = md->callout_data;
1727 cb.mark = md->nomatch_mark;
1728 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1729 if (rrc < 0) RRETURN(rrc);
1730 }
1731 ecode += 2 + 2*LINK_SIZE;
1732 break;
1733
1734 /* Recursion either matches the current regex, or some subexpression. The
1735 offset data is the offset to the starting bracket from the start of the
1736 whole pattern. (This is so that it works from duplicated subpatterns.)
1737
1738 The state of the capturing groups is preserved over recursion, and
1739 re-instated afterwards. We don't know how many are started and not yet
1740 finished (offset_top records the completed total) so we just have to save
1741 all the potential data. There may be up to 65535 such values, which is too
1742 large to put on the stack, but using malloc for small numbers seems
1743 expensive. As a compromise, the stack is used when there are no more than
1744 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1745
1746 There are also other values that have to be saved. We use a chained
1747 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1748 for the original version of this logic. It has, however, been hacked around
1749 a lot, so he is not to blame for the current way it works. */
1750
1751 case OP_RECURSE:
1752 {
1753 recursion_info *ri;
1754 unsigned int recno;
1755
1756 callpat = md->start_code + GET(ecode, 1);
1757 recno = (callpat == md->start_code)? 0 :
1758 GET2(callpat, 1 + LINK_SIZE);
1759
1760 /* Check for repeating a recursion without advancing the subject pointer.
1761 This should catch convoluted mutual recursions. (Some simple cases are
1762 caught at compile time.) */
1763
1764 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1765 if (recno == ri->group_num && eptr == ri->subject_position)
1766 RRETURN(PCRE_ERROR_RECURSELOOP);
1767
1768 /* Add to "recursing stack" */
1769
1770 new_recursive.group_num = recno;
1771 new_recursive.saved_capture_last = md->capture_last;
1772 new_recursive.subject_position = eptr;
1773 new_recursive.prevrec = md->recursive;
1774 md->recursive = &new_recursive;
1775
1776 /* Where to continue from afterwards */
1777
1778 ecode += 1 + LINK_SIZE;
1779
1780 /* Now save the offset data */
1781
1782 new_recursive.saved_max = md->offset_end;
1783 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1784 new_recursive.offset_save = stacksave;
1785 else
1786 {
1787 new_recursive.offset_save =
1788 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1789 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1790 }
1791 memcpy(new_recursive.offset_save, md->offset_vector,
1792 new_recursive.saved_max * sizeof(int));
1793
1794 /* OK, now we can do the recursion. After processing each alternative,
1795 restore the offset data and the last captured value. If there were nested
1796 recursions, md->recursive might be changed, so reset it before looping.
1797 */
1798
1799 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1800 cbegroup = (*callpat >= OP_SBRA);
1801 do
1802 {
1803 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1804 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1805 md, eptrb, RM6);
1806 memcpy(md->offset_vector, new_recursive.offset_save,
1807 new_recursive.saved_max * sizeof(int));
1808 md->capture_last = new_recursive.saved_capture_last;
1809 md->recursive = new_recursive.prevrec;
1810 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1811 {
1812 DPRINTF(("Recursion matched\n"));
1813 if (new_recursive.offset_save != stacksave)
1814 (PUBL(free))(new_recursive.offset_save);
1815
1816 /* Set where we got to in the subject, and reset the start in case
1817 it was changed by \K. This *is* propagated back out of a recursion,
1818 for Perl compatibility. */
1819
1820 eptr = md->end_match_ptr;
1821 mstart = md->start_match_ptr;
1822 goto RECURSION_MATCHED; /* Exit loop; end processing */
1823 }
1824
1825 /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a
1826 recursion; they cause a NOMATCH for the entire recursion. These codes
1827 are defined in a range that can be tested for. */
1828
1829 if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX)
1830 {
1831 if (new_recursive.offset_save != stacksave)
1832 (PUBL(free))(new_recursive.offset_save);
1833 RRETURN(MATCH_NOMATCH);
1834 }
1835
1836 /* Any return code other than NOMATCH is an error. */
1837
1838 if (rrc != MATCH_NOMATCH)
1839 {
1840 DPRINTF(("Recursion gave error %d\n", rrc));
1841 if (new_recursive.offset_save != stacksave)
1842 (PUBL(free))(new_recursive.offset_save);
1843 RRETURN(rrc);
1844 }
1845
1846 md->recursive = &new_recursive;
1847 callpat += GET(callpat, 1);
1848 }
1849 while (*callpat == OP_ALT);
1850
1851 DPRINTF(("Recursion didn't match\n"));
1852 md->recursive = new_recursive.prevrec;
1853 if (new_recursive.offset_save != stacksave)
1854 (PUBL(free))(new_recursive.offset_save);
1855 RRETURN(MATCH_NOMATCH);
1856 }
1857
1858 RECURSION_MATCHED:
1859 break;
1860
1861 /* An alternation is the end of a branch; scan along to find the end of the
1862 bracketed group and go to there. */
1863
1864 case OP_ALT:
1865 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1866 break;
1867
1868 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1869 indicating that it may occur zero times. It may repeat infinitely, or not
1870 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1871 with fixed upper repeat limits are compiled as a number of copies, with the
1872 optional ones preceded by BRAZERO or BRAMINZERO. */
1873
1874 case OP_BRAZERO:
1875 next = ecode + 1;
1876 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1877 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1878 do next += GET(next, 1); while (*next == OP_ALT);
1879 ecode = next + 1 + LINK_SIZE;
1880 break;
1881
1882 case OP_BRAMINZERO:
1883 next = ecode + 1;
1884 do next += GET(next, 1); while (*next == OP_ALT);
1885 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1886 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1887 ecode++;
1888 break;
1889
1890 case OP_SKIPZERO:
1891 next = ecode+1;
1892 do next += GET(next,1); while (*next == OP_ALT);
1893 ecode = next + 1 + LINK_SIZE;
1894 break;
1895
1896 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1897 here; just jump to the group, with allow_zero set TRUE. */
1898
1899 case OP_BRAPOSZERO:
1900 op = *(++ecode);
1901 allow_zero = TRUE;
1902 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1903 goto POSSESSIVE_NON_CAPTURE;
1904
1905 /* End of a group, repeated or non-repeating. */
1906
1907 case OP_KET:
1908 case OP_KETRMIN:
1909 case OP_KETRMAX:
1910 case OP_KETRPOS:
1911 prev = ecode - GET(ecode, 1);
1912
1913 /* If this was a group that remembered the subject start, in order to break
1914 infinite repeats of empty string matches, retrieve the subject start from
1915 the chain. Otherwise, set it NULL. */
1916
1917 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1918 {
1919 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1920 eptrb = eptrb->epb_prev; /* Backup to previous group */
1921 }
1922 else saved_eptr = NULL;
1923
1924 /* If we are at the end of an assertion group or a non-capturing atomic
1925 group, stop matching and return MATCH_MATCH, but record the current high
1926 water mark for use by positive assertions. We also need to record the match
1927 start in case it was changed by \K. */
1928
1929 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1930 *prev == OP_ONCE_NC)
1931 {
1932 md->end_match_ptr = eptr; /* For ONCE_NC */
1933 md->end_offset_top = offset_top;
1934 md->start_match_ptr = mstart;
1935 RRETURN(MATCH_MATCH); /* Sets md->mark */
1936 }
1937
1938 /* For capturing groups we have to check the group number back at the start
1939 and if necessary complete handling an extraction by setting the offsets and
1940 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1941 into group 0, so it won't be picked up here. Instead, we catch it when the
1942 OP_END is reached. Other recursion is handled here. We just have to record
1943 the current subject position and start match pointer and give a MATCH
1944 return. */
1945
1946 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1947 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1948 {
1949 number = GET2(prev, 1+LINK_SIZE);
1950 offset = number << 1;
1951
1952 #ifdef PCRE_DEBUG
1953 printf("end bracket %d", number);
1954 printf("\n");
1955 #endif
1956
1957 /* Handle a recursively called group. */
1958
1959 if (md->recursive != NULL && md->recursive->group_num == number)
1960 {
1961 md->end_match_ptr = eptr;
1962 md->start_match_ptr = mstart;
1963 RRETURN(MATCH_MATCH);
1964 }
1965
1966 /* Deal with capturing */
1967
1968 md->capture_last = (md->capture_last & OVFLMASK) | number;
1969 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1970 {
1971 /* If offset is greater than offset_top, it means that we are
1972 "skipping" a capturing group, and that group's offsets must be marked
1973 unset. In earlier versions of PCRE, all the offsets were unset at the
1974 start of matching, but this doesn't work because atomic groups and
1975 assertions can cause a value to be set that should later be unset.
1976 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1977 part of the atomic group, but this is not on the final matching path,
1978 so must be unset when 2 is set. (If there is no group 2, there is no
1979 problem, because offset_top will then be 2, indicating no capture.) */
1980
1981 if (offset > offset_top)
1982 {
1983 register int *iptr = md->offset_vector + offset_top;
1984 register int *iend = md->offset_vector + offset;
1985 while (iptr < iend) *iptr++ = -1;
1986 }
1987
1988 /* Now make the extraction */
1989
1990 md->offset_vector[offset] =
1991 md->offset_vector[md->offset_end - number];
1992 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1993 if (offset_top <= offset) offset_top = offset + 2;
1994 }
1995 }
1996
1997 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1998 and return MATCH_KETRPOS. This makes it possible to do the repeats one
1999 at a time from the outer level, thus saving stack. This must precede the
2000 empty string test - in this case that test is done at the outer level. */
2001
2002 if (*ecode == OP_KETRPOS)
2003 {
2004 md->start_match_ptr = mstart; /* In case \K reset it */
2005 md->end_match_ptr = eptr;
2006 md->end_offset_top = offset_top;
2007 RRETURN(MATCH_KETRPOS);
2008 }
2009
2010 /* For an ordinary non-repeating ket, just continue at this level. This
2011 also happens for a repeating ket if no characters were matched in the
2012 group. This is the forcible breaking of infinite loops as implemented in
2013 Perl 5.005. For a non-repeating atomic group that includes captures,
2014 establish a backup point by processing the rest of the pattern at a lower
2015 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
2016 original OP_ONCE level, thereby bypassing intermediate backup points, but
2017 resetting any captures that happened along the way. */
2018
2019 if (*ecode == OP_KET || eptr == saved_eptr)
2020 {
2021 if (*prev == OP_ONCE)
2022 {
2023 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
2024 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2025 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2026 RRETURN(MATCH_ONCE);
2027 }
2028 ecode += 1 + LINK_SIZE; /* Carry on at this level */
2029 break;
2030 }
2031
2032 /* The normal repeating kets try the rest of the pattern or restart from
2033 the preceding bracket, in the appropriate order. In the second case, we can
2034 use tail recursion to avoid using another stack frame, unless we have an
2035 atomic group or an unlimited repeat of a group that can match an empty
2036 string. */
2037
2038 if (*ecode == OP_KETRMIN)
2039 {
2040 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2041 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2042 if (*prev == OP_ONCE)
2043 {
2044 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2045 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2046 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2047 RRETURN(MATCH_ONCE);
2048 }
2049 if (*prev >= OP_SBRA) /* Could match an empty string */
2050 {
2051 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2052 RRETURN(rrc);
2053 }
2054 ecode = prev;
2055 goto TAIL_RECURSE;
2056 }
2057 else /* OP_KETRMAX */
2058 {
2059 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2060 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2061 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2062 if (*prev == OP_ONCE)
2063 {
2064 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2065 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2066 md->once_target = prev;
2067 RRETURN(MATCH_ONCE);
2068 }
2069 ecode += 1 + LINK_SIZE;
2070 goto TAIL_RECURSE;
2071 }
2072 /* Control never gets here */
2073
2074 /* Not multiline mode: start of subject assertion, unless notbol. */
2075
2076 case OP_CIRC:
2077 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2078
2079 /* Start of subject assertion */
2080
2081 case OP_SOD:
2082 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2083 ecode++;
2084 break;
2085
2086 /* Multiline mode: start of subject unless notbol, or after any newline. */
2087
2088 case OP_CIRCM:
2089 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2090 if (eptr != md->start_subject &&
2091 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2092 RRETURN(MATCH_NOMATCH);
2093 ecode++;
2094 break;
2095
2096 /* Start of match assertion */
2097
2098 case OP_SOM:
2099 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2100 ecode++;
2101 break;
2102
2103 /* Reset the start of match point */
2104
2105 case OP_SET_SOM:
2106 mstart = eptr;
2107 ecode++;
2108 break;
2109
2110 /* Multiline mode: assert before any newline, or before end of subject
2111 unless noteol is set. */
2112
2113 case OP_DOLLM:
2114 if (eptr < md->end_subject)
2115 {
2116 if (!IS_NEWLINE(eptr))
2117 {
2118 if (md->partial != 0 &&
2119 eptr + 1 >= md->end_subject &&
2120 NLBLOCK->nltype == NLTYPE_FIXED &&
2121 NLBLOCK->nllen == 2 &&
2122 UCHAR21TEST(eptr) == NLBLOCK->nl[0])
2123 {
2124 md->hitend = TRUE;
2125 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2126 }
2127 RRETURN(MATCH_NOMATCH);
2128 }
2129 }
2130 else
2131 {
2132 if (md->noteol) RRETURN(MATCH_NOMATCH);
2133 SCHECK_PARTIAL();
2134 }
2135 ecode++;
2136 break;
2137
2138 /* Not multiline mode: assert before a terminating newline or before end of
2139 subject unless noteol is set. */
2140
2141 case OP_DOLL:
2142 if (md->noteol) RRETURN(MATCH_NOMATCH);
2143 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2144
2145 /* ... else fall through for endonly */
2146
2147 /* End of subject assertion (\z) */
2148
2149 case OP_EOD:
2150 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2151 SCHECK_PARTIAL();
2152 ecode++;
2153 break;
2154
2155 /* End of subject or ending \n assertion (\Z) */
2156
2157 case OP_EODN:
2158 ASSERT_NL_OR_EOS:
2159 if (eptr < md->end_subject &&
2160 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2161 {
2162 if (md->partial != 0 &&
2163 eptr + 1 >= md->end_subject &&
2164 NLBLOCK->nltype == NLTYPE_FIXED &&
2165 NLBLOCK->nllen == 2 &&
2166 UCHAR21TEST(eptr) == NLBLOCK->nl[0])
2167 {
2168 md->hitend = TRUE;
2169 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2170 }
2171 RRETURN(MATCH_NOMATCH);
2172 }
2173
2174 /* Either at end of string or \n before end. */
2175
2176 SCHECK_PARTIAL();
2177 ecode++;
2178 break;
2179
2180 /* Word boundary assertions */
2181
2182 case OP_NOT_WORD_BOUNDARY:
2183 case OP_WORD_BOUNDARY:
2184 {
2185
2186 /* Find out if the previous and current characters are "word" characters.
2187 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2188 be "non-word" characters. Remember the earliest consulted character for
2189 partial matching. */
2190
2191 #ifdef SUPPORT_UTF
2192 if (utf)
2193 {
2194 /* Get status of previous character */
2195
2196 if (eptr == md->start_subject) prev_is_word = FALSE; else
2197 {
2198 PCRE_PUCHAR lastptr = eptr - 1;
2199 BACKCHAR(lastptr);
2200 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2201 GETCHAR(c, lastptr);
2202 #ifdef SUPPORT_UCP
2203 if (md->use_ucp)
2204 {
2205 if (c == '_') prev_is_word = TRUE; else
2206 {
2207 int cat = UCD_CATEGORY(c);
2208 prev_is_word = (cat == ucp_L || cat == ucp_N);
2209 }
2210 }
2211 else
2212 #endif
2213 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2214 }
2215
2216 /* Get status of next character */
2217
2218 if (eptr >= md->end_subject)
2219 {
2220 SCHECK_PARTIAL();
2221 cur_is_word = FALSE;
2222 }
2223 else
2224 {
2225 GETCHAR(c, eptr);
2226 #ifdef SUPPORT_UCP
2227 if (md->use_ucp)
2228 {
2229 if (c == '_') cur_is_word = TRUE; else
2230 {
2231 int cat = UCD_CATEGORY(c);
2232 cur_is_word = (cat == ucp_L || cat == ucp_N);
2233 }
2234 }
2235 else
2236 #endif
2237 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2238 }
2239 }
2240 else
2241 #endif
2242
2243 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2244 consistency with the behaviour of \w we do use it in this case. */
2245
2246 {
2247 /* Get status of previous character */
2248
2249 if (eptr == md->start_subject) prev_is_word = FALSE; else
2250 {
2251 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2252 #ifdef SUPPORT_UCP
2253 if (md->use_ucp)
2254 {
2255 c = eptr[-1];
2256 if (c == '_') prev_is_word = TRUE; else
2257 {
2258 int cat = UCD_CATEGORY(c);
2259 prev_is_word = (cat == ucp_L || cat == ucp_N);
2260 }
2261 }
2262 else
2263 #endif
2264 prev_is_word = MAX_255(eptr[-1])
2265 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2266 }
2267
2268 /* Get status of next character */
2269
2270 if (eptr >= md->end_subject)
2271 {
2272 SCHECK_PARTIAL();
2273 cur_is_word = FALSE;
2274 }
2275 else
2276 #ifdef SUPPORT_UCP
2277 if (md->use_ucp)
2278 {
2279 c = *eptr;
2280 if (c == '_') cur_is_word = TRUE; else
2281 {
2282 int cat = UCD_CATEGORY(c);
2283 cur_is_word = (cat == ucp_L || cat == ucp_N);
2284 }
2285 }
2286 else
2287 #endif
2288 cur_is_word = MAX_255(*eptr)
2289 && ((md->ctypes[*eptr] & ctype_word) != 0);
2290 }
2291
2292 /* Now see if the situation is what we want */
2293
2294 if ((*ecode++ == OP_WORD_BOUNDARY)?
2295 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2296 RRETURN(MATCH_NOMATCH);
2297 }
2298 break;
2299
2300 /* Match any single character type except newline; have to take care with
2301 CRLF newlines and partial matching. */
2302
2303 case OP_ANY:
2304 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2305 if (md->partial != 0 &&
2306 eptr + 1 >= md->end_subject &&
2307 NLBLOCK->nltype == NLTYPE_FIXED &&
2308 NLBLOCK->nllen == 2 &&
2309 UCHAR21TEST(eptr) == NLBLOCK->nl[0])
2310 {
2311 md->hitend = TRUE;
2312 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2313 }
2314
2315 /* Fall through */
2316
2317 /* Match any single character whatsoever. */
2318
2319 case OP_ALLANY:
2320 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2321 { /* not be updated before SCHECK_PARTIAL. */
2322 SCHECK_PARTIAL();
2323 RRETURN(MATCH_NOMATCH);
2324 }
2325 eptr++;
2326 #ifdef SUPPORT_UTF
2327 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2328 #endif
2329 ecode++;
2330 break;
2331
2332 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2333 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2334
2335 case OP_ANYBYTE:
2336 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2337 { /* not be updated before SCHECK_PARTIAL. */
2338 SCHECK_PARTIAL();
2339 RRETURN(MATCH_NOMATCH);
2340 }
2341 eptr++;
2342 ecode++;
2343 break;
2344
2345 case OP_NOT_DIGIT:
2346 if (eptr >= md->end_subject)
2347 {
2348 SCHECK_PARTIAL();
2349 RRETURN(MATCH_NOMATCH);
2350 }
2351 GETCHARINCTEST(c, eptr);
2352 if (
2353 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2354 c < 256 &&
2355 #endif
2356 (md->ctypes[c] & ctype_digit) != 0
2357 )
2358 RRETURN(MATCH_NOMATCH);
2359 ecode++;
2360 break;
2361
2362 case OP_DIGIT:
2363 if (eptr >= md->end_subject)
2364 {
2365 SCHECK_PARTIAL();
2366 RRETURN(MATCH_NOMATCH);
2367 }
2368 GETCHARINCTEST(c, eptr);
2369 if (
2370 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2371 c > 255 ||
2372 #endif
2373 (md->ctypes[c] & ctype_digit) == 0
2374 )
2375 RRETURN(MATCH_NOMATCH);
2376 ecode++;
2377 break;
2378
2379 case OP_NOT_WHITESPACE:
2380 if (eptr >= md->end_subject)
2381 {
2382 SCHECK_PARTIAL();
2383 RRETURN(MATCH_NOMATCH);
2384 }
2385 GETCHARINCTEST(c, eptr);
2386 if (
2387 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2388 c < 256 &&
2389 #endif
2390 (md->ctypes[c] & ctype_space) != 0
2391 )
2392 RRETURN(MATCH_NOMATCH);
2393 ecode++;
2394 break;
2395
2396 case OP_WHITESPACE:
2397 if (eptr >= md->end_subject)
2398 {
2399 SCHECK_PARTIAL();
2400 RRETURN(MATCH_NOMATCH);
2401 }
2402 GETCHARINCTEST(c, eptr);
2403 if (
2404 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2405 c > 255 ||
2406 #endif
2407 (md->ctypes[c] & ctype_space) == 0
2408 )
2409 RRETURN(MATCH_NOMATCH);
2410 ecode++;
2411 break;
2412
2413 case OP_NOT_WORDCHAR:
2414 if (eptr >= md->end_subject)
2415 {
2416 SCHECK_PARTIAL();
2417 RRETURN(MATCH_NOMATCH);
2418 }
2419 GETCHARINCTEST(c, eptr);
2420 if (
2421 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2422 c < 256 &&
2423 #endif
2424 (md->ctypes[c] & ctype_word) != 0
2425 )
2426 RRETURN(MATCH_NOMATCH);
2427 ecode++;
2428 break;
2429
2430 case OP_WORDCHAR:
2431 if (eptr >= md->end_subject)
2432 {
2433 SCHECK_PARTIAL();
2434 RRETURN(MATCH_NOMATCH);
2435 }
2436 GETCHARINCTEST(c, eptr);
2437 if (
2438 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2439 c > 255 ||
2440 #endif
2441 (md->ctypes[c] & ctype_word) == 0
2442 )
2443 RRETURN(MATCH_NOMATCH);
2444 ecode++;
2445 break;
2446
2447 case OP_ANYNL:
2448 if (eptr >= md->end_subject)
2449 {
2450 SCHECK_PARTIAL();
2451 RRETURN(MATCH_NOMATCH);
2452 }
2453 GETCHARINCTEST(c, eptr);
2454 switch(c)
2455 {
2456 default: RRETURN(MATCH_NOMATCH);
2457
2458 case CHAR_CR:
2459 if (eptr >= md->end_subject)
2460 {
2461 SCHECK_PARTIAL();
2462 }
2463 else if (UCHAR21TEST(eptr) == CHAR_LF) eptr++;
2464 break;
2465
2466 case CHAR_LF:
2467 break;
2468
2469 case CHAR_VT:
2470 case CHAR_FF:
2471 case CHAR_NEL:
2472 #ifndef EBCDIC
2473 case 0x2028:
2474 case 0x2029:
2475 #endif /* Not EBCDIC */
2476 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2477 break;
2478 }
2479 ecode++;
2480 break;
2481
2482 case OP_NOT_HSPACE:
2483 if (eptr >= md->end_subject)
2484 {
2485 SCHECK_PARTIAL();
2486 RRETURN(MATCH_NOMATCH);
2487 }
2488 GETCHARINCTEST(c, eptr);
2489 switch(c)
2490 {
2491 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2492 default: break;
2493 }
2494 ecode++;
2495 break;
2496
2497 case OP_HSPACE:
2498 if (eptr >= md->end_subject)
2499 {
2500 SCHECK_PARTIAL();
2501 RRETURN(MATCH_NOMATCH);
2502 }
2503 GETCHARINCTEST(c, eptr);
2504 switch(c)
2505 {
2506 HSPACE_CASES: break; /* Byte and multibyte cases */
2507 default: RRETURN(MATCH_NOMATCH);
2508 }
2509 ecode++;
2510 break;
2511
2512 case OP_NOT_VSPACE:
2513 if (eptr >= md->end_subject)
2514 {
2515 SCHECK_PARTIAL();
2516 RRETURN(MATCH_NOMATCH);
2517 }
2518 GETCHARINCTEST(c, eptr);
2519 switch(c)
2520 {
2521 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2522 default: break;
2523 }
2524 ecode++;
2525 break;
2526
2527 case OP_VSPACE:
2528 if (eptr >= md->end_subject)
2529 {
2530 SCHECK_PARTIAL();
2531 RRETURN(MATCH_NOMATCH);
2532 }
2533 GETCHARINCTEST(c, eptr);
2534 switch(c)
2535 {
2536 VSPACE_CASES: break;
2537 default: RRETURN(MATCH_NOMATCH);
2538 }
2539 ecode++;
2540 break;
2541
2542 #ifdef SUPPORT_UCP
2543 /* Check the next character by Unicode property. We will get here only
2544 if the support is in the binary; otherwise a compile-time error occurs. */
2545
2546 case OP_PROP:
2547 case OP_NOTPROP:
2548 if (eptr >= md->end_subject)
2549 {
2550 SCHECK_PARTIAL();
2551 RRETURN(MATCH_NOMATCH);
2552 }
2553 GETCHARINCTEST(c, eptr);
2554 {
2555 const pcre_uint32 *cp;
2556 const ucd_record *prop = GET_UCD(c);
2557
2558 switch(ecode[1])
2559 {
2560 case PT_ANY:
2561 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2562 break;
2563
2564 case PT_LAMP:
2565 if ((prop->chartype == ucp_Lu ||
2566 prop->chartype == ucp_Ll ||
2567 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2568 RRETURN(MATCH_NOMATCH);
2569 break;
2570
2571 case PT_GC:
2572 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2573 RRETURN(MATCH_NOMATCH);
2574 break;
2575
2576 case PT_PC:
2577 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2578 RRETURN(MATCH_NOMATCH);
2579 break;
2580
2581 case PT_SC:
2582 if ((ecode[2] != prop->script) == (op == OP_PROP))
2583 RRETURN(MATCH_NOMATCH);
2584 break;
2585
2586 /* These are specials */
2587
2588 case PT_ALNUM:
2589 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2590 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2591 RRETURN(MATCH_NOMATCH);
2592 break;
2593
2594 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2595 which means that Perl space and POSIX space are now identical. PCRE
2596 was changed at release 8.34. */
2597
2598 case PT_SPACE: /* Perl space */
2599 case PT_PXSPACE: /* POSIX space */
2600 switch(c)
2601 {
2602 HSPACE_CASES:
2603 VSPACE_CASES:
2604 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2605 break;
2606
2607 default:
2608 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) ==
2609 (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH);
2610 break;
2611 }
2612 break;
2613
2614 case PT_WORD:
2615 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2616 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2617 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2618 RRETURN(MATCH_NOMATCH);
2619 break;
2620
2621 case PT_CLIST:
2622 cp = PRIV(ucd_caseless_sets) + ecode[2];
2623 for (;;)
2624 {
2625 if (c < *cp)
2626 { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2627 if (c == *cp++)
2628 { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2629 }
2630 break;
2631
2632 case PT_UCNC:
2633 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2634 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2635 c >= 0xe000) == (op == OP_NOTPROP))
2636 RRETURN(MATCH_NOMATCH);
2637 break;
2638
2639 /* This should never occur */
2640
2641 default:
2642 RRETURN(PCRE_ERROR_INTERNAL);
2643 }
2644
2645 ecode += 3;
2646 }
2647 break;
2648
2649 /* Match an extended Unicode sequence. We will get here only if the support
2650 is in the binary; otherwise a compile-time error occurs. */
2651
2652 case OP_EXTUNI:
2653 if (eptr >= md->end_subject)
2654 {
2655 SCHECK_PARTIAL();
2656 RRETURN(MATCH_NOMATCH);
2657 }
2658 else
2659 {
2660 int lgb, rgb;
2661 GETCHARINCTEST(c, eptr);
2662 lgb = UCD_GRAPHBREAK(c);
2663 while (eptr < md->end_subject)
2664 {
2665 int len = 1;
2666 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2667 rgb = UCD_GRAPHBREAK(c);
2668 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2669 lgb = rgb;
2670 eptr += len;
2671 }
2672 }
2673 CHECK_PARTIAL();
2674 ecode++;
2675 break;
2676 #endif /* SUPPORT_UCP */
2677
2678
2679 /* Match a back reference, possibly repeatedly. Look past the end of the
2680 item to see if there is repeat information following. The code is similar
2681 to that for character classes, but repeated for efficiency. Then obey
2682 similar code to character type repeats - written out again for speed.
2683 However, if the referenced string is the empty string, always treat
2684 it as matched, any number of times (otherwise there could be infinite
2685 loops). If the reference is unset, there are two possibilities:
2686
2687 (a) In the default, Perl-compatible state, set the length negative;
2688 this ensures that every attempt at a match fails. We can't just fail
2689 here, because of the possibility of quantifiers with zero minima.
2690
2691 (b) If the JavaScript compatibility flag is set, set the length to zero
2692 so that the back reference matches an empty string.
2693
2694 Otherwise, set the length to the length of what was matched by the
2695 referenced subpattern.
2696
2697 The OP_REF and OP_REFI opcodes are used for a reference to a numbered group
2698 or to a non-duplicated named group. For a duplicated named group, OP_DNREF
2699 and OP_DNREFI are used. In this case we must scan the list of groups to
2700 which the name refers, and use the first one that is set. */
2701
2702 case OP_DNREF:
2703 case OP_DNREFI:
2704 caseless = op == OP_DNREFI;
2705 {
2706 int count = GET2(ecode, 1+IMM2_SIZE);
2707 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
2708 ecode += 1 + 2*IMM2_SIZE;
2709
2710 /* Setting the default length first and initializing 'offset' avoids
2711 compiler warnings in the REF_REPEAT code. */
2712
2713 length = (md->jscript_compat)? 0 : -1;
2714 offset = 0;
2715
2716 while (count-- > 0)
2717 {
2718 offset = GET2(slot, 0) << 1;
2719 if (offset < offset_top && md->offset_vector[offset] >= 0)
2720 {
2721 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2722 break;
2723 }
2724 slot += md->name_entry_size;
2725 }
2726 }
2727 goto REF_REPEAT;
2728
2729 case OP_REF:
2730 case OP_REFI:
2731 caseless = op == OP_REFI;
2732 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2733 ecode += 1 + IMM2_SIZE;
2734 if (offset >= offset_top || md->offset_vector[offset] < 0)
2735 length = (md->jscript_compat)? 0 : -1;
2736 else
2737 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2738
2739 /* Set up for repetition, or handle the non-repeated case */
2740
2741 REF_REPEAT:
2742 switch (*ecode)
2743 {
2744 case OP_CRSTAR:
2745 case OP_CRMINSTAR:
2746 case OP_CRPLUS:
2747 case OP_CRMINPLUS:
2748 case OP_CRQUERY:
2749 case OP_CRMINQUERY:
2750 c = *ecode++ - OP_CRSTAR;
2751 minimize = (c & 1) != 0;
2752 min = rep_min[c]; /* Pick up values from tables; */
2753 max = rep_max[c]; /* zero for max => infinity */
2754 if (max == 0) max = INT_MAX;
2755 break;
2756
2757 case OP_CRRANGE:
2758 case OP_CRMINRANGE:
2759 minimize = (*ecode == OP_CRMINRANGE);
2760 min = GET2(ecode, 1);
2761 max = GET2(ecode, 1 + IMM2_SIZE);
2762 if (max == 0) max = INT_MAX;
2763 ecode += 1 + 2 * IMM2_SIZE;
2764 break;
2765
2766 default: /* No repeat follows */
2767 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2768 {
2769 if (length == -2) eptr = md->end_subject; /* Partial match */
2770 CHECK_PARTIAL();
2771 RRETURN(MATCH_NOMATCH);
2772 }
2773 eptr += length;
2774 continue; /* With the main loop */
2775 }
2776
2777 /* Handle repeated back references. If the length of the reference is
2778 zero, just continue with the main loop. If the length is negative, it
2779 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2780 zero, we can continue at the same level without recursion. For any other
2781 minimum, carrying on will result in NOMATCH. */
2782
2783 if (length == 0) continue;
2784 if (length < 0 && min == 0) continue;
2785
2786 /* First, ensure the minimum number of matches are present. We get back
2787 the length of the reference string explicitly rather than passing the
2788 address of eptr, so that eptr can be a register variable. */
2789
2790 for (i = 1; i <= min; i++)
2791 {
2792 int slength;
2793 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2794 {
2795 if (slength == -2) eptr = md->end_subject; /* Partial match */
2796 CHECK_PARTIAL();
2797 RRETURN(MATCH_NOMATCH);
2798 }
2799 eptr += slength;
2800 }
2801
2802 /* If min = max, continue at the same level without recursion.
2803 They are not both allowed to be zero. */
2804
2805 if (min == max) continue;
2806
2807 /* If minimizing, keep trying and advancing the pointer */
2808
2809 if (minimize)
2810 {
2811 for (fi = min;; fi++)
2812 {
2813 int slength;
2814 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2815 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2816 if (fi >= max) RRETURN(MATCH_NOMATCH);
2817 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2818 {
2819 if (slength == -2) eptr = md->end_subject; /* Partial match */
2820 CHECK_PARTIAL();
2821 RRETURN(MATCH_NOMATCH);
2822 }
2823 eptr += slength;
2824 }
2825 /* Control never gets here */
2826 }
2827
2828 /* If maximizing, find the longest string and work backwards */
2829
2830 else
2831 {
2832 pp = eptr;
2833 for (i = min; i < max; i++)
2834 {
2835 int slength;
2836 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2837 {
2838 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2839 the soft partial matching case. */
2840
2841 if (slength == -2 && md->partial != 0 &&
2842 md->end_subject > md->start_used_ptr)
2843 {
2844 md->hitend = TRUE;
2845 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2846 }
2847 break;
2848 }
2849 eptr += slength;
2850 }
2851
2852 while (eptr >= pp)
2853 {
2854 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2855 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2856 eptr -= length;
2857 }
2858 RRETURN(MATCH_NOMATCH);
2859 }
2860 /* Control never gets here */
2861
2862 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2863 used when all the characters in the class have values in the range 0-255,
2864 and either the matching is caseful, or the characters are in the range
2865 0-127 when UTF-8 processing is enabled. The only difference between
2866 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2867 encountered.
2868
2869 First, look past the end of the item to see if there is repeat information
2870 following. Then obey similar code to character type repeats - written out
2871 again for speed. */
2872
2873 case OP_NCLASS:
2874 case OP_CLASS:
2875 {
2876 /* The data variable is saved across frames, so the byte map needs to
2877 be stored there. */
2878 #define BYTE_MAP ((pcre_uint8 *)data)
2879 data = ecode + 1; /* Save for matching */
2880 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2881
2882 switch (*ecode)
2883 {
2884 case OP_CRSTAR:
2885 case OP_CRMINSTAR:
2886 case OP_CRPLUS:
2887 case OP_CRMINPLUS:
2888 case OP_CRQUERY:
2889 case OP_CRMINQUERY:
2890 case OP_CRPOSSTAR:
2891 case OP_CRPOSPLUS:
2892 case OP_CRPOSQUERY:
2893 c = *ecode++ - OP_CRSTAR;
2894 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
2895 else possessive = TRUE;
2896 min = rep_min[c]; /* Pick up values from tables; */
2897 max = rep_max[c]; /* zero for max => infinity */
2898 if (max == 0) max = INT_MAX;
2899 break;
2900
2901 case OP_CRRANGE:
2902 case OP_CRMINRANGE:
2903 case OP_CRPOSRANGE:
2904 minimize = (*ecode == OP_CRMINRANGE);
2905 possessive = (*ecode == OP_CRPOSRANGE);
2906 min = GET2(ecode, 1);
2907 max = GET2(ecode, 1 + IMM2_SIZE);
2908 if (max == 0) max = INT_MAX;
2909 ecode += 1 + 2 * IMM2_SIZE;
2910 break;
2911
2912 default: /* No repeat follows */
2913 min = max = 1;
2914 break;
2915 }
2916
2917 /* First, ensure the minimum number of matches are present. */
2918
2919 #ifdef SUPPORT_UTF
2920 if (utf)
2921 {
2922 for (i = 1; i <= min; i++)
2923 {
2924 if (eptr >= md->end_subject)
2925 {
2926 SCHECK_PARTIAL();
2927 RRETURN(MATCH_NOMATCH);
2928 }
2929 GETCHARINC(c, eptr);
2930 if (c > 255)
2931 {
2932 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2933 }
2934 else
2935 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2936 }
2937 }
2938 else
2939 #endif
2940 /* Not UTF mode */
2941 {
2942 for (i = 1; i <= min; i++)
2943 {
2944 if (eptr >= md->end_subject)
2945 {
2946 SCHECK_PARTIAL();
2947 RRETURN(MATCH_NOMATCH);
2948 }
2949 c = *eptr++;
2950 #ifndef COMPILE_PCRE8
2951 if (c > 255)
2952 {
2953 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2954 }
2955 else
2956 #endif
2957 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2958 }
2959 }
2960
2961 /* If max == min we can continue with the main loop without the
2962 need to recurse. */
2963
2964 if (min == max) continue;
2965
2966 /* If minimizing, keep testing the rest of the expression and advancing
2967 the pointer while it matches the class. */
2968
2969 if (minimize)
2970 {
2971 #ifdef SUPPORT_UTF
2972 if (utf)
2973 {
2974 for (fi = min;; fi++)
2975 {
2976 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2977 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2978 if (fi >= max) RRETURN(MATCH_NOMATCH);
2979 if (eptr >= md->end_subject)
2980 {
2981 SCHECK_PARTIAL();
2982 RRETURN(MATCH_NOMATCH);
2983 }
2984 GETCHARINC(c, eptr);
2985 if (c > 255)
2986 {
2987 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2988 }
2989 else
2990 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2991 }
2992 }
2993 else
2994 #endif
2995 /* Not UTF mode */
2996 {
2997 for (fi = min;; fi++)
2998 {
2999 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
3000 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3001 if (fi >= max) RRETURN(MATCH_NOMATCH);
3002 if (eptr >= md->end_subject)
3003 {
3004 SCHECK_PARTIAL();
3005 RRETURN(MATCH_NOMATCH);
3006 }
3007 c = *eptr++;
3008 #ifndef COMPILE_PCRE8
3009 if (c > 255)
3010 {
3011 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
3012 }
3013 else
3014 #endif
3015 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
3016 }
3017 }
3018 /* Control never gets here */
3019 }
3020
3021 /* If maximizing, find the longest possible run, then work backwards. */
3022
3023 else
3024 {
3025 pp = eptr;
3026
3027 #ifdef SUPPORT_UTF
3028 if (utf)
3029 {
3030 for (i = min; i < max; i++)
3031 {
3032 int len = 1;
3033 if (eptr >= md->end_subject)
3034 {
3035 SCHECK_PARTIAL();
3036 break;
3037 }
3038 GETCHARLEN(c, eptr, len);
3039 if (c > 255)
3040 {
3041 if (op == OP_CLASS) break;
3042 }
3043 else
3044 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3045 eptr += len;
3046 }
3047
3048 if (possessive) continue; /* No backtracking */
3049
3050 for (;;)
3051 {
3052 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
3053 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3054 if (eptr-- == pp) break; /* Stop if tried at original pos */
3055 BACKCHAR(eptr);
3056 }
3057 }
3058 else
3059 #endif
3060 /* Not UTF mode */
3061 {
3062 for (i = min; i < max; i++)
3063 {
3064 if (eptr >= md->end_subject)
3065 {
3066 SCHECK_PARTIAL();
3067 break;
3068 }
3069 c = *eptr;
3070 #ifndef COMPILE_PCRE8
3071 if (c > 255)
3072 {
3073 if (op == OP_CLASS) break;
3074 }
3075 else
3076 #endif
3077 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3078 eptr++;
3079 }
3080
3081 if (possessive) continue; /* No backtracking */
3082
3083 while (eptr >= pp)
3084 {
3085 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3086 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3087 eptr--;
3088 }
3089 }
3090
3091 RRETURN(MATCH_NOMATCH);
3092 }
3093 #undef BYTE_MAP
3094 }
3095 /* Control never gets here */
3096
3097
3098 /* Match an extended character class. In the 8-bit library, this opcode is
3099 encountered only when UTF-8 mode is supported. In the 16-bit and
3100 32-bit libraries, codepoints greater than 255 may be encountered even when
3101 UTF is not supported. */
3102
3103 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3104 case OP_XCLASS:
3105 {
3106 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3107 ecode += GET(ecode, 1); /* Advance past the item */
3108
3109 switch (*ecode)
3110 {
3111 case OP_CRSTAR:
3112 case OP_CRMINSTAR:
3113 case OP_CRPLUS:
3114 case OP_CRMINPLUS:
3115 case OP_CRQUERY:
3116 case OP_CRMINQUERY:
3117 case OP_CRPOSSTAR:
3118 case OP_CRPOSPLUS:
3119 case OP_CRPOSQUERY:
3120 c = *ecode++ - OP_CRSTAR;
3121 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
3122 else possessive = TRUE;
3123 min = rep_min[c]; /* Pick up values from tables; */
3124 max = rep_max[c]; /* zero for max => infinity */
3125 if (max == 0) max = INT_MAX;
3126 break;
3127
3128 case OP_CRRANGE:
3129 case OP_CRMINRANGE:
3130 case OP_CRPOSRANGE:
3131 minimize = (*ecode == OP_CRMINRANGE);
3132 possessive = (*ecode == OP_CRPOSRANGE);
3133 min = GET2(ecode, 1);
3134 max = GET2(ecode, 1 + IMM2_SIZE);
3135 if (max == 0) max = INT_MAX;
3136 ecode += 1 + 2 * IMM2_SIZE;
3137 break;
3138
3139 default: /* No repeat follows */
3140 min = max = 1;
3141 break;
3142 }
3143
3144 /* First, ensure the minimum number of matches are present. */
3145
3146 for (i = 1; i <= min; i++)
3147 {
3148 if (eptr >= md->end_subject)
3149 {
3150 SCHECK_PARTIAL();
3151 RRETURN(MATCH_NOMATCH);
3152 }
3153 GETCHARINCTEST(c, eptr);
3154 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3155 }
3156
3157 /* If max == min we can continue with the main loop without the
3158 need to recurse. */
3159
3160 if (min == max) continue;
3161
3162 /* If minimizing, keep testing the rest of the expression and advancing
3163 the pointer while it matches the class. */
3164
3165 if (minimize)
3166 {
3167 for (fi = min;; fi++)
3168 {
3169 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3170 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3171 if (fi >= max) RRETURN(MATCH_NOMATCH);
3172 if (eptr >= md->end_subject)
3173 {
3174 SCHECK_PARTIAL();
3175 RRETURN(MATCH_NOMATCH);
3176 }
3177 GETCHARINCTEST(c, eptr);
3178 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3179 }
3180 /* Control never gets here */
3181 }
3182
3183 /* If maximizing, find the longest possible run, then work backwards. */
3184
3185 else
3186 {
3187 pp = eptr;
3188 for (i = min; i < max; i++)
3189 {
3190 int len = 1;
3191 if (eptr >= md->end_subject)
3192 {
3193 SCHECK_PARTIAL();
3194 break;
3195 }
3196 #ifdef SUPPORT_UTF
3197 GETCHARLENTEST(c, eptr, len);
3198 #else
3199 c = *eptr;
3200 #endif
3201 if (!PRIV(xclass)(c, data, utf)) break;
3202 eptr += len;
3203 }
3204
3205 if (possessive) continue; /* No backtracking */
3206
3207 for(;;)
3208 {
3209 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3211 if (eptr-- == pp) break; /* Stop if tried at original pos */
3212 #ifdef SUPPORT_UTF
3213 if (utf) BACKCHAR(eptr);
3214 #endif
3215 }
3216 RRETURN(MATCH_NOMATCH);
3217 }
3218
3219 /* Control never gets here */
3220 }
3221 #endif /* End of XCLASS */
3222
3223 /* Match a single character, casefully */
3224
3225 case OP_CHAR:
3226 #ifdef SUPPORT_UTF
3227 if (utf)
3228 {
3229 length = 1;
3230 ecode++;
3231 GETCHARLEN(fc, ecode, length);
3232 if (length > md->end_subject - eptr)
3233 {
3234 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3235 RRETURN(MATCH_NOMATCH);
3236 }
3237 while (length-- > 0) if (*ecode++ != UCHAR21INC(eptr)) RRETURN(MATCH_NOMATCH);
3238 }
3239 else
3240 #endif
3241 /* Not UTF mode */
3242 {
3243 if (md->end_subject - eptr < 1)
3244 {
3245 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3246 RRETURN(MATCH_NOMATCH);
3247 }
3248 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3249 ecode += 2;
3250 }
3251 break;
3252
3253 /* Match a single character, caselessly. If we are at the end of the
3254 subject, give up immediately. */
3255
3256 case OP_CHARI:
3257 if (eptr >= md->end_subject)
3258 {
3259 SCHECK_PARTIAL();
3260 RRETURN(MATCH_NOMATCH);
3261 }
3262
3263 #ifdef SUPPORT_UTF
3264 if (utf)
3265 {
3266 length = 1;
3267 ecode++;
3268 GETCHARLEN(fc, ecode, length);
3269
3270 /* If the pattern character's value is < 128, we have only one byte, and
3271 we know that its other case must also be one byte long, so we can use the
3272 fast lookup table. We know that there is at least one byte left in the
3273 subject. */
3274
3275 if (fc < 128)
3276 {
3277 pcre_uint32 cc = UCHAR21(eptr);
3278 if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
3279 ecode++;
3280 eptr++;
3281 }
3282
3283 /* Otherwise we must pick up the subject character. Note that we cannot
3284 use the value of "length" to check for sufficient bytes left, because the
3285 other case of the character may have more or fewer bytes. */
3286
3287 else
3288 {
3289 pcre_uint32 dc;
3290 GETCHARINC(dc, eptr);
3291 ecode += length;
3292
3293 /* If we have Unicode property support, we can use it to test the other
3294 case of the character, if there is one. */
3295
3296 if (fc != dc)
3297 {
3298 #ifdef SUPPORT_UCP
3299 if (dc != UCD_OTHERCASE(fc))
3300 #endif
3301 RRETURN(MATCH_NOMATCH);
3302 }
3303 }
3304 }
3305 else
3306 #endif /* SUPPORT_UTF */
3307
3308 /* Not UTF mode */
3309 {
3310 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3311 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3312 eptr++;
3313 ecode += 2;
3314 }
3315 break;
3316
3317 /* Match a single character repeatedly. */
3318
3319 case OP_EXACT:
3320 case OP_EXACTI:
3321 min = max = GET2(ecode, 1);
3322 ecode += 1 + IMM2_SIZE;
3323 goto REPEATCHAR;
3324
3325 case OP_POSUPTO:
3326 case OP_POSUPTOI:
3327 possessive = TRUE;
3328 /* Fall through */
3329
3330 case OP_UPTO:
3331 case OP_UPTOI:
3332 case OP_MINUPTO:
3333 case OP_MINUPTOI:
3334 min = 0;
3335 max = GET2(ecode, 1);
3336 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3337 ecode += 1 + IMM2_SIZE;
3338 goto REPEATCHAR;
3339
3340 case OP_POSSTAR:
3341 case OP_POSSTARI:
3342 possessive = TRUE;
3343 min = 0;
3344 max = INT_MAX;
3345 ecode++;
3346 goto REPEATCHAR;
3347
3348 case OP_POSPLUS:
3349 case OP_POSPLUSI:
3350 possessive = TRUE;
3351 min = 1;
3352 max = INT_MAX;
3353 ecode++;
3354 goto REPEATCHAR;
3355
3356 case OP_POSQUERY:
3357 case OP_POSQUERYI:
3358 possessive = TRUE;
3359 min = 0;
3360 max = 1;
3361 ecode++;
3362 goto REPEATCHAR;
3363
3364 case OP_STAR:
3365 case OP_STARI:
3366 case OP_MINSTAR:
3367 case OP_MINSTARI:
3368 case OP_PLUS:
3369 case OP_PLUSI:
3370 case OP_MINPLUS:
3371 case OP_MINPLUSI:
3372 case OP_QUERY:
3373 case OP_QUERYI:
3374 case OP_MINQUERY:
3375 case OP_MINQUERYI:
3376 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3377 minimize = (c & 1) != 0;
3378 min = rep_min[c]; /* Pick up values from tables; */
3379 max = rep_max[c]; /* zero for max => infinity */
3380 if (max == 0) max = INT_MAX;
3381
3382 /* Common code for all repeated single-character matches. We first check
3383 for the minimum number of characters. If the minimum equals the maximum, we
3384 are done. Otherwise, if minimizing, check the rest of the pattern for a
3385 match; if there isn't one, advance up to the maximum, one character at a
3386 time.
3387
3388 If maximizing, advance up to the maximum number of matching characters,
3389 until eptr is past the end of the maximum run. If possessive, we are
3390 then done (no backing up). Otherwise, match at this position; anything
3391 other than no match is immediately returned. For nomatch, back up one
3392 character, unless we are matching \R and the last thing matched was
3393 \r\n, in which case, back up two bytes. When we reach the first optional
3394 character position, we can save stack by doing a tail recurse.
3395
3396 The various UTF/non-UTF and caseful/caseless cases are handled separately,
3397 for speed. */
3398
3399 REPEATCHAR:
3400 #ifdef SUPPORT_UTF
3401 if (utf)
3402 {
3403 length = 1;
3404 charptr = ecode;
3405 GETCHARLEN(fc, ecode, length);
3406 ecode += length;
3407
3408 /* Handle multibyte character matching specially here. There is
3409 support for caseless matching if UCP support is present. */
3410
3411 if (length > 1)
3412 {
3413 #ifdef SUPPORT_UCP
3414 pcre_uint32 othercase;
3415 if (op >= OP_STARI && /* Caseless */
3416 (othercase = UCD_OTHERCASE(fc)) != fc)
3417 oclength = PRIV(ord2utf)(othercase, occhars);
3418 else oclength = 0;
3419 #endif /* SUPPORT_UCP */
3420
3421 for (i = 1; i <= min; i++)
3422 {
3423 if (eptr <= md->end_subject - length &&
3424 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3425 #ifdef SUPPORT_UCP
3426 else if (oclength > 0 &&
3427 eptr <= md->end_subject - oclength &&
3428 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3429 #endif /* SUPPORT_UCP */
3430 else
3431 {
3432 CHECK_PARTIAL();
3433 RRETURN(MATCH_NOMATCH);
3434 }
3435 }
3436
3437 if (min == max) continue;
3438
3439 if (minimize)
3440 {
3441 for (fi = min;; fi++)
3442 {
3443 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3444 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3445 if (fi >= max) RRETURN(MATCH_NOMATCH);
3446 if (eptr <= md->end_subject - length &&
3447 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3448 #ifdef SUPPORT_UCP
3449 else if (oclength > 0 &&
3450 eptr <= md->end_subject - oclength &&
3451 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3452 #endif /* SUPPORT_UCP */
3453 else
3454 {
3455 CHECK_PARTIAL();
3456 RRETURN(MATCH_NOMATCH);
3457 }
3458 }
3459 /* Control never gets here */
3460 }
3461
3462 else /* Maximize */
3463 {
3464 pp = eptr;
3465 for (i = min; i < max; i++)
3466 {
3467 if (eptr <= md->end_subject - length &&
3468 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3469 #ifdef SUPPORT_UCP
3470 else if (oclength > 0 &&
3471 eptr <= md->end_subject - oclength &&
3472 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3473 #endif /* SUPPORT_UCP */
3474 else
3475 {
3476 CHECK_PARTIAL();
3477 break;
3478 }
3479 }
3480
3481 if (possessive) continue; /* No backtracking */
3482 for(;;)
3483 {
3484 if (eptr <= pp) goto TAIL_RECURSE;
3485 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3486 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3487 #ifdef SUPPORT_UCP
3488 eptr--;
3489 BACKCHAR(eptr);
3490 #else /* without SUPPORT_UCP */
3491 eptr -= length;
3492 #endif /* SUPPORT_UCP */
3493 }
3494 }
3495 /* Control never gets here */
3496 }
3497
3498 /* If the length of a UTF-8 character is 1, we fall through here, and
3499 obey the code as for non-UTF-8 characters below, though in this case the
3500 value of fc will always be < 128. */
3501 }
3502 else
3503 #endif /* SUPPORT_UTF */
3504 /* When not in UTF-8 mode, load a single-byte character. */
3505 fc = *ecode++;
3506
3507 /* The value of fc at this point is always one character, though we may
3508 or may not be in UTF mode. The code is duplicated for the caseless and
3509 caseful cases, for speed, since matching characters is likely to be quite
3510 common. First, ensure the minimum number of matches are present. If min =
3511 max, continue at the same level without recursing. Otherwise, if
3512 minimizing, keep trying the rest of the expression and advancing one
3513 matching character if failing, up to the maximum. Alternatively, if
3514 maximizing, find the maximum number of characters and work backwards. */
3515
3516 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3517 max, (char *)eptr));
3518
3519 if (op >= OP_STARI) /* Caseless */
3520 {
3521 #ifdef COMPILE_PCRE8
3522 /* fc must be < 128 if UTF is enabled. */
3523 foc = md->fcc[fc];
3524 #else
3525 #ifdef SUPPORT_UTF
3526 #ifdef SUPPORT_UCP
3527 if (utf && fc > 127)
3528 foc = UCD_OTHERCASE(fc);
3529 #else
3530 if (utf && fc > 127)
3531 foc = fc;
3532 #endif /* SUPPORT_UCP */
3533 else
3534 #endif /* SUPPORT_UTF */
3535 foc = TABLE_GET(fc, md->fcc, fc);
3536 #endif /* COMPILE_PCRE8 */
3537
3538 for (i = 1; i <= min; i++)
3539 {
3540 pcre_uint32 cc; /* Faster than pcre_uchar */
3541 if (eptr >= md->end_subject)
3542 {
3543 SCHECK_PARTIAL();
3544 RRETURN(MATCH_NOMATCH);
3545 }
3546 cc = UCHAR21TEST(eptr);
3547 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3548 eptr++;
3549 }
3550 if (min == max) continue;
3551 if (minimize)
3552 {
3553 for (fi = min;; fi++)
3554 {
3555 pcre_uint32 cc; /* Faster than pcre_uchar */
3556 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3557 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3558 if (fi >= max) RRETURN(MATCH_NOMATCH);
3559 if (eptr >= md->end_subject)
3560 {
3561 SCHECK_PARTIAL();
3562 RRETURN(MATCH_NOMATCH);
3563 }
3564 cc = UCHAR21TEST(eptr);
3565 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3566 eptr++;
3567 }
3568 /* Control never gets here */
3569 }
3570 else /* Maximize */
3571 {
3572 pp = eptr;
3573 for (i = min; i < max; i++)
3574 {
3575 pcre_uint32 cc; /* Faster than pcre_uchar */
3576 if (eptr >= md->end_subject)
3577 {
3578 SCHECK_PARTIAL();
3579 break;
3580 }
3581 cc = UCHAR21TEST(eptr);
3582 if (fc != cc && foc != cc) break;
3583 eptr++;
3584 }
3585 if (possessive) continue; /* No backtracking */
3586 for (;;)
3587 {
3588 if (eptr == pp) goto TAIL_RECURSE;
3589 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3590 eptr--;
3591 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3592 }
3593 /* Control never gets here */
3594 }
3595 }
3596
3597 /* Caseful comparisons (includes all multi-byte characters) */
3598
3599 else
3600 {
3601 for (i = 1; i <= min; i++)
3602 {
3603 if (eptr >= md->end_subject)
3604 {
3605 SCHECK_PARTIAL();
3606 RRETURN(MATCH_NOMATCH);
3607 }
3608 if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3609 }
3610
3611 if (min == max) continue;
3612
3613 if (minimize)
3614 {
3615 for (fi = min;; fi++)
3616 {
3617 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3618 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3619 if (fi >= max) RRETURN(MATCH_NOMATCH);
3620 if (eptr >= md->end_subject)
3621 {
3622 SCHECK_PARTIAL();
3623 RRETURN(MATCH_NOMATCH);
3624 }
3625 if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3626 }
3627 /* Control never gets here */
3628 }
3629 else /* Maximize */
3630 {
3631 pp = eptr;
3632 for (i = min; i < max; i++)
3633 {
3634 if (eptr >= md->end_subject)
3635 {
3636 SCHECK_PARTIAL();
3637 break;
3638 }
3639 if (fc != UCHAR21TEST(eptr)) break;
3640 eptr++;
3641 }
3642 if (possessive) continue; /* No backtracking */
3643 for (;;)
3644 {
3645 if (eptr == pp) goto TAIL_RECURSE;
3646 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3647 eptr--;
3648 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3649 }
3650 /* Control never gets here */
3651 }
3652 }
3653 /* Control never gets here */
3654
3655 /* Match a negated single character. In UTF mode, both the pattern
3656 character and the subject character being checked can be multibyte. */
3657
3658 case OP_NOT:
3659 case OP_NOTI:
3660 if (eptr >= md->end_subject)
3661 {
3662 SCHECK_PARTIAL();
3663 RRETURN(MATCH_NOMATCH);
3664 }
3665 #ifdef SUPPORT_UTF
3666 if (utf)
3667 {
3668 register pcre_uint32 ch, och;
3669
3670 ecode++;
3671 GETCHARINC(ch, ecode);
3672 GETCHARINC(c, eptr);
3673
3674 if (op == OP_NOT)
3675 {
3676 if (ch == c) RRETURN(MATCH_NOMATCH);
3677 }
3678 else
3679 {
3680 #ifdef SUPPORT_UCP
3681 if (ch > 127)
3682 och = UCD_OTHERCASE(ch);
3683 #else
3684 if (ch > 127)
3685 och = ch;
3686 #endif /* SUPPORT_UCP */
3687 else
3688 och = TABLE_GET(ch, md->fcc, ch);
3689 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3690 }
3691 }
3692 else
3693 #endif
3694 {
3695 register pcre_uint32 ch = ecode[1];
3696 c = *eptr++;
3697 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3698 RRETURN(MATCH_NOMATCH);
3699 ecode += 2;
3700 }
3701 break;
3702
3703 /* Match a negated single character repeatedly. This is almost a
3704 repeat of the code for a repeated single character, but I haven't found a
3705 nice way of commoning these up that doesn't require a test of the
3706 positive/negative option for each character match. Maybe that wouldn't add
3707 very much to the time taken, but character matching *is* what this is all
3708 about... */
3709
3710 case OP_NOTEXACT:
3711 case OP_NOTEXACTI:
3712 min = max = GET2(ecode, 1);
3713 ecode += 1 + IMM2_SIZE;
3714 goto REPEATNOTCHAR;
3715
3716 case OP_NOTUPTO:
3717 case OP_NOTUPTOI:
3718 case OP_NOTMINUPTO:
3719 case OP_NOTMINUPTOI:
3720 min = 0;
3721 max = GET2(ecode, 1);
3722 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3723 ecode += 1 + IMM2_SIZE;
3724 goto REPEATNOTCHAR;
3725
3726 case OP_NOTPOSSTAR:
3727 case OP_NOTPOSSTARI:
3728 possessive = TRUE;
3729 min = 0;
3730 max = INT_MAX;
3731 ecode++;
3732 goto REPEATNOTCHAR;
3733
3734 case OP_NOTPOSPLUS:
3735 case OP_NOTPOSPLUSI:
3736 possessive = TRUE;
3737 min = 1;
3738 max = INT_MAX;
3739 ecode++;
3740 goto REPEATNOTCHAR;
3741
3742 case OP_NOTPOSQUERY:
3743 case OP_NOTPOSQUERYI:
3744 possessive = TRUE;
3745 min = 0;
3746 max = 1;
3747 ecode++;
3748 goto REPEATNOTCHAR;
3749
3750 case OP_NOTPOSUPTO:
3751 case OP_NOTPOSUPTOI:
3752 possessive = TRUE;
3753 min = 0;
3754 max = GET2(ecode, 1);
3755 ecode += 1 + IMM2_SIZE;
3756 goto REPEATNOTCHAR;
3757
3758 case OP_NOTSTAR:
3759 case OP_NOTSTARI:
3760 case OP_NOTMINSTAR:
3761 case OP_NOTMINSTARI:
3762 case OP_NOTPLUS:
3763 case OP_NOTPLUSI:
3764 case OP_NOTMINPLUS:
3765 case OP_NOTMINPLUSI:
3766 case OP_NOTQUERY:
3767 case OP_NOTQUERYI:
3768 case OP_NOTMINQUERY:
3769 case OP_NOTMINQUERYI:
3770 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3771 minimize = (c & 1) != 0;
3772 min = rep_min[c]; /* Pick up values from tables; */
3773 max = rep_max[c]; /* zero for max => infinity */
3774 if (max == 0) max = INT_MAX;
3775
3776 /* Common code for all repeated negated single-character matches. */
3777
3778 REPEATNOTCHAR:
3779 GETCHARINCTEST(fc, ecode);
3780
3781 /* The code is duplicated for the caseless and caseful cases, for speed,
3782 since matching characters is likely to be quite common. First, ensure the
3783 minimum number of matches are present. If min = max, continue at the same
3784 level without recursing. Otherwise, if minimizing, keep trying the rest of
3785 the expression and advancing one matching character if failing, up to the
3786 maximum. Alternatively, if maximizing, find the maximum number of
3787 characters and work backwards. */
3788
3789 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3790 max, (char *)eptr));
3791
3792 if (op >= OP_NOTSTARI) /* Caseless */
3793 {
3794 #ifdef SUPPORT_UTF
3795 #ifdef SUPPORT_UCP
3796 if (utf && fc > 127)
3797 foc = UCD_OTHERCASE(fc);
3798 #else
3799 if (utf && fc > 127)
3800 foc = fc;
3801 #endif /* SUPPORT_UCP */
3802 else
3803 #endif /* SUPPORT_UTF */
3804 foc = TABLE_GET(fc, md->fcc, fc);
3805
3806 #ifdef SUPPORT_UTF
3807 if (utf)
3808 {
3809 register pcre_uint32 d;
3810 for (i = 1; i <= min; i++)
3811 {
3812 if (eptr >= md->end_subject)
3813 {
3814 SCHECK_PARTIAL();
3815 RRETURN(MATCH_NOMATCH);
3816 }
3817 GETCHARINC(d, eptr);
3818 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3819 }
3820 }
3821 else
3822 #endif /* SUPPORT_UTF */
3823 /* Not UTF mode */
3824 {
3825 for (i = 1; i <= min; i++)
3826 {
3827 if (eptr >= md->end_subject)
3828 {
3829 SCHECK_PARTIAL();
3830 RRETURN(MATCH_NOMATCH);
3831 }
3832 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3833 eptr++;
3834 }
3835 }
3836
3837 if (min == max) continue;
3838
3839 if (minimize)
3840 {
3841 #ifdef SUPPORT_UTF
3842 if (utf)
3843 {
3844 register pcre_uint32 d;
3845 for (fi = min;; fi++)
3846 {
3847 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3848 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3849 if (fi >= max) RRETURN(MATCH_NOMATCH);
3850 if (eptr >= md->end_subject)
3851 {
3852 SCHECK_PARTIAL();
3853 RRETURN(MATCH_NOMATCH);
3854 }
3855 GETCHARINC(d, eptr);
3856 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3857 }
3858 }
3859 else
3860 #endif /*SUPPORT_UTF */
3861 /* Not UTF mode */
3862 {
3863 for (fi = min;; fi++)
3864 {
3865 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3866 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3867 if (fi >= max) RRETURN(MATCH_NOMATCH);
3868 if (eptr >= md->end_subject)
3869 {
3870 SCHECK_PARTIAL();
3871 RRETURN(MATCH_NOMATCH);
3872 }
3873 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3874 eptr++;
3875 }
3876 }
3877 /* Control never gets here */
3878 }
3879
3880 /* Maximize case */
3881
3882 else
3883 {
3884 pp = eptr;
3885
3886 #ifdef SUPPORT_UTF
3887 if (utf)
3888 {
3889 register pcre_uint32 d;
3890 for (i = min; i < max; i++)
3891 {
3892 int len = 1;
3893 if (eptr >= md->end_subject)
3894 {
3895 SCHECK_PARTIAL();
3896 break;
3897 }
3898 GETCHARLEN(d, eptr, len);
3899 if (fc == d || (unsigned int)foc == d) break;
3900 eptr += len;
3901 }
3902 if (possessive) continue; /* No backtracking */
3903 for(;;)
3904 {
3905 if (eptr <= pp) goto TAIL_RECURSE;
3906 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3907 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3908 eptr--;
3909 BACKCHAR(eptr);
3910 }
3911 }
3912 else
3913 #endif /* SUPPORT_UTF */
3914 /* Not UTF mode */
3915 {
3916 for (i = min; i < max; i++)
3917 {
3918 if (eptr >= md->end_subject)
3919 {
3920 SCHECK_PARTIAL();
3921 break;
3922 }
3923 if (fc == *eptr || foc == *eptr) break;
3924 eptr++;
3925 }
3926 if (possessive) continue; /* No backtracking */
3927 for (;;)
3928 {
3929 if (eptr == pp) goto TAIL_RECURSE;
3930 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3931 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3932 eptr--;
3933 }
3934 }
3935 /* Control never gets here */
3936 }
3937 }
3938
3939 /* Caseful comparisons */
3940
3941 else
3942 {
3943 #ifdef SUPPORT_UTF
3944 if (utf)
3945 {
3946 register pcre_uint32 d;
3947 for (i = 1; i <= min; i++)
3948 {
3949 if (eptr >= md->end_subject)
3950 {
3951 SCHECK_PARTIAL();
3952 RRETURN(MATCH_NOMATCH);
3953 }
3954 GETCHARINC(d, eptr);
3955 if (fc == d) RRETURN(MATCH_NOMATCH);
3956 }
3957 }
3958 else
3959 #endif
3960 /* Not UTF mode */
3961 {
3962 for (i = 1; i <= min; i++)
3963 {
3964 if (eptr >= md->end_subject)
3965 {
3966 SCHECK_PARTIAL();
3967 RRETURN(MATCH_NOMATCH);
3968 }
3969 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3970 }
3971 }
3972
3973 if (min == max) continue;
3974
3975 if (minimize)
3976 {
3977 #ifdef SUPPORT_UTF
3978 if (utf)
3979 {
3980 register pcre_uint32 d;
3981 for (fi = min;; fi++)
3982 {
3983 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3984 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3985 if (fi >= max) RRETURN(MATCH_NOMATCH);
3986 if (eptr >= md->end_subject)
3987 {
3988 SCHECK_PARTIAL();
3989 RRETURN(MATCH_NOMATCH);
3990 }
3991 GETCHARINC(d, eptr);
3992 if (fc == d) RRETURN(MATCH_NOMATCH);
3993 }
3994 }
3995 else
3996 #endif
3997 /* Not UTF mode */
3998 {
3999 for (fi = min;; fi++)
4000 {
4001 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
4002 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4003 if (fi >= max) RRETURN(MATCH_NOMATCH);
4004 if (eptr >= md->end_subject)
4005 {
4006 SCHECK_PARTIAL();
4007 RRETURN(MATCH_NOMATCH);
4008 }
4009 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
4010 }
4011 }
4012 /* Control never gets here */
4013 }
4014
4015 /* Maximize case */
4016
4017 else
4018 {
4019 pp = eptr;
4020
4021 #ifdef SUPPORT_UTF
4022 if (utf)
4023 {
4024 register pcre_uint32 d;
4025 for (i = min; i < max; i++)
4026 {
4027 int len = 1;
4028 if (eptr >= md->end_subject)
4029 {
4030 SCHECK_PARTIAL();
4031 break;
4032 }
4033 GETCHARLEN(d, eptr, len);
4034 if (fc == d) break;
4035 eptr += len;
4036 }
4037 if (possessive) continue; /* No backtracking */
4038 for(;;)
4039 {
4040 if (eptr <= pp) goto TAIL_RECURSE;
4041 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
4042 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4043 eptr--;
4044 BACKCHAR(eptr);
4045 }
4046 }
4047 else
4048 #endif
4049 /* Not UTF mode */
4050 {
4051 for (i = min; i < max; i++)
4052 {
4053 if (eptr >= md->end_subject)
4054 {
4055 SCHECK_PARTIAL();
4056 break;
4057 }
4058 if (fc == *eptr) break;
4059 eptr++;
4060 }
4061 if (possessive) continue; /* No backtracking */
4062 for (;;)
4063 {
4064 if (eptr == pp) goto TAIL_RECURSE;
4065 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
4066 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4067 eptr--;
4068 }
4069 }
4070 /* Control never gets here */
4071 }
4072 }
4073 /* Control never gets here */
4074
4075 /* Match a single character type repeatedly; several different opcodes
4076 share code. This is very similar to the code for single characters, but we
4077 repeat it in the interests of efficiency. */
4078
4079 case OP_TYPEEXACT:
4080 min = max = GET2(ecode, 1);
4081 minimize = TRUE;
4082 ecode += 1 + IMM2_SIZE;
4083 goto REPEATTYPE;
4084
4085 case OP_TYPEUPTO:
4086 case OP_TYPEMINUPTO:
4087 min = 0;
4088 max = GET2(ecode, 1);
4089 minimize = *ecode == OP_TYPEMINUPTO;
4090 ecode += 1 + IMM2_SIZE;
4091 goto REPEATTYPE;
4092
4093 case OP_TYPEPOSSTAR:
4094 possessive = TRUE;
4095 min = 0;
4096 max = INT_MAX;
4097 ecode++;
4098 goto REPEATTYPE;
4099
4100 case OP_TYPEPOSPLUS:
4101 possessive = TRUE;
4102 min = 1;
4103 max = INT_MAX;
4104 ecode++;
4105 goto REPEATTYPE;
4106
4107 case OP_TYPEPOSQUERY:
4108 possessive = TRUE;
4109 min = 0;
4110 max = 1;
4111 ecode++;
4112 goto REPEATTYPE;
4113
4114 case OP_TYPEPOSUPTO:
4115 possessive = TRUE;
4116 min = 0;
4117 max = GET2(ecode, 1);
4118 ecode += 1 + IMM2_SIZE;
4119 goto REPEATTYPE;
4120
4121 case OP_TYPESTAR:
4122 case OP_TYPEMINSTAR:
4123 case OP_TYPEPLUS:
4124 case OP_TYPEMINPLUS:
4125 case OP_TYPEQUERY:
4126 case OP_TYPEMINQUERY:
4127 c = *ecode++ - OP_TYPESTAR;
4128 minimize = (c & 1) != 0;
4129 min = rep_min[c]; /* Pick up values from tables; */
4130 max = rep_max[c]; /* zero for max => infinity */
4131 if (max == 0) max = INT_MAX;
4132
4133 /* Common code for all repeated single character type matches. Note that
4134 in UTF-8 mode, '.' matches a character of any length, but for the other
4135 character types, the valid characters are all one-byte long. */
4136
4137 REPEATTYPE:
4138 ctype = *ecode++; /* Code for the character type */
4139
4140 #ifdef SUPPORT_UCP
4141 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4142 {
4143 prop_fail_result = ctype == OP_NOTPROP;
4144 prop_type = *ecode++;
4145 prop_value = *ecode++;
4146 }
4147 else prop_type = -1;
4148 #endif
4149
4150 /* First, ensure the minimum number of matches are present. Use inline
4151 code for maximizing the speed, and do the type test once at the start
4152 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4153 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4154 and single-bytes. */
4155
4156 if (min > 0)
4157 {
4158 #ifdef SUPPORT_UCP
4159 if (prop_type >= 0)
4160 {
4161 switch(prop_type)
4162 {
4163 case PT_ANY:
4164 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4165 for (i = 1; i <= min; i++)
4166 {
4167 if (eptr >= md->end_subject)
4168 {
4169 SCHECK_PARTIAL();
4170 RRETURN(MATCH_NOMATCH);
4171 }
4172 GETCHARINCTEST(c, eptr);
4173 }
4174 break;
4175
4176 case PT_LAMP:
4177 for (i = 1; i <= min; i++)
4178 {
4179 int chartype;
4180 if (eptr >= md->end_subject)
4181 {
4182 SCHECK_PARTIAL();
4183 RRETURN(MATCH_NOMATCH);
4184 }
4185 GETCHARINCTEST(c, eptr);
4186 chartype = UCD_CHARTYPE(c);
4187 if ((chartype == ucp_Lu ||
4188 chartype == ucp_Ll ||
4189 chartype == ucp_Lt) == prop_fail_result)
4190 RRETURN(MATCH_NOMATCH);
4191 }
4192 break;
4193
4194 case PT_GC:
4195 for (i = 1; i <= min; i++)
4196 {
4197 if (eptr >= md->end_subject)
4198 {
4199 SCHECK_PARTIAL();
4200 RRETURN(MATCH_NOMATCH);
4201 }
4202 GETCHARINCTEST(c, eptr);
4203 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4204 RRETURN(MATCH_NOMATCH);
4205 }
4206 break;
4207
4208 case PT_PC:
4209 for (i = 1; i <= min; i++)
4210 {
4211 if (eptr >= md->end_subject)
4212 {
4213 SCHECK_PARTIAL();
4214 RRETURN(MATCH_NOMATCH);
4215 }
4216 GETCHARINCTEST(c, eptr);
4217 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4218 RRETURN(MATCH_NOMATCH);
4219 }
4220 break;
4221
4222 case PT_SC:
4223 for (i = 1; i <= min; i++)
4224 {
4225 if (eptr >= md->end_subject)
4226 {
4227 SCHECK_PARTIAL();
4228 RRETURN(MATCH_NOMATCH);
4229 }
4230 GETCHARINCTEST(c, eptr);
4231 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4232 RRETURN(MATCH_NOMATCH);
4233 }
4234 break;
4235
4236 case PT_ALNUM:
4237 for (i = 1; i <= min; i++)
4238 {
4239 int category;
4240 if (eptr >= md->end_subject)
4241 {
4242 SCHECK_PARTIAL();
4243 RRETURN(MATCH_NOMATCH);
4244 }
4245 GETCHARINCTEST(c, eptr);
4246 category = UCD_CATEGORY(c);
4247 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4248 RRETURN(MATCH_NOMATCH);
4249 }
4250 break;
4251
4252 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
4253 which means that Perl space and POSIX space are now identical. PCRE
4254 was changed at release 8.34. */
4255
4256 case PT_SPACE: /* Perl space */
4257 case PT_PXSPACE: /* POSIX space */
4258 for (i = 1; i <= min; i++)
4259 {
4260 if (eptr >= md->end_subject)
4261 {
4262 SCHECK_PARTIAL();
4263 RRETURN(MATCH_NOMATCH);
4264 }
4265 GETCHARINCTEST(c, eptr);
4266 switch(c)
4267 {
4268 HSPACE_CASES:
4269 VSPACE_CASES:
4270 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4271 break;
4272
4273 default:
4274 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
4275 RRETURN(MATCH_NOMATCH);
4276 break;
4277 }
4278 }
4279 break;
4280
4281 case PT_WORD:
4282 for (i = 1; i <= min; i++)
4283 {
4284 int category;
4285 if (eptr >= md->end_subject)
4286 {
4287 SCHECK_PARTIAL();
4288 RRETURN(MATCH_NOMATCH);
4289 }
4290 GETCHARINCTEST(c, eptr);
4291 category = UCD_CATEGORY(c);
4292 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4293 == prop_fail_result)
4294 RRETURN(MATCH_NOMATCH);
4295 }
4296 break;
4297
4298 case PT_CLIST:
4299 for (i = 1; i <= min; i++)
4300 {
4301 const pcre_uint32 *cp;
4302 if (eptr >= md->end_subject)
4303 {
4304 SCHECK_PARTIAL();
4305 RRETURN(MATCH_NOMATCH);
4306 }
4307 GETCHARINCTEST(c, eptr);
4308 cp = PRIV(ucd_caseless_sets) + prop_value;
4309 for (;;)
4310 {
4311 if (c < *cp)
4312 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4313 if (c == *cp++)
4314 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4315 }
4316 }
4317 break;
4318
4319 case PT_UCNC:
4320 for (i = 1; i <= min; i++)
4321 {
4322 if (eptr >= md->end_subject)
4323 {
4324 SCHECK_PARTIAL();
4325 RRETURN(MATCH_NOMATCH);
4326 }
4327 GETCHARINCTEST(c, eptr);
4328 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
4329 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
4330 c >= 0xe000) == prop_fail_result)
4331 RRETURN(MATCH_NOMATCH);
4332 }
4333 break;
4334
4335 /* This should not occur */
4336
4337 default:
4338 RRETURN(PCRE_ERROR_INTERNAL);
4339 }
4340 }
4341
4342 /* Match extended Unicode sequences. We will get here only if the
4343 support is in the binary; otherwise a compile-time error occurs. */
4344
4345 else if (ctype == OP_EXTUNI)
4346 {
4347 for (i = 1; i <= min; i++)
4348 {
4349 if (eptr >= md->end_subject)
4350 {
4351 SCHECK_PARTIAL();
4352 RRETURN(MATCH_NOMATCH);
4353 }
4354 else
4355 {
4356 int lgb, rgb;
4357 GETCHARINCTEST(c, eptr);
4358 lgb = UCD_GRAPHBREAK(c);
4359 while (eptr < md->end_subject)
4360 {
4361 int len = 1;
4362 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4363 rgb = UCD_GRAPHBREAK(c);
4364 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4365 lgb = rgb;
4366 eptr += len;
4367 }
4368 }
4369 CHECK_PARTIAL();
4370 }
4371 }
4372
4373 else
4374 #endif /* SUPPORT_UCP */
4375
4376 /* Handle all other cases when the coding is UTF-8 */
4377
4378 #ifdef SUPPORT_UTF
4379 if (utf) switch(ctype)
4380 {
4381 case OP_ANY:
4382 for (i = 1; i <= min; i++)
4383 {
4384 if (eptr >= md->end_subject)
4385 {
4386 SCHECK_PARTIAL();
4387 RRETURN(MATCH_NOMATCH);
4388 }
4389 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4390 if (md->partial != 0 &&
4391 eptr + 1 >= md->end_subject &&
4392 NLBLOCK->nltype == NLTYPE_FIXED &&
4393 NLBLOCK->nllen == 2 &&
4394 UCHAR21(eptr) == NLBLOCK->nl[0])
4395 {
4396 md->hitend = TRUE;
4397 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4398 }
4399 eptr++;
4400 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4401 }
4402 break;
4403
4404 case OP_ALLANY:
4405 for (i = 1; i <= min; i++)
4406 {
4407 if (eptr >= md->end_subject)
4408 {
4409 SCHECK_PARTIAL();
4410 RRETURN(MATCH_NOMATCH);
4411 }
4412 eptr++;
4413 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4414 }
4415 break;
4416
4417 case OP_ANYBYTE:
4418 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4419 eptr += min;
4420 break;
4421
4422 case OP_ANYNL:
4423 for (i = 1; i <= min; i++)
4424 {
4425 if (eptr >= md->end_subject)
4426 {
4427 SCHECK_PARTIAL();
4428 RRETURN(MATCH_NOMATCH);
4429 }
4430 GETCHARINC(c, eptr);
4431 switch(c)
4432 {
4433 default: RRETURN(MATCH_NOMATCH);
4434
4435 case CHAR_CR:
4436 if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
4437 break;
4438
4439 case CHAR_LF:
4440 break;
4441
4442 case CHAR_VT:
4443 case CHAR_FF:
4444 case CHAR_NEL:
4445 #ifndef EBCDIC
4446 case 0x2028:
4447 case 0x2029:
4448 #endif /* Not EBCDIC */
4449 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4450 break;
4451 }
4452 }
4453 break;
4454
4455 case OP_NOT_HSPACE:
4456 for (i = 1; i <= min; i++)
4457 {
4458 if (eptr >= md->end_subject)
4459 {
4460 SCHECK_PARTIAL();
4461 RRETURN(MATCH_NOMATCH);
4462 }
4463 GETCHARINC(c, eptr);
4464 switch(c)
4465 {
4466 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
4467 default: break;
4468 }
4469 }
4470 break;
4471
4472 case OP_HSPACE:
4473 for (i = 1; i <= min; i++)
4474 {
4475 if (eptr >= md->end_subject)
4476 {
4477 SCHECK_PARTIAL();
4478 RRETURN(MATCH_NOMATCH);
4479 }
4480 GETCHARINC(c, eptr);
4481 switch(c)
4482 {
4483 HSPACE_CASES: break; /* Byte and multibyte cases */
4484 default: RRETURN(MATCH_NOMATCH);
4485 }
4486 }
4487 break;
4488
4489 case OP_NOT_VSPACE:
4490 for (i = 1; i <= min; i++)
4491 {
4492 if (eptr >= md->end_subject)
4493 {
4494 SCHECK_PARTIAL();
4495 RRETURN(MATCH_NOMATCH);
4496 }
4497 GETCHARINC(c, eptr);
4498 switch(c)
4499 {
4500 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4501 default: break;
4502 }
4503 }
4504 break;
4505
4506 case OP_VSPACE:
4507 for (i = 1; i <= min; i++)
4508 {
4509 if (eptr >= md->end_subject)
4510 {
4511 SCHECK_PARTIAL();
4512 RRETURN(MATCH_NOMATCH);
4513 }
4514 GETCHARINC(c, eptr);
4515 switch(c)
4516 {
4517 VSPACE_CASES: break;
4518 default: RRETURN(MATCH_NOMATCH);
4519 }
4520 }
4521 break;
4522
4523 case OP_NOT_DIGIT:
4524 for (i = 1; i <= min; i++)
4525 {
4526 if (eptr >= md->end_subject)
4527 {
4528 SCHECK_PARTIAL();
4529 RRETURN(MATCH_NOMATCH);
4530 }
4531 GETCHARINC(c, eptr);
4532 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4533 RRETURN(MATCH_NOMATCH);
4534 }
4535 break;
4536
4537 case OP_DIGIT:
4538 for (i = 1; i <= min; i++)
4539 {
4540 pcre_uint32 cc;
4541 if (eptr >= md->end_subject)
4542 {
4543 SCHECK_PARTIAL();
4544 RRETURN(MATCH_NOMATCH);
4545 }
4546 cc = UCHAR21(eptr);
4547 if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
4548 RRETURN(MATCH_NOMATCH);
4549 eptr++;
4550 /* No need to skip more bytes - we know it's a 1-byte character */
4551 }
4552 break;
4553
4554 case OP_NOT_WHITESPACE:
4555 for (i = 1; i <= min; i++)
4556 {
4557 pcre_uint32 cc;
4558 if (eptr >= md->end_subject)
4559 {
4560 SCHECK_PARTIAL();
4561 RRETURN(MATCH_NOMATCH);
4562 }
4563 cc = UCHAR21(eptr);
4564 if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
4565 RRETURN(MATCH_NOMATCH);
4566 eptr++;
4567 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4568 }
4569 break;
4570
4571 case OP_WHITESPACE:
4572 for (i = 1; i <= min; i++)
4573 {
4574 pcre_uint32 cc;
4575 if (eptr >= md->end_subject)
4576 {
4577 SCHECK_PARTIAL();
4578 RRETURN(MATCH_NOMATCH);
4579 }
4580 cc = UCHAR21(eptr);
4581 if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
4582 RRETURN(MATCH_NOMATCH);
4583 eptr++;
4584 /* No need to skip more bytes - we know it's a 1-byte character */
4585 }
4586 break;
4587
4588 case OP_NOT_WORDCHAR:
4589 for (i = 1; i <= min; i++)
4590 {
4591 pcre_uint32 cc;
4592 if (eptr >= md->end_subject)
4593 {
4594 SCHECK_PARTIAL();
4595 RRETURN(MATCH_NOMATCH);
4596 }
4597 cc = UCHAR21(eptr);
4598 if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
4599 RRETURN(MATCH_NOMATCH);
4600 eptr++;
4601 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4602 }
4603 break;
4604
4605 case OP_WORDCHAR:
4606 for (i = 1; i <= min; i++)
4607 {
4608 pcre_uint32 cc;
4609 if (eptr >= md->end_subject)
4610 {
4611 SCHECK_PARTIAL();
4612 RRETURN(MATCH_NOMATCH);
4613 }
4614 cc = UCHAR21(eptr);
4615 if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
4616 RRETURN(MATCH_NOMATCH);
4617 eptr++;
4618 /* No need to skip more bytes - we know it's a 1-byte character */
4619 }
4620 break;
4621
4622 default:
4623 RRETURN(PCRE_ERROR_INTERNAL);
4624 } /* End switch(ctype) */
4625
4626 else
4627 #endif /* SUPPORT_UTF */
4628
4629 /* Code for the non-UTF-8 case for minimum matching of operators other
4630 than OP_PROP and OP_NOTPROP. */
4631
4632 switch(ctype)
4633 {
4634 case OP_ANY:
4635 for (i = 1; i <= min; i++)
4636 {
4637 if (eptr >= md->end_subject)
4638 {
4639 SCHECK_PARTIAL();
4640 RRETURN(MATCH_NOMATCH);
4641 }
4642 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4643 if (md->partial != 0 &&
4644 eptr + 1 >= md->end_subject &&
4645 NLBLOCK->nltype == NLTYPE_FIXED &&
4646 NLBLOCK->nllen == 2 &&
4647 *eptr == NLBLOCK->nl[0])
4648 {
4649 md->hitend = TRUE;
4650 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4651 }
4652 eptr++;
4653 }
4654 break;
4655
4656 case OP_ALLANY:
4657 if (eptr > md->end_subject - min)
4658 {
4659 SCHECK_PARTIAL();
4660 RRETURN(MATCH_NOMATCH);
4661 }
4662 eptr += min;
4663 break;
4664
4665 case OP_ANYBYTE:
4666 if (eptr > md->end_subject - min)
4667 {
4668 SCHECK_PARTIAL();
4669 RRETURN(MATCH_NOMATCH);
4670 }
4671 eptr += min;
4672 break;
4673
4674 case OP_ANYNL:
4675 for (i = 1; i <= min; i++)
4676 {
4677 if (eptr >= md->end_subject)
4678 {
4679 SCHECK_PARTIAL();
4680 RRETURN(MATCH_NOMATCH);
4681 }
4682 switch(*eptr++)
4683 {
4684 default: RRETURN(MATCH_NOMATCH);
4685
4686 case CHAR_CR:
4687 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4688 break;
4689
4690 case CHAR_LF:
4691 break;
4692
4693 case CHAR_VT:
4694 case CHAR_FF:
4695 case CHAR_NEL:
4696 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4697 case 0x2028:
4698 case 0x2029:
4699 #endif
4700 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4701 break;
4702 }
4703 }
4704 break;
4705
4706 case OP_NOT_HSPACE:
4707 for (i = 1; i <= min; i++)
4708 {
4709 if (eptr >= md->end_subject)
4710 {
4711 SCHECK_PARTIAL();
4712 RRETURN(MATCH_NOMATCH);
4713 }
4714 switch(*eptr++)
4715 {
4716 default: break;
4717 HSPACE_BYTE_CASES:
4718 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4719 HSPACE_MULTIBYTE_CASES:
4720 #endif
4721 RRETURN(MATCH_NOMATCH);
4722 }
4723 }
4724 break;
4725
4726 case OP_HSPACE:
4727 for (i = 1; i <= min; i++)
4728 {
4729 if (eptr >= md->end_subject)
4730 {
4731 SCHECK_PARTIAL();
4732 RRETURN(MATCH_NOMATCH);
4733 }
4734 switch(*eptr++)
4735 {
4736 default: RRETURN(MATCH_NOMATCH);
4737 HSPACE_BYTE_CASES:
4738 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4739 HSPACE_MULTIBYTE_CASES:
4740 #endif
4741 break;
4742 }
4743 }
4744 break;
4745
4746 case OP_NOT_VSPACE:
4747 for (i = 1; i <= min; i++)
4748 {
4749 if (eptr >= md->end_subject)
4750 {
4751 SCHECK_PARTIAL();
4752 RRETURN(MATCH_NOMATCH);
4753 }
4754 switch(*eptr++)
4755 {
4756 VSPACE_BYTE_CASES:
4757 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4758 VSPACE_MULTIBYTE_CASES:
4759 #endif
4760 RRETURN(MATCH_NOMATCH);
4761 default: break;
4762 }
4763 }
4764 break;
4765
4766 case OP_VSPACE:
4767 for (i = 1; i <= min; i++)
4768 {
4769 if (eptr >= md->end_subject)
4770 {
4771 SCHECK_PARTIAL();
4772 RRETURN(MATCH_NOMATCH);
4773 }
4774 switch(*eptr++)
4775 {
4776 default: RRETURN(MATCH_NOMATCH);
4777 VSPACE_BYTE_CASES:
4778 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4779 VSPACE_MULTIBYTE_CASES:
4780 #endif
4781 break;
4782 }
4783 }
4784 break;
4785
4786 case OP_NOT_DIGIT:
4787 for (i = 1; i <= min; i++)
4788 {
4789 if (eptr >= md->end_subject)
4790 {
4791 SCHECK_PARTIAL();
4792 RRETURN(MATCH_NOMATCH);
4793 }
4794 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4795 RRETURN(MATCH_NOMATCH);
4796 eptr++;
4797 }
4798 break;
4799
4800 case OP_DIGIT:
4801 for (i = 1; i <= min; i++)
4802 {
4803 if (eptr >= md->end_subject)
4804 {
4805 SCHECK_PARTIAL();
4806 RRETURN(MATCH_NOMATCH);
4807 }
4808 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4809 RRETURN(MATCH_NOMATCH);
4810 eptr++;
4811 }
4812 break;
4813
4814 case OP_NOT_WHITESPACE:
4815 for (i = 1; i <= min; i++)
4816 {
4817 if (eptr >= md->end_subject)
4818 {
4819 SCHECK_PARTIAL();
4820 RRETURN(MATCH_NOMATCH);
4821 }
4822 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4823 RRETURN(MATCH_NOMATCH);
4824 eptr++;
4825 }
4826 break;
4827
4828 case OP_WHITESPACE:
4829 for (i = 1; i <= min; i++)
4830 {
4831 if (eptr >= md->end_subject)
4832 {
4833 SCHECK_PARTIAL();
4834 RRETURN(MATCH_NOMATCH);
4835 }
4836 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4837 RRETURN(MATCH_NOMATCH);
4838 eptr++;
4839 }
4840 break;
4841
4842 case OP_NOT_WORDCHAR:
4843 for (i = 1; i <= min; i++)
4844 {
4845 if (eptr >= md->end_subject)
4846 {
4847 SCHECK_PARTIAL();
4848 RRETURN(MATCH_NOMATCH);
4849 }
4850 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4851 RRETURN(MATCH_NOMATCH);
4852 eptr++;
4853 }
4854 break;
4855
4856 case OP_WORDCHAR:
4857 for (i = 1; i <= min; i++)
4858 {
4859 if (eptr >= md->end_subject)
4860 {
4861 SCHECK_PARTIAL();
4862 RRETURN(MATCH_NOMATCH);
4863 }
4864 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4865 RRETURN(MATCH_NOMATCH);
4866 eptr++;
4867 }
4868 break;
4869
4870 default:
4871 RRETURN(PCRE_ERROR_INTERNAL);
4872 }
4873 }
4874
4875 /* If min = max, continue at the same level without recursing */
4876
4877 if (min == max) continue;
4878
4879 /* If minimizing, we have to test the rest of the pattern before each
4880 subsequent match. Again, separate the UTF-8 case for speed, and also
4881 separate the UCP cases. */
4882
4883 if (minimize)
4884 {
4885 #ifdef SUPPORT_UCP
4886 if (prop_type >= 0)
4887 {
4888 switch(prop_type)
4889 {
4890 case PT_ANY:
4891 for (fi = min;; fi++)
4892 {
4893 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4894 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4895 if (fi >= max) RRETURN(MATCH_NOMATCH);
4896 if (eptr >= md->end_subject)
4897 {
4898 SCHECK_PARTIAL();
4899 RRETURN(MATCH_NOMATCH);
4900 }
4901 GETCHARINCTEST(c, eptr);
4902 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4903 }
4904 /* Control never gets here */
4905
4906 case PT_LAMP:
4907 for (fi = min;; fi++)
4908 {
4909 int chartype;
4910 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4911 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4912 if (fi >= max) RRETURN(MATCH_NOMATCH);
4913 if (eptr >= md->end_subject)
4914 {
4915 SCHECK_PARTIAL();
4916 RRETURN(MATCH_NOMATCH);
4917 }
4918 GETCHARINCTEST(c, eptr);
4919 chartype = UCD_CHARTYPE(c);
4920 if ((chartype == ucp_Lu ||
4921 chartype == ucp_Ll ||
4922 chartype == ucp_Lt) == prop_fail_result)
4923 RRETURN(MATCH_NOMATCH);
4924 }
4925 /* Control never gets here */
4926
4927 case PT_GC:
4928 for (fi = min;; fi++)
4929 {
4930 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4931 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4932 if (fi >= max) RRETURN(MATCH_NOMATCH);
4933 if (eptr >= md->end_subject)
4934 {
4935 SCHECK_PARTIAL();
4936 RRETURN(MATCH_NOMATCH);
4937 }
4938 GETCHARINCTEST(c, eptr);
4939 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4940 RRETURN(MATCH_NOMATCH);
4941 }
4942 /* Control never gets here */
4943
4944 case PT_PC:
4945 for (fi = min;; fi++)
4946 {
4947 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4948 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4949 if (fi >= max) RRETURN(MATCH_NOMATCH);
4950 if (eptr >= md->end_subject)
4951 {
4952 SCHECK_PARTIAL();
4953 RRETURN(MATCH_NOMATCH);
4954 }
4955 GETCHARINCTEST(c, eptr);
4956 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4957 RRETURN(MATCH_NOMATCH);
4958 }
4959 /* Control never gets here */
4960
4961 case PT_SC:
4962 for (fi = min;; fi++)
4963 {
4964 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4965 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4966 if (fi >= max) RRETURN(MATCH_NOMATCH);
4967 if (eptr >= md->end_subject)
4968 {
4969 SCHECK_PARTIAL();
4970 RRETURN(MATCH_NOMATCH);
4971 }
4972 GETCHARINCTEST(c, eptr);
4973 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4974 RRETURN(MATCH_NOMATCH);
4975 }
4976 /* Control never gets here */
4977
4978 case PT_ALNUM:
4979 for (fi = min;; fi++)
4980 {
4981 int category;
4982 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4983 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4984 if (fi >= max) RRETURN(MATCH_NOMATCH);
4985 if (eptr >= md->end_subject)
4986 {
4987 SCHECK_PARTIAL();
4988 RRETURN(MATCH_NOMATCH);
4989 }
4990 GETCHARINCTEST(c, eptr);
4991 category = UCD_CATEGORY(c);
4992 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4993 RRETURN(MATCH_NOMATCH);
4994 }
4995 /* Control never gets here */
4996
4997 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
4998 which means that Perl space and POSIX space are now identical. PCRE
4999 was changed at release 8.34. */
5000
5001 case PT_SPACE: /* Perl space */
5002 case PT_PXSPACE: /* POSIX space */
5003 for (fi = min;; fi++)
5004 {
5005 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
5006 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5007 if (fi >= max) RRETURN(MATCH_NOMATCH);
5008 if (eptr >= md->end_subject)
5009 {
5010 SCHECK_PARTIAL();
5011 RRETURN(MATCH_NOMATCH);
5012 }
5013 GETCHARINCTEST(c, eptr);
5014 switch(c)
5015 {
5016 HSPACE_CASES:
5017 VSPACE_CASES:
5018 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
5019 break;
5020
5021 default:
5022 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
5023 RRETURN(MATCH_NOMATCH);
5024 break;
5025 }
5026 }
5027 /* Control never gets here */
5028
5029 case PT_WORD:
5030 for (fi = min;; fi++)
5031 {
5032 int category;
5033 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
5034 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5035 if (fi >= max) RRETURN(MATCH_NOMATCH);
5036 if (eptr >= md->end_subject)
5037 {
5038 SCHECK_PARTIAL();
5039 RRETURN(MATCH_NOMATCH);
5040 }
5041 GETCHARINCTEST(c, eptr);
5042 category = UCD_CATEGORY(c);
5043 if ((category == ucp_L ||
5044 category == ucp_N ||
5045 c == CHAR_UNDERSCORE)
5046 == prop_fail_result)
5047 RRETURN(MATCH_NOMATCH);
5048 }
5049 /* Control never gets here */
5050
5051 case PT_CLIST:
5052 for (fi = min;; fi++)
5053 {
5054 const pcre_uint32 *cp;
5055 RMATCH(eptr, ecode, offset_top, md, eptrb, RM67);
5056 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5057 if (fi >= max) RRETURN(MATCH_NOMATCH);
5058 if (eptr >= md->end_subject)
5059 {
5060 SCHECK_PARTIAL();
5061 RRETURN(MATCH_NOMATCH);
5062 }
5063 GETCHARINCTEST(c, eptr);
5064 cp = PRIV(ucd_caseless_sets) + prop_value;
5065 for (;;)
5066 {
5067 if (c < *cp)
5068 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
5069 if (c == *cp++)
5070 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
5071 }
5072 }
5073 /* Control never gets here */
5074
5075 case PT_UCNC:
5076 for (fi = min;; fi++)
5077 {
5078 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
5079 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5080 if (fi >= max) RRETURN(MATCH_NOMATCH);
5081 if (eptr >= md->end_subject)
5082 {
5083 SCHECK_PARTIAL();
5084 RRETURN(MATCH_NOMATCH);
5085 }
5086 GETCHARINCTEST(c, eptr);
5087 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5088 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5089 c >= 0xe000) == prop_fail_result)
5090 RRETURN(MATCH_NOMATCH);
5091 }
5092 /* Control never gets here */
5093
5094 /* This should never occur */
5095 default:
5096 RRETURN(PCRE_ERROR_INTERNAL);
5097 }
5098 }
5099
5100 /* Match extended Unicode sequences. We will get here only if the
5101 support is in the binary; otherwise a compile-time error occurs. */
5102
5103 else if (ctype == OP_EXTUNI)
5104 {
5105 for (fi = min;; fi++)
5106 {
5107 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5108 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5109 if (fi >= max) RRETURN(MATCH_NOMATCH);
5110 if (eptr >= md->end_subject)
5111 {
5112 SCHECK_PARTIAL();
5113 RRETURN(MATCH_NOMATCH);
5114 }
5115 else
5116 {
5117 int lgb, rgb;
5118 GETCHARINCTEST(c, eptr);
5119 lgb = UCD_GRAPHBREAK(c);
5120 while (eptr < md->end_subject)
5121 {
5122 int len = 1;
5123 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5124 rgb = UCD_GRAPHBREAK(c);
5125 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5126 lgb = rgb;
5127 eptr += len;
5128 }
5129 }
5130 CHECK_PARTIAL();
5131 }
5132 }
5133 else
5134 #endif /* SUPPORT_UCP */
5135
5136 #ifdef SUPPORT_UTF
5137 if (utf)
5138 {
5139 for (fi = min;; fi++)
5140 {
5141 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5142 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5143 if (fi >= max) RRETURN(MATCH_NOMATCH);
5144 if (eptr >= md->end_subject)
5145 {
5146 SCHECK_PARTIAL();
5147 RRETURN(MATCH_NOMATCH);
5148 }
5149 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5150 RRETURN(MATCH_NOMATCH);
5151 GETCHARINC(c, eptr);
5152 switch(ctype)
5153 {
5154 case OP_ANY: /* This is the non-NL case */
5155 if (md->partial != 0 && /* Take care with CRLF partial */
5156 eptr >= md->end_subject &&
5157 NLBLOCK->nltype == NLTYPE_FIXED &&
5158 NLBLOCK->nllen == 2 &&
5159 c == NLBLOCK->nl[0])
5160 {
5161 md->hitend = TRUE;
5162 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5163 }
5164 break;
5165
5166 case OP_ALLANY:
5167 case OP_ANYBYTE:
5168 break;
5169
5170 case OP_ANYNL:
5171 switch(c)
5172 {
5173 default: RRETURN(MATCH_NOMATCH);
5174 case CHAR_CR:
5175 if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
5176 break;
5177
5178 case CHAR_LF:
5179 break;
5180
5181 case CHAR_VT:
5182 case CHAR_FF:
5183 case CHAR_NEL:
5184 #ifndef EBCDIC
5185 case 0x2028:
5186 case 0x2029:
5187 #endif /* Not EBCDIC */
5188 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5189 break;
5190 }
5191 break;
5192
5193 case OP_NOT_HSPACE:
5194 switch(c)
5195 {
5196 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
5197 default: break;
5198 }
5199 break;
5200
5201 case OP_HSPACE:
5202 switch(c)
5203 {
5204 HSPACE_CASES: break;
5205 default: RRETURN(MATCH_NOMATCH);
5206 }
5207 break;
5208
5209 case OP_NOT_VSPACE:
5210 switch(c)
5211 {
5212 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
5213 default: break;
5214 }
5215 break;
5216
5217 case OP_VSPACE:
5218 switch(c)
5219 {
5220 VSPACE_CASES: break;
5221 default: RRETURN(MATCH_NOMATCH);
5222 }
5223 break;
5224
5225 case OP_NOT_DIGIT:
5226 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5227 RRETURN(MATCH_NOMATCH);
5228 break;
5229
5230 case OP_DIGIT:
5231 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5232 RRETURN(MATCH_NOMATCH);
5233 break;
5234
5235 case OP_NOT_WHITESPACE:
5236 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5237 RRETURN(MATCH_NOMATCH);
5238 break;
5239
5240 case OP_WHITESPACE:
5241 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5242 RRETURN(MATCH_NOMATCH);
5243 break;
5244
5245 case OP_NOT_WORDCHAR:
5246 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5247 RRETURN(MATCH_NOMATCH);
5248 break;
5249
5250 case OP_WORDCHAR:
5251 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5252 RRETURN(MATCH_NOMATCH);
5253 break;
5254
5255 default:
5256 RRETURN(PCRE_ERROR_INTERNAL);
5257 }
5258 }
5259 }
5260 else
5261 #endif
5262 /* Not UTF mode */
5263 {
5264 for (fi = min;; fi++)
5265 {
5266 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5267 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5268 if (fi >= max) RRETURN(MATCH_NOMATCH);
5269 if (eptr >= md->end_subject)
5270 {
5271 SCHECK_PARTIAL();
5272 RRETURN(MATCH_NOMATCH);
5273 }
5274 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5275 RRETURN(MATCH_NOMATCH);
5276 c = *eptr++;
5277 switch(ctype)
5278 {
5279 case OP_ANY: /* This is the non-NL case */
5280 if (md->partial != 0 && /* Take care with CRLF partial */
5281 eptr >= md->end_subject &&
5282 NLBLOCK->nltype == NLTYPE_FIXED &&
5283 NLBLOCK->nllen == 2 &&
5284 c == NLBLOCK->nl[0])
5285 {
5286 md->hitend = TRUE;
5287 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5288 }
5289 break;
5290
5291 case OP_ALLANY:
5292 case OP_ANYBYTE:
5293 break;
5294
5295 case OP_ANYNL:
5296 switch(c)
5297 {
5298 default: RRETURN(MATCH_NOMATCH);
5299 case CHAR_CR:
5300 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5301 break;
5302
5303 case CHAR_LF:
5304 break;
5305
5306 case CHAR_VT:
5307 case CHAR_FF:
5308 case CHAR_NEL:
5309 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5310 case 0x2028:
5311 case 0x2029:
5312 #endif
5313 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5314 break;
5315 }
5316 break;
5317
5318 case OP_NOT_HSPACE:
5319 switch(c)
5320 {
5321 default: break;
5322 HSPACE_BYTE_CASES:
5323 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5324 HSPACE_MULTIBYTE_CASES:
5325 #endif
5326 RRETURN(MATCH_NOMATCH);
5327 }
5328 break;
5329
5330 case OP_HSPACE:
5331 switch(c)
5332 {
5333 default: RRETURN(MATCH_NOMATCH);
5334 HSPACE_BYTE_CASES:
5335 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5336 HSPACE_MULTIBYTE_CASES:
5337 #endif
5338 break;
5339 }
5340 break;
5341
5342 case OP_NOT_VSPACE:
5343 switch(c)
5344 {
5345 default: break;
5346 VSPACE_BYTE_CASES:
5347 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5348 VSPACE_MULTIBYTE_CASES:
5349 #endif
5350 RRETURN(MATCH_NOMATCH);
5351 }
5352 break;
5353
5354 case OP_VSPACE:
5355 switch(c)
5356 {
5357 default: RRETURN(MATCH_NOMATCH);
5358 VSPACE_BYTE_CASES:
5359 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5360 VSPACE_MULTIBYTE_CASES:
5361 #endif
5362 break;
5363 }
5364 break;
5365
5366 case OP_NOT_DIGIT:
5367 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5368 break;
5369
5370 case OP_DIGIT:
5371 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5372 break;
5373
5374 case OP_NOT_WHITESPACE:
5375 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5376 break;
5377
5378 case OP_WHITESPACE:
5379 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5380 break;
5381
5382 case OP_NOT_WORDCHAR:
5383 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5384 break;
5385
5386 case OP_WORDCHAR:
5387 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5388 break;
5389
5390 default:
5391 RRETURN(PCRE_ERROR_INTERNAL);
5392 }
5393 }
5394 }
5395 /* Control never gets here */
5396 }
5397
5398 /* If maximizing, it is worth using inline code for speed, doing the type
5399 test once at the start (i.e. keep it out of the loop). Again, keep the
5400 UTF-8 and UCP stuff separate. */
5401
5402 else
5403 {
5404 pp = eptr; /* Remember where we started */
5405
5406 #ifdef SUPPORT_UCP
5407 if (prop_type >= 0)
5408 {
5409 switch(prop_type)
5410 {
5411 case PT_ANY:
5412 for (i = min; i < max; i++)
5413 {
5414 int len = 1;
5415 if (eptr >= md->end_subject)
5416 {
5417 SCHECK_PARTIAL();
5418 break;
5419 }
5420 GETCHARLENTEST(c, eptr, len);
5421 if (prop_fail_result) break;
5422 eptr+= len;
5423 }
5424 break;
5425
5426 case PT_LAMP:
5427 for (i = min; i < max; i++)
5428 {
5429 int chartype;
5430 int len = 1;
5431 if (eptr >= md->end_subject)
5432 {
5433 SCHECK_PARTIAL();
5434 break;
5435 }
5436 GETCHARLENTEST(c, eptr, len);
5437 chartype = UCD_CHARTYPE(c);
5438 if ((chartype == ucp_Lu ||
5439 chartype == ucp_Ll ||
5440 chartype == ucp_Lt) == prop_fail_result)
5441 break;
5442 eptr+= len;
5443 }
5444 break;
5445
5446 case PT_GC:
5447 for (i = min; i < max; i++)
5448 {
5449 int len = 1;
5450 if (eptr >= md->end_subject)
5451 {
5452 SCHECK_PARTIAL();
5453 break;
5454 }
5455 GETCHARLENTEST(c, eptr, len);
5456 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5457 eptr+= len;
5458 }
5459 break;
5460
5461 case PT_PC:
5462 for (i = min; i < max; i++)
5463 {
5464 int len = 1;
5465 if (eptr >= md->end_subject)
5466 {
5467 SCHECK_PARTIAL();
5468 break;
5469 }
5470 GETCHARLENTEST(c, eptr, len);
5471 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5472 eptr+= len;
5473 }
5474 break;
5475
5476 case PT_SC:
5477 for (i = min; i < max; i++)
5478 {
5479 int len = 1;
5480 if (eptr >= md->end_subject)
5481 {
5482 SCHECK_PARTIAL();
5483 break;
5484 }
5485 GETCHARLENTEST(c, eptr, len);
5486 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5487 eptr+= len;
5488 }
5489 break;
5490
5491 case PT_ALNUM:
5492 for (i = min; i < max; i++)
5493 {
5494 int category;
5495 int len = 1;
5496 if (eptr >= md->end_subject)
5497 {
5498 SCHECK_PARTIAL();
5499 break;
5500 }
5501 GETCHARLENTEST(c, eptr, len);
5502 category = UCD_CATEGORY(c);
5503 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5504 break;
5505 eptr+= len;
5506 }
5507 break;
5508
5509 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
5510 which means that Perl space and POSIX space are now identical. PCRE
5511 was changed at release 8.34. */
5512
5513 case PT_SPACE: /* Perl space */
5514 case PT_PXSPACE: /* POSIX space */
5515 for (i = min; i < max; i++)
5516 {
5517 int len = 1;
5518 if (eptr >= md->end_subject)
5519 {
5520 SCHECK_PARTIAL();
5521 break;
5522 }
5523 GETCHARLENTEST(c, eptr, len);
5524 switch(c)
5525 {
5526 HSPACE_CASES:
5527 VSPACE_CASES:
5528 if (prop_fail_result) goto ENDLOOP99; /* Break the loop */
5529 break;
5530
5531 default:
5532 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
5533 goto ENDLOOP99; /* Break the loop */
5534 break;
5535 }
5536 eptr+= len;
5537 }
5538 ENDLOOP99:
5539 break;
5540
5541 case PT_WORD:
5542 for (i = min; i < max; i++)
5543 {
5544 int category;
5545 int len = 1;
5546 if (eptr >= md->end_subject)
5547 {
5548 SCHECK_PARTIAL();
5549 break;
5550 }
5551 GETCHARLENTEST(c, eptr, len);
5552 category = UCD_CATEGORY(c);
5553 if ((category == ucp_L || category == ucp_N ||
5554 c == CHAR_UNDERSCORE) == prop_fail_result)
5555 break;
5556 eptr+= len;
5557 }
5558 break;
5559
5560 case PT_CLIST:
5561 for (i = min; i < max; i++)
5562 {
5563 const pcre_uint32 *cp;
5564 int len = 1;
5565 if (eptr >= md->end_subject)
5566 {
5567 SCHECK_PARTIAL();
5568 break;
5569 }
5570 GETCHARLENTEST(c, eptr, len);
5571 cp = PRIV(ucd_caseless_sets) + prop_value;
5572 for (;;)
5573 {
5574 if (c < *cp)
5575 { if (prop_fail_result) break; else goto GOT_MAX; }
5576 if (c == *cp++)
5577 { if (prop_fail_result) goto GOT_MAX; else break; }
5578 }
5579 eptr += len;
5580 }
5581 GOT_MAX:
5582 break;
5583
5584 case PT_UCNC:
5585 for (i = min; i < max; i++)
5586 {
5587 int len = 1;
5588 if (eptr >= md->end_subject)
5589 {
5590 SCHECK_PARTIAL();
5591 break;
5592 }
5593 GETCHARLENTEST(c, eptr, len);
5594 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5595 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5596 c >= 0xe000) == prop_fail_result)
5597 break;
5598 eptr += len;
5599 }
5600 break;
5601
5602 default:
5603 RRETURN(PCRE_ERROR_INTERNAL);
5604 }
5605
5606 /* eptr is now past the end of the maximum run */
5607
5608 if (possessive) continue; /* No backtracking */
5609 for(;;)
5610 {
5611 if (eptr <= pp) goto TAIL_RECURSE;
5612 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5613 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5614 eptr--;
5615 if (utf) BACKCHAR(eptr);
5616 }
5617 }
5618
5619 /* Match extended Unicode grapheme clusters. We will get here only if the
5620 support is in the binary; otherwise a compile-time error occurs. */
5621
5622 else if (ctype == OP_EXTUNI)
5623 {
5624 for (i = min; i < max; i++)
5625 {
5626 if (eptr >= md->end_subject)
5627 {
5628 SCHECK_PARTIAL();
5629 break;
5630 }
5631 else
5632 {
5633 int lgb, rgb;
5634 GETCHARINCTEST(c, eptr);
5635 lgb = UCD_GRAPHBREAK(c);
5636 while (eptr < md->end_subject)
5637 {
5638 int len = 1;
5639 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5640 rgb = UCD_GRAPHBREAK(c);
5641 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5642 lgb = rgb;
5643 eptr += len;
5644 }
5645 }
5646 CHECK_PARTIAL();
5647 }
5648
5649 /* eptr is now past the end of the maximum run */
5650
5651 if (possessive) continue; /* No backtracking */
5652
5653 /* We use <= pp rather than == pp to detect the start of the run while
5654 backtracking because the use of \C in UTF mode can cause BACKCHAR to
5655 move back past pp. This is just palliative; the use of \C in UTF mode
5656 is fraught with danger. */
5657
5658 for(;;)
5659 {
5660 int lgb, rgb;
5661 PCRE_PUCHAR fptr;
5662
5663 if (eptr <= pp) goto TAIL_RECURSE; /* At start of char run */
5664 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5665 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5666
5667 /* Backtracking over an extended grapheme cluster involves inspecting
5668 the previous two characters (if present) to see if a break is
5669 permitted between them. */
5670
5671 eptr--;
5672 if (!utf) c = *eptr; else
5673 {
5674 BACKCHAR(eptr);
5675 GETCHAR(c, eptr);
5676 }
5677 rgb = UCD_GRAPHBREAK(c);
5678
5679 for (;;)
5680 {
5681 if (eptr <= pp) goto TAIL_RECURSE; /* At start of char run */
5682 fptr = eptr - 1;
5683 if (!utf) c = *fptr; else
5684 {
5685 BACKCHAR(fptr);
5686 GETCHAR(c, fptr);
5687 }
5688 lgb = UCD_GRAPHBREAK(c);
5689 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5690 eptr = fptr;
5691 rgb = lgb;
5692 }
5693 }
5694 }
5695
5696 else
5697 #endif /* SUPPORT_UCP */
5698
5699 #ifdef SUPPORT_UTF
5700 if (utf)
5701 {
5702 switch(ctype)
5703 {
5704 case OP_ANY:
5705 for (i = min; i < max; i++)
5706 {
5707 if (eptr >= md->end_subject)
5708 {
5709 SCHECK_PARTIAL();
5710 break;
5711 }
5712 if (IS_NEWLINE(eptr)) break;
5713 if (md->partial != 0 && /* Take care with CRLF partial */
5714 eptr + 1 >= md->end_subject &&
5715 NLBLOCK->nltype == NLTYPE_FIXED &&
5716 NLBLOCK->nllen == 2 &&
5717 UCHAR21(eptr) == NLBLOCK->nl[0])
5718 {
5719 md->hitend = TRUE;
5720 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5721 }
5722 eptr++;
5723 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5724 }
5725 break;
5726
5727 case OP_ALLANY:
5728 if (max < INT_MAX)
5729 {
5730 for (i = min; i < max; i++)
5731 {
5732 if (eptr >= md->end_subject)
5733 {
5734 SCHECK_PARTIAL();
5735 break;
5736 }
5737 eptr++;
5738 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5739 }
5740 }
5741 else
5742 {
5743 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5744 SCHECK_PARTIAL();
5745 }
5746 break;
5747
5748 /* The byte case is the same as non-UTF8 */
5749
5750 case OP_ANYBYTE:
5751 c = max - min;
5752 if (c > (unsigned int)(md->end_subject - eptr))
5753 {
5754 eptr = md->end_subject;
5755 SCHECK_PARTIAL();
5756 }
5757 else eptr += c;
5758 break;
5759
5760 case OP_ANYNL:
5761 for (i = min; i < max; i++)
5762 {
5763 int len = 1;
5764 if (eptr >= md->end_subject)
5765 {
5766 SCHECK_PARTIAL();
5767 break;
5768 }
5769 GETCHARLEN(c, eptr, len);
5770 if (c == CHAR_CR)
5771 {
5772 if (++eptr >= md->end_subject) break;
5773 if (UCHAR21(eptr) == CHAR_LF) eptr++;
5774 }
5775 else
5776 {
5777 if (c != CHAR_LF &&
5778 (md->bsr_anycrlf ||
5779 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5780 #ifndef EBCDIC
5781 && c != 0x2028 && c != 0x2029
5782 #endif /* Not EBCDIC */
5783 )))
5784 break;
5785 eptr += len;
5786 }
5787 }
5788 break;
5789
5790 case OP_NOT_HSPACE:
5791 case OP_HSPACE:
5792 for (i = min; i < max; i++)
5793 {
5794 BOOL gotspace;
5795 int len = 1;
5796 if (eptr >= md->end_subject)
5797 {
5798 SCHECK_PARTIAL();
5799 break;
5800 }
5801 GETCHARLEN(c, eptr, len);
5802 switch(c)
5803 {
5804 HSPACE_CASES: gotspace = TRUE; break;
5805 default: gotspace = FALSE; break;
5806 }
5807 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5808 eptr += len;
5809 }
5810 break;
5811
5812 case OP_NOT_VSPACE:
5813 case OP_VSPACE:
5814 for (i = min; i < max; i++)
5815 {
5816 BOOL gotspace;
5817 int len = 1;
5818 if (eptr >= md->end_subject)
5819 {
5820 SCHECK_PARTIAL();
5821 break;
5822 }
5823 GETCHARLEN(c, eptr, len);
5824 switch(c)
5825 {
5826 VSPACE_CASES: gotspace = TRUE; break;
5827 default: gotspace = FALSE; break;
5828 }
5829 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5830 eptr += len;
5831 }
5832 break;
5833
5834 case OP_NOT_DIGIT:
5835 for (i = min; i < max; i++)
5836 {
5837 int len = 1;
5838 if (eptr >= md->end_subject)
5839 {
5840 SCHECK_PARTIAL();
5841 break;
5842 }
5843 GETCHARLEN(c, eptr, len);
5844 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5845 eptr+= len;
5846 }
5847 break;
5848
5849 case OP_DIGIT:
5850 for (i = min; i < max; i++)
5851 {
5852 int len = 1;
5853 if (eptr >= md->end_subject)
5854 {
5855 SCHECK_PARTIAL();
5856 break;
5857 }
5858 GETCHARLEN(c, eptr, len);
5859 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5860 eptr+= len;
5861 }
5862 break;
5863
5864 case OP_NOT_WHITESPACE:
5865 for (i = min; i < max; i++)
5866 {
5867 int len = 1;
5868 if (eptr >= md->end_subject)
5869 {
5870 SCHECK_PARTIAL();
5871 break;
5872 }
5873 GETCHARLEN(c, eptr, len);
5874 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5875 eptr+= len;
5876 }
5877 break;
5878
5879 case OP_WHITESPACE:
5880 for (i = min; i < max; i++)
5881 {
5882 int len = 1;
5883 if (eptr >= md->end_subject)
5884 {
5885 SCHECK_PARTIAL();
5886 break;
5887 }
5888 GETCHARLEN(c, eptr, len);
5889 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5890 eptr+= len;
5891 }
5892 break;
5893
5894 case OP_NOT_WORDCHAR:
5895 for (i = min; i < max; i++)
5896 {
5897 int len = 1;
5898 if (eptr >= md->end_subject)
5899 {
5900 SCHECK_PARTIAL();
5901 break;
5902 }
5903 GETCHARLEN(c, eptr, len);
5904 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5905 eptr+= len;
5906 }
5907 break;
5908
5909 case OP_WORDCHAR:
5910 for (i = min; i < max; i++)
5911 {
5912 int len = 1;
5913 if (eptr >= md->end_subject)
5914 {
5915 SCHECK_PARTIAL();
5916 break;
5917 }
5918 GETCHARLEN(c, eptr, len);
5919 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5920 eptr+= len;
5921 }
5922 break;
5923
5924 default:
5925 RRETURN(PCRE_ERROR_INTERNAL);
5926 }
5927
5928 if (possessive) continue; /* No backtracking */
5929 for(;;)
5930 {
5931 if (eptr <= pp) goto TAIL_RECURSE;
5932 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5933 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5934 eptr--;
5935 BACKCHAR(eptr);
5936 if (ctype == OP_ANYNL && eptr > pp && UCHAR21(eptr) == CHAR_NL &&
5937 UCHAR21(eptr - 1) == CHAR_CR) eptr--;
5938 }
5939 }
5940 else
5941 #endif /* SUPPORT_UTF */
5942 /* Not UTF mode */
5943 {
5944 switch(ctype)
5945 {
5946 case OP_ANY:
5947 for (i = min; i < max; i++)
5948 {
5949 if (eptr >= md->end_subject)
5950 {
5951 SCHECK_PARTIAL();
5952 break;
5953 }
5954 if (IS_NEWLINE(eptr)) break;
5955 if (md->partial != 0 && /* Take care with CRLF partial */
5956 eptr + 1 >= md->end_subject &&
5957 NLBLOCK->nltype == NLTYPE_FIXED &&
5958 NLBLOCK->nllen == 2 &&
5959 *eptr == NLBLOCK->nl[0])
5960 {
5961 md->hitend = TRUE;
5962 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5963 }
5964 eptr++;
5965 }
5966 break;
5967
5968 case OP_ALLANY:
5969 case OP_ANYBYTE:
5970 c = max - min;
5971 if (c > (unsigned int)(md->end_subject - eptr))
5972 {
5973 eptr = md->end_subject;
5974 SCHECK_PARTIAL();
5975 }
5976 else eptr += c;
5977 break;
5978
5979 case OP_ANYNL:
5980 for (i = min; i < max; i++)
5981 {
5982 if (eptr >= md->end_subject)
5983 {
5984 SCHECK_PARTIAL();
5985 break;
5986 }
5987 c = *eptr;
5988 if (c == CHAR_CR)
5989 {
5990 if (++eptr >= md->end_subject) break;
5991 if (*eptr == CHAR_LF) eptr++;
5992 }
5993 else
5994 {
5995 if (c != CHAR_LF && (md->bsr_anycrlf ||
5996 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5997 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5998 && c != 0x2028 && c != 0x2029
5999 #endif
6000 ))) break;
6001 eptr++;
6002 }
6003 }
6004 break;
6005
6006 case OP_NOT_HSPACE:
6007 for (i = min; i < max; i++)
6008 {
6009 if (eptr >= md->end_subject)
6010 {
6011 SCHECK_PARTIAL();
6012 break;
6013 }
6014 switch(*eptr)
6015 {
6016 default: eptr++; break;
6017 HSPACE_BYTE_CASES:
6018 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6019 HSPACE_MULTIBYTE_CASES:
6020 #endif
6021 goto ENDLOOP00;
6022 }
6023 }
6024 ENDLOOP00:
6025 break;
6026
6027 case OP_HSPACE:
6028 for (i = min; i < max; i++)
6029 {
6030 if (eptr >= md->end_subject)
6031 {
6032 SCHECK_PARTIAL();
6033 break;
6034 }
6035 switch(*eptr)
6036 {
6037 default: goto ENDLOOP01;
6038 HSPACE_BYTE_CASES:
6039 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6040 HSPACE_MULTIBYTE_CASES:
6041 #endif
6042 eptr++; break;
6043 }
6044 }
6045 ENDLOOP01:
6046 break;
6047
6048 case OP_NOT_VSPACE:
6049 for (i = min; i < max; i++)
6050 {
6051 if (eptr >= md->end_subject)
6052 {
6053 SCHECK_PARTIAL();
6054 break;
6055 }
6056 switch(*eptr)
6057 {
6058 default: eptr++; break;
6059 VSPACE_BYTE_CASES:
6060 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6061 VSPACE_MULTIBYTE_CASES:
6062 #endif
6063 goto ENDLOOP02;
6064 }
6065 }
6066 ENDLOOP02:
6067 break;
6068
6069 case OP_VSPACE:
6070 for (i = min; i < max; i++)
6071 {
6072 if (eptr >= md->end_subject)
6073 {
6074 SCHECK_PARTIAL();
6075 break;
6076 }
6077 switch(*eptr)
6078 {
6079 default: goto ENDLOOP03;
6080 VSPACE_BYTE_CASES:
6081 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6082 VSPACE_MULTIBYTE_CASES:
6083 #endif
6084 eptr++; break;
6085 }
6086 }
6087 ENDLOOP03:
6088 break;
6089
6090 case OP_NOT_DIGIT:
6091 for (i = min; i < max; i++)
6092 {
6093 if (eptr >= md->end_subject)
6094 {
6095 SCHECK_PARTIAL();
6096 break;
6097 }
6098 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
6099 eptr++;
6100 }
6101 break;
6102
6103 case OP_DIGIT:
6104 for (i = min; i < max; i++)
6105 {
6106 if (eptr >= md->end_subject)
6107 {
6108 SCHECK_PARTIAL();
6109 break;
6110 }
6111 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
6112 eptr++;
6113 }
6114 break;
6115
6116 case OP_NOT_WHITESPACE:
6117 for (i = min; i < max; i++)
6118 {
6119 if (eptr >= md->end_subject)
6120 {
6121 SCHECK_PARTIAL();
6122 break;
6123 }
6124 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6125 eptr++;
6126 }
6127 break;
6128
6129 case OP_WHITESPACE:
6130 for (i = min; i < max; i++)
6131 {
6132 if (eptr >= md->end_subject)
6133 {
6134 SCHECK_PARTIAL();
6135 break;
6136 }
6137 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6138 eptr++;
6139 }
6140 break;
6141
6142 case OP_NOT_WORDCHAR:
6143 for (i = min; i < max; i++)
6144 {
6145 if (eptr >= md->end_subject)
6146 {
6147 SCHECK_PARTIAL();
6148 break;
6149 }
6150 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6151 eptr++;
6152 }
6153 break;
6154
6155 case OP_WORDCHAR:
6156 for (i = min; i < max; i++)
6157 {
6158 if (eptr >= md->end_subject)
6159 {
6160 SCHECK_PARTIAL();
6161 break;
6162 }
6163 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6164 eptr++;
6165 }
6166 break;
6167
6168 default:
6169 RRETURN(PCRE_ERROR_INTERNAL);
6170 }
6171
6172 if (possessive) continue; /* No backtracking */
6173 for (;;)
6174 {
6175 if (eptr == pp) goto TAIL_RECURSE;
6176 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6177 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6178 eptr--;
6179 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6180 eptr[-1] == CHAR_CR) eptr--;
6181 }
6182 }
6183
6184 /* Control never gets here */
6185 }
6186
6187 /* There's been some horrible disaster. Arrival here can only mean there is
6188 something seriously wrong in the code above or the OP_xxx definitions. */
6189
6190 default:
6191 DPRINTF(("Unknown opcode %d\n", *ecode));
6192 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6193 }
6194
6195 /* Do not stick any code in here without much thought; it is assumed
6196 that "continue" in the code above comes out to here to repeat the main
6197 loop. */
6198
6199 } /* End of main loop */
6200 /* Control never reaches here */
6201
6202
6203 /* When compiling to use the heap rather than the stack for recursive calls to
6204 match(), the RRETURN() macro jumps here. The number that is saved in
6205 frame->Xwhere indicates which label we actually want to return to. */
6206
6207 #ifdef NO_RECURSE
6208 #define LBL(val) case val: goto L_RM##val;
6209 HEAP_RETURN:
6210 switch (frame->Xwhere)
6211 {
6212 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6213 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6214 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6215 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6216 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6217 LBL(65) LBL(66)
6218 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6219 LBL(20) LBL(21)
6220 #endif
6221 #ifdef SUPPORT_UTF
6222 LBL(16) LBL(18)
6223 LBL(22) LBL(23) LBL(28) LBL(30)
6224 LBL(32) LBL(34) LBL(42) LBL(46)
6225 #ifdef SUPPORT_UCP
6226 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6227 LBL(59) LBL(60) LBL(61) LBL(62) LBL(67)
6228 #endif /* SUPPORT_UCP */
6229 #endif /* SUPPORT_UTF */
6230 default:
6231 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6232 return PCRE_ERROR_INTERNAL;
6233 }
6234 #undef LBL
6235 #endif /* NO_RECURSE */
6236 }
6237
6238
6239 /***************************************************************************
6240 ****************************************************************************
6241 RECURSION IN THE match() FUNCTION
6242
6243 Undefine all the macros that were defined above to handle this. */
6244
6245 #ifdef NO_RECURSE
6246 #undef eptr
6247 #undef ecode
6248 #undef mstart
6249 #undef offset_top
6250 #undef eptrb
6251 #undef flags
6252
6253 #undef callpat
6254 #undef charptr
6255 #undef data
6256 #undef next
6257 #undef pp
6258 #undef prev
6259 #undef saved_eptr
6260
6261 #undef new_recursive
6262
6263 #undef cur_is_word
6264 #undef condition
6265 #undef prev_is_word
6266
6267 #undef ctype
6268 #undef length
6269 #undef max
6270 #undef min
6271 #undef number
6272 #undef offset
6273 #undef op
6274 #undef save_capture_last
6275 #undef save_offset1
6276 #undef save_offset2
6277 #undef save_offset3
6278 #undef stacksave
6279
6280 #undef newptrb
6281
6282 #endif
6283
6284 /* These two are defined as macros in both cases */
6285
6286 #undef fc
6287 #undef fi
6288
6289 /***************************************************************************
6290 ***************************************************************************/
6291
6292
6293 #ifdef NO_RECURSE
6294 /*************************************************
6295 * Release allocated heap frames *
6296 *************************************************/
6297
6298 /* This function releases all the allocated frames. The base frame is on the
6299 machine stack, and so must not be freed.
6300
6301 Argument: the address of the base frame
6302 Returns: nothing
6303 */
6304
6305 static void
release_match_heapframes(heapframe * frame_base)6306 release_match_heapframes (heapframe *frame_base)
6307 {
6308 heapframe *nextframe = frame_base->Xnextframe;
6309 while (nextframe != NULL)
6310 {
6311 heapframe *oldframe = nextframe;
6312 nextframe = nextframe->Xnextframe;
6313 (PUBL(stack_free))(oldframe);
6314 }
6315 }
6316 #endif
6317
6318
6319 /*************************************************
6320 * Execute a Regular Expression *
6321 *************************************************/
6322
6323 /* This function applies a compiled re to a subject string and picks out
6324 portions of the string if it matches. Two elements in the vector are set for
6325 each substring: the offsets to the start and end of the substring.
6326
6327 Arguments:
6328 argument_re points to the compiled expression
6329 extra_data points to extra data or is NULL
6330 subject points to the subject string
6331 length length of subject string (may contain binary zeros)
6332 start_offset where to start in the subject string
6333 options option bits
6334 offsets points to a vector of ints to be filled in with offsets
6335 offsetcount the number of elements in the vector
6336
6337 Returns: > 0 => success; value is the number of elements filled in
6338 = 0 => success, but offsets is not big enough
6339 -1 => failed to match
6340 < -1 => some kind of unexpected problem
6341 */
6342
6343 #if defined COMPILE_PCRE8
6344 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
pcre_exec(const pcre * argument_re,const pcre_extra * extra_data,PCRE_SPTR subject,int length,int start_offset,int options,int * offsets,int offsetcount)6345 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6346 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6347 int offsetcount)
6348 #elif defined COMPILE_PCRE16
6349 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6350 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6351 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6352 int offsetcount)
6353 #elif defined COMPILE_PCRE32
6354 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6355 pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
6356 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
6357 int offsetcount)
6358 #endif
6359 {
6360 int rc, ocount, arg_offset_max;
6361 int newline;
6362 BOOL using_temporary_offsets = FALSE;
6363 BOOL anchored;
6364 BOOL startline;
6365 BOOL firstline;
6366 BOOL utf;
6367 BOOL has_first_char = FALSE;
6368 BOOL has_req_char = FALSE;
6369 pcre_uchar first_char = 0;
6370 pcre_uchar first_char2 = 0;
6371 pcre_uchar req_char = 0;
6372 pcre_uchar req_char2 = 0;
6373 match_data match_block;
6374 match_data *md = &match_block;
6375 const pcre_uint8 *tables;
6376 const pcre_uint8 *start_bits = NULL;
6377 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6378 PCRE_PUCHAR end_subject;
6379 PCRE_PUCHAR start_partial = NULL;
6380 PCRE_PUCHAR match_partial = NULL;
6381 PCRE_PUCHAR req_char_ptr = start_match - 1;
6382
6383 const pcre_study_data *study;
6384 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6385
6386 #ifdef NO_RECURSE
6387 heapframe frame_zero;
6388 frame_zero.Xprevframe = NULL; /* Marks the top level */
6389 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6390 md->match_frames_base = &frame_zero;
6391 #endif
6392
6393 /* Check for the special magic call that measures the size of the stack used
6394 per recursive call of match(). Without the funny casting for sizeof, a Windows
6395 compiler gave this error: "unary minus operator applied to unsigned type,
6396 result still unsigned". Hopefully the cast fixes that. */
6397
6398 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6399 start_offset == -999)
6400 #ifdef NO_RECURSE
6401 return -((int)sizeof(heapframe));
6402 #else
6403 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6404 #endif
6405
6406 /* Plausibility checks */
6407
6408 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6409 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6410 return PCRE_ERROR_NULL;
6411 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6412 if (length < 0) return PCRE_ERROR_BADLENGTH;
6413 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6414
6415 /* Check that the first field in the block is the magic number. If it is not,
6416 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6417 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6418 means that the pattern is likely compiled with different endianness. */
6419
6420 if (re->magic_number != MAGIC_NUMBER)
6421 return re->magic_number == REVERSED_MAGIC_NUMBER?
6422 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6423 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6424
6425 /* These two settings are used in the code for checking a UTF-8 string that
6426 follows immediately afterwards. Other values in the md block are used only
6427 during "normal" pcre_exec() processing, not when the JIT support is in use,
6428 so they are set up later. */
6429
6430 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6431 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6432 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6433 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6434
6435 /* Check a UTF-8 string if required. Pass back the character offset and error
6436 code for an invalid string if a results vector is available. */
6437
6438 #ifdef SUPPORT_UTF
6439 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6440 {
6441 int erroroffset;
6442 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6443 if (errorcode != 0)
6444 {
6445 if (offsetcount >= 2)
6446 {
6447 offsets[0] = erroroffset;
6448 offsets[1] = errorcode;
6449 }
6450 #if defined COMPILE_PCRE8
6451 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6452 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6453 #elif defined COMPILE_PCRE16
6454 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6455 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6456 #elif defined COMPILE_PCRE32
6457 return PCRE_ERROR_BADUTF32;
6458 #endif
6459 }
6460 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
6461 /* Check that a start_offset points to the start of a UTF character. */
6462 if (start_offset > 0 && start_offset < length &&
6463 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6464 return PCRE_ERROR_BADUTF8_OFFSET;
6465 #endif
6466 }
6467 #endif
6468
6469 /* If the pattern was successfully studied with JIT support, run the JIT
6470 executable instead of the rest of this function. Most options must be set at
6471 compile time for the JIT code to be usable. Fallback to the normal code path if
6472 an unsupported flag is set. */
6473
6474 #ifdef SUPPORT_JIT
6475 if (extra_data != NULL
6476 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6477 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6478 && extra_data->executable_jit != NULL
6479 && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0)
6480 {
6481 rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length,
6482 start_offset, options, offsets, offsetcount);
6483
/* PCRE_ERROR_JIT_BADOPTION means that the selected normal or partial matching
mode is not compiled. In this case we simply fall back to the interpreter. */
6486
6487 if (rc != PCRE_ERROR_JIT_BADOPTION) return rc;
6488 }
6489 #endif
6490
6491 /* Carry on with non-JIT matching. This information is for finding all the
6492 numbers associated with a given name, for condition testing. */
6493
6494 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6495 md->name_count = re->name_count;
6496 md->name_entry_size = re->name_entry_size;
6497
6498 /* Fish out the optional data from the extra_data structure, first setting
6499 the default values. */
6500
6501 study = NULL;
6502 md->match_limit = MATCH_LIMIT;
6503 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6504 md->callout_data = NULL;
6505
6506 /* The table pointer is always in native byte order. */
6507
6508 tables = re->tables;
6509
6510 /* The two limit values override the defaults, whatever their value. */
6511
6512 if (extra_data != NULL)
6513 {
6514 unsigned long int flags = extra_data->flags;
6515 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6516 study = (const pcre_study_data *)extra_data->study_data;
6517 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6518 md->match_limit = extra_data->match_limit;
6519 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6520 md->match_limit_recursion = extra_data->match_limit_recursion;
6521 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6522 md->callout_data = extra_data->callout_data;
6523 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6524 }
6525
6526 /* Limits in the regex override only if they are smaller. */
6527
6528 if ((re->flags & PCRE_MLSET) != 0 && re->limit_match < md->match_limit)
6529 md->match_limit = re->limit_match;
6530
6531 if ((re->flags & PCRE_RLSET) != 0 &&
6532 re->limit_recursion < md->match_limit_recursion)
6533 md->match_limit_recursion = re->limit_recursion;
6534
6535 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6536 is a feature that makes it possible to save compiled regex and re-use them
6537 in other programs later. */
6538
6539 if (tables == NULL) tables = PRIV(default_tables);
6540
6541 /* Set up other data */
6542
6543 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6544 startline = (re->flags & PCRE_STARTLINE) != 0;
6545 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6546
6547 /* The code starts after the real_pcre block and the capture name table. */
6548
6549 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6550 re->name_count * re->name_entry_size;
6551
6552 md->start_subject = (PCRE_PUCHAR)subject;
6553 md->start_offset = start_offset;
6554 md->end_subject = md->start_subject + length;
6555 end_subject = md->end_subject;
6556
6557 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6558 md->use_ucp = (re->options & PCRE_UCP) != 0;
6559 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6560 md->ignore_skip_arg = 0;
6561
6562 /* Some options are unpacked into BOOL variables in the hope that testing
6563 them will be faster than individual option bits. */
6564
6565 md->notbol = (options & PCRE_NOTBOL) != 0;
6566 md->noteol = (options & PCRE_NOTEOL) != 0;
6567 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6568 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6569
6570 md->hitend = FALSE;
6571 md->mark = md->nomatch_mark = NULL; /* In case never set */
6572
6573 md->recursive = NULL; /* No recursion at top level */
6574 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6575
6576 md->lcc = tables + lcc_offset;
6577 md->fcc = tables + fcc_offset;
6578 md->ctypes = tables + ctypes_offset;
6579
6580 /* Handle different \R options. */
6581
6582 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6583 {
6584 case 0:
6585 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6586 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6587 else
6588 #ifdef BSR_ANYCRLF
6589 md->bsr_anycrlf = TRUE;
6590 #else
6591 md->bsr_anycrlf = FALSE;
6592 #endif
6593 break;
6594
6595 case PCRE_BSR_ANYCRLF:
6596 md->bsr_anycrlf = TRUE;
6597 break;
6598
6599 case PCRE_BSR_UNICODE:
6600 md->bsr_anycrlf = FALSE;
6601 break;
6602
6603 default: return PCRE_ERROR_BADNEWLINE;
6604 }
6605
6606 /* Handle different types of newline. The three bits give eight cases. If
6607 nothing is set at run time, whatever was used at compile time applies. */
6608
6609 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6610 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6611 {
6612 case 0: newline = NEWLINE; break; /* Compile-time default */
6613 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6614 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6615 case PCRE_NEWLINE_CR+
6616 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6617 case PCRE_NEWLINE_ANY: newline = -1; break;
6618 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6619 default: return PCRE_ERROR_BADNEWLINE;
6620 }
6621
6622 if (newline == -2)
6623 {
6624 md->nltype = NLTYPE_ANYCRLF;
6625 }
6626 else if (newline < 0)
6627 {
6628 md->nltype = NLTYPE_ANY;
6629 }
6630 else
6631 {
6632 md->nltype = NLTYPE_FIXED;
6633 if (newline > 255)
6634 {
6635 md->nllen = 2;
6636 md->nl[0] = (newline >> 8) & 255;
6637 md->nl[1] = newline & 255;
6638 }
6639 else
6640 {
6641 md->nllen = 1;
6642 md->nl[0] = newline;
6643 }
6644 }
6645
6646 /* Partial matching was originally supported only for a restricted set of
6647 regexes; from release 8.00 there are no restrictions, but the bits are still
6648 defined (though never set). So there's no harm in leaving this code. */
6649
6650 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6651 return PCRE_ERROR_BADPARTIAL;
6652
6653 /* If the expression has got more back references than the offsets supplied can
6654 hold, we get a temporary chunk of working store to use during the matching.
6655 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6656 of 3. */
6657
6658 ocount = offsetcount - (offsetcount % 3);
6659 arg_offset_max = (2*ocount)/3;
6660
6661 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6662 {
6663 ocount = re->top_backref * 3 + 3;
6664 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6665 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6666 using_temporary_offsets = TRUE;
6667 DPRINTF(("Got memory to hold back references\n"));
6668 }
6669 else md->offset_vector = offsets;
6670 md->offset_end = ocount;
6671 md->offset_max = (2*ocount)/3;
6672 md->capture_last = 0;
6673
6674 /* Reset the working variable associated with each extraction. These should
6675 never be used unless previously set, but they get saved and restored, and so we
6676 initialize them to avoid reading uninitialized locations. Also, unset the
6677 offsets for the matched string. This is really just for tidiness with callouts,
6678 in case they inspect these fields. */
6679
6680 if (md->offset_vector != NULL)
6681 {
6682 register int *iptr = md->offset_vector + ocount;
6683 register int *iend = iptr - re->top_bracket;
6684 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6685 while (--iptr >= iend) *iptr = -1;
6686 if (offsetcount > 0) md->offset_vector[0] = -1;
6687 if (offsetcount > 1) md->offset_vector[1] = -1;
6688 }
6689
6690 /* Set up the first character to match, if available. The first_char value is
6691 never set for an anchored regular expression, but the anchoring may be forced
6692 at run time, so we have to test for anchoring. The first char may be unset for
6693 an unanchored pattern, of course. If there's no first char and the pattern was
6694 studied, there may be a bitmap of possible first characters. */
6695
6696 if (!anchored)
6697 {
6698 if ((re->flags & PCRE_FIRSTSET) != 0)
6699 {
6700 has_first_char = TRUE;
6701 first_char = first_char2 = (pcre_uchar)(re->first_char);
6702 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6703 {
6704 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6705 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6706 if (utf && first_char > 127)
6707 first_char2 = UCD_OTHERCASE(first_char);
6708 #endif
6709 }
6710 }
6711 else
6712 if (!startline && study != NULL &&
6713 (study->flags & PCRE_STUDY_MAPPED) != 0)
6714 start_bits = study->start_bits;
6715 }
6716
6717 /* For anchored or unanchored matches, there may be a "last known required
6718 character" set. */
6719
6720 if ((re->flags & PCRE_REQCHSET) != 0)
6721 {
6722 has_req_char = TRUE;
6723 req_char = req_char2 = (pcre_uchar)(re->req_char);
6724 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6725 {
6726 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6727 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6728 if (utf && req_char > 127)
6729 req_char2 = UCD_OTHERCASE(req_char);
6730 #endif
6731 }
6732 }
6733
6734
6735 /* ==========================================================================*/
6736
6737 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6738 the loop runs just once. */
6739
6740 for(;;)
6741 {
6742 PCRE_PUCHAR save_end_subject = end_subject;
6743 PCRE_PUCHAR new_start_match;
6744
6745 /* If firstline is TRUE, the start of the match is constrained to the first
6746 line of a multiline string. That is, the match must be before or at the first
6747 newline. Implement this by temporarily adjusting end_subject so that we stop
6748 scanning at a newline. If the match fails at the newline, later code breaks
6749 this loop. */
6750
6751 if (firstline)
6752 {
6753 PCRE_PUCHAR t = start_match;
6754 #ifdef SUPPORT_UTF
6755 if (utf)
6756 {
6757 while (t < md->end_subject && !IS_NEWLINE(t))
6758 {
6759 t++;
6760 ACROSSCHAR(t < end_subject, *t, t++);
6761 }
6762 }
6763 else
6764 #endif
6765 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6766 end_subject = t;
6767 }
6768
6769 /* There are some optimizations that avoid running the match if a known
6770 starting point is not found, or if a known later character is not present.
6771 However, there is an option that disables these, for testing and for ensuring
6772 that all callouts do actually occur. The option can be set in the regex by
6773 (*NO_START_OPT) or passed in match-time options. */
6774
6775 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6776 {
6777 /* Advance to a unique first char if there is one. */
6778
6779 if (has_first_char)
6780 {
6781 pcre_uchar smc;
6782
6783 if (first_char != first_char2)
6784 while (start_match < end_subject &&
6785 (smc = UCHAR21TEST(start_match)) != first_char && smc != first_char2)
6786 start_match++;
6787 else
6788 while (start_match < end_subject && UCHAR21TEST(start_match) != first_char)
6789 start_match++;
6790 }
6791
6792 /* Or to just after a linebreak for a multiline match */
6793
6794 else if (startline)
6795 {
6796 if (start_match > md->start_subject + start_offset)
6797 {
6798 #ifdef SUPPORT_UTF
6799 if (utf)
6800 {
6801 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6802 {
6803 start_match++;
6804 ACROSSCHAR(start_match < end_subject, *start_match,
6805 start_match++);
6806 }
6807 }
6808 else
6809 #endif
6810 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6811 start_match++;
6812
6813 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6814 and we are now at a LF, advance the match position by one more character.
6815 */
6816
6817 if (start_match[-1] == CHAR_CR &&
6818 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6819 start_match < end_subject &&
6820 UCHAR21TEST(start_match) == CHAR_NL)
6821 start_match++;
6822 }
6823 }
6824
6825 /* Or to a non-unique first byte after study */
6826
6827 else if (start_bits != NULL)
6828 {
6829 while (start_match < end_subject)
6830 {
6831 register pcre_uint32 c = UCHAR21TEST(start_match);
6832 #ifndef COMPILE_PCRE8
6833 if (c > 255) c = 255;
6834 #endif
6835 if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
6836 start_match++;
6837 }
6838 }
6839 } /* Starting optimizations */
6840
6841 /* Restore fudged end_subject */
6842
6843 end_subject = save_end_subject;
6844
6845 /* The following two optimizations are disabled for partial matching or if
6846 disabling is explicitly requested. */
6847
6848 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6849 {
6850 /* If the pattern was studied, a minimum subject length may be set. This is
6851 a lower bound; no actual string of that length may actually match the
6852 pattern. Although the value is, strictly, in characters, we treat it as
6853 bytes to avoid spending too much time in this optimization. */
6854
6855 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6856 (pcre_uint32)(end_subject - start_match) < study->minlength)
6857 {
6858 rc = MATCH_NOMATCH;
6859 break;
6860 }
6861
6862 /* If req_char is set, we know that that character must appear in the
6863 subject for the match to succeed. If the first character is set, req_char
6864 must be later in the subject; otherwise the test starts at the match point.
6865 This optimization can save a huge amount of backtracking in patterns with
6866 nested unlimited repeats that aren't going to match. Writing separate code
6867 for cased/caseless versions makes it go faster, as does using an
6868 autoincrement and backing off on a match.
6869
6870 HOWEVER: when the subject string is very, very long, searching to its end
6871 can take a long time, and give bad performance on quite ordinary patterns.
6872 This showed up when somebody was matching something like /^\d+C/ on a
6873 32-megabyte string... so we don't do this when the string is sufficiently
6874 long. */
6875
6876 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
6877 {
6878 register PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
6879
6880 /* We don't need to repeat the search if we haven't yet reached the
6881 place we found it at last time. */
6882
6883 if (p > req_char_ptr)
6884 {
6885 if (req_char != req_char2)
6886 {
6887 while (p < end_subject)
6888 {
6889 register pcre_uint32 pp = UCHAR21INCTEST(p);
6890 if (pp == req_char || pp == req_char2) { p--; break; }
6891 }
6892 }
6893 else
6894 {
6895 while (p < end_subject)
6896 {
6897 if (UCHAR21INCTEST(p) == req_char) { p--; break; }
6898 }
6899 }
6900
6901 /* If we can't find the required character, break the matching loop,
6902 forcing a match failure. */
6903
6904 if (p >= end_subject)
6905 {
6906 rc = MATCH_NOMATCH;
6907 break;
6908 }
6909
6910 /* If we have found the required character, save the point where we
6911 found it, so that we don't search again next time round the loop if
6912 the start hasn't passed this character yet. */
6913
6914 req_char_ptr = p;
6915 }
6916 }
6917 }
6918
6919 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6920 printf(">>>> Match against: ");
6921 pchars(start_match, end_subject - start_match, TRUE, md);
6922 printf("\n");
6923 #endif
6924
6925 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6926 first starting point for which a partial match was found. */
6927
6928 md->start_match_ptr = start_match;
6929 md->start_used_ptr = start_match;
6930 md->match_call_count = 0;
6931 md->match_function_type = 0;
6932 md->end_offset_top = 0;
6933 md->skip_arg_count = 0;
6934 rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0);
6935 if (md->hitend && start_partial == NULL)
6936 {
6937 start_partial = md->start_used_ptr;
6938 match_partial = start_match;
6939 }
6940
6941 switch(rc)
6942 {
6943 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6944 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
6945 entirely. The only way we can do that is to re-do the match at the same
6946 point, with a flag to force SKIP with an argument to be ignored. Just
6947 treating this case as NOMATCH does not work because it does not check other
6948 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
6949
6950 case MATCH_SKIP_ARG:
6951 new_start_match = start_match;
6952 md->ignore_skip_arg = md->skip_arg_count;
6953 break;
6954
6955 /* SKIP passes back the next starting point explicitly, but if it is no
6956 greater than the match we have just done, treat it as NOMATCH. */
6957
6958 case MATCH_SKIP:
6959 if (md->start_match_ptr > start_match)
6960 {
6961 new_start_match = md->start_match_ptr;
6962 break;
6963 }
6964 /* Fall through */
6965
6966 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6967 exactly like PRUNE. Unset ignore SKIP-with-argument. */
6968
6969 case MATCH_NOMATCH:
6970 case MATCH_PRUNE:
6971 case MATCH_THEN:
6972 md->ignore_skip_arg = 0;
6973 new_start_match = start_match + 1;
6974 #ifdef SUPPORT_UTF
6975 if (utf)
6976 ACROSSCHAR(new_start_match < end_subject, *new_start_match,
6977 new_start_match++);
6978 #endif
6979 break;
6980
6981 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6982
6983 case MATCH_COMMIT:
6984 rc = MATCH_NOMATCH;
6985 goto ENDLOOP;
6986
6987 /* Any other return is either a match, or some kind of error. */
6988
6989 default:
6990 goto ENDLOOP;
6991 }
6992
6993 /* Control reaches here for the various types of "no match at this point"
6994 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6995
6996 rc = MATCH_NOMATCH;
6997
6998 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6999 newline in the subject (though it may continue over the newline). Therefore,
7000 if we have just failed to match, starting at a newline, do not continue. */
7001
7002 if (firstline && IS_NEWLINE(start_match)) break;
7003
7004 /* Advance to new matching position */
7005
7006 start_match = new_start_match;
7007
7008 /* Break the loop if the pattern is anchored or if we have passed the end of
7009 the subject. */
7010
7011 if (anchored || start_match > end_subject) break;
7012
7013 /* If we have just passed a CR and we are now at a LF, and the pattern does
7014 not contain any explicit matches for \r or \n, and the newline option is CRLF
7015 or ANY or ANYCRLF, advance the match position by one more character. In
7016 normal matching start_match will always be greater than the first position at
7017 this stage, but a failed *SKIP can cause a return at the same point, which is
7018 why the first test exists. */
7019
7020 if (start_match > (PCRE_PUCHAR)subject + start_offset &&
7021 start_match[-1] == CHAR_CR &&
7022 start_match < end_subject &&
7023 *start_match == CHAR_NL &&
7024 (re->flags & PCRE_HASCRORLF) == 0 &&
7025 (md->nltype == NLTYPE_ANY ||
7026 md->nltype == NLTYPE_ANYCRLF ||
7027 md->nllen == 2))
7028 start_match++;
7029
7030 md->mark = NULL; /* Reset for start of next match attempt */
7031 } /* End of for(;;) "bumpalong" loop */
7032
7033 /* ==========================================================================*/
7034
7035 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
7036 conditions is true:
7037
7038 (1) The pattern is anchored or the match was failed by (*COMMIT);
7039
7040 (2) We are past the end of the subject;
7041
7042 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
7043 this option requests that a match occur at or before the first newline in
7044 the subject.
7045
7046 When we have a match and the offset vector is big enough to deal with any
7047 backreferences, captured substring offsets will already be set up. In the case
7048 where we had to get some local store to hold offsets for backreference
7049 processing, copy those that we can. In this case there need not be overflow if
7050 certain parts of the pattern were not used, even though there are more
7051 capturing parentheses than vector slots. */
7052
7053 ENDLOOP:
7054
7055 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
7056 {
7057 if (using_temporary_offsets)
7058 {
7059 if (arg_offset_max >= 4)
7060 {
7061 memcpy(offsets + 2, md->offset_vector + 2,
7062 (arg_offset_max - 2) * sizeof(int));
7063 DPRINTF(("Copied offsets from temporary memory\n"));
7064 }
7065 if (md->end_offset_top > arg_offset_max) md->capture_last |= OVFLBIT;
7066 DPRINTF(("Freeing temporary memory\n"));
7067 (PUBL(free))(md->offset_vector);
7068 }
7069
7070 /* Set the return code to the number of captured strings, or 0 if there were
7071 too many to fit into the vector. */
7072
7073 rc = ((md->capture_last & OVFLBIT) != 0 &&
7074 md->end_offset_top >= arg_offset_max)?
7075 0 : md->end_offset_top/2;
7076
7077 /* If there is space in the offset vector, set any unused pairs at the end of
7078 the pattern to -1 for backwards compatibility. It is documented that this
7079 happens. In earlier versions, the whole set of potential capturing offsets
7080 was set to -1 each time round the loop, but this is handled differently now.
7081 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
7082 those at the end that need unsetting here. We can't just unset them all at
7083 the start of the whole thing because they may get set in one branch that is
7084 not the final matching branch. */
7085
7086 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
7087 {
7088 register int *iptr, *iend;
7089 int resetcount = 2 + re->top_bracket * 2;
7090 if (resetcount > offsetcount) resetcount = offsetcount;
7091 iptr = offsets + md->end_offset_top;
7092 iend = offsets + resetcount;
7093 while (iptr < iend) *iptr++ = -1;
7094 }
7095
7096 /* If there is space, set up the whole thing as substring 0. The value of
7097 md->start_match_ptr might be modified if \K was encountered on the successful
7098 matching path. */
7099
7100 if (offsetcount < 2) rc = 0; else
7101 {
7102 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
7103 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
7104 }
7105
7106 /* Return MARK data if requested */
7107
7108 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
7109 *(extra_data->mark) = (pcre_uchar *)md->mark;
7110 DPRINTF((">>>> returning %d\n", rc));
7111 #ifdef NO_RECURSE
7112 release_match_heapframes(&frame_zero);
7113 #endif
7114 return rc;
7115 }
7116
7117 /* Control gets here if there has been an error, or if the overall match
7118 attempt has failed at all permitted starting positions. */
7119
7120 if (using_temporary_offsets)
7121 {
7122 DPRINTF(("Freeing temporary memory\n"));
7123 (PUBL(free))(md->offset_vector);
7124 }
7125
7126 /* For anything other than nomatch or partial match, just return the code. */
7127
7128 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
7129 {
7130 DPRINTF((">>>> error: returning %d\n", rc));
7131 #ifdef NO_RECURSE
7132 release_match_heapframes(&frame_zero);
7133 #endif
7134 return rc;
7135 }
7136
7137 /* Handle partial matches - disable any mark data */
7138
7139 if (match_partial != NULL)
7140 {
7141 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
7142 md->mark = NULL;
7143 if (offsetcount > 1)
7144 {
7145 offsets[0] = (int)(start_partial - (PCRE_PUCHAR)subject);
7146 offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
7147 if (offsetcount > 2)
7148 offsets[2] = (int)(match_partial - (PCRE_PUCHAR)subject);
7149 }
7150 rc = PCRE_ERROR_PARTIAL;
7151 }
7152
7153 /* This is the classic nomatch case */
7154
7155 else
7156 {
7157 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
7158 rc = PCRE_ERROR_NOMATCH;
7159 }
7160
7161 /* Return the MARK data if it has been requested. */
7162
7163 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
7164 *(extra_data->mark) = (pcre_uchar *)md->nomatch_mark;
7165 #ifdef NO_RECURSE
7166 release_match_heapframes(&frame_zero);
7167 #endif
7168 return rc;
7169 }
7170
7171 /* End of pcre_exec.c */
7172