1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2015-2024 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45
46 #include "pcre2_internal.h"
47
/* These defines enable debugging code. They are normally left commented out;
uncomment one to compile in the corresponding diagnostic output. */

/* #define DEBUG_FRAMES_DISPLAY */
/* #define DEBUG_SHOW_OPS */
/* #define DEBUG_SHOW_RMATCH */

#ifdef DEBUG_FRAMES_DISPLAY
#include <stdarg.h>
#endif

#ifdef DEBUG_SHOW_OPS
static const char *OP_names[] = { OP_NAME_LIST };
#endif

/* These defines identify the name of the block containing "static"
information, and fields within it. */

#define NLBLOCK mb             /* Block containing newline information */
#define PSSTART start_subject  /* Field containing processed string start */
#define PSEND   end_subject    /* Field containing processed string end */

#define RECURSE_UNSET 0xffffffffu  /* Bigger than max group number */

/* Masks for identifying the public options that are permitted at match time. */

#define PUBLIC_MATCH_OPTIONS \
  (PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \
   PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \
   PCRE2_PARTIAL_SOFT|PCRE2_NO_JIT|PCRE2_COPY_MATCHED_SUBJECT| \
   PCRE2_DISABLE_RECURSELOOP_CHECK)

#define PUBLIC_JIT_MATCH_OPTIONS \
   (PCRE2_NO_UTF_CHECK|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY|\
    PCRE2_NOTEMPTY_ATSTART|PCRE2_PARTIAL_SOFT|PCRE2_PARTIAL_HARD|\
    PCRE2_COPY_MATCHED_SUBJECT)
83
/* Non-error returns from and within the match() function. Error returns are
externally defined PCRE2_ERROR_xxx codes, which are all negative. */

#define MATCH_MATCH        1
#define MATCH_NOMATCH      0

/* Special internal returns used in the match() function. Make them
sufficiently negative to avoid the external error codes. */

#define MATCH_ACCEPT       (-999)
#define MATCH_KETRPOS      (-998)
/* The next 5 must be kept together and in sequence so that a test that checks
for any one of them can use a range. */
#define MATCH_COMMIT       (-997)
#define MATCH_PRUNE        (-996)
#define MATCH_SKIP         (-995)
#define MATCH_SKIP_ARG     (-994)
#define MATCH_THEN         (-993)
#define MATCH_BACKTRACK_MAX MATCH_THEN
#define MATCH_BACKTRACK_MIN MATCH_COMMIT

/* Group frame type values. Zero means the frame is not a group frame. The
lower 16 bits are used for data (e.g. the capture number). Group frames are
used for most groups so that information about the start is easily available at
the end without having to scan back through intermediate frames (backtrack
points). */

#define GF_CAPTURE     0x00010000u
#define GF_NOCAPTURE   0x00020000u
#define GF_CONDASSERT  0x00030000u
#define GF_RECURSE     0x00040000u

/* Masks for the identity and data parts of the group frame type. */

#define GF_IDMASK(a)   ((a) & 0xffff0000u)
#define GF_DATAMASK(a) ((a) & 0x0000ffffu)
120
/* Repetition types */

enum { REPTYPE_MIN, REPTYPE_MAX, REPTYPE_POS };

/* Min and max values for the common repeats; a maximum of UINT32_MAX =>
infinity. The entries for OP_CRRANGE and OP_CRMINRANGE are placeholders only. */

static const uint32_t rep_min[] = {
  0, 0,       /* * and *? */
  1, 1,       /* + and +? */
  0, 0,       /* ? and ?? */
  0, 0,       /* dummy placefillers for OP_CR[MIN]RANGE */
  0, 1, 0 };  /* OP_CRPOS{STAR, PLUS, QUERY} */

static const uint32_t rep_max[] = {
  UINT32_MAX, UINT32_MAX,      /* * and *? */
  UINT32_MAX, UINT32_MAX,      /* + and +? */
  1, 1,                        /* ? and ?? */
  0, 0,                        /* dummy placefillers for OP_CR[MIN]RANGE */
  UINT32_MAX, UINT32_MAX, 1 }; /* OP_CRPOS{STAR, PLUS, QUERY} */

/* Repetition types - must include OP_CRPOSRANGE (not needed above) */

static const uint32_t rep_typ[] = {
  REPTYPE_MAX, REPTYPE_MIN,    /* * and *? */
  REPTYPE_MAX, REPTYPE_MIN,    /* + and +? */
  REPTYPE_MAX, REPTYPE_MIN,    /* ? and ?? */
  REPTYPE_MAX, REPTYPE_MIN,    /* OP_CRRANGE and OP_CRMINRANGE */
  REPTYPE_POS, REPTYPE_POS,    /* OP_CRPOSSTAR, OP_CRPOSPLUS */
  REPTYPE_POS, REPTYPE_POS };  /* OP_CRPOSQUERY, OP_CRPOSRANGE */
151
/* Numbers for RMATCH calls at backtracking points. When these lists are
changed, the code at RETURN_SWITCH below must be updated in sync. Each value
identifies one resumption point; the general-purpose ones start at 1, the
wide-character ones at 100, and the Unicode ones at 200. */

enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
       RM11,  RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
       RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
       RM31,  RM32, RM33, RM34, RM35, RM36, RM37 };

#ifdef SUPPORT_WIDE_CHARS
enum { RM100=100, RM101 };
#endif

#ifdef SUPPORT_UNICODE
enum { RM200=200, RM201, RM202, RM203, RM204, RM205, RM206, RM207,
       RM208,     RM209, RM210, RM211, RM212, RM213, RM214, RM215,
       RM216,     RM217, RM218, RM219, RM220, RM221, RM222, RM223,
       RM224,     RM225 };
#endif
170
/* Define short names for general fields in the current backtrack frame, which
is always pointed to by the F variable. Occasional references to fields in
other frames are written out explicitly. There are also some fields in the
current frame whose names start with "temp" that are used for short-term,
localised backtracking memory. These are #defined with Lxxx names at the point
of use and undefined afterwards. */

#define Fback_frame        F->back_frame
#define Fcapture_last      F->capture_last
#define Fcurrent_recurse   F->current_recurse
#define Fecode             F->ecode
#define Feptr              F->eptr
#define Fgroup_frame_type  F->group_frame_type
#define Flast_group_offset F->last_group_offset
#define Flength            F->length
#define Fmark              F->mark
#define Frdepth            F->rdepth
#define Fstart_match       F->start_match
#define Foffset_top        F->offset_top
#define Foccu              F->occu
#define Fop                F->op
#define Fovector           F->ovector
#define Freturn_id         F->return_id
194
195
#ifdef DEBUG_FRAMES_DISPLAY
/*************************************************
*      Display current frames and contents       *
*************************************************/

/* This debugging function displays the current set of frames and their
contents. It is not called automatically from anywhere, the intention being
that calls can be inserted where necessary when debugging frame-related
problems.

Every frame from the start of match_data->heapframes up to and including F is
printed, one per line. Values of type PCRE2_SIZE and pointer differences are
cast explicitly to the type that each conversion specification expects, so that
the output is well defined on platforms where size_t is not unsigned long.

Arguments:
  f           the file to write to
  F           the current top frame
  P           a previous frame of interest (may be NULL)
  frame_size  the frame size
  mb          points to the match block
  match_data  points to the match data block
  s           identification text (a printf-style format)
  ...         arguments for the format in s

Returns:      nothing
*/

static void
display_frames(FILE *f, heapframe *F, heapframe *P, PCRE2_SIZE frame_size,
  match_block *mb, pcre2_match_data *match_data, const char *s, ...)
{
uint32_t i;
heapframe *Q;
va_list ap;
va_start(ap, s);

fprintf(f, "FRAMES ");
vfprintf(f, s, ap);
va_end(ap);

/* P is reported as a frame index, not a byte offset. */

if (P != NULL) fprintf(f, " P=%lu",
  (unsigned long)(((char *)P - (char *)(match_data->heapframes))/frame_size));
fprintf(f, "\n");

/* Frames have a run-time size, so step through the vector in bytes. */

for (i = 0, Q = match_data->heapframes;
     Q <= F;
     i++, Q = (heapframe *)((char *)Q + frame_size))
  {
  fprintf(f, "Frame %d type=%x subj=%lu code=%d back=%lu id=%d",
    (int)i, (unsigned int)Q->group_frame_type,
    (unsigned long)(Q->eptr - mb->start_subject), (int)*(Q->ecode),
    (unsigned long)Q->back_frame, (int)Q->return_id);

  if (Q->last_group_offset == PCRE2_UNSET)
    fprintf(f, " lgoffset=unset\n");
  else
    fprintf(f, " lgoffset=%lu\n",
      (unsigned long)(Q->last_group_offset/frame_size));
  }
}

#endif
251
252
253
254 /*************************************************
255 * Process a callout *
256 *************************************************/
257
258 /* This function is called for all callouts, whether "standalone" or at the
259 start of a conditional group. Feptr will be pointing to either OP_CALLOUT or
260 OP_CALLOUT_STR. A callout block is allocated in pcre2_match() and initialized
261 with fixed values.
262
263 Arguments:
264 F points to the current backtracking frame
265 mb points to the match block
266 lengthptr where to return the length of the callout item
267
268 Returns: the return from the callout
269 or 0 if no callout function exists
270 */
271
272 static int
do_callout(heapframe * F,match_block * mb,PCRE2_SIZE * lengthptr)273 do_callout(heapframe *F, match_block *mb, PCRE2_SIZE *lengthptr)
274 {
275 int rc;
276 PCRE2_SIZE save0, save1;
277 PCRE2_SIZE *callout_ovector;
278 pcre2_callout_block *cb;
279
280 *lengthptr = (*Fecode == OP_CALLOUT)?
281 PRIV(OP_lengths)[OP_CALLOUT] : GET(Fecode, 1 + 2*LINK_SIZE);
282
283 if (mb->callout == NULL) return 0; /* No callout function provided */
284
285 /* The original matching code (pre 10.30) worked directly with the ovector
286 passed by the user, and this was passed to callouts. Now that the working
287 ovector is in the backtracking frame, it no longer needs to reserve space for
288 the overall match offsets (which would waste space in the frame). For backward
289 compatibility, however, we pass capture_top and offset_vector to the callout as
290 if for the extended ovector, and we ensure that the first two slots are unset
291 by preserving and restoring their current contents. Picky compilers complain if
292 references such as Fovector[-2] are use directly, so we set up a separate
293 pointer. */
294
295 callout_ovector = (PCRE2_SIZE *)(Fovector) - 2;
296
297 /* The cb->version, cb->subject, cb->subject_length, and cb->start_match fields
298 are set externally. The first 3 never change; the last is updated for each
299 bumpalong. */
300
301 cb = mb->cb;
302 cb->capture_top = (uint32_t)Foffset_top/2 + 1;
303 cb->capture_last = Fcapture_last;
304 cb->offset_vector = callout_ovector;
305 cb->mark = mb->nomatch_mark;
306 cb->current_position = (PCRE2_SIZE)(Feptr - mb->start_subject);
307 cb->pattern_position = GET(Fecode, 1);
308 cb->next_item_length = GET(Fecode, 1 + LINK_SIZE);
309
310 if (*Fecode == OP_CALLOUT) /* Numerical callout */
311 {
312 cb->callout_number = Fecode[1 + 2*LINK_SIZE];
313 cb->callout_string_offset = 0;
314 cb->callout_string = NULL;
315 cb->callout_string_length = 0;
316 }
317 else /* String callout */
318 {
319 cb->callout_number = 0;
320 cb->callout_string_offset = GET(Fecode, 1 + 3*LINK_SIZE);
321 cb->callout_string = Fecode + (1 + 4*LINK_SIZE) + 1;
322 cb->callout_string_length =
323 *lengthptr - (1 + 4*LINK_SIZE) - 2;
324 }
325
326 save0 = callout_ovector[0];
327 save1 = callout_ovector[1];
328 callout_ovector[0] = callout_ovector[1] = PCRE2_UNSET;
329 rc = mb->callout(cb, mb->callout_data);
330 callout_ovector[0] = save0;
331 callout_ovector[1] = save1;
332 cb->callout_flags = 0;
333 return rc;
334 }
335
336
337
338 /*************************************************
339 * Match a back-reference *
340 *************************************************/
341
342 /* This function is called only when it is known that the offset lies within
343 the offsets that have so far been used in the match. Note that in caseless
344 UTF-8 mode, the number of subject bytes matched may be different to the number
345 of reference bytes. (In theory this could also happen in UTF-16 mode, but it
346 seems unlikely.)
347
348 Arguments:
349 offset index into the offset vector
350 caseless TRUE if caseless
351 F the current backtracking frame pointer
352 mb points to match block
353 lengthptr pointer for returning the length matched
354
355 Returns: = 0 sucessful match; number of code units matched is set
356 < 0 no match
357 > 0 partial match
358 */
359
360 static int
match_ref(PCRE2_SIZE offset,BOOL caseless,heapframe * F,match_block * mb,PCRE2_SIZE * lengthptr)361 match_ref(PCRE2_SIZE offset, BOOL caseless, heapframe *F, match_block *mb,
362 PCRE2_SIZE *lengthptr)
363 {
364 PCRE2_SPTR p;
365 PCRE2_SIZE length;
366 PCRE2_SPTR eptr;
367 PCRE2_SPTR eptr_start;
368
369 /* Deal with an unset group. The default is no match, but there is an option to
370 match an empty string. */
371
372 if (offset >= Foffset_top || Fovector[offset] == PCRE2_UNSET)
373 {
374 if ((mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)
375 {
376 *lengthptr = 0;
377 return 0; /* Match */
378 }
379 else return -1; /* No match */
380 }
381
382 /* Separate the caseless and UTF cases for speed. */
383
384 eptr = eptr_start = Feptr;
385 p = mb->start_subject + Fovector[offset];
386 length = Fovector[offset+1] - Fovector[offset];
387
388 if (caseless)
389 {
390 #if defined SUPPORT_UNICODE
391 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
392
393 if (utf || (mb->poptions & PCRE2_UCP) != 0)
394 {
395 PCRE2_SPTR endptr = p + length;
396
397 /* Match characters up to the end of the reference. NOTE: the number of
398 code units matched may differ, because in UTF-8 there are some characters
399 whose upper and lower case codes have different numbers of bytes. For
400 example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65 (3
401 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
402 sequence of two of the latter. It is important, therefore, to check the
403 length along the reference, not along the subject (earlier code did this
404 wrong). UCP without uses Unicode properties but without UTF encoding. */
405
406 while (p < endptr)
407 {
408 uint32_t c, d;
409 const ucd_record *ur;
410 if (eptr >= mb->end_subject) return 1; /* Partial match */
411
412 if (utf)
413 {
414 GETCHARINC(c, eptr);
415 GETCHARINC(d, p);
416 }
417 else
418 {
419 c = *eptr++;
420 d = *p++;
421 }
422
423 ur = GET_UCD(d);
424 if (c != d && c != (uint32_t)((int)d + ur->other_case))
425 {
426 const uint32_t *pp = PRIV(ucd_caseless_sets) + ur->caseset;
427 for (;;)
428 {
429 if (c < *pp) return -1; /* No match */
430 if (c == *pp++) break;
431 }
432 }
433 }
434 }
435 else
436 #endif
437
438 /* Not in UTF or UCP mode */
439 {
440 for (; length > 0; length--)
441 {
442 uint32_t cc, cp;
443 if (eptr >= mb->end_subject) return 1; /* Partial match */
444 cc = UCHAR21TEST(eptr);
445 cp = UCHAR21TEST(p);
446 if (TABLE_GET(cp, mb->lcc, cp) != TABLE_GET(cc, mb->lcc, cc))
447 return -1; /* No match */
448 p++;
449 eptr++;
450 }
451 }
452 }
453
454 /* In the caseful case, we can just compare the code units, whether or not we
455 are in UTF and/or UCP mode. When partial matching, we have to do this unit by
456 unit. */
457
458 else
459 {
460 if (mb->partial != 0)
461 {
462 for (; length > 0; length--)
463 {
464 if (eptr >= mb->end_subject) return 1; /* Partial match */
465 if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1; /* No match */
466 }
467 }
468
469 /* Not partial matching */
470
471 else
472 {
473 if ((PCRE2_SIZE)(mb->end_subject - eptr) < length) return 1; /* Partial */
474 if (memcmp(p, eptr, CU2BYTES(length)) != 0) return -1; /* No match */
475 eptr += length;
476 }
477 }
478
479 *lengthptr = eptr - eptr_start;
480 return 0; /* Match */
481 }
482
483
484
/******************************************************************************
*******************************************************************************
                   "Recursion" in the match() function

The original match() function was highly recursive, but this proved to be the
source of a number of problems over the years, mostly because of the relatively
small system stacks that are commonly found. As new features were added to
patterns, various kludges were invented to reduce the amount of stack used,
making the code hard to understand in places.

A version did exist that used individual frames on the heap instead of calling
match() recursively, but this ran substantially slower. The current version is
a refactoring that uses a vector of frames to remember backtracking points.
This runs no slower, and possibly even a bit faster than the original recursive
implementation.

At first, an initial vector of size START_FRAMES_SIZE (enough for maybe 50
frames) was allocated on the system stack. If this was not big enough, the heap
was used for a larger vector. However, it turns out that there are environments
where taking as little as 20KiB from the system stack is an embarrassment.
After another refactoring, the heap is used exclusively, but a pointer to the
frames vector and its size are cached in the match_data block, so that there is
no new memory allocation if the same match_data block is used for multiple
matches (unless the frames vector has to be extended).
*******************************************************************************
******************************************************************************/
511
512
513
514
/*************************************************
*         Macros for the match() function        *
*************************************************/

/* These macros pack up tests that are used for partial matching several times
in the code. The second one is used when we already know we are past the end of
the subject. We set the "hit end" flag if the pointer is at the end of the
subject and either (a) the pointer is past the earliest inspected character
(i.e. something has been matched, even if not part of the actual matched
string), or (b) the pattern contains a lookbehind. These are the conditions for
which adding more characters may allow the current match to continue.

For hard partial matching, we immediately return a partial match. Otherwise,
carrying on means that a complete match on the current subject will be sought.
A partial match is returned only if no complete match can be found.

Both macros expand to a bare "if" statement that refers to the Feptr and mb
names of the enclosing function, and SCHECK_PARTIAL() can return from that
function. */

#define CHECK_PARTIAL()\
  if (Feptr >= mb->end_subject) \
    { \
    SCHECK_PARTIAL(); \
    }

#define SCHECK_PARTIAL()\
  if (mb->partial != 0 && \
      (Feptr > mb->start_used_ptr || mb->allowemptypartial)) \
    { \
    mb->hitend = TRUE; \
    if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; \
    }


/* These macros are used to implement backtracking. They simulate a recursive
call to the match() function by means of a local vector of frames which
remember the backtracking points. RMATCH() records where to continue (ra) and
which resumption point is in use (rb), jumps to MATCH_RECURSE, and plants the
label L_<rb> immediately after the jump. RRETURN() stores its argument in rrc
and jumps to RETURN_SWITCH. */

#define RMATCH(ra,rb)\
  {\
  start_ecode = ra;\
  Freturn_id = rb;\
  goto MATCH_RECURSE;\
  L_##rb:;\
  }

#define RRETURN(ra)\
  {\
  rrc = ra;\
  goto RETURN_SWITCH;\
  }
563
564
565
566 /*************************************************
567 * Match from current position *
568 *************************************************/
569
570 /* This function is called to run one match attempt at a single starting point
571 in the subject.
572
573 Performance note: It might be tempting to extract commonly used fields from the
574 mb structure (e.g. end_subject) into individual variables to improve
575 performance. Tests using gcc on a SPARC disproved this; in the first case, it
576 made performance worse.
577
578 Arguments:
579 start_eptr starting character in subject
580 start_ecode starting position in compiled code
581 top_bracket number of capturing parentheses in the pattern
582 frame_size size of each backtracking frame
583 match_data pointer to the match_data block
584 mb pointer to "static" variables block
585
586 Returns: MATCH_MATCH if matched ) these values are >= 0
587 MATCH_NOMATCH if failed to match )
588 negative MATCH_xxx value for PRUNE, SKIP, etc
589 negative PCRE2_ERROR_xxx value if aborted by an error condition
590 (e.g. stopped by repeated call or depth limit)
591 */
592
593 static int
match(PCRE2_SPTR start_eptr,PCRE2_SPTR start_ecode,uint16_t top_bracket,PCRE2_SIZE frame_size,pcre2_match_data * match_data,match_block * mb)594 match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, uint16_t top_bracket,
595 PCRE2_SIZE frame_size, pcre2_match_data *match_data, match_block *mb)
596 {
597 /* Frame-handling variables */
598
599 heapframe *F; /* Current frame pointer */
600 heapframe *N = NULL; /* Temporary frame pointers */
601 heapframe *P = NULL;
602
603 heapframe *frames_top; /* End of frames vector */
604 heapframe *assert_accept_frame = NULL; /* For passing back a frame with captures */
605 PCRE2_SIZE frame_copy_size; /* Amount to copy when creating a new frame */
606
607 /* Local variables that do not need to be preserved over calls to RRMATCH(). */
608
609 PCRE2_SPTR branch_end = NULL;
610 PCRE2_SPTR branch_start;
611 PCRE2_SPTR bracode; /* Temp pointer to start of group */
612 PCRE2_SIZE offset; /* Used for group offsets */
613 PCRE2_SIZE length; /* Used for various length calculations */
614
615 int rrc; /* Return from functions & backtracking "recursions" */
616 #ifdef SUPPORT_UNICODE
617 int proptype; /* Type of character property */
618 #endif
619
620 uint32_t i; /* Used for local loops */
621 uint32_t fc; /* Character values */
622 uint32_t number; /* Used for group and other numbers */
623 uint32_t reptype = 0; /* Type of repetition (0 to avoid compiler warning) */
624 uint32_t group_frame_type; /* Specifies type for new group frames */
625
626 BOOL condition; /* Used in conditional groups */
627 BOOL cur_is_word; /* Used in "word" tests */
628 BOOL prev_is_word; /* Used in "word" tests */
629
630 /* UTF and UCP flags */
631
632 #ifdef SUPPORT_UNICODE
633 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
634 BOOL ucp = (mb->poptions & PCRE2_UCP) != 0;
635 #else
636 BOOL utf = FALSE; /* Required for convenience even when no Unicode support */
637 #endif
638
639 /* This is the length of the last part of a backtracking frame that must be
640 copied when a new frame is created. */
641
642 frame_copy_size = frame_size - offsetof(heapframe, eptr);
643
644 /* Set up the first frame and the end of the frames vector. */
645
646 F = match_data->heapframes;
647 frames_top = (heapframe *)((char *)F + match_data->heapframes_size);
648
649 Frdepth = 0; /* "Recursion" depth */
650 Fcapture_last = 0; /* Number of most recent capture */
651 Fcurrent_recurse = RECURSE_UNSET; /* Not pattern recursing. */
652 Fstart_match = Feptr = start_eptr; /* Current data pointer and start match */
653 Fmark = NULL; /* Most recent mark */
654 Foffset_top = 0; /* End of captures within the frame */
655 Flast_group_offset = PCRE2_UNSET; /* Saved frame of most recent group */
656 group_frame_type = 0; /* Not a start of group frame */
657 goto NEW_FRAME; /* Start processing with this frame */
658
659 /* Come back here when we want to create a new frame for remembering a
660 backtracking point. */
661
662 MATCH_RECURSE:
663
664 /* Set up a new backtracking frame. If the vector is full, get a new one,
665 doubling the size, but constrained by the heap limit (which is in KiB). */
666
667 N = (heapframe *)((char *)F + frame_size);
668 if ((heapframe *)((char *)N + frame_size) >= frames_top)
669 {
670 heapframe *new;
671 PCRE2_SIZE newsize;
672 PCRE2_SIZE usedsize = (char *)N - (char *)(match_data->heapframes);
673
674 if (match_data->heapframes_size >= PCRE2_SIZE_MAX / 2)
675 {
676 if (match_data->heapframes_size == PCRE2_SIZE_MAX - 1)
677 return PCRE2_ERROR_NOMEMORY;
678 newsize = PCRE2_SIZE_MAX - 1;
679 }
680 else
681 newsize = match_data->heapframes_size * 2;
682
683 if (newsize / 1024 >= mb->heap_limit)
684 {
685 PCRE2_SIZE old_size = match_data->heapframes_size / 1024;
686 if (mb->heap_limit <= old_size)
687 return PCRE2_ERROR_HEAPLIMIT;
688 else
689 {
690 PCRE2_SIZE max_delta = 1024 * (mb->heap_limit - old_size);
691 int over_bytes = match_data->heapframes_size % 1024;
692 if (over_bytes) max_delta -= (1024 - over_bytes);
693 newsize = match_data->heapframes_size + max_delta;
694 }
695 }
696
697 /* With a heap limit set, the permitted additional size may not be enough for
698 another frame, so do a final check. */
699
700 if (newsize - usedsize < frame_size) return PCRE2_ERROR_HEAPLIMIT;
701 new = match_data->memctl.malloc(newsize, match_data->memctl.memory_data);
702 if (new == NULL) return PCRE2_ERROR_NOMEMORY;
703 memcpy(new, match_data->heapframes, usedsize);
704
705 N = (heapframe *)((char *)new + usedsize);
706 F = (heapframe *)((char *)N - frame_size);
707
708 match_data->memctl.free(match_data->heapframes, match_data->memctl.memory_data);
709 match_data->heapframes = new;
710 match_data->heapframes_size = newsize;
711 frames_top = (heapframe *)((char *)new + newsize);
712 }
713
714 #ifdef DEBUG_SHOW_RMATCH
715 fprintf(stderr, "++ RMATCH %d frame=%d", Freturn_id, Frdepth + 1);
716 if (group_frame_type != 0)
717 {
718 fprintf(stderr, " type=%x ", group_frame_type);
719 switch (GF_IDMASK(group_frame_type))
720 {
721 case GF_CAPTURE:
722 fprintf(stderr, "capture=%d", GF_DATAMASK(group_frame_type));
723 break;
724
725 case GF_NOCAPTURE:
726 fprintf(stderr, "nocapture op=%d", GF_DATAMASK(group_frame_type));
727 break;
728
729 case GF_CONDASSERT:
730 fprintf(stderr, "condassert op=%d", GF_DATAMASK(group_frame_type));
731 break;
732
733 case GF_RECURSE:
734 fprintf(stderr, "recurse=%d", GF_DATAMASK(group_frame_type));
735 break;
736
737 default:
738 fprintf(stderr, "*** unknown ***");
739 break;
740 }
741 }
742 fprintf(stderr, "\n");
743 #endif
744
745 /* Copy those fields that must be copied into the new frame, increase the
746 "recursion" depth (i.e. the new frame's index) and then make the new frame
747 current. */
748
749 memcpy((char *)N + offsetof(heapframe, eptr),
750 (char *)F + offsetof(heapframe, eptr),
751 frame_copy_size);
752
753 N->rdepth = Frdepth + 1;
754 F = N;
755
756 /* Carry on processing with a new frame. */
757
758 NEW_FRAME:
759 Fgroup_frame_type = group_frame_type;
760 Fecode = start_ecode; /* Starting code pointer */
761 Fback_frame = frame_size; /* Default is go back one frame */
762
763 /* If this is a special type of group frame, remember its offset for quick
764 access at the end of the group. If this is a recursion, set a new current
765 recursion value. */
766
767 if (group_frame_type != 0)
768 {
769 Flast_group_offset = (char *)F - (char *)match_data->heapframes;
770 if (GF_IDMASK(group_frame_type) == GF_RECURSE)
771 Fcurrent_recurse = GF_DATAMASK(group_frame_type);
772 group_frame_type = 0;
773 }
774
775
776 /* ========================================================================= */
777 /* This is the main processing loop. First check that we haven't recorded too
778 many backtracks (search tree is too large), or that we haven't exceeded the
779 recursive depth limit (used too many backtracking frames). If not, process the
780 opcodes. */
781
782 if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;
783 if (Frdepth >= mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
784
785 #ifdef DEBUG_SHOW_OPS
786 fprintf(stderr, "\n++ New frame: type=0x%x subject offset %ld\n",
787 GF_IDMASK(Fgroup_frame_type), Feptr - mb->start_subject);
788 #endif
789
790 for (;;)
791 {
792 #ifdef DEBUG_SHOW_OPS
793 fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
794 OP_names[*Fecode]);
795 #endif
796
797 Fop = (uint8_t)(*Fecode); /* Cast needed for 16-bit and 32-bit modes */
798 switch(Fop)
799 {
800 /* ===================================================================== */
801 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes, to close
802 any currently open capturing brackets. Unlike reaching the end of a group,
803 where we know the starting frame is at the top of the chained frames, in
804 this case we have to search back for the relevant frame in case other types
805 of group that use chained frames have intervened. Multiple OP_CLOSEs always
806 come innermost first, which matches the chain order. We can ignore this in
807 a recursion, because captures are not passed out of recursions. */
808
809 case OP_CLOSE:
810 if (Fcurrent_recurse == RECURSE_UNSET)
811 {
812 number = GET2(Fecode, 1);
813 offset = Flast_group_offset;
814 for(;;)
815 {
816 if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;
817 N = (heapframe *)((char *)match_data->heapframes + offset);
818 P = (heapframe *)((char *)N - frame_size);
819 if (N->group_frame_type == (GF_CAPTURE | number)) break;
820 offset = P->last_group_offset;
821 }
822 offset = (number << 1) - 2;
823 Fcapture_last = number;
824 Fovector[offset] = P->eptr - mb->start_subject;
825 Fovector[offset+1] = Feptr - mb->start_subject;
826 if (offset >= Foffset_top) Foffset_top = offset + 2;
827 }
828 Fecode += PRIV(OP_lengths)[*Fecode];
829 break;
830
831
832 /* ===================================================================== */
833 /* Real or forced end of the pattern, assertion, or recursion. In an
834 assertion ACCEPT, update the last used pointer and remember the current
835 frame so that the captures and mark can be fished out of it. */
836
837 case OP_ASSERT_ACCEPT:
838 if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
839 assert_accept_frame = F;
840 RRETURN(MATCH_ACCEPT);
841
842 /* For ACCEPT within a recursion, we have to find the most recent
843 recursion. If not in a recursion, fall through to code that is common with
844 OP_END. */
845
846 case OP_ACCEPT:
847 if (Fcurrent_recurse != RECURSE_UNSET)
848 {
849 #ifdef DEBUG_SHOW_OPS
850 fprintf(stderr, "++ Accept within recursion\n");
851 #endif
852 offset = Flast_group_offset;
853 for(;;)
854 {
855 if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;
856 N = (heapframe *)((char *)match_data->heapframes + offset);
857 P = (heapframe *)((char *)N - frame_size);
858 if (GF_IDMASK(N->group_frame_type) == GF_RECURSE) break;
859 offset = P->last_group_offset;
860 }
861
862 /* N is now the frame of the recursion; the previous frame is at the
863 OP_RECURSE position. Go back there, copying the current subject position
864 and mark, and the start_match position (\K might have changed it), and
865 then move on past the OP_RECURSE. */
866
867 P->eptr = Feptr;
868 P->mark = Fmark;
869 P->start_match = Fstart_match;
870 F = P;
871 Fecode += 1 + LINK_SIZE;
872 continue;
873 }
874 /* Fall through */
875
876 /* OP_END itself can never be reached within a recursion because that is
877 picked up when the OP_KET that always precedes OP_END is reached. */
878
879 case OP_END:
880
881 /* Fail for an empty string match if either PCRE2_NOTEMPTY is set, or if
882 PCRE2_NOTEMPTY_ATSTART is set and we have matched at the start of the
883 subject. In both cases, backtracking will then try other alternatives, if
884 any. */
885
886 if (Feptr == Fstart_match &&
887 ((mb->moptions & PCRE2_NOTEMPTY) != 0 ||
888 ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) != 0 &&
889 Fstart_match == mb->start_subject + mb->start_offset)))
890 {
891 #ifdef DEBUG_SHOW_OPS
892 fprintf(stderr, "++ Backtrack because empty string\n");
893 #endif
894 RRETURN(MATCH_NOMATCH);
895 }
896
897 /* Fail if PCRE2_ENDANCHORED is set and the end of the match is not
898 the end of the subject. After (*ACCEPT) we fail the entire match (at this
899 position) but backtrack if we've reached the end of the pattern. This
900 applies whether or not we are in a recursion. */
901
902 if (Feptr < mb->end_subject &&
903 ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0)
904 {
905 if (Fop == OP_END)
906 {
907 #ifdef DEBUG_SHOW_OPS
908 fprintf(stderr, "++ Backtrack because not at end (endanchored set)\n");
909 #endif
910 RRETURN(MATCH_NOMATCH);
911 }
912
913 #ifdef DEBUG_SHOW_OPS
914 fprintf(stderr, "++ Failed ACCEPT not at end (endanchnored set)\n");
915 #endif
916 return MATCH_NOMATCH; /* (*ACCEPT) */
917 }
918
919 /* We have a successful match of the whole pattern. Record the result and
920 then do a direct return from the function. If there is space in the offset
921 vector, set any pairs that follow the highest-numbered captured string but
922 are less than the number of capturing groups in the pattern to PCRE2_UNSET.
923 It is documented that this happens. "Gaps" are set to PCRE2_UNSET
924 dynamically. It is only those at the end that need setting here. */
925
926 mb->end_match_ptr = Feptr; /* Record where we ended */
927 mb->end_offset_top = Foffset_top; /* and how many extracts were taken */
928 mb->mark = Fmark; /* and the last success mark */
929 if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
930
931 match_data->ovector[0] = Fstart_match - mb->start_subject;
932 match_data->ovector[1] = Feptr - mb->start_subject;
933
934 /* Set i to the smaller of the sizes of the external and frame ovectors. */
935
936 i = 2 * ((top_bracket + 1 > match_data->oveccount)?
937 match_data->oveccount : top_bracket + 1);
938 memcpy(match_data->ovector + 2, Fovector, (i - 2) * sizeof(PCRE2_SIZE));
939 while (--i >= Foffset_top + 2) match_data->ovector[i] = PCRE2_UNSET;
940 return MATCH_MATCH; /* Note: NOT RRETURN */
941
942
943 /*===================================================================== */
944 /* Match any single character type except newline; have to take care with
945 CRLF newlines and partial matching. */
946
947 case OP_ANY:
948 if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
949 if (mb->partial != 0 &&
950 Feptr == mb->end_subject - 1 &&
951 NLBLOCK->nltype == NLTYPE_FIXED &&
952 NLBLOCK->nllen == 2 &&
953 UCHAR21TEST(Feptr) == NLBLOCK->nl[0])
954 {
955 mb->hitend = TRUE;
956 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
957 }
958 /* Fall through */
959
960 /* Match any single character whatsoever. */
961
962 case OP_ALLANY:
963 if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */
964 { /* not be updated before SCHECK_PARTIAL. */
965 SCHECK_PARTIAL();
966 RRETURN(MATCH_NOMATCH);
967 }
968 Feptr++;
969 #ifdef SUPPORT_UNICODE
970 if (utf) ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
971 #endif
972 Fecode++;
973 break;
974
975
976 /* ===================================================================== */
977 /* Match a single code unit, even in UTF mode. This opcode really does
978 match any code unit, even newline. (It really should be called ANYCODEUNIT,
979 of course - the byte name is from pre-16 bit days.) */
980
981 case OP_ANYBYTE:
982 if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */
983 { /* not be updated before SCHECK_PARTIAL. */
984 SCHECK_PARTIAL();
985 RRETURN(MATCH_NOMATCH);
986 }
987 Feptr++;
988 Fecode++;
989 break;
990
991
992 /* ===================================================================== */
993 /* Match a single character, casefully */
994
995 case OP_CHAR:
996 #ifdef SUPPORT_UNICODE
997 if (utf)
998 {
999 Flength = 1;
1000 Fecode++;
1001 GETCHARLEN(fc, Fecode, Flength);
1002 if (Flength > (PCRE2_SIZE)(mb->end_subject - Feptr))
1003 {
1004 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
1005 RRETURN(MATCH_NOMATCH);
1006 }
1007 for (; Flength > 0; Flength--)
1008 {
1009 if (*Fecode++ != UCHAR21INC(Feptr)) RRETURN(MATCH_NOMATCH);
1010 }
1011 }
1012 else
1013 #endif
1014
1015 /* Not UTF mode */
1016 {
1017 if (mb->end_subject - Feptr < 1)
1018 {
1019 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
1020 RRETURN(MATCH_NOMATCH);
1021 }
1022 if (Fecode[1] != *Feptr++) RRETURN(MATCH_NOMATCH);
1023 Fecode += 2;
1024 }
1025 break;
1026
1027
1028 /* ===================================================================== */
1029 /* Match a single character, caselessly. If we are at the end of the
1030 subject, give up immediately. We get here only when the pattern character
1031 has at most one other case. Characters with more than two cases are coded
1032 as OP_PROP with the pseudo-property PT_CLIST. */
1033
1034 case OP_CHARI:
1035 if (Feptr >= mb->end_subject)
1036 {
1037 SCHECK_PARTIAL();
1038 RRETURN(MATCH_NOMATCH);
1039 }
1040
1041 #ifdef SUPPORT_UNICODE
1042 if (utf)
1043 {
1044 Flength = 1;
1045 Fecode++;
1046 GETCHARLEN(fc, Fecode, Flength);
1047
1048 /* If the pattern character's value is < 128, we know that its other case
1049 (if any) is also < 128 (and therefore only one code unit long in all
1050 code-unit widths), so we can use the fast lookup table. We checked above
1051 that there is at least one character left in the subject. */
1052
1053 if (fc < 128)
1054 {
1055 uint32_t cc = UCHAR21(Feptr);
1056 if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);
1057 Fecode++;
1058 Feptr++;
1059 }
1060
1061 /* Otherwise we must pick up the subject character and use Unicode
1062 property support to test its other case. Note that we cannot use the
1063 value of "Flength" to check for sufficient bytes left, because the other
1064 case of the character may have more or fewer code units. */
1065
1066 else
1067 {
1068 uint32_t dc;
1069 GETCHARINC(dc, Feptr);
1070 Fecode += Flength;
1071 if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
1072 }
1073 }
1074
1075 /* If UCP is set without UTF we must do the same as above, but with one
1076 character per code unit. */
1077
1078 else if (ucp)
1079 {
1080 uint32_t cc = UCHAR21(Feptr);
1081 fc = Fecode[1];
1082 if (fc < 128)
1083 {
1084 if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);
1085 }
1086 else
1087 {
1088 if (cc != fc && cc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
1089 }
1090 Feptr++;
1091 Fecode += 2;
1092 }
1093
1094 else
1095 #endif /* SUPPORT_UNICODE */
1096
1097 /* Not UTF or UCP mode; use the table for characters < 256. */
1098 {
1099 if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1])
1100 != TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH);
1101 Feptr++;
1102 Fecode += 2;
1103 }
1104 break;
1105
1106
1107 /* ===================================================================== */
1108 /* Match not a single character. */
1109
1110 case OP_NOT:
1111 case OP_NOTI:
1112 if (Feptr >= mb->end_subject)
1113 {
1114 SCHECK_PARTIAL();
1115 RRETURN(MATCH_NOMATCH);
1116 }
1117
1118 #ifdef SUPPORT_UNICODE
1119 if (utf)
1120 {
1121 uint32_t ch;
1122 Fecode++;
1123 GETCHARINC(ch, Fecode);
1124 GETCHARINC(fc, Feptr);
1125 if (ch == fc)
1126 {
1127 RRETURN(MATCH_NOMATCH); /* Caseful match */
1128 }
1129 else if (Fop == OP_NOTI) /* If caseless */
1130 {
1131 if (ch > 127)
1132 ch = UCD_OTHERCASE(ch);
1133 else
1134 ch = (mb->fcc)[ch];
1135 if (ch == fc) RRETURN(MATCH_NOMATCH);
1136 }
1137 }
1138
1139 /* UCP without UTF is as above, but with one character per code unit. */
1140
1141 else if (ucp)
1142 {
1143 uint32_t ch;
1144 fc = UCHAR21INC(Feptr);
1145 ch = Fecode[1];
1146 Fecode += 2;
1147
1148 if (ch == fc)
1149 {
1150 RRETURN(MATCH_NOMATCH); /* Caseful match */
1151 }
1152 else if (Fop == OP_NOTI) /* If caseless */
1153 {
1154 if (ch > 127)
1155 ch = UCD_OTHERCASE(ch);
1156 else
1157 ch = (mb->fcc)[ch];
1158 if (ch == fc) RRETURN(MATCH_NOMATCH);
1159 }
1160 }
1161
1162 else
1163 #endif /* SUPPORT_UNICODE */
1164
1165 /* Neither UTF nor UCP is set */
1166
1167 {
1168 uint32_t ch = Fecode[1];
1169 fc = UCHAR21INC(Feptr);
1170 if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc))
1171 RRETURN(MATCH_NOMATCH);
1172 Fecode += 2;
1173 }
1174 break;
1175
1176
1177 /* ===================================================================== */
1178 /* Match a single character repeatedly. */
1179
1180 #define Loclength F->temp_size
1181 #define Lstart_eptr F->temp_sptr[0]
1182 #define Lcharptr F->temp_sptr[1]
1183 #define Lmin F->temp_32[0]
1184 #define Lmax F->temp_32[1]
1185 #define Lc F->temp_32[2]
1186 #define Loc F->temp_32[3]
1187
1188 case OP_EXACT:
1189 case OP_EXACTI:
1190 Lmin = Lmax = GET2(Fecode, 1);
1191 Fecode += 1 + IMM2_SIZE;
1192 goto REPEATCHAR;
1193
1194 case OP_POSUPTO:
1195 case OP_POSUPTOI:
1196 reptype = REPTYPE_POS;
1197 Lmin = 0;
1198 Lmax = GET2(Fecode, 1);
1199 Fecode += 1 + IMM2_SIZE;
1200 goto REPEATCHAR;
1201
1202 case OP_UPTO:
1203 case OP_UPTOI:
1204 reptype = REPTYPE_MAX;
1205 Lmin = 0;
1206 Lmax = GET2(Fecode, 1);
1207 Fecode += 1 + IMM2_SIZE;
1208 goto REPEATCHAR;
1209
1210 case OP_MINUPTO:
1211 case OP_MINUPTOI:
1212 reptype = REPTYPE_MIN;
1213 Lmin = 0;
1214 Lmax = GET2(Fecode, 1);
1215 Fecode += 1 + IMM2_SIZE;
1216 goto REPEATCHAR;
1217
1218 case OP_POSSTAR:
1219 case OP_POSSTARI:
1220 reptype = REPTYPE_POS;
1221 Lmin = 0;
1222 Lmax = UINT32_MAX;
1223 Fecode++;
1224 goto REPEATCHAR;
1225
1226 case OP_POSPLUS:
1227 case OP_POSPLUSI:
1228 reptype = REPTYPE_POS;
1229 Lmin = 1;
1230 Lmax = UINT32_MAX;
1231 Fecode++;
1232 goto REPEATCHAR;
1233
1234 case OP_POSQUERY:
1235 case OP_POSQUERYI:
1236 reptype = REPTYPE_POS;
1237 Lmin = 0;
1238 Lmax = 1;
1239 Fecode++;
1240 goto REPEATCHAR;
1241
1242 case OP_STAR:
1243 case OP_STARI:
1244 case OP_MINSTAR:
1245 case OP_MINSTARI:
1246 case OP_PLUS:
1247 case OP_PLUSI:
1248 case OP_MINPLUS:
1249 case OP_MINPLUSI:
1250 case OP_QUERY:
1251 case OP_QUERYI:
1252 case OP_MINQUERY:
1253 case OP_MINQUERYI:
1254 fc = *Fecode++ - ((Fop < OP_STARI)? OP_STAR : OP_STARI);
1255 Lmin = rep_min[fc];
1256 Lmax = rep_max[fc];
1257 reptype = rep_typ[fc];
1258
1259 /* Common code for all repeated single-character matches. We first check
1260 for the minimum number of characters. If the minimum equals the maximum, we
1261 are done. Otherwise, if minimizing, check the rest of the pattern for a
1262 match; if there isn't one, advance up to the maximum, one character at a
1263 time.
1264
1265 If maximizing, advance up to the maximum number of matching characters,
1266 until Feptr is past the end of the maximum run. If possessive, we are
1267 then done (no backing up). Otherwise, match at this position; anything
1268 other than no match is immediately returned. For nomatch, back up one
1269 character, unless we are matching \R and the last thing matched was
1270 \r\n, in which case, back up two code units until we reach the first
1271 optional character position.
1272
1273 The various UTF/non-UTF and caseful/caseless cases are handled separately,
1274 for speed. */
1275
1276 REPEATCHAR:
1277 #ifdef SUPPORT_UNICODE
1278 if (utf)
1279 {
1280 Flength = 1;
1281 Lcharptr = Fecode;
1282 GETCHARLEN(fc, Fecode, Flength);
1283 Fecode += Flength;
1284
1285 /* Handle multi-code-unit character matching, caseful and caseless. */
1286
1287 if (Flength > 1)
1288 {
1289 uint32_t othercase;
1290
1291 if (Fop >= OP_STARI && /* Caseless */
1292 (othercase = UCD_OTHERCASE(fc)) != fc)
1293 Loclength = PRIV(ord2utf)(othercase, Foccu);
1294 else Loclength = 0;
1295
1296 for (i = 1; i <= Lmin; i++)
1297 {
1298 if (Feptr <= mb->end_subject - Flength &&
1299 memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength;
1300 else if (Loclength > 0 &&
1301 Feptr <= mb->end_subject - Loclength &&
1302 memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)
1303 Feptr += Loclength;
1304 else
1305 {
1306 CHECK_PARTIAL();
1307 RRETURN(MATCH_NOMATCH);
1308 }
1309 }
1310
1311 if (Lmin == Lmax) continue;
1312
1313 if (reptype == REPTYPE_MIN)
1314 {
1315 for (;;)
1316 {
1317 RMATCH(Fecode, RM202);
1318 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1319 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1320 if (Feptr <= mb->end_subject - Flength &&
1321 memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength;
1322 else if (Loclength > 0 &&
1323 Feptr <= mb->end_subject - Loclength &&
1324 memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)
1325 Feptr += Loclength;
1326 else
1327 {
1328 CHECK_PARTIAL();
1329 RRETURN(MATCH_NOMATCH);
1330 }
1331 }
1332 /* Control never gets here */
1333 }
1334
1335 else /* Maximize */
1336 {
1337 Lstart_eptr = Feptr;
1338 for (i = Lmin; i < Lmax; i++)
1339 {
1340 if (Feptr <= mb->end_subject - Flength &&
1341 memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0)
1342 Feptr += Flength;
1343 else if (Loclength > 0 &&
1344 Feptr <= mb->end_subject - Loclength &&
1345 memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)
1346 Feptr += Loclength;
1347 else
1348 {
1349 CHECK_PARTIAL();
1350 break;
1351 }
1352 }
1353
1354 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
1355 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
1356 go too far. */
1357
1358 if (reptype != REPTYPE_POS) for(;;)
1359 {
1360 if (Feptr <= Lstart_eptr) break;
1361 RMATCH(Fecode, RM203);
1362 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1363 Feptr--;
1364 BACKCHAR(Feptr);
1365 }
1366 }
1367 break; /* End of repeated wide character handling */
1368 }
1369
1370 /* Length of UTF character is 1. Put it into the preserved variable and
1371 fall through to the non-UTF code. */
1372
1373 Lc = fc;
1374 }
1375 else
1376 #endif /* SUPPORT_UNICODE */
1377
1378 /* When not in UTF mode, load a single-code-unit character. Then proceed as
1379 above, using Unicode casing if either UTF or UCP is set. */
1380
1381 Lc = *Fecode++;
1382
1383 /* Caseless comparison */
1384
1385 if (Fop >= OP_STARI)
1386 {
1387 #if PCRE2_CODE_UNIT_WIDTH == 8
1388 #ifdef SUPPORT_UNICODE
1389 if (ucp && !utf && Lc > 127) Loc = UCD_OTHERCASE(Lc);
1390 else
1391 #endif /* SUPPORT_UNICODE */
1392 /* Lc will be < 128 in UTF-8 mode. */
1393 Loc = mb->fcc[Lc];
1394 #else /* 16-bit & 32-bit */
1395 #ifdef SUPPORT_UNICODE
1396 if ((utf || ucp) && Lc > 127) Loc = UCD_OTHERCASE(Lc);
1397 else
1398 #endif /* SUPPORT_UNICODE */
1399 Loc = TABLE_GET(Lc, mb->fcc, Lc);
1400 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
1401
1402 for (i = 1; i <= Lmin; i++)
1403 {
1404 uint32_t cc; /* Faster than PCRE2_UCHAR */
1405 if (Feptr >= mb->end_subject)
1406 {
1407 SCHECK_PARTIAL();
1408 RRETURN(MATCH_NOMATCH);
1409 }
1410 cc = UCHAR21TEST(Feptr);
1411 if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH);
1412 Feptr++;
1413 }
1414 if (Lmin == Lmax) continue;
1415
1416 if (reptype == REPTYPE_MIN)
1417 {
1418 for (;;)
1419 {
1420 uint32_t cc; /* Faster than PCRE2_UCHAR */
1421 RMATCH(Fecode, RM25);
1422 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1423 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1424 if (Feptr >= mb->end_subject)
1425 {
1426 SCHECK_PARTIAL();
1427 RRETURN(MATCH_NOMATCH);
1428 }
1429 cc = UCHAR21TEST(Feptr);
1430 if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH);
1431 Feptr++;
1432 }
1433 /* Control never gets here */
1434 }
1435
1436 else /* Maximize */
1437 {
1438 Lstart_eptr = Feptr;
1439 for (i = Lmin; i < Lmax; i++)
1440 {
1441 uint32_t cc; /* Faster than PCRE2_UCHAR */
1442 if (Feptr >= mb->end_subject)
1443 {
1444 SCHECK_PARTIAL();
1445 break;
1446 }
1447 cc = UCHAR21TEST(Feptr);
1448 if (Lc != cc && Loc != cc) break;
1449 Feptr++;
1450 }
1451 if (reptype != REPTYPE_POS) for (;;)
1452 {
1453 if (Feptr == Lstart_eptr) break;
1454 RMATCH(Fecode, RM26);
1455 Feptr--;
1456 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1457 }
1458 }
1459 }
1460
/* Caseful comparisons. Only single-code-unit characters get here; in UTF mode,
multi-code-unit characters are handled entirely by the code above. */
1462
1463 else
1464 {
1465 for (i = 1; i <= Lmin; i++)
1466 {
1467 if (Feptr >= mb->end_subject)
1468 {
1469 SCHECK_PARTIAL();
1470 RRETURN(MATCH_NOMATCH);
1471 }
1472 if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH);
1473 }
1474
1475 if (Lmin == Lmax) continue;
1476
1477 if (reptype == REPTYPE_MIN)
1478 {
1479 for (;;)
1480 {
1481 RMATCH(Fecode, RM27);
1482 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1483 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1484 if (Feptr >= mb->end_subject)
1485 {
1486 SCHECK_PARTIAL();
1487 RRETURN(MATCH_NOMATCH);
1488 }
1489 if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH);
1490 }
1491 /* Control never gets here */
1492 }
1493 else /* Maximize */
1494 {
1495 Lstart_eptr = Feptr;
1496 for (i = Lmin; i < Lmax; i++)
1497 {
1498 if (Feptr >= mb->end_subject)
1499 {
1500 SCHECK_PARTIAL();
1501 break;
1502 }
1503
1504 if (Lc != UCHAR21TEST(Feptr)) break;
1505 Feptr++;
1506 }
1507
1508 if (reptype != REPTYPE_POS) for (;;)
1509 {
1510 if (Feptr <= Lstart_eptr) break;
1511 RMATCH(Fecode, RM28);
1512 Feptr--;
1513 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1514 }
1515 }
1516 }
1517 break;
1518
1519 #undef Loclength
1520 #undef Lstart_eptr
1521 #undef Lcharptr
1522 #undef Lmin
1523 #undef Lmax
1524 #undef Lc
1525 #undef Loc
1526
1527
1528 /* ===================================================================== */
/* Match a negated single character repeatedly. This is almost a
1530 repeat of the code for a repeated single character, but I haven't found a
1531 nice way of commoning these up that doesn't require a test of the
1532 positive/negative option for each character match. Maybe that wouldn't add
1533 very much to the time taken, but character matching *is* what this is all
1534 about... */
1535
1536 #define Lstart_eptr F->temp_sptr[0]
1537 #define Lmin F->temp_32[0]
1538 #define Lmax F->temp_32[1]
1539 #define Lc F->temp_32[2]
1540 #define Loc F->temp_32[3]
1541
1542 case OP_NOTEXACT:
1543 case OP_NOTEXACTI:
1544 Lmin = Lmax = GET2(Fecode, 1);
1545 Fecode += 1 + IMM2_SIZE;
1546 goto REPEATNOTCHAR;
1547
1548 case OP_NOTUPTO:
1549 case OP_NOTUPTOI:
1550 Lmin = 0;
1551 Lmax = GET2(Fecode, 1);
1552 reptype = REPTYPE_MAX;
1553 Fecode += 1 + IMM2_SIZE;
1554 goto REPEATNOTCHAR;
1555
1556 case OP_NOTMINUPTO:
1557 case OP_NOTMINUPTOI:
1558 Lmin = 0;
1559 Lmax = GET2(Fecode, 1);
1560 reptype = REPTYPE_MIN;
1561 Fecode += 1 + IMM2_SIZE;
1562 goto REPEATNOTCHAR;
1563
1564 case OP_NOTPOSSTAR:
1565 case OP_NOTPOSSTARI:
1566 reptype = REPTYPE_POS;
1567 Lmin = 0;
1568 Lmax = UINT32_MAX;
1569 Fecode++;
1570 goto REPEATNOTCHAR;
1571
1572 case OP_NOTPOSPLUS:
1573 case OP_NOTPOSPLUSI:
1574 reptype = REPTYPE_POS;
1575 Lmin = 1;
1576 Lmax = UINT32_MAX;
1577 Fecode++;
1578 goto REPEATNOTCHAR;
1579
1580 case OP_NOTPOSQUERY:
1581 case OP_NOTPOSQUERYI:
1582 reptype = REPTYPE_POS;
1583 Lmin = 0;
1584 Lmax = 1;
1585 Fecode++;
1586 goto REPEATNOTCHAR;
1587
1588 case OP_NOTPOSUPTO:
1589 case OP_NOTPOSUPTOI:
1590 reptype = REPTYPE_POS;
1591 Lmin = 0;
1592 Lmax = GET2(Fecode, 1);
1593 Fecode += 1 + IMM2_SIZE;
1594 goto REPEATNOTCHAR;
1595
1596 case OP_NOTSTAR:
1597 case OP_NOTSTARI:
1598 case OP_NOTMINSTAR:
1599 case OP_NOTMINSTARI:
1600 case OP_NOTPLUS:
1601 case OP_NOTPLUSI:
1602 case OP_NOTMINPLUS:
1603 case OP_NOTMINPLUSI:
1604 case OP_NOTQUERY:
1605 case OP_NOTQUERYI:
1606 case OP_NOTMINQUERY:
1607 case OP_NOTMINQUERYI:
1608 fc = *Fecode++ - ((Fop >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
1609 Lmin = rep_min[fc];
1610 Lmax = rep_max[fc];
1611 reptype = rep_typ[fc];
1612
1613 /* Common code for all repeated single-character non-matches. */
1614
1615 REPEATNOTCHAR:
1616 GETCHARINCTEST(Lc, Fecode);
1617
1618 /* The code is duplicated for the caseless and caseful cases, for speed,
1619 since matching characters is likely to be quite common. First, ensure the
1620 minimum number of matches are present. If Lmin = Lmax, we are done.
1621 Otherwise, if minimizing, keep trying the rest of the expression and
1622 advancing one matching character if failing, up to the maximum.
1623 Alternatively, if maximizing, find the maximum number of characters and
1624 work backwards. */
1625
1626 if (Fop >= OP_NOTSTARI) /* Caseless */
1627 {
1628 #ifdef SUPPORT_UNICODE
1629 if ((utf || ucp) && Lc > 127)
1630 Loc = UCD_OTHERCASE(Lc);
1631 else
1632 #endif /* SUPPORT_UNICODE */
1633
1634 Loc = TABLE_GET(Lc, mb->fcc, Lc); /* Other case from table */
1635
1636 #ifdef SUPPORT_UNICODE
1637 if (utf)
1638 {
1639 uint32_t d;
1640 for (i = 1; i <= Lmin; i++)
1641 {
1642 if (Feptr >= mb->end_subject)
1643 {
1644 SCHECK_PARTIAL();
1645 RRETURN(MATCH_NOMATCH);
1646 }
1647 GETCHARINC(d, Feptr);
1648 if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH);
1649 }
1650 }
1651 else
1652 #endif /* SUPPORT_UNICODE */
1653
1654 /* Not UTF mode */
1655 {
1656 for (i = 1; i <= Lmin; i++)
1657 {
1658 if (Feptr >= mb->end_subject)
1659 {
1660 SCHECK_PARTIAL();
1661 RRETURN(MATCH_NOMATCH);
1662 }
1663 if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH);
1664 Feptr++;
1665 }
1666 }
1667
1668 if (Lmin == Lmax) continue; /* Finished for exact count */
1669
1670 if (reptype == REPTYPE_MIN)
1671 {
1672 #ifdef SUPPORT_UNICODE
1673 if (utf)
1674 {
1675 uint32_t d;
1676 for (;;)
1677 {
1678 RMATCH(Fecode, RM204);
1679 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1680 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1681 if (Feptr >= mb->end_subject)
1682 {
1683 SCHECK_PARTIAL();
1684 RRETURN(MATCH_NOMATCH);
1685 }
1686 GETCHARINC(d, Feptr);
1687 if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH);
1688 }
1689 }
1690 else
1691 #endif /*SUPPORT_UNICODE */
1692
1693 /* Not UTF mode */
1694 {
1695 for (;;)
1696 {
1697 RMATCH(Fecode, RM29);
1698 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1699 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1700 if (Feptr >= mb->end_subject)
1701 {
1702 SCHECK_PARTIAL();
1703 RRETURN(MATCH_NOMATCH);
1704 }
1705 if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH);
1706 Feptr++;
1707 }
1708 }
1709 /* Control never gets here */
1710 }
1711
1712 /* Maximize case */
1713
1714 else
1715 {
1716 Lstart_eptr = Feptr;
1717
1718 #ifdef SUPPORT_UNICODE
1719 if (utf)
1720 {
1721 uint32_t d;
1722 for (i = Lmin; i < Lmax; i++)
1723 {
1724 int len = 1;
1725 if (Feptr >= mb->end_subject)
1726 {
1727 SCHECK_PARTIAL();
1728 break;
1729 }
1730 GETCHARLEN(d, Feptr, len);
1731 if (Lc == d || Loc == d) break;
1732 Feptr += len;
1733 }
1734
1735 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
1736 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
1737 go too far. */
1738
1739 if (reptype != REPTYPE_POS) for(;;)
1740 {
1741 if (Feptr <= Lstart_eptr) break;
1742 RMATCH(Fecode, RM205);
1743 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1744 Feptr--;
1745 BACKCHAR(Feptr);
1746 }
1747 }
1748 else
1749 #endif /* SUPPORT_UNICODE */
1750
1751 /* Not UTF mode */
1752 {
1753 for (i = Lmin; i < Lmax; i++)
1754 {
1755 if (Feptr >= mb->end_subject)
1756 {
1757 SCHECK_PARTIAL();
1758 break;
1759 }
1760 if (Lc == *Feptr || Loc == *Feptr) break;
1761 Feptr++;
1762 }
1763 if (reptype != REPTYPE_POS) for (;;)
1764 {
1765 if (Feptr == Lstart_eptr) break;
1766 RMATCH(Fecode, RM30);
1767 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1768 Feptr--;
1769 }
1770 }
1771 }
1772 }
1773
1774 /* Caseful comparisons */
1775
1776 else
1777 {
1778 #ifdef SUPPORT_UNICODE
1779 if (utf)
1780 {
1781 uint32_t d;
1782 for (i = 1; i <= Lmin; i++)
1783 {
1784 if (Feptr >= mb->end_subject)
1785 {
1786 SCHECK_PARTIAL();
1787 RRETURN(MATCH_NOMATCH);
1788 }
1789 GETCHARINC(d, Feptr);
1790 if (Lc == d) RRETURN(MATCH_NOMATCH);
1791 }
1792 }
1793 else
1794 #endif
1795 /* Not UTF mode */
1796 {
1797 for (i = 1; i <= Lmin; i++)
1798 {
1799 if (Feptr >= mb->end_subject)
1800 {
1801 SCHECK_PARTIAL();
1802 RRETURN(MATCH_NOMATCH);
1803 }
1804 if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH);
1805 }
1806 }
1807
1808 if (Lmin == Lmax) continue;
1809
1810 if (reptype == REPTYPE_MIN)
1811 {
1812 #ifdef SUPPORT_UNICODE
1813 if (utf)
1814 {
1815 uint32_t d;
1816 for (;;)
1817 {
1818 RMATCH(Fecode, RM206);
1819 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1820 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1821 if (Feptr >= mb->end_subject)
1822 {
1823 SCHECK_PARTIAL();
1824 RRETURN(MATCH_NOMATCH);
1825 }
1826 GETCHARINC(d, Feptr);
1827 if (Lc == d) RRETURN(MATCH_NOMATCH);
1828 }
1829 }
1830 else
1831 #endif
1832 /* Not UTF mode */
1833 {
1834 for (;;)
1835 {
1836 RMATCH(Fecode, RM31);
1837 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1838 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1839 if (Feptr >= mb->end_subject)
1840 {
1841 SCHECK_PARTIAL();
1842 RRETURN(MATCH_NOMATCH);
1843 }
1844 if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH);
1845 }
1846 }
1847 /* Control never gets here */
1848 }
1849
1850 /* Maximize case */
1851
1852 else
1853 {
1854 Lstart_eptr = Feptr;
1855
1856 #ifdef SUPPORT_UNICODE
1857 if (utf)
1858 {
1859 uint32_t d;
1860 for (i = Lmin; i < Lmax; i++)
1861 {
1862 int len = 1;
1863 if (Feptr >= mb->end_subject)
1864 {
1865 SCHECK_PARTIAL();
1866 break;
1867 }
1868 GETCHARLEN(d, Feptr, len);
1869 if (Lc == d) break;
1870 Feptr += len;
1871 }
1872
1873 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
1874 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
1875 go too far. */
1876
1877 if (reptype != REPTYPE_POS) for(;;)
1878 {
1879 if (Feptr <= Lstart_eptr) break;
1880 RMATCH(Fecode, RM207);
1881 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1882 Feptr--;
1883 BACKCHAR(Feptr);
1884 }
1885 }
1886 else
1887 #endif
1888 /* Not UTF mode */
1889 {
1890 for (i = Lmin; i < Lmax; i++)
1891 {
1892 if (Feptr >= mb->end_subject)
1893 {
1894 SCHECK_PARTIAL();
1895 break;
1896 }
1897 if (Lc == *Feptr) break;
1898 Feptr++;
1899 }
1900 if (reptype != REPTYPE_POS) for (;;)
1901 {
1902 if (Feptr == Lstart_eptr) break;
1903 RMATCH(Fecode, RM32);
1904 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1905 Feptr--;
1906 }
1907 }
1908 }
1909 }
1910 break;
1911
1912 #undef Lstart_eptr
1913 #undef Lmin
1914 #undef Lmax
1915 #undef Lc
1916 #undef Loc
1917
1918
1919 /* ===================================================================== */
1920 /* Match a bit-mapped character class, possibly repeatedly. These opcodes
1921 are used when all the characters in the class have values in the range
1922 0-255, and either the matching is caseful, or the characters are in the
1923 range 0-127 when UTF processing is enabled. The only difference between
1924 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1925 encountered. */
1926
1927 #define Lmin F->temp_32[0]
1928 #define Lmax F->temp_32[1]
1929 #define Lstart_eptr F->temp_sptr[0]
1930 #define Lbyte_map_address F->temp_sptr[1]
1931 #define Lbyte_map ((unsigned char *)Lbyte_map_address)
1932
1933 case OP_NCLASS:
1934 case OP_CLASS:
1935 {
1936 Lbyte_map_address = Fecode + 1; /* Save for matching */
1937 Fecode += 1 + (32 / sizeof(PCRE2_UCHAR)); /* Advance past the item */
1938
1939 /* Look past the end of the item to see if there is repeat information
1940 following. Then obey similar code to character type repeats. */
1941
1942 switch (*Fecode)
1943 {
1944 case OP_CRSTAR:
1945 case OP_CRMINSTAR:
1946 case OP_CRPLUS:
1947 case OP_CRMINPLUS:
1948 case OP_CRQUERY:
1949 case OP_CRMINQUERY:
1950 case OP_CRPOSSTAR:
1951 case OP_CRPOSPLUS:
1952 case OP_CRPOSQUERY:
1953 fc = *Fecode++ - OP_CRSTAR;
1954 Lmin = rep_min[fc];
1955 Lmax = rep_max[fc];
1956 reptype = rep_typ[fc];
1957 break;
1958
1959 case OP_CRRANGE:
1960 case OP_CRMINRANGE:
1961 case OP_CRPOSRANGE:
1962 Lmin = GET2(Fecode, 1);
1963 Lmax = GET2(Fecode, 1 + IMM2_SIZE);
1964 if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */
1965 reptype = rep_typ[*Fecode - OP_CRSTAR];
1966 Fecode += 1 + 2 * IMM2_SIZE;
1967 break;
1968
1969 default: /* No repeat follows */
1970 Lmin = Lmax = 1;
1971 break;
1972 }
1973
1974 /* First, ensure the minimum number of matches are present. */
1975
1976 #ifdef SUPPORT_UNICODE
1977 if (utf)
1978 {
1979 for (i = 1; i <= Lmin; i++)
1980 {
1981 if (Feptr >= mb->end_subject)
1982 {
1983 SCHECK_PARTIAL();
1984 RRETURN(MATCH_NOMATCH);
1985 }
1986 GETCHARINC(fc, Feptr);
1987 if (fc > 255)
1988 {
1989 if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
1990 }
1991 else
1992 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
1993 }
1994 }
1995 else
1996 #endif
1997 /* Not UTF mode */
1998 {
1999 for (i = 1; i <= Lmin; i++)
2000 {
2001 if (Feptr >= mb->end_subject)
2002 {
2003 SCHECK_PARTIAL();
2004 RRETURN(MATCH_NOMATCH);
2005 }
2006 fc = *Feptr++;
2007 #if PCRE2_CODE_UNIT_WIDTH != 8
2008 if (fc > 255)
2009 {
2010 if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
2011 }
2012 else
2013 #endif
2014 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
2015 }
2016 }
2017
2018 /* If Lmax == Lmin we are done. Continue with main loop. */
2019
2020 if (Lmin == Lmax) continue;
2021
2022 /* If minimizing, keep testing the rest of the expression and advancing
2023 the pointer while it matches the class. */
2024
2025 if (reptype == REPTYPE_MIN)
2026 {
2027 #ifdef SUPPORT_UNICODE
2028 if (utf)
2029 {
2030 for (;;)
2031 {
2032 RMATCH(Fecode, RM200);
2033 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2034 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
2035 if (Feptr >= mb->end_subject)
2036 {
2037 SCHECK_PARTIAL();
2038 RRETURN(MATCH_NOMATCH);
2039 }
2040 GETCHARINC(fc, Feptr);
2041 if (fc > 255)
2042 {
2043 if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
2044 }
2045 else
2046 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
2047 }
2048 }
2049 else
2050 #endif
2051 /* Not UTF mode */
2052 {
2053 for (;;)
2054 {
2055 RMATCH(Fecode, RM23);
2056 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2057 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
2058 if (Feptr >= mb->end_subject)
2059 {
2060 SCHECK_PARTIAL();
2061 RRETURN(MATCH_NOMATCH);
2062 }
2063 fc = *Feptr++;
2064 #if PCRE2_CODE_UNIT_WIDTH != 8
2065 if (fc > 255)
2066 {
2067 if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
2068 }
2069 else
2070 #endif
2071 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
2072 }
2073 }
2074 /* Control never gets here */
2075 }
2076
2077 /* If maximizing, find the longest possible run, then work backwards. */
2078
2079 else
2080 {
2081 Lstart_eptr = Feptr;
2082
2083 #ifdef SUPPORT_UNICODE
2084 if (utf)
2085 {
2086 for (i = Lmin; i < Lmax; i++)
2087 {
2088 int len = 1;
2089 if (Feptr >= mb->end_subject)
2090 {
2091 SCHECK_PARTIAL();
2092 break;
2093 }
2094 GETCHARLEN(fc, Feptr, len);
2095 if (fc > 255)
2096 {
2097 if (Fop == OP_CLASS) break;
2098 }
2099 else
2100 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break;
2101 Feptr += len;
2102 }
2103
2104 if (reptype == REPTYPE_POS) continue; /* No backtracking */
2105
2106 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
2107 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
2108 go too far. */
2109
2110 for (;;)
2111 {
2112 RMATCH(Fecode, RM201);
2113 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2114 if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */
2115 BACKCHAR(Feptr);
2116 }
2117 }
2118 else
2119 #endif
2120 /* Not UTF mode */
2121 {
2122 for (i = Lmin; i < Lmax; i++)
2123 {
2124 if (Feptr >= mb->end_subject)
2125 {
2126 SCHECK_PARTIAL();
2127 break;
2128 }
2129 fc = *Feptr;
2130 #if PCRE2_CODE_UNIT_WIDTH != 8
2131 if (fc > 255)
2132 {
2133 if (Fop == OP_CLASS) break;
2134 }
2135 else
2136 #endif
2137 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break;
2138 Feptr++;
2139 }
2140
2141 if (reptype == REPTYPE_POS) continue; /* No backtracking */
2142
2143 while (Feptr >= Lstart_eptr)
2144 {
2145 RMATCH(Fecode, RM24);
2146 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2147 Feptr--;
2148 }
2149 }
2150
2151 RRETURN(MATCH_NOMATCH);
2152 }
2153 }
2154 /* Control never gets here */
2155
2156 #undef Lbyte_map_address
2157 #undef Lbyte_map
2158 #undef Lstart_eptr
2159 #undef Lmin
2160 #undef Lmax
2161
2162
2163 /* ===================================================================== */
2164 /* Match an extended character class. In the 8-bit library, this opcode is
2165 encountered only when UTF-8 mode is supported. In the 16-bit and
2166 32-bit libraries, codepoints greater than 255 may be encountered even when
2167 UTF is not supported. */
2168
2169 #define Lstart_eptr F->temp_sptr[0]
2170 #define Lxclass_data F->temp_sptr[1]
2171 #define Lmin F->temp_32[0]
2172 #define Lmax F->temp_32[1]
2173
2174 #ifdef SUPPORT_WIDE_CHARS
2175 case OP_XCLASS:
/* Extended character class, optionally repeated. The working values
Lxclass_data, Lstart_eptr, Lmin and Lmax are the F->temp_sptr[] / F->temp_32[]
slots #defined immediately before this case. */
2176 {
2177 Lxclass_data = Fecode + 1 + LINK_SIZE; /* Save for matching */
2178 Fecode += GET(Fecode, 1); /* Advance past the item */
2179
/* Fecode now addresses whatever follows the class. If that is a repeat
opcode, decode it into Lmin, Lmax and reptype and step over it. */
2180 switch (*Fecode)
2181 {
2182 case OP_CRSTAR:
2183 case OP_CRMINSTAR:
2184 case OP_CRPLUS:
2185 case OP_CRMINPLUS:
2186 case OP_CRQUERY:
2187 case OP_CRMINQUERY:
2188 case OP_CRPOSSTAR:
2189 case OP_CRPOSPLUS:
2190 case OP_CRPOSQUERY:
2191 fc = *Fecode++ - OP_CRSTAR; /* Offset from OP_CRSTAR indexes the rep_* tables */
2192 Lmin = rep_min[fc];
2193 Lmax = rep_max[fc];
2194 reptype = rep_typ[fc];
2195 break;
2196
2197 case OP_CRRANGE:
2198 case OP_CRMINRANGE:
2199 case OP_CRPOSRANGE:
2200 Lmin = GET2(Fecode, 1); /* Minimum count follows the opcode */
2201 Lmax = GET2(Fecode, 1 + IMM2_SIZE); /* Maximum count follows the minimum */
2202 if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */
2203 reptype = rep_typ[*Fecode - OP_CRSTAR];
2204 Fecode += 1 + 2 * IMM2_SIZE; /* Step over the opcode and both counts */
2205 break;
2206
/* reptype is not assigned in the default case; with Lmin == Lmax the
"continue" below is taken before reptype is read. */
2207 default: /* No repeat follows */
2208 Lmin = Lmax = 1;
2209 break;
2210 }
2211
2212 /* First, ensure the minimum number of matches are present. Running out of
subject, or meeting a character for which PRIV(xclass)() returns false,
ends the attempt with MATCH_NOMATCH. */
2213
2214 for (i = 1; i <= Lmin; i++)
2215 {
2216 if (Feptr >= mb->end_subject)
2217 {
2218 SCHECK_PARTIAL(); /* Macro defined elsewhere; presumably partial-match bookkeeping */
2219 RRETURN(MATCH_NOMATCH);
2220 }
2221 GETCHARINCTEST(fc, Feptr); /* Loads fc; judging by the name, also advances Feptr */
2222 if (!PRIV(xclass)(fc, Lxclass_data, utf)) RRETURN(MATCH_NOMATCH);
2223 }
2224
2225 /* If Lmax == Lmin we can just continue with the main loop. */
2226
2227 if (Lmin == Lmax) continue;
2228
2229 /* If minimizing, keep testing the rest of the expression and advancing
2230 the pointer while it matches the class. */
2231
2232 if (reptype == REPTYPE_MIN)
2233 {
2234 for (;;)
2235 {
2236 RMATCH(Fecode, RM100); /* Try the remainder of the pattern first */
2237 if (rrc != MATCH_NOMATCH) RRETURN(rrc); /* Any other result is passed straight back */
2238 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); /* Lmin doubles as the running count */
2239 if (Feptr >= mb->end_subject)
2240 {
2241 SCHECK_PARTIAL();
2242 RRETURN(MATCH_NOMATCH);
2243 }
2244 GETCHARINCTEST(fc, Feptr);
2245 if (!PRIV(xclass)(fc, Lxclass_data, utf)) RRETURN(MATCH_NOMATCH);
2246 }
2247 /* Control never gets here */
2248 }
2249
2250 /* If maximizing, find the longest possible run, then work backwards. */
2251
2252 else
2253 {
2254 Lstart_eptr = Feptr; /* Lower bound for the backtracking loop below */
2255 for (i = Lmin; i < Lmax; i++)
2256 {
2257 int len = 1;
2258 if (Feptr >= mb->end_subject)
2259 {
2260 SCHECK_PARTIAL();
2261 break;
2262 }
2263 #ifdef SUPPORT_UNICODE
2264 GETCHARLENTEST(fc, Feptr, len);
2265 #else
2266 fc = *Feptr;
2267 #endif
2268 if (!PRIV(xclass)(fc, Lxclass_data, utf)) break;
2269 Feptr += len; /* Advance only after the character has been accepted */
2270 }
2271
2272 if (reptype == REPTYPE_POS) continue; /* No backtracking */
2273
2274 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
2275 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
2276 go too far. */
2277
2278 for(;;)
2279 {
2280 RMATCH(Fecode, RM101); /* Try the remainder of the pattern at this length */
2281 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2282 if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */
2283 #ifdef SUPPORT_UNICODE
2284 if (utf) BACKCHAR(Feptr);
2285 #endif
2286 }
2287 RRETURN(MATCH_NOMATCH);
2288 }
2289
2290 /* Control never gets here */
2291 }
2292 #endif /* SUPPORT_WIDE_CHARS: end of XCLASS */
2293
2294 #undef Lstart_eptr
2295 #undef Lxclass_data
2296 #undef Lmin
2297 #undef Lmax
2298
2299
2300 /* ===================================================================== */
2301 /* Match various character types when PCRE2_UCP is not set. These opcodes
2302 are not generated when PCRE2_UCP is set - instead appropriate property
2303 tests are compiled. */
2304
2305 case OP_NOT_DIGIT:
/* Accept one character that is not a digit. The mb->ctypes table is only
consulted when CHMAX_255(fc) holds (macro defined elsewhere; by its name a
"fits in 0..255" guard); any other character is accepted without a lookup. */
2306 if (Feptr >= mb->end_subject)
2307 {
2308 SCHECK_PARTIAL(); /* Macro defined elsewhere; presumably partial-match bookkeeping */
2309 RRETURN(MATCH_NOMATCH);
2310 }
2311 GETCHARINCTEST(fc, Feptr); /* Loads fc; judging by the name, also advances Feptr */
2312 if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0)
2313 RRETURN(MATCH_NOMATCH); /* Digit bit is set, so the negated test fails */
2314 Fecode++; /* Step over this opcode */
2315 break;
2316
2317 case OP_DIGIT:
/* Accept one digit character. The match fails when CHMAX_255(fc) is false
(macro defined elsewhere; by its name a "fits in 0..255" guard) or when the
ctype_digit bit is clear in mb->ctypes[fc]. */
2318 if (Feptr >= mb->end_subject)
2319 {
2320 SCHECK_PARTIAL(); /* Macro defined elsewhere; presumably partial-match bookkeeping */
2321 RRETURN(MATCH_NOMATCH);
2322 }
2323 GETCHARINCTEST(fc, Feptr); /* Loads fc; judging by the name, also advances Feptr */
2324 if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0)
2325 RRETURN(MATCH_NOMATCH);
2326 Fecode++; /* Step over this opcode */
2327 break;
2328
2329 case OP_NOT_WHITESPACE:
/* Accept one character that is not white space. The mb->ctypes table is only
consulted when CHMAX_255(fc) holds (macro defined elsewhere; by its name a
"fits in 0..255" guard); any other character is accepted without a lookup. */
2330 if (Feptr >= mb->end_subject)
2331 {
2332 SCHECK_PARTIAL(); /* Macro defined elsewhere; presumably partial-match bookkeeping */
2333 RRETURN(MATCH_NOMATCH);
2334 }
2335 GETCHARINCTEST(fc, Feptr); /* Loads fc; judging by the name, also advances Feptr */
2336 if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0)
2337 RRETURN(MATCH_NOMATCH); /* Space bit is set, so the negated test fails */
2338 Fecode++; /* Step over this opcode */
2339 break;
2340
2341 case OP_WHITESPACE:
/* Accept one white space character. The match fails when CHMAX_255(fc) is
false (macro defined elsewhere; by its name a "fits in 0..255" guard) or when
the ctype_space bit is clear in mb->ctypes[fc]. */
2342 if (Feptr >= mb->end_subject)
2343 {
2344 SCHECK_PARTIAL(); /* Macro defined elsewhere; presumably partial-match bookkeeping */
2345 RRETURN(MATCH_NOMATCH);
2346 }
2347 GETCHARINCTEST(fc, Feptr); /* Loads fc; judging by the name, also advances Feptr */
2348 if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0)
2349 RRETURN(MATCH_NOMATCH);
2350 Fecode++; /* Step over this opcode */
2351 break;
2352
2353 case OP_NOT_WORDCHAR:
/* Accept one character that is not a word character. The mb->ctypes table is
only consulted when CHMAX_255(fc) holds (macro defined elsewhere; by its name
a "fits in 0..255" guard); any other character is accepted without a lookup. */
2354 if (Feptr >= mb->end_subject)
2355 {
2356 SCHECK_PARTIAL(); /* Macro defined elsewhere; presumably partial-match bookkeeping */
2357 RRETURN(MATCH_NOMATCH);
2358 }
2359 GETCHARINCTEST(fc, Feptr); /* Loads fc; judging by the name, also advances Feptr */
2360 if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0)
2361 RRETURN(MATCH_NOMATCH); /* Word bit is set, so the negated test fails */
2362 Fecode++; /* Step over this opcode */
2363 break;
2364
2365 case OP_WORDCHAR:
/* Accept one word character. The match fails when CHMAX_255(fc) is false
(macro defined elsewhere; by its name a "fits in 0..255" guard) or when the
ctype_word bit is clear in mb->ctypes[fc]. */
2366 if (Feptr >= mb->end_subject)
2367 {
2368 SCHECK_PARTIAL(); /* Macro defined elsewhere; presumably partial-match bookkeeping */
2369 RRETURN(MATCH_NOMATCH);
2370 }
2371 GETCHARINCTEST(fc, Feptr); /* Loads fc; judging by the name, also advances Feptr */
2372 if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0)
2373 RRETURN(MATCH_NOMATCH);
2374 Fecode++; /* Step over this opcode */
2375 break;
2376
2377 case OP_ANYNL:
/* Accept one newline sequence: CR optionally followed by LF, a lone LF, or
one of VT, FF, NEL, 0x2028, 0x2029 (the two 0x20xx labels are compiled out
when EBCDIC is defined). The last group is refused when mb->bsr_convention
is PCRE2_BSR_ANYCRLF. Any other character fails the match. */
2378 if (Feptr >= mb->end_subject)
2379 {
2380 SCHECK_PARTIAL(); /* Macro defined elsewhere; presumably partial-match bookkeeping */
2381 RRETURN(MATCH_NOMATCH);
2382 }
2383 GETCHARINCTEST(fc, Feptr); /* Loads fc; judging by the name, also advances Feptr */
2384 switch(fc)
2385 {
2386 default: RRETURN(MATCH_NOMATCH);
2387
/* CR at the very end of the subject still counts as a match here; only
SCHECK_PARTIAL() is invoked and no failure is reported by this code. When
more subject remains, a directly following LF is absorbed as well. */
2388 case CHAR_CR:
2389 if (Feptr >= mb->end_subject)
2390 {
2391 SCHECK_PARTIAL();
2392 }
2393 else if (UCHAR21TEST(Feptr) == CHAR_LF) Feptr++;
2394 break;
2395
2396 case CHAR_LF:
2397 break;
2398
2399 case CHAR_VT:
2400 case CHAR_FF:
2401 case CHAR_NEL:
2402 #ifndef EBCDIC
2403 case 0x2028:
2404 case 0x2029:
2405 #endif /* Not EBCDIC */
2406 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
2407 break;
2408 }
2409 Fecode++; /* Step over this opcode */
2410 break;
2411
2412 case OP_NOT_HSPACE:
/* Accept one character that is not horizontal space. HSPACE_CASES is a macro
defined elsewhere that supplies the case label(s) for the horizontal-space
characters; hitting one of them fails the match, anything else passes. */
2413 if (Feptr >= mb->end_subject)
2414 {
2415 SCHECK_PARTIAL(); /* Macro defined elsewhere; presumably partial-match bookkeeping */
2416 RRETURN(MATCH_NOMATCH);
2417 }
2418 GETCHARINCTEST(fc, Feptr); /* Loads fc; judging by the name, also advances Feptr */
2419 switch(fc)
2420 {
2421 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2422 default: break;
2423 }
2424 Fecode++; /* Step over this opcode */
2425 break;
2426
2427 case OP_HSPACE:
/* Accept one horizontal space character. HSPACE_CASES is a macro defined
elsewhere that supplies the case label(s) for the horizontal-space
characters; any character not covered by it fails the match. */
2428 if (Feptr >= mb->end_subject)
2429 {
2430 SCHECK_PARTIAL(); /* Macro defined elsewhere; presumably partial-match bookkeeping */
2431 RRETURN(MATCH_NOMATCH);
2432 }
2433 GETCHARINCTEST(fc, Feptr); /* Loads fc; judging by the name, also advances Feptr */
2434 switch(fc)
2435 {
2436 HSPACE_CASES: break; /* Byte and multibyte cases */
2437 default: RRETURN(MATCH_NOMATCH);
2438 }
2439 Fecode++; /* Step over this opcode */
2440 break;
2441
2442 case OP_NOT_VSPACE:
/* Accept one character that is not vertical space. VSPACE_CASES is a macro
defined elsewhere that supplies the case label(s) for the vertical-space
characters; hitting one of them fails the match, anything else passes. */
2443 if (Feptr >= mb->end_subject)
2444 {
2445 SCHECK_PARTIAL(); /* Macro defined elsewhere; presumably partial-match bookkeeping */
2446 RRETURN(MATCH_NOMATCH);
2447 }
2448 GETCHARINCTEST(fc, Feptr); /* Loads fc; judging by the name, also advances Feptr */
2449 switch(fc)
2450 {
2451 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2452 default: break;
2453 }
2454 Fecode++; /* Step over this opcode */
2455 break;
2456
2457 case OP_VSPACE:
/* Accept one vertical space character. VSPACE_CASES is a macro defined
elsewhere that supplies the case label(s) for the vertical-space characters;
any character not covered by it fails the match. */
2458 if (Feptr >= mb->end_subject)
2459 {
2460 SCHECK_PARTIAL(); /* Macro defined elsewhere; presumably partial-match bookkeeping */
2461 RRETURN(MATCH_NOMATCH);
2462 }
2463 GETCHARINCTEST(fc, Feptr); /* Loads fc; judging by the name, also advances Feptr */
2464 switch(fc)
2465 {
2466 VSPACE_CASES: break;
2467 default: RRETURN(MATCH_NOMATCH);
2468 }
2469 Fecode++; /* Step over this opcode */
2470 break;
2471
2472
2473 #ifdef SUPPORT_UNICODE
2474
2475 /* ===================================================================== */
2476 /* Check the next character by Unicode property. We will get here only
2477 if the support is in the binary; otherwise a compile-time error occurs. */
2478
2479 case OP_PROP:
2480 case OP_NOTPROP:
/* Test one character against a Unicode property. Fecode[1] selects the
property type (a PT_xxx value) and Fecode[2] carries its argument. Every test
is phrased as "(character has the property) == notmatch": for OP_PROP
(notmatch FALSE) that fails when the property is absent, for OP_NOTPROP
(notmatch TRUE) it fails when the property is present. */
2481 if (Feptr >= mb->end_subject)
2482 {
2483 SCHECK_PARTIAL(); /* Macro defined elsewhere; presumably partial-match bookkeeping */
2484 RRETURN(MATCH_NOMATCH);
2485 }
2486 GETCHARINCTEST(fc, Feptr); /* Loads fc; judging by the name, also advances Feptr */
2487 {
2488 const uint32_t *cp;
2489 uint32_t chartype;
2490 const ucd_record *prop = GET_UCD(fc); /* Property record for this character */
2491 BOOL notmatch = Fop == OP_NOTPROP;
2492
2493 switch(Fecode[1])
2494 {
/* Every character has this property, so only the negated form can fail. */
2495 case PT_ANY:
2496 if (notmatch) RRETURN(MATCH_NOMATCH);
2497 break;
2498
/* Character type is one of Lu, Ll or Lt. */
2499 case PT_LAMP:
2500 chartype = prop->chartype;
2501 if ((chartype == ucp_Lu ||
2502 chartype == ucp_Ll ||
2503 chartype == ucp_Lt) == notmatch)
2504 RRETURN(MATCH_NOMATCH);
2505 break;
2506
/* Compare against the value that PRIV(ucp_gentype)[] maps the character
type to. */
2507 case PT_GC:
2508 if ((Fecode[2] == PRIV(ucp_gentype)[prop->chartype]) == notmatch)
2509 RRETURN(MATCH_NOMATCH);
2510 break;
2511
/* Compare against the character type itself. */
2512 case PT_PC:
2513 if ((Fecode[2] == prop->chartype) == notmatch)
2514 RRETURN(MATCH_NOMATCH);
2515 break;
2516
/* Compare against the character's script. */
2517 case PT_SC:
2518 if ((Fecode[2] == prop->script) == notmatch)
2519 RRETURN(MATCH_NOMATCH);
2520 break;
2521
/* Either the character's script matches, or the requested script's bit is
set in the character's entry in PRIV(ucd_script_sets). */
2522 case PT_SCX:
2523 {
2524 BOOL ok = (Fecode[2] == prop->script ||
2525 MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Fecode[2]) != 0);
2526 if (ok == notmatch) RRETURN(MATCH_NOMATCH);
2527 }
2528 break;
2529
2530 /* These are specials */
2531
/* General type L or N. */
2532 case PT_ALNUM:
2533 chartype = prop->chartype;
2534 if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
2535 PRIV(ucp_gentype)[chartype] == ucp_N) == notmatch)
2536 RRETURN(MATCH_NOMATCH);
2537 break;
2538
2539 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2540 which means that Perl space and POSIX space are now identical. PCRE
2541 was changed at release 8.34. */
2542
2543 case PT_SPACE: /* Perl space */
2544 case PT_PXSPACE: /* POSIX space */
2545 switch(fc)
2546 {
2547 HSPACE_CASES:
2548 VSPACE_CASES:
2549 if (notmatch) RRETURN(MATCH_NOMATCH);
2550 break;
2551
/* Not one of the listed space characters: fall back to general type Z. */
2552 default:
2553 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == notmatch)
2554 RRETURN(MATCH_NOMATCH);
2555 break;
2556 }
2557 break;
2558
/* General type L or N, or character type Mn or Pc. */
2559 case PT_WORD:
2560 chartype = prop->chartype;
2561 if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
2562 PRIV(ucp_gentype)[chartype] == ucp_N ||
2563 chartype == ucp_Mn ||
2564 chartype == ucp_Pc) == notmatch)
2565 RRETURN(MATCH_NOMATCH);
2566 break;
2567
/* Membership of the character list that starts at
PRIV(ucd_caseless_sets) + Fecode[2]. In the 32-bit library a value above
MAX_UTF_CODE_POINT is never searched for: it satisfies only the negated
test. The search stops at the first entry greater than fc (not a member)
or equal to fc (a member). NOTE(review): this relies on the list being in
ascending order and ending with a value no character can exceed - confirm
against the table generator. */
2568 case PT_CLIST:
2569 #if PCRE2_CODE_UNIT_WIDTH == 32
2570 if (fc > MAX_UTF_CODE_POINT)
2571 {
2572 if (notmatch) break;
2573 RRETURN(MATCH_NOMATCH);
2574 }
2575 #endif
2576 cp = PRIV(ucd_caseless_sets) + Fecode[2];
2577 for (;;)
2578 {
2579 if (fc < *cp)
2580 { if (notmatch) break; else { RRETURN(MATCH_NOMATCH); } }
2581 if (fc == *cp++)
2582 { if (notmatch) { RRETURN(MATCH_NOMATCH); } else break; }
2583 }
2584 break;
2585
/* Dollar sign, commercial at, grave accent, 0xa0..0xd7ff, or 0xe000 and
above. */
2586 case PT_UCNC:
2587 if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
2588 fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
2589 fc >= 0xe000) == notmatch)
2590 RRETURN(MATCH_NOMATCH);
2591 break;
2592
/* Compare against the character's bidi class. */
2593 case PT_BIDICL:
2594 if ((UCD_BIDICLASS_PROP(prop) == Fecode[2]) == notmatch)
2595 RRETURN(MATCH_NOMATCH);
2596 break;
2597
/* Test the requested bit in the character's entry in
PRIV(ucd_boolprop_sets). */
2598 case PT_BOOL:
2599 {
2600 BOOL ok = MAPBIT(PRIV(ucd_boolprop_sets) +
2601 UCD_BPROPS_PROP(prop), Fecode[2]) != 0;
2602 if (ok == notmatch) RRETURN(MATCH_NOMATCH);
2603 }
2604 break;
2605
2606 /* This should never occur */
2607
/* Unknown property type: a plain return is used here, not RRETURN. */
2608 default:
2609 return PCRE2_ERROR_INTERNAL;
2610 }
2611
2612 Fecode += 3; /* Step over the opcode, the property type and its argument */
2613 }
2614 break;
2615
2616
2617 /* ===================================================================== */
2618 /* Match an extended Unicode sequence. We will get here only if the support
2619 is in the binary; otherwise a compile-time error occurs. */
2620
2621 case OP_EXTUNI:
/* Accept one extended Unicode sequence. The first character is fetched here;
PRIV(extuni)() is then given that character together with the subject bounds
and returns the pointer that becomes the new Feptr. */
2622 if (Feptr >= mb->end_subject)
2623 {
2624 SCHECK_PARTIAL(); /* Macro defined elsewhere; presumably partial-match bookkeeping */
2625 RRETURN(MATCH_NOMATCH);
2626 }
2627 else
2628 {
2629 GETCHARINCTEST(fc, Feptr); /* Loads fc; judging by the name, also advances Feptr */
2630 Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, utf,
2631 NULL);
2632 }
2633 CHECK_PARTIAL(); /* Distinct from SCHECK_PARTIAL(); defined elsewhere - semantics not visible here */
2634 Fecode++; /* Step over this opcode */
2635 break;
2636
2637 #endif /* SUPPORT_UNICODE */
2638
2639
2640 /* ===================================================================== */
2641 /* Match a single character type repeatedly. Note that the property type
2642 does not need to be in a stack frame as it is not used within an RMATCH()
2643 loop. */
2644
2645 #define Lstart_eptr F->temp_sptr[0]
2646 #define Lmin F->temp_32[0]
2647 #define Lmax F->temp_32[1]
2648 #define Lctype F->temp_32[2]
2649 #define Lpropvalue F->temp_32[3]
2650
2651 case OP_TYPEEXACT:
/* Entry points for repeated character types that carry their own count or
are possessive. Each one sets Lmin and Lmax (and reptype where needed), steps
Fecode to the character-type code, and joins the common code at REPEATTYPE.

OP_TYPEEXACT does not set reptype: with Lmin == Lmax the common code reaches
its "if (Lmin == Lmax) continue;" before reptype is consulted. */
2652 Lmin = Lmax = GET2(Fecode, 1);
2653 Fecode += 1 + IMM2_SIZE; /* Step over the opcode and its count */
2654 goto REPEATTYPE;
2655
/* Zero up to N times, minimizing or maximizing according to the opcode. */
2656 case OP_TYPEUPTO:
2657 case OP_TYPEMINUPTO:
2658 Lmin = 0;
2659 Lmax = GET2(Fecode, 1);
2660 reptype = (*Fecode == OP_TYPEMINUPTO)? REPTYPE_MIN : REPTYPE_MAX;
2661 Fecode += 1 + IMM2_SIZE; /* Step over the opcode and its count */
2662 goto REPEATTYPE;
2663
/* Possessive forms: zero or more with no upper bound. */
2664 case OP_TYPEPOSSTAR:
2665 reptype = REPTYPE_POS;
2666 Lmin = 0;
2667 Lmax = UINT32_MAX;
2668 Fecode++;
2669 goto REPEATTYPE;
2670
/* One or more with no upper bound. */
2671 case OP_TYPEPOSPLUS:
2672 reptype = REPTYPE_POS;
2673 Lmin = 1;
2674 Lmax = UINT32_MAX;
2675 Fecode++;
2676 goto REPEATTYPE;
2677
/* Zero or one. */
2678 case OP_TYPEPOSQUERY:
2679 reptype = REPTYPE_POS;
2680 Lmin = 0;
2681 Lmax = 1;
2682 Fecode++;
2683 goto REPEATTYPE;
2684
/* Zero up to N times. */
2685 case OP_TYPEPOSUPTO:
2686 reptype = REPTYPE_POS;
2687 Lmin = 0;
2688 Lmax = GET2(Fecode, 1);
2689 Fecode += 1 + IMM2_SIZE; /* Step over the opcode and its count */
2690 goto REPEATTYPE;
2691
2692 case OP_TYPESTAR:
2693 case OP_TYPEMINSTAR:
2694 case OP_TYPEPLUS:
2695 case OP_TYPEMINPLUS:
2696 case OP_TYPEQUERY:
2697 case OP_TYPEMINQUERY:
2698 fc = *Fecode++ - OP_TYPESTAR;
2699 Lmin = rep_min[fc];
2700 Lmax = rep_max[fc];
2701 reptype = rep_typ[fc];
2702
2703 /* Common code for all repeated character type matches. */
2704
2705 REPEATTYPE:
2706 Lctype = *Fecode++; /* Code for the character type */
2707
2708 #ifdef SUPPORT_UNICODE
2709 if (Lctype == OP_PROP || Lctype == OP_NOTPROP)
2710 {
2711 proptype = *Fecode++;
2712 Lpropvalue = *Fecode++;
2713 }
2714 else proptype = -1;
2715 #endif
2716
2717 /* First, ensure the minimum number of matches are present. Use inline
2718 code for maximizing the speed, and do the type test once at the start
2719 (i.e. keep it out of the loops). As there are no calls to RMATCH in the
2720 loops, we can use an ordinary variable for "notmatch". The code for UTF
2721 mode is separated out for tidiness, except for Unicode property tests. */
2722
2723 if (Lmin > 0)
2724 {
2725 #ifdef SUPPORT_UNICODE
2726 if (proptype >= 0) /* Property tests in all modes */
2727 {
2728 BOOL notmatch = Lctype == OP_NOTPROP;
2729 switch(proptype)
2730 {
2731 case PT_ANY:
2732 if (notmatch) RRETURN(MATCH_NOMATCH);
2733 for (i = 1; i <= Lmin; i++)
2734 {
2735 if (Feptr >= mb->end_subject)
2736 {
2737 SCHECK_PARTIAL();
2738 RRETURN(MATCH_NOMATCH);
2739 }
2740 GETCHARINCTEST(fc, Feptr);
2741 }
2742 break;
2743
2744 case PT_LAMP:
2745 for (i = 1; i <= Lmin; i++)
2746 {
2747 int chartype;
2748 if (Feptr >= mb->end_subject)
2749 {
2750 SCHECK_PARTIAL();
2751 RRETURN(MATCH_NOMATCH);
2752 }
2753 GETCHARINCTEST(fc, Feptr);
2754 chartype = UCD_CHARTYPE(fc);
2755 if ((chartype == ucp_Lu ||
2756 chartype == ucp_Ll ||
2757 chartype == ucp_Lt) == notmatch)
2758 RRETURN(MATCH_NOMATCH);
2759 }
2760 break;
2761
2762 case PT_GC:
2763 for (i = 1; i <= Lmin; i++)
2764 {
2765 if (Feptr >= mb->end_subject)
2766 {
2767 SCHECK_PARTIAL();
2768 RRETURN(MATCH_NOMATCH);
2769 }
2770 GETCHARINCTEST(fc, Feptr);
2771 if ((UCD_CATEGORY(fc) == Lpropvalue) == notmatch)
2772 RRETURN(MATCH_NOMATCH);
2773 }
2774 break;
2775
2776 case PT_PC:
2777 for (i = 1; i <= Lmin; i++)
2778 {
2779 if (Feptr >= mb->end_subject)
2780 {
2781 SCHECK_PARTIAL();
2782 RRETURN(MATCH_NOMATCH);
2783 }
2784 GETCHARINCTEST(fc, Feptr);
2785 if ((UCD_CHARTYPE(fc) == Lpropvalue) == notmatch)
2786 RRETURN(MATCH_NOMATCH);
2787 }
2788 break;
2789
2790 case PT_SC:
2791 for (i = 1; i <= Lmin; i++)
2792 {
2793 if (Feptr >= mb->end_subject)
2794 {
2795 SCHECK_PARTIAL();
2796 RRETURN(MATCH_NOMATCH);
2797 }
2798 GETCHARINCTEST(fc, Feptr);
2799 if ((UCD_SCRIPT(fc) == Lpropvalue) == notmatch)
2800 RRETURN(MATCH_NOMATCH);
2801 }
2802 break;
2803
2804 case PT_SCX:
2805 for (i = 1; i <= Lmin; i++)
2806 {
2807 BOOL ok;
2808 const ucd_record *prop;
2809 if (Feptr >= mb->end_subject)
2810 {
2811 SCHECK_PARTIAL();
2812 RRETURN(MATCH_NOMATCH);
2813 }
2814 GETCHARINCTEST(fc, Feptr);
2815 prop = GET_UCD(fc);
2816 ok = (prop->script == Lpropvalue ||
2817 MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0);
2818 if (ok == notmatch)
2819 RRETURN(MATCH_NOMATCH);
2820 }
2821 break;
2822
2823 case PT_ALNUM:
2824 for (i = 1; i <= Lmin; i++)
2825 {
2826 int category;
2827 if (Feptr >= mb->end_subject)
2828 {
2829 SCHECK_PARTIAL();
2830 RRETURN(MATCH_NOMATCH);
2831 }
2832 GETCHARINCTEST(fc, Feptr);
2833 category = UCD_CATEGORY(fc);
2834 if ((category == ucp_L || category == ucp_N) == notmatch)
2835 RRETURN(MATCH_NOMATCH);
2836 }
2837 break;
2838
2839 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2840 which means that Perl space and POSIX space are now identical. PCRE
2841 was changed at release 8.34. */
2842
2843 case PT_SPACE: /* Perl space */
2844 case PT_PXSPACE: /* POSIX space */
2845 for (i = 1; i <= Lmin; i++)
2846 {
2847 if (Feptr >= mb->end_subject)
2848 {
2849 SCHECK_PARTIAL();
2850 RRETURN(MATCH_NOMATCH);
2851 }
2852 GETCHARINCTEST(fc, Feptr);
2853 switch(fc)
2854 {
2855 HSPACE_CASES:
2856 VSPACE_CASES:
2857 if (notmatch) RRETURN(MATCH_NOMATCH);
2858 break;
2859
2860 default:
2861 if ((UCD_CATEGORY(fc) == ucp_Z) == notmatch)
2862 RRETURN(MATCH_NOMATCH);
2863 break;
2864 }
2865 }
2866 break;
2867
2868 case PT_WORD:
2869 for (i = 1; i <= Lmin; i++)
2870 {
2871 int chartype, category;
2872 if (Feptr >= mb->end_subject)
2873 {
2874 SCHECK_PARTIAL();
2875 RRETURN(MATCH_NOMATCH);
2876 }
2877 GETCHARINCTEST(fc, Feptr);
2878 chartype = UCD_CHARTYPE(fc);
2879 category = PRIV(ucp_gentype)[chartype];
2880 if ((category == ucp_L || category == ucp_N ||
2881 chartype == ucp_Mn || chartype == ucp_Pc) == notmatch)
2882 RRETURN(MATCH_NOMATCH);
2883 }
2884 break;
2885
2886 case PT_CLIST:
2887 for (i = 1; i <= Lmin; i++)
2888 {
2889 const uint32_t *cp;
2890 if (Feptr >= mb->end_subject)
2891 {
2892 SCHECK_PARTIAL();
2893 RRETURN(MATCH_NOMATCH);
2894 }
2895 GETCHARINCTEST(fc, Feptr);
2896 #if PCRE2_CODE_UNIT_WIDTH == 32
2897 if (fc > MAX_UTF_CODE_POINT)
2898 {
2899 if (notmatch) continue;
2900 RRETURN(MATCH_NOMATCH);
2901 }
2902 #endif
2903 cp = PRIV(ucd_caseless_sets) + Lpropvalue;
2904 for (;;)
2905 {
2906 if (fc < *cp)
2907 {
2908 if (notmatch) break;
2909 RRETURN(MATCH_NOMATCH);
2910 }
2911 if (fc == *cp++)
2912 {
2913 if (notmatch) RRETURN(MATCH_NOMATCH);
2914 break;
2915 }
2916 }
2917 }
2918 break;
2919
2920 case PT_UCNC:
2921 for (i = 1; i <= Lmin; i++)
2922 {
2923 if (Feptr >= mb->end_subject)
2924 {
2925 SCHECK_PARTIAL();
2926 RRETURN(MATCH_NOMATCH);
2927 }
2928 GETCHARINCTEST(fc, Feptr);
2929 if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
2930 fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
2931 fc >= 0xe000) == notmatch)
2932 RRETURN(MATCH_NOMATCH);
2933 }
2934 break;
2935
2936 case PT_BIDICL:
2937 for (i = 1; i <= Lmin; i++)
2938 {
2939 if (Feptr >= mb->end_subject)
2940 {
2941 SCHECK_PARTIAL();
2942 RRETURN(MATCH_NOMATCH);
2943 }
2944 GETCHARINCTEST(fc, Feptr);
2945 if ((UCD_BIDICLASS(fc) == Lpropvalue) == notmatch)
2946 RRETURN(MATCH_NOMATCH);
2947 }
2948 break;
2949
2950 case PT_BOOL:
2951 for (i = 1; i <= Lmin; i++)
2952 {
2953 BOOL ok;
2954 const ucd_record *prop;
2955 if (Feptr >= mb->end_subject)
2956 {
2957 SCHECK_PARTIAL();
2958 RRETURN(MATCH_NOMATCH);
2959 }
2960 GETCHARINCTEST(fc, Feptr);
2961 prop = GET_UCD(fc);
2962 ok = MAPBIT(PRIV(ucd_boolprop_sets) +
2963 UCD_BPROPS_PROP(prop), Lpropvalue) != 0;
2964 if (ok == notmatch)
2965 RRETURN(MATCH_NOMATCH);
2966 }
2967 break;
2968
2969 /* This should not occur */
2970
2971 default:
2972 return PCRE2_ERROR_INTERNAL;
2973 }
2974 }
2975
2976 /* Match extended Unicode sequences. We will get here only if the
2977 support is in the binary; otherwise a compile-time error occurs. */
2978
2979 else if (Lctype == OP_EXTUNI)
2980 {
2981 for (i = 1; i <= Lmin; i++)
2982 {
2983 if (Feptr >= mb->end_subject)
2984 {
2985 SCHECK_PARTIAL();
2986 RRETURN(MATCH_NOMATCH);
2987 }
2988 else
2989 {
2990 GETCHARINCTEST(fc, Feptr);
2991 Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject,
2992 mb->end_subject, utf, NULL);
2993 }
2994 CHECK_PARTIAL();
2995 }
2996 }
2997 else
2998 #endif /* SUPPORT_UNICODE */
2999
3000 /* Handle all other cases in UTF mode */
3001
3002 #ifdef SUPPORT_UNICODE
3003 if (utf) switch(Lctype)
3004 {
3005 case OP_ANY:
3006 for (i = 1; i <= Lmin; i++)
3007 {
3008 if (Feptr >= mb->end_subject)
3009 {
3010 SCHECK_PARTIAL();
3011 RRETURN(MATCH_NOMATCH);
3012 }
3013 if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
3014 if (mb->partial != 0 &&
3015 Feptr + 1 >= mb->end_subject &&
3016 NLBLOCK->nltype == NLTYPE_FIXED &&
3017 NLBLOCK->nllen == 2 &&
3018 UCHAR21(Feptr) == NLBLOCK->nl[0])
3019 {
3020 mb->hitend = TRUE;
3021 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
3022 }
3023 Feptr++;
3024 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
3025 }
3026 break;
3027
3028 case OP_ALLANY:
3029 for (i = 1; i <= Lmin; i++)
3030 {
3031 if (Feptr >= mb->end_subject)
3032 {
3033 SCHECK_PARTIAL();
3034 RRETURN(MATCH_NOMATCH);
3035 }
3036 Feptr++;
3037 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
3038 }
3039 break;
3040
3041 case OP_ANYBYTE:
3042 if (Feptr > mb->end_subject - Lmin) RRETURN(MATCH_NOMATCH);
3043 Feptr += Lmin;
3044 break;
3045
3046 case OP_ANYNL:
3047 for (i = 1; i <= Lmin; i++)
3048 {
3049 if (Feptr >= mb->end_subject)
3050 {
3051 SCHECK_PARTIAL();
3052 RRETURN(MATCH_NOMATCH);
3053 }
3054 GETCHARINC(fc, Feptr);
3055 switch(fc)
3056 {
3057 default: RRETURN(MATCH_NOMATCH);
3058
3059 case CHAR_CR:
3060 if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++;
3061 break;
3062
3063 case CHAR_LF:
3064 break;
3065
3066 case CHAR_VT:
3067 case CHAR_FF:
3068 case CHAR_NEL:
3069 #ifndef EBCDIC
3070 case 0x2028:
3071 case 0x2029:
3072 #endif /* Not EBCDIC */
3073 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
3074 break;
3075 }
3076 }
3077 break;
3078
3079 case OP_NOT_HSPACE:
3080 for (i = 1; i <= Lmin; i++)
3081 {
3082 if (Feptr >= mb->end_subject)
3083 {
3084 SCHECK_PARTIAL();
3085 RRETURN(MATCH_NOMATCH);
3086 }
3087 GETCHARINC(fc, Feptr);
3088 switch(fc)
3089 {
3090 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
3091 default: break;
3092 }
3093 }
3094 break;
3095
3096 case OP_HSPACE:
3097 for (i = 1; i <= Lmin; i++)
3098 {
3099 if (Feptr >= mb->end_subject)
3100 {
3101 SCHECK_PARTIAL();
3102 RRETURN(MATCH_NOMATCH);
3103 }
3104 GETCHARINC(fc, Feptr);
3105 switch(fc)
3106 {
3107 HSPACE_CASES: break;
3108 default: RRETURN(MATCH_NOMATCH);
3109 }
3110 }
3111 break;
3112
3113 case OP_NOT_VSPACE:
3114 for (i = 1; i <= Lmin; i++)
3115 {
3116 if (Feptr >= mb->end_subject)
3117 {
3118 SCHECK_PARTIAL();
3119 RRETURN(MATCH_NOMATCH);
3120 }
3121 GETCHARINC(fc, Feptr);
3122 switch(fc)
3123 {
3124 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
3125 default: break;
3126 }
3127 }
3128 break;
3129
3130 case OP_VSPACE:
3131 for (i = 1; i <= Lmin; i++)
3132 {
3133 if (Feptr >= mb->end_subject)
3134 {
3135 SCHECK_PARTIAL();
3136 RRETURN(MATCH_NOMATCH);
3137 }
3138 GETCHARINC(fc, Feptr);
3139 switch(fc)
3140 {
3141 VSPACE_CASES: break;
3142 default: RRETURN(MATCH_NOMATCH);
3143 }
3144 }
3145 break;
3146
3147 case OP_NOT_DIGIT:
3148 for (i = 1; i <= Lmin; i++)
3149 {
3150 if (Feptr >= mb->end_subject)
3151 {
3152 SCHECK_PARTIAL();
3153 RRETURN(MATCH_NOMATCH);
3154 }
3155 GETCHARINC(fc, Feptr);
3156 if (fc < 128 && (mb->ctypes[fc] & ctype_digit) != 0)
3157 RRETURN(MATCH_NOMATCH);
3158 }
3159 break;
3160
3161 case OP_DIGIT:
3162 for (i = 1; i <= Lmin; i++)
3163 {
3164 uint32_t cc;
3165 if (Feptr >= mb->end_subject)
3166 {
3167 SCHECK_PARTIAL();
3168 RRETURN(MATCH_NOMATCH);
3169 }
3170 cc = UCHAR21(Feptr);
3171 if (cc >= 128 || (mb->ctypes[cc] & ctype_digit) == 0)
3172 RRETURN(MATCH_NOMATCH);
3173 Feptr++;
3174 /* No need to skip more code units - we know it has only one. */
3175 }
3176 break;
3177
3178 case OP_NOT_WHITESPACE:
3179 for (i = 1; i <= Lmin; i++)
3180 {
3181 uint32_t cc;
3182 if (Feptr >= mb->end_subject)
3183 {
3184 SCHECK_PARTIAL();
3185 RRETURN(MATCH_NOMATCH);
3186 }
3187 cc = UCHAR21(Feptr);
3188 if (cc < 128 && (mb->ctypes[cc] & ctype_space) != 0)
3189 RRETURN(MATCH_NOMATCH);
3190 Feptr++;
3191 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
3192 }
3193 break;
3194
3195 case OP_WHITESPACE:
3196 for (i = 1; i <= Lmin; i++)
3197 {
3198 uint32_t cc;
3199 if (Feptr >= mb->end_subject)
3200 {
3201 SCHECK_PARTIAL();
3202 RRETURN(MATCH_NOMATCH);
3203 }
3204 cc = UCHAR21(Feptr);
3205 if (cc >= 128 || (mb->ctypes[cc] & ctype_space) == 0)
3206 RRETURN(MATCH_NOMATCH);
3207 Feptr++;
3208 /* No need to skip more code units - we know it has only one. */
3209 }
3210 break;
3211
3212 case OP_NOT_WORDCHAR:
3213 for (i = 1; i <= Lmin; i++)
3214 {
3215 uint32_t cc;
3216 if (Feptr >= mb->end_subject)
3217 {
3218 SCHECK_PARTIAL();
3219 RRETURN(MATCH_NOMATCH);
3220 }
3221 cc = UCHAR21(Feptr);
3222 if (cc < 128 && (mb->ctypes[cc] & ctype_word) != 0)
3223 RRETURN(MATCH_NOMATCH);
3224 Feptr++;
3225 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
3226 }
3227 break;
3228
3229 case OP_WORDCHAR:
3230 for (i = 1; i <= Lmin; i++)
3231 {
3232 uint32_t cc;
3233 if (Feptr >= mb->end_subject)
3234 {
3235 SCHECK_PARTIAL();
3236 RRETURN(MATCH_NOMATCH);
3237 }
3238 cc = UCHAR21(Feptr);
3239 if (cc >= 128 || (mb->ctypes[cc] & ctype_word) == 0)
3240 RRETURN(MATCH_NOMATCH);
3241 Feptr++;
3242 /* No need to skip more code units - we know it has only one. */
3243 }
3244 break;
3245
3246 default:
3247 return PCRE2_ERROR_INTERNAL;
3248 } /* End switch(Lctype) */
3249
3250 else
3251 #endif /* SUPPORT_UNICODE */
3252
3253 /* Code for the non-UTF case for minimum matching of operators other
3254 than OP_PROP and OP_NOTPROP. */
3255
3256 switch(Lctype)
3257 {
3258 case OP_ANY:
3259 for (i = 1; i <= Lmin; i++)
3260 {
3261 if (Feptr >= mb->end_subject)
3262 {
3263 SCHECK_PARTIAL();
3264 RRETURN(MATCH_NOMATCH);
3265 }
3266 if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
3267 if (mb->partial != 0 &&
3268 Feptr + 1 >= mb->end_subject &&
3269 NLBLOCK->nltype == NLTYPE_FIXED &&
3270 NLBLOCK->nllen == 2 &&
3271 *Feptr == NLBLOCK->nl[0])
3272 {
3273 mb->hitend = TRUE;
3274 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
3275 }
3276 Feptr++;
3277 }
3278 break;
3279
3280 case OP_ALLANY:
3281 if (Feptr > mb->end_subject - Lmin)
3282 {
3283 SCHECK_PARTIAL();
3284 RRETURN(MATCH_NOMATCH);
3285 }
3286 Feptr += Lmin;
3287 break;
3288
3289 /* This OP_ANYBYTE case will never be reached because \C gets turned
3290 into OP_ALLANY in non-UTF mode. Cut out the code so that coverage
3291 reports don't complain about its never being used. */
3292
3293 /* case OP_ANYBYTE:
3294 * if (Feptr > mb->end_subject - Lmin)
3295 * {
3296 * SCHECK_PARTIAL();
3297 * RRETURN(MATCH_NOMATCH);
3298 * }
3299 * Feptr += Lmin;
3300 * break;
3301 */
3302 case OP_ANYNL:
3303 for (i = 1; i <= Lmin; i++)
3304 {
3305 if (Feptr >= mb->end_subject)
3306 {
3307 SCHECK_PARTIAL();
3308 RRETURN(MATCH_NOMATCH);
3309 }
3310 switch(*Feptr++)
3311 {
3312 default: RRETURN(MATCH_NOMATCH);
3313
3314 case CHAR_CR:
3315 if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++;
3316 break;
3317
3318 case CHAR_LF:
3319 break;
3320
3321 case CHAR_VT:
3322 case CHAR_FF:
3323 case CHAR_NEL:
3324 #if PCRE2_CODE_UNIT_WIDTH != 8
3325 case 0x2028:
3326 case 0x2029:
3327 #endif
3328 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
3329 break;
3330 }
3331 }
3332 break;
3333
3334 case OP_NOT_HSPACE:
3335 for (i = 1; i <= Lmin; i++)
3336 {
3337 if (Feptr >= mb->end_subject)
3338 {
3339 SCHECK_PARTIAL();
3340 RRETURN(MATCH_NOMATCH);
3341 }
3342 switch(*Feptr++)
3343 {
3344 default: break;
3345 HSPACE_BYTE_CASES:
3346 #if PCRE2_CODE_UNIT_WIDTH != 8
3347 HSPACE_MULTIBYTE_CASES:
3348 #endif
3349 RRETURN(MATCH_NOMATCH);
3350 }
3351 }
3352 break;
3353
3354 case OP_HSPACE:
3355 for (i = 1; i <= Lmin; i++)
3356 {
3357 if (Feptr >= mb->end_subject)
3358 {
3359 SCHECK_PARTIAL();
3360 RRETURN(MATCH_NOMATCH);
3361 }
3362 switch(*Feptr++)
3363 {
3364 default: RRETURN(MATCH_NOMATCH);
3365 HSPACE_BYTE_CASES:
3366 #if PCRE2_CODE_UNIT_WIDTH != 8
3367 HSPACE_MULTIBYTE_CASES:
3368 #endif
3369 break;
3370 }
3371 }
3372 break;
3373
3374 case OP_NOT_VSPACE:
3375 for (i = 1; i <= Lmin; i++)
3376 {
3377 if (Feptr >= mb->end_subject)
3378 {
3379 SCHECK_PARTIAL();
3380 RRETURN(MATCH_NOMATCH);
3381 }
3382 switch(*Feptr++)
3383 {
3384 VSPACE_BYTE_CASES:
3385 #if PCRE2_CODE_UNIT_WIDTH != 8
3386 VSPACE_MULTIBYTE_CASES:
3387 #endif
3388 RRETURN(MATCH_NOMATCH);
3389 default: break;
3390 }
3391 }
3392 break;
3393
3394 case OP_VSPACE:
3395 for (i = 1; i <= Lmin; i++)
3396 {
3397 if (Feptr >= mb->end_subject)
3398 {
3399 SCHECK_PARTIAL();
3400 RRETURN(MATCH_NOMATCH);
3401 }
3402 switch(*Feptr++)
3403 {
3404 default: RRETURN(MATCH_NOMATCH);
3405 VSPACE_BYTE_CASES:
3406 #if PCRE2_CODE_UNIT_WIDTH != 8
3407 VSPACE_MULTIBYTE_CASES:
3408 #endif
3409 break;
3410 }
3411 }
3412 break;
3413
3414 case OP_NOT_DIGIT:
3415 for (i = 1; i <= Lmin; i++)
3416 {
3417 if (Feptr >= mb->end_subject)
3418 {
3419 SCHECK_PARTIAL();
3420 RRETURN(MATCH_NOMATCH);
3421 }
3422 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0)
3423 RRETURN(MATCH_NOMATCH);
3424 Feptr++;
3425 }
3426 break;
3427
3428 case OP_DIGIT:
3429 for (i = 1; i <= Lmin; i++)
3430 {
3431 if (Feptr >= mb->end_subject)
3432 {
3433 SCHECK_PARTIAL();
3434 RRETURN(MATCH_NOMATCH);
3435 }
3436 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0)
3437 RRETURN(MATCH_NOMATCH);
3438 Feptr++;
3439 }
3440 break;
3441
3442 case OP_NOT_WHITESPACE:
3443 for (i = 1; i <= Lmin; i++)
3444 {
3445 if (Feptr >= mb->end_subject)
3446 {
3447 SCHECK_PARTIAL();
3448 RRETURN(MATCH_NOMATCH);
3449 }
3450 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0)
3451 RRETURN(MATCH_NOMATCH);
3452 Feptr++;
3453 }
3454 break;
3455
3456 case OP_WHITESPACE:
3457 for (i = 1; i <= Lmin; i++)
3458 {
3459 if (Feptr >= mb->end_subject)
3460 {
3461 SCHECK_PARTIAL();
3462 RRETURN(MATCH_NOMATCH);
3463 }
3464 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0)
3465 RRETURN(MATCH_NOMATCH);
3466 Feptr++;
3467 }
3468 break;
3469
3470 case OP_NOT_WORDCHAR:
3471 for (i = 1; i <= Lmin; i++)
3472 {
3473 if (Feptr >= mb->end_subject)
3474 {
3475 SCHECK_PARTIAL();
3476 RRETURN(MATCH_NOMATCH);
3477 }
3478 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0)
3479 RRETURN(MATCH_NOMATCH);
3480 Feptr++;
3481 }
3482 break;
3483
3484 case OP_WORDCHAR:
3485 for (i = 1; i <= Lmin; i++)
3486 {
3487 if (Feptr >= mb->end_subject)
3488 {
3489 SCHECK_PARTIAL();
3490 RRETURN(MATCH_NOMATCH);
3491 }
3492 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0)
3493 RRETURN(MATCH_NOMATCH);
3494 Feptr++;
3495 }
3496 break;
3497
3498 default:
3499 return PCRE2_ERROR_INTERNAL;
3500 }
3501 }
3502
3503 /* If Lmin = Lmax we are done. Continue with the main loop. */
3504
3505 if (Lmin == Lmax) continue;
3506
3507 /* If minimizing, we have to test the rest of the pattern before each
3508 subsequent match. This means we cannot use a local "notmatch" variable as
3509 in the other cases. As all 4 temporary 32-bit values in the frame are
3510 already in use, just test the type each time. */
3511
3512 if (reptype == REPTYPE_MIN)
3513 {
3514 #ifdef SUPPORT_UNICODE
3515 if (proptype >= 0)
3516 {
3517 switch(proptype)
3518 {
3519 case PT_ANY:
3520 for (;;)
3521 {
3522 RMATCH(Fecode, RM208);
3523 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3524 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3525 if (Feptr >= mb->end_subject)
3526 {
3527 SCHECK_PARTIAL();
3528 RRETURN(MATCH_NOMATCH);
3529 }
3530 GETCHARINCTEST(fc, Feptr);
3531 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
3532 }
3533 /* Control never gets here */
3534
3535 case PT_LAMP:
3536 for (;;)
3537 {
3538 int chartype;
3539 RMATCH(Fecode, RM209);
3540 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3541 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3542 if (Feptr >= mb->end_subject)
3543 {
3544 SCHECK_PARTIAL();
3545 RRETURN(MATCH_NOMATCH);
3546 }
3547 GETCHARINCTEST(fc, Feptr);
3548 chartype = UCD_CHARTYPE(fc);
3549 if ((chartype == ucp_Lu ||
3550 chartype == ucp_Ll ||
3551 chartype == ucp_Lt) == (Lctype == OP_NOTPROP))
3552 RRETURN(MATCH_NOMATCH);
3553 }
3554 /* Control never gets here */
3555
3556 case PT_GC:
3557 for (;;)
3558 {
3559 RMATCH(Fecode, RM210);
3560 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3561 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3562 if (Feptr >= mb->end_subject)
3563 {
3564 SCHECK_PARTIAL();
3565 RRETURN(MATCH_NOMATCH);
3566 }
3567 GETCHARINCTEST(fc, Feptr);
3568 if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3569 RRETURN(MATCH_NOMATCH);
3570 }
3571 /* Control never gets here */
3572
3573 case PT_PC:
3574 for (;;)
3575 {
3576 RMATCH(Fecode, RM211);
3577 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3578 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3579 if (Feptr >= mb->end_subject)
3580 {
3581 SCHECK_PARTIAL();
3582 RRETURN(MATCH_NOMATCH);
3583 }
3584 GETCHARINCTEST(fc, Feptr);
3585 if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3586 RRETURN(MATCH_NOMATCH);
3587 }
3588 /* Control never gets here */
3589
3590 case PT_SC:
3591 for (;;)
3592 {
3593 RMATCH(Fecode, RM212);
3594 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3595 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3596 if (Feptr >= mb->end_subject)
3597 {
3598 SCHECK_PARTIAL();
3599 RRETURN(MATCH_NOMATCH);
3600 }
3601 GETCHARINCTEST(fc, Feptr);
3602 if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3603 RRETURN(MATCH_NOMATCH);
3604 }
3605 /* Control never gets here */
3606
3607 case PT_SCX:
3608 for (;;)
3609 {
3610 BOOL ok;
3611 const ucd_record *prop;
3612 RMATCH(Fecode, RM225);
3613 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3614 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3615 if (Feptr >= mb->end_subject)
3616 {
3617 SCHECK_PARTIAL();
3618 RRETURN(MATCH_NOMATCH);
3619 }
3620 GETCHARINCTEST(fc, Feptr);
3621 prop = GET_UCD(fc);
3622 ok = (prop->script == Lpropvalue
3623 || MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0);
3624 if (ok == (Lctype == OP_NOTPROP))
3625 RRETURN(MATCH_NOMATCH);
3626 }
3627 /* Control never gets here */
3628
3629 case PT_ALNUM:
3630 for (;;)
3631 {
3632 int category;
3633 RMATCH(Fecode, RM213);
3634 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3635 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3636 if (Feptr >= mb->end_subject)
3637 {
3638 SCHECK_PARTIAL();
3639 RRETURN(MATCH_NOMATCH);
3640 }
3641 GETCHARINCTEST(fc, Feptr);
3642 category = UCD_CATEGORY(fc);
3643 if ((category == ucp_L || category == ucp_N) == (Lctype == OP_NOTPROP))
3644 RRETURN(MATCH_NOMATCH);
3645 }
3646 /* Control never gets here */
3647
3648 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
3649 which means that Perl space and POSIX space are now identical. PCRE
3650 was changed at release 8.34. */
3651
3652 case PT_SPACE: /* Perl space */
3653 case PT_PXSPACE: /* POSIX space */
3654 for (;;)
3655 {
3656 RMATCH(Fecode, RM214);
3657 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3658 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3659 if (Feptr >= mb->end_subject)
3660 {
3661 SCHECK_PARTIAL();
3662 RRETURN(MATCH_NOMATCH);
3663 }
3664 GETCHARINCTEST(fc, Feptr);
3665 switch(fc)
3666 {
3667 HSPACE_CASES:
3668 VSPACE_CASES:
3669 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
3670 break;
3671
3672 default:
3673 if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP))
3674 RRETURN(MATCH_NOMATCH);
3675 break;
3676 }
3677 }
3678 /* Control never gets here */
3679
3680 case PT_WORD:
3681 for (;;)
3682 {
3683 int chartype, category;
3684 RMATCH(Fecode, RM215);
3685 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3686 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3687 if (Feptr >= mb->end_subject)
3688 {
3689 SCHECK_PARTIAL();
3690 RRETURN(MATCH_NOMATCH);
3691 }
3692 GETCHARINCTEST(fc, Feptr);
3693 chartype = UCD_CHARTYPE(fc);
3694 category = PRIV(ucp_gentype)[chartype];
3695 if ((category == ucp_L ||
3696 category == ucp_N ||
3697 chartype == ucp_Mn ||
3698 chartype == ucp_Pc) == (Lctype == OP_NOTPROP))
3699 RRETURN(MATCH_NOMATCH);
3700 }
3701 /* Control never gets here */
3702
/* Minimizing repeat of a caseless-set property. Each pass first tries the
rest of the pattern with RMATCH; only when that yields MATCH_NOMATCH, and
Lmin has not yet reached Lmax, is one more character read and tested against
the list that starts at PRIV(ucd_caseless_sets) + Lpropvalue. */

3703 case PT_CLIST:
3704 for (;;)
3705 {
3706 const uint32_t *cp;
3707 RMATCH(Fecode, RM216);
3708 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3709 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);   /* Lmin doubles as the running count */
3710 if (Feptr >= mb->end_subject)
3711 {
3712 SCHECK_PARTIAL();
3713 RRETURN(MATCH_NOMATCH);
3714 }
3715 GETCHARINCTEST(fc, Feptr);
3716 #if PCRE2_CODE_UNIT_WIDTH == 32
/* A value above MAX_UTF_CODE_POINT is never looked up in the list. It is
treated as a non-member: accepted for OP_NOTPROP ("continue" restarts the
outer loop), rejected otherwise. */
3717 if (fc > MAX_UTF_CODE_POINT)
3718 {
3719 if (Lctype == OP_NOTPROP) continue;
3720 RRETURN(MATCH_NOMATCH);
3721 }
3722 #endif
3723 cp = PRIV(ucd_caseless_sets) + Lpropvalue;
/* Scan the list. Meeting an entry greater than fc is taken to mean that fc
is not a member. NOTE(review): this relies on the entries being in ascending
order and on the list ending with a value larger than any character; confirm
against the table definition. Both "break" statements below leave only this
inner loop, which means the character has been accepted. */
3724 for (;;)
3725 {
3726 if (fc < *cp)
3727 {
3728 if (Lctype == OP_NOTPROP) break;   /* Non-member satisfies the negated test */
3729 RRETURN(MATCH_NOMATCH);
3730 }
3731 if (fc == *cp++)
3732 {
3733 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
3734 break;   /* Member satisfies the positive test */
3735 }
3736 }
3737 }
3738 /* Control never gets here */
3739
3740 case PT_UCNC:
3741 for (;;)
3742 {
3743 RMATCH(Fecode, RM217);
3744 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3745 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3746 if (Feptr >= mb->end_subject)
3747 {
3748 SCHECK_PARTIAL();
3749 RRETURN(MATCH_NOMATCH);
3750 }
3751 GETCHARINCTEST(fc, Feptr);
3752 if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
3753 fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
3754 fc >= 0xe000) == (Lctype == OP_NOTPROP))
3755 RRETURN(MATCH_NOMATCH);
3756 }
3757 /* Control never gets here */
3758
3759 case PT_BIDICL:
3760 for (;;)
3761 {
3762 RMATCH(Fecode, RM224);
3763 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3764 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3765 if (Feptr >= mb->end_subject)
3766 {
3767 SCHECK_PARTIAL();
3768 RRETURN(MATCH_NOMATCH);
3769 }
3770 GETCHARINCTEST(fc, Feptr);
3771 if ((UCD_BIDICLASS(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3772 RRETURN(MATCH_NOMATCH);
3773 }
3774 /* Control never gets here */
3775
3776 case PT_BOOL:
3777 for (;;)
3778 {
3779 BOOL ok;
3780 const ucd_record *prop;
3781 RMATCH(Fecode, RM223);
3782 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3783 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3784 if (Feptr >= mb->end_subject)
3785 {
3786 SCHECK_PARTIAL();
3787 RRETURN(MATCH_NOMATCH);
3788 }
3789 GETCHARINCTEST(fc, Feptr);
3790 prop = GET_UCD(fc);
3791 ok = MAPBIT(PRIV(ucd_boolprop_sets) +
3792 UCD_BPROPS_PROP(prop), Lpropvalue) != 0;
3793 if (ok == (Lctype == OP_NOTPROP))
3794 RRETURN(MATCH_NOMATCH);
3795 }
3796 /* Control never gets here */
3797
3798 /* This should never occur */
3799 default:
3800 return PCRE2_ERROR_INTERNAL;
3801 }
3802 }
3803
3804 /* Match extended Unicode grapheme clusters (OP_EXTUNI). We will get here only if the
3805 support is in the binary; otherwise a compile-time error occurs. */
3806
3807 else if (Lctype == OP_EXTUNI)
3808 {
3809 for (;;)
3810 {
3811 RMATCH(Fecode, RM218);
3812 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3813 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3814 if (Feptr >= mb->end_subject)
3815 {
3816 SCHECK_PARTIAL();
3817 RRETURN(MATCH_NOMATCH);
3818 }
3819 else
3820 {
3821 GETCHARINCTEST(fc, Feptr);
3822 Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject,
3823 utf, NULL);
3824 }
3825 CHECK_PARTIAL();
3826 }
3827 }
3828 else
3829 #endif /* SUPPORT_UNICODE */
3830
3831 /* UTF mode for non-property testing character types. */
3832
3833 #ifdef SUPPORT_UNICODE
3834 if (utf)
3835 {
3836 for (;;)
3837 {
3838 RMATCH(Fecode, RM219);
3839 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3840 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3841 if (Feptr >= mb->end_subject)
3842 {
3843 SCHECK_PARTIAL();
3844 RRETURN(MATCH_NOMATCH);
3845 }
3846 if (Lctype == OP_ANY && IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
3847 GETCHARINC(fc, Feptr);
3848 switch(Lctype)
3849 {
3850 case OP_ANY: /* This is the non-NL case */
3851 if (mb->partial != 0 && /* Take care with CRLF partial */
3852 Feptr >= mb->end_subject &&
3853 NLBLOCK->nltype == NLTYPE_FIXED &&
3854 NLBLOCK->nllen == 2 &&
3855 fc == NLBLOCK->nl[0])
3856 {
3857 mb->hitend = TRUE;
3858 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
3859 }
3860 break;
3861
3862 case OP_ALLANY:
3863 case OP_ANYBYTE:
3864 break;
3865
3866 case OP_ANYNL:
3867 switch(fc)
3868 {
3869 default: RRETURN(MATCH_NOMATCH);
3870
3871 case CHAR_CR:
3872 if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++;
3873 break;
3874
3875 case CHAR_LF:
3876 break;
3877
3878 case CHAR_VT:
3879 case CHAR_FF:
3880 case CHAR_NEL:
3881 #ifndef EBCDIC
3882 case 0x2028:
3883 case 0x2029:
3884 #endif /* Not EBCDIC */
3885 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF)
3886 RRETURN(MATCH_NOMATCH);
3887 break;
3888 }
3889 break;
3890
3891 case OP_NOT_HSPACE:
3892 switch(fc)
3893 {
3894 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
3895 default: break;
3896 }
3897 break;
3898
3899 case OP_HSPACE:
3900 switch(fc)
3901 {
3902 HSPACE_CASES: break;
3903 default: RRETURN(MATCH_NOMATCH);
3904 }
3905 break;
3906
3907 case OP_NOT_VSPACE:
3908 switch(fc)
3909 {
3910 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
3911 default: break;
3912 }
3913 break;
3914
3915 case OP_VSPACE:
3916 switch(fc)
3917 {
3918 VSPACE_CASES: break;
3919 default: RRETURN(MATCH_NOMATCH);
3920 }
3921 break;
3922
3923 case OP_NOT_DIGIT:
3924 if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0)
3925 RRETURN(MATCH_NOMATCH);
3926 break;
3927
3928 case OP_DIGIT:
3929 if (fc >= 256 || (mb->ctypes[fc] & ctype_digit) == 0)
3930 RRETURN(MATCH_NOMATCH);
3931 break;
3932
3933 case OP_NOT_WHITESPACE:
3934 if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0)
3935 RRETURN(MATCH_NOMATCH);
3936 break;
3937
3938 case OP_WHITESPACE:
3939 if (fc >= 256 || (mb->ctypes[fc] & ctype_space) == 0)
3940 RRETURN(MATCH_NOMATCH);
3941 break;
3942
3943 case OP_NOT_WORDCHAR:
3944 if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0)
3945 RRETURN(MATCH_NOMATCH);
3946 break;
3947
3948 case OP_WORDCHAR:
3949 if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0)
3950 RRETURN(MATCH_NOMATCH);
3951 break;
3952
3953 default:
3954 return PCRE2_ERROR_INTERNAL;
3955 }
3956 }
3957 }
3958 else
3959 #endif /* SUPPORT_UNICODE */
3960
3961 /* Not UTF mode */
3962 {
3963 for (;;)
3964 {
3965 RMATCH(Fecode, RM33);
3966 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3967 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3968 if (Feptr >= mb->end_subject)
3969 {
3970 SCHECK_PARTIAL();
3971 RRETURN(MATCH_NOMATCH);
3972 }
3973 if (Lctype == OP_ANY && IS_NEWLINE(Feptr))
3974 RRETURN(MATCH_NOMATCH);
3975 fc = *Feptr++;
3976 switch(Lctype)
3977 {
3978 case OP_ANY: /* This is the non-NL case */
3979 if (mb->partial != 0 && /* Take care with CRLF partial */
3980 Feptr >= mb->end_subject &&
3981 NLBLOCK->nltype == NLTYPE_FIXED &&
3982 NLBLOCK->nllen == 2 &&
3983 fc == NLBLOCK->nl[0])
3984 {
3985 mb->hitend = TRUE;
3986 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
3987 }
3988 break;
3989
3990 case OP_ALLANY:
3991 case OP_ANYBYTE:
3992 break;
3993
3994 case OP_ANYNL:
3995 switch(fc)
3996 {
3997 default: RRETURN(MATCH_NOMATCH);
3998
3999 case CHAR_CR:
4000 if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++;
4001 break;
4002
4003 case CHAR_LF:
4004 break;
4005
4006 case CHAR_VT:
4007 case CHAR_FF:
4008 case CHAR_NEL:
4009 #if PCRE2_CODE_UNIT_WIDTH != 8
4010 case 0x2028:
4011 case 0x2029:
4012 #endif
4013 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF)
4014 RRETURN(MATCH_NOMATCH);
4015 break;
4016 }
4017 break;
4018
4019 case OP_NOT_HSPACE:
4020 switch(fc)
4021 {
4022 default: break;
4023 HSPACE_BYTE_CASES:
4024 #if PCRE2_CODE_UNIT_WIDTH != 8
4025 HSPACE_MULTIBYTE_CASES:
4026 #endif
4027 RRETURN(MATCH_NOMATCH);
4028 }
4029 break;
4030
4031 case OP_HSPACE:
4032 switch(fc)
4033 {
4034 default: RRETURN(MATCH_NOMATCH);
4035 HSPACE_BYTE_CASES:
4036 #if PCRE2_CODE_UNIT_WIDTH != 8
4037 HSPACE_MULTIBYTE_CASES:
4038 #endif
4039 break;
4040 }
4041 break;
4042
4043 case OP_NOT_VSPACE:
4044 switch(fc)
4045 {
4046 default: break;
4047 VSPACE_BYTE_CASES:
4048 #if PCRE2_CODE_UNIT_WIDTH != 8
4049 VSPACE_MULTIBYTE_CASES:
4050 #endif
4051 RRETURN(MATCH_NOMATCH);
4052 }
4053 break;
4054
4055 case OP_VSPACE:
4056 switch(fc)
4057 {
4058 default: RRETURN(MATCH_NOMATCH);
4059 VSPACE_BYTE_CASES:
4060 #if PCRE2_CODE_UNIT_WIDTH != 8
4061 VSPACE_MULTIBYTE_CASES:
4062 #endif
4063 break;
4064 }
4065 break;
4066
4067 case OP_NOT_DIGIT:
4068 if (MAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0)
4069 RRETURN(MATCH_NOMATCH);
4070 break;
4071
4072 case OP_DIGIT:
4073 if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0)
4074 RRETURN(MATCH_NOMATCH);
4075 break;
4076
4077 case OP_NOT_WHITESPACE:
4078 if (MAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0)
4079 RRETURN(MATCH_NOMATCH);
4080 break;
4081
4082 case OP_WHITESPACE:
4083 if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0)
4084 RRETURN(MATCH_NOMATCH);
4085 break;
4086
4087 case OP_NOT_WORDCHAR:
4088 if (MAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0)
4089 RRETURN(MATCH_NOMATCH);
4090 break;
4091
4092 case OP_WORDCHAR:
4093 if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0)
4094 RRETURN(MATCH_NOMATCH);
4095 break;
4096
4097 default:
4098 return PCRE2_ERROR_INTERNAL;
4099 }
4100 }
4101 }
4102 /* Control never gets here */
4103 }
4104
4105 /* If maximizing, it is worth using inline code for speed, doing the type
4106 test once at the start (i.e. keep it out of the loops). Once again,
4107 "notmatch" can be an ordinary local variable because the loops do not call
4108 RMATCH. */
4109
4110 else
4111 {
4112 Lstart_eptr = Feptr; /* Remember where we started */
4113
4114 #ifdef SUPPORT_UNICODE
4115 if (proptype >= 0)
4116 {
4117 BOOL notmatch = Lctype == OP_NOTPROP;
4118 switch(proptype)
4119 {
4120 case PT_ANY:
4121 for (i = Lmin; i < Lmax; i++)
4122 {
4123 int len = 1;
4124 if (Feptr >= mb->end_subject)
4125 {
4126 SCHECK_PARTIAL();
4127 break;
4128 }
4129 GETCHARLENTEST(fc, Feptr, len);
4130 if (notmatch) break;
4131 Feptr+= len;
4132 }
4133 break;
4134
4135 case PT_LAMP:
4136 for (i = Lmin; i < Lmax; i++)
4137 {
4138 int chartype;
4139 int len = 1;
4140 if (Feptr >= mb->end_subject)
4141 {
4142 SCHECK_PARTIAL();
4143 break;
4144 }
4145 GETCHARLENTEST(fc, Feptr, len);
4146 chartype = UCD_CHARTYPE(fc);
4147 if ((chartype == ucp_Lu ||
4148 chartype == ucp_Ll ||
4149 chartype == ucp_Lt) == notmatch)
4150 break;
4151 Feptr+= len;
4152 }
4153 break;
4154
4155 case PT_GC:
4156 for (i = Lmin; i < Lmax; i++)
4157 {
4158 int len = 1;
4159 if (Feptr >= mb->end_subject)
4160 {
4161 SCHECK_PARTIAL();
4162 break;
4163 }
4164 GETCHARLENTEST(fc, Feptr, len);
4165 if ((UCD_CATEGORY(fc) == Lpropvalue) == notmatch) break;
4166 Feptr+= len;
4167 }
4168 break;
4169
4170 case PT_PC:
4171 for (i = Lmin; i < Lmax; i++)
4172 {
4173 int len = 1;
4174 if (Feptr >= mb->end_subject)
4175 {
4176 SCHECK_PARTIAL();
4177 break;
4178 }
4179 GETCHARLENTEST(fc, Feptr, len);
4180 if ((UCD_CHARTYPE(fc) == Lpropvalue) == notmatch) break;
4181 Feptr+= len;
4182 }
4183 break;
4184
4185 case PT_SC:
4186 for (i = Lmin; i < Lmax; i++)
4187 {
4188 int len = 1;
4189 if (Feptr >= mb->end_subject)
4190 {
4191 SCHECK_PARTIAL();
4192 break;
4193 }
4194 GETCHARLENTEST(fc, Feptr, len);
4195 if ((UCD_SCRIPT(fc) == Lpropvalue) == notmatch) break;
4196 Feptr+= len;
4197 }
4198 break;
4199
4200 case PT_SCX:
4201 for (i = Lmin; i < Lmax; i++)
4202 {
4203 BOOL ok;
4204 const ucd_record *prop;
4205 int len = 1;
4206 if (Feptr >= mb->end_subject)
4207 {
4208 SCHECK_PARTIAL();
4209 break;
4210 }
4211 GETCHARLENTEST(fc, Feptr, len);
4212 prop = GET_UCD(fc);
4213 ok = (prop->script == Lpropvalue ||
4214 MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0);
4215 if (ok == notmatch) break;
4216 Feptr+= len;
4217 }
4218 break;
4219
4220 case PT_ALNUM:
4221 for (i = Lmin; i < Lmax; i++)
4222 {
4223 int category;
4224 int len = 1;
4225 if (Feptr >= mb->end_subject)
4226 {
4227 SCHECK_PARTIAL();
4228 break;
4229 }
4230 GETCHARLENTEST(fc, Feptr, len);
4231 category = UCD_CATEGORY(fc);
4232 if ((category == ucp_L || category == ucp_N) == notmatch)
4233 break;
4234 Feptr+= len;
4235 }
4236 break;
4237
4238 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
4239 which means that Perl space and POSIX space are now identical. PCRE
4240 was changed at release 8.34. */
4241
4242 case PT_SPACE: /* Perl space */
4243 case PT_PXSPACE: /* POSIX space */
4244 for (i = Lmin; i < Lmax; i++)
4245 {
4246 int len = 1;
4247 if (Feptr >= mb->end_subject)
4248 {
4249 SCHECK_PARTIAL();
4250 break;
4251 }
4252 GETCHARLENTEST(fc, Feptr, len);
4253 switch(fc)
4254 {
4255 HSPACE_CASES:
4256 VSPACE_CASES:
4257 if (notmatch) goto ENDLOOP99; /* Break the loop */
4258 break;
4259
4260 default:
4261 if ((UCD_CATEGORY(fc) == ucp_Z) == notmatch)
4262 goto ENDLOOP99; /* Break the loop */
4263 break;
4264 }
4265 Feptr+= len;
4266 }
4267 ENDLOOP99:
4268 break;
4269
4270 case PT_WORD:
4271 for (i = Lmin; i < Lmax; i++)
4272 {
4273 int chartype, category;
4274 int len = 1;
4275 if (Feptr >= mb->end_subject)
4276 {
4277 SCHECK_PARTIAL();
4278 break;
4279 }
4280 GETCHARLENTEST(fc, Feptr, len);
4281 chartype = UCD_CHARTYPE(fc);
4282 category = PRIV(ucp_gentype)[chartype];
4283 if ((category == ucp_L ||
4284 category == ucp_N ||
4285 chartype == ucp_Mn ||
4286 chartype == ucp_Pc) == notmatch)
4287 break;
4288 Feptr+= len;
4289 }
4290 break;
4291
/* Maximizing repeat of a caseless-set property. Up to Lmax - Lmin further
characters are taken. Each character is read with GETCHARLENTEST and Feptr is
advanced by len only after the character has been accepted. "notmatch" is
true when Lctype is OP_NOTPROP. */

4292 case PT_CLIST:
4293 for (i = Lmin; i < Lmax; i++)
4294 {
4295 const uint32_t *cp;
4296 int len = 1;
4297 if (Feptr >= mb->end_subject)
4298 {
4299 SCHECK_PARTIAL();
4300 break;
4301 }
4302 GETCHARLENTEST(fc, Feptr, len);
4303 #if PCRE2_CODE_UNIT_WIDTH == 32
/* A value above MAX_UTF_CODE_POINT is not looked up in the list; it counts
as a non-member, so it ends the run for the positive test and is accepted
for the negated one. */
4304 if (fc > MAX_UTF_CODE_POINT)
4305 {
4306 if (!notmatch) goto GOT_MAX;
4307 }
4308 else
4309 #endif
4310 {
4311 cp = PRIV(ucd_caseless_sets) + Lpropvalue;
/* In the scan below, "break" leaves only the inner loop (character
accepted), whereas "goto GOT_MAX" ends the whole run. NOTE(review): the
"fc < *cp" exit relies on the list being in ascending order with a
terminating value larger than any character; confirm against the table
definition. */
4312 for (;;)
4313 {
4314 if (fc < *cp)
4315 { if (notmatch) break; else goto GOT_MAX; }
4316 if (fc == *cp++)
4317 { if (notmatch) goto GOT_MAX; else break; }
4318 }
4319 }
4320
4321 Feptr += len;
4322 }
4323 GOT_MAX:
4324 break;
4325
4326 case PT_UCNC:
4327 for (i = Lmin; i < Lmax; i++)
4328 {
4329 int len = 1;
4330 if (Feptr >= mb->end_subject)
4331 {
4332 SCHECK_PARTIAL();
4333 break;
4334 }
4335 GETCHARLENTEST(fc, Feptr, len);
4336 if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
4337 fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
4338 fc >= 0xe000) == notmatch)
4339 break;
4340 Feptr += len;
4341 }
4342 break;
4343
4344 case PT_BIDICL:
4345 for (i = Lmin; i < Lmax; i++)
4346 {
4347 int len = 1;
4348 if (Feptr >= mb->end_subject)
4349 {
4350 SCHECK_PARTIAL();
4351 break;
4352 }
4353 GETCHARLENTEST(fc, Feptr, len);
4354 if ((UCD_BIDICLASS(fc) == Lpropvalue) == notmatch) break;
4355 Feptr+= len;
4356 }
4357 break;
4358
4359 case PT_BOOL:
4360 for (i = Lmin; i < Lmax; i++)
4361 {
4362 BOOL ok;
4363 const ucd_record *prop;
4364 int len = 1;
4365 if (Feptr >= mb->end_subject)
4366 {
4367 SCHECK_PARTIAL();
4368 break;
4369 }
4370 GETCHARLENTEST(fc, Feptr, len);
4371 prop = GET_UCD(fc);
4372 ok = MAPBIT(PRIV(ucd_boolprop_sets) +
4373 UCD_BPROPS_PROP(prop), Lpropvalue) != 0;
4374 if (ok == notmatch) break;
4375 Feptr+= len;
4376 }
4377 break;
4378
4379 default:
4380 return PCRE2_ERROR_INTERNAL;
4381 }
4382
4383 /* Feptr is now past the end of the maximum run */
4384
4385 if (reptype == REPTYPE_POS) continue; /* No backtracking */
4386
4387 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
4388 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
4389 go too far. */
4390
4391 for(;;)
4392 {
4393 if (Feptr <= Lstart_eptr) break;
4394 RMATCH(Fecode, RM222);
4395 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4396 Feptr--;
4397 if (utf) BACKCHAR(Feptr);
4398 }
4399 }
4400
4401 /* Match extended Unicode grapheme clusters. We will get here only if the
4402 support is in the binary; otherwise a compile-time error occurs. */
4403
4404 else if (Lctype == OP_EXTUNI)
4405 {
4406 for (i = Lmin; i < Lmax; i++)
4407 {
4408 if (Feptr >= mb->end_subject)
4409 {
4410 SCHECK_PARTIAL();
4411 break;
4412 }
4413 else
4414 {
4415 GETCHARINCTEST(fc, Feptr);
4416 Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject,
4417 utf, NULL);
4418 }
4419 CHECK_PARTIAL();
4420 }
4421
4422 /* Feptr is now past the end of the maximum run */
4423
4424 if (reptype == REPTYPE_POS) continue; /* No backtracking */
4425
4426 /* We use <= Lstart_eptr rather than == Lstart_eptr to detect the start
4427 of the run while backtracking because the use of \C in UTF mode can
4428 cause BACKCHAR to move back past Lstart_eptr. This is just palliative;
4429 the use of \C in UTF mode is fraught with danger. */
4430
4431 for(;;)
4432 {
4433 int lgb, rgb;
4434 PCRE2_SPTR fptr;
4435
4436 if (Feptr <= Lstart_eptr) break; /* At start of char run */
4437 RMATCH(Fecode, RM220);
4438 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4439
4440 /* Backtracking over an extended grapheme cluster involves inspecting
4441 the previous two characters (if present) to see if a break is
4442 permitted between them. */
4443
4444 Feptr--;
4445 if (!utf) fc = *Feptr; else
4446 {
4447 BACKCHAR(Feptr);
4448 GETCHAR(fc, Feptr);
4449 }
4450 rgb = UCD_GRAPHBREAK(fc);
4451
4452 for (;;)
4453 {
4454 if (Feptr <= Lstart_eptr) break; /* At start of char run */
4455 fptr = Feptr - 1;
4456 if (!utf) fc = *fptr; else
4457 {
4458 BACKCHAR(fptr);
4459 GETCHAR(fc, fptr);
4460 }
4461 lgb = UCD_GRAPHBREAK(fc);
4462 if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
4463 Feptr = fptr;
4464 rgb = lgb;
4465 }
4466 }
4467 }
4468
4469 else
4470 #endif /* SUPPORT_UNICODE */
4471
4472 #ifdef SUPPORT_UNICODE
4473 if (utf)
4474 {
4475 switch(Lctype)
4476 {
4477 case OP_ANY:
4478 for (i = Lmin; i < Lmax; i++)
4479 {
4480 if (Feptr >= mb->end_subject)
4481 {
4482 SCHECK_PARTIAL();
4483 break;
4484 }
4485 if (IS_NEWLINE(Feptr)) break;
4486 if (mb->partial != 0 && /* Take care with CRLF partial */
4487 Feptr + 1 >= mb->end_subject &&
4488 NLBLOCK->nltype == NLTYPE_FIXED &&
4489 NLBLOCK->nllen == 2 &&
4490 UCHAR21(Feptr) == NLBLOCK->nl[0])
4491 {
4492 mb->hitend = TRUE;
4493 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
4494 }
4495 Feptr++;
4496 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
4497 }
4498 break;
4499
4500 case OP_ALLANY:
4501 if (Lmax < UINT32_MAX)
4502 {
4503 for (i = Lmin; i < Lmax; i++)
4504 {
4505 if (Feptr >= mb->end_subject)
4506 {
4507 SCHECK_PARTIAL();
4508 break;
4509 }
4510 Feptr++;
4511 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
4512 }
4513 }
4514 else
4515 {
4516 Feptr = mb->end_subject; /* Unlimited UTF-8 repeat */
4517 SCHECK_PARTIAL();
4518 }
4519 break;
4520
4521 /* The "byte" (i.e. "code unit") case is the same as non-UTF */
4522
/* No per-character loop is needed. fc is reused here as a count: the number
of further code units the repeat may take (Lmax - Lmin). If that is more than
remain before end_subject, Feptr is moved to end_subject and SCHECK_PARTIAL()
is invoked; otherwise Feptr advances by exactly that many code units. */
4523 case OP_ANYBYTE:
4524 fc = Lmax - Lmin;
4525 if (fc > (uint32_t)(mb->end_subject - Feptr))
4526 {
4527 Feptr = mb->end_subject;
4528 SCHECK_PARTIAL();
4529 }
4530 else Feptr += fc;
4531 break;
4532
/* Maximizing run of newline sequences in UTF mode. The loop takes up to
Lmax - Lmin further items and stops early at the end of the subject or at the
first character that does not qualify. */
4533 case OP_ANYNL:
4534 for (i = Lmin; i < Lmax; i++)
4535 {
4536 int len = 1;
4537 if (Feptr >= mb->end_subject)
4538 {
4539 SCHECK_PARTIAL();
4540 break;
4541 }
4542 GETCHARLEN(fc, Feptr, len);
4543 if (fc == CHAR_CR)
4544 {
/* CR is always taken. If it is the last code unit of the subject the run
ends here; otherwise an immediately following LF is taken as part of the
same item. */
4545 if (++Feptr >= mb->end_subject) break;
4546 if (UCHAR21(Feptr) == CHAR_LF) Feptr++;
4547 }
4548 else
4549 {
/* LF always qualifies. VT, FF and NEL (plus 0x2028 and 0x2029 when EBCDIC is
not defined) qualify only when bsr_convention is not PCRE2_BSR_ANYCRLF.
Anything else ends the run without being consumed. */
4550 if (fc != CHAR_LF &&
4551 (mb->bsr_convention == PCRE2_BSR_ANYCRLF ||
4552 (fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL
4553 #ifndef EBCDIC
4554 && fc != 0x2028 && fc != 0x2029
4555 #endif /* Not EBCDIC */
4556 )))
4557 break;
4558 Feptr += len;
4559 }
4560 }
4561 break;
4562
4563 case OP_NOT_HSPACE:
4564 case OP_HSPACE:
4565 for (i = Lmin; i < Lmax; i++)
4566 {
4567 BOOL gotspace;
4568 int len = 1;
4569 if (Feptr >= mb->end_subject)
4570 {
4571 SCHECK_PARTIAL();
4572 break;
4573 }
4574 GETCHARLEN(fc, Feptr, len);
4575 switch(fc)
4576 {
4577 HSPACE_CASES: gotspace = TRUE; break;
4578 default: gotspace = FALSE; break;
4579 }
4580 if (gotspace == (Lctype == OP_NOT_HSPACE)) break;
4581 Feptr += len;
4582 }
4583 break;
4584
4585 case OP_NOT_VSPACE:
4586 case OP_VSPACE:
4587 for (i = Lmin; i < Lmax; i++)
4588 {
4589 BOOL gotspace;
4590 int len = 1;
4591 if (Feptr >= mb->end_subject)
4592 {
4593 SCHECK_PARTIAL();
4594 break;
4595 }
4596 GETCHARLEN(fc, Feptr, len);
4597 switch(fc)
4598 {
4599 VSPACE_CASES: gotspace = TRUE; break;
4600 default: gotspace = FALSE; break;
4601 }
4602 if (gotspace == (Lctype == OP_NOT_VSPACE)) break;
4603 Feptr += len;
4604 }
4605 break;
4606
4607 case OP_NOT_DIGIT:
4608 for (i = Lmin; i < Lmax; i++)
4609 {
4610 int len = 1;
4611 if (Feptr >= mb->end_subject)
4612 {
4613 SCHECK_PARTIAL();
4614 break;
4615 }
4616 GETCHARLEN(fc, Feptr, len);
4617 if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0) break;
4618 Feptr+= len;
4619 }
4620 break;
4621
4622 case OP_DIGIT:
4623 for (i = Lmin; i < Lmax; i++)
4624 {
4625 int len = 1;
4626 if (Feptr >= mb->end_subject)
4627 {
4628 SCHECK_PARTIAL();
4629 break;
4630 }
4631 GETCHARLEN(fc, Feptr, len);
4632 if (fc >= 256 ||(mb->ctypes[fc] & ctype_digit) == 0) break;
4633 Feptr+= len;
4634 }
4635 break;
4636
4637 case OP_NOT_WHITESPACE:
4638 for (i = Lmin; i < Lmax; i++)
4639 {
4640 int len = 1;
4641 if (Feptr >= mb->end_subject)
4642 {
4643 SCHECK_PARTIAL();
4644 break;
4645 }
4646 GETCHARLEN(fc, Feptr, len);
4647 if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0) break;
4648 Feptr+= len;
4649 }
4650 break;
4651
4652 case OP_WHITESPACE:
4653 for (i = Lmin; i < Lmax; i++)
4654 {
4655 int len = 1;
4656 if (Feptr >= mb->end_subject)
4657 {
4658 SCHECK_PARTIAL();
4659 break;
4660 }
4661 GETCHARLEN(fc, Feptr, len);
4662 if (fc >= 256 ||(mb->ctypes[fc] & ctype_space) == 0) break;
4663 Feptr+= len;
4664 }
4665 break;
4666
4667 case OP_NOT_WORDCHAR:
4668 for (i = Lmin; i < Lmax; i++)
4669 {
4670 int len = 1;
4671 if (Feptr >= mb->end_subject)
4672 {
4673 SCHECK_PARTIAL();
4674 break;
4675 }
4676 GETCHARLEN(fc, Feptr, len);
4677 if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0) break;
4678 Feptr+= len;
4679 }
4680 break;
4681
4682 case OP_WORDCHAR:
4683 for (i = Lmin; i < Lmax; i++)
4684 {
4685 int len = 1;
4686 if (Feptr >= mb->end_subject)
4687 {
4688 SCHECK_PARTIAL();
4689 break;
4690 }
4691 GETCHARLEN(fc, Feptr, len);
4692 if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0) break;
4693 Feptr+= len;
4694 }
4695 break;
4696
4697 default:
4698 return PCRE2_ERROR_INTERNAL;
4699 }
4700
4701 if (reptype == REPTYPE_POS) continue; /* No backtracking */
4702
4703 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
4704 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't go
4705 too far. */
4706
4707 for(;;)
4708 {
4709 if (Feptr <= Lstart_eptr) break;
4710 RMATCH(Fecode, RM221);
4711 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4712 Feptr--;
4713 BACKCHAR(Feptr);
4714 if (Lctype == OP_ANYNL && Feptr > Lstart_eptr &&
4715 UCHAR21(Feptr) == CHAR_NL && UCHAR21(Feptr - 1) == CHAR_CR)
4716 Feptr--;
4717 }
4718 }
4719 else
4720 #endif /* SUPPORT_UNICODE */
4721
4722 /* Not UTF mode */
4723 {
4724 switch(Lctype)
4725 {
4726 case OP_ANY:
4727 for (i = Lmin; i < Lmax; i++)
4728 {
4729 if (Feptr >= mb->end_subject)
4730 {
4731 SCHECK_PARTIAL();
4732 break;
4733 }
4734 if (IS_NEWLINE(Feptr)) break;
4735 if (mb->partial != 0 && /* Take care with CRLF partial */
4736 Feptr + 1 >= mb->end_subject &&
4737 NLBLOCK->nltype == NLTYPE_FIXED &&
4738 NLBLOCK->nllen == 2 &&
4739 *Feptr == NLBLOCK->nl[0])
4740 {
4741 mb->hitend = TRUE;
4742 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
4743 }
4744 Feptr++;
4745 }
4746 break;
4747
4748 case OP_ALLANY:
4749 case OP_ANYBYTE:
4750 fc = Lmax - Lmin;
4751 if (fc > (uint32_t)(mb->end_subject - Feptr))
4752 {
4753 Feptr = mb->end_subject;
4754 SCHECK_PARTIAL();
4755 }
4756 else Feptr += fc;
4757 break;
4758
4759 case OP_ANYNL:
4760 for (i = Lmin; i < Lmax; i++)
4761 {
4762 if (Feptr >= mb->end_subject)
4763 {
4764 SCHECK_PARTIAL();
4765 break;
4766 }
4767 fc = *Feptr;
4768 if (fc == CHAR_CR)
4769 {
4770 if (++Feptr >= mb->end_subject) break;
4771 if (*Feptr == CHAR_LF) Feptr++;
4772 }
4773 else
4774 {
4775 if (fc != CHAR_LF && (mb->bsr_convention == PCRE2_BSR_ANYCRLF ||
4776 (fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL
4777 #if PCRE2_CODE_UNIT_WIDTH != 8
4778 && fc != 0x2028 && fc != 0x2029
4779 #endif
4780 ))) break;
4781 Feptr++;
4782 }
4783 }
4784 break;
4785
4786 case OP_NOT_HSPACE:
4787 for (i = Lmin; i < Lmax; i++)
4788 {
4789 if (Feptr >= mb->end_subject)
4790 {
4791 SCHECK_PARTIAL();
4792 break;
4793 }
4794 switch(*Feptr)
4795 {
4796 default: Feptr++; break;
4797 HSPACE_BYTE_CASES:
4798 #if PCRE2_CODE_UNIT_WIDTH != 8
4799 HSPACE_MULTIBYTE_CASES:
4800 #endif
4801 goto ENDLOOP00;
4802 }
4803 }
4804 ENDLOOP00:
4805 break;
4806
4807 case OP_HSPACE:
4808 for (i = Lmin; i < Lmax; i++)
4809 {
4810 if (Feptr >= mb->end_subject)
4811 {
4812 SCHECK_PARTIAL();
4813 break;
4814 }
4815 switch(*Feptr)
4816 {
4817 default: goto ENDLOOP01;
4818 HSPACE_BYTE_CASES:
4819 #if PCRE2_CODE_UNIT_WIDTH != 8
4820 HSPACE_MULTIBYTE_CASES:
4821 #endif
4822 Feptr++; break;
4823 }
4824 }
4825 ENDLOOP01:
4826 break;
4827
4828 case OP_NOT_VSPACE:
4829 for (i = Lmin; i < Lmax; i++)
4830 {
4831 if (Feptr >= mb->end_subject)
4832 {
4833 SCHECK_PARTIAL();
4834 break;
4835 }
4836 switch(*Feptr)
4837 {
4838 default: Feptr++; break;
4839 VSPACE_BYTE_CASES:
4840 #if PCRE2_CODE_UNIT_WIDTH != 8
4841 VSPACE_MULTIBYTE_CASES:
4842 #endif
4843 goto ENDLOOP02;
4844 }
4845 }
4846 ENDLOOP02:
4847 break;
4848
4849 case OP_VSPACE:
4850 for (i = Lmin; i < Lmax; i++)
4851 {
4852 if (Feptr >= mb->end_subject)
4853 {
4854 SCHECK_PARTIAL();
4855 break;
4856 }
4857 switch(*Feptr)
4858 {
4859 default: goto ENDLOOP03;
4860 VSPACE_BYTE_CASES:
4861 #if PCRE2_CODE_UNIT_WIDTH != 8
4862 VSPACE_MULTIBYTE_CASES:
4863 #endif
4864 Feptr++; break;
4865 }
4866 }
4867 ENDLOOP03:
4868 break;
4869
4870 case OP_NOT_DIGIT:
4871 for (i = Lmin; i < Lmax; i++)
4872 {
4873 if (Feptr >= mb->end_subject)
4874 {
4875 SCHECK_PARTIAL();
4876 break;
4877 }
4878 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0)
4879 break;
4880 Feptr++;
4881 }
4882 break;
4883
4884 case OP_DIGIT:
4885 for (i = Lmin; i < Lmax; i++)
4886 {
4887 if (Feptr >= mb->end_subject)
4888 {
4889 SCHECK_PARTIAL();
4890 break;
4891 }
4892 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0)
4893 break;
4894 Feptr++;
4895 }
4896 break;
4897
4898 case OP_NOT_WHITESPACE:
4899 for (i = Lmin; i < Lmax; i++)
4900 {
4901 if (Feptr >= mb->end_subject)
4902 {
4903 SCHECK_PARTIAL();
4904 break;
4905 }
4906 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0)
4907 break;
4908 Feptr++;
4909 }
4910 break;
4911
4912 case OP_WHITESPACE:
4913 for (i = Lmin; i < Lmax; i++)
4914 {
4915 if (Feptr >= mb->end_subject)
4916 {
4917 SCHECK_PARTIAL();
4918 break;
4919 }
4920 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0)
4921 break;
4922 Feptr++;
4923 }
4924 break;
4925
4926 case OP_NOT_WORDCHAR:
4927 for (i = Lmin; i < Lmax; i++)
4928 {
4929 if (Feptr >= mb->end_subject)
4930 {
4931 SCHECK_PARTIAL();
4932 break;
4933 }
4934 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0)
4935 break;
4936 Feptr++;
4937 }
4938 break;
4939
4940 case OP_WORDCHAR:
4941 for (i = Lmin; i < Lmax; i++)
4942 {
4943 if (Feptr >= mb->end_subject)
4944 {
4945 SCHECK_PARTIAL();
4946 break;
4947 }
4948 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0)
4949 break;
4950 Feptr++;
4951 }
4952 break;
4953
4954 default:
4955 return PCRE2_ERROR_INTERNAL;
4956 }
4957
4958 if (reptype == REPTYPE_POS) continue; /* No backtracking */
4959
4960 for (;;)
4961 {
4962 if (Feptr == Lstart_eptr) break;
4963 RMATCH(Fecode, RM34);
4964 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4965 Feptr--;
4966 if (Lctype == OP_ANYNL && Feptr > Lstart_eptr && *Feptr == CHAR_LF &&
4967 Feptr[-1] == CHAR_CR) Feptr--;
4968 }
4969 }
4970 }
4971 break; /* End of repeat character type processing */
4972
4973 #undef Lstart_eptr
4974 #undef Lmin
4975 #undef Lmax
4976 #undef Lctype
4977 #undef Lpropvalue
4978
4979
4980 /* ===================================================================== */
4981 /* Match a back reference, possibly repeatedly. Look past the end of the
4982 item to see if there is repeat information following. The OP_REF and
4983 OP_REFI opcodes are used for a reference to a numbered group or to a
4984 non-duplicated named group. For a duplicated named group, OP_DNREF and
4985 OP_DNREFI are used. In this case we must scan the list of groups to which
4986 the name refers, and use the first one that is set. */
4987
4988 #define Lmin F->temp_32[0]
4989 #define Lmax F->temp_32[1]
4990 #define Lcaseless F->temp_32[2]
4991 #define Lstart F->temp_sptr[0]
4992 #define Loffset F->temp_size
4993
4994 case OP_DNREF:
4995 case OP_DNREFI:
4996 Lcaseless = (Fop == OP_DNREFI);
4997 {
4998 int count = GET2(Fecode, 1+IMM2_SIZE);
4999 PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;
5000 Fecode += 1 + 2*IMM2_SIZE;
5001
5002 while (count-- > 0)
5003 {
5004 Loffset = (GET2(slot, 0) << 1) - 2;
5005 if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET) break;
5006 slot += mb->name_entry_size;
5007 }
5008 }
5009 goto REF_REPEAT;
5010
5011 case OP_REF:
5012 case OP_REFI:
5013 Lcaseless = (Fop == OP_REFI);
5014 Loffset = (GET2(Fecode, 1) << 1) - 2;
5015 Fecode += 1 + IMM2_SIZE;
5016
5017 /* Set up for repetition, or handle the non-repeated case. The maximum and
5018 minimum must be in the heap frame, but as they are short-term values, we
5019 use temporary fields. */
5020
5021 REF_REPEAT:
5022 switch (*Fecode)
5023 {
5024 case OP_CRSTAR:
5025 case OP_CRMINSTAR:
5026 case OP_CRPLUS:
5027 case OP_CRMINPLUS:
5028 case OP_CRQUERY:
5029 case OP_CRMINQUERY:
5030 fc = *Fecode++ - OP_CRSTAR;
5031 Lmin = rep_min[fc];
5032 Lmax = rep_max[fc];
5033 reptype = rep_typ[fc];
5034 break;
5035
5036 case OP_CRRANGE:
5037 case OP_CRMINRANGE:
5038 Lmin = GET2(Fecode, 1);
5039 Lmax = GET2(Fecode, 1 + IMM2_SIZE);
5040 reptype = rep_typ[*Fecode - OP_CRSTAR];
5041 if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */
5042 Fecode += 1 + 2 * IMM2_SIZE;
5043 break;
5044
5045 default: /* No repeat follows */
5046 {
5047 rrc = match_ref(Loffset, Lcaseless, F, mb, &length);
5048 if (rrc != 0)
5049 {
5050 if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
5051 CHECK_PARTIAL();
5052 RRETURN(MATCH_NOMATCH);
5053 }
5054 }
5055 Feptr += length;
5056 continue; /* With the main loop */
5057 }
5058
5059 /* Handle repeated back references. If a set group has length zero, just
5060 continue with the main loop, because it matches however many times. For an
5061 unset reference, if the minimum is zero, we can also just continue. We can
5062 also continue if PCRE2_MATCH_UNSET_BACKREF is set, because this makes an
5063 unset group behave as a zero-length group. For any other unset case, carrying
5064 on will result in NOMATCH. */
5065
5066 if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET)
5067 {
5068 if (Fovector[Loffset] == Fovector[Loffset + 1]) continue;
5069 }
5070 else /* Group is not set */
5071 {
5072 if (Lmin == 0 || (mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)
5073 continue;
5074 }
5075
5076 /* First, ensure the minimum number of matches are present. */
5077
5078 for (i = 1; i <= Lmin; i++)
5079 {
5080 PCRE2_SIZE slength;
5081 rrc = match_ref(Loffset, Lcaseless, F, mb, &slength);
5082 if (rrc != 0)
5083 {
5084 if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
5085 CHECK_PARTIAL();
5086 RRETURN(MATCH_NOMATCH);
5087 }
5088 Feptr += slength;
5089 }
5090
5091 /* If min = max, we are done. They are not both allowed to be zero. */
5092
5093 if (Lmin == Lmax) continue;
5094
5095 /* If minimizing, keep trying and advancing the pointer. */
5096
5097 if (reptype == REPTYPE_MIN)
5098 {
5099 for (;;)
5100 {
5101 PCRE2_SIZE slength;
5102 RMATCH(Fecode, RM20);
5103 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5104 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
5105 rrc = match_ref(Loffset, Lcaseless, F, mb, &slength);
5106 if (rrc != 0)
5107 {
5108 if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
5109 CHECK_PARTIAL();
5110 RRETURN(MATCH_NOMATCH);
5111 }
5112 Feptr += slength;
5113 }
5114 /* Control never gets here */
5115 }
5116
5117 /* If maximizing, find the longest string and work backwards, as long as
5118 the matched lengths for each iteration are the same. */
5119
5120 else
5121 {
5122 BOOL samelengths = TRUE;
5123 Lstart = Feptr; /* Starting position */
5124 Flength = Fovector[Loffset+1] - Fovector[Loffset];
5125
5126 for (i = Lmin; i < Lmax; i++)
5127 {
5128 PCRE2_SIZE slength;
5129 rrc = match_ref(Loffset, Lcaseless, F, mb, &slength);
5130 if (rrc != 0)
5131 {
5132 /* Can't use CHECK_PARTIAL because we don't want to update Feptr in
5133 the soft partial matching case. */
5134
5135 if (rrc > 0 && mb->partial != 0 &&
5136 mb->end_subject > mb->start_used_ptr)
5137 {
5138 mb->hitend = TRUE;
5139 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
5140 }
5141 break;
5142 }
5143
5144 if (slength != Flength) samelengths = FALSE;
5145 Feptr += slength;
5146 }
5147
5148 /* If the length matched for each repetition is the same as the length of
5149 the captured group, we can easily work backwards. This is the normal
5150 case. However, in caseless UTF-8 mode there are pairs of case-equivalent
5151 characters whose lengths (in terms of code units) differ. Because this
5152 is very rare, we handle it by re-matching fewer and fewer times. */
5153
5154 if (samelengths)
5155 {
5156 while (Feptr >= Lstart)
5157 {
5158 RMATCH(Fecode, RM21);
5159 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5160 Feptr -= Flength;
5161 }
5162 }
5163
5164 /* The rare case of non-matching lengths. Re-scan the repetition for each
5165 iteration. We know that match_ref() will succeed every time. */
5166
5167 else
5168 {
5169 Lmax = i;
5170 for (;;)
5171 {
5172 RMATCH(Fecode, RM22);
5173 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5174 if (Feptr == Lstart) break; /* Failed after minimal repetition */
5175 Feptr = Lstart;
5176 Lmax--;
5177 for (i = Lmin; i < Lmax; i++)
5178 {
5179 PCRE2_SIZE slength;
5180 (void)match_ref(Loffset, Lcaseless, F, mb, &slength);
5181 Feptr += slength;
5182 }
5183 }
5184 }
5185
5186 RRETURN(MATCH_NOMATCH);
5187 }
5188 /* Control never gets here */
5189
5190 #undef Lcaseless
5191 #undef Lmin
5192 #undef Lmax
5193 #undef Lstart
5194 #undef Loffset
5195
5196
5197
5198 /* ========================================================================= */
5199 /* Opcodes for the start of various parenthesized items */
5200 /* ========================================================================= */
5201
5202 /* In all cases, if the result of RMATCH() is MATCH_THEN, check whether the
5203 (*THEN) is within the current branch by comparing the address of OP_THEN
5204 that is passed back with the end of the branch. If (*THEN) is within the
5205 current branch, and the branch is one of two or more alternatives (it
5206 either starts or ends with OP_ALT), we have reached the limit of THEN's
5207 action, so convert the return code to NOMATCH, which will cause normal
5208 backtracking to happen from now on. Otherwise, THEN is passed back to an
5209 outer alternative. This implements Perl's treatment of parenthesized
5210 groups, where a group not containing | does not affect the current
5211 alternative, that is, (X) is NOT the same as (X|(*F)). */
5212
5213
5214 /* ===================================================================== */
5215 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a non-possessive
5216 bracket group, indicating that it may occur zero times. It may repeat
5217 infinitely, or not at all - i.e. it could be ()* or ()? or even (){0} in
5218 the pattern. Brackets with fixed upper repeat limits are compiled as a
5219 number of copies, with the optional ones preceded by BRAZERO or BRAMINZERO.
5220 Possessive groups with possible zero repeats are preceded by BRAPOSZERO. */
5221
5222 #define Lnext_ecode F->temp_sptr[0]
5223
5224 case OP_BRAZERO:
5225 Lnext_ecode = Fecode + 1;
5226 RMATCH(Lnext_ecode, RM9);
5227 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5228 do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT);
5229 Fecode = Lnext_ecode + 1 + LINK_SIZE;
5230 break;
5231
5232 case OP_BRAMINZERO:
5233 Lnext_ecode = Fecode + 1;
5234 do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT);
5235 RMATCH(Lnext_ecode + 1 + LINK_SIZE, RM10);
5236 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5237 Fecode++;
5238 break;
5239
5240 #undef Lnext_ecode
5241
5242 case OP_SKIPZERO:
5243 Fecode++;
5244 do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT);
5245 Fecode += 1 + LINK_SIZE;
5246 break;
5247
5248
5249 /* ===================================================================== */
5250 /* Handle possessive brackets with an unlimited repeat. The end of these
5251 brackets will always be OP_KETRPOS, which returns MATCH_KETRPOS without
5252 going further in the pattern. */
5253
5254 #define Lframe_type F->temp_32[0]
5255 #define Lmatched_once F->temp_32[1]
5256 #define Lzero_allowed F->temp_32[2]
5257 #define Lstart_eptr F->temp_sptr[0]
5258 #define Lstart_group F->temp_sptr[1]
5259
5260 case OP_BRAPOSZERO:
5261 Lzero_allowed = TRUE; /* Zero repeat is allowed */
5262 Fecode += 1;
5263 if (*Fecode == OP_CBRAPOS || *Fecode == OP_SCBRAPOS)
5264 goto POSSESSIVE_CAPTURE;
5265 goto POSSESSIVE_NON_CAPTURE;
5266
5267 case OP_BRAPOS:
5268 case OP_SBRAPOS:
5269 Lzero_allowed = FALSE; /* Zero repeat not allowed */
5270
5271 POSSESSIVE_NON_CAPTURE:
5272 Lframe_type = GF_NOCAPTURE; /* Remembered frame type */
5273 goto POSSESSIVE_GROUP;
5274
5275 case OP_CBRAPOS:
5276 case OP_SCBRAPOS:
5277 Lzero_allowed = FALSE; /* Zero repeat not allowed */
5278
5279 POSSESSIVE_CAPTURE:
5280 number = GET2(Fecode, 1+LINK_SIZE);
5281 Lframe_type = GF_CAPTURE | number; /* Remembered frame type */
5282
5283 POSSESSIVE_GROUP:
5284 Lmatched_once = FALSE; /* Never matched */
5285 Lstart_group = Fecode; /* Start of this group */
5286
5287 for (;;)
5288 {
5289 Lstart_eptr = Feptr; /* Position at group start */
5290 group_frame_type = Lframe_type;
5291 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM8);
5292 if (rrc == MATCH_KETRPOS)
5293 {
5294 Lmatched_once = TRUE; /* Matched at least once */
5295 if (Feptr == Lstart_eptr) /* Empty match; skip to end */
5296 {
5297 do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
5298 break;
5299 }
5300
5301 Fecode = Lstart_group;
5302 continue;
5303 }
5304
5305 /* See comment above about handling THEN. */
5306
5307 if (rrc == MATCH_THEN)
5308 {
5309 PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1);
5310 if (mb->verb_ecode_ptr < next_ecode &&
5311 (*Fecode == OP_ALT || *next_ecode == OP_ALT))
5312 rrc = MATCH_NOMATCH;
5313 }
5314
5315 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5316 Fecode += GET(Fecode, 1);
5317 if (*Fecode != OP_ALT) break;
5318 }
5319
5320 /* Success if matched something or zero repeat allowed */
5321
5322 if (Lmatched_once || Lzero_allowed)
5323 {
5324 Fecode += 1 + LINK_SIZE;
5325 break;
5326 }
5327
5328 RRETURN(MATCH_NOMATCH);
5329
5330 #undef Lmatched_once
5331 #undef Lzero_allowed
5332 #undef Lframe_type
5333 #undef Lstart_eptr
5334 #undef Lstart_group
5335
5336
5337 /* ===================================================================== */
5338 /* Handle non-capturing brackets that cannot match an empty string. When we
5339 get to the final alternative within the brackets, as long as there are no
5340 THEN's in the pattern, we can optimize by not recording a new backtracking
5341 point. (Ideally we should test for a THEN within this group, but we don't
5342 have that information.) Don't do this if we are at the very top level,
5343 however, because that would make handling assertions and once-only brackets
5344 messier when there is nothing to go back to. */
5345
5346 #define Lframe_type F->temp_32[0] /* Set for all that use GROUPLOOP */
5347 #define Lnext_branch F->temp_sptr[0] /* Used only in OP_BRA handling */
5348
5349 case OP_BRA:
5350 if (mb->hasthen || Frdepth == 0)
5351 {
5352 Lframe_type = 0;
5353 goto GROUPLOOP;
5354 }
5355
5356 for (;;)
5357 {
5358 Lnext_branch = Fecode + GET(Fecode, 1);
5359 if (*Lnext_branch != OP_ALT) break;
5360
5361 /* This is never the final branch. We do not need to test for MATCH_THEN
5362 here because this code is not used when there is a THEN in the pattern. */
5363
5364 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM1);
5365 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5366 Fecode = Lnext_branch;
5367 }
5368
5369 /* Hit the start of the final branch. Continue at this level. */
5370
5371 Fecode += PRIV(OP_lengths)[*Fecode];
5372 break;
5373
5374 #undef Lnext_branch
5375
5376
5377 /* ===================================================================== */
5378 /* Handle a capturing bracket, other than those that are possessive with an
5379 unlimited repeat. */
5380
5381 case OP_CBRA:
5382 case OP_SCBRA:
5383 Lframe_type = GF_CAPTURE | GET2(Fecode, 1+LINK_SIZE);
5384 goto GROUPLOOP;
5385
5386
5387 /* ===================================================================== */
5388 /* Atomic groups and non-capturing brackets that can match an empty string
5389 must record a backtracking point and also set up a chained frame. */
5390
5391 case OP_ONCE:
5392 case OP_SCRIPT_RUN:
5393 case OP_SBRA:
5394 Lframe_type = GF_NOCAPTURE | Fop;
5395
5396 GROUPLOOP:
5397 for (;;)
5398 {
5399 group_frame_type = Lframe_type;
5400 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM2);
5401 if (rrc == MATCH_THEN)
5402 {
5403 PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1);
5404 if (mb->verb_ecode_ptr < next_ecode &&
5405 (*Fecode == OP_ALT || *next_ecode == OP_ALT))
5406 rrc = MATCH_NOMATCH;
5407 }
5408 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5409 Fecode += GET(Fecode, 1);
5410 if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH);
5411 }
5412 /* Control never reaches here. */
5413
5414 #undef Lframe_type
5415
5416
5417 /* ===================================================================== */
5418 /* Pattern recursion either matches the current regex, or some
5419 subexpression. The offset data is the offset to the starting bracket from
5420 the start of the whole pattern. This is so that it works from duplicated
5421 subpatterns. For a whole-pattern recursion, we have to infer the number
5422 zero. */
5423
5424 #define Lframe_type F->temp_32[0]
5425 #define Lstart_branch F->temp_sptr[0]
5426
5427 case OP_RECURSE:
5428 bracode = mb->start_code + GET(Fecode, 1);
5429 number = (bracode == mb->start_code)? 0 : GET2(bracode, 1 + LINK_SIZE);
5430
5431 /* If we are already in a pattern recursion, check for repeating the same
5432 one without changing the subject pointer or the last referenced character
5433 in the subject. This should catch convoluted mutual recursions; some
5434 simple cases are caught at compile time. However, there are rare cases when
5435 this check needs to be turned off. In this case, actual recursion loops
5436 will be caught by the match or heap limits. */
5437
5438 if (Fcurrent_recurse != RECURSE_UNSET)
5439 {
5440 offset = Flast_group_offset;
5441 while (offset != PCRE2_UNSET)
5442 {
5443 N = (heapframe *)((char *)match_data->heapframes + offset);
5444 P = (heapframe *)((char *)N - frame_size);
5445 if (N->group_frame_type == (GF_RECURSE | number))
5446 {
5447 if (Feptr == P->eptr && mb->last_used_ptr == P->recurse_last_used &&
5448 (mb->moptions & PCRE2_DISABLE_RECURSELOOP_CHECK) == 0)
5449 return PCRE2_ERROR_RECURSELOOP;
5450 break;
5451 }
5452 offset = P->last_group_offset;
5453 }
5454 }
5455
5456 /* Remember the current last referenced character and then run the
5457 recursion branch by branch. */
5458
5459 F->recurse_last_used = mb->last_used_ptr;
5460 Lstart_branch = bracode;
5461 Lframe_type = GF_RECURSE | number;
5462
5463 for (;;)
5464 {
5465 PCRE2_SPTR next_ecode;
5466
5467 group_frame_type = Lframe_type;
5468 RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM11);
5469 next_ecode = Lstart_branch + GET(Lstart_branch,1);
5470
5471 /* Handle backtracking verbs, which are defined in a range that can
5472 easily be tested for. PCRE does not allow THEN, SKIP, PRUNE or COMMIT to
5473 escape beyond a recursion; they cause a NOMATCH for the entire recursion.
5474
5475 When one of these verbs triggers, the current recursion group number is
5476 recorded. If it matches the recursion we are processing, the verb
5477 happened within the recursion and we must deal with it. Otherwise it must
5478 have happened after the recursion completed, and so has to be passed
5479 back. See comment above about handling THEN. */
5480
5481 if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX &&
5482 mb->verb_current_recurse == (Lframe_type ^ GF_RECURSE))
5483 {
5484 if (rrc == MATCH_THEN && mb->verb_ecode_ptr < next_ecode &&
5485 (*Lstart_branch == OP_ALT || *next_ecode == OP_ALT))
5486 rrc = MATCH_NOMATCH;
5487 else RRETURN(MATCH_NOMATCH);
5488 }
5489
5490 /* Note that carrying on after (*ACCEPT) in a recursion is handled in the
5491 OP_ACCEPT code. Nothing needs to be done here. */
5492
5493 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5494 Lstart_branch = next_ecode;
5495 if (*Lstart_branch != OP_ALT) RRETURN(MATCH_NOMATCH);
5496 }
5497 /* Control never reaches here. */
5498
5499 #undef Lframe_type
5500 #undef Lstart_branch
5501
5502
5503 /* ===================================================================== */
5504 /* Positive assertions are like other groups except that PCRE doesn't allow
5505 the effect of (*THEN) to escape beyond an assertion; it is therefore
5506 treated as NOMATCH. (*ACCEPT) is treated as a successful assertion, with its
5507 captures and mark retained. Any other return is an error. */
5508
5509 #define Lframe_type F->temp_32[0]
5510
5511 case OP_ASSERT:
5512 case OP_ASSERTBACK:
5513 case OP_ASSERT_NA:
5514 case OP_ASSERTBACK_NA:
5515 Lframe_type = GF_NOCAPTURE | Fop;
5516 for (;;)
5517 {
5518 group_frame_type = Lframe_type;
5519 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM3);
5520 if (rrc == MATCH_ACCEPT)
5521 {
5522 memcpy(Fovector,
5523 (char *)assert_accept_frame + offsetof(heapframe, ovector),
5524 assert_accept_frame->offset_top * sizeof(PCRE2_SIZE));
5525 Foffset_top = assert_accept_frame->offset_top;
5526 Fmark = assert_accept_frame->mark;
5527 break;
5528 }
5529 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
5530 Fecode += GET(Fecode, 1);
5531 if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH);
5532 }
5533
5534 do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
5535 Fecode += 1 + LINK_SIZE;
5536 break;
5537
5538 #undef Lframe_type
5539
5540
5541 /* ===================================================================== */
5542 /* Handle negative assertions. Loop for each non-matching branch as for
5543 positive assertions. */
5544
5545 #define Lframe_type F->temp_32[0]
5546
5547 case OP_ASSERT_NOT:
5548 case OP_ASSERTBACK_NOT:
5549 Lframe_type = GF_NOCAPTURE | Fop;
5550
5551 for (;;)
5552 {
5553 group_frame_type = Lframe_type;
5554 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM4);
5555 switch(rrc)
5556 {
5557 case MATCH_ACCEPT: /* Assertion matched, therefore it fails. */
5558 case MATCH_MATCH:
5559 RRETURN (MATCH_NOMATCH);
5560
5561 case MATCH_NOMATCH: /* Branch failed, try next if present. */
5562 case MATCH_THEN:
5563 Fecode += GET(Fecode, 1);
5564 if (*Fecode != OP_ALT) goto ASSERT_NOT_FAILED;
5565 break;
5566
5567 case MATCH_COMMIT: /* Assertion forced to fail, therefore continue. */
5568 case MATCH_SKIP:
5569 case MATCH_PRUNE:
5570 do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
5571 goto ASSERT_NOT_FAILED;
5572
5573 default: /* Pass back any other return */
5574 RRETURN(rrc);
5575 }
5576 }
5577
5578 /* None of the branches have matched or there was a backtrack to (*COMMIT),
5579 (*SKIP), (*PRUNE), or (*THEN) in the last branch. This is success for a
5580 negative assertion, so carry on. */
5581
5582 ASSERT_NOT_FAILED:
5583 Fecode += 1 + LINK_SIZE;
5584 break;
5585
5586 #undef Lframe_type
5587
5588
5589 /* ===================================================================== */
5590 /* The callout item calls an external function, if one is provided, passing
5591 details of the match so far. This is mainly for debugging, though the
5592 function is able to force a failure. */
5593
5594 case OP_CALLOUT:
5595 case OP_CALLOUT_STR:
5596 rrc = do_callout(F, mb, &length);
5597 if (rrc > 0) RRETURN(MATCH_NOMATCH);
5598 if (rrc < 0) RRETURN(rrc);
5599 Fecode += length;
5600 break;
5601
5602
5603 /* ===================================================================== */
5604 /* Conditional group: compilation checked that there are no more than two
5605 branches. If the condition is false, skipping the first branch takes us
5606 past the end of the item if there is only one branch, but that's exactly
5607 what we want. */
5608
5609 case OP_COND:
5610 case OP_SCOND:
5611
5612 /* The variable Flength will be added to Fecode when the condition is
5613 false, to get to the second branch. Setting it to the offset to the ALT or
5614 KET, then incrementing Fecode achieves this effect. However, if the second
5615 branch is non-existent, we must point to the KET so that the end of the
5616 group is correctly processed. We now have Fecode pointing to the condition
5617 or callout. */
5618
5619 Flength = GET(Fecode, 1); /* Offset to the second branch */
5620 if (Fecode[Flength] != OP_ALT) Flength -= 1 + LINK_SIZE;
5621 Fecode += 1 + LINK_SIZE; /* From this opcode */
5622
5623 /* Because of the way auto-callout works during compile, a callout item is
5624 inserted between OP_COND and an assertion condition. Such a callout can
5625 also be inserted manually. */
5626
5627 if (*Fecode == OP_CALLOUT || *Fecode == OP_CALLOUT_STR)
5628 {
5629 rrc = do_callout(F, mb, &length);
5630 if (rrc > 0) RRETURN(MATCH_NOMATCH);
5631 if (rrc < 0) RRETURN(rrc);
5632
5633 /* Advance Fecode past the callout, so it now points to the condition. We
5634 must adjust Flength so that the value of Fecode+Flength is unchanged. */
5635
5636 Fecode += length;
5637 Flength -= length;
5638 }
5639
5640 /* Test the various possible conditions */
5641
5642 condition = FALSE;
5643 switch(*Fecode)
5644 {
5645 case OP_RREF: /* Group recursion test */
5646 if (Fcurrent_recurse != RECURSE_UNSET)
5647 {
5648 number = GET2(Fecode, 1);
5649 condition = (number == RREF_ANY || number == Fcurrent_recurse);
5650 }
5651 break;
5652
5653 case OP_DNRREF: /* Duplicate named group recursion test */
5654 if (Fcurrent_recurse != RECURSE_UNSET)
5655 {
5656 int count = GET2(Fecode, 1 + IMM2_SIZE);
5657 PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;
5658 while (count-- > 0)
5659 {
5660 number = GET2(slot, 0);
5661 condition = number == Fcurrent_recurse;
5662 if (condition) break;
5663 slot += mb->name_entry_size;
5664 }
5665 }
5666 break;
5667
5668 case OP_CREF: /* Numbered group used test */
5669 offset = (GET2(Fecode, 1) << 1) - 2; /* Doubled ref number */
5670 condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET;
5671 break;
5672
5673 case OP_DNCREF: /* Duplicate named group used test */
5674 {
5675 int count = GET2(Fecode, 1 + IMM2_SIZE);
5676 PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;
5677 while (count-- > 0)
5678 {
5679 offset = (GET2(slot, 0) << 1) - 2;
5680 condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET;
5681 if (condition) break;
5682 slot += mb->name_entry_size;
5683 }
5684 }
5685 break;
5686
5687 case OP_FALSE:
5688 case OP_FAIL: /* The assertion (?!) becomes OP_FAIL */
5689 break;
5690
5691 case OP_TRUE:
5692 condition = TRUE;
5693 break;
5694
5695 /* The condition is an assertion. Run code similar to the assertion code
5696 above. */
5697
5698 #define Lpositive F->temp_32[0]
5699 #define Lstart_branch F->temp_sptr[0]
5700
5701 default:
5702 Lpositive = (*Fecode == OP_ASSERT || *Fecode == OP_ASSERTBACK);
5703 Lstart_branch = Fecode;
5704
5705 for (;;)
5706 {
5707 group_frame_type = GF_CONDASSERT | *Fecode;
5708 RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM5);
5709
5710 switch(rrc)
5711 {
5712 case MATCH_ACCEPT: /* Save captures */
5713 memcpy(Fovector,
5714 (char *)assert_accept_frame + offsetof(heapframe, ovector),
5715 assert_accept_frame->offset_top * sizeof(PCRE2_SIZE));
5716 Foffset_top = assert_accept_frame->offset_top;
5717
5718 /* Fall through */
5719 /* In the case of a match, the captures have already been put into
5720 the current frame. */
5721
5722 case MATCH_MATCH:
5723 condition = Lpositive; /* TRUE for positive assertion */
5724 break;
5725
5726 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
5727 assertion; it is therefore always treated as NOMATCH. */
5728
5729 case MATCH_NOMATCH:
5730 case MATCH_THEN:
5731 Lstart_branch += GET(Lstart_branch, 1);
5732 if (*Lstart_branch == OP_ALT) continue; /* Try next branch */
5733 condition = !Lpositive; /* TRUE for negative assertion */
5734 break;
5735
5736 /* These force no match without checking other branches. */
5737
5738 case MATCH_COMMIT:
5739 case MATCH_SKIP:
5740 case MATCH_PRUNE:
5741 condition = !Lpositive;
5742 break;
5743
5744 default:
5745 RRETURN(rrc);
5746 }
5747 break; /* Out of the branch loop */
5748 }
5749
5750 /* If the condition is true, find the end of the assertion so that
5751 advancing past it gets us to the start of the first branch. */
5752
5753 if (condition)
5754 {
5755 do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
5756 }
5757 break; /* End of assertion condition */
5758 }
5759
5760 #undef Lpositive
5761 #undef Lstart_branch
5762
5763 /* Choose branch according to the condition. */
5764
5765 Fecode += condition? PRIV(OP_lengths)[*Fecode] : Flength;
5766
5767 /* If the opcode is OP_SCOND it means we are at a repeated conditional
5768 group that might match an empty string. We must therefore descend a level
5769 so that the start is remembered for checking. For OP_COND we can just
5770 continue at this level. */
5771
5772 if (Fop == OP_SCOND)
5773 {
5774 group_frame_type = GF_NOCAPTURE | Fop;
5775 RMATCH(Fecode, RM35);
5776 RRETURN(rrc);
5777 }
5778 break;
5779
5780
5781
5782 /* ========================================================================= */
5783 /* End of start of parenthesis opcodes */
5784 /* ========================================================================= */
5785
5786
5787 /* ===================================================================== */
5788 /* Move the subject pointer back by one fixed amount. This occurs at the
5789 start of each branch that has a fixed length in a lookbehind assertion. If
5790 we are too close to the start to move back, fail. In UTF mode we move
5791 back a number of characters, not code units. */
5792
5793 case OP_REVERSE:
5794 number = GET2(Fecode, 1);
5795 #ifdef SUPPORT_UNICODE
5796 if (utf)
5797 {
5798 while (number-- > 0)
5799 {
5800 if (Feptr <= mb->check_subject) RRETURN(MATCH_NOMATCH);
5801 Feptr--;
5802 BACKCHAR(Feptr);
5803 }
5804 }
5805 else
5806 #endif
5807
5808 /* No UTF support, or not in UTF mode: count is code unit count */
5809
5810 {
5811 if ((ptrdiff_t)number > Feptr - mb->start_subject) RRETURN(MATCH_NOMATCH);
5812 Feptr -= number;
5813 }
5814
5815 /* Save the earliest consulted character, then skip to next opcode */
5816
5817 if (Feptr < mb->start_used_ptr) mb->start_used_ptr = Feptr;
5818 Fecode += 1 + IMM2_SIZE;
5819 break;
5820
5821
5822 /* ===================================================================== */
5823 /* Move the subject pointer back by a variable amount. This occurs at the
5824 start of each branch of a lookbehind assertion when the branch has a
5825 variable, but limited, length. A loop is needed to try matching the branch
5826 after moving back different numbers of characters. If we are too close to
5827 the start to move back even the minimum amount, fail. In UTF mode we
5828 move back a number of characters, not code units. */
5829
5830 #define Lmin F->temp_32[0]
5831 #define Lmax F->temp_32[1]
5832 #define Leptr F->temp_sptr[0]
5833
5834 case OP_VREVERSE:
5835 Lmin = GET2(Fecode, 1);
5836 Lmax = GET2(Fecode, 1 + IMM2_SIZE);
5837 Leptr = Feptr;
5838
5839 /* Move back by the maximum branch length and then work forwards. This
5840 ensures that items such as \d{3,5} get the maximum length, which is
5841 relevant for captures, and makes for Perl compatibility. */
5842
5843 #ifdef SUPPORT_UNICODE
5844 if (utf)
5845 {
5846 for (i = 0; i < Lmax; i++)
5847 {
5848 if (Feptr == mb->start_subject)
5849 {
5850 if (i < Lmin) RRETURN(MATCH_NOMATCH);
5851 Lmax = i;
5852 break;
5853 }
5854 Feptr--;
5855 BACKCHAR(Feptr);
5856 }
5857 }
5858 else
5859 #endif
5860
5861 /* No UTF support or not in UTF mode */
5862
5863 {
5864 ptrdiff_t diff = Feptr - mb->start_subject;
5865 uint32_t available = (diff > 65535)? 65535 : ((diff > 0)? (int)diff : 0);
5866 if (Lmin > available) RRETURN(MATCH_NOMATCH);
5867 if (Lmax > available) Lmax = available;
5868 Feptr -= Lmax;
5869 }
5870
5871 /* Now try matching, moving forward one character on failure, until we
5872 reach the minimum back length. */
5873
5874 for (;;)
5875 {
5876 RMATCH(Fecode + 1 + 2 * IMM2_SIZE, RM37);
5877 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5878 if (Lmax-- <= Lmin) RRETURN(MATCH_NOMATCH);
5879 Feptr++;
5880 #ifdef SUPPORT_UNICODE
5881 if (utf) { FORWARDCHARTEST(Feptr, mb->end_subject); }
5882 #endif
5883 }
5884 /* Control never reaches here */
5885
5886 #undef Lmin
5887 #undef Lmax
5888 #undef Leptr
5889
5890 /* ===================================================================== */
5891 /* An alternation is the end of a branch; scan along to find the end of the
5892 bracketed group. */
5893
5894 case OP_ALT:
5895 branch_end = Fecode;
5896 do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT);
5897 break;
5898
5899
5900 /* ===================================================================== */
5901 /* The end of a parenthesized group. For all but OP_BRA and OP_COND, the
5902 starting frame was added to the chained frames in order to remember the
5903 starting subject position for the group. (Not true for OP_BRA when it's a
5904 whole pattern recursion, but that is handled separately below.) */
5905
5906 case OP_KET:
5907 case OP_KETRMIN:
5908 case OP_KETRMAX:
5909 case OP_KETRPOS:
5910
5911 bracode = Fecode - GET(Fecode, 1);
5912
5913 if (branch_end == NULL) branch_end = Fecode;
5914 branch_start = bracode;
5915 while (branch_start + GET(branch_start, 1) != branch_end)
5916 branch_start += GET(branch_start, 1);
5917 branch_end = NULL;
5918
5919 /* Point N to the frame at the start of the most recent group, and P to its
5920 predecessor. Remember the subject pointer at the start of the group. */
5921
5922 if (*bracode != OP_BRA && *bracode != OP_COND)
5923 {
5924 N = (heapframe *)((char *)match_data->heapframes + Flast_group_offset);
5925 P = (heapframe *)((char *)N - frame_size);
5926 Flast_group_offset = P->last_group_offset;
5927
5928 #ifdef DEBUG_SHOW_RMATCH
5929 fprintf(stderr, "++ KET for frame=%d type=%x prev char offset=%lu\n",
5930 N->rdepth, N->group_frame_type,
5931 (char *)P->eptr - (char *)mb->start_subject);
5932 #endif
5933
5934 /* If we are at the end of an assertion that is a condition, return a
5935 match, discarding any intermediate backtracking points. Copy back the
5936 mark setting and the captures into the frame before N so that they are
5937 set on return. Doing this for all assertions, both positive and negative,
5938 seems to match what Perl does. */
5939
5940 if (GF_IDMASK(N->group_frame_type) == GF_CONDASSERT)
5941 {
5942 memcpy((char *)P + offsetof(heapframe, ovector), Fovector,
5943 Foffset_top * sizeof(PCRE2_SIZE));
5944 P->offset_top = Foffset_top;
5945 P->mark = Fmark;
5946 Fback_frame = (char *)F - (char *)P;
5947 RRETURN(MATCH_MATCH);
5948 }
5949 }
5950 else P = NULL; /* Indicates starting frame not recorded */
5951
5952 /* The group was not a conditional assertion. */
5953
5954 switch (*bracode)
5955 {
5956 /* Whole pattern recursion is handled as a recursion into group 0, but
5957 the entire pattern is wrapped in OP_BRA/OP_KET rather than a capturing
5958 group - a design mistake: it should perhaps have been capture group 0.
5959 Anyway, that means the end of such recursion must be handled here. It is
5960 detected by checking for an immediately following OP_END when we are
5961 recursing in group 0. If this is not the end of a whole-pattern
5962 recursion, there is nothing to be done. */
5963
5964 case OP_BRA:
5965 if (Fcurrent_recurse != 0 || Fecode[1+LINK_SIZE] != OP_END) break;
5966
5967 /* It is the end of whole-pattern recursion. */
5968
5969 offset = Flast_group_offset;
5970 if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;
5971 N = (heapframe *)((char *)match_data->heapframes + offset);
5972 P = (heapframe *)((char *)N - frame_size);
5973 Flast_group_offset = P->last_group_offset;
5974
5975 /* Reinstate the previous set of captures and then carry on after the
5976 recursion call. */
5977
5978 memcpy((char *)F + offsetof(heapframe, ovector), P->ovector,
5979 Foffset_top * sizeof(PCRE2_SIZE));
5980 Foffset_top = P->offset_top;
5981 Fcapture_last = P->capture_last;
5982 Fcurrent_recurse = P->current_recurse;
5983 Fecode = P->ecode + 1 + LINK_SIZE;
5984 continue; /* With next opcode */
5985
5986 case OP_COND: /* No need to do anything for these */
5987 case OP_SCOND:
5988 break;
5989
5990 /* Non-atomic positive assertions are like OP_BRA, except that the
5991 subject pointer must be put back to where it was at the start of the
5992 assertion. For a variable lookbehind, check its end point. */
5993
5994 case OP_ASSERTBACK_NA:
5995 if (branch_start[1 + LINK_SIZE] == OP_VREVERSE && Feptr != P->eptr)
5996 RRETURN(MATCH_NOMATCH);
5997 /* Fall through */
5998
5999 case OP_ASSERT_NA:
6000 if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
6001 Feptr = P->eptr;
6002 break;
6003
6004 /* Atomic positive assertions are like OP_ONCE, except that in addition
6005 the subject pointer must be put back to where it was at the start of the
6006 assertion. For a variable lookbehind, check its end point. */
6007
6008 case OP_ASSERTBACK:
6009 if (branch_start[1 + LINK_SIZE] == OP_VREVERSE && Feptr != P->eptr)
6010 RRETURN(MATCH_NOMATCH);
6011 /* Fall through */
6012
6013 case OP_ASSERT:
6014 if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
6015 Feptr = P->eptr;
6016 /* Fall through */
6017
6018 /* For an atomic group, discard internal backtracking points. We must
6019 also ensure that any remaining branches within the top-level of the group
6020 are not tried. Do this by adjusting the code pointer within the backtrack
6021 frame so that it points to the final branch. */
6022
6023 case OP_ONCE:
6024 Fback_frame = ((char *)F - (char *)P);
6025 for (;;)
6026 {
6027 uint32_t y = GET(P->ecode,1);
6028 if ((P->ecode)[y] != OP_ALT) break;
6029 P->ecode += y;
6030 }
6031 break;
6032
6033 /* A matching negative assertion returns MATCH, which is turned into
6034 NOMATCH at the assertion level. For a variable lookbehind, check its end
6035 point. */
6036
6037 case OP_ASSERTBACK_NOT:
6038 if (branch_start[1 + LINK_SIZE] == OP_VREVERSE && Feptr != P->eptr)
6039 RRETURN(MATCH_NOMATCH);
6040 /* Fall through */
6041
6042 case OP_ASSERT_NOT:
6043 RRETURN(MATCH_MATCH);
6044
6045 /* At the end of a script run, apply the script-checking rules. This code
6046 will never be exercised if Unicode support is not compiled, because in
6047 that environment script runs cause an error at compile time. */
6048
6049 case OP_SCRIPT_RUN:
6050 if (!PRIV(script_run)(P->eptr, Feptr, utf)) RRETURN(MATCH_NOMATCH);
6051 break;
6052
6053 /* Whole-pattern recursion is coded as a recurse into group 0, and is
6054 handled with OP_BRA above. Other recursion is handled here. */
6055
6056 case OP_CBRA:
6057 case OP_CBRAPOS:
6058 case OP_SCBRA:
6059 case OP_SCBRAPOS:
6060 number = GET2(bracode, 1+LINK_SIZE);
6061
6062 /* Handle a recursively called group. We reinstate the previous set of
6063 captures and then carry on after the recursion call. */
6064
6065 if (Fcurrent_recurse == number)
6066 {
6067 P = (heapframe *)((char *)N - frame_size);
6068 memcpy((char *)F + offsetof(heapframe, ovector), P->ovector,
6069 Foffset_top * sizeof(PCRE2_SIZE));
6070 Foffset_top = P->offset_top;
6071 Fcapture_last = P->capture_last;
6072 Fcurrent_recurse = P->current_recurse;
6073 Fecode = P->ecode + 1 + LINK_SIZE;
6074 continue; /* With next opcode */
6075 }
6076
6077 /* Deal with actual capturing. */
6078
6079 offset = (number << 1) - 2;
6080 Fcapture_last = number;
6081 Fovector[offset] = P->eptr - mb->start_subject;
6082 Fovector[offset+1] = Feptr - mb->start_subject;
6083 if (offset >= Foffset_top) Foffset_top = offset + 2;
6084 break;
6085 } /* End actions relating to the starting opcode */
6086
6087 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
6088 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
6089 at a time from the outer level. This must precede the empty string test -
6090 in this case that test is done at the outer level. */
6091
6092 if (*Fecode == OP_KETRPOS)
6093 {
6094 memcpy((char *)P + offsetof(heapframe, eptr),
6095 (char *)F + offsetof(heapframe, eptr),
6096 frame_copy_size);
6097 RRETURN(MATCH_KETRPOS);
6098 }
6099
6100 /* Handle the different kinds of closing brackets. A non-repeating ket
6101 needs no special action, just continuing at this level. This also happens
6102 for the repeating kets if the group matched no characters, in order to
6103 forcibly break infinite loops. Otherwise, the repeating kets try the rest
6104 of the pattern or restart from the preceding bracket, in the appropriate
6105 order. */
6106
6107 if (Fop != OP_KET && (P == NULL || Feptr != P->eptr))
6108 {
6109 if (Fop == OP_KETRMIN)
6110 {
6111 RMATCH(Fecode + 1 + LINK_SIZE, RM6);
6112 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6113 Fecode -= GET(Fecode, 1);
6114 break; /* End of ket processing */
6115 }
6116
6117 /* Repeat the maximum number of times (KETRMAX) */
6118
6119 RMATCH(bracode, RM7);
6120 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6121 }
6122
6123 /* Carry on at this level for a non-repeating ket, or after matching an
6124 empty string, or after repeating for a maximum number of times. */
6125
6126 Fecode += 1 + LINK_SIZE;
6127 break;
6128
6129
6130 /* ===================================================================== */
6131 /* Start and end of line assertions, not multiline mode. */
6132
6133 case OP_CIRC: /* Start of line, unless PCRE2_NOTBOL is set. */
6134 if (Feptr != mb->start_subject || (mb->moptions & PCRE2_NOTBOL) != 0)
6135 RRETURN(MATCH_NOMATCH);
6136 Fecode++;
6137 break;
6138
6139 case OP_SOD: /* Unconditional start of subject */
6140 if (Feptr != mb->start_subject) RRETURN(MATCH_NOMATCH);
6141 Fecode++;
6142 break;
6143
6144 /* When PCRE2_NOTEOL is unset, assert before the subject end, or a
6145 terminating newline unless PCRE2_DOLLAR_ENDONLY is set. */
6146
6147 case OP_DOLL:
6148 if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH);
6149 if ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0) goto ASSERT_NL_OR_EOS;
6150
6151 /* Fall through */
6152 /* Unconditional end of subject assertion (\z). */
6153
6154 case OP_EOD:
6155 if (Feptr < mb->true_end_subject) RRETURN(MATCH_NOMATCH);
6156 if (mb->partial != 0)
6157 {
6158 mb->hitend = TRUE;
6159 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
6160 }
6161 Fecode++;
6162 break;
6163
6164 /* End of subject or ending \n assertion (\Z) */
6165
6166 case OP_EODN:
6167 ASSERT_NL_OR_EOS:
6168 if (Feptr < mb->end_subject &&
6169 (!IS_NEWLINE(Feptr) || Feptr != mb->end_subject - mb->nllen))
6170 {
6171 if (mb->partial != 0 &&
6172 Feptr + 1 >= mb->end_subject &&
6173 NLBLOCK->nltype == NLTYPE_FIXED &&
6174 NLBLOCK->nllen == 2 &&
6175 UCHAR21TEST(Feptr) == NLBLOCK->nl[0])
6176 {
6177 mb->hitend = TRUE;
6178 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
6179 }
6180 RRETURN(MATCH_NOMATCH);
6181 }
6182
6183 /* Either at end of string or \n before end. */
6184
6185 if (mb->partial != 0)
6186 {
6187 mb->hitend = TRUE;
6188 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
6189 }
6190 Fecode++;
6191 break;
6192
6193
6194 /* ===================================================================== */
6195 /* Start and end of line assertions, multiline mode. */
6196
6197 /* Start of subject unless notbol, or after any newline except for one at
6198 the very end, unless PCRE2_ALT_CIRCUMFLEX is set. */
6199
6200 case OP_CIRCM:
6201 if ((mb->moptions & PCRE2_NOTBOL) != 0 && Feptr == mb->start_subject)
6202 RRETURN(MATCH_NOMATCH);
6203 if (Feptr != mb->start_subject &&
6204 ((Feptr == mb->end_subject &&
6205 (mb->poptions & PCRE2_ALT_CIRCUMFLEX) == 0) ||
6206 !WAS_NEWLINE(Feptr)))
6207 RRETURN(MATCH_NOMATCH);
6208 Fecode++;
6209 break;
6210
6211 /* Assert before any newline, or before end of subject unless noteol is
6212 set. */
6213
6214 case OP_DOLLM:
6215 if (Feptr < mb->end_subject)
6216 {
6217 if (!IS_NEWLINE(Feptr))
6218 {
6219 if (mb->partial != 0 &&
6220 Feptr + 1 >= mb->end_subject &&
6221 NLBLOCK->nltype == NLTYPE_FIXED &&
6222 NLBLOCK->nllen == 2 &&
6223 UCHAR21TEST(Feptr) == NLBLOCK->nl[0])
6224 {
6225 mb->hitend = TRUE;
6226 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
6227 }
6228 RRETURN(MATCH_NOMATCH);
6229 }
6230 }
6231 else
6232 {
6233 if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH);
6234 SCHECK_PARTIAL();
6235 }
6236 Fecode++;
6237 break;
6238
6239
6240 /* ===================================================================== */
6241 /* Start of match assertion */
6242
6243 case OP_SOM:
6244 if (Feptr != mb->start_subject + mb->start_offset) RRETURN(MATCH_NOMATCH);
6245 Fecode++;
6246 break;
6247
6248
6249 /* ===================================================================== */
6250 /* Reset the start of match point */
6251
6252 case OP_SET_SOM:
6253 Fstart_match = Feptr;
6254 Fecode++;
6255 break;
6256
6257
6258 /* ===================================================================== */
6259 /* Word boundary assertions. Find out if the previous and current
6260 characters are "word" characters. It takes a bit more work in UTF mode.
6261 Characters > 255 are assumed to be "non-word" characters when PCRE2_UCP is
6262 not set. When it is set, use Unicode properties if available, even when not
6263 in UTF mode. Remember the earliest and latest consulted characters. */
6264
6265 case OP_NOT_WORD_BOUNDARY:
6266 case OP_WORD_BOUNDARY:
6267 case OP_NOT_UCP_WORD_BOUNDARY:
6268 case OP_UCP_WORD_BOUNDARY:
6269 if (Feptr == mb->check_subject) prev_is_word = FALSE; else
6270 {
6271 PCRE2_SPTR lastptr = Feptr - 1;
6272 #ifdef SUPPORT_UNICODE
6273 if (utf)
6274 {
6275 BACKCHAR(lastptr);
6276 GETCHAR(fc, lastptr);
6277 }
6278 else
6279 #endif /* SUPPORT_UNICODE */
6280 fc = *lastptr;
6281 if (lastptr < mb->start_used_ptr) mb->start_used_ptr = lastptr;
6282 #ifdef SUPPORT_UNICODE
6283 if (Fop == OP_UCP_WORD_BOUNDARY || Fop == OP_NOT_UCP_WORD_BOUNDARY)
6284 {
6285 int chartype = UCD_CHARTYPE(fc);
6286 int category = PRIV(ucp_gentype)[chartype];
6287 prev_is_word = (category == ucp_L || category == ucp_N ||
6288 chartype == ucp_Mn || chartype == ucp_Pc);
6289 }
6290 else
6291 #endif /* SUPPORT_UNICODE */
6292 prev_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0;
6293 }
6294
6295 /* Get status of next character */
6296
6297 if (Feptr >= mb->end_subject)
6298 {
6299 SCHECK_PARTIAL();
6300 cur_is_word = FALSE;
6301 }
6302 else
6303 {
6304 PCRE2_SPTR nextptr = Feptr + 1;
6305 #ifdef SUPPORT_UNICODE
6306 if (utf)
6307 {
6308 FORWARDCHARTEST(nextptr, mb->end_subject);
6309 GETCHAR(fc, Feptr);
6310 }
6311 else
6312 #endif /* SUPPORT_UNICODE */
6313 fc = *Feptr;
6314 if (nextptr > mb->last_used_ptr) mb->last_used_ptr = nextptr;
6315 #ifdef SUPPORT_UNICODE
6316 if (Fop == OP_UCP_WORD_BOUNDARY || Fop == OP_NOT_UCP_WORD_BOUNDARY)
6317 {
6318 int chartype = UCD_CHARTYPE(fc);
6319 int category = PRIV(ucp_gentype)[chartype];
6320 cur_is_word = (category == ucp_L || category == ucp_N ||
6321 chartype == ucp_Mn || chartype == ucp_Pc);
6322 }
6323 else
6324 #endif /* SUPPORT_UNICODE */
6325 cur_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0;
6326 }
6327
6328 /* Now see if the situation is what we want */
6329
6330 if ((*Fecode++ == OP_WORD_BOUNDARY || Fop == OP_UCP_WORD_BOUNDARY)?
6331 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
6332 RRETURN(MATCH_NOMATCH);
6333 break;
6334
6335
6336 /* ===================================================================== */
6337 /* Backtracking (*VERB)s, with and without arguments. Note that if the
6338 pattern is successfully matched, we do not come back from RMATCH. */
6339
6340 case OP_MARK:
6341 Fmark = mb->nomatch_mark = Fecode + 2;
6342 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM12);
6343
6344 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
6345 argument, and we must check whether that argument matches this MARK's
6346 argument. It is passed back in mb->verb_skip_ptr. If it does match, we
6347 return MATCH_SKIP with mb->verb_skip_ptr now pointing to the subject
6348 position that corresponds to this mark. Otherwise, pass back the return
6349 code unaltered. */
6350
6351 if (rrc == MATCH_SKIP_ARG &&
6352 PRIV(strcmp)(Fecode + 2, mb->verb_skip_ptr) == 0)
6353 {
6354 mb->verb_skip_ptr = Feptr; /* Pass back current position */
6355 RRETURN(MATCH_SKIP);
6356 }
6357 RRETURN(rrc);
6358
6359 case OP_FAIL:
6360 RRETURN(MATCH_NOMATCH);
6361
6362 /* Record the current recursing group number in mb->verb_current_recurse
6363 when a backtracking return such as MATCH_COMMIT is given. This enables the
6364 recurse processing to catch verbs from within the recursion. */
6365
6366 case OP_COMMIT:
6367 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM13);
6368 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6369 mb->verb_current_recurse = Fcurrent_recurse;
6370 RRETURN(MATCH_COMMIT);
6371
6372 case OP_COMMIT_ARG:
6373 Fmark = mb->nomatch_mark = Fecode + 2;
6374 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM36);
6375 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6376 mb->verb_current_recurse = Fcurrent_recurse;
6377 RRETURN(MATCH_COMMIT);
6378
6379 case OP_PRUNE:
6380 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM14);
6381 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6382 mb->verb_current_recurse = Fcurrent_recurse;
6383 RRETURN(MATCH_PRUNE);
6384
6385 case OP_PRUNE_ARG:
6386 Fmark = mb->nomatch_mark = Fecode + 2;
6387 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM15);
6388 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6389 mb->verb_current_recurse = Fcurrent_recurse;
6390 RRETURN(MATCH_PRUNE);
6391
6392 case OP_SKIP:
6393 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM16);
6394 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6395 mb->verb_skip_ptr = Feptr; /* Pass back current position */
6396 mb->verb_current_recurse = Fcurrent_recurse;
6397 RRETURN(MATCH_SKIP);
6398
6399 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
6400 nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
6401 not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
6402 that failed and any that precede it (either they also failed, or were not
6403 triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
6404 SKIP_ARG gets to top level, the match is re-run with mb->ignore_skip_arg
6405 set to the count of the one that failed. */
6406
6407 case OP_SKIP_ARG:
6408 mb->skip_arg_count++;
6409 if (mb->skip_arg_count <= mb->ignore_skip_arg)
6410 {
6411 Fecode += PRIV(OP_lengths)[*Fecode] + Fecode[1];
6412 break;
6413 }
6414 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM17);
6415 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6416
6417 /* Pass back the current skip name and return the special MATCH_SKIP_ARG
6418 return code. This will either be caught by a matching MARK, or get to the
6419 top, where it causes a rematch with mb->ignore_skip_arg set to the value of
6420 mb->skip_arg_count. */
6421
6422 mb->verb_skip_ptr = Fecode + 2;
6423 mb->verb_current_recurse = Fcurrent_recurse;
6424 RRETURN(MATCH_SKIP_ARG);
6425
6426 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
6427 the branch in which it occurs can be determined. */
6428
6429 case OP_THEN:
6430 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM18);
6431 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6432 mb->verb_ecode_ptr = Fecode;
6433 mb->verb_current_recurse = Fcurrent_recurse;
6434 RRETURN(MATCH_THEN);
6435
6436 case OP_THEN_ARG:
6437 Fmark = mb->nomatch_mark = Fecode + 2;
6438 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM19);
6439 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6440 mb->verb_ecode_ptr = Fecode;
6441 mb->verb_current_recurse = Fcurrent_recurse;
6442 RRETURN(MATCH_THEN);
6443
6444
6445 /* ===================================================================== */
6446 /* There's been some horrible disaster. Arrival here can only mean there is
6447 something seriously wrong in the code above or the OP_xxx definitions. */
6448
6449 default:
6450 return PCRE2_ERROR_INTERNAL;
6451 }
6452
6453 /* Do not insert any code in here without much thought; it is assumed
6454 that "continue" in the code above comes out to here to repeat the main
6455 loop. */
6456
6457 } /* End of main loop */
6458 /* Control never reaches here */
6459
6460
6461 /* ========================================================================= */
6462 /* The RRETURN() macro jumps here. The number that is saved in Freturn_id
6463 indicates which label we actually want to return to. The value in Frdepth is
6464 the index number of the frame in the vector. The return value has been placed
6465 in rrc. */
6466
6467 #define LBL(val) case val: goto L_RM##val;
6468
6469 RETURN_SWITCH:
6470 if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
6471 if (Frdepth == 0) return rrc; /* Exit from the top level */
6472 F = (heapframe *)((char *)F - Fback_frame); /* Backtrack */
6473 mb->cb->callout_flags |= PCRE2_CALLOUT_BACKTRACK; /* Note for callouts */
6474
6475 #ifdef DEBUG_SHOW_RMATCH
6476 fprintf(stderr, "++ RETURN %d to RM%d\n", rrc, Freturn_id);
6477 #endif
6478
6479 switch (Freturn_id)
6480 {
6481 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6482 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16)
6483 LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24)
6484 LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32)
6485 LBL(33) LBL(34) LBL(35) LBL(36) LBL(37)
6486
6487 #ifdef SUPPORT_WIDE_CHARS
6488 LBL(100) LBL(101)
6489 #endif
6490
6491 #ifdef SUPPORT_UNICODE
6492 LBL(200) LBL(201) LBL(202) LBL(203) LBL(204) LBL(205) LBL(206)
6493 LBL(207) LBL(208) LBL(209) LBL(210) LBL(211) LBL(212) LBL(213)
6494 LBL(214) LBL(215) LBL(216) LBL(217) LBL(218) LBL(219) LBL(220)
6495 LBL(221) LBL(222) LBL(223) LBL(224) LBL(225)
6496 #endif
6497
6498 default:
6499 return PCRE2_ERROR_INTERNAL;
6500 }
6501 #undef LBL
6502 }
6503
6504
6505 /*************************************************
6506 * Match a Regular Expression *
6507 *************************************************/
6508
6509 /* This function applies a compiled pattern to a subject string and picks out
6510 portions of the string if it matches. Two elements in the vector are set for
6511 each substring: the offsets to the start and end of the substring.
6512
6513 Arguments:
6514 code points to the compiled expression
6515 subject points to the subject string
6516 length length of subject string (may contain binary zeros)
6517 start_offset where to start in the subject string
6518 options option bits
6519 match_data points to a match_data block
6520 mcontext points to a PCRE2 context
6521
6522 Returns: > 0 => success; value is the number of ovector pairs filled
6523 = 0 => success, but ovector is not big enough
6524 = -1 => failed to match (PCRE2_ERROR_NOMATCH)
6525 = -2 => partial match (PCRE2_ERROR_PARTIAL)
6526 < -2 => some kind of unexpected problem
6527 */
6528
6529 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_match(const pcre2_code * code,PCRE2_SPTR subject,PCRE2_SIZE length,PCRE2_SIZE start_offset,uint32_t options,pcre2_match_data * match_data,pcre2_match_context * mcontext)6530 pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
6531 PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
6532 pcre2_match_context *mcontext)
6533 {
6534 int rc;
6535 int was_zero_terminated = 0;
6536 const uint8_t *start_bits = NULL;
6537 const pcre2_real_code *re = (const pcre2_real_code *)code;
6538
6539 BOOL anchored;
6540 BOOL firstline;
6541 BOOL has_first_cu = FALSE;
6542 BOOL has_req_cu = FALSE;
6543 BOOL startline;
6544
6545 #if PCRE2_CODE_UNIT_WIDTH == 8
6546 PCRE2_SPTR memchr_found_first_cu;
6547 PCRE2_SPTR memchr_found_first_cu2;
6548 #endif
6549
6550 PCRE2_UCHAR first_cu = 0;
6551 PCRE2_UCHAR first_cu2 = 0;
6552 PCRE2_UCHAR req_cu = 0;
6553 PCRE2_UCHAR req_cu2 = 0;
6554
6555 PCRE2_SPTR bumpalong_limit;
6556 PCRE2_SPTR end_subject;
6557 PCRE2_SPTR true_end_subject;
6558 PCRE2_SPTR start_match;
6559 PCRE2_SPTR req_cu_ptr;
6560 PCRE2_SPTR start_partial;
6561 PCRE2_SPTR match_partial;
6562
6563 #ifdef SUPPORT_JIT
6564 BOOL use_jit;
6565 #endif
6566
6567 /* This flag is needed even when Unicode is not supported for convenience
6568 (it is used by the IS_NEWLINE macro). */
6569
6570 BOOL utf = FALSE;
6571
6572 #ifdef SUPPORT_UNICODE
6573 BOOL ucp = FALSE;
6574 BOOL allow_invalid;
6575 uint32_t fragment_options = 0;
6576 #ifdef SUPPORT_JIT
6577 BOOL jit_checked_utf = FALSE;
6578 #endif
6579 #endif /* SUPPORT_UNICODE */
6580
6581 PCRE2_SIZE frame_size;
6582 PCRE2_SIZE heapframes_size;
6583
6584 /* We need to have mb as a pointer to a match block, because the IS_NEWLINE
6585 macro is used below, and it expects NLBLOCK to be defined as a pointer. */
6586
6587 pcre2_callout_block cb;
6588 match_block actual_match_block;
6589 match_block *mb = &actual_match_block;
6590
6591 /* Recognize NULL, length 0 as an empty string. */
6592
6593 if (subject == NULL && length == 0) subject = (PCRE2_SPTR)"";
6594
6595 /* Plausibility checks */
6596
6597 if ((options & ~PUBLIC_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
6598 if (code == NULL || subject == NULL || match_data == NULL)
6599 return PCRE2_ERROR_NULL;
6600
6601 start_match = subject + start_offset;
6602 req_cu_ptr = start_match - 1;
6603 if (length == PCRE2_ZERO_TERMINATED)
6604 {
6605 length = PRIV(strlen)(subject);
6606 was_zero_terminated = 1;
6607 }
6608 true_end_subject = end_subject = subject + length;
6609
6610 if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
6611
6612 /* Check that the first field in the block is the magic number. */
6613
6614 if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
6615
6616 /* Check the code unit width. */
6617
6618 if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
6619 return PCRE2_ERROR_BADMODE;
6620
6621 /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
6622 options variable for this function. Users of PCRE2 who are not calling the
6623 function directly would like to have a way of setting these flags, in the same
6624 way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with
6625 constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
6626 (*NOTEMPTY_ATSTART) set bits in the pattern's "flags" field, which we now
6627 transfer to the options for this function. The bits are guaranteed to be
6628 adjacent, but do not have the same values. This bit of Boolean trickery assumes
6629 that the match-time bits are not more significant than the flag bits. If by
6630 accident this is not the case, a compile-time division by zero error will
6631 occur. */
6632
6633 #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
6634 #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
6635 options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
6636 #undef FF
6637 #undef OO
6638
6639 /* If the pattern was successfully studied with JIT support, we will run the
6640 JIT executable instead of the rest of this function. Most options must be set
6641 at compile time for the JIT code to be usable. */
6642
6643 #ifdef SUPPORT_JIT
6644 use_jit = (re->executable_jit != NULL &&
6645 (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0);
6646 #endif
6647
6648 /* Initialize UTF/UCP parameters. */
6649
6650 #ifdef SUPPORT_UNICODE
6651 utf = (re->overall_options & PCRE2_UTF) != 0;
6652 allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0;
6653 ucp = (re->overall_options & PCRE2_UCP) != 0;
6654 #endif /* SUPPORT_UNICODE */
6655
6656 /* Convert the partial matching flags into an integer. */
6657
6658 mb->partial = ((options & PCRE2_PARTIAL_HARD) != 0)? 2 :
6659 ((options & PCRE2_PARTIAL_SOFT) != 0)? 1 : 0;
6660
6661 /* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same
6662 time. */
6663
6664 if (mb->partial != 0 &&
6665 ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
6666 return PCRE2_ERROR_BADOPTION;
6667
6668 /* It is an error to set an offset limit without setting the flag at compile
6669 time. */
6670
6671 if (mcontext != NULL && mcontext->offset_limit != PCRE2_UNSET &&
6672 (re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
6673 return PCRE2_ERROR_BADOFFSETLIMIT;
6674
6675 /* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT,
6676 free the memory that was obtained. Set the field to NULL for no match cases. */
6677
6678 if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0)
6679 {
6680 match_data->memctl.free((void *)match_data->subject,
6681 match_data->memctl.memory_data);
6682 match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT;
6683 }
6684 match_data->subject = NULL;
6685
6686 /* Zero the error offset in case the first code unit is invalid UTF. */
6687
6688 match_data->startchar = 0;
6689
6690
6691 /* ============================= JIT matching ============================== */
6692
6693 /* Prepare for JIT matching. Check a UTF string for validity unless no check is
6694 requested or invalid UTF can be handled. We check only the portion of the
6695 subject that might be inspected during matching - from the offset minus the
6696 maximum lookbehind to the given length. This saves time when a small part of a
6697 large subject is being matched by the use of a starting offset. Note that the
6698 maximum lookbehind is a number of characters, not code units. */
6699
6700 #ifdef SUPPORT_JIT
6701 if (use_jit)
6702 {
6703 #ifdef SUPPORT_UNICODE
6704 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0 && !allow_invalid)
6705 {
6706 #if PCRE2_CODE_UNIT_WIDTH != 32
6707 unsigned int i;
6708 #endif
6709
6710 /* For 8-bit and 16-bit UTF, check that the first code unit is a valid
6711 character start. */
6712
6713 #if PCRE2_CODE_UNIT_WIDTH != 32
6714 if (start_match < end_subject && NOT_FIRSTCU(*start_match))
6715 {
6716 if (start_offset > 0) return PCRE2_ERROR_BADUTFOFFSET;
6717 #if PCRE2_CODE_UNIT_WIDTH == 8
6718 return PCRE2_ERROR_UTF8_ERR20; /* Isolated 0x80 byte */
6719 #else
6720 return PCRE2_ERROR_UTF16_ERR3; /* Isolated low surrogate */
6721 #endif
6722 }
6723 #endif /* WIDTH != 32 */
6724
6725 /* Move back by the maximum lookbehind, just in case it happens at the very
6726 start of matching. */
6727
6728 #if PCRE2_CODE_UNIT_WIDTH != 32
6729 for (i = re->max_lookbehind; i > 0 && start_match > subject; i--)
6730 {
6731 start_match--;
6732 while (start_match > subject &&
6733 #if PCRE2_CODE_UNIT_WIDTH == 8
6734 (*start_match & 0xc0) == 0x80)
6735 #else /* 16-bit */
6736 (*start_match & 0xfc00) == 0xdc00)
6737 #endif
6738 start_match--;
6739 }
6740 #else /* PCRE2_CODE_UNIT_WIDTH != 32 */
6741
6742 /* In the 32-bit library, one code unit equals one character. However,
6743 we cannot just subtract the lookbehind and then compare pointers, because
6744 a very large lookbehind could create an invalid pointer. */
6745
6746 if (start_offset >= re->max_lookbehind)
6747 start_match -= re->max_lookbehind;
6748 else
6749 start_match = subject;
6750 #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
6751
6752 /* Validate the relevant portion of the subject. Adjust the offset of an
6753 invalid code point to be an absolute offset in the whole string. */
6754
6755 match_data->rc = PRIV(valid_utf)(start_match,
6756 length - (start_match - subject), &(match_data->startchar));
6757 if (match_data->rc != 0)
6758 {
6759 match_data->startchar += start_match - subject;
6760 return match_data->rc;
6761 }
6762 jit_checked_utf = TRUE;
6763 }
6764 #endif /* SUPPORT_UNICODE */
6765
6766 /* If JIT returns BADOPTION, which means that the selected complete or
6767 partial matching mode was not compiled, fall through to the interpreter. */
6768
6769 rc = pcre2_jit_match(code, subject, length, start_offset, options,
6770 match_data, mcontext);
6771 if (rc != PCRE2_ERROR_JIT_BADOPTION)
6772 {
6773 match_data->subject_length = length;
6774 if (rc >= 0 && (options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
6775 {
6776 length = CU2BYTES(length + was_zero_terminated);
6777 match_data->subject = match_data->memctl.malloc(length,
6778 match_data->memctl.memory_data);
6779 if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;
6780 memcpy((void *)match_data->subject, subject, length);
6781 match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
6782 }
6783 return rc;
6784 }
6785 }
6786 #endif /* SUPPORT_JIT */
6787
6788 /* ========================= End of JIT matching ========================== */
6789
6790
6791 /* Proceed with non-JIT matching. The default is to allow lookbehinds to the
6792 start of the subject. A UTF check when there is a non-zero offset may change
6793 this. */
6794
6795 mb->check_subject = subject;
6796
6797 /* If a UTF subject string was not checked for validity in the JIT code above,
6798 check it here, and handle support for invalid UTF strings. The check above
6799 happens only when invalid UTF is not supported and PCRE2_NO_UTF_CHECK is unset.
6800 If we get here in those circumstances, it means the subject string is valid,
6801 but for some reason JIT matching was not successful. There is no need to check
6802 the subject again.
6803
6804 We check only the portion of the subject that might be inspected during
6805 matching - from the offset minus the maximum lookbehind to the given length.
6806 This saves time when a small part of a large subject is being matched by the
6807 use of a starting offset. Note that the maximum lookbehind is a number of
6808 characters, not code units.
6809
6810 Note also that support for invalid UTF forces a check, overriding the setting
6811 of PCRE2_NO_UTF_CHECK. */
6812
6813 #ifdef SUPPORT_UNICODE
6814 if (utf &&
6815 #ifdef SUPPORT_JIT
6816 !jit_checked_utf &&
6817 #endif
6818 ((options & PCRE2_NO_UTF_CHECK) == 0 || allow_invalid))
6819 {
6820 #if PCRE2_CODE_UNIT_WIDTH != 32
6821 BOOL skipped_bad_start = FALSE;
6822 #endif
6823
6824 /* For 8-bit and 16-bit UTF, check that the first code unit is a valid
6825 character start. If we are handling invalid UTF, just skip over such code
6826 units. Otherwise, give an appropriate error. */
6827
6828 #if PCRE2_CODE_UNIT_WIDTH != 32
6829 if (allow_invalid)
6830 {
6831 while (start_match < end_subject && NOT_FIRSTCU(*start_match))
6832 {
6833 start_match++;
6834 skipped_bad_start = TRUE;
6835 }
6836 }
6837 else if (start_match < end_subject && NOT_FIRSTCU(*start_match))
6838 {
6839 if (start_offset > 0) return PCRE2_ERROR_BADUTFOFFSET;
6840 #if PCRE2_CODE_UNIT_WIDTH == 8
6841 return PCRE2_ERROR_UTF8_ERR20; /* Isolated 0x80 byte */
6842 #else
6843 return PCRE2_ERROR_UTF16_ERR3; /* Isolated low surrogate */
6844 #endif
6845 }
6846 #endif /* WIDTH != 32 */
6847
6848 /* The mb->check_subject field points to the start of UTF checking;
6849 lookbehinds can go back no further than this. */
6850
6851 mb->check_subject = start_match;
6852
6853 /* Move back by the maximum lookbehind, just in case it happens at the very
6854 start of matching, but don't do this if we skipped bad 8-bit or 16-bit code
6855 units above. */
6856
6857 #if PCRE2_CODE_UNIT_WIDTH != 32
6858 if (!skipped_bad_start)
6859 {
6860 unsigned int i;
6861 for (i = re->max_lookbehind; i > 0 && mb->check_subject > subject; i--)
6862 {
6863 mb->check_subject--;
6864 while (mb->check_subject > subject &&
6865 #if PCRE2_CODE_UNIT_WIDTH == 8
6866 (*mb->check_subject & 0xc0) == 0x80)
6867 #else /* 16-bit */
6868 (*mb->check_subject & 0xfc00) == 0xdc00)
6869 #endif
6870 mb->check_subject--;
6871 }
6872 }
6873 #else /* PCRE2_CODE_UNIT_WIDTH != 32 */
6874
6875 /* In the 32-bit library, one code unit equals one character. However,
6876 we cannot just subtract the lookbehind and then compare pointers, because
6877 a very large lookbehind could create an invalid pointer. */
6878
6879 if (start_offset >= re->max_lookbehind)
6880 mb->check_subject -= re->max_lookbehind;
6881 else
6882 mb->check_subject = subject;
6883 #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
6884
6885 /* Validate the relevant portion of the subject. There's a loop in case we
6886 encounter bad UTF in the characters preceding start_match which we are
6887 scanning because of a lookbehind. */
6888
6889 for (;;)
6890 {
6891 match_data->rc = PRIV(valid_utf)(mb->check_subject,
6892 length - (mb->check_subject - subject), &(match_data->startchar));
6893
6894 if (match_data->rc == 0) break; /* Valid UTF string */
6895
6896 /* Invalid UTF string. Adjust the offset to be an absolute offset in the
6897 whole string. If we are handling invalid UTF strings, set end_subject to
6898 stop before the bad code unit, and set the options to "not end of line".
6899 Otherwise return the error. */
6900
6901 match_data->startchar += mb->check_subject - subject;
6902 if (!allow_invalid || match_data->rc > 0) return match_data->rc;
6903 end_subject = subject + match_data->startchar;
6904
6905 /* If the end precedes start_match, it means there is invalid UTF in the
6906 extra code units we reversed over because of a lookbehind. Advance past the
6907 first bad code unit, and then skip invalid character starting code units in
6908 8-bit and 16-bit modes, and try again with the original end point. */
6909
6910 if (end_subject < start_match)
6911 {
6912 mb->check_subject = end_subject + 1;
6913 #if PCRE2_CODE_UNIT_WIDTH != 32
6914 while (mb->check_subject < start_match && NOT_FIRSTCU(*mb->check_subject))
6915 mb->check_subject++;
6916 #endif
6917 end_subject = true_end_subject;
6918 }
6919
6920 /* Otherwise, set the not end of line option, and do the match. */
6921
6922 else
6923 {
6924 fragment_options = PCRE2_NOTEOL;
6925 break;
6926 }
6927 }
6928 }
6929 #endif /* SUPPORT_UNICODE */
6930
6931 /* A NULL match context means "use a default context", but we take the memory
6932 control functions from the pattern. */
6933
6934 if (mcontext == NULL)
6935 {
6936 mcontext = (pcre2_match_context *)(&PRIV(default_match_context));
6937 mb->memctl = re->memctl;
6938 }
6939 else mb->memctl = mcontext->memctl;
6940
6941 anchored = ((re->overall_options | options) & PCRE2_ANCHORED) != 0;
6942 firstline = !anchored && (re->overall_options & PCRE2_FIRSTLINE) != 0;
6943 startline = (re->flags & PCRE2_STARTLINE) != 0;
6944 bumpalong_limit = (mcontext->offset_limit == PCRE2_UNSET)?
6945 true_end_subject : subject + mcontext->offset_limit;
6946
6947 /* Initialize and set up the fixed fields in the callout block, with a pointer
6948 in the match block. */
6949
6950 mb->cb = &cb;
6951 cb.version = 2;
6952 cb.subject = subject;
6953 cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
6954 cb.callout_flags = 0;
6955
6956 /* Fill in the remaining fields in the match block, except for moptions, which
6957 gets set later. */
6958
6959 mb->callout = mcontext->callout;
6960 mb->callout_data = mcontext->callout_data;
6961
6962 mb->start_subject = subject;
6963 mb->start_offset = start_offset;
6964 mb->end_subject = end_subject;
6965 mb->true_end_subject = true_end_subject;
6966 mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0;
6967 mb->allowemptypartial = (re->max_lookbehind > 0) ||
6968 (re->flags & PCRE2_MATCH_EMPTY) != 0;
6969 mb->poptions = re->overall_options; /* Pattern options */
6970 mb->ignore_skip_arg = 0;
6971 mb->mark = mb->nomatch_mark = NULL; /* In case never set */
6972
6973 /* The name table is needed for finding all the numbers associated with a
6974 given name, for condition testing. The code follows the name table. */
6975
6976 mb->name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
6977 mb->name_count = re->name_count;
6978 mb->name_entry_size = re->name_entry_size;
6979 mb->start_code = mb->name_table + re->name_count * re->name_entry_size;
6980
6981 /* Process the \R and newline settings. */
6982
6983 mb->bsr_convention = re->bsr_convention;
6984 mb->nltype = NLTYPE_FIXED;
6985 switch(re->newline_convention)
6986 {
6987 case PCRE2_NEWLINE_CR:
6988 mb->nllen = 1;
6989 mb->nl[0] = CHAR_CR;
6990 break;
6991
6992 case PCRE2_NEWLINE_LF:
6993 mb->nllen = 1;
6994 mb->nl[0] = CHAR_NL;
6995 break;
6996
6997 case PCRE2_NEWLINE_NUL:
6998 mb->nllen = 1;
6999 mb->nl[0] = CHAR_NUL;
7000 break;
7001
7002 case PCRE2_NEWLINE_CRLF:
7003 mb->nllen = 2;
7004 mb->nl[0] = CHAR_CR;
7005 mb->nl[1] = CHAR_NL;
7006 break;
7007
7008 case PCRE2_NEWLINE_ANY:
7009 mb->nltype = NLTYPE_ANY;
7010 break;
7011
7012 case PCRE2_NEWLINE_ANYCRLF:
7013 mb->nltype = NLTYPE_ANYCRLF;
7014 break;
7015
7016 default: return PCRE2_ERROR_INTERNAL;
7017 }
7018
7019 /* The backtracking frames have fixed data at the front, and a PCRE2_SIZE
7020 vector at the end, whose size depends on the number of capturing parentheses in
7021 the pattern. It is not used at all if there are no capturing parentheses.
7022
7023 frame_size is the total size of each frame
7024 match_data->heapframes is the pointer to the frames vector
7025 match_data->heapframes_size is the allocated size of the vector
7026
7027 We must pad the frame_size for alignment to ensure subsequent frames are as
7028 aligned as heapframe. Whilst ovector is word-aligned due to being a PCRE2_SIZE
7029 array, that does not guarantee it is suitably aligned for pointers, as some
7030 architectures have pointers that are larger than a size_t. */
7031
7032 frame_size = (offsetof(heapframe, ovector) +
7033 re->top_bracket * 2 * sizeof(PCRE2_SIZE) + HEAPFRAME_ALIGNMENT - 1) &
7034 ~(HEAPFRAME_ALIGNMENT - 1);
7035
7036 /* Limits set in the pattern override the match context only if they are
7037 smaller. */
7038
7039 mb->heap_limit = ((mcontext->heap_limit < re->limit_heap)?
7040 mcontext->heap_limit : re->limit_heap);
7041
7042 mb->match_limit = (mcontext->match_limit < re->limit_match)?
7043 mcontext->match_limit : re->limit_match;
7044
7045 mb->match_limit_depth = (mcontext->depth_limit < re->limit_depth)?
7046 mcontext->depth_limit : re->limit_depth;
7047
7048 /* If a pattern has very many capturing parentheses, the frame size may be very
7049 large. Set the initial frame vector size to ensure that there are at least 10
7050 available frames, but enforce a minimum of START_FRAMES_SIZE. If this is
7051 greater than the heap limit, get as large a vector as possible. */
7052
7053 heapframes_size = frame_size * 10;
7054 if (heapframes_size < START_FRAMES_SIZE) heapframes_size = START_FRAMES_SIZE;
7055 if (heapframes_size / 1024 > mb->heap_limit)
7056 {
7057 PCRE2_SIZE max_size = 1024 * mb->heap_limit;
7058 if (max_size < frame_size) return PCRE2_ERROR_HEAPLIMIT;
7059 heapframes_size = max_size;
7060 }
7061
7062 /* If an existing frame vector in the match_data block is large enough, we can
7063 use it. Otherwise, free any pre-existing vector and get a new one. */
7064
7065 if (match_data->heapframes_size < heapframes_size)
7066 {
7067 match_data->memctl.free(match_data->heapframes,
7068 match_data->memctl.memory_data);
7069 match_data->heapframes = match_data->memctl.malloc(heapframes_size,
7070 match_data->memctl.memory_data);
7071 if (match_data->heapframes == NULL)
7072 {
7073 match_data->heapframes_size = 0;
7074 return PCRE2_ERROR_NOMEMORY;
7075 }
7076 match_data->heapframes_size = heapframes_size;
7077 }
7078
7079 /* Write to the ovector within the first frame to mark every capture unset and
7080 to avoid uninitialized memory read errors when it is copied to a new frame. */
7081
7082 memset((char *)(match_data->heapframes) + offsetof(heapframe, ovector), 0xff,
7083 frame_size - offsetof(heapframe, ovector));
7084
7085 /* Pointers to the individual character tables */
7086
7087 mb->lcc = re->tables + lcc_offset;
7088 mb->fcc = re->tables + fcc_offset;
7089 mb->ctypes = re->tables + ctypes_offset;
7090
7091 /* Set up the first code unit to match, if available. If there's no first code
7092 unit there may be a bitmap of possible first characters. */
7093
7094 if ((re->flags & PCRE2_FIRSTSET) != 0)
7095 {
7096 has_first_cu = TRUE;
7097 first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
7098 if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
7099 {
7100 first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);
7101 #ifdef SUPPORT_UNICODE
7102 #if PCRE2_CODE_UNIT_WIDTH == 8
7103 if (first_cu > 127 && ucp && !utf) first_cu2 = UCD_OTHERCASE(first_cu);
7104 #else
7105 if (first_cu > 127 && (utf || ucp)) first_cu2 = UCD_OTHERCASE(first_cu);
7106 #endif
7107 #endif /* SUPPORT_UNICODE */
7108 }
7109 }
7110 else
7111 if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
7112 start_bits = re->start_bitmap;
7113
7114 /* There may also be a "last known required character" set. */
7115
7116 if ((re->flags & PCRE2_LASTSET) != 0)
7117 {
7118 has_req_cu = TRUE;
7119 req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
7120 if ((re->flags & PCRE2_LASTCASELESS) != 0)
7121 {
7122 req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu);
7123 #ifdef SUPPORT_UNICODE
7124 #if PCRE2_CODE_UNIT_WIDTH == 8
7125 if (req_cu > 127 && ucp && !utf) req_cu2 = UCD_OTHERCASE(req_cu);
7126 #else
7127 if (req_cu > 127 && (utf || ucp)) req_cu2 = UCD_OTHERCASE(req_cu);
7128 #endif
7129 #endif /* SUPPORT_UNICODE */
7130 }
7131 }
7132
7133
7134 /* ==========================================================================*/
7135
7136 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
7137 the loop runs just once. */
7138
7139 #ifdef SUPPORT_UNICODE
7140 FRAGMENT_RESTART:
7141 #endif
7142
7143 start_partial = match_partial = NULL;
7144 mb->hitend = FALSE;
7145
7146 #if PCRE2_CODE_UNIT_WIDTH == 8
7147 memchr_found_first_cu = NULL;
7148 memchr_found_first_cu2 = NULL;
7149 #endif
7150
7151 for(;;)
7152 {
7153 PCRE2_SPTR new_start_match;
7154
7155 /* ----------------- Start of match optimizations ---------------- */
7156
7157 /* There are some optimizations that avoid running the match if a known
7158 starting point is not found, or if a known later code unit is not present.
7159 However, there is an option (settable at compile time) that disables these,
7160 for testing and for ensuring that all callouts do actually occur. */
7161
7162 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
7163 {
7164 /* If firstline is TRUE, the start of the match is constrained to the first
7165 line of a multiline string. That is, the match must be before or at the
7166 first newline following the start of matching. Temporarily adjust
7167 end_subject so that we stop the scans for a first code unit at a newline.
7168 If the match fails at the newline, later code breaks the loop. */
7169
7170 if (firstline)
7171 {
7172 PCRE2_SPTR t = start_match;
7173 #ifdef SUPPORT_UNICODE
7174 if (utf)
7175 {
7176 while (t < end_subject && !IS_NEWLINE(t))
7177 {
7178 t++;
7179 ACROSSCHAR(t < end_subject, t, t++);
7180 }
7181 }
7182 else
7183 #endif
7184 while (t < end_subject && !IS_NEWLINE(t)) t++;
7185 end_subject = t;
7186 }
7187
7188 /* Anchored: check the first code unit if one is recorded. This may seem
7189 pointless but it can help in detecting a no match case without scanning for
7190 the required code unit. */
7191
7192 if (anchored)
7193 {
7194 if (has_first_cu || start_bits != NULL)
7195 {
7196 BOOL ok = start_match < end_subject;
7197 if (ok)
7198 {
7199 PCRE2_UCHAR c = UCHAR21TEST(start_match);
7200 ok = has_first_cu && (c == first_cu || c == first_cu2);
7201 if (!ok && start_bits != NULL)
7202 {
7203 #if PCRE2_CODE_UNIT_WIDTH != 8
7204 if (c > 255) c = 255;
7205 #endif
7206 ok = (start_bits[c/8] & (1u << (c&7))) != 0;
7207 }
7208 }
7209 if (!ok)
7210 {
7211 rc = MATCH_NOMATCH;
7212 break;
7213 }
7214 }
7215 }
7216
7217 /* Not anchored. Advance to a unique first code unit if there is one. */
7218
7219 else
7220 {
7221 if (has_first_cu)
7222 {
7223 if (first_cu != first_cu2) /* Caseless */
7224 {
7225 /* In 16-bit and 32-bit modes we have to do our own search, so can
7226 look for both cases at once. */
7227
7228 #if PCRE2_CODE_UNIT_WIDTH != 8
7229 PCRE2_UCHAR smc;
7230 while (start_match < end_subject &&
7231 (smc = UCHAR21TEST(start_match)) != first_cu &&
7232 smc != first_cu2)
7233 start_match++;
7234 #else
7235 /* In 8-bit mode, the use of memchr() gives a big speed up, even
7236 though we have to call it twice in order to find the earliest
7237 occurrence of the code unit in either of its cases. Caching is used
7238 to remember the positions of previously found code units. This can
7239 make a huge difference when the strings are very long and only one
7240 case is actually present. */
7241
7242 PCRE2_SPTR pp1 = NULL;
7243 PCRE2_SPTR pp2 = NULL;
7244 PCRE2_SIZE searchlength = end_subject - start_match;
7245
7246 /* If we haven't got a previously found position for first_cu, or if
7247 the current starting position is later, we need to do a search. If
7248 the code unit is not found, set it to the end. */
7249
7250 if (memchr_found_first_cu == NULL ||
7251 start_match > memchr_found_first_cu)
7252 {
7253 pp1 = memchr(start_match, first_cu, searchlength);
7254 memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1;
7255 }
7256
7257 /* If the start is before a previously found position, use the
7258 previous position, or NULL if a previous search failed. */
7259
7260 else pp1 = (memchr_found_first_cu == end_subject)? NULL :
7261 memchr_found_first_cu;
7262
7263 /* Do the same thing for the other case. */
7264
7265 if (memchr_found_first_cu2 == NULL ||
7266 start_match > memchr_found_first_cu2)
7267 {
7268 pp2 = memchr(start_match, first_cu2, searchlength);
7269 memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2;
7270 }
7271
7272 else pp2 = (memchr_found_first_cu2 == end_subject)? NULL :
7273 memchr_found_first_cu2;
7274
7275 /* Set the start to the end of the subject if neither case was found.
7276 Otherwise, use the earlier found point. */
7277
7278 if (pp1 == NULL)
7279 start_match = (pp2 == NULL)? end_subject : pp2;
7280 else
7281 start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
7282
7283 #endif /* 8-bit handling */
7284 }
7285
7286 /* The caseful case is much simpler. */
7287
7288 else
7289 {
7290 #if PCRE2_CODE_UNIT_WIDTH != 8
7291 while (start_match < end_subject && UCHAR21TEST(start_match) !=
7292 first_cu)
7293 start_match++;
7294 #else
7295 start_match = memchr(start_match, first_cu, end_subject - start_match);
7296 if (start_match == NULL) start_match = end_subject;
7297 #endif
7298 }
7299
7300 /* If we can't find the required first code unit, having reached the
7301 true end of the subject, break the bumpalong loop, to force a match
7302 failure, except when doing partial matching, when we let the next cycle
7303 run at the end of the subject. To see why, consider the pattern
7304 /(?<=abc)def/, which partially matches "abc", even though the string
7305 does not contain the starting character "d". If we have not reached the
7306 true end of the subject (PCRE2_FIRSTLINE caused end_subject to be
7307 temporarily modified) we also let the cycle run, because the matching
7308 string is legitimately allowed to start with the first code unit of a
7309 newline. */
7310
7311 if (mb->partial == 0 && start_match >= mb->end_subject)
7312 {
7313 rc = MATCH_NOMATCH;
7314 break;
7315 }
7316 }
7317
7318 /* If there's no first code unit, advance to just after a linebreak for a
7319 multiline match if required. */
7320
7321 else if (startline)
7322 {
7323 if (start_match > mb->start_subject + start_offset)
7324 {
7325 #ifdef SUPPORT_UNICODE
7326 if (utf)
7327 {
7328 while (start_match < end_subject && !WAS_NEWLINE(start_match))
7329 {
7330 start_match++;
7331 ACROSSCHAR(start_match < end_subject, start_match, start_match++);
7332 }
7333 }
7334 else
7335 #endif
7336 while (start_match < end_subject && !WAS_NEWLINE(start_match))
7337 start_match++;
7338
7339 /* If we have just passed a CR and the newline option is ANY or
7340 ANYCRLF, and we are now at a LF, advance the match position by one
7341 more code unit. */
7342
7343 if (start_match[-1] == CHAR_CR &&
7344 (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
7345 start_match < end_subject &&
7346 UCHAR21TEST(start_match) == CHAR_NL)
7347 start_match++;
7348 }
7349 }
7350
7351 /* If there's no first code unit or a requirement for a multiline line
7352 start, advance to a non-unique first code unit if any have been
7353 identified. The bitmap contains only 256 bits. When code units are 16 or
7354 32 bits wide, all code units greater than 254 set the 255 bit. */
7355
7356 else if (start_bits != NULL)
7357 {
7358 while (start_match < end_subject)
7359 {
7360 uint32_t c = UCHAR21TEST(start_match);
7361 #if PCRE2_CODE_UNIT_WIDTH != 8
7362 if (c > 255) c = 255;
7363 #endif
7364 if ((start_bits[c/8] & (1u << (c&7))) != 0) break;
7365 start_match++;
7366 }
7367
7368 /* See comment above in first_cu checking about the next few lines. */
7369
7370 if (mb->partial == 0 && start_match >= mb->end_subject)
7371 {
7372 rc = MATCH_NOMATCH;
7373 break;
7374 }
7375 }
7376 } /* End first code unit handling */
7377
7378 /* Restore fudged end_subject */
7379
7380 end_subject = mb->end_subject;
7381
7382 /* The following two optimizations must be disabled for partial matching. */
7383
7384 if (mb->partial == 0)
7385 {
7386 PCRE2_SPTR p;
7387
7388 /* The minimum matching length is a lower bound; no string of that length
7389 may actually match the pattern. Although the value is, strictly, in
7390 characters, we treat it as code units to avoid spending too much time in
7391 this optimization. */
7392
7393 if (end_subject - start_match < re->minlength)
7394 {
7395 rc = MATCH_NOMATCH;
7396 break;
7397 }
7398
7399 /* If req_cu is set, we know that that code unit must appear in the
7400 subject for the (non-partial) match to succeed. If the first code unit is
7401 set, req_cu must be later in the subject; otherwise the test starts at
7402 the match point. This optimization can save a huge amount of backtracking
7403 in patterns with nested unlimited repeats that aren't going to match.
7404 Writing separate code for caseful/caseless versions makes it go faster,
7405 as does using an autoincrement and backing off on a match. As in the case
7406 of the first code unit, using memchr() in the 8-bit library gives a big
7407 speed up. Unlike the first_cu check above, we do not need to call
7408 memchr() twice in the caseless case because we only need to check for the
7409 presence of the character in either case, not find the first occurrence.
7410
7411 The search can be skipped if the code unit was found later than the
7412 current starting point in a previous iteration of the bumpalong loop.
7413
7414 HOWEVER: when the subject string is very, very long, searching to its end
7415 can take a long time, and give bad performance on quite ordinary
7416 anchored patterns. This showed up when somebody was matching something
7417 like /^\d+C/ on a 32-megabyte string... so we don't do this when the
7418 string is sufficiently long, but it's worth searching a lot more for
7419 unanchored patterns. */
7420
7421 p = start_match + (has_first_cu? 1:0);
7422 if (has_req_cu && p > req_cu_ptr)
7423 {
7424 PCRE2_SIZE check_length = end_subject - start_match;
7425
7426 if (check_length < REQ_CU_MAX ||
7427 (!anchored && check_length < REQ_CU_MAX * 1000))
7428 {
7429 if (req_cu != req_cu2) /* Caseless */
7430 {
7431 #if PCRE2_CODE_UNIT_WIDTH != 8
7432 while (p < end_subject)
7433 {
7434 uint32_t pp = UCHAR21INCTEST(p);
7435 if (pp == req_cu || pp == req_cu2) { p--; break; }
7436 }
7437 #else /* 8-bit code units */
7438 PCRE2_SPTR pp = p;
7439 p = memchr(pp, req_cu, end_subject - pp);
7440 if (p == NULL)
7441 {
7442 p = memchr(pp, req_cu2, end_subject - pp);
7443 if (p == NULL) p = end_subject;
7444 }
7445 #endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
7446 }
7447
7448 /* The caseful case */
7449
7450 else
7451 {
7452 #if PCRE2_CODE_UNIT_WIDTH != 8
7453 while (p < end_subject)
7454 {
7455 if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
7456 }
7457
7458 #else /* 8-bit code units */
7459 p = memchr(p, req_cu, end_subject - p);
7460 if (p == NULL) p = end_subject;
7461 #endif
7462 }
7463
7464 /* If we can't find the required code unit, break the bumpalong loop,
7465 forcing a match failure. */
7466
7467 if (p >= end_subject)
7468 {
7469 rc = MATCH_NOMATCH;
7470 break;
7471 }
7472
7473 /* If we have found the required code unit, save the point where we
7474 found it, so that we don't search again next time round the bumpalong
7475 loop if the start hasn't yet passed this code unit. */
7476
7477 req_cu_ptr = p;
7478 }
7479 }
7480 }
7481 }
7482
7483 /* ------------ End of start of match optimizations ------------ */
7484
7485 /* Give no match if we have passed the bumpalong limit. */
7486
7487 if (start_match > bumpalong_limit)
7488 {
7489 rc = MATCH_NOMATCH;
7490 break;
7491 }
7492
7493 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
7494 first starting point for which a partial match was found. */
7495
7496 cb.start_match = (PCRE2_SIZE)(start_match - subject);
7497 cb.callout_flags |= PCRE2_CALLOUT_STARTMATCH;
7498
7499 mb->start_used_ptr = start_match;
7500 mb->last_used_ptr = start_match;
7501 #ifdef SUPPORT_UNICODE
7502 mb->moptions = options | fragment_options;
7503 #else
7504 mb->moptions = options;
7505 #endif
7506 mb->match_call_count = 0;
7507 mb->end_offset_top = 0;
7508 mb->skip_arg_count = 0;
7509
7510 #ifdef DEBUG_SHOW_OPS
7511 fprintf(stderr, "++ Calling match()\n");
7512 #endif
7513
7514 rc = match(start_match, mb->start_code, re->top_bracket, frame_size,
7515 match_data, mb);
7516
7517 #ifdef DEBUG_SHOW_OPS
7518 fprintf(stderr, "++ match() returned %d\n\n", rc);
7519 #endif
7520
7521 if (mb->hitend && start_partial == NULL)
7522 {
7523 start_partial = mb->start_used_ptr;
7524 match_partial = start_match;
7525 }
7526
7527 switch(rc)
7528 {
7529 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
7530 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
7531 entirely. The only way we can do that is to re-do the match at the same
7532 point, with a flag to force SKIP with an argument to be ignored. Just
7533 treating this case as NOMATCH does not work because it does not check other
7534 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
7535
7536 case MATCH_SKIP_ARG:
7537 new_start_match = start_match;
7538 mb->ignore_skip_arg = mb->skip_arg_count;
7539 break;
7540
7541 /* SKIP passes back the next starting point explicitly, but if it is no
7542 greater than the match we have just done, treat it as NOMATCH. */
7543
7544 case MATCH_SKIP:
7545 if (mb->verb_skip_ptr > start_match)
7546 {
7547 new_start_match = mb->verb_skip_ptr;
7548 break;
7549 }
7550 /* Fall through */
7551
7552 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
7553 exactly like PRUNE. Unset ignore SKIP-with-argument. */
7554
7555 case MATCH_NOMATCH:
7556 case MATCH_PRUNE:
7557 case MATCH_THEN:
7558 mb->ignore_skip_arg = 0;
7559 new_start_match = start_match + 1;
7560 #ifdef SUPPORT_UNICODE
7561 if (utf)
7562 ACROSSCHAR(new_start_match < end_subject, new_start_match,
7563 new_start_match++);
7564 #endif
7565 break;
7566
7567 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
7568
7569 case MATCH_COMMIT:
7570 rc = MATCH_NOMATCH;
7571 goto ENDLOOP;
7572
7573 /* Any other return is either a match, or some kind of error. */
7574
7575 default:
7576 goto ENDLOOP;
7577 }
7578
7579 /* Control reaches here for the various types of "no match at this point"
7580 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
7581
7582 rc = MATCH_NOMATCH;
7583
7584 /* If PCRE2_FIRSTLINE is set, the match must happen before or at the first
7585 newline in the subject (though it may continue over the newline). Therefore,
7586 if we have just failed to match, starting at a newline, do not continue. */
7587
7588 if (firstline && IS_NEWLINE(start_match)) break;
7589
7590 /* Advance to new matching position */
7591
7592 start_match = new_start_match;
7593
7594 /* Break the loop if the pattern is anchored or if we have passed the end of
7595 the subject. */
7596
7597 if (anchored || start_match > end_subject) break;
7598
7599 /* If we have just passed a CR and we are now at a LF, and the pattern does
7600 not contain any explicit matches for \r or \n, and the newline option is CRLF
7601 or ANY or ANYCRLF, advance the match position by one more code unit. In
7602 normal matching start_match will always be greater than the first position at
7603 this stage, but a failed *SKIP can cause a return at the same point, which is
7604 why the first test exists. */
7605
7606 if (start_match > subject + start_offset &&
7607 start_match[-1] == CHAR_CR &&
7608 start_match < end_subject &&
7609 *start_match == CHAR_NL &&
7610 (re->flags & PCRE2_HASCRORLF) == 0 &&
7611 (mb->nltype == NLTYPE_ANY ||
7612 mb->nltype == NLTYPE_ANYCRLF ||
7613 mb->nllen == 2))
7614 start_match++;
7615
7616 mb->mark = NULL; /* Reset for start of next match attempt */
7617 } /* End of for(;;) "bumpalong" loop */
7618
7619 /* ==========================================================================*/
7620
7621 /* When we reach here, one of the following stopping conditions is true:
7622
7623 (1) The match succeeded, either completely, or partially;
7624
7625 (2) The pattern is anchored or the match was failed after (*COMMIT);
7626
7627 (3) We are past the end of the subject or the bumpalong limit;
7628
7629 (4) PCRE2_FIRSTLINE is set and we have failed to match at a newline, because
7630 this option requests that a match occur at or before the first newline in
7631 the subject.
7632
7633 (5) Some kind of error occurred.
7634
7635 */
7636
7637 ENDLOOP:
7638
7639 /* If end_subject != true_end_subject, it means we are handling invalid UTF,
7640 and have just processed a non-terminal fragment. If this resulted in no match
7641 or a partial match we must carry on to the next fragment (a partial match is
7642 returned to the caller only at the very end of the subject). A loop is used to
7643 avoid trying to match against empty fragments; if the pattern can match an
7644 empty string it would have done so already. */
7645
7646 #ifdef SUPPORT_UNICODE
7647 if (utf && end_subject != true_end_subject &&
7648 (rc == MATCH_NOMATCH || rc == PCRE2_ERROR_PARTIAL))
7649 {
7650 for (;;)
7651 {
7652 /* Advance past the first bad code unit, and then skip invalid character
7653 starting code units in 8-bit and 16-bit modes. */
7654
7655 start_match = end_subject + 1;
7656
7657 #if PCRE2_CODE_UNIT_WIDTH != 32
7658 while (start_match < true_end_subject && NOT_FIRSTCU(*start_match))
7659 start_match++;
7660 #endif
7661
7662 /* If we have hit the end of the subject, there isn't another non-empty
7663 fragment, so give up. */
7664
7665 if (start_match >= true_end_subject)
7666 {
7667 rc = MATCH_NOMATCH; /* In case it was partial */
7668 match_partial = NULL;
7669 break;
7670 }
7671
7672 /* Check the rest of the subject */
7673
7674 mb->check_subject = start_match;
7675 rc = PRIV(valid_utf)(start_match, length - (start_match - subject),
7676 &(match_data->startchar));
7677
7678 /* The rest of the subject is valid UTF. */
7679
7680 if (rc == 0)
7681 {
7682 mb->end_subject = end_subject = true_end_subject;
7683 fragment_options = PCRE2_NOTBOL;
7684 goto FRAGMENT_RESTART;
7685 }
7686
7687 /* A subsequent UTF error has been found; if the next fragment is
7688 non-empty, set up to process it. Otherwise, let the loop advance. */
7689
7690 else if (rc < 0)
7691 {
7692 mb->end_subject = end_subject = start_match + match_data->startchar;
7693 if (end_subject > start_match)
7694 {
7695 fragment_options = PCRE2_NOTBOL|PCRE2_NOTEOL;
7696 goto FRAGMENT_RESTART;
7697 }
7698 }
7699 }
7700 }
7701 #endif /* SUPPORT_UNICODE */
7702
7703 /* Fill in fields that are always returned in the match data. */
7704
7705 match_data->code = re;
7706 match_data->mark = mb->mark;
7707 match_data->matchedby = PCRE2_MATCHEDBY_INTERPRETER;
7708
7709 /* Handle a fully successful match. Set the return code to the number of
7710 captured strings, or 0 if there were too many to fit into the ovector, and then
7711 set the remaining returned values before returning. Make a copy of the subject
7712 string if requested. */
7713
7714 if (rc == MATCH_MATCH)
7715 {
7716 match_data->rc = ((int)mb->end_offset_top >= 2 * match_data->oveccount)?
7717 0 : (int)mb->end_offset_top/2 + 1;
7718 match_data->subject_length = length;
7719 match_data->startchar = start_match - subject;
7720 match_data->leftchar = mb->start_used_ptr - subject;
7721 match_data->rightchar = ((mb->last_used_ptr > mb->end_match_ptr)?
7722 mb->last_used_ptr : mb->end_match_ptr) - subject;
7723 if ((options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
7724 {
7725 length = CU2BYTES(length + was_zero_terminated);
7726 match_data->subject = match_data->memctl.malloc(length,
7727 match_data->memctl.memory_data);
7728 if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;
7729 memcpy((void *)match_data->subject, subject, length);
7730 match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
7731 }
7732 else match_data->subject = subject;
7733
7734 return match_data->rc;
7735 }
7736
7737 /* Control gets here if there has been a partial match, an error, or if the
7738 overall match attempt has failed at all permitted starting positions. Any mark
7739 data is in the nomatch_mark field. */
7740
7741 match_data->mark = mb->nomatch_mark;
7742
7743 /* For anything other than nomatch or partial match, just return the code. */
7744
7745 if (rc != MATCH_NOMATCH && rc != PCRE2_ERROR_PARTIAL) match_data->rc = rc;
7746
7747 /* Handle a partial match. If a "soft" partial match was requested, searching
7748 for a complete match will have continued, and the value of rc at this point
7749 will be MATCH_NOMATCH. For a "hard" partial match, it will already be
7750 PCRE2_ERROR_PARTIAL. */
7751
7752 else if (match_partial != NULL)
7753 {
7754 match_data->subject = subject;
7755 match_data->subject_length = length;
7756 match_data->ovector[0] = match_partial - subject;
7757 match_data->ovector[1] = end_subject - subject;
7758 match_data->startchar = match_partial - subject;
7759 match_data->leftchar = start_partial - subject;
7760 match_data->rightchar = end_subject - subject;
7761 match_data->rc = PCRE2_ERROR_PARTIAL;
7762 }
7763
7764 /* Else this is the classic nomatch case. */
7765
7766 else match_data->rc = PCRE2_ERROR_NOMATCH;
7767
7768 return match_data->rc;
7769 }
7770
7771 /* These #undefs are here to enable unity builds with CMake. */
7772
7773 #undef NLBLOCK /* Block containing newline information */
7774 #undef PSSTART /* Field containing processed string start */
7775 #undef PSEND /* Field containing processed string end */
7776
7777 /* End of pcre2_match.c */
7778