1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2015-2021 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45
/* These defines enable debugging code. Remove the comment markers around a
line to switch the corresponding debugging output on. */

/* #define DEBUG_FRAMES_DISPLAY */
/* #define DEBUG_SHOW_OPS */
/* #define DEBUG_SHOW_RMATCH */

/* The display_frames() debugging function is variadic and is compiled only
when DEBUG_FRAMES_DISPLAY is defined, so <stdarg.h> must be pulled in under
exactly that macro name. */

#ifdef DEBUG_FRAMES_DISPLAY
#include <stdarg.h>
#endif
55
56 /* These defines identify the name of the block containing "static"
57 information, and fields within it. */
58
59 #define NLBLOCK mb /* Block containing newline information */
60 #define PSSTART start_subject /* Field containing processed string start */
61 #define PSEND end_subject /* Field containing processed string end */
62
63 #include "pcre2_internal.h"
64
65 #define RECURSE_UNSET 0xffffffffu /* Bigger than max group number */
66
/* Masks of the public option bits that are permitted at match time. The
second mask is the one whose name refers to JIT matching; it contains the same
bits as the first except for PCRE2_ANCHORED, PCRE2_ENDANCHORED and
PCRE2_NO_JIT. */

#define PUBLIC_MATCH_OPTIONS \
  (PCRE2_ANCHORED | \
   PCRE2_ENDANCHORED | \
   PCRE2_NOTBOL | \
   PCRE2_NOTEOL | \
   PCRE2_NOTEMPTY | \
   PCRE2_NOTEMPTY_ATSTART | \
   PCRE2_NO_UTF_CHECK | \
   PCRE2_PARTIAL_HARD | \
   PCRE2_PARTIAL_SOFT | \
   PCRE2_NO_JIT | \
   PCRE2_COPY_MATCHED_SUBJECT)

#define PUBLIC_JIT_MATCH_OPTIONS \
  (PCRE2_NO_UTF_CHECK | \
   PCRE2_NOTBOL | \
   PCRE2_NOTEOL | \
   PCRE2_NOTEMPTY | \
   PCRE2_NOTEMPTY_ATSTART | \
   PCRE2_PARTIAL_SOFT | \
   PCRE2_PARTIAL_HARD | \
   PCRE2_COPY_MATCHED_SUBJECT)
78
/* Values returned from, and passed around within, the match() function. The
two non-error results are non-negative. Error results are the externally
defined PCRE2_ERROR_xxx codes, which are all negative. */

#define MATCH_NOMATCH        0
#define MATCH_MATCH          1

/* Special results that are used only inside match(). They are made
sufficiently negative to avoid the external error codes. */

#define MATCH_ACCEPT         (-999)
#define MATCH_KETRPOS        (-998)

/* The following five values must be kept together and in sequence, so that a
test for "any one of them" can be written as a range check between
MATCH_BACKTRACK_MIN and MATCH_BACKTRACK_MAX. */

#define MATCH_COMMIT         (-997)
#define MATCH_PRUNE          (-996)
#define MATCH_SKIP           (-995)
#define MATCH_SKIP_ARG       (-994)
#define MATCH_THEN           (-993)

#define MATCH_BACKTRACK_MIN  MATCH_COMMIT
#define MATCH_BACKTRACK_MAX  MATCH_THEN
99
/* Group frame type values. A value of zero means that a frame is not a group
frame. The identity of the group type lives in the upper 16 bits; the lower 16
bits carry data (for example, the capture number). Most groups use a group
frame so that information about the start of the group is easily available at
its end, without scanning back through intermediate frames (backtrack
points). */

#define GF_CAPTURE      0x00010000u
#define GF_NOCAPTURE    0x00020000u
#define GF_CONDASSERT   0x00030000u
#define GF_RECURSE      0x00040000u

/* Extract the identity part and the data part of a group frame type. */

#define GF_IDMASK(a)    ((a) & 0xffff0000u)
#define GF_DATAMASK(a)  ((a) & 0x0000ffffu)
115
/* Repetition types */

enum { REPTYPE_MIN, REPTYPE_MAX, REPTYPE_POS };

/* Minimum repeat counts for the common repeats. The two entries for
OP_CRRANGE and OP_CRMINRANGE are only placefillers. */

static const uint32_t rep_min[] = {
  0,                /* *  */
  0,                /* *? */
  1,                /* +  */
  1,                /* +? */
  0,                /* ?  */
  0,                /* ?? */
  0,                /* dummy placefiller for OP_CRRANGE */
  0,                /* dummy placefiller for OP_CRMINRANGE */
  0,                /* OP_CRPOSSTAR */
  1,                /* OP_CRPOSPLUS */
  0                 /* OP_CRPOSQUERY */
};

/* Maximum repeat counts for the common repeats, laid out like rep_min. A
maximum of UINT32_MAX means "infinity". */

static const uint32_t rep_max[] = {
  UINT32_MAX,       /* *  */
  UINT32_MAX,       /* *? */
  UINT32_MAX,       /* +  */
  UINT32_MAX,       /* +? */
  1,                /* ?  */
  1,                /* ?? */
  0,                /* dummy placefiller for OP_CRRANGE */
  0,                /* dummy placefiller for OP_CRMINRANGE */
  UINT32_MAX,       /* OP_CRPOSSTAR */
  UINT32_MAX,       /* OP_CRPOSPLUS */
  1                 /* OP_CRPOSQUERY */
};

/* Repetition type for each repeat. Unlike the two tables above, this one must
also have an entry for OP_CRPOSRANGE. */

static const uint32_t rep_typ[] = {
  REPTYPE_MAX,      /* *  */
  REPTYPE_MIN,      /* *? */
  REPTYPE_MAX,      /* +  */
  REPTYPE_MIN,      /* +? */
  REPTYPE_MAX,      /* ?  */
  REPTYPE_MIN,      /* ?? */
  REPTYPE_MAX,      /* OP_CRRANGE */
  REPTYPE_MIN,      /* OP_CRMINRANGE */
  REPTYPE_POS,      /* OP_CRPOSSTAR */
  REPTYPE_POS,      /* OP_CRPOSPLUS */
  REPTYPE_POS,      /* OP_CRPOSQUERY */
  REPTYPE_POS       /* OP_CRPOSRANGE */
};
146
/* Identifying numbers for the RMATCH calls at backtracking points. Whenever
any of these lists is changed, the code at RETURN_SWITCH below must be updated
in sync. */

enum {
  RM1 = 1, RM2,  RM3,  RM4,  RM5,  RM6,
  RM7,     RM8,  RM9,  RM10, RM11, RM12,
  RM13,    RM14, RM15, RM16, RM17, RM18,
  RM19,    RM20, RM21, RM22, RM23, RM24,
  RM25,    RM26, RM27, RM28, RM29, RM30,
  RM31,    RM32, RM33, RM34, RM35, RM36
};

#ifdef SUPPORT_WIDE_CHARS
enum {
  RM100 = 100, RM101
};
#endif

#ifdef SUPPORT_UNICODE
enum {
  RM200 = 200, RM201, RM202, RM203, RM204, RM205,
  RM206,       RM207, RM208, RM209, RM210, RM211,
  RM212,       RM213, RM214, RM215, RM216, RM217,
  RM218,       RM219, RM220, RM221, RM222
};
#endif
164
/* Short names for the general fields of the current backtrack frame. The
current frame is always pointed to by the variable F, so each name below is
simply F-> followed by the field name. References to fields in other frames are
written out explicitly where they occur. The current frame also has some fields
whose names start with "temp", used for short-term, localised backtracking
memory; those are #defined with Lxxx names at the point of use and undefined
afterwards, so they do not appear here. */

/* Position in the compiled code and in the subject */

#define Fecode              F->ecode
#define Feptr               F->eptr
#define Fstart_match        F->start_match
#define Fmark               F->mark
#define Fop                 F->op
#define Foccu               F->occu
#define Flength             F->length

/* Frame chaining and "recursion" bookkeeping */

#define Fback_frame         F->back_frame
#define Freturn_id          F->return_id
#define Frdepth             F->rdepth
#define Fgroup_frame_type   F->group_frame_type
#define Flast_group_offset  F->last_group_offset
#define Fcurrent_recurse    F->current_recurse

/* Captures */

#define Fcapture_last       F->capture_last
#define Foffset_top         F->offset_top
#define Fovector            F->ovector
188
189
190 #ifdef DEBUG_FRAMES_DISPLAY
191 /*************************************************
192 * Display current frames and contents *
193 *************************************************/
194
195 /* This debugging function displays the current set of frames and their
196 contents. It is not called automatically from anywhere, the intention being
197 that calls can be inserted where necessary when debugging frame-related
198 problems.
199
200 Arguments:
201 f the file to write to
202 F the current top frame
203 P a previous frame of interest
204 frame_size the frame size
205 mb points to the match block
206 s identification text
207
208 Returns: nothing
209 */
210
211 static void
display_frames(FILE * f,heapframe * F,heapframe * P,PCRE2_SIZE frame_size,match_block * mb,const char * s,...)212 display_frames(FILE *f, heapframe *F, heapframe *P, PCRE2_SIZE frame_size,
213 match_block *mb, const char *s, ...)
214 {
215 uint32_t i;
216 heapframe *Q;
217 va_list ap;
218 va_start(ap, s);
219
220 fprintf(f, "FRAMES ");
221 vfprintf(f, s, ap);
222 va_end(ap);
223
224 if (P != NULL) fprintf(f, " P=%lu",
225 ((char *)P - (char *)(mb->match_frames))/frame_size);
226 fprintf(f, "\n");
227
228 for (i = 0, Q = mb->match_frames;
229 Q <= F;
230 i++, Q = (heapframe *)((char *)Q + frame_size))
231 {
232 fprintf(f, "Frame %d type=%x subj=%lu code=%d back=%lu id=%d",
233 i, Q->group_frame_type, Q->eptr - mb->start_subject, *(Q->ecode),
234 Q->back_frame, Q->return_id);
235
236 if (Q->last_group_offset == PCRE2_UNSET)
237 fprintf(f, " lgoffset=unset\n");
238 else
239 fprintf(f, " lgoffset=%lu\n", Q->last_group_offset/frame_size);
240 }
241 }
242
243 #endif
244
245
246
247 /*************************************************
248 * Process a callout *
249 *************************************************/
250
251 /* This function is called for all callouts, whether "standalone" or at the
252 start of a conditional group. Feptr will be pointing to either OP_CALLOUT or
253 OP_CALLOUT_STR. A callout block is allocated in pcre2_match() and initialized
254 with fixed values.
255
256 Arguments:
257 F points to the current backtracking frame
258 mb points to the match block
259 lengthptr where to return the length of the callout item
260
261 Returns: the return from the callout
262 or 0 if no callout function exists
263 */
264
265 static int
do_callout(heapframe * F,match_block * mb,PCRE2_SIZE * lengthptr)266 do_callout(heapframe *F, match_block *mb, PCRE2_SIZE *lengthptr)
267 {
268 int rc;
269 PCRE2_SIZE save0, save1;
270 PCRE2_SIZE *callout_ovector;
271 pcre2_callout_block *cb;
272
273 *lengthptr = (*Fecode == OP_CALLOUT)?
274 PRIV(OP_lengths)[OP_CALLOUT] : GET(Fecode, 1 + 2*LINK_SIZE);
275
276 if (mb->callout == NULL) return 0; /* No callout function provided */
277
278 /* The original matching code (pre 10.30) worked directly with the ovector
279 passed by the user, and this was passed to callouts. Now that the working
280 ovector is in the backtracking frame, it no longer needs to reserve space for
281 the overall match offsets (which would waste space in the frame). For backward
282 compatibility, however, we pass capture_top and offset_vector to the callout as
283 if for the extended ovector, and we ensure that the first two slots are unset
284 by preserving and restoring their current contents. Picky compilers complain if
285 references such as Fovector[-2] are use directly, so we set up a separate
286 pointer. */
287
288 callout_ovector = (PCRE2_SIZE *)(Fovector) - 2;
289
290 /* The cb->version, cb->subject, cb->subject_length, and cb->start_match fields
291 are set externally. The first 3 never change; the last is updated for each
292 bumpalong. */
293
294 cb = mb->cb;
295 cb->capture_top = (uint32_t)Foffset_top/2 + 1;
296 cb->capture_last = Fcapture_last;
297 cb->offset_vector = callout_ovector;
298 cb->mark = mb->nomatch_mark;
299 cb->current_position = (PCRE2_SIZE)(Feptr - mb->start_subject);
300 cb->pattern_position = GET(Fecode, 1);
301 cb->next_item_length = GET(Fecode, 1 + LINK_SIZE);
302
303 if (*Fecode == OP_CALLOUT) /* Numerical callout */
304 {
305 cb->callout_number = Fecode[1 + 2*LINK_SIZE];
306 cb->callout_string_offset = 0;
307 cb->callout_string = NULL;
308 cb->callout_string_length = 0;
309 }
310 else /* String callout */
311 {
312 cb->callout_number = 0;
313 cb->callout_string_offset = GET(Fecode, 1 + 3*LINK_SIZE);
314 cb->callout_string = Fecode + (1 + 4*LINK_SIZE) + 1;
315 cb->callout_string_length =
316 *lengthptr - (1 + 4*LINK_SIZE) - 2;
317 }
318
319 save0 = callout_ovector[0];
320 save1 = callout_ovector[1];
321 callout_ovector[0] = callout_ovector[1] = PCRE2_UNSET;
322 rc = mb->callout(cb, mb->callout_data);
323 callout_ovector[0] = save0;
324 callout_ovector[1] = save1;
325 cb->callout_flags = 0;
326 return rc;
327 }
328
329
330
331 /*************************************************
332 * Match a back-reference *
333 *************************************************/
334
335 /* This function is called only when it is known that the offset lies within
336 the offsets that have so far been used in the match. Note that in caseless
337 UTF-8 mode, the number of subject bytes matched may be different to the number
338 of reference bytes. (In theory this could also happen in UTF-16 mode, but it
339 seems unlikely.)
340
341 Arguments:
342 offset index into the offset vector
343 caseless TRUE if caseless
344 F the current backtracking frame pointer
345 mb points to match block
346 lengthptr pointer for returning the length matched
347
348 Returns: = 0 sucessful match; number of code units matched is set
349 < 0 no match
350 > 0 partial match
351 */
352
353 static int
match_ref(PCRE2_SIZE offset,BOOL caseless,heapframe * F,match_block * mb,PCRE2_SIZE * lengthptr)354 match_ref(PCRE2_SIZE offset, BOOL caseless, heapframe *F, match_block *mb,
355 PCRE2_SIZE *lengthptr)
356 {
357 PCRE2_SPTR p;
358 PCRE2_SIZE length;
359 PCRE2_SPTR eptr;
360 PCRE2_SPTR eptr_start;
361
362 /* Deal with an unset group. The default is no match, but there is an option to
363 match an empty string. */
364
365 if (offset >= Foffset_top || Fovector[offset] == PCRE2_UNSET)
366 {
367 if ((mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)
368 {
369 *lengthptr = 0;
370 return 0; /* Match */
371 }
372 else return -1; /* No match */
373 }
374
375 /* Separate the caseless and UTF cases for speed. */
376
377 eptr = eptr_start = Feptr;
378 p = mb->start_subject + Fovector[offset];
379 length = Fovector[offset+1] - Fovector[offset];
380
381 if (caseless)
382 {
383 #if defined SUPPORT_UNICODE
384 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
385
386 if (utf || (mb->poptions & PCRE2_UCP) != 0)
387 {
388 PCRE2_SPTR endptr = p + length;
389
390 /* Match characters up to the end of the reference. NOTE: the number of
391 code units matched may differ, because in UTF-8 there are some characters
392 whose upper and lower case codes have different numbers of bytes. For
393 example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65 (3
394 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
395 sequence of two of the latter. It is important, therefore, to check the
396 length along the reference, not along the subject (earlier code did this
397 wrong). UCP without uses Unicode properties but without UTF encoding. */
398
399 while (p < endptr)
400 {
401 uint32_t c, d;
402 const ucd_record *ur;
403 if (eptr >= mb->end_subject) return 1; /* Partial match */
404
405 if (utf)
406 {
407 GETCHARINC(c, eptr);
408 GETCHARINC(d, p);
409 }
410 else
411 {
412 c = *eptr++;
413 d = *p++;
414 }
415
416 ur = GET_UCD(d);
417 if (c != d && c != (uint32_t)((int)d + ur->other_case))
418 {
419 const uint32_t *pp = PRIV(ucd_caseless_sets) + ur->caseset;
420 for (;;)
421 {
422 if (c < *pp) return -1; /* No match */
423 if (c == *pp++) break;
424 }
425 }
426 }
427 }
428 else
429 #endif
430
431 /* Not in UTF or UCP mode */
432 {
433 for (; length > 0; length--)
434 {
435 uint32_t cc, cp;
436 if (eptr >= mb->end_subject) return 1; /* Partial match */
437 cc = UCHAR21TEST(eptr);
438 cp = UCHAR21TEST(p);
439 if (TABLE_GET(cp, mb->lcc, cp) != TABLE_GET(cc, mb->lcc, cc))
440 return -1; /* No match */
441 p++;
442 eptr++;
443 }
444 }
445 }
446
447 /* In the caseful case, we can just compare the code units, whether or not we
448 are in UTF and/or UCP mode. When partial matching, we have to do this unit by
449 unit. */
450
451 else
452 {
453 if (mb->partial != 0)
454 {
455 for (; length > 0; length--)
456 {
457 if (eptr >= mb->end_subject) return 1; /* Partial match */
458 if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1; /* No match */
459 }
460 }
461
462 /* Not partial matching */
463
464 else
465 {
466 if ((PCRE2_SIZE)(mb->end_subject - eptr) < length) return 1; /* Partial */
467 if (memcmp(p, eptr, CU2BYTES(length)) != 0) return -1; /* No match */
468 eptr += length;
469 }
470 }
471
472 *lengthptr = eptr - eptr_start;
473 return 0; /* Match */
474 }
475
476
477
478 /******************************************************************************
479 *******************************************************************************
480 "Recursion" in the match() function
481
482 The original match() function was highly recursive, but this proved to be the
483 source of a number of problems over the years, mostly because of the relatively
484 small system stacks that are commonly found. As new features were added to
485 patterns, various kludges were invented to reduce the amount of stack used,
486 making the code hard to understand in places.
487
488 A version did exist that used individual frames on the heap instead of calling
489 match() recursively, but this ran substantially slower. The current version is
490 a refactoring that uses a vector of frames to remember backtracking points.
491 This runs no slower, and possibly even a bit faster than the original recursive
492 implementation. An initial vector of size START_FRAMES_SIZE (enough for maybe
493 50 frames) is allocated on the system stack. If this is not big enough, the
494 heap is used for a larger vector.
495
496 *******************************************************************************
497 ******************************************************************************/
498
499
500
501
502 /*************************************************
503 * Macros for the match() function *
504 *************************************************/
505
/* The two macros below package the partial-matching tests that are needed in
several places in the code. SCHECK_PARTIAL() is for use when it is already
known that the end of the subject has been reached; CHECK_PARTIAL() makes that
test first. The "hit end" flag is set if the pointer is at the end of the
subject and either (a) the pointer is past the earliest inspected character
(i.e. something has been matched, even if not part of the actual matched
string), or (b) the pattern contains a lookbehind. These are the conditions
under which adding more characters might allow the current match to continue.

For hard partial matching, a partial match is returned immediately. Otherwise,
carrying on means that a complete match on the current subject will be sought,
and a partial match is returned only if no complete match can be found. */

#define SCHECK_PARTIAL() \
  if (mb->partial != 0 && \
      (Feptr > mb->start_used_ptr || mb->allowemptypartial)) \
    { \
    mb->hitend = TRUE; \
    if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; \
    }

#define CHECK_PARTIAL() \
  if (Feptr >= mb->end_subject) \
    { \
    SCHECK_PARTIAL(); \
    }


/* The next two macros implement backtracking. They simulate a recursive call
to the match() function by means of a local vector of frames which remember the
backtracking points. RMATCH() records where to continue (ra) and which return
point to come back to (rb), jumps to MATCH_RECURSE, and plants the label L_rb
as the place at which execution resumes. RRETURN() stores a result in rrc and
jumps to RETURN_SWITCH. */

#define RMATCH(ra,rb) \
  { \
  start_ecode = ra; \
  Freturn_id = rb; \
  goto MATCH_RECURSE; \
  L_##rb:; \
  }

#define RRETURN(ra) \
  { \
  rrc = ra; \
  goto RETURN_SWITCH; \
  }
550
551
552
553 /*************************************************
554 * Match from current position *
555 *************************************************/
556
557 /* This function is called to run one match attempt at a single starting point
558 in the subject.
559
560 Performance note: It might be tempting to extract commonly used fields from the
561 mb structure (e.g. end_subject) into individual variables to improve
562 performance. Tests using gcc on a SPARC disproved this; in the first case, it
563 made performance worse.
564
565 Arguments:
566 start_eptr starting character in subject
567 start_ecode starting position in compiled code
568 ovector pointer to the final output vector
569 oveccount number of pairs in ovector
570 top_bracket number of capturing parentheses in the pattern
571 frame_size size of each backtracking frame
572 mb pointer to "static" variables block
573
574 Returns: MATCH_MATCH if matched ) these values are >= 0
575 MATCH_NOMATCH if failed to match )
576 negative MATCH_xxx value for PRUNE, SKIP, etc
577 negative PCRE2_ERROR_xxx value if aborted by an error condition
578 (e.g. stopped by repeated call or depth limit)
579 */
580
581 static int
match(PCRE2_SPTR start_eptr,PCRE2_SPTR start_ecode,PCRE2_SIZE * ovector,uint16_t oveccount,uint16_t top_bracket,PCRE2_SIZE frame_size,match_block * mb)582 match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, PCRE2_SIZE *ovector,
583 uint16_t oveccount, uint16_t top_bracket, PCRE2_SIZE frame_size,
584 match_block *mb)
585 {
586 /* Frame-handling variables */
587
588 heapframe *F; /* Current frame pointer */
589 heapframe *N = NULL; /* Temporary frame pointers */
590 heapframe *P = NULL;
591 heapframe *assert_accept_frame = NULL; /* For passing back a frame with captures */
592 PCRE2_SIZE frame_copy_size; /* Amount to copy when creating a new frame */
593
594 /* Local variables that do not need to be preserved over calls to RMATCH(). */
595
596 PCRE2_SPTR bracode; /* Temp pointer to start of group */
597 PCRE2_SIZE offset; /* Used for group offsets */
598 PCRE2_SIZE length; /* Used for various length calculations */
599
600 int rrc; /* Return from functions & backtracking "recursions" */
601 #ifdef SUPPORT_UNICODE
602 int proptype; /* Type of character property */
603 #endif
604
605 uint32_t i; /* Used for local loops */
606 uint32_t fc; /* Character values */
607 uint32_t number; /* Used for group and other numbers */
608 uint32_t reptype = 0; /* Type of repetition (0 to avoid compiler warning) */
609 uint32_t group_frame_type; /* Specifies type for new group frames */
610
611 BOOL condition; /* Used in conditional groups */
612 BOOL cur_is_word; /* Used in "word" tests */
613 BOOL prev_is_word; /* Used in "word" tests */
614
615 /* UTF and UCP flags */
616
617 #ifdef SUPPORT_UNICODE
618 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
619 BOOL ucp = (mb->poptions & PCRE2_UCP) != 0;
620 #else
621 BOOL utf = FALSE; /* Required for convenience even when no Unicode support */
622 #endif
623
624 /* This is the length of the last part of a backtracking frame that must be
625 copied when a new frame is created. */
626
627 frame_copy_size = frame_size - offsetof(heapframe, eptr);
628
629 /* Set up the first current frame at the start of the vector, and initialize
630 fields that are not reset for new frames. */
631
632 F = mb->match_frames;
633 Frdepth = 0; /* "Recursion" depth */
634 Fcapture_last = 0; /* Number of most recent capture */
635 Fcurrent_recurse = RECURSE_UNSET; /* Not pattern recursing. */
636 Fstart_match = Feptr = start_eptr; /* Current data pointer and start match */
637 Fmark = NULL; /* Most recent mark */
638 Foffset_top = 0; /* End of captures within the frame */
639 Flast_group_offset = PCRE2_UNSET; /* Saved frame of most recent group */
640 group_frame_type = 0; /* Not a start of group frame */
641 goto NEW_FRAME; /* Start processing with this frame */
642
643 /* Come back here when we want to create a new frame for remembering a
644 backtracking point. */
645
646 MATCH_RECURSE:
647
648 /* Set up a new backtracking frame. If the vector is full, get a new one
649 on the heap, doubling the size, but constrained by the heap limit. */
650
651 N = (heapframe *)((char *)F + frame_size);
652 if (N >= mb->match_frames_top)
653 {
654 PCRE2_SIZE newsize = mb->frame_vector_size * 2;
655 heapframe *new;
656
657 if ((newsize / 1024) > mb->heap_limit)
658 {
659 PCRE2_SIZE maxsize = ((mb->heap_limit * 1024)/frame_size) * frame_size;
660 if (mb->frame_vector_size >= maxsize) return PCRE2_ERROR_HEAPLIMIT;
661 newsize = maxsize;
662 }
663
664 new = mb->memctl.malloc(newsize, mb->memctl.memory_data);
665 if (new == NULL) return PCRE2_ERROR_NOMEMORY;
666 memcpy(new, mb->match_frames, mb->frame_vector_size);
667
668 F = (heapframe *)((char *)new + ((char *)F - (char *)mb->match_frames));
669 N = (heapframe *)((char *)F + frame_size);
670
671 if (mb->match_frames != mb->stack_frames)
672 mb->memctl.free(mb->match_frames, mb->memctl.memory_data);
673 mb->match_frames = new;
674 mb->match_frames_top = (heapframe *)((char *)mb->match_frames + newsize);
675 mb->frame_vector_size = newsize;
676 }
677
678 #ifdef DEBUG_SHOW_RMATCH
679 fprintf(stderr, "++ RMATCH %2d frame=%d", Freturn_id, Frdepth + 1);
680 if (group_frame_type != 0)
681 {
682 fprintf(stderr, " type=%x ", group_frame_type);
683 switch (GF_IDMASK(group_frame_type))
684 {
685 case GF_CAPTURE:
686 fprintf(stderr, "capture=%d", GF_DATAMASK(group_frame_type));
687 break;
688
689 case GF_NOCAPTURE:
690 fprintf(stderr, "nocapture op=%d", GF_DATAMASK(group_frame_type));
691 break;
692
693 case GF_CONDASSERT:
694 fprintf(stderr, "condassert op=%d", GF_DATAMASK(group_frame_type));
695 break;
696
697 case GF_RECURSE:
698 fprintf(stderr, "recurse=%d", GF_DATAMASK(group_frame_type));
699 break;
700
701 default:
702 fprintf(stderr, "*** unknown ***");
703 break;
704 }
705 }
706 fprintf(stderr, "\n");
707 #endif
708
709 /* Copy those fields that must be copied into the new frame, increase the
710 "recursion" depth (i.e. the new frame's index) and then make the new frame
711 current. */
712
713 memcpy((char *)N + offsetof(heapframe, eptr),
714 (char *)F + offsetof(heapframe, eptr),
715 frame_copy_size);
716
717 N->rdepth = Frdepth + 1;
718 F = N;
719
720 /* Carry on processing with a new frame. */
721
722 NEW_FRAME:
723 Fgroup_frame_type = group_frame_type;
724 Fecode = start_ecode; /* Starting code pointer */
725 Fback_frame = frame_size; /* Default is go back one frame */
726
727 /* If this is a special type of group frame, remember its offset for quick
728 access at the end of the group. If this is a recursion, set a new current
729 recursion value. */
730
731 if (group_frame_type != 0)
732 {
733 Flast_group_offset = (char *)F - (char *)mb->match_frames;
734 if (GF_IDMASK(group_frame_type) == GF_RECURSE)
735 Fcurrent_recurse = GF_DATAMASK(group_frame_type);
736 group_frame_type = 0;
737 }
738
739
740 /* ========================================================================= */
741 /* This is the main processing loop. First check that we haven't recorded too
742 many backtracks (search tree is too large), or that we haven't exceeded the
743 recursive depth limit (used too many backtracking frames). If not, process the
744 opcodes. */
745
746 if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;
747 if (Frdepth >= mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
748
749 for (;;)
750 {
751 #ifdef DEBUG_SHOW_OPS
752 fprintf(stderr, "++ op=%d\n", *Fecode);
753 #endif
754
755 Fop = (uint8_t)(*Fecode); /* Cast needed for 16-bit and 32-bit modes */
756 switch(Fop)
757 {
758 /* ===================================================================== */
759 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes, to close
760 any currently open capturing brackets. Unlike reaching the end of a group,
761 where we know the starting frame is at the top of the chained frames, in
762 this case we have to search back for the relevant frame in case other types
763 of group that use chained frames have intervened. Multiple OP_CLOSEs always
764 come innermost first, which matches the chain order. We can ignore this in
765 a recursion, because captures are not passed out of recursions. */
766
767 case OP_CLOSE:
768 if (Fcurrent_recurse == RECURSE_UNSET)
769 {
770 number = GET2(Fecode, 1);
771 offset = Flast_group_offset;
772 for(;;)
773 {
774 if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;
775 N = (heapframe *)((char *)mb->match_frames + offset);
776 P = (heapframe *)((char *)N - frame_size);
777 if (N->group_frame_type == (GF_CAPTURE | number)) break;
778 offset = P->last_group_offset;
779 }
780 offset = (number << 1) - 2;
781 Fcapture_last = number;
782 Fovector[offset] = P->eptr - mb->start_subject;
783 Fovector[offset+1] = Feptr - mb->start_subject;
784 if (offset >= Foffset_top) Foffset_top = offset + 2;
785 }
786 Fecode += PRIV(OP_lengths)[*Fecode];
787 break;
788
789
790 /* ===================================================================== */
791 /* Real or forced end of the pattern, assertion, or recursion. In an
792 assertion ACCEPT, update the last used pointer and remember the current
793 frame so that the captures and mark can be fished out of it. */
794
795 case OP_ASSERT_ACCEPT:
796 if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
797 assert_accept_frame = F;
798 RRETURN(MATCH_ACCEPT);
799
800 /* If recursing, we have to find the most recent recursion. */
801
802 case OP_ACCEPT:
803 case OP_END:
804
805 /* Handle end of a recursion. */
806
807 if (Fcurrent_recurse != RECURSE_UNSET)
808 {
809 offset = Flast_group_offset;
810 for(;;)
811 {
812 if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;
813 N = (heapframe *)((char *)mb->match_frames + offset);
814 P = (heapframe *)((char *)N - frame_size);
815 if (GF_IDMASK(N->group_frame_type) == GF_RECURSE) break;
816 offset = P->last_group_offset;
817 }
818
819 /* N is now the frame of the recursion; the previous frame is at the
820 OP_RECURSE position. Go back there, copying the current subject position
821 and mark, and the start_match position (\K might have changed it), and
822 then move on past the OP_RECURSE. */
823
824 P->eptr = Feptr;
825 P->mark = Fmark;
826 P->start_match = Fstart_match;
827 F = P;
828 Fecode += 1 + LINK_SIZE;
829 continue;
830 }
831
832 /* Not a recursion. Fail for an empty string match if either PCRE2_NOTEMPTY
833 is set, or if PCRE2_NOTEMPTY_ATSTART is set and we have matched at the
834 start of the subject. In both cases, backtracking will then try other
835 alternatives, if any. */
836
837 if (Feptr == Fstart_match &&
838 ((mb->moptions & PCRE2_NOTEMPTY) != 0 ||
839 ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) != 0 &&
840 Fstart_match == mb->start_subject + mb->start_offset)))
841 RRETURN(MATCH_NOMATCH);
842
843 /* Also fail if PCRE2_ENDANCHORED is set and the end of the match is not
844 the end of the subject. After (*ACCEPT) we fail the entire match (at this
845 position) but backtrack on reaching the end of the pattern. */
846
847 if (Feptr < mb->end_subject &&
848 ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0)
849 {
850 if (Fop == OP_END) RRETURN(MATCH_NOMATCH);
851 return MATCH_NOMATCH;
852 }
853
854 /* We have a successful match of the whole pattern. Record the result and
855 then do a direct return from the function. If there is space in the offset
856 vector, set any pairs that follow the highest-numbered captured string but
857 are less than the number of capturing groups in the pattern to PCRE2_UNSET.
858 It is documented that this happens. "Gaps" are set to PCRE2_UNSET
859 dynamically. It is only those at the end that need setting here. */
860
861 mb->end_match_ptr = Feptr; /* Record where we ended */
862 mb->end_offset_top = Foffset_top; /* and how many extracts were taken */
863 mb->mark = Fmark; /* and the last success mark */
864 if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
865
866 ovector[0] = Fstart_match - mb->start_subject;
867 ovector[1] = Feptr - mb->start_subject;
868
869 /* Set i to the smaller of the sizes of the external and frame ovectors. */
870
871 i = 2 * ((top_bracket + 1 > oveccount)? oveccount : top_bracket + 1);
872 memcpy(ovector + 2, Fovector, (i - 2) * sizeof(PCRE2_SIZE));
873 while (--i >= Foffset_top + 2) ovector[i] = PCRE2_UNSET;
874 return MATCH_MATCH; /* Note: NOT RRETURN */
875
876
877 /*===================================================================== */
878 /* Match any single character type except newline; have to take care with
879 CRLF newlines and partial matching. */
880
881 case OP_ANY:
882 if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
883 if (mb->partial != 0 &&
884 Feptr == mb->end_subject - 1 &&
885 NLBLOCK->nltype == NLTYPE_FIXED &&
886 NLBLOCK->nllen == 2 &&
887 UCHAR21TEST(Feptr) == NLBLOCK->nl[0])
888 {
889 mb->hitend = TRUE;
890 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
891 }
892 /* Fall through */
893
894 /* Match any single character whatsoever. */
895
896 case OP_ALLANY:
897 if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */
898 { /* not be updated before SCHECK_PARTIAL. */
899 SCHECK_PARTIAL();
900 RRETURN(MATCH_NOMATCH);
901 }
902 Feptr++;
903 #ifdef SUPPORT_UNICODE
904 if (utf) ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
905 #endif
906 Fecode++;
907 break;
908
909
910 /* ===================================================================== */
911 /* Match a single code unit, even in UTF mode. This opcode really does
912 match any code unit, even newline. (It really should be called ANYCODEUNIT,
913 of course - the byte name is from pre-16 bit days.) */
914
915 case OP_ANYBYTE:
916 if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */
917 { /* not be updated before SCHECK_PARTIAL. */
918 SCHECK_PARTIAL();
919 RRETURN(MATCH_NOMATCH);
920 }
921 Feptr++;
922 Fecode++;
923 break;
924
925
926 /* ===================================================================== */
927 /* Match a single character, casefully */
928
929 case OP_CHAR:
930 #ifdef SUPPORT_UNICODE
931 if (utf)
932 {
933 Flength = 1;
934 Fecode++;
935 GETCHARLEN(fc, Fecode, Flength);
936 if (Flength > (PCRE2_SIZE)(mb->end_subject - Feptr))
937 {
938 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
939 RRETURN(MATCH_NOMATCH);
940 }
941 for (; Flength > 0; Flength--)
942 {
943 if (*Fecode++ != UCHAR21INC(Feptr)) RRETURN(MATCH_NOMATCH);
944 }
945 }
946 else
947 #endif
948
949 /* Not UTF mode */
950 {
951 if (mb->end_subject - Feptr < 1)
952 {
953 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
954 RRETURN(MATCH_NOMATCH);
955 }
956 if (Fecode[1] != *Feptr++) RRETURN(MATCH_NOMATCH);
957 Fecode += 2;
958 }
959 break;
960
961
962 /* ===================================================================== */
963 /* Match a single character, caselessly. If we are at the end of the
964 subject, give up immediately. We get here only when the pattern character
965 has at most one other case. Characters with more than two cases are coded
966 as OP_PROP with the pseudo-property PT_CLIST. */
967
968 case OP_CHARI:
969 if (Feptr >= mb->end_subject)
970 {
971 SCHECK_PARTIAL();
972 RRETURN(MATCH_NOMATCH);
973 }
974
975 #ifdef SUPPORT_UNICODE
976 if (utf)
977 {
978 Flength = 1;
979 Fecode++;
980 GETCHARLEN(fc, Fecode, Flength);
981
982 /* If the pattern character's value is < 128, we know that its other case
983 (if any) is also < 128 (and therefore only one code unit long in all
984 code-unit widths), so we can use the fast lookup table. We checked above
985 that there is at least one character left in the subject. */
986
987 if (fc < 128)
988 {
989 uint32_t cc = UCHAR21(Feptr);
990 if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);
991 Fecode++;
992 Feptr++;
993 }
994
995 /* Otherwise we must pick up the subject character and use Unicode
996 property support to test its other case. Note that we cannot use the
997 value of "Flength" to check for sufficient bytes left, because the other
998 case of the character may have more or fewer code units. */
999
1000 else
1001 {
1002 uint32_t dc;
1003 GETCHARINC(dc, Feptr);
1004 Fecode += Flength;
1005 if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
1006 }
1007 }
1008
1009 /* If UCP is set without UTF we must do the same as above, but with one
1010 character per code unit. */
1011
1012 else if (ucp)
1013 {
1014 uint32_t cc = UCHAR21(Feptr);
1015 fc = Fecode[1];
1016 if (fc < 128)
1017 {
1018 if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);
1019 }
1020 else
1021 {
1022 if (cc != fc && cc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
1023 }
1024 Feptr++;
1025 Fecode += 2;
1026 }
1027
1028 else
1029 #endif /* SUPPORT_UNICODE */
1030
1031 /* Not UTF or UCP mode; use the table for characters < 256. */
1032 {
1033 if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1])
1034 != TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH);
1035 Feptr++;
1036 Fecode += 2;
1037 }
1038 break;
1039
1040
1041 /* ===================================================================== */
1042 /* Match not a single character. */
1043
1044 case OP_NOT:
1045 case OP_NOTI:
1046 if (Feptr >= mb->end_subject)
1047 {
1048 SCHECK_PARTIAL();
1049 RRETURN(MATCH_NOMATCH);
1050 }
1051
1052 #ifdef SUPPORT_UNICODE
1053 if (utf)
1054 {
1055 uint32_t ch;
1056 Fecode++;
1057 GETCHARINC(ch, Fecode);
1058 GETCHARINC(fc, Feptr);
1059 if (ch == fc)
1060 {
1061 RRETURN(MATCH_NOMATCH); /* Caseful match */
1062 }
1063 else if (Fop == OP_NOTI) /* If caseless */
1064 {
1065 if (ch > 127)
1066 ch = UCD_OTHERCASE(ch);
1067 else
1068 ch = (mb->fcc)[ch];
1069 if (ch == fc) RRETURN(MATCH_NOMATCH);
1070 }
1071 }
1072
1073 /* UCP without UTF is as above, but with one character per code unit. */
1074
1075 else if (ucp)
1076 {
1077 uint32_t ch;
1078 fc = UCHAR21INC(Feptr);
1079 ch = Fecode[1];
1080 Fecode += 2;
1081
1082 if (ch == fc)
1083 {
1084 RRETURN(MATCH_NOMATCH); /* Caseful match */
1085 }
1086 else if (Fop == OP_NOTI) /* If caseless */
1087 {
1088 if (ch > 127)
1089 ch = UCD_OTHERCASE(ch);
1090 else
1091 ch = (mb->fcc)[ch];
1092 if (ch == fc) RRETURN(MATCH_NOMATCH);
1093 }
1094 }
1095
1096 else
1097 #endif /* SUPPORT_UNICODE */
1098
1099 /* Neither UTF nor UCP is set */
1100
1101 {
1102 uint32_t ch = Fecode[1];
1103 fc = UCHAR21INC(Feptr);
1104 if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc))
1105 RRETURN(MATCH_NOMATCH);
1106 Fecode += 2;
1107 }
1108 break;
1109
1110
1111 /* ===================================================================== */
1112 /* Match a single character repeatedly. */
1113
1114 #define Loclength F->temp_size
1115 #define Lstart_eptr F->temp_sptr[0]
1116 #define Lcharptr F->temp_sptr[1]
1117 #define Lmin F->temp_32[0]
1118 #define Lmax F->temp_32[1]
1119 #define Lc F->temp_32[2]
1120 #define Loc F->temp_32[3]
1121
1122 case OP_EXACT:
1123 case OP_EXACTI:
1124 Lmin = Lmax = GET2(Fecode, 1);
1125 Fecode += 1 + IMM2_SIZE;
1126 goto REPEATCHAR;
1127
1128 case OP_POSUPTO:
1129 case OP_POSUPTOI:
1130 reptype = REPTYPE_POS;
1131 Lmin = 0;
1132 Lmax = GET2(Fecode, 1);
1133 Fecode += 1 + IMM2_SIZE;
1134 goto REPEATCHAR;
1135
1136 case OP_UPTO:
1137 case OP_UPTOI:
1138 reptype = REPTYPE_MAX;
1139 Lmin = 0;
1140 Lmax = GET2(Fecode, 1);
1141 Fecode += 1 + IMM2_SIZE;
1142 goto REPEATCHAR;
1143
1144 case OP_MINUPTO:
1145 case OP_MINUPTOI:
1146 reptype = REPTYPE_MIN;
1147 Lmin = 0;
1148 Lmax = GET2(Fecode, 1);
1149 Fecode += 1 + IMM2_SIZE;
1150 goto REPEATCHAR;
1151
1152 case OP_POSSTAR:
1153 case OP_POSSTARI:
1154 reptype = REPTYPE_POS;
1155 Lmin = 0;
1156 Lmax = UINT32_MAX;
1157 Fecode++;
1158 goto REPEATCHAR;
1159
1160 case OP_POSPLUS:
1161 case OP_POSPLUSI:
1162 reptype = REPTYPE_POS;
1163 Lmin = 1;
1164 Lmax = UINT32_MAX;
1165 Fecode++;
1166 goto REPEATCHAR;
1167
1168 case OP_POSQUERY:
1169 case OP_POSQUERYI:
1170 reptype = REPTYPE_POS;
1171 Lmin = 0;
1172 Lmax = 1;
1173 Fecode++;
1174 goto REPEATCHAR;
1175
1176 case OP_STAR:
1177 case OP_STARI:
1178 case OP_MINSTAR:
1179 case OP_MINSTARI:
1180 case OP_PLUS:
1181 case OP_PLUSI:
1182 case OP_MINPLUS:
1183 case OP_MINPLUSI:
1184 case OP_QUERY:
1185 case OP_QUERYI:
1186 case OP_MINQUERY:
1187 case OP_MINQUERYI:
1188 fc = *Fecode++ - ((Fop < OP_STARI)? OP_STAR : OP_STARI);
1189 Lmin = rep_min[fc];
1190 Lmax = rep_max[fc];
1191 reptype = rep_typ[fc];
1192
1193 /* Common code for all repeated single-character matches. We first check
1194 for the minimum number of characters. If the minimum equals the maximum, we
1195 are done. Otherwise, if minimizing, check the rest of the pattern for a
1196 match; if there isn't one, advance up to the maximum, one character at a
1197 time.
1198
1199 If maximizing, advance up to the maximum number of matching characters,
1200 until Feptr is past the end of the maximum run. If possessive, we are
1201 then done (no backing up). Otherwise, match at this position; anything
1202 other than no match is immediately returned. For nomatch, back up one
1203 character, unless we are matching \R and the last thing matched was
1204 \r\n, in which case, back up two code units until we reach the first
1205 optional character position.
1206
1207 The various UTF/non-UTF and caseful/caseless cases are handled separately,
1208 for speed. */
1209
/* Entry point shared by all repeated single-character opcodes. On arrival
Lmin, Lmax and (for non-exact repeats) reptype have been set, and Fecode
addresses the pattern character to be repeated. */

REPEATCHAR:
#ifdef SUPPORT_UNICODE
if (utf)
  {
  /* Remember where the pattern character starts (Lcharptr), find its length
  in code units (Flength), and step Fecode past it. */

  Flength = 1;
  Lcharptr = Fecode;
  GETCHARLEN(fc, Fecode, Flength);
  Fecode += Flength;

  /* Handle multi-code-unit character matching, caseful and caseless. */

  if (Flength > 1)
    {
    uint32_t othercase;

    /* For a caseless opcode whose character has a different other case, the
    other case is written to Foccu and its length kept in Loclength;
    otherwise Loclength is zero, which disables the other-case comparisons
    below. */

    if (Fop >= OP_STARI &&     /* Caseless */
        (othercase = UCD_OTHERCASE(fc)) != fc)
      Loclength = PRIV(ord2utf)(othercase, Foccu);
    else Loclength = 0;

    /* Mandatory part: each of the first Lmin repeats must match either the
    pattern character or its other case. */

    for (i = 1; i <= Lmin; i++)
      {
      if (Feptr <= mb->end_subject - Flength &&
        memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength;
      else if (Loclength > 0 &&
               Feptr <= mb->end_subject - Loclength &&
               memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)
        Feptr += Loclength;
      else
        {
        CHECK_PARTIAL();
        RRETURN(MATCH_NOMATCH);
        }
      }

    if (Lmin == Lmax) continue;

    /* Minimizing: try the rest of the pattern first, and only consume one
    more character when that fails, up to Lmax. */

    if (reptype == REPTYPE_MIN)
      {
      for (;;)
        {
        RMATCH(Fecode, RM202);
        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
        if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
        if (Feptr <= mb->end_subject - Flength &&
          memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength;
        else if (Loclength > 0 &&
                 Feptr <= mb->end_subject - Loclength &&
                 memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)
          Feptr += Loclength;
        else
          {
          CHECK_PARTIAL();
          RRETURN(MATCH_NOMATCH);
          }
        }
      /* Control never gets here */
      }

    else  /* Maximize */
      {
      /* Consume as many further repeats as match, remembering in Lstart_eptr
      where the optional part began. */

      Lstart_eptr = Feptr;
      for (i = Lmin; i < Lmax; i++)
        {
        if (Feptr <= mb->end_subject - Flength &&
            memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0)
          Feptr += Flength;
        else if (Loclength > 0 &&
                 Feptr <= mb->end_subject - Loclength &&
                 memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)
          Feptr += Loclength;
        else
          {
          CHECK_PARTIAL();
          break;
          }
        }

      /* After \C in UTF mode, Lstart_eptr might be in the middle of a
      Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
      go too far. */

      /* Non-possessive: back off one character at a time. When the loop
      ends at Lstart_eptr, the final attempt is made by the break below,
      which continues with the rest of the pattern in this frame. */

      if (reptype != REPTYPE_POS) for(;;)
        {
        if (Feptr <= Lstart_eptr) break;
        RMATCH(Fecode, RM203);
        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
        Feptr--;
        BACKCHAR(Feptr);
        }
      }
    break;   /* End of repeated wide character handling */
    }

  /* Length of UTF character is 1. Put it into the preserved variable and
  fall through to the non-UTF code. */

  Lc = fc;
  }
else
#endif  /* SUPPORT_UNICODE */

/* When not in UTF mode, load a single-code-unit character. Then proceed as
above, using Unicode casing if either UTF or UCP is set. */

Lc = *Fecode++;

/* Caseless comparison */

if (Fop >= OP_STARI)
  {
  /* Work out the other case of Lc into Loc. Which lookup is used depends on
  the code unit width and on the UTF/UCP settings. */

#if PCRE2_CODE_UNIT_WIDTH == 8
#ifdef SUPPORT_UNICODE
  if (ucp && !utf && Lc > 127) Loc = UCD_OTHERCASE(Lc);
  else
#endif  /* SUPPORT_UNICODE */
  /* Lc will be < 128 in UTF-8 mode. */
  Loc = mb->fcc[Lc];
#else /* 16-bit & 32-bit */
#ifdef SUPPORT_UNICODE
  if ((utf || ucp) && Lc > 127) Loc = UCD_OTHERCASE(Lc);
  else
#endif  /* SUPPORT_UNICODE */
  Loc = TABLE_GET(Lc, mb->fcc, Lc);
#endif  /* PCRE2_CODE_UNIT_WIDTH == 8 */

  /* Mandatory part: Lmin code units, each equal to Lc or Loc. */

  for (i = 1; i <= Lmin; i++)
    {
    uint32_t cc;                 /* Faster than PCRE2_UCHAR */
    if (Feptr >= mb->end_subject)
      {
      SCHECK_PARTIAL();
      RRETURN(MATCH_NOMATCH);
      }
    cc = UCHAR21TEST(Feptr);
    if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH);
    Feptr++;
    }
  if (Lmin == Lmax) continue;

  /* Minimizing: rest of pattern first, then one more code unit at a time. */

  if (reptype == REPTYPE_MIN)
    {
    for (;;)
      {
      uint32_t cc;               /* Faster than PCRE2_UCHAR */
      RMATCH(Fecode, RM25);
      if (rrc != MATCH_NOMATCH) RRETURN(rrc);
      if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
      if (Feptr >= mb->end_subject)
        {
        SCHECK_PARTIAL();
        RRETURN(MATCH_NOMATCH);
        }
      cc = UCHAR21TEST(Feptr);
      if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH);
      Feptr++;
      }
    /* Control never gets here */
    }

  else  /* Maximize */
    {
    Lstart_eptr = Feptr;
    for (i = Lmin; i < Lmax; i++)
      {
      uint32_t cc;               /* Faster than PCRE2_UCHAR */
      if (Feptr >= mb->end_subject)
        {
        SCHECK_PARTIAL();
        break;
        }
      cc = UCHAR21TEST(Feptr);
      if (Lc != cc && Loc != cc) break;
      Feptr++;
      }

    /* Back off one code unit per failed attempt. Note that in this loop
    Feptr is decremented before rrc is examined, so it has already moved
    back when a non-NOMATCH result is passed up. */

    if (reptype != REPTYPE_POS) for (;;)
      {
      if (Feptr == Lstart_eptr) break;
      RMATCH(Fecode, RM26);
      Feptr--;
      if (rrc != MATCH_NOMATCH) RRETURN(rrc);
      }
    }
  }

/* Caseful comparisons (includes all multi-byte characters) */

else
  {
  /* Mandatory part: Lmin code units, each equal to Lc. UCHAR21INCTEST
  advances Feptr as part of the comparison. */

  for (i = 1; i <= Lmin; i++)
    {
    if (Feptr >= mb->end_subject)
      {
      SCHECK_PARTIAL();
      RRETURN(MATCH_NOMATCH);
      }
    if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH);
    }

  if (Lmin == Lmax) continue;

  if (reptype == REPTYPE_MIN)
    {
    for (;;)
      {
      RMATCH(Fecode, RM27);
      if (rrc != MATCH_NOMATCH) RRETURN(rrc);
      if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
      if (Feptr >= mb->end_subject)
        {
        SCHECK_PARTIAL();
        RRETURN(MATCH_NOMATCH);
        }
      if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH);
      }
    /* Control never gets here */
    }
  else  /* Maximize */
    {
    Lstart_eptr = Feptr;
    for (i = Lmin; i < Lmax; i++)
      {
      if (Feptr >= mb->end_subject)
        {
        SCHECK_PARTIAL();
        break;
        }

      if (Lc != UCHAR21TEST(Feptr)) break;
      Feptr++;
      }

    /* As in the caseless loop above, Feptr is stepped back before rrc is
    tested. */

    if (reptype != REPTYPE_POS) for (;;)
      {
      if (Feptr <= Lstart_eptr) break;
      RMATCH(Fecode, RM28);
      Feptr--;
      if (rrc != MATCH_NOMATCH) RRETURN(rrc);
      }
    }
  }
break;

#undef Loclength
#undef Lstart_eptr
#undef Lcharptr
#undef Lmin
#undef Lmax
#undef Lc
#undef Loc
1460
1461
1462 /* ===================================================================== */
/* Match a negated single character repeatedly (in UTF mode the character may
occupy more than one code unit). This is almost a repeat of the code for a
repeated single character, but I haven't found a nice way of commoning these
up that doesn't require a test of the positive/negative option for each
character match. Maybe that wouldn't add very much to the time taken, but
character matching *is* what this is all about... */
1469
1470 #define Lstart_eptr F->temp_sptr[0]
1471 #define Lmin F->temp_32[0]
1472 #define Lmax F->temp_32[1]
1473 #define Lc F->temp_32[2]
1474 #define Loc F->temp_32[3]
1475
1476 case OP_NOTEXACT:
1477 case OP_NOTEXACTI:
1478 Lmin = Lmax = GET2(Fecode, 1);
1479 Fecode += 1 + IMM2_SIZE;
1480 goto REPEATNOTCHAR;
1481
1482 case OP_NOTUPTO:
1483 case OP_NOTUPTOI:
1484 Lmin = 0;
1485 Lmax = GET2(Fecode, 1);
1486 reptype = REPTYPE_MAX;
1487 Fecode += 1 + IMM2_SIZE;
1488 goto REPEATNOTCHAR;
1489
1490 case OP_NOTMINUPTO:
1491 case OP_NOTMINUPTOI:
1492 Lmin = 0;
1493 Lmax = GET2(Fecode, 1);
1494 reptype = REPTYPE_MIN;
1495 Fecode += 1 + IMM2_SIZE;
1496 goto REPEATNOTCHAR;
1497
1498 case OP_NOTPOSSTAR:
1499 case OP_NOTPOSSTARI:
1500 reptype = REPTYPE_POS;
1501 Lmin = 0;
1502 Lmax = UINT32_MAX;
1503 Fecode++;
1504 goto REPEATNOTCHAR;
1505
1506 case OP_NOTPOSPLUS:
1507 case OP_NOTPOSPLUSI:
1508 reptype = REPTYPE_POS;
1509 Lmin = 1;
1510 Lmax = UINT32_MAX;
1511 Fecode++;
1512 goto REPEATNOTCHAR;
1513
1514 case OP_NOTPOSQUERY:
1515 case OP_NOTPOSQUERYI:
1516 reptype = REPTYPE_POS;
1517 Lmin = 0;
1518 Lmax = 1;
1519 Fecode++;
1520 goto REPEATNOTCHAR;
1521
1522 case OP_NOTPOSUPTO:
1523 case OP_NOTPOSUPTOI:
1524 reptype = REPTYPE_POS;
1525 Lmin = 0;
1526 Lmax = GET2(Fecode, 1);
1527 Fecode += 1 + IMM2_SIZE;
1528 goto REPEATNOTCHAR;
1529
1530 case OP_NOTSTAR:
1531 case OP_NOTSTARI:
1532 case OP_NOTMINSTAR:
1533 case OP_NOTMINSTARI:
1534 case OP_NOTPLUS:
1535 case OP_NOTPLUSI:
1536 case OP_NOTMINPLUS:
1537 case OP_NOTMINPLUSI:
1538 case OP_NOTQUERY:
1539 case OP_NOTQUERYI:
1540 case OP_NOTMINQUERY:
1541 case OP_NOTMINQUERYI:
1542 fc = *Fecode++ - ((Fop >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
1543 Lmin = rep_min[fc];
1544 Lmax = rep_max[fc];
1545 reptype = rep_typ[fc];
1546
1547 /* Common code for all repeated single-character non-matches. */
1548
/* Entry point shared by all repeated negated-character opcodes. On arrival
Lmin, Lmax and (for non-exact repeats) reptype have been set, and Fecode
addresses the character that must NOT be matched; it is loaded into Lc and
Fecode is advanced past it. */

REPEATNOTCHAR:
GETCHARINCTEST(Lc, Fecode);

/* The code is duplicated for the caseless and caseful cases, for speed,
since matching characters is likely to be quite common. First, ensure the
minimum number of matches are present. If Lmin = Lmax, we are done.
Otherwise, if minimizing, keep trying the rest of the expression and
advancing one matching character if failing, up to the maximum.
Alternatively, if maximizing, find the maximum number of characters and
work backwards. */

if (Fop >= OP_NOTSTARI)     /* Caseless */
  {
  /* Loc receives the other case of Lc: from Unicode data for characters
  above 127 when UTF or UCP is set, otherwise from the fcc table. */

#ifdef SUPPORT_UNICODE
  if ((utf || ucp) && Lc > 127)
    Loc = UCD_OTHERCASE(Lc);
  else
#endif /* SUPPORT_UNICODE */

  Loc = TABLE_GET(Lc, mb->fcc, Lc);  /* Other case from table */

  /* Mandatory part: Lmin characters, none equal to Lc or Loc. */

#ifdef SUPPORT_UNICODE
  if (utf)
    {
    uint32_t d;
    for (i = 1; i <= Lmin; i++)
      {
      if (Feptr >= mb->end_subject)
        {
        SCHECK_PARTIAL();
        RRETURN(MATCH_NOMATCH);
        }
      GETCHARINC(d, Feptr);
      if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH);
      }
    }
  else
#endif  /* SUPPORT_UNICODE */

  /* Not UTF mode */
    {
    for (i = 1; i <= Lmin; i++)
      {
      if (Feptr >= mb->end_subject)
        {
        SCHECK_PARTIAL();
        RRETURN(MATCH_NOMATCH);
        }
      if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH);
      Feptr++;
      }
    }

  if (Lmin == Lmax) continue;  /* Finished for exact count */

  /* Minimizing: rest of pattern first, then one more character at a time. */

  if (reptype == REPTYPE_MIN)
    {
#ifdef SUPPORT_UNICODE
    if (utf)
      {
      uint32_t d;
      for (;;)
        {
        RMATCH(Fecode, RM204);
        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
        if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
        if (Feptr >= mb->end_subject)
          {
          SCHECK_PARTIAL();
          RRETURN(MATCH_NOMATCH);
          }
        GETCHARINC(d, Feptr);
        if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH);
        }
      }
    else
#endif  /*SUPPORT_UNICODE */

    /* Not UTF mode */
      {
      for (;;)
        {
        RMATCH(Fecode, RM29);
        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
        if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
        if (Feptr >= mb->end_subject)
          {
          SCHECK_PARTIAL();
          RRETURN(MATCH_NOMATCH);
          }
        if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH);
        Feptr++;
        }
      }
    /* Control never gets here */
    }

  /* Maximize case */

  else
    {
    Lstart_eptr = Feptr;

#ifdef SUPPORT_UNICODE
    if (utf)
      {
      uint32_t d;

      /* Peek at each character with GETCHARLEN so that Feptr is advanced
      only when the character is acceptable. */

      for (i = Lmin; i < Lmax; i++)
        {
        int len = 1;
        if (Feptr >= mb->end_subject)
          {
          SCHECK_PARTIAL();
          break;
          }
        GETCHARLEN(d, Feptr, len);
        if (Lc == d || Loc == d) break;
        Feptr += len;
        }

      /* After \C in UTF mode, Lstart_eptr might be in the middle of a
      Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
      go too far. */

      if (reptype != REPTYPE_POS) for(;;)
        {
        if (Feptr <= Lstart_eptr) break;
        RMATCH(Fecode, RM205);
        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
        Feptr--;
        BACKCHAR(Feptr);
        }
      }
    else
#endif  /* SUPPORT_UNICODE */

    /* Not UTF mode */
      {
      for (i = Lmin; i < Lmax; i++)
        {
        if (Feptr >= mb->end_subject)
          {
          SCHECK_PARTIAL();
          break;
          }
        if (Lc == *Feptr || Loc == *Feptr) break;
        Feptr++;
        }

      /* Back off one code unit per failed attempt; the final attempt at
      Lstart_eptr is made by falling out to the break at the end of this
      case. */

      if (reptype != REPTYPE_POS) for (;;)
        {
        if (Feptr == Lstart_eptr) break;
        RMATCH(Fecode, RM30);
        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
        Feptr--;
        }
      }
    }
  }

/* Caseful comparisons */

else
  {
  /* Mandatory part: Lmin characters, none equal to Lc. */

#ifdef SUPPORT_UNICODE
  if (utf)
    {
    uint32_t d;
    for (i = 1; i <= Lmin; i++)
      {
      if (Feptr >= mb->end_subject)
        {
        SCHECK_PARTIAL();
        RRETURN(MATCH_NOMATCH);
        }
      GETCHARINC(d, Feptr);
      if (Lc == d) RRETURN(MATCH_NOMATCH);
      }
    }
  else
#endif
  /* Not UTF mode */
    {
    for (i = 1; i <= Lmin; i++)
      {
      if (Feptr >= mb->end_subject)
        {
        SCHECK_PARTIAL();
        RRETURN(MATCH_NOMATCH);
        }
      if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH);
      }
    }

  if (Lmin == Lmax) continue;

  /* Minimizing: rest of pattern first, then one more character at a time. */

  if (reptype == REPTYPE_MIN)
    {
#ifdef SUPPORT_UNICODE
    if (utf)
      {
      uint32_t d;
      for (;;)
        {
        RMATCH(Fecode, RM206);
        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
        if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
        if (Feptr >= mb->end_subject)
          {
          SCHECK_PARTIAL();
          RRETURN(MATCH_NOMATCH);
          }
        GETCHARINC(d, Feptr);
        if (Lc == d) RRETURN(MATCH_NOMATCH);
        }
      }
    else
#endif
    /* Not UTF mode */
      {
      for (;;)
        {
        RMATCH(Fecode, RM31);
        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
        if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
        if (Feptr >= mb->end_subject)
          {
          SCHECK_PARTIAL();
          RRETURN(MATCH_NOMATCH);
          }
        if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH);
        }
      }
    /* Control never gets here */
    }

  /* Maximize case */

  else
    {
    Lstart_eptr = Feptr;

#ifdef SUPPORT_UNICODE
    if (utf)
      {
      uint32_t d;
      for (i = Lmin; i < Lmax; i++)
        {
        int len = 1;
        if (Feptr >= mb->end_subject)
          {
          SCHECK_PARTIAL();
          break;
          }
        GETCHARLEN(d, Feptr, len);
        if (Lc == d) break;
        Feptr += len;
        }

      /* After \C in UTF mode, Lstart_eptr might be in the middle of a
      Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
      go too far. */

      if (reptype != REPTYPE_POS) for(;;)
        {
        if (Feptr <= Lstart_eptr) break;
        RMATCH(Fecode, RM207);
        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
        Feptr--;
        BACKCHAR(Feptr);
        }
      }
    else
#endif
    /* Not UTF mode */
      {
      for (i = Lmin; i < Lmax; i++)
        {
        if (Feptr >= mb->end_subject)
          {
          SCHECK_PARTIAL();
          break;
          }
        if (Lc == *Feptr) break;
        Feptr++;
        }
      if (reptype != REPTYPE_POS) for (;;)
        {
        if (Feptr == Lstart_eptr) break;
        RMATCH(Fecode, RM32);
        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
        Feptr--;
        }
      }
    }
  }
break;

#undef Lstart_eptr
#undef Lmin
#undef Lmax
#undef Lc
#undef Loc
1851
1852
1853 /* ===================================================================== */
1854 /* Match a bit-mapped character class, possibly repeatedly. These opcodes
1855 are used when all the characters in the class have values in the range
1856 0-255, and either the matching is caseful, or the characters are in the
1857 range 0-127 when UTF processing is enabled. The only difference between
1858 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1859 encountered. */
1860
1861 #define Lmin F->temp_32[0]
1862 #define Lmax F->temp_32[1]
1863 #define Lstart_eptr F->temp_sptr[0]
1864 #define Lbyte_map_address F->temp_sptr[1]
1865 #define Lbyte_map ((unsigned char *)Lbyte_map_address)
1866
1867 case OP_NCLASS:
1868 case OP_CLASS:
1869 {
1870 Lbyte_map_address = Fecode + 1; /* Save for matching */
1871 Fecode += 1 + (32 / sizeof(PCRE2_UCHAR)); /* Advance past the item */
1872
1873 /* Look past the end of the item to see if there is repeat information
1874 following. Then obey similar code to character type repeats. */
1875
1876 switch (*Fecode)
1877 {
1878 case OP_CRSTAR:
1879 case OP_CRMINSTAR:
1880 case OP_CRPLUS:
1881 case OP_CRMINPLUS:
1882 case OP_CRQUERY:
1883 case OP_CRMINQUERY:
1884 case OP_CRPOSSTAR:
1885 case OP_CRPOSPLUS:
1886 case OP_CRPOSQUERY:
1887 fc = *Fecode++ - OP_CRSTAR;
1888 Lmin = rep_min[fc];
1889 Lmax = rep_max[fc];
1890 reptype = rep_typ[fc];
1891 break;
1892
1893 case OP_CRRANGE:
1894 case OP_CRMINRANGE:
1895 case OP_CRPOSRANGE:
1896 Lmin = GET2(Fecode, 1);
1897 Lmax = GET2(Fecode, 1 + IMM2_SIZE);
1898 if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */
1899 reptype = rep_typ[*Fecode - OP_CRSTAR];
1900 Fecode += 1 + 2 * IMM2_SIZE;
1901 break;
1902
1903 default: /* No repeat follows */
1904 Lmin = Lmax = 1;
1905 break;
1906 }
1907
1908 /* First, ensure the minimum number of matches are present. */
1909
1910 #ifdef SUPPORT_UNICODE
1911 if (utf)
1912 {
1913 for (i = 1; i <= Lmin; i++)
1914 {
1915 if (Feptr >= mb->end_subject)
1916 {
1917 SCHECK_PARTIAL();
1918 RRETURN(MATCH_NOMATCH);
1919 }
1920 GETCHARINC(fc, Feptr);
1921 if (fc > 255)
1922 {
1923 if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
1924 }
1925 else
1926 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
1927 }
1928 }
1929 else
1930 #endif
1931 /* Not UTF mode */
1932 {
1933 for (i = 1; i <= Lmin; i++)
1934 {
1935 if (Feptr >= mb->end_subject)
1936 {
1937 SCHECK_PARTIAL();
1938 RRETURN(MATCH_NOMATCH);
1939 }
1940 fc = *Feptr++;
1941 #if PCRE2_CODE_UNIT_WIDTH != 8
1942 if (fc > 255)
1943 {
1944 if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
1945 }
1946 else
1947 #endif
1948 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
1949 }
1950 }
1951
1952 /* If Lmax == Lmin we are done. Continue with main loop. */
1953
1954 if (Lmin == Lmax) continue;
1955
1956 /* If minimizing, keep testing the rest of the expression and advancing
1957 the pointer while it matches the class. */
1958
1959 if (reptype == REPTYPE_MIN)
1960 {
1961 #ifdef SUPPORT_UNICODE
1962 if (utf)
1963 {
1964 for (;;)
1965 {
1966 RMATCH(Fecode, RM200);
1967 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1968 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1969 if (Feptr >= mb->end_subject)
1970 {
1971 SCHECK_PARTIAL();
1972 RRETURN(MATCH_NOMATCH);
1973 }
1974 GETCHARINC(fc, Feptr);
1975 if (fc > 255)
1976 {
1977 if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
1978 }
1979 else
1980 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
1981 }
1982 }
1983 else
1984 #endif
1985 /* Not UTF mode */
1986 {
1987 for (;;)
1988 {
1989 RMATCH(Fecode, RM23);
1990 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1991 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
1992 if (Feptr >= mb->end_subject)
1993 {
1994 SCHECK_PARTIAL();
1995 RRETURN(MATCH_NOMATCH);
1996 }
1997 fc = *Feptr++;
1998 #if PCRE2_CODE_UNIT_WIDTH != 8
1999 if (fc > 255)
2000 {
2001 if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);
2002 }
2003 else
2004 #endif
2005 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);
2006 }
2007 }
2008 /* Control never gets here */
2009 }
2010
2011 /* If maximizing, find the longest possible run, then work backwards. */
2012
2013 else
2014 {
2015 Lstart_eptr = Feptr;
2016
2017 #ifdef SUPPORT_UNICODE
2018 if (utf)
2019 {
2020 for (i = Lmin; i < Lmax; i++)
2021 {
2022 int len = 1;
2023 if (Feptr >= mb->end_subject)
2024 {
2025 SCHECK_PARTIAL();
2026 break;
2027 }
2028 GETCHARLEN(fc, Feptr, len);
2029 if (fc > 255)
2030 {
2031 if (Fop == OP_CLASS) break;
2032 }
2033 else
2034 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break;
2035 Feptr += len;
2036 }
2037
2038 if (reptype == REPTYPE_POS) continue; /* No backtracking */
2039
2040 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
2041 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
2042 go too far. */
2043
2044 for (;;)
2045 {
2046 RMATCH(Fecode, RM201);
2047 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2048 if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */
2049 BACKCHAR(Feptr);
2050 }
2051 }
2052 else
2053 #endif
2054 /* Not UTF mode */
2055 {
2056 for (i = Lmin; i < Lmax; i++)
2057 {
2058 if (Feptr >= mb->end_subject)
2059 {
2060 SCHECK_PARTIAL();
2061 break;
2062 }
2063 fc = *Feptr;
2064 #if PCRE2_CODE_UNIT_WIDTH != 8
2065 if (fc > 255)
2066 {
2067 if (Fop == OP_CLASS) break;
2068 }
2069 else
2070 #endif
2071 if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break;
2072 Feptr++;
2073 }
2074
2075 if (reptype == REPTYPE_POS) continue; /* No backtracking */
2076
2077 while (Feptr >= Lstart_eptr)
2078 {
2079 RMATCH(Fecode, RM24);
2080 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2081 Feptr--;
2082 }
2083 }
2084
2085 RRETURN(MATCH_NOMATCH);
2086 }
2087 }
2088 /* Control never gets here */
2089
2090 #undef Lbyte_map_address
2091 #undef Lbyte_map
2092 #undef Lstart_eptr
2093 #undef Lmin
2094 #undef Lmax
2095
2096
2097 /* ===================================================================== */
/* Match an extended character class. In the 8-bit library, this opcode is
encountered only when UTF-8 mode is supported. In the 16-bit and
32-bit libraries, codepoints greater than 255 may be encountered even when
UTF is not supported. */
2102
#define Lstart_eptr  F->temp_sptr[0]
#define Lxclass_data F->temp_sptr[1]
#define Lmin         F->temp_32[0]
#define Lmax         F->temp_32[1]

#ifdef SUPPORT_WIDE_CHARS

/* OP_XCLASS: the class data starts LINK_SIZE code units after the opcode and
the item's total length is held in the link field. Each subject character is
tested by PRIV(xclass)(). An optional repeat opcode may follow the item. */

case OP_XCLASS:
  {
  Lxclass_data = Fecode + 1 + LINK_SIZE;  /* Save for matching */
  Fecode += GET(Fecode, 1);               /* Advance past the item */

  switch (*Fecode)
    {
    case OP_CRSTAR:
    case OP_CRMINSTAR:
    case OP_CRPLUS:
    case OP_CRMINPLUS:
    case OP_CRQUERY:
    case OP_CRMINQUERY:
    case OP_CRPOSSTAR:
    case OP_CRPOSPLUS:
    case OP_CRPOSQUERY:
    fc = *Fecode++ - OP_CRSTAR;
    Lmin = rep_min[fc];
    Lmax = rep_max[fc];
    reptype = rep_typ[fc];
    break;

    case OP_CRRANGE:
    case OP_CRMINRANGE:
    case OP_CRPOSRANGE:
    Lmin = GET2(Fecode, 1);
    Lmax = GET2(Fecode, 1 + IMM2_SIZE);
    if (Lmax == 0) Lmax = UINT32_MAX;  /* Max 0 => infinity */
    reptype = rep_typ[*Fecode - OP_CRSTAR];
    Fecode += 1 + 2 * IMM2_SIZE;
    break;

    default:               /* No repeat follows */
    Lmin = Lmax = 1;
    break;
    }

  /* First, ensure the minimum number of matches are present. */

  for (i = 1; i <= Lmin; i++)
    {
    if (Feptr >= mb->end_subject)
      {
      SCHECK_PARTIAL();
      RRETURN(MATCH_NOMATCH);
      }
    GETCHARINCTEST(fc, Feptr);
    if (!PRIV(xclass)(fc, Lxclass_data, utf)) RRETURN(MATCH_NOMATCH);
    }

  /* If Lmax == Lmin we can just continue with the main loop. */

  if (Lmin == Lmax) continue;

  /* If minimizing, keep testing the rest of the expression and advancing
  the pointer while it matches the class. */

  if (reptype == REPTYPE_MIN)
    {
    for (;;)
      {
      RMATCH(Fecode, RM100);
      if (rrc != MATCH_NOMATCH) RRETURN(rrc);
      if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
      if (Feptr >= mb->end_subject)
        {
        SCHECK_PARTIAL();
        RRETURN(MATCH_NOMATCH);
        }
      GETCHARINCTEST(fc, Feptr);
      if (!PRIV(xclass)(fc, Lxclass_data, utf)) RRETURN(MATCH_NOMATCH);
      }
    /* Control never gets here */
    }

  /* If maximizing, find the longest possible run, then work backwards. */

  else
    {
    Lstart_eptr = Feptr;
    for (i = Lmin; i < Lmax; i++)
      {
      int len = 1;
      if (Feptr >= mb->end_subject)
        {
        SCHECK_PARTIAL();
        break;
        }
#ifdef SUPPORT_UNICODE
      GETCHARLENTEST(fc, Feptr, len);
#else
      fc = *Feptr;
#endif
      if (!PRIV(xclass)(fc, Lxclass_data, utf)) break;
      Feptr += len;
      }

    if (reptype == REPTYPE_POS) continue;    /* No backtracking */

    /* After \C in UTF mode, Lstart_eptr might be in the middle of a
    Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
    go too far. The position is tested BEFORE Feptr is decremented: when the
    run starts at the first code unit of the subject, decrementing after the
    last attempt would form a pointer in front of the buffer, which is
    undefined behaviour in C. */

    for(;;)
      {
      RMATCH(Fecode, RM101);
      if (rrc != MATCH_NOMATCH) RRETURN(rrc);
      if (Feptr <= Lstart_eptr) break;    /* Tried at original position */
      Feptr--;
#ifdef SUPPORT_UNICODE
      if (utf) BACKCHAR(Feptr);
#endif
      }
    RRETURN(MATCH_NOMATCH);
    }

  /* Control never gets here */
  }
#endif  /* SUPPORT_WIDE_CHARS: end of XCLASS */

#undef Lstart_eptr
#undef Lxclass_data
#undef Lmin
#undef Lmax
2232
2233
2234 /* ===================================================================== */
2235 /* Match various character types when PCRE2_UCP is not set. These opcodes
2236 are not generated when PCRE2_UCP is set - instead appropriate property
2237 tests are compiled. */
2238
2239 case OP_NOT_DIGIT:
2240 if (Feptr >= mb->end_subject)
2241 {
2242 SCHECK_PARTIAL();
2243 RRETURN(MATCH_NOMATCH);
2244 }
2245 GETCHARINCTEST(fc, Feptr);
2246 if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0)
2247 RRETURN(MATCH_NOMATCH);
2248 Fecode++;
2249 break;
2250
2251 case OP_DIGIT:
2252 if (Feptr >= mb->end_subject)
2253 {
2254 SCHECK_PARTIAL();
2255 RRETURN(MATCH_NOMATCH);
2256 }
2257 GETCHARINCTEST(fc, Feptr);
2258 if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0)
2259 RRETURN(MATCH_NOMATCH);
2260 Fecode++;
2261 break;
2262
2263 case OP_NOT_WHITESPACE:
2264 if (Feptr >= mb->end_subject)
2265 {
2266 SCHECK_PARTIAL();
2267 RRETURN(MATCH_NOMATCH);
2268 }
2269 GETCHARINCTEST(fc, Feptr);
2270 if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0)
2271 RRETURN(MATCH_NOMATCH);
2272 Fecode++;
2273 break;
2274
2275 case OP_WHITESPACE:
2276 if (Feptr >= mb->end_subject)
2277 {
2278 SCHECK_PARTIAL();
2279 RRETURN(MATCH_NOMATCH);
2280 }
2281 GETCHARINCTEST(fc, Feptr);
2282 if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0)
2283 RRETURN(MATCH_NOMATCH);
2284 Fecode++;
2285 break;
2286
2287 case OP_NOT_WORDCHAR:
2288 if (Feptr >= mb->end_subject)
2289 {
2290 SCHECK_PARTIAL();
2291 RRETURN(MATCH_NOMATCH);
2292 }
2293 GETCHARINCTEST(fc, Feptr);
2294 if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0)
2295 RRETURN(MATCH_NOMATCH);
2296 Fecode++;
2297 break;
2298
2299 case OP_WORDCHAR:
2300 if (Feptr >= mb->end_subject)
2301 {
2302 SCHECK_PARTIAL();
2303 RRETURN(MATCH_NOMATCH);
2304 }
2305 GETCHARINCTEST(fc, Feptr);
2306 if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0)
2307 RRETURN(MATCH_NOMATCH);
2308 Fecode++;
2309 break;
2310
2311 case OP_ANYNL:
2312 if (Feptr >= mb->end_subject)
2313 {
2314 SCHECK_PARTIAL();
2315 RRETURN(MATCH_NOMATCH);
2316 }
2317 GETCHARINCTEST(fc, Feptr);
2318 switch(fc)
2319 {
2320 default: RRETURN(MATCH_NOMATCH);
2321
2322 case CHAR_CR:
2323 if (Feptr >= mb->end_subject)
2324 {
2325 SCHECK_PARTIAL();
2326 }
2327 else if (UCHAR21TEST(Feptr) == CHAR_LF) Feptr++;
2328 break;
2329
2330 case CHAR_LF:
2331 break;
2332
2333 case CHAR_VT:
2334 case CHAR_FF:
2335 case CHAR_NEL:
2336 #ifndef EBCDIC
2337 case 0x2028:
2338 case 0x2029:
2339 #endif /* Not EBCDIC */
2340 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
2341 break;
2342 }
2343 Fecode++;
2344 break;
2345
2346 case OP_NOT_HSPACE:
2347 if (Feptr >= mb->end_subject)
2348 {
2349 SCHECK_PARTIAL();
2350 RRETURN(MATCH_NOMATCH);
2351 }
2352 GETCHARINCTEST(fc, Feptr);
2353 switch(fc)
2354 {
2355 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2356 default: break;
2357 }
2358 Fecode++;
2359 break;
2360
2361 case OP_HSPACE:
2362 if (Feptr >= mb->end_subject)
2363 {
2364 SCHECK_PARTIAL();
2365 RRETURN(MATCH_NOMATCH);
2366 }
2367 GETCHARINCTEST(fc, Feptr);
2368 switch(fc)
2369 {
2370 HSPACE_CASES: break; /* Byte and multibyte cases */
2371 default: RRETURN(MATCH_NOMATCH);
2372 }
2373 Fecode++;
2374 break;
2375
2376 case OP_NOT_VSPACE:
2377 if (Feptr >= mb->end_subject)
2378 {
2379 SCHECK_PARTIAL();
2380 RRETURN(MATCH_NOMATCH);
2381 }
2382 GETCHARINCTEST(fc, Feptr);
2383 switch(fc)
2384 {
2385 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2386 default: break;
2387 }
2388 Fecode++;
2389 break;
2390
2391 case OP_VSPACE:
2392 if (Feptr >= mb->end_subject)
2393 {
2394 SCHECK_PARTIAL();
2395 RRETURN(MATCH_NOMATCH);
2396 }
2397 GETCHARINCTEST(fc, Feptr);
2398 switch(fc)
2399 {
2400 VSPACE_CASES: break;
2401 default: RRETURN(MATCH_NOMATCH);
2402 }
2403 Fecode++;
2404 break;
2405
2406
2407 #ifdef SUPPORT_UNICODE
2408
2409 /* ===================================================================== */
2410 /* Check the next character by Unicode property. We will get here only
2411 if the support is in the binary; otherwise a compile-time error occurs. */
2412
2413 case OP_PROP:
2414 case OP_NOTPROP:
2415 if (Feptr >= mb->end_subject)
2416 {
2417 SCHECK_PARTIAL();
2418 RRETURN(MATCH_NOMATCH);
2419 }
2420 GETCHARINCTEST(fc, Feptr);
2421 {
2422 const uint32_t *cp;
2423 const ucd_record *prop = GET_UCD(fc);
2424
2425 switch(Fecode[1])
2426 {
2427 case PT_ANY:
2428 if (Fop == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2429 break;
2430
2431 case PT_LAMP:
2432 if ((prop->chartype == ucp_Lu ||
2433 prop->chartype == ucp_Ll ||
2434 prop->chartype == ucp_Lt) == (Fop == OP_NOTPROP))
2435 RRETURN(MATCH_NOMATCH);
2436 break;
2437
2438 case PT_GC:
2439 if ((Fecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (Fop == OP_PROP))
2440 RRETURN(MATCH_NOMATCH);
2441 break;
2442
2443 case PT_PC:
2444 if ((Fecode[2] != prop->chartype) == (Fop == OP_PROP))
2445 RRETURN(MATCH_NOMATCH);
2446 break;
2447
2448 case PT_SC:
2449 if ((Fecode[2] != prop->script) == (Fop == OP_PROP))
2450 RRETURN(MATCH_NOMATCH);
2451 break;
2452
2453 /* These are specials */
2454
2455 case PT_ALNUM:
2456 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2457 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (Fop == OP_NOTPROP))
2458 RRETURN(MATCH_NOMATCH);
2459 break;
2460
2461 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2462 which means that Perl space and POSIX space are now identical. PCRE
2463 was changed at release 8.34. */
2464
2465 case PT_SPACE: /* Perl space */
2466 case PT_PXSPACE: /* POSIX space */
2467 switch(fc)
2468 {
2469 HSPACE_CASES:
2470 VSPACE_CASES:
2471 if (Fop == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2472 break;
2473
2474 default:
2475 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) ==
2476 (Fop == OP_NOTPROP)) RRETURN(MATCH_NOMATCH);
2477 break;
2478 }
2479 break;
2480
2481 case PT_WORD:
2482 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2483 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2484 fc == CHAR_UNDERSCORE) == (Fop == OP_NOTPROP))
2485 RRETURN(MATCH_NOMATCH);
2486 break;
2487
2488 case PT_CLIST:
2489 cp = PRIV(ucd_caseless_sets) + Fecode[2];
2490 for (;;)
2491 {
2492 if (fc < *cp)
2493 { if (Fop == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2494 if (fc == *cp++)
2495 { if (Fop == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2496 }
2497 break;
2498
2499 case PT_UCNC:
2500 if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
2501 fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
2502 fc >= 0xe000) == (Fop == OP_NOTPROP))
2503 RRETURN(MATCH_NOMATCH);
2504 break;
2505
2506 /* This should never occur */
2507
2508 default:
2509 return PCRE2_ERROR_INTERNAL;
2510 }
2511
2512 Fecode += 3;
2513 }
2514 break;
2515
2516
2517 /* ===================================================================== */
2518 /* Match an extended Unicode sequence. We will get here only if the support
2519 is in the binary; otherwise a compile-time error occurs. */
2520
2521 case OP_EXTUNI:
2522 if (Feptr >= mb->end_subject)
2523 {
2524 SCHECK_PARTIAL();
2525 RRETURN(MATCH_NOMATCH);
2526 }
2527 else
2528 {
2529 GETCHARINCTEST(fc, Feptr);
2530 Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, utf,
2531 NULL);
2532 }
2533 CHECK_PARTIAL();
2534 Fecode++;
2535 break;
2536
2537 #endif /* SUPPORT_UNICODE */
2538
2539
2540 /* ===================================================================== */
2541 /* Match a single character type repeatedly. Note that the property type
2542 does not need to be in a stack frame as it is not used within an RMATCH()
2543 loop. */
2544
2545 #define Lstart_eptr F->temp_sptr[0]
2546 #define Lmin F->temp_32[0]
2547 #define Lmax F->temp_32[1]
2548 #define Lctype F->temp_32[2]
2549 #define Lpropvalue F->temp_32[3]
2550
2551 case OP_TYPEEXACT:
2552 Lmin = Lmax = GET2(Fecode, 1);
2553 Fecode += 1 + IMM2_SIZE;
2554 goto REPEATTYPE;
2555
2556 case OP_TYPEUPTO:
2557 case OP_TYPEMINUPTO:
2558 Lmin = 0;
2559 Lmax = GET2(Fecode, 1);
2560 reptype = (*Fecode == OP_TYPEMINUPTO)? REPTYPE_MIN : REPTYPE_MAX;
2561 Fecode += 1 + IMM2_SIZE;
2562 goto REPEATTYPE;
2563
2564 case OP_TYPEPOSSTAR:
2565 reptype = REPTYPE_POS;
2566 Lmin = 0;
2567 Lmax = UINT32_MAX;
2568 Fecode++;
2569 goto REPEATTYPE;
2570
2571 case OP_TYPEPOSPLUS:
2572 reptype = REPTYPE_POS;
2573 Lmin = 1;
2574 Lmax = UINT32_MAX;
2575 Fecode++;
2576 goto REPEATTYPE;
2577
2578 case OP_TYPEPOSQUERY:
2579 reptype = REPTYPE_POS;
2580 Lmin = 0;
2581 Lmax = 1;
2582 Fecode++;
2583 goto REPEATTYPE;
2584
2585 case OP_TYPEPOSUPTO:
2586 reptype = REPTYPE_POS;
2587 Lmin = 0;
2588 Lmax = GET2(Fecode, 1);
2589 Fecode += 1 + IMM2_SIZE;
2590 goto REPEATTYPE;
2591
2592 case OP_TYPESTAR:
2593 case OP_TYPEMINSTAR:
2594 case OP_TYPEPLUS:
2595 case OP_TYPEMINPLUS:
2596 case OP_TYPEQUERY:
2597 case OP_TYPEMINQUERY:
2598 fc = *Fecode++ - OP_TYPESTAR;
2599 Lmin = rep_min[fc];
2600 Lmax = rep_max[fc];
2601 reptype = rep_typ[fc];
2602
2603 /* Common code for all repeated character type matches. */
2604
2605 REPEATTYPE:
2606 Lctype = *Fecode++; /* Code for the character type */
2607
2608 #ifdef SUPPORT_UNICODE
2609 if (Lctype == OP_PROP || Lctype == OP_NOTPROP)
2610 {
2611 proptype = *Fecode++;
2612 Lpropvalue = *Fecode++;
2613 }
2614 else proptype = -1;
2615 #endif
2616
2617 /* First, ensure the minimum number of matches are present. Use inline
2618 code for maximizing the speed, and do the type test once at the start
2619 (i.e. keep it out of the loop). The code for UTF mode is separated out for
2620 tidiness, except for Unicode property tests. */
2621
2622 if (Lmin > 0)
2623 {
2624 #ifdef SUPPORT_UNICODE
2625 if (proptype >= 0) /* Property tests in all modes */
2626 {
2627 switch(proptype)
2628 {
2629 case PT_ANY:
2630 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2631 for (i = 1; i <= Lmin; i++)
2632 {
2633 if (Feptr >= mb->end_subject)
2634 {
2635 SCHECK_PARTIAL();
2636 RRETURN(MATCH_NOMATCH);
2637 }
2638 GETCHARINCTEST(fc, Feptr);
2639 }
2640 break;
2641
2642 case PT_LAMP:
2643 for (i = 1; i <= Lmin; i++)
2644 {
2645 int chartype;
2646 if (Feptr >= mb->end_subject)
2647 {
2648 SCHECK_PARTIAL();
2649 RRETURN(MATCH_NOMATCH);
2650 }
2651 GETCHARINCTEST(fc, Feptr);
2652 chartype = UCD_CHARTYPE(fc);
2653 if ((chartype == ucp_Lu ||
2654 chartype == ucp_Ll ||
2655 chartype == ucp_Lt) == (Lctype == OP_NOTPROP))
2656 RRETURN(MATCH_NOMATCH);
2657 }
2658 break;
2659
2660 case PT_GC:
2661 for (i = 1; i <= Lmin; i++)
2662 {
2663 if (Feptr >= mb->end_subject)
2664 {
2665 SCHECK_PARTIAL();
2666 RRETURN(MATCH_NOMATCH);
2667 }
2668 GETCHARINCTEST(fc, Feptr);
2669 if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
2670 RRETURN(MATCH_NOMATCH);
2671 }
2672 break;
2673
2674 case PT_PC:
2675 for (i = 1; i <= Lmin; i++)
2676 {
2677 if (Feptr >= mb->end_subject)
2678 {
2679 SCHECK_PARTIAL();
2680 RRETURN(MATCH_NOMATCH);
2681 }
2682 GETCHARINCTEST(fc, Feptr);
2683 if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
2684 RRETURN(MATCH_NOMATCH);
2685 }
2686 break;
2687
2688 case PT_SC:
2689 for (i = 1; i <= Lmin; i++)
2690 {
2691 if (Feptr >= mb->end_subject)
2692 {
2693 SCHECK_PARTIAL();
2694 RRETURN(MATCH_NOMATCH);
2695 }
2696 GETCHARINCTEST(fc, Feptr);
2697 if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
2698 RRETURN(MATCH_NOMATCH);
2699 }
2700 break;
2701
2702 case PT_ALNUM:
2703 for (i = 1; i <= Lmin; i++)
2704 {
2705 int category;
2706 if (Feptr >= mb->end_subject)
2707 {
2708 SCHECK_PARTIAL();
2709 RRETURN(MATCH_NOMATCH);
2710 }
2711 GETCHARINCTEST(fc, Feptr);
2712 category = UCD_CATEGORY(fc);
2713 if ((category == ucp_L || category == ucp_N) == (Lctype == OP_NOTPROP))
2714 RRETURN(MATCH_NOMATCH);
2715 }
2716 break;
2717
2718 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2719 which means that Perl space and POSIX space are now identical. PCRE
2720 was changed at release 8.34. */
2721
2722 case PT_SPACE: /* Perl space */
2723 case PT_PXSPACE: /* POSIX space */
2724 for (i = 1; i <= Lmin; i++)
2725 {
2726 if (Feptr >= mb->end_subject)
2727 {
2728 SCHECK_PARTIAL();
2729 RRETURN(MATCH_NOMATCH);
2730 }
2731 GETCHARINCTEST(fc, Feptr);
2732 switch(fc)
2733 {
2734 HSPACE_CASES:
2735 VSPACE_CASES:
2736 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2737 break;
2738
2739 default:
2740 if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP))
2741 RRETURN(MATCH_NOMATCH);
2742 break;
2743 }
2744 }
2745 break;
2746
2747 case PT_WORD:
2748 for (i = 1; i <= Lmin; i++)
2749 {
2750 int category;
2751 if (Feptr >= mb->end_subject)
2752 {
2753 SCHECK_PARTIAL();
2754 RRETURN(MATCH_NOMATCH);
2755 }
2756 GETCHARINCTEST(fc, Feptr);
2757 category = UCD_CATEGORY(fc);
2758 if ((category == ucp_L || category == ucp_N ||
2759 fc == CHAR_UNDERSCORE) == (Lctype == OP_NOTPROP))
2760 RRETURN(MATCH_NOMATCH);
2761 }
2762 break;
2763
2764 case PT_CLIST:
2765 for (i = 1; i <= Lmin; i++)
2766 {
2767 const uint32_t *cp;
2768 if (Feptr >= mb->end_subject)
2769 {
2770 SCHECK_PARTIAL();
2771 RRETURN(MATCH_NOMATCH);
2772 }
2773 GETCHARINCTEST(fc, Feptr);
2774 cp = PRIV(ucd_caseless_sets) + Lpropvalue;
2775 for (;;)
2776 {
2777 if (fc < *cp)
2778 {
2779 if (Lctype == OP_NOTPROP) break;
2780 RRETURN(MATCH_NOMATCH);
2781 }
2782 if (fc == *cp++)
2783 {
2784 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2785 break;
2786 }
2787 }
2788 }
2789 break;
2790
2791 case PT_UCNC:
2792 for (i = 1; i <= Lmin; i++)
2793 {
2794 if (Feptr >= mb->end_subject)
2795 {
2796 SCHECK_PARTIAL();
2797 RRETURN(MATCH_NOMATCH);
2798 }
2799 GETCHARINCTEST(fc, Feptr);
2800 if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
2801 fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
2802 fc >= 0xe000) == (Lctype == OP_NOTPROP))
2803 RRETURN(MATCH_NOMATCH);
2804 }
2805 break;
2806
2807 /* This should not occur */
2808
2809 default:
2810 return PCRE2_ERROR_INTERNAL;
2811 }
2812 }
2813
2814 /* Match extended Unicode sequences. We will get here only if the
2815 support is in the binary; otherwise a compile-time error occurs. */
2816
2817 else if (Lctype == OP_EXTUNI)
2818 {
2819 for (i = 1; i <= Lmin; i++)
2820 {
2821 if (Feptr >= mb->end_subject)
2822 {
2823 SCHECK_PARTIAL();
2824 RRETURN(MATCH_NOMATCH);
2825 }
2826 else
2827 {
2828 GETCHARINCTEST(fc, Feptr);
2829 Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject,
2830 mb->end_subject, utf, NULL);
2831 }
2832 CHECK_PARTIAL();
2833 }
2834 }
2835 else
2836 #endif /* SUPPORT_UNICODE */
2837
2838 /* Handle all other cases in UTF mode */
2839
2840 #ifdef SUPPORT_UNICODE
2841 if (utf) switch(Lctype)
2842 {
2843 case OP_ANY:
2844 for (i = 1; i <= Lmin; i++)
2845 {
2846 if (Feptr >= mb->end_subject)
2847 {
2848 SCHECK_PARTIAL();
2849 RRETURN(MATCH_NOMATCH);
2850 }
2851 if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
2852 if (mb->partial != 0 &&
2853 Feptr + 1 >= mb->end_subject &&
2854 NLBLOCK->nltype == NLTYPE_FIXED &&
2855 NLBLOCK->nllen == 2 &&
2856 UCHAR21(Feptr) == NLBLOCK->nl[0])
2857 {
2858 mb->hitend = TRUE;
2859 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
2860 }
2861 Feptr++;
2862 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
2863 }
2864 break;
2865
2866 case OP_ALLANY:
2867 for (i = 1; i <= Lmin; i++)
2868 {
2869 if (Feptr >= mb->end_subject)
2870 {
2871 SCHECK_PARTIAL();
2872 RRETURN(MATCH_NOMATCH);
2873 }
2874 Feptr++;
2875 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
2876 }
2877 break;
2878
2879 case OP_ANYBYTE:
2880 if (Feptr > mb->end_subject - Lmin) RRETURN(MATCH_NOMATCH);
2881 Feptr += Lmin;
2882 break;
2883
2884 case OP_ANYNL:
2885 for (i = 1; i <= Lmin; i++)
2886 {
2887 if (Feptr >= mb->end_subject)
2888 {
2889 SCHECK_PARTIAL();
2890 RRETURN(MATCH_NOMATCH);
2891 }
2892 GETCHARINC(fc, Feptr);
2893 switch(fc)
2894 {
2895 default: RRETURN(MATCH_NOMATCH);
2896
2897 case CHAR_CR:
2898 if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++;
2899 break;
2900
2901 case CHAR_LF:
2902 break;
2903
2904 case CHAR_VT:
2905 case CHAR_FF:
2906 case CHAR_NEL:
2907 #ifndef EBCDIC
2908 case 0x2028:
2909 case 0x2029:
2910 #endif /* Not EBCDIC */
2911 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
2912 break;
2913 }
2914 }
2915 break;
2916
2917 case OP_NOT_HSPACE:
2918 for (i = 1; i <= Lmin; i++)
2919 {
2920 if (Feptr >= mb->end_subject)
2921 {
2922 SCHECK_PARTIAL();
2923 RRETURN(MATCH_NOMATCH);
2924 }
2925 GETCHARINC(fc, Feptr);
2926 switch(fc)
2927 {
2928 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
2929 default: break;
2930 }
2931 }
2932 break;
2933
2934 case OP_HSPACE:
2935 for (i = 1; i <= Lmin; i++)
2936 {
2937 if (Feptr >= mb->end_subject)
2938 {
2939 SCHECK_PARTIAL();
2940 RRETURN(MATCH_NOMATCH);
2941 }
2942 GETCHARINC(fc, Feptr);
2943 switch(fc)
2944 {
2945 HSPACE_CASES: break;
2946 default: RRETURN(MATCH_NOMATCH);
2947 }
2948 }
2949 break;
2950
2951 case OP_NOT_VSPACE:
2952 for (i = 1; i <= Lmin; i++)
2953 {
2954 if (Feptr >= mb->end_subject)
2955 {
2956 SCHECK_PARTIAL();
2957 RRETURN(MATCH_NOMATCH);
2958 }
2959 GETCHARINC(fc, Feptr);
2960 switch(fc)
2961 {
2962 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2963 default: break;
2964 }
2965 }
2966 break;
2967
2968 case OP_VSPACE:
2969 for (i = 1; i <= Lmin; i++)
2970 {
2971 if (Feptr >= mb->end_subject)
2972 {
2973 SCHECK_PARTIAL();
2974 RRETURN(MATCH_NOMATCH);
2975 }
2976 GETCHARINC(fc, Feptr);
2977 switch(fc)
2978 {
2979 VSPACE_CASES: break;
2980 default: RRETURN(MATCH_NOMATCH);
2981 }
2982 }
2983 break;
2984
2985 case OP_NOT_DIGIT:
2986 for (i = 1; i <= Lmin; i++)
2987 {
2988 if (Feptr >= mb->end_subject)
2989 {
2990 SCHECK_PARTIAL();
2991 RRETURN(MATCH_NOMATCH);
2992 }
2993 GETCHARINC(fc, Feptr);
2994 if (fc < 128 && (mb->ctypes[fc] & ctype_digit) != 0)
2995 RRETURN(MATCH_NOMATCH);
2996 }
2997 break;
2998
2999 case OP_DIGIT:
3000 for (i = 1; i <= Lmin; i++)
3001 {
3002 uint32_t cc;
3003 if (Feptr >= mb->end_subject)
3004 {
3005 SCHECK_PARTIAL();
3006 RRETURN(MATCH_NOMATCH);
3007 }
3008 cc = UCHAR21(Feptr);
3009 if (cc >= 128 || (mb->ctypes[cc] & ctype_digit) == 0)
3010 RRETURN(MATCH_NOMATCH);
3011 Feptr++;
3012 /* No need to skip more code units - we know it has only one. */
3013 }
3014 break;
3015
3016 case OP_NOT_WHITESPACE:
3017 for (i = 1; i <= Lmin; i++)
3018 {
3019 uint32_t cc;
3020 if (Feptr >= mb->end_subject)
3021 {
3022 SCHECK_PARTIAL();
3023 RRETURN(MATCH_NOMATCH);
3024 }
3025 cc = UCHAR21(Feptr);
3026 if (cc < 128 && (mb->ctypes[cc] & ctype_space) != 0)
3027 RRETURN(MATCH_NOMATCH);
3028 Feptr++;
3029 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
3030 }
3031 break;
3032
3033 case OP_WHITESPACE:
3034 for (i = 1; i <= Lmin; i++)
3035 {
3036 uint32_t cc;
3037 if (Feptr >= mb->end_subject)
3038 {
3039 SCHECK_PARTIAL();
3040 RRETURN(MATCH_NOMATCH);
3041 }
3042 cc = UCHAR21(Feptr);
3043 if (cc >= 128 || (mb->ctypes[cc] & ctype_space) == 0)
3044 RRETURN(MATCH_NOMATCH);
3045 Feptr++;
3046 /* No need to skip more code units - we know it has only one. */
3047 }
3048 break;
3049
3050 case OP_NOT_WORDCHAR:
3051 for (i = 1; i <= Lmin; i++)
3052 {
3053 uint32_t cc;
3054 if (Feptr >= mb->end_subject)
3055 {
3056 SCHECK_PARTIAL();
3057 RRETURN(MATCH_NOMATCH);
3058 }
3059 cc = UCHAR21(Feptr);
3060 if (cc < 128 && (mb->ctypes[cc] & ctype_word) != 0)
3061 RRETURN(MATCH_NOMATCH);
3062 Feptr++;
3063 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
3064 }
3065 break;
3066
3067 case OP_WORDCHAR:
3068 for (i = 1; i <= Lmin; i++)
3069 {
3070 uint32_t cc;
3071 if (Feptr >= mb->end_subject)
3072 {
3073 SCHECK_PARTIAL();
3074 RRETURN(MATCH_NOMATCH);
3075 }
3076 cc = UCHAR21(Feptr);
3077 if (cc >= 128 || (mb->ctypes[cc] & ctype_word) == 0)
3078 RRETURN(MATCH_NOMATCH);
3079 Feptr++;
3080 /* No need to skip more code units - we know it has only one. */
3081 }
3082 break;
3083
3084 default:
3085 return PCRE2_ERROR_INTERNAL;
3086 } /* End switch(Lctype) */
3087
3088 else
3089 #endif /* SUPPORT_UNICODE */
3090
3091 /* Code for the non-UTF case for minimum matching of operators other
3092 than OP_PROP and OP_NOTPROP. */
3093
3094 switch(Lctype)
3095 {
3096 case OP_ANY:
3097 for (i = 1; i <= Lmin; i++)
3098 {
3099 if (Feptr >= mb->end_subject)
3100 {
3101 SCHECK_PARTIAL();
3102 RRETURN(MATCH_NOMATCH);
3103 }
3104 if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
3105 if (mb->partial != 0 &&
3106 Feptr + 1 >= mb->end_subject &&
3107 NLBLOCK->nltype == NLTYPE_FIXED &&
3108 NLBLOCK->nllen == 2 &&
3109 *Feptr == NLBLOCK->nl[0])
3110 {
3111 mb->hitend = TRUE;
3112 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
3113 }
3114 Feptr++;
3115 }
3116 break;
3117
3118 case OP_ALLANY:
3119 if (Feptr > mb->end_subject - Lmin)
3120 {
3121 SCHECK_PARTIAL();
3122 RRETURN(MATCH_NOMATCH);
3123 }
3124 Feptr += Lmin;
3125 break;
3126
3127 /* This OP_ANYBYTE case will never be reached because \C gets turned
3128 into OP_ALLANY in non-UTF mode. Cut out the code so that coverage
3129 reports don't complain about its never being used. */
3130
3131 /* case OP_ANYBYTE:
3132 * if (Feptr > mb->end_subject - Lmin)
3133 * {
3134 * SCHECK_PARTIAL();
3135 * RRETURN(MATCH_NOMATCH);
3136 * }
3137 * Feptr += Lmin;
3138 * break;
3139 */
3140 case OP_ANYNL:
3141 for (i = 1; i <= Lmin; i++)
3142 {
3143 if (Feptr >= mb->end_subject)
3144 {
3145 SCHECK_PARTIAL();
3146 RRETURN(MATCH_NOMATCH);
3147 }
3148 switch(*Feptr++)
3149 {
3150 default: RRETURN(MATCH_NOMATCH);
3151
3152 case CHAR_CR:
3153 if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++;
3154 break;
3155
3156 case CHAR_LF:
3157 break;
3158
3159 case CHAR_VT:
3160 case CHAR_FF:
3161 case CHAR_NEL:
3162 #if PCRE2_CODE_UNIT_WIDTH != 8
3163 case 0x2028:
3164 case 0x2029:
3165 #endif
3166 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);
3167 break;
3168 }
3169 }
3170 break;
3171
3172 case OP_NOT_HSPACE:
3173 for (i = 1; i <= Lmin; i++)
3174 {
3175 if (Feptr >= mb->end_subject)
3176 {
3177 SCHECK_PARTIAL();
3178 RRETURN(MATCH_NOMATCH);
3179 }
3180 switch(*Feptr++)
3181 {
3182 default: break;
3183 HSPACE_BYTE_CASES:
3184 #if PCRE2_CODE_UNIT_WIDTH != 8
3185 HSPACE_MULTIBYTE_CASES:
3186 #endif
3187 RRETURN(MATCH_NOMATCH);
3188 }
3189 }
3190 break;
3191
3192 case OP_HSPACE:
3193 for (i = 1; i <= Lmin; i++)
3194 {
3195 if (Feptr >= mb->end_subject)
3196 {
3197 SCHECK_PARTIAL();
3198 RRETURN(MATCH_NOMATCH);
3199 }
3200 switch(*Feptr++)
3201 {
3202 default: RRETURN(MATCH_NOMATCH);
3203 HSPACE_BYTE_CASES:
3204 #if PCRE2_CODE_UNIT_WIDTH != 8
3205 HSPACE_MULTIBYTE_CASES:
3206 #endif
3207 break;
3208 }
3209 }
3210 break;
3211
3212 case OP_NOT_VSPACE:
3213 for (i = 1; i <= Lmin; i++)
3214 {
3215 if (Feptr >= mb->end_subject)
3216 {
3217 SCHECK_PARTIAL();
3218 RRETURN(MATCH_NOMATCH);
3219 }
3220 switch(*Feptr++)
3221 {
3222 VSPACE_BYTE_CASES:
3223 #if PCRE2_CODE_UNIT_WIDTH != 8
3224 VSPACE_MULTIBYTE_CASES:
3225 #endif
3226 RRETURN(MATCH_NOMATCH);
3227 default: break;
3228 }
3229 }
3230 break;
3231
3232 case OP_VSPACE:
3233 for (i = 1; i <= Lmin; i++)
3234 {
3235 if (Feptr >= mb->end_subject)
3236 {
3237 SCHECK_PARTIAL();
3238 RRETURN(MATCH_NOMATCH);
3239 }
3240 switch(*Feptr++)
3241 {
3242 default: RRETURN(MATCH_NOMATCH);
3243 VSPACE_BYTE_CASES:
3244 #if PCRE2_CODE_UNIT_WIDTH != 8
3245 VSPACE_MULTIBYTE_CASES:
3246 #endif
3247 break;
3248 }
3249 }
3250 break;
3251
3252 case OP_NOT_DIGIT:
3253 for (i = 1; i <= Lmin; i++)
3254 {
3255 if (Feptr >= mb->end_subject)
3256 {
3257 SCHECK_PARTIAL();
3258 RRETURN(MATCH_NOMATCH);
3259 }
3260 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0)
3261 RRETURN(MATCH_NOMATCH);
3262 Feptr++;
3263 }
3264 break;
3265
3266 case OP_DIGIT:
3267 for (i = 1; i <= Lmin; i++)
3268 {
3269 if (Feptr >= mb->end_subject)
3270 {
3271 SCHECK_PARTIAL();
3272 RRETURN(MATCH_NOMATCH);
3273 }
3274 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0)
3275 RRETURN(MATCH_NOMATCH);
3276 Feptr++;
3277 }
3278 break;
3279
3280 case OP_NOT_WHITESPACE:
3281 for (i = 1; i <= Lmin; i++)
3282 {
3283 if (Feptr >= mb->end_subject)
3284 {
3285 SCHECK_PARTIAL();
3286 RRETURN(MATCH_NOMATCH);
3287 }
3288 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0)
3289 RRETURN(MATCH_NOMATCH);
3290 Feptr++;
3291 }
3292 break;
3293
3294 case OP_WHITESPACE:
3295 for (i = 1; i <= Lmin; i++)
3296 {
3297 if (Feptr >= mb->end_subject)
3298 {
3299 SCHECK_PARTIAL();
3300 RRETURN(MATCH_NOMATCH);
3301 }
3302 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0)
3303 RRETURN(MATCH_NOMATCH);
3304 Feptr++;
3305 }
3306 break;
3307
3308 case OP_NOT_WORDCHAR:
3309 for (i = 1; i <= Lmin; i++)
3310 {
3311 if (Feptr >= mb->end_subject)
3312 {
3313 SCHECK_PARTIAL();
3314 RRETURN(MATCH_NOMATCH);
3315 }
3316 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0)
3317 RRETURN(MATCH_NOMATCH);
3318 Feptr++;
3319 }
3320 break;
3321
3322 case OP_WORDCHAR:
3323 for (i = 1; i <= Lmin; i++)
3324 {
3325 if (Feptr >= mb->end_subject)
3326 {
3327 SCHECK_PARTIAL();
3328 RRETURN(MATCH_NOMATCH);
3329 }
3330 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0)
3331 RRETURN(MATCH_NOMATCH);
3332 Feptr++;
3333 }
3334 break;
3335
3336 default:
3337 return PCRE2_ERROR_INTERNAL;
3338 }
3339 }
3340
3341 /* If Lmin = Lmax we are done. Continue with the main loop. */
3342
3343 if (Lmin == Lmax) continue;
3344
3345 /* If minimizing, we have to test the rest of the pattern before each
3346 subsequent match. */
3347
3348 if (reptype == REPTYPE_MIN)
3349 {
3350 #ifdef SUPPORT_UNICODE
3351 if (proptype >= 0)
3352 {
3353 switch(proptype)
3354 {
3355 case PT_ANY:
3356 for (;;)
3357 {
3358 RMATCH(Fecode, RM208);
3359 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3360 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3361 if (Feptr >= mb->end_subject)
3362 {
3363 SCHECK_PARTIAL();
3364 RRETURN(MATCH_NOMATCH);
3365 }
3366 GETCHARINCTEST(fc, Feptr);
3367 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
3368 }
3369 /* Control never gets here */
3370
3371 case PT_LAMP:
3372 for (;;)
3373 {
3374 int chartype;
3375 RMATCH(Fecode, RM209);
3376 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3377 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3378 if (Feptr >= mb->end_subject)
3379 {
3380 SCHECK_PARTIAL();
3381 RRETURN(MATCH_NOMATCH);
3382 }
3383 GETCHARINCTEST(fc, Feptr);
3384 chartype = UCD_CHARTYPE(fc);
3385 if ((chartype == ucp_Lu ||
3386 chartype == ucp_Ll ||
3387 chartype == ucp_Lt) == (Lctype == OP_NOTPROP))
3388 RRETURN(MATCH_NOMATCH);
3389 }
3390 /* Control never gets here */
3391
3392 case PT_GC:
3393 for (;;)
3394 {
3395 RMATCH(Fecode, RM210);
3396 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3397 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3398 if (Feptr >= mb->end_subject)
3399 {
3400 SCHECK_PARTIAL();
3401 RRETURN(MATCH_NOMATCH);
3402 }
3403 GETCHARINCTEST(fc, Feptr);
3404 if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3405 RRETURN(MATCH_NOMATCH);
3406 }
3407 /* Control never gets here */
3408
3409 case PT_PC:
3410 for (;;)
3411 {
3412 RMATCH(Fecode, RM211);
3413 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3414 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3415 if (Feptr >= mb->end_subject)
3416 {
3417 SCHECK_PARTIAL();
3418 RRETURN(MATCH_NOMATCH);
3419 }
3420 GETCHARINCTEST(fc, Feptr);
3421 if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3422 RRETURN(MATCH_NOMATCH);
3423 }
3424 /* Control never gets here */
3425
3426 case PT_SC:
3427 for (;;)
3428 {
3429 RMATCH(Fecode, RM212);
3430 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3431 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3432 if (Feptr >= mb->end_subject)
3433 {
3434 SCHECK_PARTIAL();
3435 RRETURN(MATCH_NOMATCH);
3436 }
3437 GETCHARINCTEST(fc, Feptr);
3438 if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3439 RRETURN(MATCH_NOMATCH);
3440 }
3441 /* Control never gets here */
3442
3443 case PT_ALNUM:
3444 for (;;)
3445 {
3446 int category;
3447 RMATCH(Fecode, RM213);
3448 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3449 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3450 if (Feptr >= mb->end_subject)
3451 {
3452 SCHECK_PARTIAL();
3453 RRETURN(MATCH_NOMATCH);
3454 }
3455 GETCHARINCTEST(fc, Feptr);
3456 category = UCD_CATEGORY(fc);
3457 if ((category == ucp_L || category == ucp_N) ==
3458 (Lctype == OP_NOTPROP))
3459 RRETURN(MATCH_NOMATCH);
3460 }
3461 /* Control never gets here */
3462
3463 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
3464 which means that Perl space and POSIX space are now identical. PCRE
3465 was changed at release 8.34. */
3466
3467 case PT_SPACE: /* Perl space */
3468 case PT_PXSPACE: /* POSIX space */
3469 for (;;)
3470 {
3471 RMATCH(Fecode, RM214);
3472 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3473 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3474 if (Feptr >= mb->end_subject)
3475 {
3476 SCHECK_PARTIAL();
3477 RRETURN(MATCH_NOMATCH);
3478 }
3479 GETCHARINCTEST(fc, Feptr);
3480 switch(fc)
3481 {
3482 HSPACE_CASES:
3483 VSPACE_CASES:
3484 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
3485 break;
3486
3487 default:
3488 if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP))
3489 RRETURN(MATCH_NOMATCH);
3490 break;
3491 }
3492 }
3493 /* Control never gets here */
3494
3495 case PT_WORD:
3496 for (;;)
3497 {
3498 int category;
3499 RMATCH(Fecode, RM215);
3500 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3501 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3502 if (Feptr >= mb->end_subject)
3503 {
3504 SCHECK_PARTIAL();
3505 RRETURN(MATCH_NOMATCH);
3506 }
3507 GETCHARINCTEST(fc, Feptr);
3508 category = UCD_CATEGORY(fc);
3509 if ((category == ucp_L ||
3510 category == ucp_N ||
3511 fc == CHAR_UNDERSCORE) == (Lctype == OP_NOTPROP))
3512 RRETURN(MATCH_NOMATCH);
3513 }
3514 /* Control never gets here */
3515
3516 case PT_CLIST:
3517 for (;;)
3518 {
3519 const uint32_t *cp;
3520 RMATCH(Fecode, RM216);
3521 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3522 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3523 if (Feptr >= mb->end_subject)
3524 {
3525 SCHECK_PARTIAL();
3526 RRETURN(MATCH_NOMATCH);
3527 }
3528 GETCHARINCTEST(fc, Feptr);
3529 cp = PRIV(ucd_caseless_sets) + Lpropvalue;
3530 for (;;)
3531 {
3532 if (fc < *cp)
3533 {
3534 if (Lctype == OP_NOTPROP) break;
3535 RRETURN(MATCH_NOMATCH);
3536 }
3537 if (fc == *cp++)
3538 {
3539 if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
3540 break;
3541 }
3542 }
3543 }
3544 /* Control never gets here */
3545
3546 case PT_UCNC:
3547 for (;;)
3548 {
3549 RMATCH(Fecode, RM217);
3550 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3551 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3552 if (Feptr >= mb->end_subject)
3553 {
3554 SCHECK_PARTIAL();
3555 RRETURN(MATCH_NOMATCH);
3556 }
3557 GETCHARINCTEST(fc, Feptr);
3558 if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
3559 fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
3560 fc >= 0xe000) == (Lctype == OP_NOTPROP))
3561 RRETURN(MATCH_NOMATCH);
3562 }
3563 /* Control never gets here */
3564
3565 /* This should never occur */
3566 default:
3567 return PCRE2_ERROR_INTERNAL;
3568 }
3569 }
3570
3571 /* Match extended Unicode sequences. We will get here only if the
3572 support is in the binary; otherwise a compile-time error occurs. */
3573
3574 else if (Lctype == OP_EXTUNI)
3575 {
3576 for (;;)
3577 {
3578 RMATCH(Fecode, RM218);
3579 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3580 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3581 if (Feptr >= mb->end_subject)
3582 {
3583 SCHECK_PARTIAL();
3584 RRETURN(MATCH_NOMATCH);
3585 }
3586 else
3587 {
3588 GETCHARINCTEST(fc, Feptr);
3589 Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject,
3590 utf, NULL);
3591 }
3592 CHECK_PARTIAL();
3593 }
3594 }
3595 else
3596 #endif /* SUPPORT_UNICODE */
3597
3598 /* UTF mode for non-property testing character types. */
3599
3600 #ifdef SUPPORT_UNICODE
3601 if (utf)
3602 {
3603 for (;;)
3604 {
3605 RMATCH(Fecode, RM219);
3606 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3607 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3608 if (Feptr >= mb->end_subject)
3609 {
3610 SCHECK_PARTIAL();
3611 RRETURN(MATCH_NOMATCH);
3612 }
3613 if (Lctype == OP_ANY && IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
3614 GETCHARINC(fc, Feptr);
3615 switch(Lctype)
3616 {
3617 case OP_ANY: /* This is the non-NL case */
3618 if (mb->partial != 0 && /* Take care with CRLF partial */
3619 Feptr >= mb->end_subject &&
3620 NLBLOCK->nltype == NLTYPE_FIXED &&
3621 NLBLOCK->nllen == 2 &&
3622 fc == NLBLOCK->nl[0])
3623 {
3624 mb->hitend = TRUE;
3625 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
3626 }
3627 break;
3628
3629 case OP_ALLANY:
3630 case OP_ANYBYTE:
3631 break;
3632
3633 case OP_ANYNL:
3634 switch(fc)
3635 {
3636 default: RRETURN(MATCH_NOMATCH);
3637
3638 case CHAR_CR:
3639 if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++;
3640 break;
3641
3642 case CHAR_LF:
3643 break;
3644
3645 case CHAR_VT:
3646 case CHAR_FF:
3647 case CHAR_NEL:
3648 #ifndef EBCDIC
3649 case 0x2028:
3650 case 0x2029:
3651 #endif /* Not EBCDIC */
3652 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF)
3653 RRETURN(MATCH_NOMATCH);
3654 break;
3655 }
3656 break;
3657
3658 case OP_NOT_HSPACE:
3659 switch(fc)
3660 {
3661 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
3662 default: break;
3663 }
3664 break;
3665
3666 case OP_HSPACE:
3667 switch(fc)
3668 {
3669 HSPACE_CASES: break;
3670 default: RRETURN(MATCH_NOMATCH);
3671 }
3672 break;
3673
3674 case OP_NOT_VSPACE:
3675 switch(fc)
3676 {
3677 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
3678 default: break;
3679 }
3680 break;
3681
3682 case OP_VSPACE:
3683 switch(fc)
3684 {
3685 VSPACE_CASES: break;
3686 default: RRETURN(MATCH_NOMATCH);
3687 }
3688 break;
3689
3690 case OP_NOT_DIGIT:
3691 if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0)
3692 RRETURN(MATCH_NOMATCH);
3693 break;
3694
3695 case OP_DIGIT:
3696 if (fc >= 256 || (mb->ctypes[fc] & ctype_digit) == 0)
3697 RRETURN(MATCH_NOMATCH);
3698 break;
3699
3700 case OP_NOT_WHITESPACE:
3701 if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0)
3702 RRETURN(MATCH_NOMATCH);
3703 break;
3704
3705 case OP_WHITESPACE:
3706 if (fc >= 256 || (mb->ctypes[fc] & ctype_space) == 0)
3707 RRETURN(MATCH_NOMATCH);
3708 break;
3709
3710 case OP_NOT_WORDCHAR:
3711 if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0)
3712 RRETURN(MATCH_NOMATCH);
3713 break;
3714
3715 case OP_WORDCHAR:
3716 if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0)
3717 RRETURN(MATCH_NOMATCH);
3718 break;
3719
3720 default:
3721 return PCRE2_ERROR_INTERNAL;
3722 }
3723 }
3724 }
3725 else
3726 #endif /* SUPPORT_UNICODE */
3727
3728 /* Not UTF mode */
3729 {
3730 for (;;)
3731 {
3732 RMATCH(Fecode, RM33);
3733 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3734 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
3735 if (Feptr >= mb->end_subject)
3736 {
3737 SCHECK_PARTIAL();
3738 RRETURN(MATCH_NOMATCH);
3739 }
3740 if (Lctype == OP_ANY && IS_NEWLINE(Feptr))
3741 RRETURN(MATCH_NOMATCH);
3742 fc = *Feptr++;
3743 switch(Lctype)
3744 {
3745 case OP_ANY: /* This is the non-NL case */
3746 if (mb->partial != 0 && /* Take care with CRLF partial */
3747 Feptr >= mb->end_subject &&
3748 NLBLOCK->nltype == NLTYPE_FIXED &&
3749 NLBLOCK->nllen == 2 &&
3750 fc == NLBLOCK->nl[0])
3751 {
3752 mb->hitend = TRUE;
3753 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
3754 }
3755 break;
3756
3757 case OP_ALLANY:
3758 case OP_ANYBYTE:
3759 break;
3760
3761 case OP_ANYNL:
3762 switch(fc)
3763 {
3764 default: RRETURN(MATCH_NOMATCH);
3765
3766 case CHAR_CR:
3767 if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++;
3768 break;
3769
3770 case CHAR_LF:
3771 break;
3772
3773 case CHAR_VT:
3774 case CHAR_FF:
3775 case CHAR_NEL:
3776 #if PCRE2_CODE_UNIT_WIDTH != 8
3777 case 0x2028:
3778 case 0x2029:
3779 #endif
3780 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF)
3781 RRETURN(MATCH_NOMATCH);
3782 break;
3783 }
3784 break;
3785
3786 case OP_NOT_HSPACE:
3787 switch(fc)
3788 {
3789 default: break;
3790 HSPACE_BYTE_CASES:
3791 #if PCRE2_CODE_UNIT_WIDTH != 8
3792 HSPACE_MULTIBYTE_CASES:
3793 #endif
3794 RRETURN(MATCH_NOMATCH);
3795 }
3796 break;
3797
3798 case OP_HSPACE:
3799 switch(fc)
3800 {
3801 default: RRETURN(MATCH_NOMATCH);
3802 HSPACE_BYTE_CASES:
3803 #if PCRE2_CODE_UNIT_WIDTH != 8
3804 HSPACE_MULTIBYTE_CASES:
3805 #endif
3806 break;
3807 }
3808 break;
3809
3810 case OP_NOT_VSPACE:
3811 switch(fc)
3812 {
3813 default: break;
3814 VSPACE_BYTE_CASES:
3815 #if PCRE2_CODE_UNIT_WIDTH != 8
3816 VSPACE_MULTIBYTE_CASES:
3817 #endif
3818 RRETURN(MATCH_NOMATCH);
3819 }
3820 break;
3821
3822 case OP_VSPACE:
3823 switch(fc)
3824 {
3825 default: RRETURN(MATCH_NOMATCH);
3826 VSPACE_BYTE_CASES:
3827 #if PCRE2_CODE_UNIT_WIDTH != 8
3828 VSPACE_MULTIBYTE_CASES:
3829 #endif
3830 break;
3831 }
3832 break;
3833
3834 case OP_NOT_DIGIT:
3835 if (MAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0)
3836 RRETURN(MATCH_NOMATCH);
3837 break;
3838
3839 case OP_DIGIT:
3840 if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0)
3841 RRETURN(MATCH_NOMATCH);
3842 break;
3843
3844 case OP_NOT_WHITESPACE:
3845 if (MAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0)
3846 RRETURN(MATCH_NOMATCH);
3847 break;
3848
3849 case OP_WHITESPACE:
3850 if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0)
3851 RRETURN(MATCH_NOMATCH);
3852 break;
3853
3854 case OP_NOT_WORDCHAR:
3855 if (MAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0)
3856 RRETURN(MATCH_NOMATCH);
3857 break;
3858
3859 case OP_WORDCHAR:
3860 if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0)
3861 RRETURN(MATCH_NOMATCH);
3862 break;
3863
3864 default:
3865 return PCRE2_ERROR_INTERNAL;
3866 }
3867 }
3868 }
3869 /* Control never gets here */
3870 }
3871
3872 /* If maximizing, it is worth using inline code for speed, doing the type
3873 test once at the start (i.e. keep it out of the loop). */
3874
3875 else
3876 {
3877 Lstart_eptr = Feptr; /* Remember where we started */
3878
3879 #ifdef SUPPORT_UNICODE
3880 if (proptype >= 0)
3881 {
3882 switch(proptype)
3883 {
3884 case PT_ANY:
3885 for (i = Lmin; i < Lmax; i++)
3886 {
3887 int len = 1;
3888 if (Feptr >= mb->end_subject)
3889 {
3890 SCHECK_PARTIAL();
3891 break;
3892 }
3893 GETCHARLENTEST(fc, Feptr, len);
3894 if (Lctype == OP_NOTPROP) break;
3895 Feptr+= len;
3896 }
3897 break;
3898
3899 case PT_LAMP:
3900 for (i = Lmin; i < Lmax; i++)
3901 {
3902 int chartype;
3903 int len = 1;
3904 if (Feptr >= mb->end_subject)
3905 {
3906 SCHECK_PARTIAL();
3907 break;
3908 }
3909 GETCHARLENTEST(fc, Feptr, len);
3910 chartype = UCD_CHARTYPE(fc);
3911 if ((chartype == ucp_Lu ||
3912 chartype == ucp_Ll ||
3913 chartype == ucp_Lt) == (Lctype == OP_NOTPROP))
3914 break;
3915 Feptr+= len;
3916 }
3917 break;
3918
3919 case PT_GC:
3920 for (i = Lmin; i < Lmax; i++)
3921 {
3922 int len = 1;
3923 if (Feptr >= mb->end_subject)
3924 {
3925 SCHECK_PARTIAL();
3926 break;
3927 }
3928 GETCHARLENTEST(fc, Feptr, len);
3929 if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3930 break;
3931 Feptr+= len;
3932 }
3933 break;
3934
3935 case PT_PC:
3936 for (i = Lmin; i < Lmax; i++)
3937 {
3938 int len = 1;
3939 if (Feptr >= mb->end_subject)
3940 {
3941 SCHECK_PARTIAL();
3942 break;
3943 }
3944 GETCHARLENTEST(fc, Feptr, len);
3945 if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3946 break;
3947 Feptr+= len;
3948 }
3949 break;
3950
3951 case PT_SC:
3952 for (i = Lmin; i < Lmax; i++)
3953 {
3954 int len = 1;
3955 if (Feptr >= mb->end_subject)
3956 {
3957 SCHECK_PARTIAL();
3958 break;
3959 }
3960 GETCHARLENTEST(fc, Feptr, len);
3961 if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))
3962 break;
3963 Feptr+= len;
3964 }
3965 break;
3966
3967 case PT_ALNUM:
3968 for (i = Lmin; i < Lmax; i++)
3969 {
3970 int category;
3971 int len = 1;
3972 if (Feptr >= mb->end_subject)
3973 {
3974 SCHECK_PARTIAL();
3975 break;
3976 }
3977 GETCHARLENTEST(fc, Feptr, len);
3978 category = UCD_CATEGORY(fc);
3979 if ((category == ucp_L || category == ucp_N) ==
3980 (Lctype == OP_NOTPROP))
3981 break;
3982 Feptr+= len;
3983 }
3984 break;
3985
3986 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
3987 which means that Perl space and POSIX space are now identical. PCRE
3988 was changed at release 8.34. */
3989
3990 case PT_SPACE: /* Perl space */
3991 case PT_PXSPACE: /* POSIX space */
3992 for (i = Lmin; i < Lmax; i++)
3993 {
3994 int len = 1;
3995 if (Feptr >= mb->end_subject)
3996 {
3997 SCHECK_PARTIAL();
3998 break;
3999 }
4000 GETCHARLENTEST(fc, Feptr, len);
4001 switch(fc)
4002 {
4003 HSPACE_CASES:
4004 VSPACE_CASES:
4005 if (Lctype == OP_NOTPROP) goto ENDLOOP99; /* Break the loop */
4006 break;
4007
4008 default:
4009 if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP))
4010 goto ENDLOOP99; /* Break the loop */
4011 break;
4012 }
4013 Feptr+= len;
4014 }
4015 ENDLOOP99:
4016 break;
4017
4018 case PT_WORD:
4019 for (i = Lmin; i < Lmax; i++)
4020 {
4021 int category;
4022 int len = 1;
4023 if (Feptr >= mb->end_subject)
4024 {
4025 SCHECK_PARTIAL();
4026 break;
4027 }
4028 GETCHARLENTEST(fc, Feptr, len);
4029 category = UCD_CATEGORY(fc);
4030 if ((category == ucp_L || category == ucp_N ||
4031 fc == CHAR_UNDERSCORE) == (Lctype == OP_NOTPROP))
4032 break;
4033 Feptr+= len;
4034 }
4035 break;
4036
4037 case PT_CLIST:
4038 for (i = Lmin; i < Lmax; i++)
4039 {
4040 const uint32_t *cp;
4041 int len = 1;
4042 if (Feptr >= mb->end_subject)
4043 {
4044 SCHECK_PARTIAL();
4045 break;
4046 }
4047 GETCHARLENTEST(fc, Feptr, len);
4048 cp = PRIV(ucd_caseless_sets) + Lpropvalue;
4049 for (;;)
4050 {
4051 if (fc < *cp)
4052 { if (Lctype == OP_NOTPROP) break; else goto GOT_MAX; }
4053 if (fc == *cp++)
4054 { if (Lctype == OP_NOTPROP) goto GOT_MAX; else break; }
4055 }
4056 Feptr += len;
4057 }
4058 GOT_MAX:
4059 break;
4060
4061 case PT_UCNC:
4062 for (i = Lmin; i < Lmax; i++)
4063 {
4064 int len = 1;
4065 if (Feptr >= mb->end_subject)
4066 {
4067 SCHECK_PARTIAL();
4068 break;
4069 }
4070 GETCHARLENTEST(fc, Feptr, len);
4071 if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||
4072 fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||
4073 fc >= 0xe000) == (Lctype == OP_NOTPROP))
4074 break;
4075 Feptr += len;
4076 }
4077 break;
4078
4079 default:
4080 return PCRE2_ERROR_INTERNAL;
4081 }
4082
4083 /* Feptr is now past the end of the maximum run */
4084
4085 if (reptype == REPTYPE_POS) continue; /* No backtracking */
4086
4087 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
4088 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
4089 go too far. */
4090
4091 for(;;)
4092 {
4093 if (Feptr <= Lstart_eptr) break;
4094 RMATCH(Fecode, RM222);
4095 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4096 Feptr--;
4097 if (utf) BACKCHAR(Feptr);
4098 }
4099 }
4100
4101 /* Match extended Unicode grapheme clusters. We will get here only if the
4102 support is in the binary; otherwise a compile-time error occurs. */
4103
4104 else if (Lctype == OP_EXTUNI)
4105 {
4106 for (i = Lmin; i < Lmax; i++)
4107 {
4108 if (Feptr >= mb->end_subject)
4109 {
4110 SCHECK_PARTIAL();
4111 break;
4112 }
4113 else
4114 {
4115 GETCHARINCTEST(fc, Feptr);
4116 Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject,
4117 utf, NULL);
4118 }
4119 CHECK_PARTIAL();
4120 }
4121
4122 /* Feptr is now past the end of the maximum run */
4123
4124 if (reptype == REPTYPE_POS) continue; /* No backtracking */
4125
4126 /* We use <= Lstart_eptr rather than == Lstart_eptr to detect the start
4127 of the run while backtracking because the use of \C in UTF mode can
4128 cause BACKCHAR to move back past Lstart_eptr. This is just palliative;
4129 the use of \C in UTF mode is fraught with danger. */
4130
4131 for(;;)
4132 {
4133 int lgb, rgb;
4134 PCRE2_SPTR fptr;
4135
4136 if (Feptr <= Lstart_eptr) break; /* At start of char run */
4137 RMATCH(Fecode, RM220);
4138 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4139
4140 /* Backtracking over an extended grapheme cluster involves inspecting
4141 the previous two characters (if present) to see if a break is
4142 permitted between them. */
4143
4144 Feptr--;
4145 if (!utf) fc = *Feptr; else
4146 {
4147 BACKCHAR(Feptr);
4148 GETCHAR(fc, Feptr);
4149 }
4150 rgb = UCD_GRAPHBREAK(fc);
4151
4152 for (;;)
4153 {
4154 if (Feptr <= Lstart_eptr) break; /* At start of char run */
4155 fptr = Feptr - 1;
4156 if (!utf) fc = *fptr; else
4157 {
4158 BACKCHAR(fptr);
4159 GETCHAR(fc, fptr);
4160 }
4161 lgb = UCD_GRAPHBREAK(fc);
4162 if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
4163 Feptr = fptr;
4164 rgb = lgb;
4165 }
4166 }
4167 }
4168
4169 else
4170 #endif /* SUPPORT_UNICODE */
4171
4172 #ifdef SUPPORT_UNICODE
4173 if (utf)
4174 {
4175 switch(Lctype)
4176 {
4177 case OP_ANY:
4178 for (i = Lmin; i < Lmax; i++)
4179 {
4180 if (Feptr >= mb->end_subject)
4181 {
4182 SCHECK_PARTIAL();
4183 break;
4184 }
4185 if (IS_NEWLINE(Feptr)) break;
4186 if (mb->partial != 0 && /* Take care with CRLF partial */
4187 Feptr + 1 >= mb->end_subject &&
4188 NLBLOCK->nltype == NLTYPE_FIXED &&
4189 NLBLOCK->nllen == 2 &&
4190 UCHAR21(Feptr) == NLBLOCK->nl[0])
4191 {
4192 mb->hitend = TRUE;
4193 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
4194 }
4195 Feptr++;
4196 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
4197 }
4198 break;
4199
4200 case OP_ALLANY:
4201 if (Lmax < UINT32_MAX)
4202 {
4203 for (i = Lmin; i < Lmax; i++)
4204 {
4205 if (Feptr >= mb->end_subject)
4206 {
4207 SCHECK_PARTIAL();
4208 break;
4209 }
4210 Feptr++;
4211 ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);
4212 }
4213 }
4214 else
4215 {
4216 Feptr = mb->end_subject; /* Unlimited UTF-8 repeat */
4217 SCHECK_PARTIAL();
4218 }
4219 break;
4220
4221 /* The "byte" (i.e. "code unit") case is the same as non-UTF */
4222
4223 case OP_ANYBYTE:
4224 fc = Lmax - Lmin;
4225 if (fc > (uint32_t)(mb->end_subject - Feptr))
4226 {
4227 Feptr = mb->end_subject;
4228 SCHECK_PARTIAL();
4229 }
4230 else Feptr += fc;
4231 break;
4232
4233 case OP_ANYNL:
4234 for (i = Lmin; i < Lmax; i++)
4235 {
4236 int len = 1;
4237 if (Feptr >= mb->end_subject)
4238 {
4239 SCHECK_PARTIAL();
4240 break;
4241 }
4242 GETCHARLEN(fc, Feptr, len);
4243 if (fc == CHAR_CR)
4244 {
4245 if (++Feptr >= mb->end_subject) break;
4246 if (UCHAR21(Feptr) == CHAR_LF) Feptr++;
4247 }
4248 else
4249 {
4250 if (fc != CHAR_LF &&
4251 (mb->bsr_convention == PCRE2_BSR_ANYCRLF ||
4252 (fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL
4253 #ifndef EBCDIC
4254 && fc != 0x2028 && fc != 0x2029
4255 #endif /* Not EBCDIC */
4256 )))
4257 break;
4258 Feptr += len;
4259 }
4260 }
4261 break;
4262
4263 case OP_NOT_HSPACE:
4264 case OP_HSPACE:
4265 for (i = Lmin; i < Lmax; i++)
4266 {
4267 BOOL gotspace;
4268 int len = 1;
4269 if (Feptr >= mb->end_subject)
4270 {
4271 SCHECK_PARTIAL();
4272 break;
4273 }
4274 GETCHARLEN(fc, Feptr, len);
4275 switch(fc)
4276 {
4277 HSPACE_CASES: gotspace = TRUE; break;
4278 default: gotspace = FALSE; break;
4279 }
4280 if (gotspace == (Lctype == OP_NOT_HSPACE)) break;
4281 Feptr += len;
4282 }
4283 break;
4284
4285 case OP_NOT_VSPACE:
4286 case OP_VSPACE:
4287 for (i = Lmin; i < Lmax; i++)
4288 {
4289 BOOL gotspace;
4290 int len = 1;
4291 if (Feptr >= mb->end_subject)
4292 {
4293 SCHECK_PARTIAL();
4294 break;
4295 }
4296 GETCHARLEN(fc, Feptr, len);
4297 switch(fc)
4298 {
4299 VSPACE_CASES: gotspace = TRUE; break;
4300 default: gotspace = FALSE; break;
4301 }
4302 if (gotspace == (Lctype == OP_NOT_VSPACE)) break;
4303 Feptr += len;
4304 }
4305 break;
4306
4307 case OP_NOT_DIGIT:
4308 for (i = Lmin; i < Lmax; i++)
4309 {
4310 int len = 1;
4311 if (Feptr >= mb->end_subject)
4312 {
4313 SCHECK_PARTIAL();
4314 break;
4315 }
4316 GETCHARLEN(fc, Feptr, len);
4317 if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0) break;
4318 Feptr+= len;
4319 }
4320 break;
4321
4322 case OP_DIGIT:
4323 for (i = Lmin; i < Lmax; i++)
4324 {
4325 int len = 1;
4326 if (Feptr >= mb->end_subject)
4327 {
4328 SCHECK_PARTIAL();
4329 break;
4330 }
4331 GETCHARLEN(fc, Feptr, len);
4332 if (fc >= 256 ||(mb->ctypes[fc] & ctype_digit) == 0) break;
4333 Feptr+= len;
4334 }
4335 break;
4336
4337 case OP_NOT_WHITESPACE:
4338 for (i = Lmin; i < Lmax; i++)
4339 {
4340 int len = 1;
4341 if (Feptr >= mb->end_subject)
4342 {
4343 SCHECK_PARTIAL();
4344 break;
4345 }
4346 GETCHARLEN(fc, Feptr, len);
4347 if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0) break;
4348 Feptr+= len;
4349 }
4350 break;
4351
4352 case OP_WHITESPACE:
4353 for (i = Lmin; i < Lmax; i++)
4354 {
4355 int len = 1;
4356 if (Feptr >= mb->end_subject)
4357 {
4358 SCHECK_PARTIAL();
4359 break;
4360 }
4361 GETCHARLEN(fc, Feptr, len);
4362 if (fc >= 256 ||(mb->ctypes[fc] & ctype_space) == 0) break;
4363 Feptr+= len;
4364 }
4365 break;
4366
4367 case OP_NOT_WORDCHAR:
4368 for (i = Lmin; i < Lmax; i++)
4369 {
4370 int len = 1;
4371 if (Feptr >= mb->end_subject)
4372 {
4373 SCHECK_PARTIAL();
4374 break;
4375 }
4376 GETCHARLEN(fc, Feptr, len);
4377 if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0) break;
4378 Feptr+= len;
4379 }
4380 break;
4381
4382 case OP_WORDCHAR:
4383 for (i = Lmin; i < Lmax; i++)
4384 {
4385 int len = 1;
4386 if (Feptr >= mb->end_subject)
4387 {
4388 SCHECK_PARTIAL();
4389 break;
4390 }
4391 GETCHARLEN(fc, Feptr, len);
4392 if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0) break;
4393 Feptr+= len;
4394 }
4395 break;
4396
4397 default:
4398 return PCRE2_ERROR_INTERNAL;
4399 }
4400
4401 if (reptype == REPTYPE_POS) continue; /* No backtracking */
4402
4403 /* After \C in UTF mode, Lstart_eptr might be in the middle of a
4404 Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't go
4405 too far. */
4406
4407 for(;;)
4408 {
4409 if (Feptr <= Lstart_eptr) break;
4410 RMATCH(Fecode, RM221);
4411 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4412 Feptr--;
4413 BACKCHAR(Feptr);
4414 if (Lctype == OP_ANYNL && Feptr > Lstart_eptr &&
4415 UCHAR21(Feptr) == CHAR_NL && UCHAR21(Feptr - 1) == CHAR_CR)
4416 Feptr--;
4417 }
4418 }
4419 else
4420 #endif /* SUPPORT_UNICODE */
4421
4422 /* Not UTF mode */
4423 {
4424 switch(Lctype)
4425 {
4426 case OP_ANY:
4427 for (i = Lmin; i < Lmax; i++)
4428 {
4429 if (Feptr >= mb->end_subject)
4430 {
4431 SCHECK_PARTIAL();
4432 break;
4433 }
4434 if (IS_NEWLINE(Feptr)) break;
4435 if (mb->partial != 0 && /* Take care with CRLF partial */
4436 Feptr + 1 >= mb->end_subject &&
4437 NLBLOCK->nltype == NLTYPE_FIXED &&
4438 NLBLOCK->nllen == 2 &&
4439 *Feptr == NLBLOCK->nl[0])
4440 {
4441 mb->hitend = TRUE;
4442 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
4443 }
4444 Feptr++;
4445 }
4446 break;
4447
4448 case OP_ALLANY:
4449 case OP_ANYBYTE:
4450 fc = Lmax - Lmin;
4451 if (fc > (uint32_t)(mb->end_subject - Feptr))
4452 {
4453 Feptr = mb->end_subject;
4454 SCHECK_PARTIAL();
4455 }
4456 else Feptr += fc;
4457 break;
4458
4459 case OP_ANYNL:
4460 for (i = Lmin; i < Lmax; i++)
4461 {
4462 if (Feptr >= mb->end_subject)
4463 {
4464 SCHECK_PARTIAL();
4465 break;
4466 }
4467 fc = *Feptr;
4468 if (fc == CHAR_CR)
4469 {
4470 if (++Feptr >= mb->end_subject) break;
4471 if (*Feptr == CHAR_LF) Feptr++;
4472 }
4473 else
4474 {
4475 if (fc != CHAR_LF && (mb->bsr_convention == PCRE2_BSR_ANYCRLF ||
4476 (fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL
4477 #if PCRE2_CODE_UNIT_WIDTH != 8
4478 && fc != 0x2028 && fc != 0x2029
4479 #endif
4480 ))) break;
4481 Feptr++;
4482 }
4483 }
4484 break;
4485
4486 case OP_NOT_HSPACE:
4487 for (i = Lmin; i < Lmax; i++)
4488 {
4489 if (Feptr >= mb->end_subject)
4490 {
4491 SCHECK_PARTIAL();
4492 break;
4493 }
4494 switch(*Feptr)
4495 {
4496 default: Feptr++; break;
4497 HSPACE_BYTE_CASES:
4498 #if PCRE2_CODE_UNIT_WIDTH != 8
4499 HSPACE_MULTIBYTE_CASES:
4500 #endif
4501 goto ENDLOOP00;
4502 }
4503 }
4504 ENDLOOP00:
4505 break;
4506
4507 case OP_HSPACE:
4508 for (i = Lmin; i < Lmax; i++)
4509 {
4510 if (Feptr >= mb->end_subject)
4511 {
4512 SCHECK_PARTIAL();
4513 break;
4514 }
4515 switch(*Feptr)
4516 {
4517 default: goto ENDLOOP01;
4518 HSPACE_BYTE_CASES:
4519 #if PCRE2_CODE_UNIT_WIDTH != 8
4520 HSPACE_MULTIBYTE_CASES:
4521 #endif
4522 Feptr++; break;
4523 }
4524 }
4525 ENDLOOP01:
4526 break;
4527
4528 case OP_NOT_VSPACE:
4529 for (i = Lmin; i < Lmax; i++)
4530 {
4531 if (Feptr >= mb->end_subject)
4532 {
4533 SCHECK_PARTIAL();
4534 break;
4535 }
4536 switch(*Feptr)
4537 {
4538 default: Feptr++; break;
4539 VSPACE_BYTE_CASES:
4540 #if PCRE2_CODE_UNIT_WIDTH != 8
4541 VSPACE_MULTIBYTE_CASES:
4542 #endif
4543 goto ENDLOOP02;
4544 }
4545 }
4546 ENDLOOP02:
4547 break;
4548
4549 case OP_VSPACE:
4550 for (i = Lmin; i < Lmax; i++)
4551 {
4552 if (Feptr >= mb->end_subject)
4553 {
4554 SCHECK_PARTIAL();
4555 break;
4556 }
4557 switch(*Feptr)
4558 {
4559 default: goto ENDLOOP03;
4560 VSPACE_BYTE_CASES:
4561 #if PCRE2_CODE_UNIT_WIDTH != 8
4562 VSPACE_MULTIBYTE_CASES:
4563 #endif
4564 Feptr++; break;
4565 }
4566 }
4567 ENDLOOP03:
4568 break;
4569
4570 case OP_NOT_DIGIT:
4571 for (i = Lmin; i < Lmax; i++)
4572 {
4573 if (Feptr >= mb->end_subject)
4574 {
4575 SCHECK_PARTIAL();
4576 break;
4577 }
4578 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0)
4579 break;
4580 Feptr++;
4581 }
4582 break;
4583
4584 case OP_DIGIT:
4585 for (i = Lmin; i < Lmax; i++)
4586 {
4587 if (Feptr >= mb->end_subject)
4588 {
4589 SCHECK_PARTIAL();
4590 break;
4591 }
4592 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0)
4593 break;
4594 Feptr++;
4595 }
4596 break;
4597
4598 case OP_NOT_WHITESPACE:
4599 for (i = Lmin; i < Lmax; i++)
4600 {
4601 if (Feptr >= mb->end_subject)
4602 {
4603 SCHECK_PARTIAL();
4604 break;
4605 }
4606 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0)
4607 break;
4608 Feptr++;
4609 }
4610 break;
4611
4612 case OP_WHITESPACE:
4613 for (i = Lmin; i < Lmax; i++)
4614 {
4615 if (Feptr >= mb->end_subject)
4616 {
4617 SCHECK_PARTIAL();
4618 break;
4619 }
4620 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0)
4621 break;
4622 Feptr++;
4623 }
4624 break;
4625
4626 case OP_NOT_WORDCHAR:
4627 for (i = Lmin; i < Lmax; i++)
4628 {
4629 if (Feptr >= mb->end_subject)
4630 {
4631 SCHECK_PARTIAL();
4632 break;
4633 }
4634 if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0)
4635 break;
4636 Feptr++;
4637 }
4638 break;
4639
4640 case OP_WORDCHAR:
4641 for (i = Lmin; i < Lmax; i++)
4642 {
4643 if (Feptr >= mb->end_subject)
4644 {
4645 SCHECK_PARTIAL();
4646 break;
4647 }
4648 if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0)
4649 break;
4650 Feptr++;
4651 }
4652 break;
4653
4654 default:
4655 return PCRE2_ERROR_INTERNAL;
4656 }
4657
4658 if (reptype == REPTYPE_POS) continue; /* No backtracking */
4659
4660 for (;;)
4661 {
4662 if (Feptr == Lstart_eptr) break;
4663 RMATCH(Fecode, RM34);
4664 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4665 Feptr--;
4666 if (Lctype == OP_ANYNL && Feptr > Lstart_eptr && *Feptr == CHAR_LF &&
4667 Feptr[-1] == CHAR_CR) Feptr--;
4668 }
4669 }
4670 }
4671 break; /* End of repeat character type processing */
4672
4673 #undef Lstart_eptr
4674 #undef Lmin
4675 #undef Lmax
4676 #undef Lctype
4677 #undef Lpropvalue
4678
4679
4680 /* ===================================================================== */
4681 /* Match a back reference, possibly repeatedly. Look past the end of the
4682 item to see if there is repeat information following. The OP_REF and
4683 OP_REFI opcodes are used for a reference to a numbered group or to a
4684 non-duplicated named group. For a duplicated named group, OP_DNREF and
4685 OP_DNREFI are used. In this case we must scan the list of groups to which
4686 the name refers, and use the first one that is set. */
4687
/* Frame-local scratch fields used by the back reference handlers. Each name
is an alias for a temporary slot in the current frame F:
  Lmin       minimum repeat count (the minimizing loop also uses it as the
             running count of copies matched so far)
  Lmax       maximum repeat count
  Lcaseless  non-zero when the opcode is one of the caseless variants
  Lstart     subject position at which a maximizing repeat began
  Loffset    index in Fovector of the referenced group's start offset
The aliases are removed again by the #undef lines after the handlers. */
4688 #define Lmin F->temp_32[0]
4689 #define Lmax F->temp_32[1]
4690 #define Lcaseless F->temp_32[2]
4691 #define Lstart F->temp_sptr[0]
4692 #define Loffset F->temp_size
4693
/* Reference to a duplicated group name. Two operands are read with GET2:
the one at offset 1 selects the first relevant entry in mb->name_table
(consecutive entries are mb->name_entry_size apart) and the one at offset
1+IMM2_SIZE is the number of entries to examine. Each entry starts with a
group number, which is turned into the index of that group's start offset
in Fovector (number*2 - 2). The scan stops at the first group whose index is
below Foffset_top and whose start offset is not PCRE2_UNSET. If no entry
qualifies, Loffset keeps the last index tried and the unset case is left to
the code at REF_REPEAT. */
4694 case OP_DNREF:
4695 case OP_DNREFI:
4696 Lcaseless = (Fop == OP_DNREFI);
4697 {
4698 int count = GET2(Fecode, 1+IMM2_SIZE);
4699 PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;
/* Step over the opcode and both operands before scanning the name table. */
4700 Fecode += 1 + 2*IMM2_SIZE;
4701
4702 while (count-- > 0)
4703 {
4704 Loffset = (GET2(slot, 0) << 1) - 2;
4705 if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET) break;
4706 slot += mb->name_entry_size;
4707 }
4708 }
/* Continue with the repeat handling shared with OP_REF and OP_REFI. */
4709 goto REF_REPEAT;
4710
/* Reference to a single group. The operand at offset 1 is the group number;
Loffset becomes the index of that group's start offset in Fovector
(number*2 - 2) and Fecode is moved past the opcode and its operand. Control
then falls into REF_REPEAT, which the OP_DNREF/OP_DNREFI handler also reaches
by goto. */
4711 case OP_REF:
4712 case OP_REFI:
4713 Lcaseless = (Fop == OP_REFI);
4714 Loffset = (GET2(Fecode, 1) << 1) - 2;
4715 Fecode += 1 + IMM2_SIZE;
4716
4717 /* Set up for repetition, or handle the non-repeated case. The maximum and
4718 minimum must be in the heap frame, but as they are short-term values, we
4719 use temporary fields. */
4720
4721 REF_REPEAT:
4722 switch (*Fecode)
4723 {
/* Single-opcode quantifiers: the distance from OP_CRSTAR indexes the
rep_min, rep_max and rep_typ tables, and Fecode moves past the opcode. */
4724 case OP_CRSTAR:
4725 case OP_CRMINSTAR:
4726 case OP_CRPLUS:
4727 case OP_CRMINPLUS:
4728 case OP_CRQUERY:
4729 case OP_CRMINQUERY:
4730 fc = *Fecode++ - OP_CRSTAR;
4731 Lmin = rep_min[fc];
4732 Lmax = rep_max[fc];
4733 reptype = rep_typ[fc];
4734 break;
4735
/* Explicit range: the minimum and maximum follow the opcode as two GET2
operands, and the repeat type again comes from rep_typ. */
4736 case OP_CRRANGE:
4737 case OP_CRMINRANGE:
4738 Lmin = GET2(Fecode, 1);
4739 Lmax = GET2(Fecode, 1 + IMM2_SIZE);
4740 reptype = rep_typ[*Fecode - OP_CRSTAR];
4741 if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */
4742 Fecode += 1 + 2 * IMM2_SIZE;
4743 break;
4744
/* match_ref() is defined elsewhere. As used here, a zero result means the
reference matched and "length" is then used as the matched length; a
positive result is treated as a partial match, so Feptr is moved to the end
of the subject before CHECK_PARTIAL; any non-zero result ends in NOMATCH. */
4745 default: /* No repeat follows */
4746 {
4747 rrc = match_ref(Loffset, Lcaseless, F, mb, &length);
4748 if (rrc != 0)
4749 {
4750 if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
4751 CHECK_PARTIAL();
4752 RRETURN(MATCH_NOMATCH);
4753 }
4754 }
4755 Feptr += length;
4756 continue; /* With the main loop */
4757 }
4758
4759 /* Handle repeated back references. If a set group has length zero, just
4760 continue with the main loop, because it matches however many times. For an
4761 unset reference, if the minimum is zero, we can also just continue. We can
4762 also continue if PCRE2_MATCH_UNSET_BACKREF is set, because this makes an
4763 unset group behave as a zero-length group. For any other unset cases,
4764 carrying on will result in NOMATCH. */
4765
4766 if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET)
4767 {
4768 if (Fovector[Loffset] == Fovector[Loffset + 1]) continue;
4769 }
4770 else /* Group is not set */
4771 {
4772 if (Lmin == 0 || (mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)
4773 continue;
4774 }
4775
4776 /* First, ensure the minimum number of matches are present. */
4777
/* The loop runs Lmin times; a failure on any copy fails the whole item, with
the same partial-match handling as the non-repeated case. */
4778 for (i = 1; i <= Lmin; i++)
4779 {
4780 PCRE2_SIZE slength;
4781 rrc = match_ref(Loffset, Lcaseless, F, mb, &slength);
4782 if (rrc != 0)
4783 {
4784 if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
4785 CHECK_PARTIAL();
4786 RRETURN(MATCH_NOMATCH);
4787 }
4788 Feptr += slength;
4789 }
4790
4791 /* If min = max, we are done. They are not both allowed to be zero. */
4792
4793 if (Lmin == Lmax) continue;
4794
4795 /* If minimizing, keep trying and advancing the pointer. */
4796
4797 if (reptype == REPTYPE_MIN)
4798 {
4799 for (;;)
4800 {
4801 PCRE2_SIZE slength;
/* Try the rest of the pattern before consuming another copy; any result
other than MATCH_NOMATCH is passed straight back. Lmin doubles as the count
of copies matched so far and is compared with Lmax before each extra copy. */
4802 RMATCH(Fecode, RM20);
4803 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4804 if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
4805 rrc = match_ref(Loffset, Lcaseless, F, mb, &slength);
4806 if (rrc != 0)
4807 {
4808 if (rrc > 0) Feptr = mb->end_subject; /* Partial match */
4809 CHECK_PARTIAL();
4810 RRETURN(MATCH_NOMATCH);
4811 }
4812 Feptr += slength;
4813 }
4814 /* Control never gets here */
4815 }
4816
4817 /* If maximizing, find the longest string and work backwards, as long as
4818 the matched lengths for each iteration are the same. */
4819
4820 else
4821 {
4822 BOOL samelengths = TRUE;
4823 Lstart = Feptr; /* Starting position */
/* Flength is the length of the captured group, taken from its start and end
offsets in Fovector. */
4824 Flength = Fovector[Loffset+1] - Fovector[Loffset];
4825
/* Consume further copies until one fails to match or Lmax is reached. */
4826 for (i = Lmin; i < Lmax; i++)
4827 {
4828 PCRE2_SIZE slength;
4829 rrc = match_ref(Loffset, Lcaseless, F, mb, &slength);
4830 if (rrc != 0)
4831 {
4832 /* Can't use CHECK_PARTIAL because we don't want to update Feptr in
4833 the soft partial matching case. */
4834
/* Record that the end of the subject was reached; when mb->partial is
greater than 1 the match is abandoned at once with PCRE2_ERROR_PARTIAL. */
4835 if (rrc > 0 && mb->partial != 0 &&
4836 mb->end_subject > mb->start_used_ptr)
4837 {
4838 mb->hitend = TRUE;
4839 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
4840 }
4841 break;
4842 }
4843
4844 if (slength != Flength) samelengths = FALSE;
4845 Feptr += slength;
4846 }
4847
4848 /* If the length matched for each repetition is the same as the length of
4849 the captured group, we can easily work backwards. This is the normal
4850 case. However, in caseless UTF-8 mode there are pairs of case-equivalent
4851 characters whose lengths (in terms of code units) differ. As this
4852 is very rare, we handle it by re-matching fewer and fewer times. */
4853
4854 if (samelengths)
4855 {
/* Try the rest of the pattern at the current position, then give back one
copy (Flength units) at a time; the loop ends once Feptr is below Lstart. */
4856 while (Feptr >= Lstart)
4857 {
4858 RMATCH(Fecode, RM21);
4859 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4860 Feptr -= Flength;
4861 }
4862 }
4863
4864 /* The rare case of non-matching lengths. Re-scan the repetition for each
4865 iteration. We know that match_ref() will succeed every time. */
4866
4867 else
4868 {
/* i started at Lmin and was incremented once per extra copy matched above,
so Lmax now holds the total number of copies present. Each pass tries the
rest of the pattern, then rewinds to Lstart and re-matches one copy fewer. */
4869 Lmax = i;
4870 for (;;)
4871 {
4872 RMATCH(Fecode, RM22);
4873 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4874 if (Feptr == Lstart) break; /* Failed after minimal repetition */
4875 Feptr = Lstart;
4876 Lmax--;
4877 for (i = Lmin; i < Lmax; i++)
4878 {
4879 PCRE2_SIZE slength;
4880 (void)match_ref(Loffset, Lcaseless, F, mb, &slength);
4881 Feptr += slength;
4882 }
4883 }
4884 }
4885
/* Every candidate position has been tried without success. */
4886 RRETURN(MATCH_NOMATCH);
4887 }
4888 /* Control never gets here */
4889
4890 #undef Lcaseless
4891 #undef Lmin
4892 #undef Lmax
4893 #undef Lstart
4894 #undef Loffset
4895
4896
4897
4898 /* ========================================================================= */
4899 /* Opcodes for the start of various parenthesized items */
4900 /* ========================================================================= */
4901
4902 /* In all cases, if the result of RMATCH() is MATCH_THEN, check whether the
4903 (*THEN) is within the current branch by comparing the address of OP_THEN
4904 that is passed back with the end of the branch. If (*THEN) is within the
4905 current branch, and the branch is one of two or more alternatives (it
4906 either starts or ends with OP_ALT), we have reached the limit of THEN's
4907 action, so convert the return code to NOMATCH, which will cause normal
4908 backtracking to happen from now on. Otherwise, THEN is passed back to an
4909 outer alternative. This implements Perl's treatment of parenthesized
4910 groups, where a group not containing | does not affect the current
4911 alternative, that is, (X) is NOT the same as (X|(*F)). */
4912
4913
4914 /* ===================================================================== */
4915 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a non-possessive
4916 bracket group, indicating that it may occur zero times. It may repeat
4917 infinitely, or not at all - i.e. it could be ()* or ()? or even (){0} in
4918 the pattern. Brackets with fixed upper repeat limits are compiled as a
4919 number of copies, with the optional ones preceded by BRAZERO or BRAMINZERO.
4920 Possessive groups with possible zero repeats are preceded by BRAPOSZERO. */
4921
/* Lnext_ecode is a frame-local code pointer (a temporary slot in the current
frame F) used only by the two handlers below. */
4922 #define Lnext_ecode F->temp_sptr[0]
4923
/* OP_BRAZERO: try the group first. RMATCH (defined elsewhere) is used to
attempt a match starting at the code just after this opcode, with the outcome
read from rrc; anything other than MATCH_NOMATCH is returned. Otherwise the
group is skipped: the value read by GET at offset 1 is added repeatedly while
the opcode reached is OP_ALT, which leaves Lnext_ecode at the item that ends
the chain (presumably the group's closing bracket), and matching continues
1 + LINK_SIZE beyond it. */
4924 case OP_BRAZERO:
4925 Lnext_ecode = Fecode + 1;
4926 RMATCH(Lnext_ecode, RM9);
4927 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4928 do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT);
4929 Fecode = Lnext_ecode + 1 + LINK_SIZE;
4930 break;
4931
/* OP_BRAMINZERO: the same two choices in the opposite order. The end of the
group is located first and the code after it is tried; only when that gives
MATCH_NOMATCH does matching step into the group by advancing Fecode past this
opcode. */
4932 case OP_BRAMINZERO:
4933 Lnext_ecode = Fecode + 1;
4934 do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT);
4935 RMATCH(Lnext_ecode + 1 + LINK_SIZE, RM10);
4936 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4937 Fecode++;
4938 break;
4939
4940 #undef Lnext_ecode
4941
/* SKIPZERO: bypass the following group without attempting it. Step onto the
group's opening item, follow the link at offset 1 through each OP_ALT to the
item that ends the group, then move 1 + LINK_SIZE code units past that item.
No RMATCH() is issued, so no backtracking point is created here. */
4942 case OP_SKIPZERO:
4943 Fecode++;
4944 do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT);
4945 Fecode += 1 + LINK_SIZE;
4946 break;
4947
4948
4949 /* ===================================================================== */
4950 /* Handle possessive brackets with an unlimited repeat. The end of these
4951 brackets will always be OP_KETRPOS, which returns MATCH_KETRPOS without
4952 going further in the pattern. */
4953
/* Frame-resident working values for the possessive group code below:
  Lframe_type    group frame type handed to each RMATCH() via group_frame_type
  Lmatched_once  set once any iteration has returned MATCH_KETRPOS
  Lzero_allowed  TRUE only when entered through OP_BRAPOSZERO
  Lstart_eptr    subject position at the start of the current iteration
  Lstart_group   code pointer of the group's opening item */
4954 #define Lframe_type F->temp_32[0]
4955 #define Lmatched_once F->temp_32[1]
4956 #define Lzero_allowed F->temp_32[2]
4957 #define Lstart_eptr F->temp_sptr[0]
4958 #define Lstart_group F->temp_sptr[1]
4959
/* BRAPOSZERO precedes a possessive group that may match zero times. Step onto
the group's own opcode and join the capturing or non-capturing entry point,
bypassing the statements that would reset Lzero_allowed to FALSE. */
4960 case OP_BRAPOSZERO:
4961 Lzero_allowed = TRUE; /* Zero repeat is allowed */
4962 Fecode += 1;
4963 if (*Fecode == OP_CBRAPOS || *Fecode == OP_SCBRAPOS)
4964 goto POSSESSIVE_CAPTURE;
4965 goto POSSESSIVE_NON_CAPTURE;
4966
4967 case OP_BRAPOS:
4968 case OP_SBRAPOS:
4969 Lzero_allowed = FALSE; /* Zero repeat not allowed */
4970
4971 POSSESSIVE_NON_CAPTURE:
4972 Lframe_type = GF_NOCAPTURE; /* Remembered frame type */
4973 goto POSSESSIVE_GROUP;
4974
4975 case OP_CBRAPOS:
4976 case OP_SCBRAPOS:
4977 Lzero_allowed = FALSE; /* Zero repeat not allowed */
4978
/* The capture number is read from the 2-unit field that follows the link. */
4979 POSSESSIVE_CAPTURE:
4980 number = GET2(Fecode, 1+LINK_SIZE);
4981 Lframe_type = GF_CAPTURE | number; /* Remembered frame type */
4982
4983 POSSESSIVE_GROUP:
4984 Lmatched_once = FALSE; /* Never matched */
4985 Lstart_group = Fecode; /* Start of this group */
4986
/* Each pass of this loop tries one branch. A MATCH_KETRPOS result means the
branch reached the group's closing item: either stop (empty iteration) or
restart from the first branch for another iteration. A MATCH_NOMATCH result
moves on to the next alternative, leaving the loop when there is none. */
4987 for (;;)
4988 {
4989 Lstart_eptr = Feptr; /* Position at group start */
4990 group_frame_type = Lframe_type;
4991 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM8);
4992 if (rrc == MATCH_KETRPOS)
4993 {
4994 Lmatched_once = TRUE; /* Matched at least once */
4995 if (Feptr == Lstart_eptr) /* Empty match; skip to end */
4996 {
4997 do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
4998 break;
4999 }
5000
5001 Fecode = Lstart_group;
5002 continue;
5003 }
5004
5005 /* See comment above about handling THEN. */
5006
5007 if (rrc == MATCH_THEN)
5008 {
5009 PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1);
5010 if (mb->verb_ecode_ptr < next_ecode &&
5011 (*Fecode == OP_ALT || *next_ecode == OP_ALT))
5012 rrc = MATCH_NOMATCH;
5013 }
5014
5015 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5016 Fecode += GET(Fecode, 1);
5017 if (*Fecode != OP_ALT) break;
5018 }
5019
5020 /* Success if matched something or zero repeat allowed */
5021
/* On both exits from the loop Fecode is on the item that ends the group, so
adding 1 + LINK_SIZE continues with whatever follows the group. */
5022 if (Lmatched_once || Lzero_allowed)
5023 {
5024 Fecode += 1 + LINK_SIZE;
5025 break;
5026 }
5027
5028 RRETURN(MATCH_NOMATCH);
5029
5030 #undef Lmatched_once
5031 #undef Lzero_allowed
5032 #undef Lframe_type
5033 #undef Lstart_eptr
5034 #undef Lstart_group
5035
5036
5037 /* ===================================================================== */
5038 /* Handle non-capturing brackets that cannot match an empty string. When we
5039 get to the final alternative within the brackets, as long as there are no
5040 THEN's in the pattern, we can optimize by not recording a new backtracking
5041 point. (Ideally we should test for a THEN within this group, but we don't
5042 have that information.) Don't do this if we are at the very top level,
5043 however, because that would make handling assertions and once-only brackets
5044 messier when there is nothing to go back to. */
5045
/* Note that Lframe_type stays defined beyond this case: it is also used by
the cases that follow and is only undefined after GROUPLOOP. */
5046 #define Lframe_type F->temp_32[0] /* Set for all that use GROUPLOOP */
5047 #define Lnext_branch F->temp_sptr[0] /* Used only in OP_BRA handling */
5048
/* OP_BRA: when the pattern contains a THEN (mb->hasthen) or this frame is at
depth 0, hand over to the general GROUPLOOP code with frame type 0. Otherwise
use the cheaper loop below, which issues RMATCH() for every branch except the
last and runs the last branch at this level. */
5049 case OP_BRA:
5050 if (mb->hasthen || Frdepth == 0)
5051 {
5052 Lframe_type = 0;
5053 goto GROUPLOOP;
5054 }
5055
5056 for (;;)
5057 {
5058 Lnext_branch = Fecode + GET(Fecode, 1);
5059 if (*Lnext_branch != OP_ALT) break;
5060
5061 /* This is never the final branch. We do not need to test for MATCH_THEN
5062 here because this code is not used when there is a THEN in the pattern. */
5063
5064 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM1);
5065 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5066 Fecode = Lnext_branch;
5067 }
5068
5069 /* Hit the start of the final branch. Continue at this level. */
5070
5071 Fecode += PRIV(OP_lengths)[*Fecode];
5072 break;
5073
5074 #undef Lnext_branch
5075
5076
5077 /* ===================================================================== */
5078 /* Handle a capturing bracket, other than those that are possessive with an
5079 unlimited repeat. */
5080
/* Capturing group: build the frame type from GF_CAPTURE and the group number
(the 2-unit field after the link), then share the generic branch loop. */
5081 case OP_CBRA:
5082 case OP_SCBRA:
5083 Lframe_type = GF_CAPTURE | GET2(Fecode, 1+LINK_SIZE);
5084 goto GROUPLOOP;
5085
5086
5087 /* ===================================================================== */
5088 /* Atomic groups and non-capturing brackets that can match an empty string
5089 must record a backtracking point and also set up a chained frame. */
5090
/* For these opcodes the frame type records the opcode itself (Fop) combined
with GF_NOCAPTURE. */
5091 case OP_ONCE:
5092 case OP_SCRIPT_RUN:
5093 case OP_SBRA:
5094 Lframe_type = GF_NOCAPTURE | Fop;
5095
/* Generic branch loop, also entered by goto from OP_BRA and OP_CBRA/OP_SCBRA.
Every branch, including the last, is tried through RMATCH() with
group_frame_type set from Lframe_type. This case never continues at the
current level: it always leaves through RRETURN(), either passing back a
result other than MATCH_NOMATCH or reporting MATCH_NOMATCH once no OP_ALT
follows. */
5096 GROUPLOOP:
5097 for (;;)
5098 {
5099 group_frame_type = Lframe_type;
5100 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM2);
/* A THEN raised inside this branch is downgraded to NOMATCH only if the
branch is one of several alternatives (it starts with, or is followed by,
OP_ALT); otherwise it is passed back unchanged. */
5101 if (rrc == MATCH_THEN)
5102 {
5103 PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1);
5104 if (mb->verb_ecode_ptr < next_ecode &&
5105 (*Fecode == OP_ALT || *next_ecode == OP_ALT))
5106 rrc = MATCH_NOMATCH;
5107 }
5108 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5109 Fecode += GET(Fecode, 1);
5110 if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH);
5111 }
5112 /* Control never reaches here. */
5113
5114 #undef Lframe_type
5115
5116
5117 /* ===================================================================== */
5118 /* Recursion either matches the current regex, or some subexpression. The
5119 offset data is the offset to the starting bracket from the start of the
5120 whole pattern. (This is so that it works from duplicated subpatterns.) */
5121
/* Frame-resident values: the frame type passed to each RMATCH(), and the
start of the branch of the called group that is currently being tried. */
5122 #define Lframe_type F->temp_32[0]
5123 #define Lstart_branch F->temp_sptr[0]
5124
/* bracode is the opening item of the called group, located by an offset from
mb->start_code. An offset of zero (bracode == mb->start_code) denotes group
number 0; otherwise the number is read from the group's own code. */
5125 case OP_RECURSE:
5126 bracode = mb->start_code + GET(Fecode, 1);
5127 number = (bracode == mb->start_code)? 0 : GET2(bracode, 1 + LINK_SIZE);
5128
5129 /* If we are already in a recursion, check for repeating the same one
5130 without advancing the subject pointer. This should catch convoluted mutual
5131 recursions. (Some simple cases are caught at compile time.) */
5132
/* The walk follows the chain of group frames through last_group_offset. N is
the frame at the chained offset and P the frame immediately before it
(N - frame_size). The search stops at the first frame whose type is a
recursion into the same group number; if the subject pointer recorded in P
equals the current one, the function returns PCRE2_ERROR_RECURSELOOP
directly (a plain return, not RRETURN). */
5133 if (Fcurrent_recurse != RECURSE_UNSET)
5134 {
5135 offset = Flast_group_offset;
5136 while (offset != PCRE2_UNSET)
5137 {
5138 N = (heapframe *)((char *)mb->match_frames + offset);
5139 P = (heapframe *)((char *)N - frame_size);
5140 if (N->group_frame_type == (GF_RECURSE | number))
5141 {
5142 if (Feptr == P->eptr) return PCRE2_ERROR_RECURSELOOP;
5143 break;
5144 }
5145 offset = P->last_group_offset;
5146 }
5147 }
5148
5149 /* Now run the recursion, branch by branch. */
5150
5151 Lstart_branch = bracode;
5152 Lframe_type = GF_RECURSE | number;
5153
5154 for (;;)
5155 {
5156 PCRE2_SPTR next_ecode;
5157
5158 group_frame_type = Lframe_type;
5159 RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM11);
/* next_ecode is assigned only after RMATCH() and is recomputed from the
frame-resident Lstart_branch. */
5160 next_ecode = Lstart_branch + GET(Lstart_branch,1);
5161
5162 /* Handle backtracking verbs, which are defined in a range that can
5163 easily be tested for. PCRE does not allow THEN, SKIP, PRUNE or COMMIT to
5164 escape beyond a recursion; they cause a NOMATCH for the entire recursion.
5165
5166 When one of these verbs triggers, the current recursion group number is
5167 recorded. If it matches the recursion we are processing, the verb
5168 happened within the recursion and we must deal with it. Otherwise it must
5169 have happened after the recursion completed, and so has to be passed
5170 back. See comment above about handling THEN. */
5171
/* Lframe_type ^ GF_RECURSE removes the GF_RECURSE bits that were OR-ed in
above, leaving the group number for comparison with the recorded one. */
5172 if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX &&
5173 mb->verb_current_recurse == (Lframe_type ^ GF_RECURSE))
5174 {
5175 if (rrc == MATCH_THEN && mb->verb_ecode_ptr < next_ecode &&
5176 (*Lstart_branch == OP_ALT || *next_ecode == OP_ALT))
5177 rrc = MATCH_NOMATCH;
5178 else RRETURN(MATCH_NOMATCH);
5179 }
5180
5181 /* Note that carrying on after (*ACCEPT) in a recursion is handled in the
5182 OP_ACCEPT code. Nothing needs to be done here. */
5183
5184 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5185 Lstart_branch = next_ecode;
5186 if (*Lstart_branch != OP_ALT) RRETURN(MATCH_NOMATCH);
5187 }
5188 /* Control never reaches here. */
5189
5190 #undef Lframe_type
5191 #undef Lstart_branch
5192
5193
5194 /* ===================================================================== */
5195 /* Positive assertions are like other groups except that PCRE doesn't allow
5196 the effect of (*THEN) to escape beyond an assertion; it is therefore
5197 treated as NOMATCH. (*ACCEPT) is treated as successful assertion, with its
5198 captures and mark retained. Any other return is an error. */
5199
5200 #define Lframe_type F->temp_32[0]
5201
/* Positive lookahead/lookbehind, atomic and non-atomic. Each branch is tried
through RMATCH() with a frame type that records the assertion opcode. This
loop is left only via MATCH_ACCEPT; MATCH_NOMATCH and MATCH_THEN move to the
next branch (failing the assertion if there is none); every other result is
passed back with RRETURN(). */
5202 case OP_ASSERT:
5203 case OP_ASSERTBACK:
5204 case OP_ASSERT_NA:
5205 case OP_ASSERTBACK_NA:
5206 Lframe_type = GF_NOCAPTURE | Fop;
5207 for (;;)
5208 {
5209 group_frame_type = Lframe_type;
5210 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM3);
/* On ACCEPT, take over the captures, capture count and mark held in
assert_accept_frame (set outside this section). */
5211 if (rrc == MATCH_ACCEPT)
5212 {
5213 memcpy(Fovector,
5214 (char *)assert_accept_frame + offsetof(heapframe, ovector),
5215 assert_accept_frame->offset_top * sizeof(PCRE2_SIZE));
5216 Foffset_top = assert_accept_frame->offset_top;
5217 Fmark = assert_accept_frame->mark;
5218 break;
5219 }
5220 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
5221 Fecode += GET(Fecode, 1);
5222 if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH);
5223 }
5224
/* Reached only after ACCEPT: skip any remaining alternatives and continue
1 + LINK_SIZE code units past the item that ends the assertion. */
5225 do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
5226 Fecode += 1 + LINK_SIZE;
5227 break;
5228
5229 #undef Lframe_type
5230
5231
5232 /* ===================================================================== */
5233 /* Handle negative assertions. Loop for each non-matching branch as for
5234 positive assertions. */
5235
5236 #define Lframe_type F->temp_32[0]
5237
/* Negative lookahead/lookbehind. The sense of the result is inverted: a
branch that matches (MATCH_MATCH or MATCH_ACCEPT) makes this case return
MATCH_NOMATCH, while running out of branches makes the assertion succeed and
matching continues after it. */
5238 case OP_ASSERT_NOT:
5239 case OP_ASSERTBACK_NOT:
5240 Lframe_type = GF_NOCAPTURE | Fop;
5241
5242 for (;;)
5243 {
5244 group_frame_type = Lframe_type;
5245 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM4);
5246 switch(rrc)
5247 {
5248 case MATCH_ACCEPT: /* Assertion matched, therefore it fails. */
5249 case MATCH_MATCH:
5250 RRETURN (MATCH_NOMATCH);
5251
/* The break below leaves only the switch, so the enclosing for loop goes
round again for the next alternative. */
5252 case MATCH_NOMATCH: /* Branch failed, try next if present. */
5253 case MATCH_THEN:
5254 Fecode += GET(Fecode, 1);
5255 if (*Fecode != OP_ALT) goto ASSERT_NOT_FAILED;
5256 break;
5257
/* These verbs end the assertion at once: the remaining alternatives are
skipped rather than tried. */
5258 case MATCH_COMMIT: /* Assertion forced to fail, therefore continue. */
5259 case MATCH_SKIP:
5260 case MATCH_PRUNE:
5261 do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
5262 goto ASSERT_NOT_FAILED;
5263
5264 default: /* Pass back any other return */
5265 RRETURN(rrc);
5266 }
5267 }
5268
5269 /* None of the branches have matched or there was a backtrack to (*COMMIT),
5270 (*SKIP), (*PRUNE), or (*THEN) in the last branch. This is success for a
5271 negative assertion, so carry on. */
5272
5273 ASSERT_NOT_FAILED:
5274 Fecode += 1 + LINK_SIZE;
5275 break;
5276
5277 #undef Lframe_type
5278
5279
5280 /* ===================================================================== */
5281 /* The callout item calls an external function, if one is provided, passing
5282 details of the match so far. This is mainly for debugging, though the
5283 function is able to force a failure. */
5284
/* do_callout() returns a status in rrc and writes a value into `length`,
which is then used as the distance to advance Fecode past the callout item.
A positive status fails this path with MATCH_NOMATCH, a negative status is
passed back unchanged, and zero lets matching continue. */
5285 case OP_CALLOUT:
5286 case OP_CALLOUT_STR:
5287 rrc = do_callout(F, mb, &length);
5288 if (rrc > 0) RRETURN(MATCH_NOMATCH);
5289 if (rrc < 0) RRETURN(rrc);
5290 Fecode += length;
5291 break;
5292
5293
5294 /* ===================================================================== */
5295 /* Conditional group: compilation checked that there are no more than two
5296 branches. If the condition is false, skipping the first branch takes us
5297 past the end of the item if there is only one branch, but that's exactly
5298 what we want. */
5299
/* Conditional group. The code below (1) computes Flength, the distance to add
when the condition is false, (2) runs an optional callout that sits in front
of the condition, (3) evaluates the condition into `condition`, and (4) moves
Fecode to the chosen branch. OP_SCOND then descends a level via RMATCH();
OP_COND continues at the current level. */
5300 case OP_COND:
5301 case OP_SCOND:
5302
5303 /* The variable Flength will be added to Fecode when the condition is
5304 false, to get to the second branch. Setting it to the offset to the ALT or
5305 KET, then incrementing Fecode achieves this effect. However, if the second
5306 branch is non-existent, we must point to the KET so that the end of the
5307 group is correctly processed. We now have Fecode pointing to the condition
5308 or callout. */
5309
5310 Flength = GET(Fecode, 1); /* Offset to the second branch */
5311 if (Fecode[Flength] != OP_ALT) Flength -= 1 + LINK_SIZE;
5312 Fecode += 1 + LINK_SIZE; /* From this opcode */
5313
5314 /* Because of the way auto-callout works during compile, a callout item is
5315 inserted between OP_COND and an assertion condition. Such a callout can
5316 also be inserted manually. */
5317
5318 if (*Fecode == OP_CALLOUT || *Fecode == OP_CALLOUT_STR)
5319 {
5320 rrc = do_callout(F, mb, &length);
5321 if (rrc > 0) RRETURN(MATCH_NOMATCH);
5322 if (rrc < 0) RRETURN(rrc);
5323
5324 /* Advance Fecode past the callout, so it now points to the condition. We
5325 must adjust Flength so that the value of Fecode+Flength is unchanged. */
5326
5327 Fecode += length;
5328 Flength -= length;
5329 }
5330
5331 /* Test the various possible conditions */
5332
/* condition starts FALSE; cases that do not assign it (OP_FALSE, OP_FAIL, and
the recursion tests when no recursion is active) therefore select the false
path. */
5333 condition = FALSE;
5334 switch(*Fecode)
5335 {
5336 case OP_RREF: /* Group recursion test */
5337 if (Fcurrent_recurse != RECURSE_UNSET)
5338 {
5339 number = GET2(Fecode, 1);
5340 condition = (number == RREF_ANY || number == Fcurrent_recurse);
5341 }
5342 break;
5343
/* The item holds an index into the name table and a count of entries; each
entry starts with a group number, and consecutive entries are
mb->name_entry_size code units apart. True as soon as one of them is the
group currently being recursed. */
5344 case OP_DNRREF: /* Duplicate named group recursion test */
5345 if (Fcurrent_recurse != RECURSE_UNSET)
5346 {
5347 int count = GET2(Fecode, 1 + IMM2_SIZE);
5348 PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;
5349 while (count-- > 0)
5350 {
5351 number = GET2(slot, 0);
5352 condition = number == Fcurrent_recurse;
5353 if (condition) break;
5354 slot += mb->name_entry_size;
5355 }
5356 }
5357 break;
5358
/* Group n occupies ovector slots 2n-2 and 2n-1. The group counts as set when
its first slot lies below Foffset_top and is not PCRE2_UNSET. */
5359 case OP_CREF: /* Numbered group used test */
5360 offset = (GET2(Fecode, 1) << 1) - 2; /* Doubled ref number */
5361 condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET;
5362 break;
5363
5364 case OP_DNCREF: /* Duplicate named group used test */
5365 {
5366 int count = GET2(Fecode, 1 + IMM2_SIZE);
5367 PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;
5368 while (count-- > 0)
5369 {
5370 offset = (GET2(slot, 0) << 1) - 2;
5371 condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET;
5372 if (condition) break;
5373 slot += mb->name_entry_size;
5374 }
5375 }
5376 break;
5377
5378 case OP_FALSE:
5379 case OP_FAIL: /* The assertion (?!) becomes OP_FAIL */
5380 break;
5381
5382 case OP_TRUE:
5383 condition = TRUE;
5384 break;
5385
5386 /* The condition is an assertion. Run code similar to the assertion code
5387 above. */
5388
5389 #define Lpositive F->temp_32[0]
5390 #define Lstart_branch F->temp_sptr[0]
5391
/* Fecode is left on the assertion's opening item while Lstart_branch walks
its branches, so the frame type below always carries the assertion opcode. */
5392 default:
5393 Lpositive = (*Fecode == OP_ASSERT || *Fecode == OP_ASSERTBACK);
5394 Lstart_branch = Fecode;
5395
5396 for (;;)
5397 {
5398 group_frame_type = GF_CONDASSERT | *Fecode;
5399 RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM5);
5400
5401 switch(rrc)
5402 {
5403 case MATCH_ACCEPT: /* Save captures */
5404 memcpy(Fovector,
5405 (char *)assert_accept_frame + offsetof(heapframe, ovector),
5406 assert_accept_frame->offset_top * sizeof(PCRE2_SIZE));
5407 Foffset_top = assert_accept_frame->offset_top;
5408
5409 /* Fall through */
5410 /* In the case of a match, the captures have already been put into
5411 the current frame. */
5412
5413 case MATCH_MATCH:
5414 condition = Lpositive; /* TRUE for positive assertion */
5415 break;
5416
5417 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
5418 assertion; it is therefore always treated as NOMATCH. */
5419
5420 case MATCH_NOMATCH:
5421 case MATCH_THEN:
5422 Lstart_branch += GET(Lstart_branch, 1);
5423 if (*Lstart_branch == OP_ALT) continue; /* Try next branch */
5424 condition = !Lpositive; /* TRUE for negative assertion */
5425 break;
5426
5427 /* These force no match without checking other branches. */
5428
5429 case MATCH_COMMIT:
5430 case MATCH_SKIP:
5431 case MATCH_PRUNE:
5432 condition = !Lpositive;
5433 break;
5434
5435 default:
5436 RRETURN(rrc);
5437 }
5438 break; /* Out of the branch loop */
5439 }
5440
5441 /* If the condition is true, find the end of the assertion so that
5442 advancing past it gets us to the start of the first branch. */
5443
5444 if (condition)
5445 {
5446 do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);
5447 }
5448 break; /* End of assertion condition */
5449 }
5450
5451 #undef Lpositive
5452 #undef Lstart_branch
5453
5454 /* Choose branch according to the condition. */
5455
/* True: step over the item Fecode is on (the condition item, or the end of
an assertion condition). False: add the precomputed Flength. */
5456 Fecode += condition? PRIV(OP_lengths)[*Fecode] : Flength;
5457
5458 /* If the opcode is OP_SCOND it means we are at a repeated conditional
5459 group that might match an empty string. We must therefore descend a level
5460 so that the start is remembered for checking. For OP_COND we can just
5461 continue at this level. */
5462
5463 if (Fop == OP_SCOND)
5464 {
5465 group_frame_type = GF_NOCAPTURE | Fop;
5466 RMATCH(Fecode, RM35);
5467 RRETURN(rrc);
5468 }
5469 break;
5470
5471
5472
5473 /* ========================================================================= */
5474 /* End of start of parenthesis opcodes */
5475 /* ========================================================================= */
5476
5477
5478 /* ===================================================================== */
5479 /* Move the subject pointer back. This occurs only at the start of each
5480 branch of a lookbehind assertion. If we are too close to the start to move
5481 back, fail. When working with UTF-8 we move back a number of characters,
5482 not bytes. */
5483
/* OP_REVERSE: move the subject pointer back by the count stored in the item,
failing with MATCH_NOMATCH if there is not enough subject before it. */
5484 case OP_REVERSE:
5485 number = GET(Fecode, 1);
5486 #ifdef SUPPORT_UNICODE
/* UTF mode: step back one character at a time. The lower bound tested here is
mb->check_subject, whereas the non-UTF path below measures the distance from
mb->start_subject. */
5487 if (utf)
5488 {
5489 while (number-- > 0)
5490 {
5491 if (Feptr <= mb->check_subject) RRETURN(MATCH_NOMATCH);
5492 Feptr--;
5493 BACKCHAR(Feptr);
5494 }
5495 }
5496 else
5497 #endif
5498
5499 /* No UTF-8 support, or not in UTF-8 mode: count is code unit count */
5500
/* This braced block is the body of the "else" above when SUPPORT_UNICODE is
defined, and an unconditional block otherwise. */
5501 {
5502 if ((ptrdiff_t)number > Feptr - mb->start_subject) RRETURN(MATCH_NOMATCH);
5503 Feptr -= number;
5504 }
5505
5506 /* Save the earliest consulted character, then skip to next opcode */
5507
5508 if (Feptr < mb->start_used_ptr) mb->start_used_ptr = Feptr;
5509 Fecode += 1 + LINK_SIZE;
5510 break;
5511
5512
5513 /* ===================================================================== */
5514 /* An alternation is the end of a branch; scan along to find the end of the
5515 bracketed group. */
5516
/* Reaching OP_ALT in the normal flow means the current branch has been
matched through to its end. Follow the links over all remaining alternatives;
Fecode is left on the first item that is not OP_ALT, which is then processed
by the next pass of the main loop. */
5517 case OP_ALT:
5518 do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT);
5519 break;
5520
5521
5522 /* ===================================================================== */
5523 /* The end of a parenthesized group. For all but OP_BRA and OP_COND, the
5524 starting frame was added to the chained frames in order to remember the
5525 starting subject position for the group. */
5526
/* End-of-group processing, in four stages: (1) find the group's opening item
and, for groups that recorded one, the frame chained at the group start;
(2) act according to the kind of group that is ending; (3) for OP_KETRPOS,
copy the current state back and return MATCH_KETRPOS; (4) for the other
repeating kets, try the continuation and the repeat in the appropriate order
unless the iteration matched nothing. */
5527 case OP_KET:
5528 case OP_KETRMIN:
5529 case OP_KETRMAX:
5530 case OP_KETRPOS:
5531
5532 bracode = Fecode - GET(Fecode, 1);
5533
5534 /* Point N to the frame at the start of the most recent group.
5535 Remember the subject pointer at the start of the group. */
5536
/* P is the frame immediately before N (N - frame_size); P->eptr is used below
as the subject position at the start of the group. The group is also removed
from this frame's chain by taking over P's last_group_offset. */
5537 if (*bracode != OP_BRA && *bracode != OP_COND)
5538 {
5539 N = (heapframe *)((char *)mb->match_frames + Flast_group_offset);
5540 P = (heapframe *)((char *)N - frame_size);
5541 Flast_group_offset = P->last_group_offset;
5542
5543 #ifdef DEBUG_SHOW_RMATCH
5544 fprintf(stderr, "++ KET for frame=%d type=%x prev char offset=%lu\n",
5545 N->rdepth, N->group_frame_type,
5546 (char *)P->eptr - (char *)mb->start_subject);
5547 #endif
5548
5549 /* If we are at the end of an assertion that is a condition, return a
5550 match, discarding any intermediate backtracking points. Copy back the
5551 mark setting and the captures into the frame before N so that they are
5552 set on return. Doing this for all assertions, both positive and negative,
5553 seems to match what Perl does. */
5554
5555 if (GF_IDMASK(N->group_frame_type) == GF_CONDASSERT)
5556 {
5557 memcpy((char *)P + offsetof(heapframe, ovector), Fovector,
5558 Foffset_top * sizeof(PCRE2_SIZE));
5559 P->offset_top = Foffset_top;
5560 P->mark = Fmark;
5561 Fback_frame = (char *)F - (char *)P;
5562 RRETURN(MATCH_MATCH);
5563 }
5564 }
5565 else P = NULL; /* Indicates starting frame not recorded */
5566
5567 /* The group was not a conditional assertion. */
5568
5569 switch (*bracode)
5570 {
5571 case OP_BRA: /* No need to do anything for these */
5572 case OP_COND:
5573 case OP_SCOND:
5574 break;
5575
5576 /* Non-atomic positive assertions are like OP_BRA, except that the
5577 subject pointer must be put back to where it was at the start of the
5578 assertion. */
5579
5580 case OP_ASSERT_NA:
5581 case OP_ASSERTBACK_NA:
5582 if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
5583 Feptr = P->eptr;
5584 break;
5585
5586 /* Atomic positive assertions are like OP_ONCE, except that in addition
5587 the subject pointer must be put back to where it was at the start of the
5588 assertion. */
5589
5590 case OP_ASSERT:
5591 case OP_ASSERTBACK:
5592 if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
5593 Feptr = P->eptr;
5594 /* Fall through */
5595
5596 /* For an atomic group, discard internal backtracking points. We must
5597 also ensure that any remaining branches within the top-level of the group
5598 are not tried. Do this by adjusting the code pointer within the backtrack
5599 frame so that it points to the final branch. */
5600
5601 case OP_ONCE:
5602 Fback_frame = ((char *)F - (char *)P);
5603 for (;;)
5604 {
5605 uint32_t y = GET(P->ecode,1);
5606 if ((P->ecode)[y] != OP_ALT) break;
5607 P->ecode += y;
5608 }
5609 break;
5610
5611 /* A matching negative assertion returns MATCH, which is turned into
5612 NOMATCH at the assertion level. */
5613
5614 case OP_ASSERT_NOT:
5615 case OP_ASSERTBACK_NOT:
5616 RRETURN(MATCH_MATCH);
5617
5618 /* At the end of a script run, apply the script-checking rules. This code
5619 will never be exercised if Unicode support is not compiled, because in
5620 that environment script runs cause an error at compile time. */
5621
5622 case OP_SCRIPT_RUN:
5623 if (!PRIV(script_run)(P->eptr, Feptr, utf)) RRETURN(MATCH_NOMATCH);
5624 break;
5625
5626 /* Whole-pattern recursion is coded as a recurse into group 0, so it
5627 won't be picked up here. Instead, we catch it when the OP_END is reached.
5628 Other recursion is handled here. */
5629
5630 case OP_CBRA:
5631 case OP_CBRAPOS:
5632 case OP_SCBRA:
5633 case OP_SCBRAPOS:
5634 number = GET2(bracode, 1+LINK_SIZE);
5635
5636 /* Handle a recursively called group. We reinstate the previous set of
5637 captures and then carry on after the recursion call. */
5638
/* The copy length is P->offset_top, the number of valid ovector entries in
the frame being restored from; it is the value Foffset_top is given on the
next line. Sizing the copy by the recursion's own Foffset_top would read
entries of P->ovector that lie beyond P's valid range whenever the recursion
had set additional captures. */
5639 if (Fcurrent_recurse == number)
5640 {
5641 P = (heapframe *)((char *)N - frame_size);
5642 memcpy((char *)F + offsetof(heapframe, ovector), P->ovector,
5643 P->offset_top * sizeof(PCRE2_SIZE));
5644 Foffset_top = P->offset_top;
5645 Fcapture_last = P->capture_last;
5646 Fcurrent_recurse = P->current_recurse;
5647 Fecode = P->ecode + 1 + LINK_SIZE;
5648 continue; /* With next opcode */
5649 }
5650
5651 /* Deal with actual capturing. */
5652
/* Group n uses ovector slots 2n-2 (start) and 2n-1 (end), both stored as
offsets from mb->start_subject. Foffset_top is raised when this group lies
at or beyond the current top. */
5653 offset = (number << 1) - 2;
5654 Fcapture_last = number;
5655 Fovector[offset] = P->eptr - mb->start_subject;
5656 Fovector[offset+1] = Feptr - mb->start_subject;
5657 if (offset >= Foffset_top) Foffset_top = offset + 2;
5658 break;
5659 } /* End actions relating to the starting opcode */
5660
5661 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
5662 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
5663 at a time from the outer level. This must precede the empty string test -
5664 in this case that test is done at the outer level. */
5665
/* frame_copy_size bytes, starting at the eptr member, are copied from the
current frame into P. */
5666 if (*Fecode == OP_KETRPOS)
5667 {
5668 memcpy((char *)P + offsetof(heapframe, eptr),
5669 (char *)F + offsetof(heapframe, eptr),
5670 frame_copy_size);
5671 RRETURN(MATCH_KETRPOS);
5672 }
5673
5674 /* Handle the different kinds of closing brackets. A non-repeating ket
5675 needs no special action, just continuing at this level. This also happens
5676 for the repeating kets if the group matched no characters, in order to
5677 forcibly break infinite loops. Otherwise, the repeating kets try the rest
5678 of the pattern or restart from the preceding bracket, in the appropriate
5679 order. */
5680
/* P == NULL means no starting frame was recorded (OP_BRA or OP_COND), so the
empty-iteration test cannot be made and the repeat is always attempted. */
5681 if (Fop != OP_KET && (P == NULL || Feptr != P->eptr))
5682 {
5683 if (Fop == OP_KETRMIN)
5684 {
5685 RMATCH(Fecode + 1 + LINK_SIZE, RM6);
5686 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5687 Fecode -= GET(Fecode, 1);
5688 break; /* End of ket processing */
5689 }
5690
5691 /* Repeat the maximum number of times (KETRMAX) */
5692
5693 RMATCH(bracode, RM7);
5694 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5695 }
5696
5697 /* Carry on at this level for a non-repeating ket, or after matching an
5698 empty string, or after repeating for a maximum number of times. */
5699
5700 Fecode += 1 + LINK_SIZE;
5701 break;
5702
5703
5704 /* ===================================================================== */
5705 /* Start and end of line assertions, not multiline mode. */
5706
/* Non-multiline start assertions. OP_CIRC fails unless the subject pointer is
at mb->start_subject and PCRE2_NOTBOL is not set in the match options;
OP_SOD tests the position only. Both advance one code unit when they succeed
and consume no subject characters. */
5707 case OP_CIRC: /* Start of line, unless PCRE2_NOTBOL is set. */
5708 if (Feptr != mb->start_subject || (mb->moptions & PCRE2_NOTBOL) != 0)
5709 RRETURN(MATCH_NOMATCH);
5710 Fecode++;
5711 break;
5712
5713 case OP_SOD: /* Unconditional start of subject */
5714 if (Feptr != mb->start_subject) RRETURN(MATCH_NOMATCH);
5715 Fecode++;
5716 break;
5717
5718 /* When PCRE2_NOTEOL is unset, assert before the subject end, or a
5719 terminating newline unless PCRE2_DOLLAR_ENDONLY is set. */
5720
/* OP_DOLL fails outright when PCRE2_NOTEOL is set. Without
PCRE2_DOLLAR_ENDONLY it jumps to the OP_EODN test; with it, control falls
into the OP_EOD test below. */
5721 case OP_DOLL:
5722 if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH);
5723 if ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0) goto ASSERT_NL_OR_EOS;
5724
5725 /* Fall through */
5726 /* Unconditional end of subject assertion (\z) */
5727
/* When partial matching is active (mb->partial != 0), reaching the end of the
subject sets mb->hitend, and for mb->partial > 1 the function returns
PCRE2_ERROR_PARTIAL directly rather than through RRETURN. */
5728 case OP_EOD:
5729 if (Feptr < mb->end_subject) RRETURN(MATCH_NOMATCH);
5730 if (mb->partial != 0)
5731 {
5732 mb->hitend = TRUE;
5733 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
5734 }
5735 Fecode++;
5736 break;
5737
5738 /* End of subject or ending \n assertion (\Z) */
5739
/* The assertion fails when there are characters left and the current position
is not a newline that ends exactly at the end of the subject. Within that
failure path a partial match is still noted when only one code unit remains,
the newline convention is a fixed two-unit sequence, and that unit equals the
sequence's first unit. */
5740 case OP_EODN:
5741 ASSERT_NL_OR_EOS:
5742 if (Feptr < mb->end_subject &&
5743 (!IS_NEWLINE(Feptr) || Feptr != mb->end_subject - mb->nllen))
5744 {
5745 if (mb->partial != 0 &&
5746 Feptr + 1 >= mb->end_subject &&
5747 NLBLOCK->nltype == NLTYPE_FIXED &&
5748 NLBLOCK->nllen == 2 &&
5749 UCHAR21TEST(Feptr) == NLBLOCK->nl[0])
5750 {
5751 mb->hitend = TRUE;
5752 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
5753 }
5754 RRETURN(MATCH_NOMATCH);
5755 }
5756
5757 /* Either at end of string or \n before end. */
5758
5759 if (mb->partial != 0)
5760 {
5761 mb->hitend = TRUE;
5762 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
5763 }
5764 Fecode++;
5765 break;
5766
5767
5768 /* ===================================================================== */
5769 /* Start and end of line assertions, multiline mode. */
5770
5771 /* Start of subject unless notbol, or after any newline except for one at
5772 the very end, unless PCRE2_ALT_CIRCUMFLEX is set. */
5773
/* Multiline circumflex. Two rejection tests: (1) at the start of the subject
with PCRE2_NOTBOL set; (2) away from the start of the subject, either at the
very end of the subject without PCRE2_ALT_CIRCUMFLEX, or at a position that
is not preceded by a newline. */
5774 case OP_CIRCM:
5775 if ((mb->moptions & PCRE2_NOTBOL) != 0 && Feptr == mb->start_subject)
5776 RRETURN(MATCH_NOMATCH);
5777 if (Feptr != mb->start_subject &&
5778 ((Feptr == mb->end_subject &&
5779 (mb->poptions & PCRE2_ALT_CIRCUMFLEX) == 0) ||
5780 !WAS_NEWLINE(Feptr)))
5781 RRETURN(MATCH_NOMATCH);
5782 Fecode++;
5783 break;
5784
5785 /* Assert before any newline, or before end of subject unless noteol is
5786 set. */
5787
/* Multiline dollar. Before the end of the subject the position must be at a
newline; if it is not, the assertion fails, after first noting a partial
match when the single remaining code unit is the first unit of a fixed
two-unit newline. At the end of the subject the assertion fails only when
PCRE2_NOTEOL is set; otherwise SCHECK_PARTIAL() is invoked and matching
continues. */
5788 case OP_DOLLM:
5789 if (Feptr < mb->end_subject)
5790 {
5791 if (!IS_NEWLINE(Feptr))
5792 {
5793 if (mb->partial != 0 &&
5794 Feptr + 1 >= mb->end_subject &&
5795 NLBLOCK->nltype == NLTYPE_FIXED &&
5796 NLBLOCK->nllen == 2 &&
5797 UCHAR21TEST(Feptr) == NLBLOCK->nl[0])
5798 {
5799 mb->hitend = TRUE;
5800 if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
5801 }
5802 RRETURN(MATCH_NOMATCH);
5803 }
5804 }
5805 else
5806 {
5807 if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH);
5808 SCHECK_PARTIAL();
5809 }
5810 Fecode++;
5811 break;
5812
5813
5814 /* ===================================================================== */
5815 /* Start of match assertion */
5816
/* Succeeds only when the subject pointer is exactly mb->start_offset code
units beyond mb->start_subject. */
5817 case OP_SOM:
5818 if (Feptr != mb->start_subject + mb->start_offset) RRETURN(MATCH_NOMATCH);
5819 Fecode++;
5820 break;
5821
5822
5823 /* ===================================================================== */
5824 /* Reset the start of match point */
5825
/* Record the current subject position as the start of the match in this
frame. This cannot fail and consumes nothing. */
5826 case OP_SET_SOM:
5827 Fstart_match = Feptr;
5828 Fecode++;
5829 break;
5830
5831
5832 /* ===================================================================== */
5833 /* Word boundary assertions. Find out if the previous and current
5834 characters are "word" characters. It takes a bit more work in UTF mode.
5835 Characters > 255 are assumed to be "non-word" characters when PCRE2_UCP is
5836 not set. When it is set, use Unicode properties if available, even when not
5837 in UTF mode. Remember the earliest and latest consulted characters. */
5838
/* Word-boundary tests. prev_is_word describes the character before the
current position and cur_is_word the character at it; the final test compares
the two. */
5839 case OP_NOT_WORD_BOUNDARY:
5840 case OP_WORD_BOUNDARY:
/* Nothing precedes the position when it equals mb->check_subject, so the
previous character counts as non-word. */
5841 if (Feptr == mb->check_subject) prev_is_word = FALSE; else
5842 {
5843 PCRE2_SPTR lastptr = Feptr - 1;
5844 #ifdef SUPPORT_UNICODE
5845 if (utf)
5846 {
5847 BACKCHAR(lastptr);
5848 GETCHAR(fc, lastptr);
5849 }
5850 else
5851 #endif /* SUPPORT_UNICODE */
5852 fc = *lastptr;
5853 if (lastptr < mb->start_used_ptr) mb->start_used_ptr = lastptr;
5854 #ifdef SUPPORT_UNICODE
/* With PCRE2_UCP a word character is an underscore or any character whose
Unicode general category is L or N. */
5855 if ((mb->poptions & PCRE2_UCP) != 0)
5856 {
5857 if (fc == '_') prev_is_word = TRUE; else
5858 {
5859 int cat = UCD_CATEGORY(fc);
5860 prev_is_word = (cat == ucp_L || cat == ucp_N);
5861 }
5862 }
5863 else
5864 #endif /* SUPPORT_UNICODE */
/* Otherwise the ctypes table is consulted, and only for values that pass
CHMAX_255(). */
5865 prev_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0;
5866 }
5867
5868 /* Get status of next character */
5869
/* At the end of the subject there is no current character: it counts as
non-word, after SCHECK_PARTIAL() has been invoked. */
5870 if (Feptr >= mb->end_subject)
5871 {
5872 SCHECK_PARTIAL();
5873 cur_is_word = FALSE;
5874 }
5875 else
5876 {
/* nextptr is used only to update mb->last_used_ptr; the character itself is
read at Feptr. */
5877 PCRE2_SPTR nextptr = Feptr + 1;
5878 #ifdef SUPPORT_UNICODE
5879 if (utf)
5880 {
5881 FORWARDCHARTEST(nextptr, mb->end_subject);
5882 GETCHAR(fc, Feptr);
5883 }
5884 else
5885 #endif /* SUPPORT_UNICODE */
5886 fc = *Feptr;
5887 if (nextptr > mb->last_used_ptr) mb->last_used_ptr = nextptr;
5888 #ifdef SUPPORT_UNICODE
5889 if ((mb->poptions & PCRE2_UCP) != 0)
5890 {
5891 if (fc == '_') cur_is_word = TRUE; else
5892 {
5893 int cat = UCD_CATEGORY(fc);
5894 cur_is_word = (cat == ucp_L || cat == ucp_N);
5895 }
5896 }
5897 else
5898 #endif /* SUPPORT_UNICODE */
5899 cur_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0;
5900 }
5901
5902 /* Now see if the situation is what we want */
5903
/* The opcode is read and Fecode advanced in the same expression.
OP_WORD_BOUNDARY fails when both sides have the same status;
OP_NOT_WORD_BOUNDARY fails when they differ. */
5904 if ((*Fecode++ == OP_WORD_BOUNDARY)?
5905 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
5906 RRETURN(MATCH_NOMATCH);
5907 break;
5908
5909
5910 /* ===================================================================== */
5911 /* Backtracking (*VERB)s, with and without arguments. Note that if the
5912 pattern is successfully matched, we do not come back from RMATCH. */
5913
/* The mark name starts at Fecode + 2 and is recorded both in this frame
(Fmark) and in mb->nomatch_mark. Fecode[1] is added to the opcode's fixed
length to step over the whole item (so it presumably holds the name length -
confirm against the compiler). This case never continues at the current
level: every path ends in RRETURN(). */
5914 case OP_MARK:
5915 Fmark = mb->nomatch_mark = Fecode + 2;
5916 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM12);
5917
5918 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
5919 argument, and we must check whether that argument matches this MARK's
5920 argument. It is passed back in mb->verb_skip_ptr. If it does match, we
5921 return MATCH_SKIP with mb->verb_skip_ptr now pointing to the subject
5922 position that corresponds to this mark. Otherwise, pass back the return
5923 code unaltered. */
5924
5925 if (rrc == MATCH_SKIP_ARG &&
5926 PRIV(strcmp)(Fecode + 2, mb->verb_skip_ptr) == 0)
5927 {
5928 mb->verb_skip_ptr = Feptr; /* Pass back current position */
5929 RRETURN(MATCH_SKIP);
5930 }
5931 RRETURN(rrc);
5932
5933 case OP_FAIL:
5934 RRETURN(MATCH_NOMATCH);
5935
5936 /* Record the current recursing group number in mb->verb_current_recurse
5937 when a backtracking return such as MATCH_COMMIT is given. This enables the
5938 recurse processing to catch verbs from within the recursion. */
5939
5940 case OP_COMMIT:
5941 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM13);
5942 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5943 mb->verb_current_recurse = Fcurrent_recurse;
5944 RRETURN(MATCH_COMMIT);
5945
5946 case OP_COMMIT_ARG:
5947 Fmark = mb->nomatch_mark = Fecode + 2;
5948 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM36);
5949 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5950 mb->verb_current_recurse = Fcurrent_recurse;
5951 RRETURN(MATCH_COMMIT);
5952
5953 case OP_PRUNE:
5954 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM14);
5955 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5956 mb->verb_current_recurse = Fcurrent_recurse;
5957 RRETURN(MATCH_PRUNE);
5958
5959 case OP_PRUNE_ARG:
5960 Fmark = mb->nomatch_mark = Fecode + 2;
5961 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM15);
5962 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5963 mb->verb_current_recurse = Fcurrent_recurse;
5964 RRETURN(MATCH_PRUNE);
5965
5966 case OP_SKIP:
5967 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM16);
5968 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5969 mb->verb_skip_ptr = Feptr; /* Pass back current position */
5970 mb->verb_current_recurse = Fcurrent_recurse;
5971 RRETURN(MATCH_SKIP);
5972
5973 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
5974 nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
5975 not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
5976 that failed and any that precede it (either they also failed, or were not
5977 triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
5978 SKIP_ARG gets to top level, the match is re-run with mb->ignore_skip_arg
5979 set to the count of the one that failed. */
5980
5981 case OP_SKIP_ARG:
5982 mb->skip_arg_count++;
5983 if (mb->skip_arg_count <= mb->ignore_skip_arg)
5984 {
5985 Fecode += PRIV(OP_lengths)[*Fecode] + Fecode[1];
5986 break;
5987 }
5988 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM17);
5989 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5990
5991 /* Pass back the current skip name and return the special MATCH_SKIP_ARG
5992 return code. This will either be caught by a matching MARK, or get to the
5993 top, where it causes a rematch with mb->ignore_skip_arg set to the value of
5994 mb->skip_arg_count. */
5995
5996 mb->verb_skip_ptr = Fecode + 2;
5997 mb->verb_current_recurse = Fcurrent_recurse;
5998 RRETURN(MATCH_SKIP_ARG);
5999
6000 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
6001 the branch in which it occurs can be determined. */
6002
6003 case OP_THEN:
6004 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM18);
6005 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6006 mb->verb_ecode_ptr = Fecode;
6007 mb->verb_current_recurse = Fcurrent_recurse;
6008 RRETURN(MATCH_THEN);
6009
6010 case OP_THEN_ARG:
6011 Fmark = mb->nomatch_mark = Fecode + 2;
6012 RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM19);
6013 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6014 mb->verb_ecode_ptr = Fecode;
6015 mb->verb_current_recurse = Fcurrent_recurse;
6016 RRETURN(MATCH_THEN);
6017
6018
6019 /* ===================================================================== */
6020 /* There's been some horrible disaster. Arrival here can only mean there is
6021 something seriously wrong in the code above or the OP_xxx definitions. */
6022
6023 default:
6024 return PCRE2_ERROR_INTERNAL;
6025 }
6026
6027 /* Do not insert any code in here without much thought; it is assumed
6028 that "continue" in the code above comes out to here to repeat the main
6029 loop. */
6030
6031 } /* End of main loop */
6032 /* Control never reaches here */
6033
6034
6035 /* ========================================================================= */
6036 /* The RRETURN() macro jumps here. The number that is saved in Freturn_id
6037 indicates which label we actually want to return to. The value in Frdepth is
6038 the index number of the frame in the vector. The return value has been placed
6039 in rrc. */
6040
6041 #define LBL(val) case val: goto L_RM##val;
6042
6043 RETURN_SWITCH:
6044 if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
6045 if (Frdepth == 0) return rrc; /* Exit from the top level */
6046 F = (heapframe *)((char *)F - Fback_frame); /* Backtrack */
6047 mb->cb->callout_flags |= PCRE2_CALLOUT_BACKTRACK; /* Note for callouts */
6048
6049 #ifdef DEBUG_SHOW_RMATCH
6050 fprintf(stderr, "++ RETURN %d to %d\n", rrc, Freturn_id);
6051 #endif
6052
6053 switch (Freturn_id)
6054 {
6055 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6056 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16)
6057 LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24)
6058 LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32)
6059 LBL(33) LBL(34) LBL(35) LBL(36)
6060
6061 #ifdef SUPPORT_WIDE_CHARS
6062 LBL(100) LBL(101)
6063 #endif
6064
6065 #ifdef SUPPORT_UNICODE
6066 LBL(200) LBL(201) LBL(202) LBL(203) LBL(204) LBL(205) LBL(206)
6067 LBL(207) LBL(208) LBL(209) LBL(210) LBL(211) LBL(212) LBL(213)
6068 LBL(214) LBL(215) LBL(216) LBL(217) LBL(218) LBL(219) LBL(220)
6069 LBL(221) LBL(222)
6070 #endif
6071
6072 default:
6073 return PCRE2_ERROR_INTERNAL;
6074 }
6075 #undef LBL
6076 }
6077
6078
6079 /*************************************************
6080 * Match a Regular Expression *
6081 *************************************************/
6082
6083 /* This function applies a compiled pattern to a subject string and picks out
6084 portions of the string if it matches. Two elements in the vector are set for
6085 each substring: the offsets to the start and end of the substring.
6086
6087 Arguments:
6088 code points to the compiled expression
6089 subject points to the subject string
6090 length length of subject string (may contain binary zeros)
6091 start_offset where to start in the subject string
6092 options option bits
6093 match_data points to a match_data block
6094 mcontext points to a PCRE2 match context
6095
6096 Returns: > 0 => success; value is the number of ovector pairs filled
6097 = 0 => success, but ovector is not big enough
6098 = -1 => failed to match (PCRE2_ERROR_NOMATCH)
6099 = -2 => partial match (PCRE2_ERROR_PARTIAL)
6100 < -2 => some kind of unexpected problem
6101 */
6102
6103 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_match(const pcre2_code * code,PCRE2_SPTR subject,PCRE2_SIZE length,PCRE2_SIZE start_offset,uint32_t options,pcre2_match_data * match_data,pcre2_match_context * mcontext)6104 pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
6105 PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
6106 pcre2_match_context *mcontext)
6107 {
6108 int rc;
6109 int was_zero_terminated = 0;
6110 const uint8_t *start_bits = NULL;
6111 const pcre2_real_code *re = (const pcre2_real_code *)code;
6112
6113 BOOL anchored;
6114 BOOL firstline;
6115 BOOL has_first_cu = FALSE;
6116 BOOL has_req_cu = FALSE;
6117 BOOL startline;
6118
6119 #if PCRE2_CODE_UNIT_WIDTH == 8
6120 PCRE2_SPTR memchr_found_first_cu;
6121 PCRE2_SPTR memchr_found_first_cu2;
6122 #endif
6123
6124 PCRE2_UCHAR first_cu = 0;
6125 PCRE2_UCHAR first_cu2 = 0;
6126 PCRE2_UCHAR req_cu = 0;
6127 PCRE2_UCHAR req_cu2 = 0;
6128
6129 PCRE2_SPTR bumpalong_limit;
6130 PCRE2_SPTR end_subject;
6131 PCRE2_SPTR true_end_subject;
6132 PCRE2_SPTR start_match = subject + start_offset;
6133 PCRE2_SPTR req_cu_ptr = start_match - 1;
6134 PCRE2_SPTR start_partial;
6135 PCRE2_SPTR match_partial;
6136
6137 #ifdef SUPPORT_JIT
6138 BOOL use_jit;
6139 #endif
6140
6141 /* This flag is needed even when Unicode is not supported for convenience
6142 (it is used by the IS_NEWLINE macro). */
6143
6144 BOOL utf = FALSE;
6145
6146 #ifdef SUPPORT_UNICODE
6147 BOOL ucp = FALSE;
6148 BOOL allow_invalid;
6149 uint32_t fragment_options = 0;
6150 #ifdef SUPPORT_JIT
6151 BOOL jit_checked_utf = FALSE;
6152 #endif
6153 #endif /* SUPPORT_UNICODE */
6154
6155 PCRE2_SIZE frame_size;
6156
6157 /* We need to have mb as a pointer to a match block, because the IS_NEWLINE
6158 macro is used below, and it expects NLBLOCK to be defined as a pointer. */
6159
6160 pcre2_callout_block cb;
6161 match_block actual_match_block;
6162 match_block *mb = &actual_match_block;
6163
6164 /* Allocate an initial vector of backtracking frames on the stack. If this
6165 proves to be too small, it is replaced by a larger one on the heap. To get a
6166 vector of the size required that is aligned for pointers, allocate it as a
6167 vector of pointers. */
6168
6169 PCRE2_SPTR stack_frames_vector[START_FRAMES_SIZE/sizeof(PCRE2_SPTR)]
6170 PCRE2_KEEP_UNINITIALIZED;
6171 mb->stack_frames = (heapframe *)stack_frames_vector;
6172
6173 /* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated
6174 subject string. */
6175
6176 if (length == PCRE2_ZERO_TERMINATED)
6177 {
6178 length = PRIV(strlen)(subject);
6179 was_zero_terminated = 1;
6180 }
6181 true_end_subject = end_subject = subject + length;
6182
6183 /* Plausibility checks */
6184
6185 if ((options & ~PUBLIC_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
6186 if (code == NULL || subject == NULL || match_data == NULL)
6187 return PCRE2_ERROR_NULL;
6188 if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
6189
6190 /* Check that the first field in the block is the magic number. */
6191
6192 if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
6193
6194 /* Check the code unit width. */
6195
6196 if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
6197 return PCRE2_ERROR_BADMODE;
6198
6199 /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
6200 options variable for this function. Users of PCRE2 who are not calling the
6201 function directly would like to have a way of setting these flags, in the same
6202 way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with
6203 constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
6204 (*NOTEMPTY_ATSTART) set bits in the pattern's "flags" field which we now
6205 transfer to the options for this function. The bits are guaranteed to be
6206 adjacent, but do not have the same values. This bit of Boolean trickery assumes
6207 that the match-time bits are not more significant than the flag bits. If by
6208 accident this is not the case, a compile-time division by zero error will
6209 occur. */
6210
6211 #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
6212 #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
6213 options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
6214 #undef FF
6215 #undef OO
6216
6217 /* If the pattern was successfully studied with JIT support, we will run the
6218 JIT executable instead of the rest of this function. Most options must be set
6219 at compile time for the JIT code to be usable. */
6220
6221 #ifdef SUPPORT_JIT
6222 use_jit = (re->executable_jit != NULL &&
6223 (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0);
6224 #endif
6225
6226 /* Initialize UTF/UCP parameters. */
6227
6228 #ifdef SUPPORT_UNICODE
6229 utf = (re->overall_options & PCRE2_UTF) != 0;
6230 allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0;
6231 ucp = (re->overall_options & PCRE2_UCP) != 0;
6232 #endif /* SUPPORT_UNICODE */
6233
6234 /* Convert the partial matching flags into an integer. */
6235
6236 mb->partial = ((options & PCRE2_PARTIAL_HARD) != 0)? 2 :
6237 ((options & PCRE2_PARTIAL_SOFT) != 0)? 1 : 0;
6238
6239 /* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same
6240 time. */
6241
6242 if (mb->partial != 0 &&
6243 ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
6244 return PCRE2_ERROR_BADOPTION;
6245
6246 /* It is an error to set an offset limit without setting the flag at compile
6247 time. */
6248
6249 if (mcontext != NULL && mcontext->offset_limit != PCRE2_UNSET &&
6250 (re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
6251 return PCRE2_ERROR_BADOFFSETLIMIT;
6252
6253 /* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT,
6254 free the memory that was obtained. Set the field to NULL for no match cases. */
6255
6256 if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0)
6257 {
6258 match_data->memctl.free((void *)match_data->subject,
6259 match_data->memctl.memory_data);
6260 match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT;
6261 }
6262 match_data->subject = NULL;
6263
6264 /* Zero the error offset in case the first code unit is invalid UTF. */
6265
6266 match_data->startchar = 0;
6267
6268
6269 /* ============================= JIT matching ============================== */
6270
6271 /* Prepare for JIT matching. Check a UTF string for validity unless no check is
6272 requested or invalid UTF can be handled. We check only the portion of the
6273 subject that might be inspected during matching - from the offset minus the
6274 maximum lookbehind to the given length. This saves time when a small part of a
6275 large subject is being matched by the use of a starting offset. Note that the
6276 maximum lookbehind is a number of characters, not code units. */
6277
6278 #ifdef SUPPORT_JIT
6279 if (use_jit)
6280 {
6281 #ifdef SUPPORT_UNICODE
6282 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0 && !allow_invalid)
6283 {
6284 #if PCRE2_CODE_UNIT_WIDTH != 32
6285 unsigned int i;
6286 #endif
6287
6288 /* For 8-bit and 16-bit UTF, check that the first code unit is a valid
6289 character start. */
6290
6291 #if PCRE2_CODE_UNIT_WIDTH != 32
6292 if (start_match < end_subject && NOT_FIRSTCU(*start_match))
6293 {
6294 if (start_offset > 0) return PCRE2_ERROR_BADUTFOFFSET;
6295 #if PCRE2_CODE_UNIT_WIDTH == 8
6296 return PCRE2_ERROR_UTF8_ERR20; /* Isolated 0x80 byte */
6297 #else
6298 return PCRE2_ERROR_UTF16_ERR3; /* Isolated low surrogate */
6299 #endif
6300 }
6301 #endif /* WIDTH != 32 */
6302
6303 /* Move back by the maximum lookbehind, just in case it happens at the very
6304 start of matching. */
6305
6306 #if PCRE2_CODE_UNIT_WIDTH != 32
6307 for (i = re->max_lookbehind; i > 0 && start_match > subject; i--)
6308 {
6309 start_match--;
6310 while (start_match > subject &&
6311 #if PCRE2_CODE_UNIT_WIDTH == 8
6312 (*start_match & 0xc0) == 0x80)
6313 #else /* 16-bit */
6314 (*start_match & 0xfc00) == 0xdc00)
6315 #endif
6316 start_match--;
6317 }
6318 #else /* PCRE2_CODE_UNIT_WIDTH != 32 */
6319
6320 /* In the 32-bit library, one code unit equals one character. However,
6321 we cannot just subtract the lookbehind and then compare pointers, because
6322 a very large lookbehind could create an invalid pointer. */
6323
6324 if (start_offset >= re->max_lookbehind)
6325 start_match -= re->max_lookbehind;
6326 else
6327 start_match = subject;
6328 #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
6329
6330 /* Validate the relevant portion of the subject. Adjust the offset of an
6331 invalid code point to be an absolute offset in the whole string. */
6332
6333 match_data->rc = PRIV(valid_utf)(start_match,
6334 length - (start_match - subject), &(match_data->startchar));
6335 if (match_data->rc != 0)
6336 {
6337 match_data->startchar += start_match - subject;
6338 return match_data->rc;
6339 }
6340 jit_checked_utf = TRUE;
6341 }
6342 #endif /* SUPPORT_UNICODE */
6343
6344 /* If JIT returns BADOPTION, which means that the selected complete or
6345 partial matching mode was not compiled, fall through to the interpreter. */
6346
6347 rc = pcre2_jit_match(code, subject, length, start_offset, options,
6348 match_data, mcontext);
6349 if (rc != PCRE2_ERROR_JIT_BADOPTION)
6350 {
6351 if (rc >= 0 && (options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
6352 {
6353 length = CU2BYTES(length + was_zero_terminated);
6354 match_data->subject = match_data->memctl.malloc(length,
6355 match_data->memctl.memory_data);
6356 if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;
6357 memcpy((void *)match_data->subject, subject, length);
6358 match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
6359 }
6360 return rc;
6361 }
6362 }
6363 #endif /* SUPPORT_JIT */
6364
6365 /* ========================= End of JIT matching ========================== */
6366
6367
6368 /* Proceed with non-JIT matching. The default is to allow lookbehinds to the
6369 start of the subject. A UTF check when there is a non-zero offset may change
6370 this. */
6371
6372 mb->check_subject = subject;
6373
6374 /* If a UTF subject string was not checked for validity in the JIT code above,
6375 check it here, and handle support for invalid UTF strings. The check above
6376 happens only when invalid UTF is not supported and PCRE2_NO_UTF_CHECK is unset.
6377 If we get here in those circumstances, it means the subject string is valid,
6378 but for some reason JIT matching was not successful. There is no need to check
6379 the subject again.
6380
6381 We check only the portion of the subject that might be inspected during
6382 matching - from the offset minus the maximum lookbehind to the given length.
6383 This saves time when a small part of a large subject is being matched by the
6384 use of a starting offset. Note that the maximum lookbehind is a number of
6385 characters, not code units.
6386
6387 Note also that support for invalid UTF forces a check, overriding the setting
6388 of PCRE2_NO_UTF_CHECK. */
6389
6390 #ifdef SUPPORT_UNICODE
6391 if (utf &&
6392 #ifdef SUPPORT_JIT
6393 !jit_checked_utf &&
6394 #endif
6395 ((options & PCRE2_NO_UTF_CHECK) == 0 || allow_invalid))
6396 {
6397 #if PCRE2_CODE_UNIT_WIDTH != 32
6398 BOOL skipped_bad_start = FALSE;
6399 #endif
6400
6401 /* For 8-bit and 16-bit UTF, check that the first code unit is a valid
6402 character start. If we are handling invalid UTF, just skip over such code
6403 units. Otherwise, give an appropriate error. */
6404
6405 #if PCRE2_CODE_UNIT_WIDTH != 32
6406 if (allow_invalid)
6407 {
6408 while (start_match < end_subject && NOT_FIRSTCU(*start_match))
6409 {
6410 start_match++;
6411 skipped_bad_start = TRUE;
6412 }
6413 }
6414 else if (start_match < end_subject && NOT_FIRSTCU(*start_match))
6415 {
6416 if (start_offset > 0) return PCRE2_ERROR_BADUTFOFFSET;
6417 #if PCRE2_CODE_UNIT_WIDTH == 8
6418 return PCRE2_ERROR_UTF8_ERR20; /* Isolated 0x80 byte */
6419 #else
6420 return PCRE2_ERROR_UTF16_ERR3; /* Isolated low surrogate */
6421 #endif
6422 }
6423 #endif /* WIDTH != 32 */
6424
6425 /* The mb->check_subject field points to the start of UTF checking;
6426 lookbehinds can go back no further than this. */
6427
6428 mb->check_subject = start_match;
6429
6430 /* Move back by the maximum lookbehind, just in case it happens at the very
6431 start of matching, but don't do this if we skipped bad 8-bit or 16-bit code
6432 units above. */
6433
6434 #if PCRE2_CODE_UNIT_WIDTH != 32
6435 if (!skipped_bad_start)
6436 {
6437 unsigned int i;
6438 for (i = re->max_lookbehind; i > 0 && mb->check_subject > subject; i--)
6439 {
6440 mb->check_subject--;
6441 while (mb->check_subject > subject &&
6442 #if PCRE2_CODE_UNIT_WIDTH == 8
6443 (*mb->check_subject & 0xc0) == 0x80)
6444 #else /* 16-bit */
6445 (*mb->check_subject & 0xfc00) == 0xdc00)
6446 #endif
6447 mb->check_subject--;
6448 }
6449 }
6450 #else /* PCRE2_CODE_UNIT_WIDTH != 32 */
6451
6452 /* In the 32-bit library, one code unit equals one character. However,
6453 we cannot just subtract the lookbehind and then compare pointers, because
6454 a very large lookbehind could create an invalid pointer. */
6455
6456 if (start_offset >= re->max_lookbehind)
6457 mb->check_subject -= re->max_lookbehind;
6458 else
6459 mb->check_subject = subject;
6460 #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
6461
6462 /* Validate the relevant portion of the subject. There's a loop in case we
6463 encounter bad UTF in the characters preceding start_match which we are
6464 scanning because of a lookbehind. */
6465
6466 for (;;)
6467 {
6468 match_data->rc = PRIV(valid_utf)(mb->check_subject,
6469 length - (mb->check_subject - subject), &(match_data->startchar));
6470
6471 if (match_data->rc == 0) break; /* Valid UTF string */
6472
6473 /* Invalid UTF string. Adjust the offset to be an absolute offset in the
6474 whole string. If we are handling invalid UTF strings, set end_subject to
6475 stop before the bad code unit, and set the options to "not end of line".
6476 Otherwise return the error. */
6477
6478 match_data->startchar += mb->check_subject - subject;
6479 if (!allow_invalid || match_data->rc > 0) return match_data->rc;
6480 end_subject = subject + match_data->startchar;
6481
6482 /* If the end precedes start_match, it means there is invalid UTF in the
6483 extra code units we reversed over because of a lookbehind. Advance past the
6484 first bad code unit, and then skip invalid character starting code units in
6485 8-bit and 16-bit modes, and try again. */
6486
6487 if (end_subject < start_match)
6488 {
6489 mb->check_subject = end_subject + 1;
6490 #if PCRE2_CODE_UNIT_WIDTH != 32
6491 while (mb->check_subject < start_match && NOT_FIRSTCU(*mb->check_subject))
6492 mb->check_subject++;
6493 #endif
6494 }
6495
6496 /* Otherwise, set the not end of line option, and do the match. */
6497
6498 else
6499 {
6500 fragment_options = PCRE2_NOTEOL;
6501 break;
6502 }
6503 }
6504 }
6505 #endif /* SUPPORT_UNICODE */
6506
6507 /* A NULL match context means "use a default context", but we take the memory
6508 control functions from the pattern. */
6509
6510 if (mcontext == NULL)
6511 {
6512 mcontext = (pcre2_match_context *)(&PRIV(default_match_context));
6513 mb->memctl = re->memctl;
6514 }
6515 else mb->memctl = mcontext->memctl;
6516
6517 anchored = ((re->overall_options | options) & PCRE2_ANCHORED) != 0;
6518 firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;
6519 startline = (re->flags & PCRE2_STARTLINE) != 0;
6520 bumpalong_limit = (mcontext->offset_limit == PCRE2_UNSET)?
6521 true_end_subject : subject + mcontext->offset_limit;
6522
6523 /* Initialize and set up the fixed fields in the callout block, with a pointer
6524 in the match block. */
6525
6526 mb->cb = &cb;
6527 cb.version = 2;
6528 cb.subject = subject;
6529 cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
6530 cb.callout_flags = 0;
6531
6532 /* Fill in the remaining fields in the match block, except for moptions, which
6533 gets set later. */
6534
6535 mb->callout = mcontext->callout;
6536 mb->callout_data = mcontext->callout_data;
6537
6538 mb->start_subject = subject;
6539 mb->start_offset = start_offset;
6540 mb->end_subject = end_subject;
6541 mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0;
6542 mb->allowemptypartial = (re->max_lookbehind > 0) ||
6543 (re->flags & PCRE2_MATCH_EMPTY) != 0;
6544 mb->poptions = re->overall_options; /* Pattern options */
6545 mb->ignore_skip_arg = 0;
6546 mb->mark = mb->nomatch_mark = NULL; /* In case never set */
6547
6548 /* The name table is needed for finding all the numbers associated with a
6549 given name, for condition testing. The code follows the name table. */
6550
6551 mb->name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
6552 mb->name_count = re->name_count;
6553 mb->name_entry_size = re->name_entry_size;
6554 mb->start_code = mb->name_table + re->name_count * re->name_entry_size;
6555
6556 /* Process the \R and newline settings. */
6557
6558 mb->bsr_convention = re->bsr_convention;
6559 mb->nltype = NLTYPE_FIXED;
6560 switch(re->newline_convention)
6561 {
6562 case PCRE2_NEWLINE_CR:
6563 mb->nllen = 1;
6564 mb->nl[0] = CHAR_CR;
6565 break;
6566
6567 case PCRE2_NEWLINE_LF:
6568 mb->nllen = 1;
6569 mb->nl[0] = CHAR_NL;
6570 break;
6571
6572 case PCRE2_NEWLINE_NUL:
6573 mb->nllen = 1;
6574 mb->nl[0] = CHAR_NUL;
6575 break;
6576
6577 case PCRE2_NEWLINE_CRLF:
6578 mb->nllen = 2;
6579 mb->nl[0] = CHAR_CR;
6580 mb->nl[1] = CHAR_NL;
6581 break;
6582
6583 case PCRE2_NEWLINE_ANY:
6584 mb->nltype = NLTYPE_ANY;
6585 break;
6586
6587 case PCRE2_NEWLINE_ANYCRLF:
6588 mb->nltype = NLTYPE_ANYCRLF;
6589 break;
6590
6591 default: return PCRE2_ERROR_INTERNAL;
6592 }
6593
6594 /* The backtracking frames have fixed data at the front, and a PCRE2_SIZE
6595 vector at the end, whose size depends on the number of capturing parentheses in
6596 the pattern. It is not used at all if there are no capturing parentheses.
6597
6598 frame_size is the total size of each frame
6599 mb->frame_vector_size is the total usable size of the vector (rounded down
6600 to a whole number of frames)
6601
6602 The last of these is changed within the match() function if the frame vector
6603 has to be expanded. We therefore put it into the match block so that it is
6604 correct when calling match() more than once for non-anchored patterns. */
6605
6606 frame_size = offsetof(heapframe, ovector) +
6607 re->top_bracket * 2 * sizeof(PCRE2_SIZE);
6608
6609 /* Limits set in the pattern override the match context only if they are
6610 smaller. */
6611
6612 mb->heap_limit = (mcontext->heap_limit < re->limit_heap)?
6613 mcontext->heap_limit : re->limit_heap;
6614
6615 mb->match_limit = (mcontext->match_limit < re->limit_match)?
6616 mcontext->match_limit : re->limit_match;
6617
6618 mb->match_limit_depth = (mcontext->depth_limit < re->limit_depth)?
6619 mcontext->depth_limit : re->limit_depth;
6620
6621 /* If a pattern has very many capturing parentheses, the frame size may be very
6622 large. Ensure that there are at least 10 available frames by getting an initial
6623 vector on the heap if necessary, except when the heap limit prevents this. Get
6624 fewer if possible. (The heap limit is in kibibytes.) */
6625
6626 if (frame_size <= START_FRAMES_SIZE/10)
6627 {
6628 mb->match_frames = mb->stack_frames; /* Initial frame vector on the stack */
6629 mb->frame_vector_size = ((START_FRAMES_SIZE/frame_size) * frame_size);
6630 }
6631 else
6632 {
6633 mb->frame_vector_size = frame_size * 10;
6634 if ((mb->frame_vector_size / 1024) > mb->heap_limit)
6635 {
6636 if (frame_size > mb->heap_limit * 1024) return PCRE2_ERROR_HEAPLIMIT;
6637 mb->frame_vector_size = ((mb->heap_limit * 1024)/frame_size) * frame_size;
6638 }
6639 mb->match_frames = mb->memctl.malloc(mb->frame_vector_size,
6640 mb->memctl.memory_data);
6641 if (mb->match_frames == NULL) return PCRE2_ERROR_NOMEMORY;
6642 }
6643
6644 mb->match_frames_top =
6645 (heapframe *)((char *)mb->match_frames + mb->frame_vector_size);
6646
6647 /* Write to the ovector within the first frame to mark every capture unset and
6648 to avoid uninitialized memory read errors when it is copied to a new frame. */
6649
6650 memset((char *)(mb->match_frames) + offsetof(heapframe, ovector), 0xff,
6651 re->top_bracket * 2 * sizeof(PCRE2_SIZE));
6652
6653 /* Pointers to the individual character tables */
6654
6655 mb->lcc = re->tables + lcc_offset;
6656 mb->fcc = re->tables + fcc_offset;
6657 mb->ctypes = re->tables + ctypes_offset;
6658
6659 /* Set up the first code unit to match, if available. If there's no first code
6660 unit there may be a bitmap of possible first characters. */
6661
6662 if ((re->flags & PCRE2_FIRSTSET) != 0)
6663 {
6664 has_first_cu = TRUE;
6665 first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
6666 if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
6667 {
6668 first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);
6669 #ifdef SUPPORT_UNICODE
6670 #if PCRE2_CODE_UNIT_WIDTH == 8
6671 if (first_cu > 127 && ucp && !utf) first_cu2 = UCD_OTHERCASE(first_cu);
6672 #else
6673 if (first_cu > 127 && (utf || ucp)) first_cu2 = UCD_OTHERCASE(first_cu);
6674 #endif
6675 #endif /* SUPPORT_UNICODE */
6676 }
6677 }
6678 else
6679 if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
6680 start_bits = re->start_bitmap;
6681
6682 /* There may also be a "last known required character" set. */
6683
6684 if ((re->flags & PCRE2_LASTSET) != 0)
6685 {
6686 has_req_cu = TRUE;
6687 req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
6688 if ((re->flags & PCRE2_LASTCASELESS) != 0)
6689 {
6690 req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu);
6691 #ifdef SUPPORT_UNICODE
6692 #if PCRE2_CODE_UNIT_WIDTH == 8
6693 if (req_cu > 127 && ucp && !utf) req_cu2 = UCD_OTHERCASE(req_cu);
6694 #else
6695 if (req_cu > 127 && (utf || ucp)) req_cu2 = UCD_OTHERCASE(req_cu);
6696 #endif
6697 #endif /* SUPPORT_UNICODE */
6698 }
6699 }
6700
6701
6702 /* ==========================================================================*/
6703
6704 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6705 the loop runs just once. */
6706
6707 #ifdef SUPPORT_UNICODE
6708 FRAGMENT_RESTART:
6709 #endif
6710
6711 start_partial = match_partial = NULL;
6712 mb->hitend = FALSE;
6713
6714 #if PCRE2_CODE_UNIT_WIDTH == 8
6715 memchr_found_first_cu = NULL;
6716 memchr_found_first_cu2 = NULL;
6717 #endif
6718
6719 for(;;)
6720 {
6721 PCRE2_SPTR new_start_match;
6722
6723 /* ----------------- Start of match optimizations ---------------- */
6724
6725 /* There are some optimizations that avoid running the match if a known
6726 starting point is not found, or if a known later code unit is not present.
6727 However, there is an option (settable at compile time) that disables these,
6728 for testing and for ensuring that all callouts do actually occur. */
6729
6730 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
6731 {
6732 /* If firstline is TRUE, the start of the match is constrained to the first
6733 line of a multiline string. That is, the match must be before or at the
6734 first newline following the start of matching. Temporarily adjust
6735 end_subject so that we stop the scans for a first code unit at a newline.
6736 If the match fails at the newline, later code breaks the loop. */
6737
6738 if (firstline)
6739 {
6740 PCRE2_SPTR t = start_match;
6741 #ifdef SUPPORT_UNICODE
6742 if (utf)
6743 {
6744 while (t < end_subject && !IS_NEWLINE(t))
6745 {
6746 t++;
6747 ACROSSCHAR(t < end_subject, t, t++);
6748 }
6749 }
6750 else
6751 #endif
6752 while (t < end_subject && !IS_NEWLINE(t)) t++;
6753 end_subject = t;
6754 }
6755
6756 /* Anchored: check the first code unit if one is recorded. This may seem
6757 pointless but it can help in detecting a no match case without scanning for
6758 the required code unit. */
6759
6760 if (anchored)
6761 {
6762 if (has_first_cu || start_bits != NULL)
6763 {
6764 BOOL ok = start_match < end_subject;
6765 if (ok)
6766 {
6767 PCRE2_UCHAR c = UCHAR21TEST(start_match);
6768 ok = has_first_cu && (c == first_cu || c == first_cu2);
6769 if (!ok && start_bits != NULL)
6770 {
6771 #if PCRE2_CODE_UNIT_WIDTH != 8
6772 if (c > 255) c = 255;
6773 #endif
6774 ok = (start_bits[c/8] & (1u << (c&7))) != 0;
6775 }
6776 }
6777 if (!ok)
6778 {
6779 rc = MATCH_NOMATCH;
6780 break;
6781 }
6782 }
6783 }
6784
6785 /* Not anchored. Advance to a unique first code unit if there is one. */
6786
6787 else
6788 {
6789 if (has_first_cu)
6790 {
6791 if (first_cu != first_cu2) /* Caseless */
6792 {
6793 /* In 16-bit and 32-bit modes we have to do our own search, so can
6794 look for both cases at once. */
6795
6796 #if PCRE2_CODE_UNIT_WIDTH != 8
6797 PCRE2_UCHAR smc;
6798 while (start_match < end_subject &&
6799 (smc = UCHAR21TEST(start_match)) != first_cu &&
6800 smc != first_cu2)
6801 start_match++;
6802 #else
6803 /* In 8-bit mode, the use of memchr() gives a big speed up, even
6804 though we have to call it twice in order to find the earliest
6805 occurrence of the code unit in either of its cases. Caching is used
6806 to remember the positions of previously found code units. This can
6807 make a huge difference when the strings are very long and only one
6808 case is actually present. */
6809
6810 PCRE2_SPTR pp1 = NULL;
6811 PCRE2_SPTR pp2 = NULL;
6812 PCRE2_SIZE searchlength = end_subject - start_match;
6813
6814 /* If we haven't got a previously found position for first_cu, or if
6815 the current starting position is later, we need to do a search. If
6816 the code unit is not found, set it to the end. */
6817
6818 if (memchr_found_first_cu == NULL ||
6819 start_match > memchr_found_first_cu)
6820 {
6821 pp1 = memchr(start_match, first_cu, searchlength);
6822 memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1;
6823 }
6824
6825 /* If the start is before a previously found position, use the
6826 previous position, or NULL if a previous search failed. */
6827
6828 else pp1 = (memchr_found_first_cu == end_subject)? NULL :
6829 memchr_found_first_cu;
6830
6831 /* Do the same thing for the other case. */
6832
6833 if (memchr_found_first_cu2 == NULL ||
6834 start_match > memchr_found_first_cu2)
6835 {
6836 pp2 = memchr(start_match, first_cu2, searchlength);
6837 memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2;
6838 }
6839
6840 else pp2 = (memchr_found_first_cu2 == end_subject)? NULL :
6841 memchr_found_first_cu2;
6842
6843 /* Set the start to the end of the subject if neither case was found.
6844 Otherwise, use the earlier found point. */
6845
6846 if (pp1 == NULL)
6847 start_match = (pp2 == NULL)? end_subject : pp2;
6848 else
6849 start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
6850
6851 #endif /* 8-bit handling */
6852 }
6853
6854 /* The caseful case is much simpler. */
6855
6856 else
6857 {
6858 #if PCRE2_CODE_UNIT_WIDTH != 8
6859 while (start_match < end_subject && UCHAR21TEST(start_match) !=
6860 first_cu)
6861 start_match++;
6862 #else
6863 start_match = memchr(start_match, first_cu, end_subject - start_match);
6864 if (start_match == NULL) start_match = end_subject;
6865 #endif
6866 }
6867
6868 /* If we can't find the required first code unit, having reached the
6869 true end of the subject, break the bumpalong loop, to force a match
6870 failure, except when doing partial matching, when we let the next cycle
6871 run at the end of the subject. To see why, consider the pattern
6872 /(?<=abc)def/, which partially matches "abc", even though the string
6873 does not contain the starting character "d". If we have not reached the
6874 true end of the subject (PCRE2_FIRSTLINE caused end_subject to be
6875 temporarily modified) we also let the cycle run, because the matching
6876 string is legitimately allowed to start with the first code unit of a
6877 newline. */
6878
6879 if (mb->partial == 0 && start_match >= mb->end_subject)
6880 {
6881 rc = MATCH_NOMATCH;
6882 break;
6883 }
6884 }
6885
6886 /* If there's no first code unit, advance to just after a linebreak for a
6887 multiline match if required. */
6888
6889 else if (startline)
6890 {
6891 if (start_match > mb->start_subject + start_offset)
6892 {
6893 #ifdef SUPPORT_UNICODE
6894 if (utf)
6895 {
6896 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6897 {
6898 start_match++;
6899 ACROSSCHAR(start_match < end_subject, start_match, start_match++);
6900 }
6901 }
6902 else
6903 #endif
6904 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6905 start_match++;
6906
6907 /* If we have just passed a CR and the newline option is ANY or
6908 ANYCRLF, and we are now at a LF, advance the match position by one
6909 more code unit. */
6910
6911 if (start_match[-1] == CHAR_CR &&
6912 (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
6913 start_match < end_subject &&
6914 UCHAR21TEST(start_match) == CHAR_NL)
6915 start_match++;
6916 }
6917 }
6918
6919 /* If there's no first code unit or a requirement for a multiline line
6920 start, advance to a non-unique first code unit if any have been
6921 identified. The bitmap contains only 256 bits. When code units are 16 or
6922 32 bits wide, all code units greater than 254 set the 255 bit. */
6923
6924 else if (start_bits != NULL)
6925 {
6926 while (start_match < end_subject)
6927 {
6928 uint32_t c = UCHAR21TEST(start_match);
6929 #if PCRE2_CODE_UNIT_WIDTH != 8
6930 if (c > 255) c = 255;
6931 #endif
6932 if ((start_bits[c/8] & (1u << (c&7))) != 0) break;
6933 start_match++;
6934 }
6935
6936 /* See comment above in first_cu checking about the next few lines. */
6937
6938 if (mb->partial == 0 && start_match >= mb->end_subject)
6939 {
6940 rc = MATCH_NOMATCH;
6941 break;
6942 }
6943 }
6944 } /* End first code unit handling */
6945
6946 /* Restore fudged end_subject */
6947
6948 end_subject = mb->end_subject;
6949
6950 /* The following two optimizations must be disabled for partial matching. */
6951
6952 if (mb->partial == 0)
6953 {
6954 PCRE2_SPTR p;
6955
6956 /* The minimum matching length is a lower bound; no string of that length
6957 may actually match the pattern. Although the value is, strictly, in
6958 characters, we treat it as code units to avoid spending too much time in
6959 this optimization. */
6960
6961 if (end_subject - start_match < re->minlength)
6962 {
6963 rc = MATCH_NOMATCH;
6964 break;
6965 }
6966
6967 /* If req_cu is set, we know that that code unit must appear in the
6968 subject for the (non-partial) match to succeed. If the first code unit is
6969 set, req_cu must be later in the subject; otherwise the test starts at
6970 the match point. This optimization can save a huge amount of backtracking
6971 in patterns with nested unlimited repeats that aren't going to match.
6972 Writing separate code for caseful/caseless versions makes it go faster,
6973 as does using an autoincrement and backing off on a match. As in the case
6974 of the first code unit, using memchr() in the 8-bit library gives a big
6975 speed up. Unlike the first_cu check above, we do not need to call
6976 memchr() twice in the caseless case because we only need to check for the
6977 presence of the character in either case, not find the first occurrence.
6978
6979 The search can be skipped if the code unit was found later than the
6980 current starting point in a previous iteration of the bumpalong loop.
6981
6982 HOWEVER: when the subject string is very, very long, searching to its end
6983 can take a long time, and give bad performance on quite ordinary
6984 anchored patterns. This showed up when somebody was matching something
6985 like /^\d+C/ on a 32-megabyte string... so we don't do this when the
6986 string is sufficiently long, but it's worth searching a lot more for
6987 unanchored patterns. */
6988
6989 p = start_match + (has_first_cu? 1:0);
6990 if (has_req_cu && p > req_cu_ptr)
6991 {
6992 PCRE2_SIZE check_length = end_subject - start_match;
6993
6994 if (check_length < REQ_CU_MAX ||
6995 (!anchored && check_length < REQ_CU_MAX * 1000))
6996 {
6997 if (req_cu != req_cu2) /* Caseless */
6998 {
6999 #if PCRE2_CODE_UNIT_WIDTH != 8
7000 while (p < end_subject)
7001 {
7002 uint32_t pp = UCHAR21INCTEST(p);
7003 if (pp == req_cu || pp == req_cu2) { p--; break; }
7004 }
7005 #else /* 8-bit code units */
7006 PCRE2_SPTR pp = p;
7007 p = memchr(pp, req_cu, end_subject - pp);
7008 if (p == NULL)
7009 {
7010 p = memchr(pp, req_cu2, end_subject - pp);
7011 if (p == NULL) p = end_subject;
7012 }
7013 #endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
7014 }
7015
7016 /* The caseful case */
7017
7018 else
7019 {
7020 #if PCRE2_CODE_UNIT_WIDTH != 8
7021 while (p < end_subject)
7022 {
7023 if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
7024 }
7025
7026 #else /* 8-bit code units */
7027 p = memchr(p, req_cu, end_subject - p);
7028 if (p == NULL) p = end_subject;
7029 #endif
7030 }
7031
7032 /* If we can't find the required code unit, break the bumpalong loop,
7033 forcing a match failure. */
7034
7035 if (p >= end_subject)
7036 {
7037 rc = MATCH_NOMATCH;
7038 break;
7039 }
7040
7041 /* If we have found the required code unit, save the point where we
7042 found it, so that we don't search again next time round the bumpalong
7043 loop if the start hasn't yet passed this code unit. */
7044
7045 req_cu_ptr = p;
7046 }
7047 }
7048 }
7049 }
7050
7051 /* ------------ End of start of match optimizations ------------ */
7052
7053 /* Give no match if we have passed the bumpalong limit. */
7054
7055 if (start_match > bumpalong_limit)
7056 {
7057 rc = MATCH_NOMATCH;
7058 break;
7059 }
7060
7061 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
7062 first starting point for which a partial match was found. */
7063
7064 cb.start_match = (PCRE2_SIZE)(start_match - subject);
7065 cb.callout_flags |= PCRE2_CALLOUT_STARTMATCH;
7066
7067 mb->start_used_ptr = start_match;
7068 mb->last_used_ptr = start_match;
7069 #ifdef SUPPORT_UNICODE
7070 mb->moptions = options | fragment_options;
7071 #else
7072 mb->moptions = options;
7073 #endif
7074 mb->match_call_count = 0;
7075 mb->end_offset_top = 0;
7076 mb->skip_arg_count = 0;
7077
7078 rc = match(start_match, mb->start_code, match_data->ovector,
7079 match_data->oveccount, re->top_bracket, frame_size, mb);
7080
7081 if (mb->hitend && start_partial == NULL)
7082 {
7083 start_partial = mb->start_used_ptr;
7084 match_partial = start_match;
7085 }
7086
7087 switch(rc)
7088 {
7089 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
7090 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
7091 entirely. The only way we can do that is to re-do the match at the same
7092 point, with a flag to force SKIP with an argument to be ignored. Just
7093 treating this case as NOMATCH does not work because it does not check other
7094 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
7095
7096 case MATCH_SKIP_ARG:
7097 new_start_match = start_match;
7098 mb->ignore_skip_arg = mb->skip_arg_count;
7099 break;
7100
7101 /* SKIP passes back the next starting point explicitly, but if it is no
7102 greater than the match we have just done, treat it as NOMATCH. */
7103
7104 case MATCH_SKIP:
7105 if (mb->verb_skip_ptr > start_match)
7106 {
7107 new_start_match = mb->verb_skip_ptr;
7108 break;
7109 }
7110 /* Fall through */
7111
7112 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
7113 exactly like PRUNE. Unset ignore SKIP-with-argument. */
7114
7115 case MATCH_NOMATCH:
7116 case MATCH_PRUNE:
7117 case MATCH_THEN:
7118 mb->ignore_skip_arg = 0;
7119 new_start_match = start_match + 1;
7120 #ifdef SUPPORT_UNICODE
7121 if (utf)
7122 ACROSSCHAR(new_start_match < end_subject, new_start_match,
7123 new_start_match++);
7124 #endif
7125 break;
7126
7127 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
7128
7129 case MATCH_COMMIT:
7130 rc = MATCH_NOMATCH;
7131 goto ENDLOOP;
7132
7133 /* Any other return is either a match, or some kind of error. */
7134
7135 default:
7136 goto ENDLOOP;
7137 }
7138
7139 /* Control reaches here for the various types of "no match at this point"
7140 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
7141
7142 rc = MATCH_NOMATCH;
7143
7144 /* If PCRE2_FIRSTLINE is set, the match must happen before or at the first
7145 newline in the subject (though it may continue over the newline). Therefore,
7146 if we have just failed to match, starting at a newline, do not continue. */
7147
7148 if (firstline && IS_NEWLINE(start_match)) break;
7149
7150 /* Advance to new matching position */
7151
7152 start_match = new_start_match;
7153
7154 /* Break the loop if the pattern is anchored or if we have passed the end of
7155 the subject. */
7156
7157 if (anchored || start_match > end_subject) break;
7158
7159 /* If we have just passed a CR and we are now at a LF, and the pattern does
7160 not contain any explicit matches for \r or \n, and the newline option is CRLF
7161 or ANY or ANYCRLF, advance the match position by one more code unit. In
7162 normal matching start_match will always be greater than the first position at
7163 this stage, but a failed *SKIP can cause a return at the same point, which is
7164 why the first test exists. */
7165
7166 if (start_match > subject + start_offset &&
7167 start_match[-1] == CHAR_CR &&
7168 start_match < end_subject &&
7169 *start_match == CHAR_NL &&
7170 (re->flags & PCRE2_HASCRORLF) == 0 &&
7171 (mb->nltype == NLTYPE_ANY ||
7172 mb->nltype == NLTYPE_ANYCRLF ||
7173 mb->nllen == 2))
7174 start_match++;
7175
7176 mb->mark = NULL; /* Reset for start of next match attempt */
7177 } /* End of for(;;) "bumpalong" loop */
7178
7179 /* ==========================================================================*/
7180
7181 /* When we reach here, one of the following stopping conditions is true:
7182
7183 (1) The match succeeded, either completely, or partially;
7184
7185 (2) The pattern is anchored or the match was failed after (*COMMIT);
7186
7187 (3) We are past the end of the subject or the bumpalong limit;
7188
7189 (4) PCRE2_FIRSTLINE is set and we have failed to match at a newline, because
7190 this option requests that a match occur at or before the first newline in
7191 the subject.
7192
7193 (5) Some kind of error occurred.
7194
7195 */
7196
7197 ENDLOOP:
7198
7199 /* If end_subject != true_end_subject, it means we are handling invalid UTF,
7200 and have just processed a non-terminal fragment. If this resulted in no match
7201 or a partial match we must carry on to the next fragment (a partial match is
7202 returned to the caller only at the very end of the subject). A loop is used to
7203 avoid trying to match against empty fragments; if the pattern can match an
7204 empty string it would have done so already. */
7205
7206 #ifdef SUPPORT_UNICODE
7207 if (utf && end_subject != true_end_subject &&
7208 (rc == MATCH_NOMATCH || rc == PCRE2_ERROR_PARTIAL))
7209 {
7210 for (;;)
7211 {
7212 /* Advance past the first bad code unit, and then skip invalid character
7213 starting code units in 8-bit and 16-bit modes. */
7214
7215 start_match = end_subject + 1;
7216
7217 #if PCRE2_CODE_UNIT_WIDTH != 32
7218 while (start_match < true_end_subject && NOT_FIRSTCU(*start_match))
7219 start_match++;
7220 #endif
7221
7222 /* If we have hit the end of the subject, there isn't another non-empty
7223 fragment, so give up. */
7224
7225 if (start_match >= true_end_subject)
7226 {
7227 rc = MATCH_NOMATCH; /* In case it was partial */
7228 break;
7229 }
7230
7231 /* Check the rest of the subject */
7232
7233 mb->check_subject = start_match;
7234 rc = PRIV(valid_utf)(start_match, length - (start_match - subject),
7235 &(match_data->startchar));
7236
7237 /* The rest of the subject is valid UTF. */
7238
7239 if (rc == 0)
7240 {
7241 mb->end_subject = end_subject = true_end_subject;
7242 fragment_options = PCRE2_NOTBOL;
7243 goto FRAGMENT_RESTART;
7244 }
7245
7246 /* A subsequent UTF error has been found; if the next fragment is
7247 non-empty, set up to process it. Otherwise, let the loop advance. */
7248
7249 else if (rc < 0)
7250 {
7251 mb->end_subject = end_subject = start_match + match_data->startchar;
7252 if (end_subject > start_match)
7253 {
7254 fragment_options = PCRE2_NOTBOL|PCRE2_NOTEOL;
7255 goto FRAGMENT_RESTART;
7256 }
7257 }
7258 }
7259 }
7260 #endif /* SUPPORT_UNICODE */
7261
7262 /* Release an enlarged frame vector that is on the heap. */
7263
7264 if (mb->match_frames != mb->stack_frames)
7265 mb->memctl.free(mb->match_frames, mb->memctl.memory_data);
7266
7267 /* Fill in fields that are always returned in the match data. */
7268
7269 match_data->code = re;
7270 match_data->mark = mb->mark;
7271 match_data->matchedby = PCRE2_MATCHEDBY_INTERPRETER;
7272
7273 /* Handle a fully successful match. Set the return code to the number of
7274 captured strings, or 0 if there were too many to fit into the ovector, and then
7275 set the remaining returned values before returning. Make a copy of the subject
7276 string if requested. */
7277
7278 if (rc == MATCH_MATCH)
7279 {
7280 match_data->rc = ((int)mb->end_offset_top >= 2 * match_data->oveccount)?
7281 0 : (int)mb->end_offset_top/2 + 1;
7282 match_data->startchar = start_match - subject;
7283 match_data->leftchar = mb->start_used_ptr - subject;
7284 match_data->rightchar = ((mb->last_used_ptr > mb->end_match_ptr)?
7285 mb->last_used_ptr : mb->end_match_ptr) - subject;
7286 if ((options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
7287 {
7288 length = CU2BYTES(length + was_zero_terminated);
7289 match_data->subject = match_data->memctl.malloc(length,
7290 match_data->memctl.memory_data);
7291 if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;
7292 memcpy((void *)match_data->subject, subject, length);
7293 match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
7294 }
7295 else match_data->subject = subject;
7296 return match_data->rc;
7297 }
7298
7299 /* Control gets here if there has been a partial match, an error, or if the
7300 overall match attempt has failed at all permitted starting positions. Any mark
7301 data is in the nomatch_mark field. */
7302
7303 match_data->mark = mb->nomatch_mark;
7304
7305 /* For anything other than nomatch or partial match, just return the code. */
7306
7307 if (rc != MATCH_NOMATCH && rc != PCRE2_ERROR_PARTIAL) match_data->rc = rc;
7308
7309 /* Handle a partial match. If a "soft" partial match was requested, searching
7310 for a complete match will have continued, and the value of rc at this point
7311 will be MATCH_NOMATCH. For a "hard" partial match, it will already be
7312 PCRE2_ERROR_PARTIAL. */
7313
7314 else if (match_partial != NULL)
7315 {
7316 match_data->subject = subject;
7317 match_data->ovector[0] = match_partial - subject;
7318 match_data->ovector[1] = end_subject - subject;
7319 match_data->startchar = match_partial - subject;
7320 match_data->leftchar = start_partial - subject;
7321 match_data->rightchar = end_subject - subject;
7322 match_data->rc = PCRE2_ERROR_PARTIAL;
7323 }
7324
7325 /* Else this is the classic nomatch case. */
7326
7327 else match_data->rc = PCRE2_ERROR_NOMATCH;
7328
7329 return match_data->rc;
7330 }
7331
7332 /* End of pcre2_match.c */
7333