1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016-2022 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 /* This module contains the external function pcre2_dfa_match(), which is an
43 alternative matching function that uses a sort of DFA algorithm (not a true
44 FSM). This is NOT Perl-compatible, but it has advantages in certain
45 applications. */
46
47
48 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49 the performance of his patterns greatly. I could not use it as it stood, as it
50 was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 test 7 to loop, and test 9 to crash with a segfault.
52
53 The issue is the check for duplicate states, which is done by a simple linear
54 search up the state list. (Grep for "duplicate" below to find the code.) For
55 many patterns, there will never be many states active at one time, so a simple
56 linear search is fine. In patterns that have many active states, it might be a
57 bottleneck. The suggested code used an indexing scheme to remember which states
58 had previously been used for each character, and avoided the linear search when
59 it knew there was no chance of a duplicate. This was implemented when adding
60 states to the state lists.
61
62 I wrote some thread-safe, not-limited code to try something similar at the time
63 of checking for duplicates (instead of when adding states), using index vectors
64 on the stack. It did give a 13% improvement with one specially constructed
65 pattern for certain subject strings, but on other strings and on many of the
66 simpler patterns in the test suite it did worse. The major problem, I think,
67 was the extra time to initialize the index. This had to be done for each call
68 of internal_dfa_match(). (The supplied patch used a static vector, initialized
69 only once - I suspect this was the cause of the problems with the tests.)
70
71 Overall, I concluded that the gains in some cases did not outweigh the losses
72 in others, so I abandoned this code. */
73
74
75 #ifdef HAVE_CONFIG_H
76 #include "config.h"
77 #endif
78
79 #define NLBLOCK mb /* Block containing newline information */
80 #define PSSTART start_subject /* Field containing processed string start */
81 #define PSEND end_subject /* Field containing processed string end */
82
83 #include "pcre2_internal.h"
84
/* Bit mask formed by OR-ing together the match-time option bits listed below.
Judging by its name this is the set of options a caller is allowed to pass to
pcre2_dfa_match(); the code that tests against it is not in this part of the
file, so confirm there before relying on that. */

#define PUBLIC_DFA_MATCH_OPTIONS \
  ( PCRE2_ANCHORED \
  | PCRE2_ENDANCHORED \
  | PCRE2_NOTBOL \
  | PCRE2_NOTEOL \
  | PCRE2_NOTEMPTY \
  | PCRE2_NOTEMPTY_ATSTART \
  | PCRE2_NO_UTF_CHECK \
  | PCRE2_PARTIAL_HARD \
  | PCRE2_PARTIAL_SOFT \
  | PCRE2_DFA_SHORTEST \
  | PCRE2_DFA_RESTART \
  | PCRE2_COPY_MATCHED_SUBJECT )
90
91
92 /*************************************************
93 * Code parameters and static tables *
94 *************************************************/
95
96 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
97 into others, under special conditions. A gap of 20 between the blocks should be
98 enough. The resulting opcodes don't have to be less than 256 because they are
99 never stored, so we push them well clear of the normal opcodes. */
100
/* In internal_dfa_match(), when the opcode is OP_TYPESTAR or later and its
type argument is one of the items named below, the matching offset is added to
the opcode value before the main switch is entered. */
101 #define OP_PROP_EXTRA 300 /* Argument is OP_PROP or OP_NOTPROP */
102 #define OP_EXTUNI_EXTRA 320 /* Argument is OP_EXTUNI */
103 #define OP_ANYNL_EXTRA 340 /* Argument is OP_ANYNL */
104 #define OP_HSPACE_EXTRA 360 /* Argument is OP_HSPACE or OP_NOT_HSPACE */
105 #define OP_VSPACE_EXTRA 380 /* Argument is OP_VSPACE or OP_NOT_VSPACE */
106
107
108 /* This table identifies those opcodes that are followed immediately by a
109 character that is to be tested in some way. This makes it possible to
110 centralize the loading of these characters. In the case of Type * etc, the
111 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112 small value. Non-zero values in the table are the offsets from the opcode where
113 the character is to be found. ***NOTE*** If the start of this table is
114 modified, the three tables that follow must also be modified. */
115
/* Indexed by opcode value. In internal_dfa_match() a non-zero entry is used as
the offset from the opcode at which the following character (or type opcode) is
loaded; a zero entry means that nothing is loaded. The table must contain
exactly OP_TABLE_LENGTH entries: two case labels in the opcode switch of
internal_dfa_match() become identical, and so stop the build, if
sizeof(coptable) is anything else. */
116 static const uint8_t coptable[] = {
117 0, /* End */
118 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
119 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
120 0, 0, 0, /* Any, AllAny, Anybyte */
121 0, 0, /* \P, \p */
122 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
123 0, /* \X */
124 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */
125 1, /* Char */
126 1, /* Chari */
127 1, /* not */
128 1, /* noti */
129 /* Positive single-char repeats */
130 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
/* An entry of 1+IMM2_SIZE skips a further IMM2_SIZE code units after the
opcode - presumably the repeat count that these forms carry; confirm against
the pattern compiler. */
131 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */
132 1+IMM2_SIZE, /* exact */
133 1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */
134 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
135 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */
136 1+IMM2_SIZE, /* exact I */
137 1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */
138 /* Negative single-char repeats - only for chars < 256 */
139 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
140 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */
141 1+IMM2_SIZE, /* NOT exact */
142 1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */
143 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
144 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */
145 1+IMM2_SIZE, /* NOT exact I */
146 1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */
147 /* Positive type repeats */
148 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
149 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */
150 1+IMM2_SIZE, /* Type exact */
151 1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */
152 /* Character class & ref repeats */
153 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
154 0, 0, /* CRRANGE, CRMINRANGE */
155 0, 0, 0, 0, /* Possessive *+, ++, ?+, CRPOSRANGE */
156 0, /* CLASS */
157 0, /* NCLASS */
158 0, /* XCLASS - variable length */
159 0, /* REF */
160 0, /* REFI */
161 0, /* DNREF */
162 0, /* DNREFI */
163 0, /* RECURSE */
164 0, /* CALLOUT */
165 0, /* CALLOUT_STR */
166 0, /* Alt */
167 0, /* Ket */
168 0, /* KetRmax */
169 0, /* KetRmin */
170 0, /* KetRpos */
171 0, /* Reverse */
172 0, /* Assert */
173 0, /* Assert not */
174 0, /* Assert behind */
175 0, /* Assert behind not */
176 0, /* NA assert */
177 0, /* NA assert behind */
178 0, /* ONCE */
179 0, /* SCRIPT_RUN */
180 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
181 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
182 0, 0, /* CREF, DNCREF */
183 0, 0, /* RREF, DNRREF */
184 0, 0, /* FALSE, TRUE */
185 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
186 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
187 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
188 0, 0, /* COMMIT, COMMIT_ARG */
189 0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */
190 0, 0, 0 /* CLOSE, SKIPZERO, DEFINE */
191 };
192
193 /* This table identifies those opcodes that inspect a character. It is used to
194 remember the fact that a character could have been inspected when the end of
195 the subject is reached. ***NOTE*** If the start of this table is modified, the
196 two tables that follow must also be modified. */
197
/* Indexed by opcode value. When the end of the subject has been reached
(clen == 0), internal_dfa_match() sets could_continue for any active state
whose opcode has a non-zero entry here. Like coptable, this table must contain
exactly OP_TABLE_LENGTH entries; the opcode switch in internal_dfa_match()
contains a compile-time check for that. */
198 static const uint8_t poptable[] = {
199 0, /* End */
200 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
201 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
202 1, 1, 1, /* Any, AllAny, Anybyte */
203 1, 1, /* \P, \p */
204 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
205 1, /* \X */
206 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */
207 1, /* Char */
208 1, /* Chari */
209 1, /* not */
210 1, /* noti */
211 /* Positive single-char repeats */
212 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
213 1, 1, 1, /* upto, minupto, exact */
214 1, 1, 1, 1, /* *+, ++, ?+, upto+ */
215 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
216 1, 1, 1, /* upto I, minupto I, exact I */
217 1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
218 /* Negative single-char repeats - only for chars < 256 */
219 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
220 1, 1, 1, /* NOT upto, minupto, exact */
221 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
222 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
223 1, 1, 1, /* NOT upto I, minupto I, exact I */
224 1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
225 /* Positive type repeats */
226 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
227 1, 1, 1, /* Type upto, minupto, exact */
228 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
229 /* Character class & ref repeats */
230 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
231 1, 1, /* CRRANGE, CRMINRANGE */
232 1, 1, 1, 1, /* Possessive *+, ++, ?+, CRPOSRANGE */
233 1, /* CLASS */
234 1, /* NCLASS */
235 1, /* XCLASS - variable length */
236 0, /* REF */
237 0, /* REFI */
238 0, /* DNREF */
239 0, /* DNREFI */
240 0, /* RECURSE */
241 0, /* CALLOUT */
242 0, /* CALLOUT_STR */
243 0, /* Alt */
244 0, /* Ket */
245 0, /* KetRmax */
246 0, /* KetRmin */
247 0, /* KetRpos */
248 0, /* Reverse */
249 0, /* Assert */
250 0, /* Assert not */
251 0, /* Assert behind */
252 0, /* Assert behind not */
253 0, /* NA assert */
254 0, /* NA assert behind */
255 0, /* ONCE */
256 0, /* SCRIPT_RUN */
257 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
258 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
259 0, 0, /* CREF, DNCREF */
260 0, 0, /* RREF, DNRREF */
261 0, 0, /* FALSE, TRUE */
262 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
263 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
264 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
265 0, 0, /* COMMIT, COMMIT_ARG */
266 0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */
267 0, 0, 0 /* CLOSE, SKIPZERO, DEFINE */
268 };
269
270 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
271 and \w */
272
273 static const uint8_t toptable1[] = {
274 0, 0, 0, 0, 0, 0,
275 ctype_digit, ctype_digit,
276 ctype_space, ctype_space,
277 ctype_word, ctype_word,
278 0, 0 /* OP_ANY, OP_ALLANY */
279 };
280
281 static const uint8_t toptable2[] = {
282 0, 0, 0, 0, 0, 0,
283 ctype_digit, 0,
284 ctype_space, 0,
285 ctype_word, 0,
286 1, 1 /* OP_ANY, OP_ALLANY */
287 };
288
289
290 /* Structure for holding data about a particular state, which is in effect the
291 current data for an active path through the match tree. It must consist
292 entirely of ints because the working vector we are passed, and which we put
293 these structures in, is a vector of ints. */
294
/* One entry in a state list. Every member is an int, so that an array of
these can be overlaid on the int workspace vector. */

typedef struct stateblock
{
  /* Offset of the state's opcode from the start of the compiled code. A
  negative value means "do not enter the (negated) state until the number of
  characters held in the data member have been skipped". */
  int offset;

  /* Count for repeats */
  int count;

  /* Extra data, used by some states only */
  int data;
} stateblock;

/* How many ints one stateblock occupies */

#define INTS_PER_STATEBLOCK (int)(sizeof(stateblock) / sizeof(int))
302
303
304 /* Before version 10.32 the recursive calls of internal_dfa_match() were passed
305 local working space and output vectors that were created on the stack. This has
306 caused issues for some patterns, especially in small-stack environments such as
307 Windows. A new scheme is now in use which sets up a vector on the stack, but if
308 this is too small, heap memory is used, up to the heap_limit. The main
309 parameters are all numbers of ints because the workspace is a vector of ints.
310
311 The size of the starting stack vector, DFA_START_RWS_SIZE, is in bytes, and is
312 defined in pcre2_internal.h so as to be available to pcre2test when it is
313 finding the minimum heap requirement for a match. */
314
/* Number of ints taken up by one PCRE2_SIZE value, which is the element type
of the offsets vectors. */
315 #define OVEC_UNIT (sizeof(PCRE2_SIZE)/sizeof(int))
316
/* The sizes below are all numbers of ints. RWS_BASE_SIZE converts
DFA_START_RWS_SIZE, which is a byte count, into that unit; the two ovector
sizes give room for 1000 and for 2 PCRE2_SIZE values respectively. */
317 #define RWS_BASE_SIZE (DFA_START_RWS_SIZE/sizeof(int)) /* Stack vector */
318 #define RWS_RSIZE 1000 /* Work size for recursion */
319 #define RWS_OVEC_RSIZE (1000*OVEC_UNIT) /* Ovector for recursion */
320 #define RWS_OVEC_OSIZE (2*OVEC_UNIT) /* Ovector in other cases */
321
322 /* This structure is at the start of each workspace block. */
323
/* Header placed at the front of every workspace block. The blocks form a
singly linked chain. */

typedef struct RWS_anchor
{
  /* The block that follows this one in the chain, or NULL if there is none */
  struct RWS_anchor *next;

  /* Total size of the block, header included, as a number of ints */
  uint32_t size;

  /* Number of ints still unused; more_workspace() sets this to
  size - RWS_ANCHOR_SIZE whenever it hands the block out */
  uint32_t free;
} RWS_anchor;

/* How many ints the header itself occupies */

#define RWS_ANCHOR_SIZE (sizeof(RWS_anchor) / sizeof(int))
331
332
333
334 /*************************************************
335 * Process a callout *
336 *************************************************/
337
338 /* This function is called to perform a callout.
339
340 Arguments:
341 code current code pointer
342 offsets points to current capture offsets
343 current_subject start of current subject match
344 ptr current position in subject
345 mb the match block
346 extracode extra code offset when called from condition
347 lengthptr where to return the callout length
348
349 Returns: the return from the callout
350 */
351
352 static int
do_callout_dfa(PCRE2_SPTR code,PCRE2_SIZE * offsets,PCRE2_SPTR current_subject,PCRE2_SPTR ptr,dfa_match_block * mb,PCRE2_SIZE extracode,PCRE2_SIZE * lengthptr)353 do_callout_dfa(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject,
354 PCRE2_SPTR ptr, dfa_match_block *mb, PCRE2_SIZE extracode,
355 PCRE2_SIZE *lengthptr)
356 {
357 pcre2_callout_block *cb = mb->cb;
358
359 *lengthptr = (code[extracode] == OP_CALLOUT)?
360 (PCRE2_SIZE)PRIV(OP_lengths)[OP_CALLOUT] :
361 (PCRE2_SIZE)GET(code, 1 + 2*LINK_SIZE + extracode);
362
363 if (mb->callout == NULL) return 0; /* No callout provided */
364
365 /* Fixed fields in the callout block are set once and for all at the start of
366 matching. */
367
368 cb->offset_vector = offsets;
369 cb->start_match = (PCRE2_SIZE)(current_subject - mb->start_subject);
370 cb->current_position = (PCRE2_SIZE)(ptr - mb->start_subject);
371 cb->pattern_position = GET(code, 1 + extracode);
372 cb->next_item_length = GET(code, 1 + LINK_SIZE + extracode);
373
374 if (code[extracode] == OP_CALLOUT)
375 {
376 cb->callout_number = code[1 + 2*LINK_SIZE + extracode];
377 cb->callout_string_offset = 0;
378 cb->callout_string = NULL;
379 cb->callout_string_length = 0;
380 }
381 else
382 {
383 cb->callout_number = 0;
384 cb->callout_string_offset = GET(code, 1 + 3*LINK_SIZE + extracode);
385 cb->callout_string = code + (1 + 4*LINK_SIZE + extracode) + 1;
386 cb->callout_string_length = *lengthptr - (1 + 4*LINK_SIZE) - 2;
387 }
388
389 return (mb->callout)(cb, mb->callout_data);
390 }
391
392
393
394 /*************************************************
395 * Expand local workspace memory *
396 *************************************************/
397
398 /* This function is called when internal_dfa_match() is about to be called
399 recursively and there is insufficient working space left in the current
400 workspace block. If there's an existing next block, use it; otherwise get a new
401 block unless the heap limit is reached.
402
403 Arguments:
404 rwsptr pointer to block pointer (updated)
405 ovecsize space needed for an ovector
406 mb the match block
407
408 Returns: 0 rwsptr has been updated
409 !0 an error code
410 */
411
412 static int
more_workspace(RWS_anchor ** rwsptr,unsigned int ovecsize,dfa_match_block * mb)413 more_workspace(RWS_anchor **rwsptr, unsigned int ovecsize, dfa_match_block *mb)
414 {
415 RWS_anchor *rws = *rwsptr;
416 RWS_anchor *new;
417
418 if (rws->next != NULL)
419 {
420 new = rws->next;
421 }
422
423 /* Sizes in the RWS_anchor blocks are in units of sizeof(int), but
424 mb->heap_limit and mb->heap_used are in kibibytes. Play carefully, to avoid
425 overflow. */
426
427 else
428 {
429 uint32_t newsize = (rws->size >= UINT32_MAX/2)? UINT32_MAX/2 : rws->size * 2;
430 uint32_t newsizeK = newsize/(1024/sizeof(int));
431
432 if (newsizeK + mb->heap_used > mb->heap_limit)
433 newsizeK = (uint32_t)(mb->heap_limit - mb->heap_used);
434 newsize = newsizeK*(1024/sizeof(int));
435
436 if (newsize < RWS_RSIZE + ovecsize + RWS_ANCHOR_SIZE)
437 return PCRE2_ERROR_HEAPLIMIT;
438 new = mb->memctl.malloc(newsize*sizeof(int), mb->memctl.memory_data);
439 if (new == NULL) return PCRE2_ERROR_NOMEMORY;
440 mb->heap_used += newsizeK;
441 new->next = NULL;
442 new->size = newsize;
443 rws->next = new;
444 }
445
446 new->free = new->size - RWS_ANCHOR_SIZE;
447 *rwsptr = new;
448 return 0;
449 }
450
451
452
453 /*************************************************
454 * Match a Regular Expression - DFA engine *
455 *************************************************/
456
457 /* This internal function applies a compiled pattern to a subject string,
458 starting at a given point, using a DFA engine. This function is called from the
459 external one, possibly multiple times if the pattern is not anchored. The
460 function calls itself recursively for some kinds of subpattern.
461
462 Arguments:
463 mb the match_data block with fixed information
464 this_start_code the opening bracket of this subexpression's code
465 current_subject where we currently are in the subject string
466 start_offset start offset in the subject string
467 offsets vector to contain the matching string offsets
468 offsetcount size of same
469 workspace vector of workspace
470 wscount size of same
471 rlevel function call recursion level
472
473 Returns: > 0 => number of match offset pairs placed in offsets
474 = 0 => offsets overflowed; longest matches are present
475 -1 => failed to match
476 < -1 => some kind of unexpected problem
477
478 The following macros are used for adding states to the two state vectors (one
479 for the current character, one for the following character). */
480
/* Each macro appends one state to a state list that belongs to the enclosing
function. The ACTIVE forms use active_count and next_active_state; the NEW
forms use new_count and next_new_state. The relevant counter is incremented on
every use. If it had already reached wscount, the enclosing function returns
PCRE2_ERROR_DFA_WSSIZE and nothing is stored. The two-argument forms leave the
data member of the new entry unset. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state++; \
    } \
  while (0)

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->data = (z); \
    next_active_state++; \
    } \
  while (0)

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state++; \
    } \
  while (0)

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->data = (z); \
    next_new_state++; \
    } \
  while (0)
518
519 /* And now, here is the code */
520
521 static int
internal_dfa_match(dfa_match_block * mb,PCRE2_SPTR this_start_code,PCRE2_SPTR current_subject,PCRE2_SIZE start_offset,PCRE2_SIZE * offsets,uint32_t offsetcount,int * workspace,int wscount,uint32_t rlevel,int * RWS)522 internal_dfa_match(
523 dfa_match_block *mb,
524 PCRE2_SPTR this_start_code,
525 PCRE2_SPTR current_subject,
526 PCRE2_SIZE start_offset,
527 PCRE2_SIZE *offsets,
528 uint32_t offsetcount,
529 int *workspace,
530 int wscount,
531 uint32_t rlevel,
532 int *RWS)
533 {
534 stateblock *active_states, *new_states, *temp_states;
535 stateblock *next_active_state, *next_new_state;
536 const uint8_t *ctypes, *lcc, *fcc;
537 PCRE2_SPTR ptr;
538 PCRE2_SPTR end_code;
539 dfa_recursion_info new_recursive;
540 int active_count, new_count, match_count;
541
542 /* Some fields in the mb block are frequently referenced, so we load them into
543 independent variables in the hope that this will perform better. */
544
545 PCRE2_SPTR start_subject = mb->start_subject;
546 PCRE2_SPTR end_subject = mb->end_subject;
547 PCRE2_SPTR start_code = mb->start_code;
548
549 #ifdef SUPPORT_UNICODE
550 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
551 BOOL utf_or_ucp = utf || (mb->poptions & PCRE2_UCP) != 0;
552 #else
553 BOOL utf = FALSE;
554 #endif
555
556 BOOL reset_could_continue = FALSE;
557
558 if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;
559 if (rlevel++ > mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
560 offsetcount &= (uint32_t)(-2); /* Round down */
561
562 wscount -= 2;
563 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
564 (2 * INTS_PER_STATEBLOCK);
565
566 ctypes = mb->tables + ctypes_offset;
567 lcc = mb->tables + lcc_offset;
568 fcc = mb->tables + fcc_offset;
569
570 match_count = PCRE2_ERROR_NOMATCH; /* A negative number */
571
572 active_states = (stateblock *)(workspace + 2);
573 next_new_state = new_states = active_states + wscount;
574 new_count = 0;
575
576 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
577 the alternative states onto the list, and find out where the end is. This
578 makes it possible to use this function recursively, when we want to stop at a
579 matching internal ket rather than at the end.
580
581 If we are dealing with a backward assertion we have to find out the maximum
582 amount to move back, and set up each alternative appropriately. */
583
584 if (*this_start_code == OP_ASSERTBACK || *this_start_code == OP_ASSERTBACK_NOT)
585 {
586 size_t max_back = 0;
587 size_t gone_back;
588
589 end_code = this_start_code;
590 do
591 {
592 size_t back = (size_t)GET(end_code, 2+LINK_SIZE);
593 if (back > max_back) max_back = back;
594 end_code += GET(end_code, 1);
595 }
596 while (*end_code == OP_ALT);
597
598 /* If we can't go back the amount required for the longest lookbehind
599 pattern, go back as far as we can; some alternatives may still be viable. */
600
601 #ifdef SUPPORT_UNICODE
602 /* In character mode we have to step back character by character */
603
604 if (utf)
605 {
606 for (gone_back = 0; gone_back < max_back; gone_back++)
607 {
608 if (current_subject <= start_subject) break;
609 current_subject--;
610 ACROSSCHAR(current_subject > start_subject, current_subject,
611 current_subject--);
612 }
613 }
614 else
615 #endif
616
617 /* In byte-mode we can do this quickly. */
618
619 {
620 size_t current_offset = (size_t)(current_subject - start_subject);
621 gone_back = (current_offset < max_back)? current_offset : max_back;
622 current_subject -= gone_back;
623 }
624
625 /* Save the earliest consulted character */
626
627 if (current_subject < mb->start_used_ptr)
628 mb->start_used_ptr = current_subject;
629
630 /* Now we can process the individual branches. There will be an OP_REVERSE at
631 the start of each branch, except when the length of the branch is zero. */
632
633 end_code = this_start_code;
634 do
635 {
636 uint32_t revlen = (end_code[1+LINK_SIZE] == OP_REVERSE)? 1 + LINK_SIZE : 0;
637 size_t back = (revlen == 0)? 0 : (size_t)GET(end_code, 2+LINK_SIZE);
638 if (back <= gone_back)
639 {
640 int bstate = (int)(end_code - start_code + 1 + LINK_SIZE + revlen);
641 ADD_NEW_DATA(-bstate, 0, (int)(gone_back - back));
642 }
643 end_code += GET(end_code, 1);
644 }
645 while (*end_code == OP_ALT);
646 }
647
648 /* This is the code for a "normal" subpattern (not a backward assertion). The
649 start of a whole pattern is always one of these. If we are at the top level,
650 we may be asked to restart matching from the same point that we reached for a
651 previous partial match. We still have to scan through the top-level branches to
652 find the end state. */
653
654 else
655 {
656 end_code = this_start_code;
657
658 /* Restarting */
659
660 if (rlevel == 1 && (mb->moptions & PCRE2_DFA_RESTART) != 0)
661 {
662 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
663 new_count = workspace[1];
664 if (!workspace[0])
665 memcpy(new_states, active_states, (size_t)new_count * sizeof(stateblock));
666 }
667
668 /* Not restarting */
669
670 else
671 {
672 int length = 1 + LINK_SIZE +
673 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
674 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
675 ? IMM2_SIZE:0);
676 do
677 {
678 ADD_NEW((int)(end_code - start_code + length), 0);
679 end_code += GET(end_code, 1);
680 length = 1 + LINK_SIZE;
681 }
682 while (*end_code == OP_ALT);
683 }
684 }
685
686 workspace[0] = 0; /* Bit indicating which vector is current */
687
688 /* Loop for scanning the subject */
689
690 ptr = current_subject;
691 for (;;)
692 {
693 int i, j;
694 int clen, dlen;
695 uint32_t c, d;
696 int forced_fail = 0;
697 BOOL partial_newline = FALSE;
698 BOOL could_continue = reset_could_continue;
699 reset_could_continue = FALSE;
700
701 if (ptr > mb->last_used_ptr) mb->last_used_ptr = ptr;
702
703 /* Make the new state list into the active state list and empty the
704 new state list. */
705
706 temp_states = active_states;
707 active_states = new_states;
708 new_states = temp_states;
709 active_count = new_count;
710 new_count = 0;
711
712 workspace[0] ^= 1; /* Remember for the restarting feature */
713 workspace[1] = active_count;
714
715 /* Set the pointers for adding new states */
716
717 next_active_state = active_states + active_count;
718 next_new_state = new_states;
719
720 /* Load the current character from the subject outside the loop, as many
721 different states may want to look at it, and we assume that at least one
722 will. */
723
724 if (ptr < end_subject)
725 {
726 clen = 1; /* Number of data items in the character */
727 #ifdef SUPPORT_UNICODE
728 GETCHARLENTEST(c, ptr, clen);
729 #else
730 c = *ptr;
731 #endif /* SUPPORT_UNICODE */
732 }
733 else
734 {
735 clen = 0; /* This indicates the end of the subject */
736 c = NOTACHAR; /* This value should never actually be used */
737 }
738
739 /* Scan up the active states and act on each one. The result of an action
740 may be to add more states to the currently active list (e.g. on hitting a
741 parenthesis) or it may be to put states on the new list, for considering
742 when we move the character pointer on. */
743
744 for (i = 0; i < active_count; i++)
745 {
746 stateblock *current_state = active_states + i;
747 BOOL caseless = FALSE;
748 PCRE2_SPTR code;
749 uint32_t codevalue;
750 int state_offset = current_state->offset;
751 int rrc;
752 int count;
753
754 /* A negative offset is a special case meaning "hold off going to this
755 (negated) state until the number of characters in the data field have
756 been skipped". If the could_continue flag was passed over from a previous
757 state, arrange for it to be passed on. */
758
759 if (state_offset < 0)
760 {
761 if (current_state->data > 0)
762 {
763 ADD_NEW_DATA(state_offset, current_state->count,
764 current_state->data - 1);
765 if (could_continue) reset_could_continue = TRUE;
766 continue;
767 }
768 else
769 {
770 current_state->offset = state_offset = -state_offset;
771 }
772 }
773
774 /* Check for a duplicate state with the same count, and skip if found.
775 See the note at the head of this module about the possibility of improving
776 performance here. */
777
778 for (j = 0; j < i; j++)
779 {
780 if (active_states[j].offset == state_offset &&
781 active_states[j].count == current_state->count)
782 goto NEXT_ACTIVE_STATE;
783 }
784
785 /* The state offset is the offset to the opcode */
786
787 code = start_code + state_offset;
788 codevalue = *code;
789
790 /* If this opcode inspects a character, but we are at the end of the
791 subject, remember the fact for use when testing for a partial match. */
792
793 if (clen == 0 && poptable[codevalue] != 0)
794 could_continue = TRUE;
795
796 /* If this opcode is followed by an inline character, load it. It is
797 tempting to test for the presence of a subject character here, but that
798 is wrong, because sometimes zero repetitions of the subject are
799 permitted.
800
801 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
802 argument that is not a data character - but is always one byte long because
803 the values are small. We have to take special action to deal with \P, \p,
804 \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
805 these ones to new opcodes. */
806
807 if (coptable[codevalue] > 0)
808 {
809 dlen = 1;
810 #ifdef SUPPORT_UNICODE
811 if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
812 #endif /* SUPPORT_UNICODE */
813 d = code[coptable[codevalue]];
814 if (codevalue >= OP_TYPESTAR)
815 {
816 switch(d)
817 {
818 case OP_ANYBYTE: return PCRE2_ERROR_DFA_UITEM;
819 case OP_NOTPROP:
820 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
821 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
822 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
823 case OP_NOT_HSPACE:
824 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
825 case OP_NOT_VSPACE:
826 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
827 default: break;
828 }
829 }
830 }
831 else
832 {
833 dlen = 0; /* Not strictly necessary, but compilers moan */
834 d = NOTACHAR; /* if these variables are not set. */
835 }
836
837
838 /* Now process the individual opcodes */
839
840 switch (codevalue)
841 {
842 /* ========================================================================== */
843 /* These cases are never obeyed. This is a fudge that causes a compile-
844 time error if the vectors coptable or poptable, which are indexed by
845 opcode, are not the correct length. It seems to be the only way to do
846 such a check at compile time, as the sizeof() operator does not work
847 in the C preprocessor. */
848
849 case OP_TABLE_LENGTH:
850 case OP_TABLE_LENGTH +
851 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
852 (sizeof(poptable) == OP_TABLE_LENGTH)):
853 return 0;
854
855 /* ========================================================================== */
856 /* Reached a closing bracket. If not at the end of the pattern, carry
857 on with the next opcode. For repeating opcodes, also add the repeat
858 state. Note that KETRPOS will always be encountered at the end of the
859 subpattern, because the possessive subpattern repeats are always handled
860 using recursive calls. Thus, it never adds any new states.
861
862 At the end of the (sub)pattern, unless we have an empty string and
863 PCRE2_NOTEMPTY is set, or PCRE2_NOTEMPTY_ATSTART is set and we are at the
864 start of the subject, save the match data, shifting up all previous
865 matches so we always have the longest first. */
866
867 case OP_KET:
868 case OP_KETRMIN:
869 case OP_KETRMAX:
870 case OP_KETRPOS:
871 if (code != end_code)
872 {
873 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
874 if (codevalue != OP_KET)
875 {
876 ADD_ACTIVE(state_offset - (int)GET(code, 1), 0);
877 }
878 }
879 else
880 {
881 if (ptr > current_subject ||
882 ((mb->moptions & PCRE2_NOTEMPTY) == 0 &&
883 ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) == 0 ||
884 current_subject > start_subject + mb->start_offset)))
885 {
886 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
887 else if (match_count > 0 && ++match_count * 2 > (int)offsetcount)
888 match_count = 0;
889 count = ((match_count == 0)? (int)offsetcount : match_count * 2) - 2;
890 if (count > 0) (void)memmove(offsets + 2, offsets,
891 (size_t)count * sizeof(PCRE2_SIZE));
892 if (offsetcount >= 2)
893 {
894 offsets[0] = (PCRE2_SIZE)(current_subject - start_subject);
895 offsets[1] = (PCRE2_SIZE)(ptr - start_subject);
896 }
897 if ((mb->moptions & PCRE2_DFA_SHORTEST) != 0) return match_count;
898 }
899 }
900 break;
901
902 /* ========================================================================== */
903 /* These opcodes add to the current list of states without looking
904 at the current character. */
905
906 /*-----------------------------------------------------------------*/
907 case OP_ALT:
908 do { code += GET(code, 1); } while (*code == OP_ALT);
909 ADD_ACTIVE((int)(code - start_code), 0);
910 break;
911
912 /*-----------------------------------------------------------------*/
913 case OP_BRA:
914 case OP_SBRA:
915 do
916 {
917 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
918 code += GET(code, 1);
919 }
920 while (*code == OP_ALT);
921 break;
922
923 /*-----------------------------------------------------------------*/
924 case OP_CBRA:
925 case OP_SCBRA:
926 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
927 code += GET(code, 1);
928 while (*code == OP_ALT)
929 {
930 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
931 code += GET(code, 1);
932 }
933 break;
934
935 /*-----------------------------------------------------------------*/
936 case OP_BRAZERO:
937 case OP_BRAMINZERO:
938 ADD_ACTIVE(state_offset + 1, 0);
939 code += 1 + GET(code, 2);
940 while (*code == OP_ALT) code += GET(code, 1);
941 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
942 break;
943
944 /*-----------------------------------------------------------------*/
945 case OP_SKIPZERO:
946 code += 1 + GET(code, 2);
947 while (*code == OP_ALT) code += GET(code, 1);
948 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
949 break;
950
951 /*-----------------------------------------------------------------*/
952 case OP_CIRC:
953 if (ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0)
954 { ADD_ACTIVE(state_offset + 1, 0); }
955 break;
956
957 /*-----------------------------------------------------------------*/
958 case OP_CIRCM:
959 if ((ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) ||
960 ((ptr != end_subject || (mb->poptions & PCRE2_ALT_CIRCUMFLEX) != 0 )
961 && WAS_NEWLINE(ptr)))
962 { ADD_ACTIVE(state_offset + 1, 0); }
963 break;
964
965 /*-----------------------------------------------------------------*/
966 case OP_EOD:
967 if (ptr >= end_subject)
968 {
969 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
970 return PCRE2_ERROR_PARTIAL;
971 else { ADD_ACTIVE(state_offset + 1, 0); }
972 }
973 break;
974
975 /*-----------------------------------------------------------------*/
976 case OP_SOD:
977 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
978 break;
979
980 /*-----------------------------------------------------------------*/
981 case OP_SOM:
982 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
983 break;
984
985
986 /* ========================================================================== */
987 /* These opcodes inspect the next subject character, and sometimes
988 the previous one as well, but do not have an argument. The variable
989 clen contains the length of the current character and is zero if we are
990 at the end of the subject. */
991
992 /*-----------------------------------------------------------------*/
993 case OP_ANY:
994 if (clen > 0 && !IS_NEWLINE(ptr))
995 {
996 if (ptr + 1 >= mb->end_subject &&
997 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
998 NLBLOCK->nltype == NLTYPE_FIXED &&
999 NLBLOCK->nllen == 2 &&
1000 c == NLBLOCK->nl[0])
1001 {
1002 could_continue = partial_newline = TRUE;
1003 }
1004 else
1005 {
1006 ADD_NEW(state_offset + 1, 0);
1007 }
1008 }
1009 break;
1010
1011 /*-----------------------------------------------------------------*/
1012 case OP_ALLANY:
1013 if (clen > 0)
1014 { ADD_NEW(state_offset + 1, 0); }
1015 break;
1016
1017 /*-----------------------------------------------------------------*/
1018 case OP_EODN:
1019 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - mb->nllen))
1020 {
1021 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1022 return PCRE2_ERROR_PARTIAL;
1023 ADD_ACTIVE(state_offset + 1, 0);
1024 }
1025 break;
1026
1027 /*-----------------------------------------------------------------*/
1028 case OP_DOLL:
1029 if ((mb->moptions & PCRE2_NOTEOL) == 0)
1030 {
1031 if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1032 could_continue = TRUE;
1033 else if (clen == 0 ||
1034 ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
1035 (ptr == end_subject - mb->nllen)
1036 ))
1037 { ADD_ACTIVE(state_offset + 1, 0); }
1038 else if (ptr + 1 >= mb->end_subject &&
1039 (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1040 NLBLOCK->nltype == NLTYPE_FIXED &&
1041 NLBLOCK->nllen == 2 &&
1042 c == NLBLOCK->nl[0])
1043 {
1044 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1045 {
1046 reset_could_continue = TRUE;
1047 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1048 }
1049 else could_continue = partial_newline = TRUE;
1050 }
1051 }
1052 break;
1053
1054 /*-----------------------------------------------------------------*/
1055 case OP_DOLLM:
1056 if ((mb->moptions & PCRE2_NOTEOL) == 0)
1057 {
1058 if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1059 could_continue = TRUE;
1060 else if (clen == 0 ||
1061 ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
1062 { ADD_ACTIVE(state_offset + 1, 0); }
1063 else if (ptr + 1 >= mb->end_subject &&
1064 (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1065 NLBLOCK->nltype == NLTYPE_FIXED &&
1066 NLBLOCK->nllen == 2 &&
1067 c == NLBLOCK->nl[0])
1068 {
1069 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1070 {
1071 reset_could_continue = TRUE;
1072 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1073 }
1074 else could_continue = partial_newline = TRUE;
1075 }
1076 }
1077 else if (IS_NEWLINE(ptr))
1078 { ADD_ACTIVE(state_offset + 1, 0); }
1079 break;
1080
1081 /*-----------------------------------------------------------------*/
1082
1083 case OP_DIGIT:
1084 case OP_WHITESPACE:
1085 case OP_WORDCHAR:
1086 if (clen > 0 && c < 256 &&
1087 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
1088 { ADD_NEW(state_offset + 1, 0); }
1089 break;
1090
1091 /*-----------------------------------------------------------------*/
1092 case OP_NOT_DIGIT:
1093 case OP_NOT_WHITESPACE:
1094 case OP_NOT_WORDCHAR:
1095 if (clen > 0 && (c >= 256 ||
1096 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
1097 { ADD_NEW(state_offset + 1, 0); }
1098 break;
1099
1100 /*-----------------------------------------------------------------*/
1101 case OP_WORD_BOUNDARY:
1102 case OP_NOT_WORD_BOUNDARY:
1103 {
1104 int left_word, right_word;
1105
1106 if (ptr > start_subject)
1107 {
1108 PCRE2_SPTR temp = ptr - 1;
1109 if (temp < mb->start_used_ptr) mb->start_used_ptr = temp;
1110 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1111 if (utf) { BACKCHAR(temp); }
1112 #endif
1113 GETCHARTEST(d, temp);
1114 #ifdef SUPPORT_UNICODE
1115 if ((mb->poptions & PCRE2_UCP) != 0)
1116 {
1117 if (d == '_') left_word = TRUE; else
1118 {
1119 uint32_t cat = UCD_CATEGORY(d);
1120 left_word = (cat == ucp_L || cat == ucp_N);
1121 }
1122 }
1123 else
1124 #endif
1125 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1126 }
1127 else left_word = FALSE;
1128
1129 if (clen > 0)
1130 {
1131 if (ptr >= mb->last_used_ptr)
1132 {
1133 PCRE2_SPTR temp = ptr + 1;
1134 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1135 if (utf) { FORWARDCHARTEST(temp, mb->end_subject); }
1136 #endif
1137 mb->last_used_ptr = temp;
1138 }
1139 #ifdef SUPPORT_UNICODE
1140 if ((mb->poptions & PCRE2_UCP) != 0)
1141 {
1142 if (c == '_') right_word = TRUE; else
1143 {
1144 uint32_t cat = UCD_CATEGORY(c);
1145 right_word = (cat == ucp_L || cat == ucp_N);
1146 }
1147 }
1148 else
1149 #endif
1150 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1151 }
1152 else right_word = FALSE;
1153
1154 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1155 { ADD_ACTIVE(state_offset + 1, 0); }
1156 }
1157 break;
1158
1159
1160 /*-----------------------------------------------------------------*/
1161 /* Check the next character by Unicode property. We will get here only
1162 if the support is in the binary; otherwise a compile-time error occurs.
1163 */
1164
1165 #ifdef SUPPORT_UNICODE
1166 case OP_PROP:
1167 case OP_NOTPROP:
1168 if (clen > 0)
1169 {
1170 BOOL OK;
1171 const uint32_t *cp;
1172 const ucd_record * prop = GET_UCD(c);
1173 switch(code[1])
1174 {
1175 case PT_ANY:
1176 OK = TRUE;
1177 break;
1178
1179 case PT_LAMP:
1180 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1181 prop->chartype == ucp_Lt;
1182 break;
1183
1184 case PT_GC:
1185 OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1186 break;
1187
1188 case PT_PC:
1189 OK = prop->chartype == code[2];
1190 break;
1191
1192 case PT_SC:
1193 OK = prop->script == code[2];
1194 break;
1195
1196 case PT_SCX:
1197 OK = (prop->script == code[2] ||
1198 MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[2]) != 0);
1199 break;
1200
1201 /* These are specials for combination cases. */
1202
1203 case PT_ALNUM:
1204 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1205 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1206 break;
1207
1208 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1209 which means that Perl space and POSIX space are now identical. PCRE
1210 was changed at release 8.34. */
1211
1212 case PT_SPACE: /* Perl space */
1213 case PT_PXSPACE: /* POSIX space */
1214 switch(c)
1215 {
1216 HSPACE_CASES:
1217 VSPACE_CASES:
1218 OK = TRUE;
1219 break;
1220
1221 default:
1222 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1223 break;
1224 }
1225 break;
1226
1227 case PT_WORD:
1228 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1229 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1230 c == CHAR_UNDERSCORE;
1231 break;
1232
1233 case PT_CLIST:
1234 cp = PRIV(ucd_caseless_sets) + code[2];
1235 for (;;)
1236 {
1237 if (c < *cp) { OK = FALSE; break; }
1238 if (c == *cp++) { OK = TRUE; break; }
1239 }
1240 break;
1241
1242 case PT_UCNC:
1243 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1244 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1245 c >= 0xe000;
1246 break;
1247
1248 case PT_BIDICL:
1249 OK = UCD_BIDICLASS(c) == code[2];
1250 break;
1251
1252 case PT_BOOL:
1253 OK = MAPBIT(PRIV(ucd_boolprop_sets) +
1254 UCD_BPROPS_PROP(prop), code[2]) != 0;
1255 break;
1256
1257 /* Should never occur, but keep compilers from grumbling. */
1258
1259 default:
1260 OK = codevalue != OP_PROP;
1261 break;
1262 }
1263
1264 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1265 }
1266 break;
1267 #endif
1268
1269
1270
1271 /* ========================================================================== */
1272 /* These opcodes likewise inspect the subject character, but have an
1273 argument that is not a data character. It is one of these opcodes:
1274 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1275 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1276
1277 case OP_TYPEPLUS:
1278 case OP_TYPEMINPLUS:
1279 case OP_TYPEPOSPLUS:
1280 count = current_state->count; /* Already matched */
1281 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1282 if (clen > 0)
1283 {
1284 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1285 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1286 NLBLOCK->nltype == NLTYPE_FIXED &&
1287 NLBLOCK->nllen == 2 &&
1288 c == NLBLOCK->nl[0])
1289 {
1290 could_continue = partial_newline = TRUE;
1291 }
1292 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1293 (c < 256 &&
1294 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1295 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1296 {
1297 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1298 {
1299 active_count--; /* Remove non-match possibility */
1300 next_active_state--;
1301 }
1302 count++;
1303 ADD_NEW(state_offset, count);
1304 }
1305 }
1306 break;
1307
1308 /*-----------------------------------------------------------------*/
1309 case OP_TYPEQUERY:
1310 case OP_TYPEMINQUERY:
1311 case OP_TYPEPOSQUERY:
1312 ADD_ACTIVE(state_offset + 2, 0);
1313 if (clen > 0)
1314 {
1315 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1316 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1317 NLBLOCK->nltype == NLTYPE_FIXED &&
1318 NLBLOCK->nllen == 2 &&
1319 c == NLBLOCK->nl[0])
1320 {
1321 could_continue = partial_newline = TRUE;
1322 }
1323 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1324 (c < 256 &&
1325 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1326 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1327 {
1328 if (codevalue == OP_TYPEPOSQUERY)
1329 {
1330 active_count--; /* Remove non-match possibility */
1331 next_active_state--;
1332 }
1333 ADD_NEW(state_offset + 2, 0);
1334 }
1335 }
1336 break;
1337
1338 /*-----------------------------------------------------------------*/
1339 case OP_TYPESTAR:
1340 case OP_TYPEMINSTAR:
1341 case OP_TYPEPOSSTAR:
1342 ADD_ACTIVE(state_offset + 2, 0);
1343 if (clen > 0)
1344 {
1345 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1346 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1347 NLBLOCK->nltype == NLTYPE_FIXED &&
1348 NLBLOCK->nllen == 2 &&
1349 c == NLBLOCK->nl[0])
1350 {
1351 could_continue = partial_newline = TRUE;
1352 }
1353 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1354 (c < 256 &&
1355 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1356 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1357 {
1358 if (codevalue == OP_TYPEPOSSTAR)
1359 {
1360 active_count--; /* Remove non-match possibility */
1361 next_active_state--;
1362 }
1363 ADD_NEW(state_offset, 0);
1364 }
1365 }
1366 break;
1367
1368 /*-----------------------------------------------------------------*/
1369 case OP_TYPEEXACT:
1370 count = current_state->count; /* Number already matched */
1371 if (clen > 0)
1372 {
1373 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1374 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1375 NLBLOCK->nltype == NLTYPE_FIXED &&
1376 NLBLOCK->nllen == 2 &&
1377 c == NLBLOCK->nl[0])
1378 {
1379 could_continue = partial_newline = TRUE;
1380 }
1381 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1382 (c < 256 &&
1383 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1384 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1385 {
1386 if (++count >= (int)GET2(code, 1))
1387 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1388 else
1389 { ADD_NEW(state_offset, count); }
1390 }
1391 }
1392 break;
1393
1394 /*-----------------------------------------------------------------*/
1395 case OP_TYPEUPTO:
1396 case OP_TYPEMINUPTO:
1397 case OP_TYPEPOSUPTO:
1398 ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1399 count = current_state->count; /* Number already matched */
1400 if (clen > 0)
1401 {
1402 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1403 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1404 NLBLOCK->nltype == NLTYPE_FIXED &&
1405 NLBLOCK->nllen == 2 &&
1406 c == NLBLOCK->nl[0])
1407 {
1408 could_continue = partial_newline = TRUE;
1409 }
1410 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1411 (c < 256 &&
1412 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1413 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1414 {
1415 if (codevalue == OP_TYPEPOSUPTO)
1416 {
1417 active_count--; /* Remove non-match possibility */
1418 next_active_state--;
1419 }
1420 if (++count >= (int)GET2(code, 1))
1421 { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1422 else
1423 { ADD_NEW(state_offset, count); }
1424 }
1425 }
1426 break;
1427
1428 /* ========================================================================== */
1429 /* These are virtual opcodes that are used when something like
1430 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or one of the
1431 OP_VSPACE/OP_HSPACE family as its argument. Handling them separately keeps
1432 the code above fast for the other cases. The argument is in the d variable. */
1433
1434 #ifdef SUPPORT_UNICODE
1435 case OP_PROP_EXTRA + OP_TYPEPLUS:
1436 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1437 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1438 count = current_state->count; /* Already matched */
1439 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1440 if (clen > 0)
1441 {
1442 BOOL OK;
1443 const uint32_t *cp;
1444 const ucd_record * prop = GET_UCD(c);
1445 switch(code[2])
1446 {
1447 case PT_ANY:
1448 OK = TRUE;
1449 break;
1450
1451 case PT_LAMP:
1452 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1453 prop->chartype == ucp_Lt;
1454 break;
1455
1456 case PT_GC:
1457 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1458 break;
1459
1460 case PT_PC:
1461 OK = prop->chartype == code[3];
1462 break;
1463
1464 case PT_SC:
1465 OK = prop->script == code[3];
1466 break;
1467
1468 case PT_SCX:
1469 OK = (prop->script == code[3] ||
1470 MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[3]) != 0);
1471 break;
1472
1473 /* These are specials for combination cases. */
1474
1475 case PT_ALNUM:
1476 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1477 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1478 break;
1479
1480 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1481 which means that Perl space and POSIX space are now identical. PCRE
1482 was changed at release 8.34. */
1483
1484 case PT_SPACE: /* Perl space */
1485 case PT_PXSPACE: /* POSIX space */
1486 switch(c)
1487 {
1488 HSPACE_CASES:
1489 VSPACE_CASES:
1490 OK = TRUE;
1491 break;
1492
1493 default:
1494 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1495 break;
1496 }
1497 break;
1498
1499 case PT_WORD:
1500 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1501 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1502 c == CHAR_UNDERSCORE;
1503 break;
1504
1505 case PT_CLIST:
1506 cp = PRIV(ucd_caseless_sets) + code[3];
1507 for (;;)
1508 {
1509 if (c < *cp) { OK = FALSE; break; }
1510 if (c == *cp++) { OK = TRUE; break; }
1511 }
1512 break;
1513
1514 case PT_UCNC:
1515 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1516 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1517 c >= 0xe000;
1518 break;
1519
1520 case PT_BIDICL:
1521 OK = UCD_BIDICLASS(c) == code[3];
1522 break;
1523
1524 case PT_BOOL:
1525 OK = MAPBIT(PRIV(ucd_boolprop_sets) +
1526 UCD_BPROPS_PROP(prop), code[3]) != 0;
1527 break;
1528
1529 /* Should never occur, but keep compilers from grumbling. */
1530
1531 default:
1532 OK = codevalue != OP_PROP;
1533 break;
1534 }
1535
1536 if (OK == (d == OP_PROP))
1537 {
1538 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1539 {
1540 active_count--; /* Remove non-match possibility */
1541 next_active_state--;
1542 }
1543 count++;
1544 ADD_NEW(state_offset, count);
1545 }
1546 }
1547 break;
1548
1549 /*-----------------------------------------------------------------*/
1550 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1551 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1552 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1553 count = current_state->count; /* Already matched */
1554 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1555 if (clen > 0)
1556 {
1557 int ncount = 0;
1558 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1559 {
1560 active_count--; /* Remove non-match possibility */
1561 next_active_state--;
1562 }
1563 (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1564 &ncount);
1565 count++;
1566 ADD_NEW_DATA(-state_offset, count, ncount);
1567 }
1568 break;
1569 #endif
1570
1571 /*-----------------------------------------------------------------*/
1572 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1573 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1574 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1575 count = current_state->count; /* Already matched */
1576 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1577 if (clen > 0)
1578 {
1579 int ncount = 0;
1580 switch (c)
1581 {
1582 case CHAR_VT:
1583 case CHAR_FF:
1584 case CHAR_NEL:
1585 #ifndef EBCDIC
1586 case 0x2028:
1587 case 0x2029:
1588 #endif /* Not EBCDIC */
1589 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1590 goto ANYNL01;
1591
1592 case CHAR_CR:
1593 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1594 /* Fall through */
1595
1596 ANYNL01:
1597 case CHAR_LF:
1598 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1599 {
1600 active_count--; /* Remove non-match possibility */
1601 next_active_state--;
1602 }
1603 count++;
1604 ADD_NEW_DATA(-state_offset, count, ncount);
1605 break;
1606
1607 default:
1608 break;
1609 }
1610 }
1611 break;
1612
1613 /*-----------------------------------------------------------------*/
1614 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1615 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1616 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1617 count = current_state->count; /* Already matched */
1618 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1619 if (clen > 0)
1620 {
1621 BOOL OK;
1622 switch (c)
1623 {
1624 VSPACE_CASES:
1625 OK = TRUE;
1626 break;
1627
1628 default:
1629 OK = FALSE;
1630 break;
1631 }
1632
1633 if (OK == (d == OP_VSPACE))
1634 {
1635 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1636 {
1637 active_count--; /* Remove non-match possibility */
1638 next_active_state--;
1639 }
1640 count++;
1641 ADD_NEW_DATA(-state_offset, count, 0);
1642 }
1643 }
1644 break;
1645
1646 /*-----------------------------------------------------------------*/
1647 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1648 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1649 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1650 count = current_state->count; /* Already matched */
1651 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1652 if (clen > 0)
1653 {
1654 BOOL OK;
1655 switch (c)
1656 {
1657 HSPACE_CASES:
1658 OK = TRUE;
1659 break;
1660
1661 default:
1662 OK = FALSE;
1663 break;
1664 }
1665
1666 if (OK == (d == OP_HSPACE))
1667 {
1668 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1669 {
1670 active_count--; /* Remove non-match possibility */
1671 next_active_state--;
1672 }
1673 count++;
1674 ADD_NEW_DATA(-state_offset, count, 0);
1675 }
1676 }
1677 break;
1678
1679 /*-----------------------------------------------------------------*/
1680 #ifdef SUPPORT_UNICODE
1681 case OP_PROP_EXTRA + OP_TYPEQUERY:
1682 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1683 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1684 count = 4;
1685 goto QS1;
1686
1687 case OP_PROP_EXTRA + OP_TYPESTAR:
1688 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1689 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1690 count = 0;
1691
1692 QS1:
1693
1694 ADD_ACTIVE(state_offset + 4, 0);
1695 if (clen > 0)
1696 {
1697 BOOL OK;
1698 const uint32_t *cp;
1699 const ucd_record * prop = GET_UCD(c);
1700 switch(code[2])
1701 {
1702 case PT_ANY:
1703 OK = TRUE;
1704 break;
1705
1706 case PT_LAMP:
1707 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1708 prop->chartype == ucp_Lt;
1709 break;
1710
1711 case PT_GC:
1712 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1713 break;
1714
1715 case PT_PC:
1716 OK = prop->chartype == code[3];
1717 break;
1718
1719 case PT_SC:
1720 OK = prop->script == code[3];
1721 break;
1722
1723 case PT_SCX:
1724 OK = (prop->script == code[3] ||
1725 MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[3]) != 0);
1726 break;
1727
1728 /* These are specials for combination cases. */
1729
1730 case PT_ALNUM:
1731 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1732 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1733 break;
1734
1735 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1736 which means that Perl space and POSIX space are now identical. PCRE
1737 was changed at release 8.34. */
1738
1739 case PT_SPACE: /* Perl space */
1740 case PT_PXSPACE: /* POSIX space */
1741 switch(c)
1742 {
1743 HSPACE_CASES:
1744 VSPACE_CASES:
1745 OK = TRUE;
1746 break;
1747
1748 default:
1749 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1750 break;
1751 }
1752 break;
1753
1754 case PT_WORD:
1755 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1756 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1757 c == CHAR_UNDERSCORE;
1758 break;
1759
1760 case PT_CLIST:
1761 cp = PRIV(ucd_caseless_sets) + code[3];
1762 for (;;)
1763 {
1764 if (c < *cp) { OK = FALSE; break; }
1765 if (c == *cp++) { OK = TRUE; break; }
1766 }
1767 break;
1768
1769 case PT_UCNC:
1770 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1771 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1772 c >= 0xe000;
1773 break;
1774
1775 case PT_BIDICL:
1776 OK = UCD_BIDICLASS(c) == code[3];
1777 break;
1778
1779 case PT_BOOL:
1780 OK = MAPBIT(PRIV(ucd_boolprop_sets) +
1781 UCD_BPROPS_PROP(prop), code[3]) != 0;
1782 break;
1783
1784 /* Should never occur, but keep compilers from grumbling. */
1785
1786 default:
1787 OK = codevalue != OP_PROP;
1788 break;
1789 }
1790
1791 if (OK == (d == OP_PROP))
1792 {
1793 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1794 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1795 {
1796 active_count--; /* Remove non-match possibility */
1797 next_active_state--;
1798 }
1799 ADD_NEW(state_offset + count, 0);
1800 }
1801 }
1802 break;
1803
1804 /*-----------------------------------------------------------------*/
1805 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1806 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1807 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1808 count = 2;
1809 goto QS2;
1810
1811 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1812 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1813 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1814 count = 0;
1815
1816 QS2:
1817
1818 ADD_ACTIVE(state_offset + 2, 0);
1819 if (clen > 0)
1820 {
1821 int ncount = 0;
1822 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1823 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1824 {
1825 active_count--; /* Remove non-match possibility */
1826 next_active_state--;
1827 }
1828 (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1829 &ncount);
1830 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1831 }
1832 break;
1833 #endif
1834
1835 /*-----------------------------------------------------------------*/
1836 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1837 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1838 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1839 count = 2;
1840 goto QS3;
1841
1842 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1843 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1844 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1845 count = 0;
1846
1847 QS3:
1848 ADD_ACTIVE(state_offset + 2, 0);
1849 if (clen > 0)
1850 {
1851 int ncount = 0;
1852 switch (c)
1853 {
1854 case CHAR_VT:
1855 case CHAR_FF:
1856 case CHAR_NEL:
1857 #ifndef EBCDIC
1858 case 0x2028:
1859 case 0x2029:
1860 #endif /* Not EBCDIC */
1861 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1862 goto ANYNL02;
1863
1864 case CHAR_CR:
1865 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1866 /* Fall through */
1867
1868 ANYNL02:
1869 case CHAR_LF:
1870 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1871 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1872 {
1873 active_count--; /* Remove non-match possibility */
1874 next_active_state--;
1875 }
1876 ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1877 break;
1878
1879 default:
1880 break;
1881 }
1882 }
1883 break;
1884
1885 /*-----------------------------------------------------------------*/
1886 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1887 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1888 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1889 count = 2;
1890 goto QS4;
1891
1892 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1893 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1894 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1895 count = 0;
1896
1897 QS4:
1898 ADD_ACTIVE(state_offset + 2, 0);
1899 if (clen > 0)
1900 {
1901 BOOL OK;
1902 switch (c)
1903 {
1904 VSPACE_CASES:
1905 OK = TRUE;
1906 break;
1907
1908 default:
1909 OK = FALSE;
1910 break;
1911 }
1912 if (OK == (d == OP_VSPACE))
1913 {
1914 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1915 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1916 {
1917 active_count--; /* Remove non-match possibility */
1918 next_active_state--;
1919 }
1920 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1921 }
1922 }
1923 break;
1924
1925 /*-----------------------------------------------------------------*/
1926 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1927 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1928 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1929 count = 2;
1930 goto QS5;
1931
1932 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1933 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1934 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1935 count = 0;
1936
1937 QS5:
1938 ADD_ACTIVE(state_offset + 2, 0);
1939 if (clen > 0)
1940 {
1941 BOOL OK;
1942 switch (c)
1943 {
1944 HSPACE_CASES:
1945 OK = TRUE;
1946 break;
1947
1948 default:
1949 OK = FALSE;
1950 break;
1951 }
1952
1953 if (OK == (d == OP_HSPACE))
1954 {
1955 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1956 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1957 {
1958 active_count--; /* Remove non-match possibility */
1959 next_active_state--;
1960 }
1961 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1962 }
1963 }
1964 break;
1965
1966 /*-----------------------------------------------------------------*/
1967 #ifdef SUPPORT_UNICODE
1968 case OP_PROP_EXTRA + OP_TYPEEXACT:
1969 case OP_PROP_EXTRA + OP_TYPEUPTO:
1970 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1971 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1972 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1973 { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1974 count = current_state->count; /* Number already matched */
1975 if (clen > 0)
1976 {
1977 BOOL OK;
1978 const uint32_t *cp;
1979 const ucd_record * prop = GET_UCD(c);
1980 switch(code[1 + IMM2_SIZE + 1])
1981 {
1982 case PT_ANY:
1983 OK = TRUE;
1984 break;
1985
1986 case PT_LAMP:
1987 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1988 prop->chartype == ucp_Lt;
1989 break;
1990
1991 case PT_GC:
1992 OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1993 break;
1994
1995 case PT_PC:
1996 OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1997 break;
1998
1999 case PT_SC:
2000 OK = prop->script == code[1 + IMM2_SIZE + 2];
2001 break;
2002
2003 case PT_SCX:
2004 OK = (prop->script == code[1 + IMM2_SIZE + 2] ||
2005 MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop),
2006 code[1 + IMM2_SIZE + 2]) != 0);
2007 break;
2008
2009 /* These are specials for combination cases. */
2010
2011 case PT_ALNUM:
2012 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2013 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
2014 break;
2015
2016 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2017 which means that Perl space and POSIX space are now identical. PCRE
2018 was changed at release 8.34. */
2019
2020 case PT_SPACE: /* Perl space */
2021 case PT_PXSPACE: /* POSIX space */
2022 switch(c)
2023 {
2024 HSPACE_CASES:
2025 VSPACE_CASES:
2026 OK = TRUE;
2027 break;
2028
2029 default:
2030 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
2031 break;
2032 }
2033 break;
2034
2035 case PT_WORD:
2036 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2037 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2038 c == CHAR_UNDERSCORE;
2039 break;
2040
2041 case PT_CLIST:
2042 cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
2043 for (;;)
2044 {
2045 if (c < *cp) { OK = FALSE; break; }
2046 if (c == *cp++) { OK = TRUE; break; }
2047 }
2048 break;
2049
2050 case PT_UCNC:
2051 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2052 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2053 c >= 0xe000;
2054 break;
2055
2056 case PT_BIDICL:
2057 OK = UCD_BIDICLASS(c) == code[1 + IMM2_SIZE + 2];
2058 break;
2059
2060 case PT_BOOL:
2061 OK = MAPBIT(PRIV(ucd_boolprop_sets) +
2062 UCD_BPROPS_PROP(prop), code[1 + IMM2_SIZE + 2]) != 0;
2063 break;
2064
2065 /* Should never occur, but keep compilers from grumbling. */
2066
2067 default:
2068 OK = codevalue != OP_PROP;
2069 break;
2070 }
2071
2072 if (OK == (d == OP_PROP))
2073 {
2074 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
2075 {
2076 active_count--; /* Remove non-match possibility */
2077 next_active_state--;
2078 }
2079 if (++count >= (int)GET2(code, 1))
2080 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
2081 else
2082 { ADD_NEW(state_offset, count); }
2083 }
2084 }
2085 break;
2086
2087 /*-----------------------------------------------------------------*/
2088 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
2089 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
2090 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
2091 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
2092 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
2093 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2094 count = current_state->count; /* Number already matched */
2095 if (clen > 0)
2096 {
2097 PCRE2_SPTR nptr;
2098 int ncount = 0;
2099 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
2100 {
2101 active_count--; /* Remove non-match possibility */
2102 next_active_state--;
2103 }
2104 nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
2105 &ncount);
2106 if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2107 reset_could_continue = TRUE;
2108 if (++count >= (int)GET2(code, 1))
2109 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2110 else
2111 { ADD_NEW_DATA(-state_offset, count, ncount); }
2112 }
2113 break;
2114 #endif
2115
2116 /*-----------------------------------------------------------------*/
2117 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
2118 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
2119 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
2120 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
2121 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
2122 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2123 count = current_state->count; /* Number already matched */
2124 if (clen > 0)
2125 {
2126 int ncount = 0;
2127 switch (c)
2128 {
2129 case CHAR_VT:
2130 case CHAR_FF:
2131 case CHAR_NEL:
2132 #ifndef EBCDIC
2133 case 0x2028:
2134 case 0x2029:
2135 #endif /* Not EBCDIC */
2136 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2137 goto ANYNL03;
2138
2139 case CHAR_CR:
2140 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
2141 /* Fall through */
2142
2143 ANYNL03:
2144 case CHAR_LF:
2145 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
2146 {
2147 active_count--; /* Remove non-match possibility */
2148 next_active_state--;
2149 }
2150 if (++count >= (int)GET2(code, 1))
2151 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2152 else
2153 { ADD_NEW_DATA(-state_offset, count, ncount); }
2154 break;
2155
2156 default:
2157 break;
2158 }
2159 }
2160 break;
2161
2162 /*-----------------------------------------------------------------*/
2163 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
2164 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
2165 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2166 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2167 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2168 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2169 count = current_state->count; /* Number already matched */
2170 if (clen > 0)
2171 {
2172 BOOL OK;
2173 switch (c)
2174 {
2175 VSPACE_CASES:
2176 OK = TRUE;
2177 break;
2178
2179 default:
2180 OK = FALSE;
2181 }
2182
2183 if (OK == (d == OP_VSPACE))
2184 {
2185 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2186 {
2187 active_count--; /* Remove non-match possibility */
2188 next_active_state--;
2189 }
2190 if (++count >= (int)GET2(code, 1))
2191 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2192 else
2193 { ADD_NEW_DATA(-state_offset, count, 0); }
2194 }
2195 }
2196 break;
2197
2198 /*-----------------------------------------------------------------*/
2199 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2200 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2201 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2202 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2203 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2204 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2205 count = current_state->count; /* Number already matched */
2206 if (clen > 0)
2207 {
2208 BOOL OK;
2209 switch (c)
2210 {
2211 HSPACE_CASES:
2212 OK = TRUE;
2213 break;
2214
2215 default:
2216 OK = FALSE;
2217 break;
2218 }
2219
2220 if (OK == (d == OP_HSPACE))
2221 {
2222 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2223 {
2224 active_count--; /* Remove non-match possibility */
2225 next_active_state--;
2226 }
2227 if (++count >= (int)GET2(code, 1))
2228 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2229 else
2230 { ADD_NEW_DATA(-state_offset, count, 0); }
2231 }
2232 }
2233 break;
2234
2235 /* ========================================================================== */
2236 /* These opcodes are followed by a character that is usually compared
2237 to the current subject character; it is loaded into d. We still get
2238 here even if there is no subject character, because in some cases zero
2239 repetitions are permitted. */
2240
2241 /*-----------------------------------------------------------------*/
2242 case OP_CHAR:
2243 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2244 break;
2245
2246 /*-----------------------------------------------------------------*/
2247 case OP_CHARI:
2248 if (clen == 0) break;
2249
2250 #ifdef SUPPORT_UNICODE
2251 if (utf_or_ucp)
2252 {
2253 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2254 {
2255 unsigned int othercase;
2256 if (c < 128)
2257 othercase = fcc[c];
2258 else
2259 othercase = UCD_OTHERCASE(c);
2260 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2261 }
2262 }
2263 else
2264 #endif /* SUPPORT_UNICODE */
2265 /* Not UTF or UCP mode */
2266 {
2267 if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2268 { ADD_NEW(state_offset + 2, 0); }
2269 }
2270 break;
2271
2272
2273 #ifdef SUPPORT_UNICODE
2274 /*-----------------------------------------------------------------*/
2275 /* This is a tricky one because it can match more than one character.
2276 Find out how many characters to skip, and then set up a negative state
2277 to wait for them to pass before continuing. */
2278
2279 case OP_EXTUNI:
2280 if (clen > 0)
2281 {
2282 int ncount = 0;
2283 PCRE2_SPTR nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject,
2284 end_subject, utf, &ncount);
2285 if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2286 reset_could_continue = TRUE;
2287 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2288 }
2289 break;
2290 #endif
2291
2292 /*-----------------------------------------------------------------*/
2293 /* This is tricky like EXTUNI because it too can match more than one
2294 character (when CR is followed by LF). In this case, set up a negative
2295 state to wait for one character to pass before continuing. */
2296
2297 case OP_ANYNL:
2298 if (clen > 0) switch(c)
2299 {
2300 case CHAR_VT:
2301 case CHAR_FF:
2302 case CHAR_NEL:
2303 #ifndef EBCDIC
2304 case 0x2028:
2305 case 0x2029:
2306 #endif /* Not EBCDIC */
2307 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2308 /* Fall through */
2309
2310 case CHAR_LF:
2311 ADD_NEW(state_offset + 1, 0);
2312 break;
2313
2314 case CHAR_CR:
2315 if (ptr + 1 >= end_subject)
2316 {
2317 ADD_NEW(state_offset + 1, 0);
2318 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2319 reset_could_continue = TRUE;
2320 }
2321 else if (UCHAR21TEST(ptr + 1) == CHAR_LF)
2322 {
2323 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2324 }
2325 else
2326 {
2327 ADD_NEW(state_offset + 1, 0);
2328 }
2329 break;
2330 }
2331 break;
2332
2333 /*-----------------------------------------------------------------*/
2334 case OP_NOT_VSPACE:
2335 if (clen > 0) switch(c)
2336 {
2337 VSPACE_CASES:
2338 break;
2339
2340 default:
2341 ADD_NEW(state_offset + 1, 0);
2342 break;
2343 }
2344 break;
2345
2346 /*-----------------------------------------------------------------*/
2347 case OP_VSPACE:
2348 if (clen > 0) switch(c)
2349 {
2350 VSPACE_CASES:
2351 ADD_NEW(state_offset + 1, 0);
2352 break;
2353
2354 default:
2355 break;
2356 }
2357 break;
2358
2359 /*-----------------------------------------------------------------*/
2360 case OP_NOT_HSPACE:
2361 if (clen > 0) switch(c)
2362 {
2363 HSPACE_CASES:
2364 break;
2365
2366 default:
2367 ADD_NEW(state_offset + 1, 0);
2368 break;
2369 }
2370 break;
2371
2372 /*-----------------------------------------------------------------*/
2373 case OP_HSPACE:
2374 if (clen > 0) switch(c)
2375 {
2376 HSPACE_CASES:
2377 ADD_NEW(state_offset + 1, 0);
2378 break;
2379
2380 default:
2381 break;
2382 }
2383 break;
2384
2385 /*-----------------------------------------------------------------*/
2386 /* Match a negated single character casefully. */
2387
2388 case OP_NOT:
2389 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2390 break;
2391
2392 /*-----------------------------------------------------------------*/
2393 /* Match a negated single character caselessly. */
2394
2395 case OP_NOTI:
2396 if (clen > 0)
2397 {
2398 uint32_t otherd;
2399 #ifdef SUPPORT_UNICODE
2400 if (utf_or_ucp && d >= 128)
2401 otherd = UCD_OTHERCASE(d);
2402 else
2403 #endif /* SUPPORT_UNICODE */
2404 otherd = TABLE_GET(d, fcc, d);
2405 if (c != d && c != otherd)
2406 { ADD_NEW(state_offset + dlen + 1, 0); }
2407 }
2408 break;
2409
2410 /*-----------------------------------------------------------------*/
2411 case OP_PLUSI:
2412 case OP_MINPLUSI:
2413 case OP_POSPLUSI:
2414 case OP_NOTPLUSI:
2415 case OP_NOTMINPLUSI:
2416 case OP_NOTPOSPLUSI:
2417 caseless = TRUE;
2418 codevalue -= OP_STARI - OP_STAR;
2419
2420 /* Fall through */
2421 case OP_PLUS:
2422 case OP_MINPLUS:
2423 case OP_POSPLUS:
2424 case OP_NOTPLUS:
2425 case OP_NOTMINPLUS:
2426 case OP_NOTPOSPLUS:
2427 count = current_state->count; /* Already matched */
2428 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2429 if (clen > 0)
2430 {
2431 uint32_t otherd = NOTACHAR;
2432 if (caseless)
2433 {
2434 #ifdef SUPPORT_UNICODE
2435 if (utf_or_ucp && d >= 128)
2436 otherd = UCD_OTHERCASE(d);
2437 else
2438 #endif /* SUPPORT_UNICODE */
2439 otherd = TABLE_GET(d, fcc, d);
2440 }
2441 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2442 {
2443 if (count > 0 &&
2444 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2445 {
2446 active_count--; /* Remove non-match possibility */
2447 next_active_state--;
2448 }
2449 count++;
2450 ADD_NEW(state_offset, count);
2451 }
2452 }
2453 break;
2454
2455 /*-----------------------------------------------------------------*/
2456 case OP_QUERYI:
2457 case OP_MINQUERYI:
2458 case OP_POSQUERYI:
2459 case OP_NOTQUERYI:
2460 case OP_NOTMINQUERYI:
2461 case OP_NOTPOSQUERYI:
2462 caseless = TRUE;
2463 codevalue -= OP_STARI - OP_STAR;
2464 /* Fall through */
2465 case OP_QUERY:
2466 case OP_MINQUERY:
2467 case OP_POSQUERY:
2468 case OP_NOTQUERY:
2469 case OP_NOTMINQUERY:
2470 case OP_NOTPOSQUERY:
2471 ADD_ACTIVE(state_offset + dlen + 1, 0);
2472 if (clen > 0)
2473 {
2474 uint32_t otherd = NOTACHAR;
2475 if (caseless)
2476 {
2477 #ifdef SUPPORT_UNICODE
2478 if (utf_or_ucp && d >= 128)
2479 otherd = UCD_OTHERCASE(d);
2480 else
2481 #endif /* SUPPORT_UNICODE */
2482 otherd = TABLE_GET(d, fcc, d);
2483 }
2484 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2485 {
2486 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2487 {
2488 active_count--; /* Remove non-match possibility */
2489 next_active_state--;
2490 }
2491 ADD_NEW(state_offset + dlen + 1, 0);
2492 }
2493 }
2494 break;
2495
2496 /*-----------------------------------------------------------------*/
2497 case OP_STARI:
2498 case OP_MINSTARI:
2499 case OP_POSSTARI:
2500 case OP_NOTSTARI:
2501 case OP_NOTMINSTARI:
2502 case OP_NOTPOSSTARI:
2503 caseless = TRUE;
2504 codevalue -= OP_STARI - OP_STAR;
2505 /* Fall through */
2506 case OP_STAR:
2507 case OP_MINSTAR:
2508 case OP_POSSTAR:
2509 case OP_NOTSTAR:
2510 case OP_NOTMINSTAR:
2511 case OP_NOTPOSSTAR:
2512 ADD_ACTIVE(state_offset + dlen + 1, 0);
2513 if (clen > 0)
2514 {
2515 uint32_t otherd = NOTACHAR;
2516 if (caseless)
2517 {
2518 #ifdef SUPPORT_UNICODE
2519 if (utf_or_ucp && d >= 128)
2520 otherd = UCD_OTHERCASE(d);
2521 else
2522 #endif /* SUPPORT_UNICODE */
2523 otherd = TABLE_GET(d, fcc, d);
2524 }
2525 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2526 {
2527 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2528 {
2529 active_count--; /* Remove non-match possibility */
2530 next_active_state--;
2531 }
2532 ADD_NEW(state_offset, 0);
2533 }
2534 }
2535 break;
2536
2537 /*-----------------------------------------------------------------*/
2538 case OP_EXACTI:
2539 case OP_NOTEXACTI:
2540 caseless = TRUE;
2541 codevalue -= OP_STARI - OP_STAR;
2542 /* Fall through */
2543 case OP_EXACT:
2544 case OP_NOTEXACT:
2545 count = current_state->count; /* Number already matched */
2546 if (clen > 0)
2547 {
2548 uint32_t otherd = NOTACHAR;
2549 if (caseless)
2550 {
2551 #ifdef SUPPORT_UNICODE
2552 if (utf_or_ucp && d >= 128)
2553 otherd = UCD_OTHERCASE(d);
2554 else
2555 #endif /* SUPPORT_UNICODE */
2556 otherd = TABLE_GET(d, fcc, d);
2557 }
2558 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2559 {
2560 if (++count >= (int)GET2(code, 1))
2561 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2562 else
2563 { ADD_NEW(state_offset, count); }
2564 }
2565 }
2566 break;
2567
2568 /*-----------------------------------------------------------------*/
2569 case OP_UPTOI:
2570 case OP_MINUPTOI:
2571 case OP_POSUPTOI:
2572 case OP_NOTUPTOI:
2573 case OP_NOTMINUPTOI:
2574 case OP_NOTPOSUPTOI:
2575 caseless = TRUE;
2576 codevalue -= OP_STARI - OP_STAR;
2577 /* Fall through */
2578 case OP_UPTO:
2579 case OP_MINUPTO:
2580 case OP_POSUPTO:
2581 case OP_NOTUPTO:
2582 case OP_NOTMINUPTO:
2583 case OP_NOTPOSUPTO:
2584 ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2585 count = current_state->count; /* Number already matched */
2586 if (clen > 0)
2587 {
2588 uint32_t otherd = NOTACHAR;
2589 if (caseless)
2590 {
2591 #ifdef SUPPORT_UNICODE
2592 if (utf_or_ucp && d >= 128)
2593 otherd = UCD_OTHERCASE(d);
2594 else
2595 #endif /* SUPPORT_UNICODE */
2596 otherd = TABLE_GET(d, fcc, d);
2597 }
2598 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2599 {
2600 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2601 {
2602 active_count--; /* Remove non-match possibility */
2603 next_active_state--;
2604 }
2605 if (++count >= (int)GET2(code, 1))
2606 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2607 else
2608 { ADD_NEW(state_offset, count); }
2609 }
2610 }
2611 break;
2612
2613
2614 /* ========================================================================== */
2615 /* These are the class-handling opcodes */
2616
2617 case OP_CLASS:
2618 case OP_NCLASS:
2619 case OP_XCLASS:
2620 {
2621 BOOL isinclass = FALSE;
2622 int next_state_offset;
2623 PCRE2_SPTR ecode;
2624
2625 /* For a simple class, there is always just a 32-byte table, and we
2626 can set isinclass from it. */
2627
2628 if (codevalue != OP_XCLASS)
2629 {
2630 ecode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
2631 if (clen > 0)
2632 {
2633 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2634 ((((uint8_t *)(code + 1))[c/8] & (1u << (c&7))) != 0);
2635 }
2636 }
2637
2638 /* An extended class may have a table or a list of single characters,
2639 ranges, or both, and it may be positive or negative. There's a
2640 function that sorts all this out. */
2641
2642 else
2643 {
2644 ecode = code + GET(code, 1);
2645 if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2646 }
2647
2648 /* At this point, isinclass is set for all kinds of class, and ecode
2649 points to the byte after the end of the class. If there is a
2650 quantifier, this is where it will be. */
2651
2652 next_state_offset = (int)(ecode - start_code);
2653
2654 switch (*ecode)
2655 {
2656 case OP_CRSTAR:
2657 case OP_CRMINSTAR:
2658 case OP_CRPOSSTAR:
2659 ADD_ACTIVE(next_state_offset + 1, 0);
2660 if (isinclass)
2661 {
2662 if (*ecode == OP_CRPOSSTAR)
2663 {
2664 active_count--; /* Remove non-match possibility */
2665 next_active_state--;
2666 }
2667 ADD_NEW(state_offset, 0);
2668 }
2669 break;
2670
2671 case OP_CRPLUS:
2672 case OP_CRMINPLUS:
2673 case OP_CRPOSPLUS:
2674 count = current_state->count; /* Already matched */
2675 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2676 if (isinclass)
2677 {
2678 if (count > 0 && *ecode == OP_CRPOSPLUS)
2679 {
2680 active_count--; /* Remove non-match possibility */
2681 next_active_state--;
2682 }
2683 count++;
2684 ADD_NEW(state_offset, count);
2685 }
2686 break;
2687
2688 case OP_CRQUERY:
2689 case OP_CRMINQUERY:
2690 case OP_CRPOSQUERY:
2691 ADD_ACTIVE(next_state_offset + 1, 0);
2692 if (isinclass)
2693 {
2694 if (*ecode == OP_CRPOSQUERY)
2695 {
2696 active_count--; /* Remove non-match possibility */
2697 next_active_state--;
2698 }
2699 ADD_NEW(next_state_offset + 1, 0);
2700 }
2701 break;
2702
2703 case OP_CRRANGE:
2704 case OP_CRMINRANGE:
2705 case OP_CRPOSRANGE:
2706 count = current_state->count; /* Already matched */
2707 if (count >= (int)GET2(ecode, 1))
2708 { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2709 if (isinclass)
2710 {
2711 int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2712
2713 if (*ecode == OP_CRPOSRANGE && count >= (int)GET2(ecode, 1))
2714 {
2715 active_count--; /* Remove non-match possibility */
2716 next_active_state--;
2717 }
2718
2719 if (++count >= max && max != 0) /* Max 0 => no limit */
2720 { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2721 else
2722 { ADD_NEW(state_offset, count); }
2723 }
2724 break;
2725
2726 default:
2727 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2728 break;
2729 }
2730 }
2731 break;
2732
2733 /* ========================================================================== */
2734 /* These are the opcodes for fancy brackets of various kinds. We have
2735 to use recursion in order to handle them. The "always failing" assertion
2736 (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2737 though the other "backtracking verbs" are not supported. */
2738
2739 case OP_FAIL:
2740 forced_fail++; /* Count FAILs for multiple states */
2741 break;
2742
2743 case OP_ASSERT:
2744 case OP_ASSERT_NOT:
2745 case OP_ASSERTBACK:
2746 case OP_ASSERTBACK_NOT:
2747 {
2748 int rc;
2749 int *local_workspace;
2750 PCRE2_SIZE *local_offsets;
2751 PCRE2_SPTR endasscode = code + GET(code, 1);
2752 RWS_anchor *rws = (RWS_anchor *)RWS;
2753
2754 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2755 {
2756 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2757 if (rc != 0) return rc;
2758 RWS = (int *)rws;
2759 }
2760
2761 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2762 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2763 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2764
2765 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2766
2767 rc = internal_dfa_match(
2768 mb, /* static match data */
2769 code, /* this subexpression's code */
2770 ptr, /* where we currently are */
2771 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2772 local_offsets, /* offset vector */
2773 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
2774 local_workspace, /* workspace vector */
2775 RWS_RSIZE, /* size of same */
2776 rlevel, /* function recursion level */
2777 RWS); /* recursion workspace */
2778
2779 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2780
2781 if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2782 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2783 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2784 }
2785 break;
2786
2787 /*-----------------------------------------------------------------*/
2788 case OP_COND:
2789 case OP_SCOND:
2790 {
2791 int codelink = (int)GET(code, 1);
2792 PCRE2_UCHAR condcode;
2793
2794 /* Because of the way auto-callout works during compile, a callout item
2795 is inserted between OP_COND and an assertion condition. This does not
2796 happen for the other conditions. */
2797
2798 if (code[LINK_SIZE + 1] == OP_CALLOUT
2799 || code[LINK_SIZE + 1] == OP_CALLOUT_STR)
2800 {
2801 PCRE2_SIZE callout_length;
2802 rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb,
2803 1 + LINK_SIZE, &callout_length);
2804 if (rrc < 0) return rrc; /* Abandon */
2805 if (rrc > 0) break; /* Fail this thread */
2806 code += callout_length; /* Skip callout data */
2807 }
2808
2809 condcode = code[LINK_SIZE+1];
2810
2811 /* Back reference conditions and duplicate named recursion conditions
2812 are not supported */
2813
2814 if (condcode == OP_CREF || condcode == OP_DNCREF ||
2815 condcode == OP_DNRREF)
2816 return PCRE2_ERROR_DFA_UCOND;
2817
2818 /* The DEFINE condition is always false, and the assertion (?!) is
2819 converted to OP_FAIL. */
2820
2821 if (condcode == OP_FALSE || condcode == OP_FAIL)
2822 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2823
2824 /* There is also an always-true condition */
2825
2826 else if (condcode == OP_TRUE)
2827 { ADD_ACTIVE(state_offset + LINK_SIZE + 2, 0); }
2828
2829 /* The only supported version of OP_RREF is for the value RREF_ANY,
2830 which means "test if in any recursion". We can't test for specifically
2831 recursed groups. */
2832
2833 else if (condcode == OP_RREF)
2834 {
2835 unsigned int value = GET2(code, LINK_SIZE + 2);
2836 if (value != RREF_ANY) return PCRE2_ERROR_DFA_UCOND;
2837 if (mb->recursive != NULL)
2838 { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2839 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2840 }
2841
2842 /* Otherwise, the condition is an assertion */
2843
2844 else
2845 {
2846 int rc;
2847 int *local_workspace;
2848 PCRE2_SIZE *local_offsets;
2849 PCRE2_SPTR asscode = code + LINK_SIZE + 1;
2850 PCRE2_SPTR endasscode = asscode + GET(asscode, 1);
2851 RWS_anchor *rws = (RWS_anchor *)RWS;
2852
2853 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2854 {
2855 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2856 if (rc != 0) return rc;
2857 RWS = (int *)rws;
2858 }
2859
2860 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2861 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2862 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2863
2864 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2865
2866 rc = internal_dfa_match(
2867 mb, /* fixed match data */
2868 asscode, /* this subexpression's code */
2869 ptr, /* where we currently are */
2870 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2871 local_offsets, /* offset vector */
2872 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
2873 local_workspace, /* workspace vector */
2874 RWS_RSIZE, /* size of same */
2875 rlevel, /* function recursion level */
2876 RWS); /* recursion workspace */
2877
2878 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2879
2880 if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2881 if ((rc >= 0) ==
2882 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2883 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2884 else
2885 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2886 }
2887 }
2888 break;
2889
2890 /*-----------------------------------------------------------------*/
2891 case OP_RECURSE:
2892 {
2893 int rc;
2894 int *local_workspace;
2895 PCRE2_SIZE *local_offsets;
2896 RWS_anchor *rws = (RWS_anchor *)RWS;
2897 dfa_recursion_info *ri;
2898 PCRE2_SPTR callpat = start_code + GET(code, 1);
2899 uint32_t recno = (callpat == mb->start_code)? 0 :
2900 GET2(callpat, 1 + LINK_SIZE);
2901
2902 if (rws->free < RWS_RSIZE + RWS_OVEC_RSIZE)
2903 {
2904 rc = more_workspace(&rws, RWS_OVEC_RSIZE, mb);
2905 if (rc != 0) return rc;
2906 RWS = (int *)rws;
2907 }
2908
2909 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2910 local_workspace = ((int *)local_offsets) + RWS_OVEC_RSIZE;
2911 rws->free -= RWS_RSIZE + RWS_OVEC_RSIZE;
2912
2913 /* Check for repeating a recursion without advancing the subject
2914 pointer. This should catch convoluted mutual recursions. (Some simple
2915 cases are caught at compile time.) */
2916
2917 for (ri = mb->recursive; ri != NULL; ri = ri->prevrec)
2918 if (recno == ri->group_num && ptr == ri->subject_position)
2919 return PCRE2_ERROR_RECURSELOOP;
2920
2921 /* Remember this recursion and where we started it so as to
2922 catch infinite loops. */
2923
2924 new_recursive.group_num = recno;
2925 new_recursive.subject_position = ptr;
2926 new_recursive.prevrec = mb->recursive;
2927 mb->recursive = &new_recursive;
2928
2929 rc = internal_dfa_match(
2930 mb, /* fixed match data */
2931 callpat, /* this subexpression's code */
2932 ptr, /* where we currently are */
2933 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2934 local_offsets, /* offset vector */
2935 RWS_OVEC_RSIZE/OVEC_UNIT, /* size of same */
2936 local_workspace, /* workspace vector */
2937 RWS_RSIZE, /* size of same */
2938 rlevel, /* function recursion level */
2939 RWS); /* recursion workspace */
2940
2941 rws->free += RWS_RSIZE + RWS_OVEC_RSIZE;
2942 mb->recursive = new_recursive.prevrec; /* Done this recursion */
2943
2944 /* Ran out of internal offsets */
2945
2946 if (rc == 0) return PCRE2_ERROR_DFA_RECURSE;
2947
2948 /* For each successful matched substring, set up the next state with a
2949 count of characters to skip before trying it. Note that the count is in
2950 characters, not bytes. */
2951
2952 if (rc > 0)
2953 {
2954 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2955 {
2956 PCRE2_SIZE charcount = local_offsets[rc+1] - local_offsets[rc];
2957 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
2958 if (utf)
2959 {
2960 PCRE2_SPTR p = start_subject + local_offsets[rc];
2961 PCRE2_SPTR pp = start_subject + local_offsets[rc+1];
2962 while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
2963 }
2964 #endif
2965 if (charcount > 0)
2966 {
2967 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0,
2968 (int)(charcount - 1));
2969 }
2970 else
2971 {
2972 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2973 }
2974 }
2975 }
2976 else if (rc != PCRE2_ERROR_NOMATCH) return rc;
2977 }
2978 break;
2979
2980 /*-----------------------------------------------------------------*/
2981 case OP_BRAPOS:
2982 case OP_SBRAPOS:
2983 case OP_CBRAPOS:
2984 case OP_SCBRAPOS:
2985 case OP_BRAPOSZERO:
2986 {
2987 int rc;
2988 int *local_workspace;
2989 PCRE2_SIZE *local_offsets;
2990 PCRE2_SIZE charcount, matched_count;
2991 PCRE2_SPTR local_ptr = ptr;
2992 RWS_anchor *rws = (RWS_anchor *)RWS;
2993 BOOL allow_zero;
2994
2995 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2996 {
2997 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2998 if (rc != 0) return rc;
2999 RWS = (int *)rws;
3000 }
3001
3002 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
3003 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
3004 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
3005
3006 if (codevalue == OP_BRAPOSZERO)
3007 {
3008 allow_zero = TRUE;
3009 codevalue = *(++code); /* Codevalue will be one of above BRAs */
3010 }
3011 else allow_zero = FALSE;
3012
3013 /* Loop to match the subpattern as many times as possible as if it were
3014 a complete pattern. */
3015
3016 for (matched_count = 0;; matched_count++)
3017 {
3018 rc = internal_dfa_match(
3019 mb, /* fixed match data */
3020 code, /* this subexpression's code */
3021 local_ptr, /* where we currently are */
3022 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
3023 local_offsets, /* offset vector */
3024 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
3025 local_workspace, /* workspace vector */
3026 RWS_RSIZE, /* size of same */
3027 rlevel, /* function recursion level */
3028 RWS); /* recursion workspace */
3029
3030 /* Failed to match */
3031
3032 if (rc < 0)
3033 {
3034 if (rc != PCRE2_ERROR_NOMATCH) return rc;
3035 break;
3036 }
3037
3038 /* Matched: break the loop if zero characters matched. */
3039
3040 charcount = local_offsets[1] - local_offsets[0];
3041 if (charcount == 0) break;
3042 local_ptr += charcount; /* Advance temporary position ptr */
3043 }
3044
3045 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
3046
3047 /* At this point we have matched the subpattern matched_count
3048 times, and local_ptr is pointing to the character after the end of the
3049 last match. */
3050
3051 if (matched_count > 0 || allow_zero)
3052 {
3053 PCRE2_SPTR end_subpattern = code;
3054 int next_state_offset;
3055
3056 do { end_subpattern += GET(end_subpattern, 1); }
3057 while (*end_subpattern == OP_ALT);
3058 next_state_offset =
3059 (int)(end_subpattern - start_code + LINK_SIZE + 1);
3060
3061 /* Optimization: if there are no more active states, and there
3062 are no new states yet set up, then skip over the subject string
3063 right here, to save looping. Otherwise, set up the new state to swing
3064 into action when the end of the matched substring is reached. */
3065
3066 if (i + 1 >= active_count && new_count == 0)
3067 {
3068 ptr = local_ptr;
3069 clen = 0;
3070 ADD_NEW(next_state_offset, 0);
3071 }
3072 else
3073 {
3074 PCRE2_SPTR p = ptr;
3075 PCRE2_SPTR pp = local_ptr;
3076 charcount = (PCRE2_SIZE)(pp - p);
3077 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3078 if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3079 #endif
3080 ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3081 }
3082 }
3083 }
3084 break;
3085
3086 /*-----------------------------------------------------------------*/
3087 case OP_ONCE:
3088 {
3089 int rc;
3090 int *local_workspace;
3091 PCRE2_SIZE *local_offsets;
3092 RWS_anchor *rws = (RWS_anchor *)RWS;
3093
3094 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
3095 {
3096 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
3097 if (rc != 0) return rc;
3098 RWS = (int *)rws;
3099 }
3100
3101 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
3102 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
3103 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
3104
3105 rc = internal_dfa_match(
3106 mb, /* fixed match data */
3107 code, /* this subexpression's code */
3108 ptr, /* where we currently are */
3109 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
3110 local_offsets, /* offset vector */
3111 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
3112 local_workspace, /* workspace vector */
3113 RWS_RSIZE, /* size of same */
3114 rlevel, /* function recursion level */
3115 RWS); /* recursion workspace */
3116
3117 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
3118
3119 if (rc >= 0)
3120 {
3121 PCRE2_SPTR end_subpattern = code;
3122 PCRE2_SIZE charcount = local_offsets[1] - local_offsets[0];
3123 int next_state_offset, repeat_state_offset;
3124
3125 do { end_subpattern += GET(end_subpattern, 1); }
3126 while (*end_subpattern == OP_ALT);
3127 next_state_offset =
3128 (int)(end_subpattern - start_code + LINK_SIZE + 1);
3129
3130 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
3131 arrange for the repeat state also to be added to the relevant list.
3132 Calculate the offset, or set -1 for no repeat. */
3133
3134 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
3135 *end_subpattern == OP_KETRMIN)?
3136 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
3137
3138 /* If we have matched an empty string, add the next state at the
3139 current character pointer. This is important so that the duplicate
3140 checking kicks in, which is what breaks infinite loops that match an
3141 empty string. */
3142
3143 if (charcount == 0)
3144 {
3145 ADD_ACTIVE(next_state_offset, 0);
3146 }
3147
3148 /* Optimization: if there are no more active states, and there
3149 are no new states yet set up, then skip over the subject string
3150 right here, to save looping. Otherwise, set up the new state to swing
3151 into action when the end of the matched substring is reached. */
3152
3153 else if (i + 1 >= active_count && new_count == 0)
3154 {
3155 ptr += charcount;
3156 clen = 0;
3157 ADD_NEW(next_state_offset, 0);
3158
3159 /* If we are adding a repeat state at the new character position,
3160 we must fudge things so that it is the only current state.
3161 Otherwise, it might be a duplicate of one we processed before, and
3162 that would cause it to be skipped. */
3163
3164 if (repeat_state_offset >= 0)
3165 {
3166 next_active_state = active_states;
3167 active_count = 0;
3168 i = -1;
3169 ADD_ACTIVE(repeat_state_offset, 0);
3170 }
3171 }
3172 else
3173 {
3174 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3175 if (utf)
3176 {
3177 PCRE2_SPTR p = start_subject + local_offsets[0];
3178 PCRE2_SPTR pp = start_subject + local_offsets[1];
3179 while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3180 }
3181 #endif
3182 ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3183 if (repeat_state_offset >= 0)
3184 { ADD_NEW_DATA(-repeat_state_offset, 0, (int)(charcount - 1)); }
3185 }
3186 }
3187 else if (rc != PCRE2_ERROR_NOMATCH) return rc;
3188 }
3189 break;
3190
3191
3192 /* ========================================================================== */
3193 /* Handle callouts */
3194
3195 case OP_CALLOUT:
3196 case OP_CALLOUT_STR:
3197 {
3198 PCRE2_SIZE callout_length;
3199 rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb, 0,
3200 &callout_length);
3201 if (rrc < 0) return rrc; /* Abandon */
3202 if (rrc == 0)
3203 { ADD_ACTIVE(state_offset + (int)callout_length, 0); }
3204 }
3205 break;
3206
3207
3208 /* ========================================================================== */
3209 default: /* Unsupported opcode */
3210 return PCRE2_ERROR_DFA_UITEM;
3211 }
3212
3213 NEXT_ACTIVE_STATE: continue;
3214
3215 } /* End of loop scanning active states */
3216
3217 /* We have finished the processing at the current subject character. If no
3218 new states have been set for the next character, we have found all the
3219 matches that we are going to find. If partial matching has been requested,
3220 check for appropriate conditions.
3221
3222 The "forced_fail" variable counts the number of (*F) encountered for the
3223 character. If it is equal to the original active_count (saved in
3224 workspace[1]) it means that (*F) was found on every active state. In this
3225 case we don't want to give a partial match.
3226
3227 The "could_continue" variable is true if a state could have continued but
3228 for the fact that the end of the subject was reached. */
3229
3230 if (new_count <= 0)
3231 {
3232 if (could_continue && /* Some could go on, and */
3233 forced_fail != workspace[1] && /* Not all forced fail & */
3234 ( /* either... */
3235 (mb->moptions & PCRE2_PARTIAL_HARD) != 0 /* Hard partial */
3236 || /* or... */
3237 ((mb->moptions & PCRE2_PARTIAL_SOFT) != 0 && /* Soft partial and */
3238 match_count < 0) /* no matches */
3239 ) && /* And... */
3240 (
3241 partial_newline || /* Either partial NL */
3242 ( /* or ... */
3243 ptr >= end_subject && /* End of subject and */
3244 ( /* either */
3245 ptr > mb->start_used_ptr || /* Inspected non-empty string */
3246 mb->allowemptypartial /* or pattern has lookbehind */
3247 ) /* or could match empty */
3248 )
3249 ))
3250 match_count = PCRE2_ERROR_PARTIAL;
3251 break; /* Exit from loop along the subject string */
3252 }
3253
3254 /* One or more states are active for the next character. */
3255
3256 ptr += clen; /* Advance to next subject character */
3257 } /* Loop to move along the subject string */
3258
3259 /* Control gets here from "break" a few lines above. If PCRE2_ENDANCHORED is
3260 set and the match ended before the end of the subject, the match fails. */
3261
3262 if (match_count >= 0 &&
3263 ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0 &&
3264 ptr < end_subject)
3265 match_count = PCRE2_ERROR_NOMATCH;
3266
3267 return match_count;
3268 }
3269
3270
3271
3272 /*************************************************
3273 * Match a pattern using the DFA algorithm *
3274 *************************************************/
3275
3276 /* This function matches a compiled pattern to a subject string, using the
3277 alternate matching algorithm that finds all matches at once.
3278
3279 Arguments:
3280 code points to the compiled pattern
3281 subject subject string
3282 length length of subject string
3283 startoffset where to start matching in the subject
3284 options option bits
3285 match_data points to a match data structure
3286 gcontext points to a match context
3287 workspace pointer to workspace
3288 wscount size of workspace
3289
3290 Returns: > 0 => number of match offset pairs placed in offsets
3291 = 0 => offsets overflowed; longest matches are present
3292 -1 => failed to match
3293 < -1 => some kind of unexpected problem
3294 */
3295
3296 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_dfa_match(const pcre2_code * code,PCRE2_SPTR subject,PCRE2_SIZE length,PCRE2_SIZE start_offset,uint32_t options,pcre2_match_data * match_data,pcre2_match_context * mcontext,int * workspace,PCRE2_SIZE wscount)3297 pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
3298 PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
3299 pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount)
3300 {
3301 int rc;
3302 int was_zero_terminated = 0;
3303
3304 const pcre2_real_code *re = (const pcre2_real_code *)code;
3305
3306 PCRE2_SPTR start_match;
3307 PCRE2_SPTR end_subject;
3308 PCRE2_SPTR bumpalong_limit;
3309 PCRE2_SPTR req_cu_ptr;
3310
3311 BOOL utf, anchored, startline, firstline;
3312 BOOL has_first_cu = FALSE;
3313 BOOL has_req_cu = FALSE;
3314
3315 #if PCRE2_CODE_UNIT_WIDTH == 8
3316 PCRE2_SPTR memchr_found_first_cu = NULL;
3317 PCRE2_SPTR memchr_found_first_cu2 = NULL;
3318 #endif
3319
3320 PCRE2_UCHAR first_cu = 0;
3321 PCRE2_UCHAR first_cu2 = 0;
3322 PCRE2_UCHAR req_cu = 0;
3323 PCRE2_UCHAR req_cu2 = 0;
3324
3325 const uint8_t *start_bits = NULL;
3326
3327 /* We need to have mb pointing to a match block, because the IS_NEWLINE macro
3328 is used below, and it expects NLBLOCK to be defined as a pointer. */
3329
3330 pcre2_callout_block cb;
3331 dfa_match_block actual_match_block;
3332 dfa_match_block *mb = &actual_match_block;
3333
3334 /* Set up a starting block of memory for use during recursive calls to
3335 internal_dfa_match(). By putting this on the stack, it minimizes resource use
3336 in the case when it is not needed. If this is too small, more memory is
3337 obtained from the heap. At the start of each block is an anchor structure.*/
3338
3339 int base_recursion_workspace[RWS_BASE_SIZE];
3340 RWS_anchor *rws = (RWS_anchor *)base_recursion_workspace;
3341 rws->next = NULL;
3342 rws->size = RWS_BASE_SIZE;
3343 rws->free = RWS_BASE_SIZE - RWS_ANCHOR_SIZE;
3344
3345 /* Recognize NULL, length 0 as an empty string. */
3346
3347 if (subject == NULL && length == 0) subject = (PCRE2_SPTR)"";
3348
3349 /* Plausibility checks */
3350
3351 if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
3352 if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL)
3353 return PCRE2_ERROR_NULL;
3354
3355 if (length == PCRE2_ZERO_TERMINATED)
3356 {
3357 length = PRIV(strlen)(subject);
3358 was_zero_terminated = 1;
3359 }
3360
3361 if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE;
3362 if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
3363
3364 /* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same
3365 time. */
3366
3367 if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
3368 ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
3369 return PCRE2_ERROR_BADOPTION;
3370
3371 /* Invalid UTF support is not available for DFA matching. */
3372
3373 if ((re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0)
3374 return PCRE2_ERROR_DFA_UINVALID_UTF;
3375
3376 /* Check that the first field in the block is the magic number. If it is not,
3377 return with PCRE2_ERROR_BADMAGIC. */
3378
3379 if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
3380
3381 /* Check the code unit width. */
3382
3383 if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
3384 return PCRE2_ERROR_BADMODE;
3385
3386 /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
3387 options variable for this function. Users of PCRE2 who are not calling the
3388 function directly would like to have a way of setting these flags, in the same
3389 way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with
3390 constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
3391 (*NOTEMPTY_ATSTART) set bits in the pattern's "flag" function which can now be
3392 transferred to the options for this function. The bits are guaranteed to be
3393 adjacent, but do not have the same values. This bit of Boolean trickery assumes
3394 that the match-time bits are not more significant than the flag bits. If by
3395 accident this is not the case, a compile-time division by zero error will
3396 occur. */
3397
3398 #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
3399 #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
3400 options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
3401 #undef FF
3402 #undef OO
3403
3404 /* If restarting after a partial match, do some sanity checks on the contents
3405 of the workspace. */
3406
3407 if ((options & PCRE2_DFA_RESTART) != 0)
3408 {
3409 if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3410 workspace[1] > (int)((wscount - 2)/INTS_PER_STATEBLOCK))
3411 return PCRE2_ERROR_DFA_BADRESTART;
3412 }
3413
3414 /* Set some local values */
3415
3416 utf = (re->overall_options & PCRE2_UTF) != 0;
3417 start_match = subject + start_offset;
3418 end_subject = subject + length;
3419 req_cu_ptr = start_match - 1;
3420 anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 ||
3421 (re->overall_options & PCRE2_ANCHORED) != 0;
3422
3423 /* The "must be at the start of a line" flags are used in a loop when finding
3424 where to start. */
3425
3426 startline = (re->flags & PCRE2_STARTLINE) != 0;
3427 firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;
3428 bumpalong_limit = end_subject;
3429
3430 /* Initialize and set up the fixed fields in the callout block, with a pointer
3431 in the match block. */
3432
3433 mb->cb = &cb;
3434 cb.version = 2;
3435 cb.subject = subject;
3436 cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
3437 cb.callout_flags = 0;
3438 cb.capture_top = 1; /* No capture support */
3439 cb.capture_last = 0;
3440 cb.mark = NULL; /* No (*MARK) support */
3441
3442 /* Get data from the match context, if present, and fill in the remaining
3443 fields in the match block. It is an error to set an offset limit without
3444 setting the flag at compile time. */
3445
3446 if (mcontext == NULL)
3447 {
3448 mb->callout = NULL;
3449 mb->memctl = re->memctl;
3450 mb->match_limit = PRIV(default_match_context).match_limit;
3451 mb->match_limit_depth = PRIV(default_match_context).depth_limit;
3452 mb->heap_limit = PRIV(default_match_context).heap_limit;
3453 }
3454 else
3455 {
3456 if (mcontext->offset_limit != PCRE2_UNSET)
3457 {
3458 if ((re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
3459 return PCRE2_ERROR_BADOFFSETLIMIT;
3460 bumpalong_limit = subject + mcontext->offset_limit;
3461 }
3462 mb->callout = mcontext->callout;
3463 mb->callout_data = mcontext->callout_data;
3464 mb->memctl = mcontext->memctl;
3465 mb->match_limit = mcontext->match_limit;
3466 mb->match_limit_depth = mcontext->depth_limit;
3467 mb->heap_limit = mcontext->heap_limit;
3468 }
3469
3470 if (mb->match_limit > re->limit_match)
3471 mb->match_limit = re->limit_match;
3472
3473 if (mb->match_limit_depth > re->limit_depth)
3474 mb->match_limit_depth = re->limit_depth;
3475
3476 if (mb->heap_limit > re->limit_heap)
3477 mb->heap_limit = re->limit_heap;
3478
3479 mb->start_code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
3480 re->name_count * re->name_entry_size;
3481 mb->tables = re->tables;
3482 mb->start_subject = subject;
3483 mb->end_subject = end_subject;
3484 mb->start_offset = start_offset;
3485 mb->allowemptypartial = (re->max_lookbehind > 0) ||
3486 (re->flags & PCRE2_MATCH_EMPTY) != 0;
3487 mb->moptions = options;
3488 mb->poptions = re->overall_options;
3489 mb->match_call_count = 0;
3490 mb->heap_used = 0;
3491
3492 /* Process the \R and newline settings. */
3493
3494 mb->bsr_convention = re->bsr_convention;
3495 mb->nltype = NLTYPE_FIXED;
3496 switch(re->newline_convention)
3497 {
3498 case PCRE2_NEWLINE_CR:
3499 mb->nllen = 1;
3500 mb->nl[0] = CHAR_CR;
3501 break;
3502
3503 case PCRE2_NEWLINE_LF:
3504 mb->nllen = 1;
3505 mb->nl[0] = CHAR_NL;
3506 break;
3507
3508 case PCRE2_NEWLINE_NUL:
3509 mb->nllen = 1;
3510 mb->nl[0] = CHAR_NUL;
3511 break;
3512
3513 case PCRE2_NEWLINE_CRLF:
3514 mb->nllen = 2;
3515 mb->nl[0] = CHAR_CR;
3516 mb->nl[1] = CHAR_NL;
3517 break;
3518
3519 case PCRE2_NEWLINE_ANY:
3520 mb->nltype = NLTYPE_ANY;
3521 break;
3522
3523 case PCRE2_NEWLINE_ANYCRLF:
3524 mb->nltype = NLTYPE_ANYCRLF;
3525 break;
3526
3527 default: return PCRE2_ERROR_INTERNAL;
3528 }
3529
3530 /* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
3531 we must also check that a starting offset does not point into the middle of a
3532 multiunit character. We check only the portion of the subject that is going to
3533 be inspected during matching - from the offset minus the maximum back reference
3534 to the given length. This saves time when a small part of a large subject is
3535 being matched by the use of a starting offset. Note that the maximum lookbehind
3536 is a number of characters, not code units. */
3537
3538 #ifdef SUPPORT_UNICODE
3539 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
3540 {
3541 PCRE2_SPTR check_subject = start_match; /* start_match includes offset */
3542
3543 if (start_offset > 0)
3544 {
3545 #if PCRE2_CODE_UNIT_WIDTH != 32
3546 unsigned int i;
3547 if (start_match < end_subject && NOT_FIRSTCU(*start_match))
3548 return PCRE2_ERROR_BADUTFOFFSET;
3549 for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
3550 {
3551 check_subject--;
3552 while (check_subject > subject &&
3553 #if PCRE2_CODE_UNIT_WIDTH == 8
3554 (*check_subject & 0xc0) == 0x80)
3555 #else /* 16-bit */
3556 (*check_subject & 0xfc00) == 0xdc00)
3557 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
3558 check_subject--;
3559 }
3560 #else /* In the 32-bit library, one code unit equals one character. */
3561 check_subject -= re->max_lookbehind;
3562 if (check_subject < subject) check_subject = subject;
3563 #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
3564 }
3565
3566 /* Validate the relevant portion of the subject. After an error, adjust the
3567 offset to be an absolute offset in the whole string. */
3568
3569 match_data->rc = PRIV(valid_utf)(check_subject,
3570 length - (PCRE2_SIZE)(check_subject - subject), &(match_data->startchar));
3571 if (match_data->rc != 0)
3572 {
3573 match_data->startchar += (PCRE2_SIZE)(check_subject - subject);
3574 return match_data->rc;
3575 }
3576 }
3577 #endif /* SUPPORT_UNICODE */
3578
3579 /* Set up the first code unit to match, if available. If there's no first code
3580 unit there may be a bitmap of possible first characters. */
3581
3582 if ((re->flags & PCRE2_FIRSTSET) != 0)
3583 {
3584 has_first_cu = TRUE;
3585 first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
3586 if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
3587 {
3588 first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
3589 #ifdef SUPPORT_UNICODE
3590 #if PCRE2_CODE_UNIT_WIDTH == 8
3591 if (first_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
3592 first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3593 #else
3594 if (first_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
3595 first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3596 #endif
3597 #endif /* SUPPORT_UNICODE */
3598 }
3599 }
3600 else
3601 if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
3602 start_bits = re->start_bitmap;
3603
3604 /* There may be a "last known required code unit" set. */
3605
3606 if ((re->flags & PCRE2_LASTSET) != 0)
3607 {
3608 has_req_cu = TRUE;
3609 req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
3610 if ((re->flags & PCRE2_LASTCASELESS) != 0)
3611 {
3612 req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
3613 #ifdef SUPPORT_UNICODE
3614 #if PCRE2_CODE_UNIT_WIDTH == 8
3615 if (req_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
3616 req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3617 #else
3618 if (req_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
3619 req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3620 #endif
3621 #endif /* SUPPORT_UNICODE */
3622 }
3623 }
3624
3625 /* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT,
3626 free the memory that was obtained. */
3627
3628 if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0)
3629 {
3630 match_data->memctl.free((void *)match_data->subject,
3631 match_data->memctl.memory_data);
3632 match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT;
3633 }
3634
3635 /* Fill in fields that are always returned in the match data. */
3636
3637 match_data->code = re;
3638 match_data->subject = NULL; /* Default for no match */
3639 match_data->mark = NULL;
3640 match_data->matchedby = PCRE2_MATCHEDBY_DFA_INTERPRETER;
3641
3642 /* Call the main matching function, looping for a non-anchored regex after a
3643 failed match. If not restarting, perform certain optimizations at the start of
3644 a match. */
3645
3646 for (;;)
3647 {
3648 /* ----------------- Start of match optimizations ---------------- */
3649
3650 /* There are some optimizations that avoid running the match if a known
3651 starting point is not found, or if a known later code unit is not present.
3652 However, there is an option (settable at compile time) that disables
3653 these, for testing and for ensuring that all callouts do actually occur.
3654 The optimizations must also be avoided when restarting a DFA match. */
3655
3656 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
3657 (options & PCRE2_DFA_RESTART) == 0)
3658 {
3659 /* If firstline is TRUE, the start of the match is constrained to the first
3660 line of a multiline string. That is, the match must be before or at the
3661 first newline following the start of matching. Temporarily adjust
3662 end_subject so that we stop the optimization scans for a first code unit
3663 immediately after the first character of a newline (the first code unit can
3664 legitimately be a newline). If the match fails at the newline, later code
3665 breaks this loop. */
3666
3667 if (firstline)
3668 {
3669 PCRE2_SPTR t = start_match;
3670 #ifdef SUPPORT_UNICODE
3671 if (utf)
3672 {
3673 while (t < end_subject && !IS_NEWLINE(t))
3674 {
3675 t++;
3676 ACROSSCHAR(t < end_subject, t, t++);
3677 }
3678 }
3679 else
3680 #endif
3681 while (t < end_subject && !IS_NEWLINE(t)) t++;
3682 end_subject = t;
3683 }
3684
3685 /* Anchored: check the first code unit if one is recorded. This may seem
3686 pointless but it can help in detecting a no match case without scanning for
3687 the required code unit. */
3688
3689 if (anchored)
3690 {
3691 if (has_first_cu || start_bits != NULL)
3692 {
3693 BOOL ok = start_match < end_subject;
3694 if (ok)
3695 {
3696 PCRE2_UCHAR c = UCHAR21TEST(start_match);
3697 ok = has_first_cu && (c == first_cu || c == first_cu2);
3698 if (!ok && start_bits != NULL)
3699 {
3700 #if PCRE2_CODE_UNIT_WIDTH != 8
3701 if (c > 255) c = 255;
3702 #endif
3703 ok = (start_bits[c/8] & (1u << (c&7))) != 0;
3704 }
3705 }
3706 if (!ok) break;
3707 }
3708 }
3709
3710 /* Not anchored. Advance to a unique first code unit if there is one. */
3711
3712 else
3713 {
3714 if (has_first_cu)
3715 {
3716 if (first_cu != first_cu2) /* Caseless */
3717 {
3718 /* In 16-bit and 32_bit modes we have to do our own search, so can
3719 look for both cases at once. */
3720
3721 #if PCRE2_CODE_UNIT_WIDTH != 8
3722 PCRE2_UCHAR smc;
3723 while (start_match < end_subject &&
3724 (smc = UCHAR21TEST(start_match)) != first_cu &&
3725 smc != first_cu2)
3726 start_match++;
3727 #else
3728 /* In 8-bit mode, the use of memchr() gives a big speed up, even
3729 though we have to call it twice in order to find the earliest
3730 occurrence of the code unit in either of its cases. Caching is used
3731 to remember the positions of previously found code units. This can
3732 make a huge difference when the strings are very long and only one
3733 case is actually present. */
3734
3735 PCRE2_SPTR pp1 = NULL;
3736 PCRE2_SPTR pp2 = NULL;
3737 PCRE2_SIZE searchlength = end_subject - start_match;
3738
3739 /* If we haven't got a previously found position for first_cu, or if
3740 the current starting position is later, we need to do a search. If
3741 the code unit is not found, set it to the end. */
3742
3743 if (memchr_found_first_cu == NULL ||
3744 start_match > memchr_found_first_cu)
3745 {
3746 pp1 = memchr(start_match, first_cu, searchlength);
3747 memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1;
3748 }
3749
3750 /* If the start is before a previously found position, use the
3751 previous position, or NULL if a previous search failed. */
3752
3753 else pp1 = (memchr_found_first_cu == end_subject)? NULL :
3754 memchr_found_first_cu;
3755
3756 /* Do the same thing for the other case. */
3757
3758 if (memchr_found_first_cu2 == NULL ||
3759 start_match > memchr_found_first_cu2)
3760 {
3761 pp2 = memchr(start_match, first_cu2, searchlength);
3762 memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2;
3763 }
3764
3765 else pp2 = (memchr_found_first_cu2 == end_subject)? NULL :
3766 memchr_found_first_cu2;
3767
3768 /* Set the start to the end of the subject if neither case was found.
3769 Otherwise, use the earlier found point. */
3770
3771 if (pp1 == NULL)
3772 start_match = (pp2 == NULL)? end_subject : pp2;
3773 else
3774 start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
3775
3776 #endif /* 8-bit handling */
3777 }
3778
3779 /* The caseful case is much simpler. */
3780
3781 else
3782 {
3783 #if PCRE2_CODE_UNIT_WIDTH != 8
3784 while (start_match < end_subject && UCHAR21TEST(start_match) !=
3785 first_cu)
3786 start_match++;
3787 #else /* 8-bit code units */
3788 start_match = memchr(start_match, first_cu, end_subject - start_match);
3789 if (start_match == NULL) start_match = end_subject;
3790 #endif
3791 }
3792
3793 /* If we can't find the required code unit, having reached the true end
3794 of the subject, break the bumpalong loop, to force a match failure,
3795 except when doing partial matching, when we let the next cycle run at
3796 the end of the subject. To see why, consider the pattern /(?<=abc)def/,
3797 which partially matches "abc", even though the string does not contain
3798 the starting character "d". If we have not reached the true end of the
3799 subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
3800 we also let the cycle run, because the matching string is legitimately
3801 allowed to start with the first code unit of a newline. */
3802
3803 if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3804 start_match >= mb->end_subject)
3805 break;
3806 }
3807
3808 /* If there's no first code unit, advance to just after a linebreak for a
3809 multiline match if required. */
3810
3811 else if (startline)
3812 {
3813 if (start_match > mb->start_subject + start_offset)
3814 {
3815 #ifdef SUPPORT_UNICODE
3816 if (utf)
3817 {
3818 while (start_match < end_subject && !WAS_NEWLINE(start_match))
3819 {
3820 start_match++;
3821 ACROSSCHAR(start_match < end_subject, start_match, start_match++);
3822 }
3823 }
3824 else
3825 #endif
3826 while (start_match < end_subject && !WAS_NEWLINE(start_match))
3827 start_match++;
3828
3829 /* If we have just passed a CR and the newline option is ANY or
3830 ANYCRLF, and we are now at a LF, advance the match position by one
3831 more code unit. */
3832
3833 if (start_match[-1] == CHAR_CR &&
3834 (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
3835 start_match < end_subject &&
3836 UCHAR21TEST(start_match) == CHAR_NL)
3837 start_match++;
3838 }
3839 }
3840
3841 /* If there's no first code unit or a requirement for a multiline line
3842 start, advance to a non-unique first code unit if any have been
3843 identified. The bitmap contains only 256 bits. When code units are 16 or
3844 32 bits wide, all code units greater than 254 set the 255 bit. */
3845
3846 else if (start_bits != NULL)
3847 {
3848 while (start_match < end_subject)
3849 {
3850 uint32_t c = UCHAR21TEST(start_match);
3851 #if PCRE2_CODE_UNIT_WIDTH != 8
3852 if (c > 255) c = 255;
3853 #endif
3854 if ((start_bits[c/8] & (1u << (c&7))) != 0) break;
3855 start_match++;
3856 }
3857
3858 /* See comment above in first_cu checking about the next line. */
3859
3860 if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3861 start_match >= mb->end_subject)
3862 break;
3863 }
3864 } /* End of first code unit handling */
3865
3866 /* Restore fudged end_subject */
3867
3868 end_subject = mb->end_subject;
3869
3870 /* The following two optimizations are disabled for partial matching. */
3871
3872 if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0)
3873 {
3874 PCRE2_SPTR p;
3875
3876 /* The minimum matching length is a lower bound; no actual string of that
3877 length may actually match the pattern. Although the value is, strictly,
3878 in characters, we treat it as code units to avoid spending too much time
3879 in this optimization. */
3880
3881 if (end_subject - start_match < re->minlength) goto NOMATCH_EXIT;
3882
3883 /* If req_cu is set, we know that that code unit must appear in the
3884 subject for the match to succeed. If the first code unit is set, req_cu
3885 must be later in the subject; otherwise the test starts at the match
3886 point. This optimization can save a huge amount of backtracking in
3887 patterns with nested unlimited repeats that aren't going to match.
3888 Writing separate code for cased/caseless versions makes it go faster, as
3889 does using an autoincrement and backing off on a match. As in the case of
3890 the first code unit, using memchr() in the 8-bit library gives a big
3891 speed up. Unlike the first_cu check above, we do not need to call
3892 memchr() twice in the caseless case because we only need to check for the
3893 presence of the character in either case, not find the first occurrence.
3894
3895 The search can be skipped if the code unit was found later than the
3896 current starting point in a previous iteration of the bumpalong loop.
3897
3898 HOWEVER: when the subject string is very, very long, searching to its end
3899 can take a long time, and give bad performance on quite ordinary
3900 patterns. This showed up when somebody was matching something like
3901 /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
3902 sufficiently long, but it's worth searching a lot more for unanchored
3903 patterns. */
3904
3905 p = start_match + (has_first_cu? 1:0);
3906 if (has_req_cu && p > req_cu_ptr)
3907 {
3908 PCRE2_SIZE check_length = end_subject - start_match;
3909
3910 if (check_length < REQ_CU_MAX ||
3911 (!anchored && check_length < REQ_CU_MAX * 1000))
3912 {
3913 if (req_cu != req_cu2) /* Caseless */
3914 {
3915 #if PCRE2_CODE_UNIT_WIDTH != 8
3916 while (p < end_subject)
3917 {
3918 uint32_t pp = UCHAR21INCTEST(p);
3919 if (pp == req_cu || pp == req_cu2) { p--; break; }
3920 }
3921 #else /* 8-bit code units */
3922 PCRE2_SPTR pp = p;
3923 p = memchr(pp, req_cu, end_subject - pp);
3924 if (p == NULL)
3925 {
3926 p = memchr(pp, req_cu2, end_subject - pp);
3927 if (p == NULL) p = end_subject;
3928 }
3929 #endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
3930 }
3931
3932 /* The caseful case */
3933
3934 else
3935 {
3936 #if PCRE2_CODE_UNIT_WIDTH != 8
3937 while (p < end_subject)
3938 {
3939 if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
3940 }
3941
3942 #else /* 8-bit code units */
3943 p = memchr(p, req_cu, end_subject - p);
3944 if (p == NULL) p = end_subject;
3945 #endif
3946 }
3947
3948 /* If we can't find the required code unit, break the matching loop,
3949 forcing a match failure. */
3950
3951 if (p >= end_subject) break;
3952
3953 /* If we have found the required code unit, save the point where we
3954 found it, so that we don't search again next time round the loop if
3955 the start hasn't passed this code unit yet. */
3956
3957 req_cu_ptr = p;
3958 }
3959 }
3960 }
3961 }
3962
3963 /* ------------ End of start of match optimizations ------------ */
3964
3965 /* Give no match if we have passed the bumpalong limit. */
3966
3967 if (start_match > bumpalong_limit) break;
3968
3969 /* OK, now we can do the business */
3970
3971 mb->start_used_ptr = start_match;
3972 mb->last_used_ptr = start_match;
3973 mb->recursive = NULL;
3974
3975 rc = internal_dfa_match(
3976 mb, /* fixed match data */
3977 mb->start_code, /* this subexpression's code */
3978 start_match, /* where we currently are */
3979 start_offset, /* start offset in subject */
3980 match_data->ovector, /* offset vector */
3981 (uint32_t)match_data->oveccount * 2, /* actual size of same */
3982 workspace, /* workspace vector */
3983 (int)wscount, /* size of same */
3984 0, /* function recurse level */
3985 base_recursion_workspace); /* initial workspace for recursion */
3986
3987 /* Anything other than "no match" means we are done, always; otherwise, carry
3988 on only if not anchored. */
3989
3990 if (rc != PCRE2_ERROR_NOMATCH || anchored)
3991 {
3992 if (rc == PCRE2_ERROR_PARTIAL && match_data->oveccount > 0)
3993 {
3994 match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject);
3995 match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject);
3996 }
3997 match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject);
3998 match_data->rightchar = (PCRE2_SIZE)( mb->last_used_ptr - subject);
3999 match_data->startchar = (PCRE2_SIZE)(start_match - subject);
4000 match_data->rc = rc;
4001
4002 if (rc >= 0 &&(options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
4003 {
4004 length = CU2BYTES(length + was_zero_terminated);
4005 match_data->subject = match_data->memctl.malloc(length,
4006 match_data->memctl.memory_data);
4007 if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;
4008 memcpy((void *)match_data->subject, subject, length);
4009 match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
4010 }
4011 else
4012 {
4013 if (rc >= 0 || rc == PCRE2_ERROR_PARTIAL) match_data->subject = subject;
4014 }
4015 goto EXIT;
4016 }
4017
4018 /* Advance to the next subject character unless we are at the end of a line
4019 and firstline is set. */
4020
4021 if (firstline && IS_NEWLINE(start_match)) break;
4022 start_match++;
4023 #ifdef SUPPORT_UNICODE
4024 if (utf)
4025 {
4026 ACROSSCHAR(start_match < end_subject, start_match, start_match++);
4027 }
4028 #endif
4029 if (start_match > end_subject) break;
4030
4031 /* If we have just passed a CR and we are now at a LF, and the pattern does
4032 not contain any explicit matches for \r or \n, and the newline option is CRLF
4033 or ANY or ANYCRLF, advance the match position by one more character. */
4034
4035 if (UCHAR21TEST(start_match - 1) == CHAR_CR &&
4036 start_match < end_subject &&
4037 UCHAR21TEST(start_match) == CHAR_NL &&
4038 (re->flags & PCRE2_HASCRORLF) == 0 &&
4039 (mb->nltype == NLTYPE_ANY ||
4040 mb->nltype == NLTYPE_ANYCRLF ||
4041 mb->nllen == 2))
4042 start_match++;
4043
4044 } /* "Bumpalong" loop */
4045
4046 NOMATCH_EXIT:
4047 rc = PCRE2_ERROR_NOMATCH;
4048
4049 EXIT:
4050 while (rws->next != NULL)
4051 {
4052 RWS_anchor *next = rws->next;
4053 rws->next = next->next;
4054 mb->memctl.free(next, mb->memctl.memory_data);
4055 }
4056
4057 return rc;
4058 }
4059
/* To enable unity builds with CMake, the following macro definitions are
removed again at the end of this file. */

#undef PSEND    /* Field containing processed string end */
#undef PSSTART  /* Field containing processed string start */
#undef NLBLOCK  /* Block containing newline information */
4065
4066 /* End of pcre2_dfa_match.c */
4067