1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016-2023 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 /* This module contains the external function pcre2_dfa_match(), which is an
43 alternative matching function that uses a sort of DFA algorithm (not a true
44 FSM). This is NOT Perl-compatible, but it has advantages in certain
45 applications. */
46
47
48 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49 the performance of his patterns greatly. I could not use it as it stood, as it
50 was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 test 7 to loop, and test 9 to crash with a segfault.
52
53 The issue is the check for duplicate states, which is done by a simple linear
54 search up the state list. (Grep for "duplicate" below to find the code.) For
55 many patterns, there will never be many states active at one time, so a simple
56 linear search is fine. In patterns that have many active states, it might be a
57 bottleneck. The suggested code used an indexing scheme to remember which states
58 had previously been used for each character, and avoided the linear search when
59 it knew there was no chance of a duplicate. This was implemented when adding
60 states to the state lists.
61
62 I wrote some thread-safe, not-limited code to try something similar at the time
63 of checking for duplicates (instead of when adding states), using index vectors
64 on the stack. It did give a 13% improvement with one specially constructed
65 pattern for certain subject strings, but on other strings and on many of the
66 simpler patterns in the test suite it did worse. The major problem, I think,
67 was the extra time to initialize the index. This had to be done for each call
68 of internal_dfa_match(). (The supplied patch used a static vector, initialized
69 only once - I suspect this was the cause of the problems with the tests.)
70
71 Overall, I concluded that the gains in some cases did not outweigh the losses
72 in others, so I abandoned this code. */
73
74
75 #ifdef HAVE_CONFIG_H
76 #include "config.h"
77 #endif
78
79 #define NLBLOCK mb /* Block containing newline information */
80 #define PSSTART start_subject /* Field containing processed string start */
81 #define PSEND end_subject /* Field containing processed string end */
82
83 #include "pcre2_internal.h"
84
/* Mask built from the twelve option bits listed below. NOTE(review): going by
the name, this is presumably the set of options a caller may pass to
pcre2_dfa_match() - confirm at the point of use. */

#define PUBLIC_DFA_MATCH_OPTIONS \
  ( PCRE2_ANCHORED \
  | PCRE2_ENDANCHORED \
  | PCRE2_NOTBOL \
  | PCRE2_NOTEOL \
  | PCRE2_NOTEMPTY \
  | PCRE2_NOTEMPTY_ATSTART \
  | PCRE2_NO_UTF_CHECK \
  | PCRE2_PARTIAL_HARD \
  | PCRE2_PARTIAL_SOFT \
  | PCRE2_DFA_SHORTEST \
  | PCRE2_DFA_RESTART \
  | PCRE2_COPY_MATCHED_SUBJECT \
  )
90
91
92 /*************************************************
93 * Code parameters and static tables *
94 *************************************************/
95
96 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
97 into others, under special conditions. A gap of 20 between the blocks should be
98 enough. The resulting opcodes don't have to be less than 256 because they are
99 never stored, so we push them well clear of the normal opcodes. */
100
/* Each offset is added to an OP_TYPExxx opcode to form a synthetic opcode for
one family of character types. The offsets are spaced 20 apart. */

#define OP_PROP_EXTRA     300   /* \P and \p */
#define OP_EXTUNI_EXTRA   320   /* \X */
#define OP_ANYNL_EXTRA    340   /* \R */
#define OP_HSPACE_EXTRA   360   /* \H and \h */
#define OP_VSPACE_EXTRA   380   /* \V and \v */
106
107
108 /* This table identifies those opcodes that are followed immediately by a
109 character that is to be tested in some way. This makes it possible to
110 centralize the loading of these characters. In the case of Type * etc, the
111 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112 small value. Non-zero values in the table are the offsets from the opcode where
113 the character is to be found. ***NOTE*** If the start of this table is
114 modified, the three tables that follow must also be modified. */
115
116 static const uint8_t coptable[] = {
117 0, /* End */
118 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
119 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
120 0, 0, 0, /* Any, AllAny, Anybyte */
121 0, 0, /* \P, \p */
122 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
123 0, /* \X */
124 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */
125 1, /* Char */
126 1, /* Chari */
127 1, /* not */
128 1, /* noti */
129 /* Positive single-char repeats */
130 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
131 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */
132 1+IMM2_SIZE, /* exact */
133 1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */
134 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
135 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */
136 1+IMM2_SIZE, /* exact I */
137 1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */
138 /* Negative single-char repeats - only for chars < 256 */
139 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
140 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */
141 1+IMM2_SIZE, /* NOT exact */
142 1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */
143 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
144 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */
145 1+IMM2_SIZE, /* NOT exact I */
146 1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */
147 /* Positive type repeats */
148 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
149 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */
150 1+IMM2_SIZE, /* Type exact */
151 1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */
152 /* Character class & ref repeats */
153 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
154 0, 0, /* CRRANGE, CRMINRANGE */
155 0, 0, 0, 0, /* Possessive *+, ++, ?+, CRPOSRANGE */
156 0, /* CLASS */
157 0, /* NCLASS */
158 0, /* XCLASS - variable length */
159 0, /* REF */
160 0, /* REFI */
161 0, /* DNREF */
162 0, /* DNREFI */
163 0, /* RECURSE */
164 0, /* CALLOUT */
165 0, /* CALLOUT_STR */
166 0, /* Alt */
167 0, /* Ket */
168 0, /* KetRmax */
169 0, /* KetRmin */
170 0, /* KetRpos */
171 0, 0, /* Reverse, Vreverse */
172 0, /* Assert */
173 0, /* Assert not */
174 0, /* Assert behind */
175 0, /* Assert behind not */
176 0, /* NA assert */
177 0, /* NA assert behind */
178 0, /* ONCE */
179 0, /* SCRIPT_RUN */
180 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
181 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
182 0, 0, /* CREF, DNCREF */
183 0, 0, /* RREF, DNRREF */
184 0, 0, /* FALSE, TRUE */
185 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
186 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
187 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
188 0, 0, /* COMMIT, COMMIT_ARG */
189 0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */
190 0, 0, 0, /* CLOSE, SKIPZERO, DEFINE */
191 0, 0 /* \B and \b in UCP mode */
192 };
193
194 /* This table identifies those opcodes that inspect a character. It is used to
195 remember the fact that a character could have been inspected when the end of
196 the subject is reached. ***NOTE*** If the start of this table is modified, the
197 two tables that follow must also be modified. */
198
/* Indexed by opcode, so the order of the entries matters. A non-zero entry
marks an opcode that inspects a subject character. */

static const uint8_t poptable[] = {
  /* End */
  0,
  /* \A, \G, \K, \B, \b */
  0, 0, 0, 1, 1,
  /* \D, \d, \S, \s, \W, \w */
  1, 1, 1, 1, 1, 1,
  /* Any, AllAny, Anybyte */
  1, 1, 1,
  /* \P, \p */
  1, 1,
  /* \R, \H, \h, \V, \v */
  1, 1, 1, 1, 1,
  /* \X */
  1,
  /* \Z, \z, $, $M, ^, ^M */
  0, 0, 0, 0, 0, 0,
  /* Char, Chari, not, noti */
  1, 1, 1, 1,

  /* ---- Positive single-char repeats ---- */

  /* *, *?, +, +?, ?, ?? */
  1, 1, 1, 1, 1, 1,
  /* upto, minupto, exact */
  1, 1, 1,
  /* *+, ++, ?+, upto+ */
  1, 1, 1, 1,
  /* *I, *?I, +I, +?I, ?I, ??I */
  1, 1, 1, 1, 1, 1,
  /* upto I, minupto I, exact I */
  1, 1, 1,
  /* *+I, ++I, ?+I, upto+I */
  1, 1, 1, 1,

  /* ---- Negative single-char repeats - only for chars < 256 ---- */

  /* NOT *, *?, +, +?, ?, ?? */
  1, 1, 1, 1, 1, 1,
  /* NOT upto, minupto, exact */
  1, 1, 1,
  /* NOT *+, ++, ?+, upto+ */
  1, 1, 1, 1,
  /* NOT *I, *?I, +I, +?I, ?I, ??I */
  1, 1, 1, 1, 1, 1,
  /* NOT upto I, minupto I, exact I */
  1, 1, 1,
  /* NOT *+I, ++I, ?+I, upto+I */
  1, 1, 1, 1,

  /* ---- Positive type repeats ---- */

  /* Type *, *?, +, +?, ?, ?? */
  1, 1, 1, 1, 1, 1,
  /* Type upto, minupto, exact */
  1, 1, 1,
  /* Type *+, ++, ?+, upto+ */
  1, 1, 1, 1,

  /* ---- Character class & ref repeats ---- */

  /* *, *?, +, +?, ?, ?? */
  1, 1, 1, 1, 1, 1,
  /* CRRANGE, CRMINRANGE */
  1, 1,
  /* Possessive *+, ++, ?+, CRPOSRANGE */
  1, 1, 1, 1,
  /* CLASS, NCLASS, XCLASS (variable length) */
  1, 1, 1,

  /* ---- None of the following is marked, until the final pair ---- */

  /* REF, REFI, DNREF, DNREFI */
  0, 0, 0, 0,
  /* RECURSE, CALLOUT, CALLOUT_STR */
  0, 0, 0,
  /* Alt, Ket, KetRmax, KetRmin, KetRpos */
  0, 0, 0, 0, 0,
  /* Reverse, Vreverse */
  0, 0,
  /* Assert, Assert not, Assert behind, Assert behind not */
  0, 0, 0, 0,
  /* NA assert, NA assert behind */
  0, 0,
  /* ONCE, SCRIPT_RUN */
  0, 0,
  /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
  0, 0, 0, 0, 0,
  /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
  0, 0, 0, 0, 0,
  /* CREF, DNCREF */
  0, 0,
  /* RREF, DNRREF */
  0, 0,
  /* FALSE, TRUE */
  0, 0,
  /* BRAZERO, BRAMINZERO, BRAPOSZERO */
  0, 0, 0,
  /* MARK, PRUNE, PRUNE_ARG */
  0, 0, 0,
  /* SKIP, SKIP_ARG, THEN, THEN_ARG */
  0, 0, 0, 0,
  /* COMMIT, COMMIT_ARG */
  0, 0,
  /* FAIL, ACCEPT, ASSERT_ACCEPT */
  0, 0, 0,
  /* CLOSE, SKIPZERO, DEFINE */
  0, 0, 0,
  /* \B and \b in UCP mode */
  1, 1
};
271
272 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
273 and \w */
274
275 static const uint8_t toptable1[] = {
276 0, 0, 0, 0, 0, 0,
277 ctype_digit, ctype_digit,
278 ctype_space, ctype_space,
279 ctype_word, ctype_word,
280 0, 0 /* OP_ANY, OP_ALLANY */
281 };
282
283 static const uint8_t toptable2[] = {
284 0, 0, 0, 0, 0, 0,
285 ctype_digit, 0,
286 ctype_space, 0,
287 ctype_word, 0,
288 1, 1 /* OP_ANY, OP_ALLANY */
289 };
290
291
292 /* Structure for holding data about a particular state, which is in effect the
293 current data for an active path through the match tree. It must consist
294 entirely of ints because the working vector we are passed, and which we put
295 these structures in, is a vector of ints. */
296
typedef struct stateblock {
  /* Offset to the opcode for this state; a negative value has a special
  meaning. */
  int offset;
  /* Counter used for repeats. */
  int count;
  /* Extra data, used only by some states. */
  int data;
} stateblock;

/* The number of ints taken up by one stateblock. */

#define INTS_PER_STATEBLOCK ((int)(sizeof(stateblock)/sizeof(int)))
304
305
306 /* Before version 10.32 the recursive calls of internal_dfa_match() were passed
307 local working space and output vectors that were created on the stack. This has
308 caused issues for some patterns, especially in small-stack environments such as
309 Windows. A new scheme is now in use which sets up a vector on the stack, but if
310 this is too small, heap memory is used, up to the heap_limit. The main
311 parameters are all numbers of ints because the workspace is a vector of ints.
312
313 The size of the starting stack vector, DFA_START_RWS_SIZE, is in bytes, and is
314 defined in pcre2_internal.h so as to be available to pcre2test when it is
315 finding the minimum heap requirement for a match. */
316
/* The number of ints needed to hold one PCRE2_SIZE ovector element. */

#define OVEC_UNIT       (sizeof(PCRE2_SIZE)/sizeof(int))

/* Workspace sizes, all expressed as numbers of ints. */

#define RWS_BASE_SIZE   (DFA_START_RWS_SIZE/sizeof(int))  /* Vector on the stack */
#define RWS_RSIZE       1000                              /* Workspace for a recursion */
#define RWS_OVEC_RSIZE  (1000*OVEC_UNIT)                  /* Ovector for a recursion */
#define RWS_OVEC_OSIZE  (2*OVEC_UNIT)                     /* Ovector in the other cases */
323
324 /* This structure is at the start of each workspace block. */
325
typedef struct RWS_anchor {
  /* The block chained after this one, or NULL when there is none. */
  struct RWS_anchor *next;
  /* Size of the block, as a number of ints. */
  uint32_t size;
  /* Space still free in the block, as a number of ints. */
  uint32_t free;
} RWS_anchor;

/* The number of ints taken up by the anchor itself. */

#define RWS_ANCHOR_SIZE (sizeof(RWS_anchor)/sizeof(int))
333
334
335
336 /*************************************************
337 * Process a callout *
338 *************************************************/
339
340 /* This function is called to perform a callout.
341
342 Arguments:
343 code current code pointer
344 offsets points to current capture offsets
345 current_subject start of current subject match
346 ptr current position in subject
347 mb the match block
348 extracode extra code offset when called from condition
349 lengthptr where to return the callout length
350
351 Returns: the return from the callout
352 */
353
354 static int
do_callout_dfa(PCRE2_SPTR code,PCRE2_SIZE * offsets,PCRE2_SPTR current_subject,PCRE2_SPTR ptr,dfa_match_block * mb,PCRE2_SIZE extracode,PCRE2_SIZE * lengthptr)355 do_callout_dfa(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject,
356 PCRE2_SPTR ptr, dfa_match_block *mb, PCRE2_SIZE extracode,
357 PCRE2_SIZE *lengthptr)
358 {
359 pcre2_callout_block *cb = mb->cb;
360
361 *lengthptr = (code[extracode] == OP_CALLOUT)?
362 (PCRE2_SIZE)PRIV(OP_lengths)[OP_CALLOUT] :
363 (PCRE2_SIZE)GET(code, 1 + 2*LINK_SIZE + extracode);
364
365 if (mb->callout == NULL) return 0; /* No callout provided */
366
367 /* Fixed fields in the callout block are set once and for all at the start of
368 matching. */
369
370 cb->offset_vector = offsets;
371 cb->start_match = (PCRE2_SIZE)(current_subject - mb->start_subject);
372 cb->current_position = (PCRE2_SIZE)(ptr - mb->start_subject);
373 cb->pattern_position = GET(code, 1 + extracode);
374 cb->next_item_length = GET(code, 1 + LINK_SIZE + extracode);
375
376 if (code[extracode] == OP_CALLOUT)
377 {
378 cb->callout_number = code[1 + 2*LINK_SIZE + extracode];
379 cb->callout_string_offset = 0;
380 cb->callout_string = NULL;
381 cb->callout_string_length = 0;
382 }
383 else
384 {
385 cb->callout_number = 0;
386 cb->callout_string_offset = GET(code, 1 + 3*LINK_SIZE + extracode);
387 cb->callout_string = code + (1 + 4*LINK_SIZE + extracode) + 1;
388 cb->callout_string_length = *lengthptr - (1 + 4*LINK_SIZE) - 2;
389 }
390
391 return (mb->callout)(cb, mb->callout_data);
392 }
393
394
395
396 /*************************************************
397 * Expand local workspace memory *
398 *************************************************/
399
400 /* This function is called when internal_dfa_match() is about to be called
401 recursively and there is insufficient working space left in the current
402 workspace block. If there's an existing next block, use it; otherwise get a new
403 block unless the heap limit is reached.
404
405 Arguments:
406 rwsptr pointer to block pointer (updated)
407 ovecsize space needed for an ovector
408 mb the match block
409
410 Returns: 0 rwsptr has been updated
411 !0 an error code
412 */
413
414 static int
more_workspace(RWS_anchor ** rwsptr,unsigned int ovecsize,dfa_match_block * mb)415 more_workspace(RWS_anchor **rwsptr, unsigned int ovecsize, dfa_match_block *mb)
416 {
417 RWS_anchor *rws = *rwsptr;
418 RWS_anchor *new;
419
420 if (rws->next != NULL)
421 {
422 new = rws->next;
423 }
424
425 /* Sizes in the RWS_anchor blocks are in units of sizeof(int), but
426 mb->heap_limit and mb->heap_used are in kibibytes. Play carefully, to avoid
427 overflow. */
428
429 else
430 {
431 uint32_t newsize = (rws->size >= UINT32_MAX/(sizeof(int)*2))? UINT32_MAX/sizeof(int) : rws->size * 2;
432 uint32_t newsizeK = newsize/(1024/sizeof(int));
433
434 if (newsizeK + mb->heap_used > mb->heap_limit)
435 newsizeK = (uint32_t)(mb->heap_limit - mb->heap_used);
436 newsize = newsizeK*(1024/sizeof(int));
437
438 if (newsize < RWS_RSIZE + ovecsize + RWS_ANCHOR_SIZE)
439 return PCRE2_ERROR_HEAPLIMIT;
440 new = mb->memctl.malloc(newsize*sizeof(int), mb->memctl.memory_data);
441 if (new == NULL) return PCRE2_ERROR_NOMEMORY;
442 mb->heap_used += newsizeK;
443 new->next = NULL;
444 new->size = newsize;
445 rws->next = new;
446 }
447
448 new->free = new->size - RWS_ANCHOR_SIZE;
449 *rwsptr = new;
450 return 0;
451 }
452
453
454
455 /*************************************************
456 * Match a Regular Expression - DFA engine *
457 *************************************************/
458
459 /* This internal function applies a compiled pattern to a subject string,
460 starting at a given point, using a DFA engine. This function is called from the
461 external one, possibly multiple times if the pattern is not anchored. The
462 function calls itself recursively for some kinds of subpattern.
463
464 Arguments:
465 mb the match_data block with fixed information
466 this_start_code the opening bracket of this subexpression's code
467 current_subject where we currently are in the subject string
468 start_offset start offset in the subject string
469 offsets vector to contain the matching string offsets
470 offsetcount size of same
471 workspace vector of workspace
472 wscount size of same
473 rlevel function call recursion level
474
475 Returns: > 0 => number of match offset pairs placed in offsets
476 = 0 => offsets overflowed; longest matches are present
477 -1 => failed to match
478 < -1 => some kind of unexpected problem
479
480 The following macros are used for adding states to the two state vectors (one
481 for the current character, one for the following character). */
482
/* Each of these macros appends one state to a state list held in local
variables of the enclosing function: ADD_ACTIVE and ADD_ACTIVE_DATA use
active_count and next_active_state, ADD_NEW and ADD_NEW_DATA use new_count and
next_new_state. All four compare the relevant count with wscount.

Arguments:
  x     value for the offset field
  y     value for the count field
  z     value for the data field (the _DATA variants only; the other two
          macros leave the data field of the new entry untouched)

If the list is already full, the enclosing function returns
PCRE2_ERROR_DFA_WSSIZE. The count is incremented in that case as well.

Each body is wrapped in do { } while (0) so that it expands to exactly one
statement. A use such as "if (cond) ADD_NEW(a, b); else ..." is then
unambiguous and does not draw dangling-else warnings. As before, every use must
be followed by a semicolon. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count  = (y); \
      next_active_state++; \
      } \
    else return PCRE2_ERROR_DFA_WSSIZE; \
    } \
  while (0)

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count  = (y); \
      next_active_state->data   = (z); \
      next_active_state++; \
      } \
    else return PCRE2_ERROR_DFA_WSSIZE; \
    } \
  while (0)

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count  = (y); \
      next_new_state++; \
      } \
    else return PCRE2_ERROR_DFA_WSSIZE; \
    } \
  while (0)

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count  = (y); \
      next_new_state->data   = (z); \
      next_new_state++; \
      } \
    else return PCRE2_ERROR_DFA_WSSIZE; \
    } \
  while (0)
520
521 /* And now, here is the code */
522
523 static int
internal_dfa_match(dfa_match_block * mb,PCRE2_SPTR this_start_code,PCRE2_SPTR current_subject,PCRE2_SIZE start_offset,PCRE2_SIZE * offsets,uint32_t offsetcount,int * workspace,int wscount,uint32_t rlevel,int * RWS)524 internal_dfa_match(
525 dfa_match_block *mb,
526 PCRE2_SPTR this_start_code,
527 PCRE2_SPTR current_subject,
528 PCRE2_SIZE start_offset,
529 PCRE2_SIZE *offsets,
530 uint32_t offsetcount,
531 int *workspace,
532 int wscount,
533 uint32_t rlevel,
534 int *RWS)
535 {
536 stateblock *active_states, *new_states, *temp_states;
537 stateblock *next_active_state, *next_new_state;
538 const uint8_t *ctypes, *lcc, *fcc;
539 PCRE2_SPTR ptr;
540 PCRE2_SPTR end_code;
541 dfa_recursion_info new_recursive;
542 int active_count, new_count, match_count;
543
544 /* Some fields in the mb block are frequently referenced, so we load them into
545 independent variables in the hope that this will perform better. */
546
547 PCRE2_SPTR start_subject = mb->start_subject;
548 PCRE2_SPTR end_subject = mb->end_subject;
549 PCRE2_SPTR start_code = mb->start_code;
550
551 #ifdef SUPPORT_UNICODE
552 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
553 BOOL utf_or_ucp = utf || (mb->poptions & PCRE2_UCP) != 0;
554 #else
555 BOOL utf = FALSE;
556 #endif
557
558 BOOL reset_could_continue = FALSE;
559
560 if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;
561 if (rlevel++ > mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
562 offsetcount &= (uint32_t)(-2); /* Round down */
563
564 wscount -= 2;
565 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
566 (2 * INTS_PER_STATEBLOCK);
567
568 ctypes = mb->tables + ctypes_offset;
569 lcc = mb->tables + lcc_offset;
570 fcc = mb->tables + fcc_offset;
571
572 match_count = PCRE2_ERROR_NOMATCH; /* A negative number */
573
574 active_states = (stateblock *)(workspace + 2);
575 next_new_state = new_states = active_states + wscount;
576 new_count = 0;
577
578 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
579 the alternative states onto the list, and find out where the end is. This
580 makes it possible to use this function recursively, when we want to stop at a
581 matching internal ket rather than at the end.
582
583 If we are dealing with a backward assertion we have to find out the maximum
584 amount to move back, and set up each alternative appropriately. */
585
586 if (*this_start_code == OP_ASSERTBACK || *this_start_code == OP_ASSERTBACK_NOT)
587 {
588 size_t max_back = 0;
589 size_t gone_back;
590
591 end_code = this_start_code;
592 do
593 {
594 size_t back = (size_t)GET2(end_code, 2+LINK_SIZE);
595 if (back > max_back) max_back = back;
596 end_code += GET(end_code, 1);
597 }
598 while (*end_code == OP_ALT);
599
600 /* If we can't go back the amount required for the longest lookbehind
601 pattern, go back as far as we can; some alternatives may still be viable. */
602
603 #ifdef SUPPORT_UNICODE
604 /* In character mode we have to step back character by character */
605
606 if (utf)
607 {
608 for (gone_back = 0; gone_back < max_back; gone_back++)
609 {
610 if (current_subject <= start_subject) break;
611 current_subject--;
612 ACROSSCHAR(current_subject > start_subject, current_subject,
613 current_subject--);
614 }
615 }
616 else
617 #endif
618
619 /* In byte-mode we can do this quickly. */
620
621 {
622 size_t current_offset = (size_t)(current_subject - start_subject);
623 gone_back = (current_offset < max_back)? current_offset : max_back;
624 current_subject -= gone_back;
625 }
626
627 /* Save the earliest consulted character */
628
629 if (current_subject < mb->start_used_ptr)
630 mb->start_used_ptr = current_subject;
631
632 /* Now we can process the individual branches. There will be an OP_REVERSE at
633 the start of each branch, except when the length of the branch is zero. */
634
635 end_code = this_start_code;
636 do
637 {
638 uint32_t revlen = (end_code[1+LINK_SIZE] == OP_REVERSE)? 1 + IMM2_SIZE : 0;
639 size_t back = (revlen == 0)? 0 : (size_t)GET2(end_code, 2+LINK_SIZE);
640 if (back <= gone_back)
641 {
642 int bstate = (int)(end_code - start_code + 1 + LINK_SIZE + revlen);
643 ADD_NEW_DATA(-bstate, 0, (int)(gone_back - back));
644 }
645 end_code += GET(end_code, 1);
646 }
647 while (*end_code == OP_ALT);
648 }
649
650 /* This is the code for a "normal" subpattern (not a backward assertion). The
651 start of a whole pattern is always one of these. If we are at the top level,
652 we may be asked to restart matching from the same point that we reached for a
653 previous partial match. We still have to scan through the top-level branches to
654 find the end state. */
655
656 else
657 {
658 end_code = this_start_code;
659
660 /* Restarting */
661
662 if (rlevel == 1 && (mb->moptions & PCRE2_DFA_RESTART) != 0)
663 {
664 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
665 new_count = workspace[1];
666 if (!workspace[0])
667 memcpy(new_states, active_states, (size_t)new_count * sizeof(stateblock));
668 }
669
670 /* Not restarting */
671
672 else
673 {
674 int length = 1 + LINK_SIZE +
675 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
676 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
677 ? IMM2_SIZE:0);
678 do
679 {
680 ADD_NEW((int)(end_code - start_code + length), 0);
681 end_code += GET(end_code, 1);
682 length = 1 + LINK_SIZE;
683 }
684 while (*end_code == OP_ALT);
685 }
686 }
687
688 workspace[0] = 0; /* Bit indicating which vector is current */
689
690 /* Loop for scanning the subject */
691
692 ptr = current_subject;
693 for (;;)
694 {
695 int i, j;
696 int clen, dlen;
697 uint32_t c, d;
698 int forced_fail = 0;
699 BOOL partial_newline = FALSE;
700 BOOL could_continue = reset_could_continue;
701 reset_could_continue = FALSE;
702
703 if (ptr > mb->last_used_ptr) mb->last_used_ptr = ptr;
704
705 /* Make the new state list into the active state list and empty the
706 new state list. */
707
708 temp_states = active_states;
709 active_states = new_states;
710 new_states = temp_states;
711 active_count = new_count;
712 new_count = 0;
713
714 workspace[0] ^= 1; /* Remember for the restarting feature */
715 workspace[1] = active_count;
716
717 /* Set the pointers for adding new states */
718
719 next_active_state = active_states + active_count;
720 next_new_state = new_states;
721
722 /* Load the current character from the subject outside the loop, as many
723 different states may want to look at it, and we assume that at least one
724 will. */
725
726 if (ptr < end_subject)
727 {
728 clen = 1; /* Number of data items in the character */
729 #ifdef SUPPORT_UNICODE
730 GETCHARLENTEST(c, ptr, clen);
731 #else
732 c = *ptr;
733 #endif /* SUPPORT_UNICODE */
734 }
735 else
736 {
737 clen = 0; /* This indicates the end of the subject */
738 c = NOTACHAR; /* This value should never actually be used */
739 }
740
741 /* Scan up the active states and act on each one. The result of an action
742 may be to add more states to the currently active list (e.g. on hitting a
743 parenthesis) or it may be to put states on the new list, for considering
744 when we move the character pointer on. */
745
746 for (i = 0; i < active_count; i++)
747 {
748 stateblock *current_state = active_states + i;
749 BOOL caseless = FALSE;
750 PCRE2_SPTR code;
751 uint32_t codevalue;
752 int state_offset = current_state->offset;
753 int rrc;
754 int count;
755
756 /* A negative offset is a special case meaning "hold off going to this
757 (negated) state until the number of characters in the data field have
758 been skipped". If the could_continue flag was passed over from a previous
759 state, arrange for it to be passed on. */
760
761 if (state_offset < 0)
762 {
763 if (current_state->data > 0)
764 {
765 ADD_NEW_DATA(state_offset, current_state->count,
766 current_state->data - 1);
767 if (could_continue) reset_could_continue = TRUE;
768 continue;
769 }
770 else
771 {
772 current_state->offset = state_offset = -state_offset;
773 }
774 }
775
776 /* Check for a duplicate state with the same count, and skip if found.
777 See the note at the head of this module about the possibility of improving
778 performance here. */
779
780 for (j = 0; j < i; j++)
781 {
782 if (active_states[j].offset == state_offset &&
783 active_states[j].count == current_state->count)
784 goto NEXT_ACTIVE_STATE;
785 }
786
787 /* The state offset is the offset to the opcode */
788
789 code = start_code + state_offset;
790 codevalue = *code;
791
792 /* If this opcode inspects a character, but we are at the end of the
793 subject, remember the fact for use when testing for a partial match. */
794
795 if (clen == 0 && poptable[codevalue] != 0)
796 could_continue = TRUE;
797
798 /* If this opcode is followed by an inline character, load it. It is
799 tempting to test for the presence of a subject character here, but that
800 is wrong, because sometimes zero repetitions of the subject are
801 permitted.
802
803 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
804 argument that is not a data character - but is always one byte long because
805 the values are small. We have to take special action to deal with \P, \p,
806 \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
807 these ones to new opcodes. */
808
809 if (coptable[codevalue] > 0)
810 {
811 dlen = 1;
812 #ifdef SUPPORT_UNICODE
813 if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
814 #endif /* SUPPORT_UNICODE */
815 d = code[coptable[codevalue]];
816 if (codevalue >= OP_TYPESTAR)
817 {
818 switch(d)
819 {
820 case OP_ANYBYTE: return PCRE2_ERROR_DFA_UITEM;
821 case OP_NOTPROP:
822 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
823 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
824 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
825 case OP_NOT_HSPACE:
826 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
827 case OP_NOT_VSPACE:
828 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
829 default: break;
830 }
831 }
832 }
833 else
834 {
835 dlen = 0; /* Not strictly necessary, but compilers moan */
836 d = NOTACHAR; /* if these variables are not set. */
837 }
838
839
840 /* Now process the individual opcodes */
841
842 switch (codevalue)
843 {
844 /* ========================================================================== */
845 /* These cases are never obeyed. This is a fudge that causes a compile-
846 time error if the vectors coptable or poptable, which are indexed by
847 opcode, are not the correct length. It seems to be the only way to do
848 such a check at compile time, as the sizeof() operator does not work
849 in the C preprocessor. */
850
851 case OP_TABLE_LENGTH:
852 case OP_TABLE_LENGTH +
853 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
854 (sizeof(poptable) == OP_TABLE_LENGTH)):
855 return 0;
856
857 /* ========================================================================== */
858 /* Reached a closing bracket. If not at the end of the pattern, carry
859 on with the next opcode. For repeating opcodes, also add the repeat
860 state. Note that KETRPOS will always be encountered at the end of the
861 subpattern, because the possessive subpattern repeats are always handled
862 using recursive calls. Thus, it never adds any new states.
863
864 At the end of the (sub)pattern, unless we have an empty string and
865 PCRE2_NOTEMPTY is set, or PCRE2_NOTEMPTY_ATSTART is set and we are at the
866 start of the subject, save the match data, shifting up all previous
867 matches so we always have the longest first. */
868
869 case OP_KET:
870 case OP_KETRMIN:
871 case OP_KETRMAX:
872 case OP_KETRPOS:
873 if (code != end_code)
874 {
875 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
876 if (codevalue != OP_KET)
877 {
878 ADD_ACTIVE(state_offset - (int)GET(code, 1), 0);
879 }
880 }
881 else
882 {
883 if (ptr > current_subject ||
884 ((mb->moptions & PCRE2_NOTEMPTY) == 0 &&
885 ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) == 0 ||
886 current_subject > start_subject + mb->start_offset)))
887 {
888 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
889 else if (match_count > 0 && ++match_count * 2 > (int)offsetcount)
890 match_count = 0;
891 count = ((match_count == 0)? (int)offsetcount : match_count * 2) - 2;
892 if (count > 0) (void)memmove(offsets + 2, offsets,
893 (size_t)count * sizeof(PCRE2_SIZE));
894 if (offsetcount >= 2)
895 {
896 offsets[0] = (PCRE2_SIZE)(current_subject - start_subject);
897 offsets[1] = (PCRE2_SIZE)(ptr - start_subject);
898 }
899 if ((mb->moptions & PCRE2_DFA_SHORTEST) != 0) return match_count;
900 }
901 }
902 break;
903
904 /* ========================================================================== */
905 /* These opcodes add to the current list of states without looking
906 at the current character. */
907
908 /*-----------------------------------------------------------------*/
909 case OP_ALT:
910 do { code += GET(code, 1); } while (*code == OP_ALT);
911 ADD_ACTIVE((int)(code - start_code), 0);
912 break;
913
914 /*-----------------------------------------------------------------*/
915 case OP_BRA:
916 case OP_SBRA:
917 do
918 {
919 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
920 code += GET(code, 1);
921 }
922 while (*code == OP_ALT);
923 break;
924
925 /*-----------------------------------------------------------------*/
926 case OP_CBRA:
927 case OP_SCBRA:
928 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
929 code += GET(code, 1);
930 while (*code == OP_ALT)
931 {
932 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
933 code += GET(code, 1);
934 }
935 break;
936
937 /*-----------------------------------------------------------------*/
938 case OP_BRAZERO:
939 case OP_BRAMINZERO:
940 ADD_ACTIVE(state_offset + 1, 0);
941 code += 1 + GET(code, 2);
942 while (*code == OP_ALT) code += GET(code, 1);
943 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
944 break;
945
946 /*-----------------------------------------------------------------*/
947 case OP_SKIPZERO:
948 code += 1 + GET(code, 2);
949 while (*code == OP_ALT) code += GET(code, 1);
950 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
951 break;
952
953 /*-----------------------------------------------------------------*/
954 case OP_CIRC:
955 if (ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0)
956 { ADD_ACTIVE(state_offset + 1, 0); }
957 break;
958
959 /*-----------------------------------------------------------------*/
960 case OP_CIRCM:
961 if ((ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) ||
962 ((ptr != end_subject || (mb->poptions & PCRE2_ALT_CIRCUMFLEX) != 0 )
963 && WAS_NEWLINE(ptr)))
964 { ADD_ACTIVE(state_offset + 1, 0); }
965 break;
966
967 /*-----------------------------------------------------------------*/
968 case OP_EOD:
969 if (ptr >= end_subject)
970 {
971 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
972 return PCRE2_ERROR_PARTIAL;
973 else { ADD_ACTIVE(state_offset + 1, 0); }
974 }
975 break;
976
977 /*-----------------------------------------------------------------*/
978 case OP_SOD:
979 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
980 break;
981
982 /*-----------------------------------------------------------------*/
983 case OP_SOM:
984 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
985 break;
986
987
988 /* ========================================================================== */
989 /* These opcodes inspect the next subject character, and sometimes
990 the previous one as well, but do not have an argument. The variable
991 clen contains the length of the current character and is zero if we are
992 at the end of the subject. */
993
994 /*-----------------------------------------------------------------*/
995 case OP_ANY:
996 if (clen > 0 && !IS_NEWLINE(ptr))
997 {
998 if (ptr + 1 >= mb->end_subject &&
999 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1000 NLBLOCK->nltype == NLTYPE_FIXED &&
1001 NLBLOCK->nllen == 2 &&
1002 c == NLBLOCK->nl[0])
1003 {
1004 could_continue = partial_newline = TRUE;
1005 }
1006 else
1007 {
1008 ADD_NEW(state_offset + 1, 0);
1009 }
1010 }
1011 break;
1012
1013 /*-----------------------------------------------------------------*/
1014 case OP_ALLANY:
1015 if (clen > 0)
1016 { ADD_NEW(state_offset + 1, 0); }
1017 break;
1018
1019 /*-----------------------------------------------------------------*/
1020 case OP_EODN:
1021 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - mb->nllen))
1022 {
1023 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1024 return PCRE2_ERROR_PARTIAL;
1025 ADD_ACTIVE(state_offset + 1, 0);
1026 }
1027 break;
1028
1029 /*-----------------------------------------------------------------*/
1030 case OP_DOLL:
1031 if ((mb->moptions & PCRE2_NOTEOL) == 0)
1032 {
1033 if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1034 could_continue = TRUE;
1035 else if (clen == 0 ||
1036 ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
1037 (ptr == end_subject - mb->nllen)
1038 ))
1039 { ADD_ACTIVE(state_offset + 1, 0); }
1040 else if (ptr + 1 >= mb->end_subject &&
1041 (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1042 NLBLOCK->nltype == NLTYPE_FIXED &&
1043 NLBLOCK->nllen == 2 &&
1044 c == NLBLOCK->nl[0])
1045 {
1046 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1047 {
1048 reset_could_continue = TRUE;
1049 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1050 }
1051 else could_continue = partial_newline = TRUE;
1052 }
1053 }
1054 break;
1055
1056 /*-----------------------------------------------------------------*/
1057 case OP_DOLLM:
1058 if ((mb->moptions & PCRE2_NOTEOL) == 0)
1059 {
1060 if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1061 could_continue = TRUE;
1062 else if (clen == 0 ||
1063 ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
1064 { ADD_ACTIVE(state_offset + 1, 0); }
1065 else if (ptr + 1 >= mb->end_subject &&
1066 (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1067 NLBLOCK->nltype == NLTYPE_FIXED &&
1068 NLBLOCK->nllen == 2 &&
1069 c == NLBLOCK->nl[0])
1070 {
1071 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1072 {
1073 reset_could_continue = TRUE;
1074 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1075 }
1076 else could_continue = partial_newline = TRUE;
1077 }
1078 }
1079 else if (IS_NEWLINE(ptr))
1080 { ADD_ACTIVE(state_offset + 1, 0); }
1081 break;
1082
1083 /*-----------------------------------------------------------------*/
1084
1085 case OP_DIGIT:
1086 case OP_WHITESPACE:
1087 case OP_WORDCHAR:
1088 if (clen > 0 && c < 256 &&
1089 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
1090 { ADD_NEW(state_offset + 1, 0); }
1091 break;
1092
1093 /*-----------------------------------------------------------------*/
1094 case OP_NOT_DIGIT:
1095 case OP_NOT_WHITESPACE:
1096 case OP_NOT_WORDCHAR:
1097 if (clen > 0 && (c >= 256 ||
1098 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
1099 { ADD_NEW(state_offset + 1, 0); }
1100 break;
1101
1102 /*-----------------------------------------------------------------*/
1103 case OP_WORD_BOUNDARY:
1104 case OP_NOT_WORD_BOUNDARY:
1105 case OP_NOT_UCP_WORD_BOUNDARY:
1106 case OP_UCP_WORD_BOUNDARY:
1107 {
1108 int left_word, right_word;
1109
1110 if (ptr > start_subject)
1111 {
1112 PCRE2_SPTR temp = ptr - 1;
1113 if (temp < mb->start_used_ptr) mb->start_used_ptr = temp;
1114 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1115 if (utf) { BACKCHAR(temp); }
1116 #endif
1117 GETCHARTEST(d, temp);
1118 #ifdef SUPPORT_UNICODE
1119 if (codevalue == OP_UCP_WORD_BOUNDARY ||
1120 codevalue == OP_NOT_UCP_WORD_BOUNDARY)
1121 {
1122 int chartype = UCD_CHARTYPE(d);
1123 int category = PRIV(ucp_gentype)[chartype];
1124 left_word = (category == ucp_L || category == ucp_N ||
1125 chartype == ucp_Mn || chartype == ucp_Pc);
1126 }
1127 else
1128 #endif
1129 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1130 }
1131 else left_word = FALSE;
1132
1133 if (clen > 0)
1134 {
1135 if (ptr >= mb->last_used_ptr)
1136 {
1137 PCRE2_SPTR temp = ptr + 1;
1138 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1139 if (utf) { FORWARDCHARTEST(temp, mb->end_subject); }
1140 #endif
1141 mb->last_used_ptr = temp;
1142 }
1143 #ifdef SUPPORT_UNICODE
1144 if (codevalue == OP_UCP_WORD_BOUNDARY ||
1145 codevalue == OP_NOT_UCP_WORD_BOUNDARY)
1146 {
1147 int chartype = UCD_CHARTYPE(c);
1148 int category = PRIV(ucp_gentype)[chartype];
1149 right_word = (category == ucp_L || category == ucp_N ||
1150 chartype == ucp_Mn || chartype == ucp_Pc);
1151 }
1152 else
1153 #endif
1154 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1155 }
1156 else right_word = FALSE;
1157
1158 if ((left_word == right_word) ==
1159 (codevalue == OP_NOT_WORD_BOUNDARY ||
1160 codevalue == OP_NOT_UCP_WORD_BOUNDARY))
1161 { ADD_ACTIVE(state_offset + 1, 0); }
1162 }
1163 break;
1164
1165
1166 /*-----------------------------------------------------------------*/
1167 /* Check the next character by Unicode property. We will get here only
1168 if the support is in the binary; otherwise a compile-time error occurs.
1169 */
1170
1171 #ifdef SUPPORT_UNICODE
1172 case OP_PROP:
1173 case OP_NOTPROP:
1174 if (clen > 0)
1175 {
1176 BOOL OK;
1177 int chartype;
1178 const uint32_t *cp;
1179 const ucd_record * prop = GET_UCD(c);
1180 switch(code[1])
1181 {
1182 case PT_ANY:
1183 OK = TRUE;
1184 break;
1185
1186 case PT_LAMP:
1187 chartype = prop->chartype;
1188 OK = chartype == ucp_Lu || chartype == ucp_Ll ||
1189 chartype == ucp_Lt;
1190 break;
1191
1192 case PT_GC:
1193 OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1194 break;
1195
1196 case PT_PC:
1197 OK = prop->chartype == code[2];
1198 break;
1199
1200 case PT_SC:
1201 OK = prop->script == code[2];
1202 break;
1203
1204 case PT_SCX:
1205 OK = (prop->script == code[2] ||
1206 MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[2]) != 0);
1207 break;
1208
1209 /* These are specials for combination cases. */
1210
1211 case PT_ALNUM:
1212 chartype = prop->chartype;
1213 OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1214 PRIV(ucp_gentype)[chartype] == ucp_N;
1215 break;
1216
1217 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1218 which means that Perl space and POSIX space are now identical. PCRE
1219 was changed at release 8.34. */
1220
1221 case PT_SPACE: /* Perl space */
1222 case PT_PXSPACE: /* POSIX space */
1223 switch(c)
1224 {
1225 HSPACE_CASES:
1226 VSPACE_CASES:
1227 OK = TRUE;
1228 break;
1229
1230 default:
1231 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1232 break;
1233 }
1234 break;
1235
1236 case PT_WORD:
1237 chartype = prop->chartype;
1238 OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1239 PRIV(ucp_gentype)[chartype] == ucp_N ||
1240 chartype == ucp_Mn || chartype == ucp_Pc;
1241 break;
1242
1243 case PT_CLIST:
1244 #if PCRE2_CODE_UNIT_WIDTH == 32
1245 if (c > MAX_UTF_CODE_POINT)
1246 {
1247 OK = FALSE;
1248 break;
1249 }
1250 #endif
1251 cp = PRIV(ucd_caseless_sets) + code[2];
1252 for (;;)
1253 {
1254 if (c < *cp) { OK = FALSE; break; }
1255 if (c == *cp++) { OK = TRUE; break; }
1256 }
1257 break;
1258
1259 case PT_UCNC:
1260 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1261 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1262 c >= 0xe000;
1263 break;
1264
1265 case PT_BIDICL:
1266 OK = UCD_BIDICLASS(c) == code[2];
1267 break;
1268
1269 case PT_BOOL:
1270 OK = MAPBIT(PRIV(ucd_boolprop_sets) +
1271 UCD_BPROPS_PROP(prop), code[2]) != 0;
1272 break;
1273
1274 /* Should never occur, but keep compilers from grumbling. */
1275
1276 default:
1277 OK = codevalue != OP_PROP;
1278 break;
1279 }
1280
1281 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1282 }
1283 break;
1284 #endif
1285
1286
1287
1288 /* ========================================================================== */
1289 /* These opcodes likewise inspect the subject character, but have an
1290 argument that is not a data character. It is one of these opcodes:
1291 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1292 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1293
1294 case OP_TYPEPLUS:
1295 case OP_TYPEMINPLUS:
1296 case OP_TYPEPOSPLUS:
1297 count = current_state->count; /* Already matched */
1298 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1299 if (clen > 0)
1300 {
1301 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1302 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1303 NLBLOCK->nltype == NLTYPE_FIXED &&
1304 NLBLOCK->nllen == 2 &&
1305 c == NLBLOCK->nl[0])
1306 {
1307 could_continue = partial_newline = TRUE;
1308 }
1309 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1310 (c < 256 &&
1311 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1312 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1313 {
1314 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1315 {
1316 active_count--; /* Remove non-match possibility */
1317 next_active_state--;
1318 }
1319 count++;
1320 ADD_NEW(state_offset, count);
1321 }
1322 }
1323 break;
1324
1325 /*-----------------------------------------------------------------*/
1326 case OP_TYPEQUERY:
1327 case OP_TYPEMINQUERY:
1328 case OP_TYPEPOSQUERY:
1329 ADD_ACTIVE(state_offset + 2, 0);
1330 if (clen > 0)
1331 {
1332 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1333 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1334 NLBLOCK->nltype == NLTYPE_FIXED &&
1335 NLBLOCK->nllen == 2 &&
1336 c == NLBLOCK->nl[0])
1337 {
1338 could_continue = partial_newline = TRUE;
1339 }
1340 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1341 (c < 256 &&
1342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1343 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1344 {
1345 if (codevalue == OP_TYPEPOSQUERY)
1346 {
1347 active_count--; /* Remove non-match possibility */
1348 next_active_state--;
1349 }
1350 ADD_NEW(state_offset + 2, 0);
1351 }
1352 }
1353 break;
1354
1355 /*-----------------------------------------------------------------*/
1356 case OP_TYPESTAR:
1357 case OP_TYPEMINSTAR:
1358 case OP_TYPEPOSSTAR:
1359 ADD_ACTIVE(state_offset + 2, 0);
1360 if (clen > 0)
1361 {
1362 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1363 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1364 NLBLOCK->nltype == NLTYPE_FIXED &&
1365 NLBLOCK->nllen == 2 &&
1366 c == NLBLOCK->nl[0])
1367 {
1368 could_continue = partial_newline = TRUE;
1369 }
1370 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1371 (c < 256 &&
1372 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1373 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1374 {
1375 if (codevalue == OP_TYPEPOSSTAR)
1376 {
1377 active_count--; /* Remove non-match possibility */
1378 next_active_state--;
1379 }
1380 ADD_NEW(state_offset, 0);
1381 }
1382 }
1383 break;
1384
1385 /*-----------------------------------------------------------------*/
1386 case OP_TYPEEXACT:
1387 count = current_state->count; /* Number already matched */
1388 if (clen > 0)
1389 {
1390 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1391 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1392 NLBLOCK->nltype == NLTYPE_FIXED &&
1393 NLBLOCK->nllen == 2 &&
1394 c == NLBLOCK->nl[0])
1395 {
1396 could_continue = partial_newline = TRUE;
1397 }
1398 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1399 (c < 256 &&
1400 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1401 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1402 {
1403 if (++count >= (int)GET2(code, 1))
1404 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1405 else
1406 { ADD_NEW(state_offset, count); }
1407 }
1408 }
1409 break;
1410
1411 /*-----------------------------------------------------------------*/
1412 case OP_TYPEUPTO:
1413 case OP_TYPEMINUPTO:
1414 case OP_TYPEPOSUPTO:
1415 ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1416 count = current_state->count; /* Number already matched */
1417 if (clen > 0)
1418 {
1419 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1420 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1421 NLBLOCK->nltype == NLTYPE_FIXED &&
1422 NLBLOCK->nllen == 2 &&
1423 c == NLBLOCK->nl[0])
1424 {
1425 could_continue = partial_newline = TRUE;
1426 }
1427 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1428 (c < 256 &&
1429 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1430 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1431 {
1432 if (codevalue == OP_TYPEPOSUPTO)
1433 {
1434 active_count--; /* Remove non-match possibility */
1435 next_active_state--;
1436 }
1437 if (++count >= (int)GET2(code, 1))
1438 { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1439 else
1440 { ADD_NEW(state_offset, count); }
1441 }
1442 }
1443 break;
1444
1445 /* ========================================================================== */
1446 /* These are virtual opcodes that are used when something like
1447 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_HSPACE, or
1448 OP_VSPACE as its argument. This keeps the code above fast for the other
1449 cases. The argument is in the d variable. */
1450
1451 #ifdef SUPPORT_UNICODE
1452 case OP_PROP_EXTRA + OP_TYPEPLUS:
1453 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1454 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1455 count = current_state->count; /* Already matched */
1456 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1457 if (clen > 0)
1458 {
1459 BOOL OK;
1460 int chartype;
1461 const uint32_t *cp;
1462 const ucd_record * prop = GET_UCD(c);
1463 switch(code[2])
1464 {
1465 case PT_ANY:
1466 OK = TRUE;
1467 break;
1468
1469 case PT_LAMP:
1470 chartype = prop->chartype;
1471 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1472 break;
1473
1474 case PT_GC:
1475 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1476 break;
1477
1478 case PT_PC:
1479 OK = prop->chartype == code[3];
1480 break;
1481
1482 case PT_SC:
1483 OK = prop->script == code[3];
1484 break;
1485
1486 case PT_SCX:
1487 OK = (prop->script == code[3] ||
1488 MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[3]) != 0);
1489 break;
1490
1491 /* These are specials for combination cases. */
1492
1493 case PT_ALNUM:
1494 chartype = prop->chartype;
1495 OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1496 PRIV(ucp_gentype)[chartype] == ucp_N;
1497 break;
1498
1499 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1500 which means that Perl space and POSIX space are now identical. PCRE
1501 was changed at release 8.34. */
1502
1503 case PT_SPACE: /* Perl space */
1504 case PT_PXSPACE: /* POSIX space */
1505 switch(c)
1506 {
1507 HSPACE_CASES:
1508 VSPACE_CASES:
1509 OK = TRUE;
1510 break;
1511
1512 default:
1513 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1514 break;
1515 }
1516 break;
1517
1518 case PT_WORD:
1519 chartype = prop->chartype;
1520 OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1521 PRIV(ucp_gentype)[chartype] == ucp_N ||
1522 chartype == ucp_Mn || chartype == ucp_Pc;
1523 break;
1524
1525 case PT_CLIST:
1526 #if PCRE2_CODE_UNIT_WIDTH == 32
1527 if (c > MAX_UTF_CODE_POINT)
1528 {
1529 OK = FALSE;
1530 break;
1531 }
1532 #endif
1533 cp = PRIV(ucd_caseless_sets) + code[3];
1534 for (;;)
1535 {
1536 if (c < *cp) { OK = FALSE; break; }
1537 if (c == *cp++) { OK = TRUE; break; }
1538 }
1539 break;
1540
1541 case PT_UCNC:
1542 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1543 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1544 c >= 0xe000;
1545 break;
1546
1547 case PT_BIDICL:
1548 OK = UCD_BIDICLASS(c) == code[3];
1549 break;
1550
1551 case PT_BOOL:
1552 OK = MAPBIT(PRIV(ucd_boolprop_sets) +
1553 UCD_BPROPS_PROP(prop), code[3]) != 0;
1554 break;
1555
1556 /* Should never occur, but keep compilers from grumbling. */
1557
1558 default:
1559 OK = codevalue != OP_PROP;
1560 break;
1561 }
1562
1563 if (OK == (d == OP_PROP))
1564 {
1565 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1566 {
1567 active_count--; /* Remove non-match possibility */
1568 next_active_state--;
1569 }
1570 count++;
1571 ADD_NEW(state_offset, count);
1572 }
1573 }
1574 break;
1575
1576 /*-----------------------------------------------------------------*/
1577 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1578 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1579 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1580 count = current_state->count; /* Already matched */
1581 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1582 if (clen > 0)
1583 {
1584 int ncount = 0;
1585 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1586 {
1587 active_count--; /* Remove non-match possibility */
1588 next_active_state--;
1589 }
1590 (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1591 &ncount);
1592 count++;
1593 ADD_NEW_DATA(-state_offset, count, ncount);
1594 }
1595 break;
1596 #endif
1597
1598 /*-----------------------------------------------------------------*/
1599 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1600 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1601 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1602 count = current_state->count; /* Already matched */
1603 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1604 if (clen > 0)
1605 {
1606 int ncount = 0;
1607 switch (c)
1608 {
1609 case CHAR_VT:
1610 case CHAR_FF:
1611 case CHAR_NEL:
1612 #ifndef EBCDIC
1613 case 0x2028:
1614 case 0x2029:
1615 #endif /* Not EBCDIC */
1616 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1617 goto ANYNL01;
1618
1619 case CHAR_CR:
1620 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1621 /* Fall through */
1622
1623 ANYNL01:
1624 case CHAR_LF:
1625 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1626 {
1627 active_count--; /* Remove non-match possibility */
1628 next_active_state--;
1629 }
1630 count++;
1631 ADD_NEW_DATA(-state_offset, count, ncount);
1632 break;
1633
1634 default:
1635 break;
1636 }
1637 }
1638 break;
1639
1640 /*-----------------------------------------------------------------*/
1641 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1642 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1643 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1644 count = current_state->count; /* Already matched */
1645 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1646 if (clen > 0)
1647 {
1648 BOOL OK;
1649 switch (c)
1650 {
1651 VSPACE_CASES:
1652 OK = TRUE;
1653 break;
1654
1655 default:
1656 OK = FALSE;
1657 break;
1658 }
1659
1660 if (OK == (d == OP_VSPACE))
1661 {
1662 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1663 {
1664 active_count--; /* Remove non-match possibility */
1665 next_active_state--;
1666 }
1667 count++;
1668 ADD_NEW_DATA(-state_offset, count, 0);
1669 }
1670 }
1671 break;
1672
1673 /*-----------------------------------------------------------------*/
1674 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1675 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1676 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1677 count = current_state->count; /* Already matched */
1678 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1679 if (clen > 0)
1680 {
1681 BOOL OK;
1682 switch (c)
1683 {
1684 HSPACE_CASES:
1685 OK = TRUE;
1686 break;
1687
1688 default:
1689 OK = FALSE;
1690 break;
1691 }
1692
1693 if (OK == (d == OP_HSPACE))
1694 {
1695 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1696 {
1697 active_count--; /* Remove non-match possibility */
1698 next_active_state--;
1699 }
1700 count++;
1701 ADD_NEW_DATA(-state_offset, count, 0);
1702 }
1703 }
1704 break;
1705
1706 /*-----------------------------------------------------------------*/
1707 #ifdef SUPPORT_UNICODE
1708 case OP_PROP_EXTRA + OP_TYPEQUERY:
1709 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1710 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1711 count = 4;
1712 goto QS1;
1713
1714 case OP_PROP_EXTRA + OP_TYPESTAR:
1715 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1716 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1717 count = 0;
1718
1719 QS1:
1720
1721 ADD_ACTIVE(state_offset + 4, 0);
1722 if (clen > 0)
1723 {
1724 BOOL OK;
1725 int chartype;
1726 const uint32_t *cp;
1727 const ucd_record * prop = GET_UCD(c);
1728 switch(code[2])
1729 {
1730 case PT_ANY:
1731 OK = TRUE;
1732 break;
1733
1734 case PT_LAMP:
1735 chartype = prop->chartype;
1736 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1737 break;
1738
1739 case PT_GC:
1740 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1741 break;
1742
1743 case PT_PC:
1744 OK = prop->chartype == code[3];
1745 break;
1746
1747 case PT_SC:
1748 OK = prop->script == code[3];
1749 break;
1750
1751 case PT_SCX:
1752 OK = (prop->script == code[3] ||
1753 MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[3]) != 0);
1754 break;
1755
1756 /* These are specials for combination cases. */
1757
1758 case PT_ALNUM:
1759 chartype = prop->chartype;
1760 OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1761 PRIV(ucp_gentype)[chartype] == ucp_N;
1762 break;
1763
1764 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1765 which means that Perl space and POSIX space are now identical. PCRE
1766 was changed at release 8.34. */
1767
1768 case PT_SPACE: /* Perl space */
1769 case PT_PXSPACE: /* POSIX space */
1770 switch(c)
1771 {
1772 HSPACE_CASES:
1773 VSPACE_CASES:
1774 OK = TRUE;
1775 break;
1776
1777 default:
1778 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1779 break;
1780 }
1781 break;
1782
1783 case PT_WORD:
1784 chartype = prop->chartype;
1785 OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1786 PRIV(ucp_gentype)[chartype] == ucp_N ||
1787 chartype == ucp_Mn || chartype == ucp_Pc;
1788 break;
1789
1790 case PT_CLIST:
1791 #if PCRE2_CODE_UNIT_WIDTH == 32
1792 if (c > MAX_UTF_CODE_POINT)
1793 {
1794 OK = FALSE;
1795 break;
1796 }
1797 #endif
1798 cp = PRIV(ucd_caseless_sets) + code[3];
1799 for (;;)
1800 {
1801 if (c < *cp) { OK = FALSE; break; }
1802 if (c == *cp++) { OK = TRUE; break; }
1803 }
1804 break;
1805
1806 case PT_UCNC:
1807 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1808 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1809 c >= 0xe000;
1810 break;
1811
1812 case PT_BIDICL:
1813 OK = UCD_BIDICLASS(c) == code[3];
1814 break;
1815
1816 case PT_BOOL:
1817 OK = MAPBIT(PRIV(ucd_boolprop_sets) +
1818 UCD_BPROPS_PROP(prop), code[3]) != 0;
1819 break;
1820
1821 /* Should never occur, but keep compilers from grumbling. */
1822
1823 default:
1824 OK = codevalue != OP_PROP;
1825 break;
1826 }
1827
1828 if (OK == (d == OP_PROP))
1829 {
1830 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1831 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1832 {
1833 active_count--; /* Remove non-match possibility */
1834 next_active_state--;
1835 }
1836 ADD_NEW(state_offset + count, 0);
1837 }
1838 }
1839 break;
1840
1841 /*-----------------------------------------------------------------*/
1842 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1843 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1844 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1845 count = 2;
1846 goto QS2;
1847
1848 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1849 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1850 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1851 count = 0;
1852
1853 QS2:
1854
1855 ADD_ACTIVE(state_offset + 2, 0);
1856 if (clen > 0)
1857 {
1858 int ncount = 0;
1859 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1860 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1861 {
1862 active_count--; /* Remove non-match possibility */
1863 next_active_state--;
1864 }
1865 (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1866 &ncount);
1867 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1868 }
1869 break;
1870 #endif
1871
1872 /*-----------------------------------------------------------------*/
1873 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1874 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1875 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1876 count = 2;
1877 goto QS3;
1878
1879 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1880 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1881 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1882 count = 0;
1883
1884 QS3:
1885 ADD_ACTIVE(state_offset + 2, 0);
1886 if (clen > 0)
1887 {
1888 int ncount = 0;
1889 switch (c)
1890 {
1891 case CHAR_VT:
1892 case CHAR_FF:
1893 case CHAR_NEL:
1894 #ifndef EBCDIC
1895 case 0x2028:
1896 case 0x2029:
1897 #endif /* Not EBCDIC */
1898 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1899 goto ANYNL02;
1900
1901 case CHAR_CR:
1902 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1903 /* Fall through */
1904
1905 ANYNL02:
1906 case CHAR_LF:
1907 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1908 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1909 {
1910 active_count--; /* Remove non-match possibility */
1911 next_active_state--;
1912 }
1913 ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1914 break;
1915
1916 default:
1917 break;
1918 }
1919 }
1920 break;
1921
1922 /*-----------------------------------------------------------------*/
1923 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1924 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1925 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1926 count = 2;
1927 goto QS4;
1928
1929 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1930 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1931 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1932 count = 0;
1933
1934 QS4:
1935 ADD_ACTIVE(state_offset + 2, 0);
1936 if (clen > 0)
1937 {
1938 BOOL OK;
1939 switch (c)
1940 {
1941 VSPACE_CASES:
1942 OK = TRUE;
1943 break;
1944
1945 default:
1946 OK = FALSE;
1947 break;
1948 }
1949 if (OK == (d == OP_VSPACE))
1950 {
1951 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1952 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1953 {
1954 active_count--; /* Remove non-match possibility */
1955 next_active_state--;
1956 }
1957 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1958 }
1959 }
1960 break;
1961
1962 /*-----------------------------------------------------------------*/
1963 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1964 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1965 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1966 count = 2;
1967 goto QS5;
1968
1969 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1970 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1971 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1972 count = 0;
1973
1974 QS5:
1975 ADD_ACTIVE(state_offset + 2, 0);
1976 if (clen > 0)
1977 {
1978 BOOL OK;
1979 switch (c)
1980 {
1981 HSPACE_CASES:
1982 OK = TRUE;
1983 break;
1984
1985 default:
1986 OK = FALSE;
1987 break;
1988 }
1989
1990 if (OK == (d == OP_HSPACE))
1991 {
1992 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1993 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1994 {
1995 active_count--; /* Remove non-match possibility */
1996 next_active_state--;
1997 }
1998 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1999 }
2000 }
2001 break;
2002
2003 /*-----------------------------------------------------------------*/
2004 #ifdef SUPPORT_UNICODE
2005 case OP_PROP_EXTRA + OP_TYPEEXACT:
2006 case OP_PROP_EXTRA + OP_TYPEUPTO:
2007 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
2008 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
2009 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
2010 { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
2011 count = current_state->count; /* Number already matched */
2012 if (clen > 0)
2013 {
2014 BOOL OK;
2015 int chartype;
2016 const uint32_t *cp;
2017 const ucd_record * prop = GET_UCD(c);
2018 switch(code[1 + IMM2_SIZE + 1])
2019 {
2020 case PT_ANY:
2021 OK = TRUE;
2022 break;
2023
2024 case PT_LAMP:
2025 chartype = prop->chartype;
2026 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
2027 break;
2028
2029 case PT_GC:
2030 OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
2031 break;
2032
2033 case PT_PC:
2034 OK = prop->chartype == code[1 + IMM2_SIZE + 2];
2035 break;
2036
2037 case PT_SC:
2038 OK = prop->script == code[1 + IMM2_SIZE + 2];
2039 break;
2040
2041 case PT_SCX:
2042 OK = (prop->script == code[1 + IMM2_SIZE + 2] ||
2043 MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop),
2044 code[1 + IMM2_SIZE + 2]) != 0);
2045 break;
2046
2047 /* These are specials for combination cases. */
2048
2049 case PT_ALNUM:
2050 chartype = prop->chartype;
2051 OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
2052 PRIV(ucp_gentype)[chartype] == ucp_N;
2053 break;
2054
2055 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2056 which means that Perl space and POSIX space are now identical. PCRE
2057 was changed at release 8.34. */
2058
2059 case PT_SPACE: /* Perl space */
2060 case PT_PXSPACE: /* POSIX space */
2061 switch(c)
2062 {
2063 HSPACE_CASES:
2064 VSPACE_CASES:
2065 OK = TRUE;
2066 break;
2067
2068 default:
2069 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
2070 break;
2071 }
2072 break;
2073
2074 case PT_WORD:
2075 chartype = prop->chartype;
2076 OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
2077 PRIV(ucp_gentype)[chartype] == ucp_N ||
2078 chartype == ucp_Mn || chartype == ucp_Pc;
2079 break;
2080
2081 case PT_CLIST:
2082 #if PCRE2_CODE_UNIT_WIDTH == 32
2083 if (c > MAX_UTF_CODE_POINT)
2084 {
2085 OK = FALSE;
2086 break;
2087 }
2088 #endif
2089 cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
2090 for (;;)
2091 {
2092 if (c < *cp) { OK = FALSE; break; }
2093 if (c == *cp++) { OK = TRUE; break; }
2094 }
2095 break;
2096
2097 case PT_UCNC:
2098 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2099 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2100 c >= 0xe000;
2101 break;
2102
2103 case PT_BIDICL:
2104 OK = UCD_BIDICLASS(c) == code[1 + IMM2_SIZE + 2];
2105 break;
2106
2107 case PT_BOOL:
2108 OK = MAPBIT(PRIV(ucd_boolprop_sets) +
2109 UCD_BPROPS_PROP(prop), code[1 + IMM2_SIZE + 2]) != 0;
2110 break;
2111
2112 /* Should never occur, but keep compilers from grumbling. */
2113
2114 default:
2115 OK = codevalue != OP_PROP;
2116 break;
2117 }
2118
2119 if (OK == (d == OP_PROP))
2120 {
2121 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
2122 {
2123 active_count--; /* Remove non-match possibility */
2124 next_active_state--;
2125 }
2126 if (++count >= (int)GET2(code, 1))
2127 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
2128 else
2129 { ADD_NEW(state_offset, count); }
2130 }
2131 }
2132 break;
2133
2134 /*-----------------------------------------------------------------*/
2135 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
2136 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
2137 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
2138 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
2139 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
2140 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2141 count = current_state->count; /* Number already matched */
2142 if (clen > 0)
2143 {
2144 PCRE2_SPTR nptr;
2145 int ncount = 0;
2146 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
2147 {
2148 active_count--; /* Remove non-match possibility */
2149 next_active_state--;
2150 }
2151 nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
2152 &ncount);
2153 if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2154 reset_could_continue = TRUE;
2155 if (++count >= (int)GET2(code, 1))
2156 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2157 else
2158 { ADD_NEW_DATA(-state_offset, count, ncount); }
2159 }
2160 break;
2161 #endif
2162
2163 /*-----------------------------------------------------------------*/
2164 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
2165 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
2166 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
2167 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
2168 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
2169 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2170 count = current_state->count; /* Number already matched */
2171 if (clen > 0)
2172 {
2173 int ncount = 0;
2174 switch (c)
2175 {
2176 case CHAR_VT:
2177 case CHAR_FF:
2178 case CHAR_NEL:
2179 #ifndef EBCDIC
2180 case 0x2028:
2181 case 0x2029:
2182 #endif /* Not EBCDIC */
2183 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2184 goto ANYNL03;
2185
2186 case CHAR_CR:
2187 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
2188 /* Fall through */
2189
2190 ANYNL03:
2191 case CHAR_LF:
2192 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
2193 {
2194 active_count--; /* Remove non-match possibility */
2195 next_active_state--;
2196 }
2197 if (++count >= (int)GET2(code, 1))
2198 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2199 else
2200 { ADD_NEW_DATA(-state_offset, count, ncount); }
2201 break;
2202
2203 default:
2204 break;
2205 }
2206 }
2207 break;
2208
2209 /*-----------------------------------------------------------------*/
2210 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
2211 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
2212 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2213 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2214 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2215 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2216 count = current_state->count; /* Number already matched */
2217 if (clen > 0)
2218 {
2219 BOOL OK;
2220 switch (c)
2221 {
2222 VSPACE_CASES:
2223 OK = TRUE;
2224 break;
2225
2226 default:
2227 OK = FALSE;
2228 }
2229
2230 if (OK == (d == OP_VSPACE))
2231 {
2232 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2233 {
2234 active_count--; /* Remove non-match possibility */
2235 next_active_state--;
2236 }
2237 if (++count >= (int)GET2(code, 1))
2238 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2239 else
2240 { ADD_NEW_DATA(-state_offset, count, 0); }
2241 }
2242 }
2243 break;
2244
2245 /*-----------------------------------------------------------------*/
2246 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2247 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2248 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2249 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2250 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2251 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2252 count = current_state->count; /* Number already matched */
2253 if (clen > 0)
2254 {
2255 BOOL OK;
2256 switch (c)
2257 {
2258 HSPACE_CASES:
2259 OK = TRUE;
2260 break;
2261
2262 default:
2263 OK = FALSE;
2264 break;
2265 }
2266
2267 if (OK == (d == OP_HSPACE))
2268 {
2269 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2270 {
2271 active_count--; /* Remove non-match possibility */
2272 next_active_state--;
2273 }
2274 if (++count >= (int)GET2(code, 1))
2275 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2276 else
2277 { ADD_NEW_DATA(-state_offset, count, 0); }
2278 }
2279 }
2280 break;
2281
2282 /* ========================================================================== */
2283 /* These opcodes are followed by a character that is usually compared
2284 to the current subject character; it is loaded into d. We still get
2285 here even if there is no subject character, because in some cases zero
2286 repetitions are permitted. */
2287
2288 /*-----------------------------------------------------------------*/
2289 case OP_CHAR:
2290 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2291 break;
2292
2293 /*-----------------------------------------------------------------*/
2294 case OP_CHARI:
2295 if (clen == 0) break;
2296
2297 #ifdef SUPPORT_UNICODE
2298 if (utf_or_ucp)
2299 {
2300 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2301 {
2302 unsigned int othercase;
2303 if (c < 128)
2304 othercase = fcc[c];
2305 else
2306 othercase = UCD_OTHERCASE(c);
2307 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2308 }
2309 }
2310 else
2311 #endif /* SUPPORT_UNICODE */
2312 /* Not UTF or UCP mode */
2313 {
2314 if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2315 { ADD_NEW(state_offset + 2, 0); }
2316 }
2317 break;
2318
2319
2320 #ifdef SUPPORT_UNICODE
2321 /*-----------------------------------------------------------------*/
2322 /* This is a tricky one because it can match more than one character.
2323 Find out how many characters to skip, and then set up a negative state
2324 to wait for them to pass before continuing. */
2325
2326 case OP_EXTUNI:
2327 if (clen > 0)
2328 {
2329 int ncount = 0;
2330 PCRE2_SPTR nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject,
2331 end_subject, utf, &ncount);
2332 if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2333 reset_could_continue = TRUE;
2334 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2335 }
2336 break;
2337 #endif
2338
2339 /*-----------------------------------------------------------------*/
2340 /* This is tricky like EXTUNI because it too can match more than one
2341 character (when CR is followed by LF). In this case, set up a negative
2342 state to wait for one character to pass before continuing. */
2343
2344 case OP_ANYNL:
2345 if (clen > 0) switch(c)
2346 {
2347 case CHAR_VT:
2348 case CHAR_FF:
2349 case CHAR_NEL:
2350 #ifndef EBCDIC
2351 case 0x2028:
2352 case 0x2029:
2353 #endif /* Not EBCDIC */
2354 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2355 /* Fall through */
2356
2357 case CHAR_LF:
2358 ADD_NEW(state_offset + 1, 0);
2359 break;
2360
2361 case CHAR_CR:
2362 if (ptr + 1 >= end_subject)
2363 {
2364 ADD_NEW(state_offset + 1, 0);
2365 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2366 reset_could_continue = TRUE;
2367 }
2368 else if (UCHAR21TEST(ptr + 1) == CHAR_LF)
2369 {
2370 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2371 }
2372 else
2373 {
2374 ADD_NEW(state_offset + 1, 0);
2375 }
2376 break;
2377 }
2378 break;
2379
2380 /*-----------------------------------------------------------------*/
2381 case OP_NOT_VSPACE:
2382 if (clen > 0) switch(c)
2383 {
2384 VSPACE_CASES:
2385 break;
2386
2387 default:
2388 ADD_NEW(state_offset + 1, 0);
2389 break;
2390 }
2391 break;
2392
2393 /*-----------------------------------------------------------------*/
2394 case OP_VSPACE:
2395 if (clen > 0) switch(c)
2396 {
2397 VSPACE_CASES:
2398 ADD_NEW(state_offset + 1, 0);
2399 break;
2400
2401 default:
2402 break;
2403 }
2404 break;
2405
2406 /*-----------------------------------------------------------------*/
2407 case OP_NOT_HSPACE:
2408 if (clen > 0) switch(c)
2409 {
2410 HSPACE_CASES:
2411 break;
2412
2413 default:
2414 ADD_NEW(state_offset + 1, 0);
2415 break;
2416 }
2417 break;
2418
2419 /*-----------------------------------------------------------------*/
2420 case OP_HSPACE:
2421 if (clen > 0) switch(c)
2422 {
2423 HSPACE_CASES:
2424 ADD_NEW(state_offset + 1, 0);
2425 break;
2426
2427 default:
2428 break;
2429 }
2430 break;
2431
2432 /*-----------------------------------------------------------------*/
2433 /* Match a negated single character casefully. */
2434
2435 case OP_NOT:
2436 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2437 break;
2438
2439 /*-----------------------------------------------------------------*/
2440 /* Match a negated single character caselessly. */
2441
2442 case OP_NOTI:
2443 if (clen > 0)
2444 {
2445 uint32_t otherd;
2446 #ifdef SUPPORT_UNICODE
2447 if (utf_or_ucp && d >= 128)
2448 otherd = UCD_OTHERCASE(d);
2449 else
2450 #endif /* SUPPORT_UNICODE */
2451 otherd = TABLE_GET(d, fcc, d);
2452 if (c != d && c != otherd)
2453 { ADD_NEW(state_offset + dlen + 1, 0); }
2454 }
2455 break;
2456
2457 /*-----------------------------------------------------------------*/
2458 case OP_PLUSI:
2459 case OP_MINPLUSI:
2460 case OP_POSPLUSI:
2461 case OP_NOTPLUSI:
2462 case OP_NOTMINPLUSI:
2463 case OP_NOTPOSPLUSI:
2464 caseless = TRUE;
2465 codevalue -= OP_STARI - OP_STAR;
2466
2467 /* Fall through */
2468 case OP_PLUS:
2469 case OP_MINPLUS:
2470 case OP_POSPLUS:
2471 case OP_NOTPLUS:
2472 case OP_NOTMINPLUS:
2473 case OP_NOTPOSPLUS:
2474 count = current_state->count; /* Already matched */
2475 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2476 if (clen > 0)
2477 {
2478 uint32_t otherd = NOTACHAR;
2479 if (caseless)
2480 {
2481 #ifdef SUPPORT_UNICODE
2482 if (utf_or_ucp && d >= 128)
2483 otherd = UCD_OTHERCASE(d);
2484 else
2485 #endif /* SUPPORT_UNICODE */
2486 otherd = TABLE_GET(d, fcc, d);
2487 }
2488 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2489 {
2490 if (count > 0 &&
2491 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2492 {
2493 active_count--; /* Remove non-match possibility */
2494 next_active_state--;
2495 }
2496 count++;
2497 ADD_NEW(state_offset, count);
2498 }
2499 }
2500 break;
2501
2502 /*-----------------------------------------------------------------*/
2503 case OP_QUERYI:
2504 case OP_MINQUERYI:
2505 case OP_POSQUERYI:
2506 case OP_NOTQUERYI:
2507 case OP_NOTMINQUERYI:
2508 case OP_NOTPOSQUERYI:
2509 caseless = TRUE;
2510 codevalue -= OP_STARI - OP_STAR;
2511 /* Fall through */
2512 case OP_QUERY:
2513 case OP_MINQUERY:
2514 case OP_POSQUERY:
2515 case OP_NOTQUERY:
2516 case OP_NOTMINQUERY:
2517 case OP_NOTPOSQUERY:
2518 ADD_ACTIVE(state_offset + dlen + 1, 0);
2519 if (clen > 0)
2520 {
2521 uint32_t otherd = NOTACHAR;
2522 if (caseless)
2523 {
2524 #ifdef SUPPORT_UNICODE
2525 if (utf_or_ucp && d >= 128)
2526 otherd = UCD_OTHERCASE(d);
2527 else
2528 #endif /* SUPPORT_UNICODE */
2529 otherd = TABLE_GET(d, fcc, d);
2530 }
2531 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2532 {
2533 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2534 {
2535 active_count--; /* Remove non-match possibility */
2536 next_active_state--;
2537 }
2538 ADD_NEW(state_offset + dlen + 1, 0);
2539 }
2540 }
2541 break;
2542
2543 /*-----------------------------------------------------------------*/
2544 case OP_STARI:
2545 case OP_MINSTARI:
2546 case OP_POSSTARI:
2547 case OP_NOTSTARI:
2548 case OP_NOTMINSTARI:
2549 case OP_NOTPOSSTARI:
2550 caseless = TRUE;
2551 codevalue -= OP_STARI - OP_STAR;
2552 /* Fall through */
2553 case OP_STAR:
2554 case OP_MINSTAR:
2555 case OP_POSSTAR:
2556 case OP_NOTSTAR:
2557 case OP_NOTMINSTAR:
2558 case OP_NOTPOSSTAR:
2559 ADD_ACTIVE(state_offset + dlen + 1, 0);
2560 if (clen > 0)
2561 {
2562 uint32_t otherd = NOTACHAR;
2563 if (caseless)
2564 {
2565 #ifdef SUPPORT_UNICODE
2566 if (utf_or_ucp && d >= 128)
2567 otherd = UCD_OTHERCASE(d);
2568 else
2569 #endif /* SUPPORT_UNICODE */
2570 otherd = TABLE_GET(d, fcc, d);
2571 }
2572 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2573 {
2574 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2575 {
2576 active_count--; /* Remove non-match possibility */
2577 next_active_state--;
2578 }
2579 ADD_NEW(state_offset, 0);
2580 }
2581 }
2582 break;
2583
2584 /*-----------------------------------------------------------------*/
2585 case OP_EXACTI:
2586 case OP_NOTEXACTI:
2587 caseless = TRUE;
2588 codevalue -= OP_STARI - OP_STAR;
2589 /* Fall through */
2590 case OP_EXACT:
2591 case OP_NOTEXACT:
2592 count = current_state->count; /* Number already matched */
2593 if (clen > 0)
2594 {
2595 uint32_t otherd = NOTACHAR;
2596 if (caseless)
2597 {
2598 #ifdef SUPPORT_UNICODE
2599 if (utf_or_ucp && d >= 128)
2600 otherd = UCD_OTHERCASE(d);
2601 else
2602 #endif /* SUPPORT_UNICODE */
2603 otherd = TABLE_GET(d, fcc, d);
2604 }
2605 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2606 {
2607 if (++count >= (int)GET2(code, 1))
2608 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2609 else
2610 { ADD_NEW(state_offset, count); }
2611 }
2612 }
2613 break;
2614
2615 /*-----------------------------------------------------------------*/
2616 case OP_UPTOI:
2617 case OP_MINUPTOI:
2618 case OP_POSUPTOI:
2619 case OP_NOTUPTOI:
2620 case OP_NOTMINUPTOI:
2621 case OP_NOTPOSUPTOI:
2622 caseless = TRUE;
2623 codevalue -= OP_STARI - OP_STAR;
2624 /* Fall through */
2625 case OP_UPTO:
2626 case OP_MINUPTO:
2627 case OP_POSUPTO:
2628 case OP_NOTUPTO:
2629 case OP_NOTMINUPTO:
2630 case OP_NOTPOSUPTO:
2631 ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2632 count = current_state->count; /* Number already matched */
2633 if (clen > 0)
2634 {
2635 uint32_t otherd = NOTACHAR;
2636 if (caseless)
2637 {
2638 #ifdef SUPPORT_UNICODE
2639 if (utf_or_ucp && d >= 128)
2640 otherd = UCD_OTHERCASE(d);
2641 else
2642 #endif /* SUPPORT_UNICODE */
2643 otherd = TABLE_GET(d, fcc, d);
2644 }
2645 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2646 {
2647 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2648 {
2649 active_count--; /* Remove non-match possibility */
2650 next_active_state--;
2651 }
2652 if (++count >= (int)GET2(code, 1))
2653 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2654 else
2655 { ADD_NEW(state_offset, count); }
2656 }
2657 }
2658 break;
2659
2660
2661 /* ========================================================================== */
2662 /* These are the class-handling opcodes */
2663
2664 case OP_CLASS:
2665 case OP_NCLASS:
2666 case OP_XCLASS:
2667 {
2668 BOOL isinclass = FALSE;
2669 int next_state_offset;
2670 PCRE2_SPTR ecode;
2671
2672 /* For a simple class, there is always just a 32-byte table, and we
2673 can set isinclass from it. */
2674
2675 if (codevalue != OP_XCLASS)
2676 {
2677 ecode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
2678 if (clen > 0)
2679 {
2680 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2681 ((((uint8_t *)(code + 1))[c/8] & (1u << (c&7))) != 0);
2682 }
2683 }
2684
2685 /* An extended class may have a table or a list of single characters,
2686 ranges, or both, and it may be positive or negative. There's a
2687 function that sorts all this out. */
2688
2689 else
2690 {
2691 ecode = code + GET(code, 1);
2692 if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2693 }
2694
2695 /* At this point, isinclass is set for all kinds of class, and ecode
2696 points to the byte after the end of the class. If there is a
2697 quantifier, this is where it will be. */
2698
2699 next_state_offset = (int)(ecode - start_code);
2700
2701 switch (*ecode)
2702 {
2703 case OP_CRSTAR:
2704 case OP_CRMINSTAR:
2705 case OP_CRPOSSTAR:
2706 ADD_ACTIVE(next_state_offset + 1, 0);
2707 if (isinclass)
2708 {
2709 if (*ecode == OP_CRPOSSTAR)
2710 {
2711 active_count--; /* Remove non-match possibility */
2712 next_active_state--;
2713 }
2714 ADD_NEW(state_offset, 0);
2715 }
2716 break;
2717
2718 case OP_CRPLUS:
2719 case OP_CRMINPLUS:
2720 case OP_CRPOSPLUS:
2721 count = current_state->count; /* Already matched */
2722 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2723 if (isinclass)
2724 {
2725 if (count > 0 && *ecode == OP_CRPOSPLUS)
2726 {
2727 active_count--; /* Remove non-match possibility */
2728 next_active_state--;
2729 }
2730 count++;
2731 ADD_NEW(state_offset, count);
2732 }
2733 break;
2734
2735 case OP_CRQUERY:
2736 case OP_CRMINQUERY:
2737 case OP_CRPOSQUERY:
2738 ADD_ACTIVE(next_state_offset + 1, 0);
2739 if (isinclass)
2740 {
2741 if (*ecode == OP_CRPOSQUERY)
2742 {
2743 active_count--; /* Remove non-match possibility */
2744 next_active_state--;
2745 }
2746 ADD_NEW(next_state_offset + 1, 0);
2747 }
2748 break;
2749
2750 case OP_CRRANGE:
2751 case OP_CRMINRANGE:
2752 case OP_CRPOSRANGE:
2753 count = current_state->count; /* Already matched */
2754 if (count >= (int)GET2(ecode, 1))
2755 { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2756 if (isinclass)
2757 {
2758 int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2759
2760 if (*ecode == OP_CRPOSRANGE && count >= (int)GET2(ecode, 1))
2761 {
2762 active_count--; /* Remove non-match possibility */
2763 next_active_state--;
2764 }
2765
2766 if (++count >= max && max != 0) /* Max 0 => no limit */
2767 { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2768 else
2769 { ADD_NEW(state_offset, count); }
2770 }
2771 break;
2772
2773 default:
2774 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2775 break;
2776 }
2777 }
2778 break;
2779
2780 /* ========================================================================== */
2781 /* These are the opcodes for fancy brackets of various kinds. We have
2782 to use recursion in order to handle them. The "always failing" assertion
2783 (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2784 though the other "backtracking verbs" are not supported. */
2785
2786 case OP_FAIL:
2787 forced_fail++; /* Count FAILs for multiple states */
2788 break;
2789
2790 case OP_ASSERT:
2791 case OP_ASSERT_NOT:
2792 case OP_ASSERTBACK:
2793 case OP_ASSERTBACK_NOT:
2794 {
2795 int rc;
2796 int *local_workspace;
2797 PCRE2_SIZE *local_offsets;
2798 PCRE2_SPTR endasscode = code + GET(code, 1);
2799 RWS_anchor *rws = (RWS_anchor *)RWS;
2800
2801 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2802 {
2803 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2804 if (rc != 0) return rc;
2805 RWS = (int *)rws;
2806 }
2807
2808 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2809 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2810 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2811
2812 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2813
2814 rc = internal_dfa_match(
2815 mb, /* static match data */
2816 code, /* this subexpression's code */
2817 ptr, /* where we currently are */
2818 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2819 local_offsets, /* offset vector */
2820 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
2821 local_workspace, /* workspace vector */
2822 RWS_RSIZE, /* size of same */
2823 rlevel, /* function recursion level */
2824 RWS); /* recursion workspace */
2825
2826 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2827
2828 if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2829 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2830 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2831 }
2832 break;
2833
2834 /*-----------------------------------------------------------------*/
2835 case OP_COND:
2836 case OP_SCOND:
2837 {
2838 int codelink = (int)GET(code, 1);
2839 PCRE2_UCHAR condcode;
2840
2841 /* Because of the way auto-callout works during compile, a callout item
2842 is inserted between OP_COND and an assertion condition. This does not
2843 happen for the other conditions. */
2844
2845 if (code[LINK_SIZE + 1] == OP_CALLOUT
2846 || code[LINK_SIZE + 1] == OP_CALLOUT_STR)
2847 {
2848 PCRE2_SIZE callout_length;
2849 rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb,
2850 1 + LINK_SIZE, &callout_length);
2851 if (rrc < 0) return rrc; /* Abandon */
2852 if (rrc > 0) break; /* Fail this thread */
2853 code += callout_length; /* Skip callout data */
2854 }
2855
2856 condcode = code[LINK_SIZE+1];
2857
2858 /* Back reference conditions and duplicate named recursion conditions
2859 are not supported */
2860
2861 if (condcode == OP_CREF || condcode == OP_DNCREF ||
2862 condcode == OP_DNRREF)
2863 return PCRE2_ERROR_DFA_UCOND;
2864
2865 /* The DEFINE condition is always false, and the assertion (?!) is
2866 converted to OP_FAIL. */
2867
2868 if (condcode == OP_FALSE || condcode == OP_FAIL)
2869 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2870
2871 /* There is also an always-true condition */
2872
2873 else if (condcode == OP_TRUE)
2874 { ADD_ACTIVE(state_offset + LINK_SIZE + 2, 0); }
2875
2876 /* The only supported version of OP_RREF is for the value RREF_ANY,
2877 which means "test if in any recursion". We can't test for specifically
2878 recursed groups. */
2879
2880 else if (condcode == OP_RREF)
2881 {
2882 unsigned int value = GET2(code, LINK_SIZE + 2);
2883 if (value != RREF_ANY) return PCRE2_ERROR_DFA_UCOND;
2884 if (mb->recursive != NULL)
2885 { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2886 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2887 }
2888
2889 /* Otherwise, the condition is an assertion */
2890
2891 else
2892 {
2893 int rc;
2894 int *local_workspace;
2895 PCRE2_SIZE *local_offsets;
2896 PCRE2_SPTR asscode = code + LINK_SIZE + 1;
2897 PCRE2_SPTR endasscode = asscode + GET(asscode, 1);
2898 RWS_anchor *rws = (RWS_anchor *)RWS;
2899
2900 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2901 {
2902 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2903 if (rc != 0) return rc;
2904 RWS = (int *)rws;
2905 }
2906
2907 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2908 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2909 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2910
2911 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2912
2913 rc = internal_dfa_match(
2914 mb, /* fixed match data */
2915 asscode, /* this subexpression's code */
2916 ptr, /* where we currently are */
2917 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2918 local_offsets, /* offset vector */
2919 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
2920 local_workspace, /* workspace vector */
2921 RWS_RSIZE, /* size of same */
2922 rlevel, /* function recursion level */
2923 RWS); /* recursion workspace */
2924
2925 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2926
2927 if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2928 if ((rc >= 0) ==
2929 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2930 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2931 else
2932 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2933 }
2934 }
2935 break;
2936
2937 /*-----------------------------------------------------------------*/
2938 case OP_RECURSE:
2939 {
2940 int rc;
2941 int *local_workspace;
2942 PCRE2_SIZE *local_offsets;
2943 RWS_anchor *rws = (RWS_anchor *)RWS;
2944 PCRE2_SPTR callpat = start_code + GET(code, 1);
2945 uint32_t recno = (callpat == mb->start_code)? 0 :
2946 GET2(callpat, 1 + LINK_SIZE);
2947
2948 if (rws->free < RWS_RSIZE + RWS_OVEC_RSIZE)
2949 {
2950 rc = more_workspace(&rws, RWS_OVEC_RSIZE, mb);
2951 if (rc != 0) return rc;
2952 RWS = (int *)rws;
2953 }
2954
2955 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2956 local_workspace = ((int *)local_offsets) + RWS_OVEC_RSIZE;
2957 rws->free -= RWS_RSIZE + RWS_OVEC_RSIZE;
2958
2959 /* Check for repeating a recursion without advancing the subject
2960 pointer or last used character. This should catch convoluted mutual
2961 recursions. (Some simple cases are caught at compile time.) */
2962
2963 for (dfa_recursion_info *ri = mb->recursive;
2964 ri != NULL;
2965 ri = ri->prevrec)
2966 {
2967 if (recno == ri->group_num && ptr == ri->subject_position &&
2968 mb->last_used_ptr == ri->last_used_ptr)
2969 return PCRE2_ERROR_RECURSELOOP;
2970 }
2971
2972 /* Remember this recursion and where we started it so as to
2973 catch infinite loops. */
2974
2975 new_recursive.group_num = recno;
2976 new_recursive.subject_position = ptr;
2977 new_recursive.last_used_ptr = mb->last_used_ptr;
2978 new_recursive.prevrec = mb->recursive;
2979 mb->recursive = &new_recursive;
2980
2981 rc = internal_dfa_match(
2982 mb, /* fixed match data */
2983 callpat, /* this subexpression's code */
2984 ptr, /* where we currently are */
2985 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2986 local_offsets, /* offset vector */
2987 RWS_OVEC_RSIZE/OVEC_UNIT, /* size of same */
2988 local_workspace, /* workspace vector */
2989 RWS_RSIZE, /* size of same */
2990 rlevel, /* function recursion level */
2991 RWS); /* recursion workspace */
2992
2993 rws->free += RWS_RSIZE + RWS_OVEC_RSIZE;
2994 mb->recursive = new_recursive.prevrec; /* Done this recursion */
2995
2996 /* Ran out of internal offsets */
2997
2998 if (rc == 0) return PCRE2_ERROR_DFA_RECURSE;
2999
3000 /* For each successful matched substring, set up the next state with a
3001 count of characters to skip before trying it. Note that the count is in
3002 characters, not bytes. */
3003
3004 if (rc > 0)
3005 {
3006 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
3007 {
3008 PCRE2_SIZE charcount = local_offsets[rc+1] - local_offsets[rc];
3009 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3010 if (utf)
3011 {
3012 PCRE2_SPTR p = start_subject + local_offsets[rc];
3013 PCRE2_SPTR pp = start_subject + local_offsets[rc+1];
3014 while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3015 }
3016 #endif
3017 if (charcount > 0)
3018 {
3019 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0,
3020 (int)(charcount - 1));
3021 }
3022 else
3023 {
3024 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
3025 }
3026 }
3027 }
3028 else if (rc != PCRE2_ERROR_NOMATCH) return rc;
3029 }
3030 break;
3031
3032 /*-----------------------------------------------------------------*/
3033 case OP_BRAPOS:
3034 case OP_SBRAPOS:
3035 case OP_CBRAPOS:
3036 case OP_SCBRAPOS:
3037 case OP_BRAPOSZERO:
3038 {
3039 int rc;
3040 int *local_workspace;
3041 PCRE2_SIZE *local_offsets;
3042 PCRE2_SIZE charcount, matched_count;
3043 PCRE2_SPTR local_ptr = ptr;
3044 RWS_anchor *rws = (RWS_anchor *)RWS;
3045 BOOL allow_zero;
3046
3047 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
3048 {
3049 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
3050 if (rc != 0) return rc;
3051 RWS = (int *)rws;
3052 }
3053
3054 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
3055 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
3056 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
3057
3058 if (codevalue == OP_BRAPOSZERO)
3059 {
3060 allow_zero = TRUE;
3061 codevalue = *(++code); /* Codevalue will be one of above BRAs */
3062 }
3063 else allow_zero = FALSE;
3064
3065 /* Loop to match the subpattern as many times as possible as if it were
3066 a complete pattern. */
3067
3068 for (matched_count = 0;; matched_count++)
3069 {
3070 rc = internal_dfa_match(
3071 mb, /* fixed match data */
3072 code, /* this subexpression's code */
3073 local_ptr, /* where we currently are */
3074 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
3075 local_offsets, /* offset vector */
3076 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
3077 local_workspace, /* workspace vector */
3078 RWS_RSIZE, /* size of same */
3079 rlevel, /* function recursion level */
3080 RWS); /* recursion workspace */
3081
3082 /* Failed to match */
3083
3084 if (rc < 0)
3085 {
3086 if (rc != PCRE2_ERROR_NOMATCH) return rc;
3087 break;
3088 }
3089
3090 /* Matched: break the loop if zero characters matched. */
3091
3092 charcount = local_offsets[1] - local_offsets[0];
3093 if (charcount == 0) break;
3094 local_ptr += charcount; /* Advance temporary position ptr */
3095 }
3096
3097 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
3098
3099 /* At this point we have matched the subpattern matched_count
3100 times, and local_ptr is pointing to the character after the end of the
3101 last match. */
3102
3103 if (matched_count > 0 || allow_zero)
3104 {
3105 PCRE2_SPTR end_subpattern = code;
3106 int next_state_offset;
3107
3108 do { end_subpattern += GET(end_subpattern, 1); }
3109 while (*end_subpattern == OP_ALT);
3110 next_state_offset =
3111 (int)(end_subpattern - start_code + LINK_SIZE + 1);
3112
3113 /* Optimization: if there are no more active states, and there
3114 are no new states yet set up, then skip over the subject string
3115 right here, to save looping. Otherwise, set up the new state to swing
3116 into action when the end of the matched substring is reached. */
3117
3118 if (i + 1 >= active_count && new_count == 0)
3119 {
3120 ptr = local_ptr;
3121 clen = 0;
3122 ADD_NEW(next_state_offset, 0);
3123 }
3124 else
3125 {
3126 PCRE2_SPTR p = ptr;
3127 PCRE2_SPTR pp = local_ptr;
3128 charcount = (PCRE2_SIZE)(pp - p);
3129 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3130 if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3131 #endif
3132 ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3133 }
3134 }
3135 }
3136 break;
3137
3138 /*-----------------------------------------------------------------*/
3139 case OP_ONCE:
3140 {
3141 int rc;
3142 int *local_workspace;
3143 PCRE2_SIZE *local_offsets;
3144 RWS_anchor *rws = (RWS_anchor *)RWS;
3145
3146 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
3147 {
3148 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
3149 if (rc != 0) return rc;
3150 RWS = (int *)rws;
3151 }
3152
3153 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
3154 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
3155 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
3156
3157 rc = internal_dfa_match(
3158 mb, /* fixed match data */
3159 code, /* this subexpression's code */
3160 ptr, /* where we currently are */
3161 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
3162 local_offsets, /* offset vector */
3163 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
3164 local_workspace, /* workspace vector */
3165 RWS_RSIZE, /* size of same */
3166 rlevel, /* function recursion level */
3167 RWS); /* recursion workspace */
3168
3169 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
3170
3171 if (rc >= 0)
3172 {
3173 PCRE2_SPTR end_subpattern = code;
3174 PCRE2_SIZE charcount = local_offsets[1] - local_offsets[0];
3175 int next_state_offset, repeat_state_offset;
3176
3177 do { end_subpattern += GET(end_subpattern, 1); }
3178 while (*end_subpattern == OP_ALT);
3179 next_state_offset =
3180 (int)(end_subpattern - start_code + LINK_SIZE + 1);
3181
3182 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
3183 arrange for the repeat state also to be added to the relevant list.
3184 Calculate the offset, or set -1 for no repeat. */
3185
3186 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
3187 *end_subpattern == OP_KETRMIN)?
3188 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
3189
3190 /* If we have matched an empty string, add the next state at the
3191 current character pointer. This is important so that the duplicate
3192 checking kicks in, which is what breaks infinite loops that match an
3193 empty string. */
3194
3195 if (charcount == 0)
3196 {
3197 ADD_ACTIVE(next_state_offset, 0);
3198 }
3199
3200 /* Optimization: if there are no more active states, and there
3201 are no new states yet set up, then skip over the subject string
3202 right here, to save looping. Otherwise, set up the new state to swing
3203 into action when the end of the matched substring is reached. */
3204
3205 else if (i + 1 >= active_count && new_count == 0)
3206 {
3207 ptr += charcount;
3208 clen = 0;
3209 ADD_NEW(next_state_offset, 0);
3210
3211 /* If we are adding a repeat state at the new character position,
3212 we must fudge things so that it is the only current state.
3213 Otherwise, it might be a duplicate of one we processed before, and
3214 that would cause it to be skipped. */
3215
3216 if (repeat_state_offset >= 0)
3217 {
3218 next_active_state = active_states;
3219 active_count = 0;
3220 i = -1;
3221 ADD_ACTIVE(repeat_state_offset, 0);
3222 }
3223 }
3224 else
3225 {
3226 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3227 if (utf)
3228 {
3229 PCRE2_SPTR p = start_subject + local_offsets[0];
3230 PCRE2_SPTR pp = start_subject + local_offsets[1];
3231 while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3232 }
3233 #endif
3234 ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3235 if (repeat_state_offset >= 0)
3236 { ADD_NEW_DATA(-repeat_state_offset, 0, (int)(charcount - 1)); }
3237 }
3238 }
3239 else if (rc != PCRE2_ERROR_NOMATCH) return rc;
3240 }
3241 break;
3242
3243
3244 /* ========================================================================== */
3245 /* Handle callouts */
3246
3247 case OP_CALLOUT:
3248 case OP_CALLOUT_STR:
3249 {
3250 PCRE2_SIZE callout_length;
3251 rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb, 0,
3252 &callout_length);
3253 if (rrc < 0) return rrc; /* Abandon */
3254 if (rrc == 0)
3255 { ADD_ACTIVE(state_offset + (int)callout_length, 0); }
3256 }
3257 break;
3258
3259
3260 /* ========================================================================== */
3261 default: /* Unsupported opcode */
3262 return PCRE2_ERROR_DFA_UITEM;
3263 }
3264
3265 NEXT_ACTIVE_STATE: continue;
3266
3267 } /* End of loop scanning active states */
3268
3269 /* We have finished the processing at the current subject character. If no
3270 new states have been set for the next character, we have found all the
3271 matches that we are going to find. If partial matching has been requested,
3272 check for appropriate conditions.
3273
  The "forced_fail" variable counts the number of (*F) encountered for the
3275 character. If it is equal to the original active_count (saved in
3276 workspace[1]) it means that (*F) was found on every active state. In this
3277 case we don't want to give a partial match.
3278
3279 The "could_continue" variable is true if a state could have continued but
3280 for the fact that the end of the subject was reached. */
3281
3282 if (new_count <= 0)
3283 {
3284 if (could_continue && /* Some could go on, and */
3285 forced_fail != workspace[1] && /* Not all forced fail & */
3286 ( /* either... */
3287 (mb->moptions & PCRE2_PARTIAL_HARD) != 0 /* Hard partial */
3288 || /* or... */
3289 ((mb->moptions & PCRE2_PARTIAL_SOFT) != 0 && /* Soft partial and */
3290 match_count < 0) /* no matches */
3291 ) && /* And... */
3292 (
3293 partial_newline || /* Either partial NL */
3294 ( /* or ... */
3295 ptr >= end_subject && /* End of subject and */
3296 ( /* either */
3297 ptr > mb->start_used_ptr || /* Inspected non-empty string */
3298 mb->allowemptypartial /* or pattern has lookbehind */
3299 ) /* or could match empty */
3300 )
3301 ))
3302 match_count = PCRE2_ERROR_PARTIAL;
3303 break; /* Exit from loop along the subject string */
3304 }
3305
3306 /* One or more states are active for the next character. */
3307
3308 ptr += clen; /* Advance to next subject character */
3309 } /* Loop to move along the subject string */
3310
3311 /* Control gets here from "break" a few lines above. If we have a match and
3312 PCRE2_ENDANCHORED is set, the match fails. */
3313
3314 if (match_count >= 0 &&
3315 ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0 &&
3316 ptr < end_subject)
3317 match_count = PCRE2_ERROR_NOMATCH;
3318
3319 return match_count;
3320 }
3321
3322
3323
3324 /*************************************************
3325 * Match a pattern using the DFA algorithm *
3326 *************************************************/
3327
3328 /* This function matches a compiled pattern to a subject string, using the
3329 alternate matching algorithm that finds all matches at once.
3330
Arguments:
  code          points to the compiled pattern
  subject       subject string
  length        length of subject string, or PCRE2_ZERO_TERMINATED
  start_offset  where to start matching in the subject
  options       option bits
  match_data    points to a match data structure
  mcontext      points to a match context, or NULL for the defaults
  workspace     pointer to workspace
  wscount       size of workspace
3341
3342 Returns: > 0 => number of match offset pairs placed in offsets
3343 = 0 => offsets overflowed; longest matches are present
3344 -1 => failed to match
3345 < -1 => some kind of unexpected problem
3346 */
3347
3348 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_dfa_match(const pcre2_code * code,PCRE2_SPTR subject,PCRE2_SIZE length,PCRE2_SIZE start_offset,uint32_t options,pcre2_match_data * match_data,pcre2_match_context * mcontext,int * workspace,PCRE2_SIZE wscount)3349 pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
3350 PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
3351 pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount)
3352 {
3353 int rc;
3354 int was_zero_terminated = 0;
3355
3356 const pcre2_real_code *re = (const pcre2_real_code *)code;
3357
3358 PCRE2_SPTR start_match;
3359 PCRE2_SPTR end_subject;
3360 PCRE2_SPTR bumpalong_limit;
3361 PCRE2_SPTR req_cu_ptr;
3362
3363 BOOL utf, anchored, startline, firstline;
3364 BOOL has_first_cu = FALSE;
3365 BOOL has_req_cu = FALSE;
3366
3367 #if PCRE2_CODE_UNIT_WIDTH == 8
3368 PCRE2_SPTR memchr_found_first_cu = NULL;
3369 PCRE2_SPTR memchr_found_first_cu2 = NULL;
3370 #endif
3371
3372 PCRE2_UCHAR first_cu = 0;
3373 PCRE2_UCHAR first_cu2 = 0;
3374 PCRE2_UCHAR req_cu = 0;
3375 PCRE2_UCHAR req_cu2 = 0;
3376
3377 const uint8_t *start_bits = NULL;
3378
3379 /* We need to have mb pointing to a match block, because the IS_NEWLINE macro
3380 is used below, and it expects NLBLOCK to be defined as a pointer. */
3381
3382 pcre2_callout_block cb;
3383 dfa_match_block actual_match_block;
3384 dfa_match_block *mb = &actual_match_block;
3385
3386 /* Set up a starting block of memory for use during recursive calls to
3387 internal_dfa_match(). By putting this on the stack, it minimizes resource use
3388 in the case when it is not needed. If this is too small, more memory is
3389 obtained from the heap. At the start of each block is an anchor structure.*/
3390
3391 int base_recursion_workspace[RWS_BASE_SIZE];
3392 RWS_anchor *rws = (RWS_anchor *)base_recursion_workspace;
3393 rws->next = NULL;
3394 rws->size = RWS_BASE_SIZE;
3395 rws->free = RWS_BASE_SIZE - RWS_ANCHOR_SIZE;
3396
3397 /* Recognize NULL, length 0 as an empty string. */
3398
3399 if (subject == NULL && length == 0) subject = (PCRE2_SPTR)"";
3400
3401 /* Plausibility checks */
3402
3403 if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
3404 if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL)
3405 return PCRE2_ERROR_NULL;
3406
3407 if (length == PCRE2_ZERO_TERMINATED)
3408 {
3409 length = PRIV(strlen)(subject);
3410 was_zero_terminated = 1;
3411 }
3412
3413 if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE;
3414 if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
3415
3416 /* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same
3417 time. */
3418
3419 if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
3420 ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
3421 return PCRE2_ERROR_BADOPTION;
3422
3423 /* Invalid UTF support is not available for DFA matching. */
3424
3425 if ((re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0)
3426 return PCRE2_ERROR_DFA_UINVALID_UTF;
3427
3428 /* Check that the first field in the block is the magic number. If it is not,
3429 return with PCRE2_ERROR_BADMAGIC. */
3430
3431 if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
3432
3433 /* Check the code unit width. */
3434
3435 if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
3436 return PCRE2_ERROR_BADMODE;
3437
3438 /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
3439 options variable for this function. Users of PCRE2 who are not calling the
3440 function directly would like to have a way of setting these flags, in the same
3441 way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with
3442 constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
3443 (*NOTEMPTY_ATSTART) set bits in the pattern's "flag" function which can now be
3444 transferred to the options for this function. The bits are guaranteed to be
3445 adjacent, but do not have the same values. This bit of Boolean trickery assumes
3446 that the match-time bits are not more significant than the flag bits. If by
3447 accident this is not the case, a compile-time division by zero error will
3448 occur. */
3449
3450 #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
3451 #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
3452 options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
3453 #undef FF
3454 #undef OO
3455
3456 /* If restarting after a partial match, do some sanity checks on the contents
3457 of the workspace. */
3458
3459 if ((options & PCRE2_DFA_RESTART) != 0)
3460 {
3461 if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3462 workspace[1] > (int)((wscount - 2)/INTS_PER_STATEBLOCK))
3463 return PCRE2_ERROR_DFA_BADRESTART;
3464 }
3465
3466 /* Set some local values */
3467
3468 utf = (re->overall_options & PCRE2_UTF) != 0;
3469 start_match = subject + start_offset;
3470 end_subject = subject + length;
3471 req_cu_ptr = start_match - 1;
3472 anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 ||
3473 (re->overall_options & PCRE2_ANCHORED) != 0;
3474
3475 /* The "must be at the start of a line" flags are used in a loop when finding
3476 where to start. */
3477
3478 startline = (re->flags & PCRE2_STARTLINE) != 0;
3479 firstline = !anchored && (re->overall_options & PCRE2_FIRSTLINE) != 0;
3480 bumpalong_limit = end_subject;
3481
3482 /* Initialize and set up the fixed fields in the callout block, with a pointer
3483 in the match block. */
3484
3485 mb->cb = &cb;
3486 cb.version = 2;
3487 cb.subject = subject;
3488 cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
3489 cb.callout_flags = 0;
3490 cb.capture_top = 1; /* No capture support */
3491 cb.capture_last = 0;
3492 cb.mark = NULL; /* No (*MARK) support */
3493
3494 /* Get data from the match context, if present, and fill in the remaining
3495 fields in the match block. It is an error to set an offset limit without
3496 setting the flag at compile time. */
3497
3498 if (mcontext == NULL)
3499 {
3500 mb->callout = NULL;
3501 mb->memctl = re->memctl;
3502 mb->match_limit = PRIV(default_match_context).match_limit;
3503 mb->match_limit_depth = PRIV(default_match_context).depth_limit;
3504 mb->heap_limit = PRIV(default_match_context).heap_limit;
3505 }
3506 else
3507 {
3508 if (mcontext->offset_limit != PCRE2_UNSET)
3509 {
3510 if ((re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
3511 return PCRE2_ERROR_BADOFFSETLIMIT;
3512 bumpalong_limit = subject + mcontext->offset_limit;
3513 }
3514 mb->callout = mcontext->callout;
3515 mb->callout_data = mcontext->callout_data;
3516 mb->memctl = mcontext->memctl;
3517 mb->match_limit = mcontext->match_limit;
3518 mb->match_limit_depth = mcontext->depth_limit;
3519 mb->heap_limit = mcontext->heap_limit;
3520 }
3521
3522 if (mb->match_limit > re->limit_match)
3523 mb->match_limit = re->limit_match;
3524
3525 if (mb->match_limit_depth > re->limit_depth)
3526 mb->match_limit_depth = re->limit_depth;
3527
3528 if (mb->heap_limit > re->limit_heap)
3529 mb->heap_limit = re->limit_heap;
3530
3531 mb->start_code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
3532 re->name_count * re->name_entry_size;
3533 mb->tables = re->tables;
3534 mb->start_subject = subject;
3535 mb->end_subject = end_subject;
3536 mb->start_offset = start_offset;
3537 mb->allowemptypartial = (re->max_lookbehind > 0) ||
3538 (re->flags & PCRE2_MATCH_EMPTY) != 0;
3539 mb->moptions = options;
3540 mb->poptions = re->overall_options;
3541 mb->match_call_count = 0;
3542 mb->heap_used = 0;
3543
3544 /* Process the \R and newline settings. */
3545
3546 mb->bsr_convention = re->bsr_convention;
3547 mb->nltype = NLTYPE_FIXED;
3548 switch(re->newline_convention)
3549 {
3550 case PCRE2_NEWLINE_CR:
3551 mb->nllen = 1;
3552 mb->nl[0] = CHAR_CR;
3553 break;
3554
3555 case PCRE2_NEWLINE_LF:
3556 mb->nllen = 1;
3557 mb->nl[0] = CHAR_NL;
3558 break;
3559
3560 case PCRE2_NEWLINE_NUL:
3561 mb->nllen = 1;
3562 mb->nl[0] = CHAR_NUL;
3563 break;
3564
3565 case PCRE2_NEWLINE_CRLF:
3566 mb->nllen = 2;
3567 mb->nl[0] = CHAR_CR;
3568 mb->nl[1] = CHAR_NL;
3569 break;
3570
3571 case PCRE2_NEWLINE_ANY:
3572 mb->nltype = NLTYPE_ANY;
3573 break;
3574
3575 case PCRE2_NEWLINE_ANYCRLF:
3576 mb->nltype = NLTYPE_ANYCRLF;
3577 break;
3578
3579 default: return PCRE2_ERROR_INTERNAL;
3580 }
3581
3582 /* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
3583 we must also check that a starting offset does not point into the middle of a
3584 multiunit character. We check only the portion of the subject that is going to
3585 be inspected during matching - from the offset minus the maximum back reference
3586 to the given length. This saves time when a small part of a large subject is
3587 being matched by the use of a starting offset. Note that the maximum lookbehind
3588 is a number of characters, not code units. */
3589
3590 #ifdef SUPPORT_UNICODE
3591 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
3592 {
3593 PCRE2_SPTR check_subject = start_match; /* start_match includes offset */
3594
3595 if (start_offset > 0)
3596 {
3597 #if PCRE2_CODE_UNIT_WIDTH != 32
3598 unsigned int i;
3599 if (start_match < end_subject && NOT_FIRSTCU(*start_match))
3600 return PCRE2_ERROR_BADUTFOFFSET;
3601 for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
3602 {
3603 check_subject--;
3604 while (check_subject > subject &&
3605 #if PCRE2_CODE_UNIT_WIDTH == 8
3606 (*check_subject & 0xc0) == 0x80)
3607 #else /* 16-bit */
3608 (*check_subject & 0xfc00) == 0xdc00)
3609 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
3610 check_subject--;
3611 }
3612 #else /* In the 32-bit library, one code unit equals one character. */
3613 check_subject -= re->max_lookbehind;
3614 if (check_subject < subject) check_subject = subject;
3615 #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
3616 }
3617
3618 /* Validate the relevant portion of the subject. After an error, adjust the
3619 offset to be an absolute offset in the whole string. */
3620
3621 match_data->rc = PRIV(valid_utf)(check_subject,
3622 length - (PCRE2_SIZE)(check_subject - subject), &(match_data->startchar));
3623 if (match_data->rc != 0)
3624 {
3625 match_data->startchar += (PCRE2_SIZE)(check_subject - subject);
3626 return match_data->rc;
3627 }
3628 }
3629 #endif /* SUPPORT_UNICODE */
3630
3631 /* Set up the first code unit to match, if available. If there's no first code
3632 unit there may be a bitmap of possible first characters. */
3633
3634 if ((re->flags & PCRE2_FIRSTSET) != 0)
3635 {
3636 has_first_cu = TRUE;
3637 first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
3638 if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
3639 {
3640 first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
3641 #ifdef SUPPORT_UNICODE
3642 #if PCRE2_CODE_UNIT_WIDTH == 8
3643 if (first_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
3644 first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3645 #else
3646 if (first_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
3647 first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3648 #endif
3649 #endif /* SUPPORT_UNICODE */
3650 }
3651 }
3652 else
3653 if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
3654 start_bits = re->start_bitmap;
3655
3656 /* There may be a "last known required code unit" set. */
3657
3658 if ((re->flags & PCRE2_LASTSET) != 0)
3659 {
3660 has_req_cu = TRUE;
3661 req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
3662 if ((re->flags & PCRE2_LASTCASELESS) != 0)
3663 {
3664 req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
3665 #ifdef SUPPORT_UNICODE
3666 #if PCRE2_CODE_UNIT_WIDTH == 8
3667 if (req_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
3668 req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3669 #else
3670 if (req_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
3671 req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3672 #endif
3673 #endif /* SUPPORT_UNICODE */
3674 }
3675 }
3676
3677 /* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT,
3678 free the memory that was obtained. */
3679
3680 if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0)
3681 {
3682 match_data->memctl.free((void *)match_data->subject,
3683 match_data->memctl.memory_data);
3684 match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT;
3685 }
3686
3687 /* Fill in fields that are always returned in the match data. */
3688
3689 match_data->code = re;
3690 match_data->subject = NULL; /* Default for no match */
3691 match_data->mark = NULL;
3692 match_data->matchedby = PCRE2_MATCHEDBY_DFA_INTERPRETER;
3693
3694 /* Call the main matching function, looping for a non-anchored regex after a
3695 failed match. If not restarting, perform certain optimizations at the start of
3696 a match. */
3697
3698 for (;;)
3699 {
3700 /* ----------------- Start of match optimizations ---------------- */
3701
3702 /* There are some optimizations that avoid running the match if a known
3703 starting point is not found, or if a known later code unit is not present.
3704 However, there is an option (settable at compile time) that disables
3705 these, for testing and for ensuring that all callouts do actually occur.
3706 The optimizations must also be avoided when restarting a DFA match. */
3707
3708 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
3709 (options & PCRE2_DFA_RESTART) == 0)
3710 {
3711 /* If firstline is TRUE, the start of the match is constrained to the first
3712 line of a multiline string. That is, the match must be before or at the
3713 first newline following the start of matching. Temporarily adjust
3714 end_subject so that we stop the optimization scans for a first code unit
3715 immediately after the first character of a newline (the first code unit can
3716 legitimately be a newline). If the match fails at the newline, later code
3717 breaks this loop. */
3718
3719 if (firstline)
3720 {
3721 PCRE2_SPTR t = start_match;
3722 #ifdef SUPPORT_UNICODE
3723 if (utf)
3724 {
3725 while (t < end_subject && !IS_NEWLINE(t))
3726 {
3727 t++;
3728 ACROSSCHAR(t < end_subject, t, t++);
3729 }
3730 }
3731 else
3732 #endif
3733 while (t < end_subject && !IS_NEWLINE(t)) t++;
3734 end_subject = t;
3735 }
3736
3737 /* Anchored: check the first code unit if one is recorded. This may seem
3738 pointless but it can help in detecting a no match case without scanning for
3739 the required code unit. */
3740
3741 if (anchored)
3742 {
3743 if (has_first_cu || start_bits != NULL)
3744 {
3745 BOOL ok = start_match < end_subject;
3746 if (ok)
3747 {
3748 PCRE2_UCHAR c = UCHAR21TEST(start_match);
3749 ok = has_first_cu && (c == first_cu || c == first_cu2);
3750 if (!ok && start_bits != NULL)
3751 {
3752 #if PCRE2_CODE_UNIT_WIDTH != 8
3753 if (c > 255) c = 255;
3754 #endif
3755 ok = (start_bits[c/8] & (1u << (c&7))) != 0;
3756 }
3757 }
3758 if (!ok) break;
3759 }
3760 }
3761
3762 /* Not anchored. Advance to a unique first code unit if there is one. */
3763
3764 else
3765 {
3766 if (has_first_cu)
3767 {
3768 if (first_cu != first_cu2) /* Caseless */
3769 {
3770 /* In 16-bit and 32_bit modes we have to do our own search, so can
3771 look for both cases at once. */
3772
3773 #if PCRE2_CODE_UNIT_WIDTH != 8
3774 PCRE2_UCHAR smc;
3775 while (start_match < end_subject &&
3776 (smc = UCHAR21TEST(start_match)) != first_cu &&
3777 smc != first_cu2)
3778 start_match++;
3779 #else
3780 /* In 8-bit mode, the use of memchr() gives a big speed up, even
3781 though we have to call it twice in order to find the earliest
3782 occurrence of the code unit in either of its cases. Caching is used
3783 to remember the positions of previously found code units. This can
3784 make a huge difference when the strings are very long and only one
3785 case is actually present. */
3786
3787 PCRE2_SPTR pp1 = NULL;
3788 PCRE2_SPTR pp2 = NULL;
3789 PCRE2_SIZE searchlength = end_subject - start_match;
3790
3791 /* If we haven't got a previously found position for first_cu, or if
3792 the current starting position is later, we need to do a search. If
3793 the code unit is not found, set it to the end. */
3794
3795 if (memchr_found_first_cu == NULL ||
3796 start_match > memchr_found_first_cu)
3797 {
3798 pp1 = memchr(start_match, first_cu, searchlength);
3799 memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1;
3800 }
3801
3802 /* If the start is before a previously found position, use the
3803 previous position, or NULL if a previous search failed. */
3804
3805 else pp1 = (memchr_found_first_cu == end_subject)? NULL :
3806 memchr_found_first_cu;
3807
3808 /* Do the same thing for the other case. */
3809
3810 if (memchr_found_first_cu2 == NULL ||
3811 start_match > memchr_found_first_cu2)
3812 {
3813 pp2 = memchr(start_match, first_cu2, searchlength);
3814 memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2;
3815 }
3816
3817 else pp2 = (memchr_found_first_cu2 == end_subject)? NULL :
3818 memchr_found_first_cu2;
3819
3820 /* Set the start to the end of the subject if neither case was found.
3821 Otherwise, use the earlier found point. */
3822
3823 if (pp1 == NULL)
3824 start_match = (pp2 == NULL)? end_subject : pp2;
3825 else
3826 start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
3827
3828 #endif /* 8-bit handling */
3829 }
3830
3831 /* The caseful case is much simpler. */
3832
3833 else
3834 {
3835 #if PCRE2_CODE_UNIT_WIDTH != 8
3836 while (start_match < end_subject && UCHAR21TEST(start_match) !=
3837 first_cu)
3838 start_match++;
3839 #else /* 8-bit code units */
3840 start_match = memchr(start_match, first_cu, end_subject - start_match);
3841 if (start_match == NULL) start_match = end_subject;
3842 #endif
3843 }
3844
3845 /* If we can't find the required code unit, having reached the true end
3846 of the subject, break the bumpalong loop, to force a match failure,
3847 except when doing partial matching, when we let the next cycle run at
3848 the end of the subject. To see why, consider the pattern /(?<=abc)def/,
3849 which partially matches "abc", even though the string does not contain
3850 the starting character "d". If we have not reached the true end of the
3851 subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
3852 we also let the cycle run, because the matching string is legitimately
3853 allowed to start with the first code unit of a newline. */
3854
3855 if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3856 start_match >= mb->end_subject)
3857 break;
3858 }
3859
3860 /* If there's no first code unit, advance to just after a linebreak for a
3861 multiline match if required. */
3862
3863 else if (startline)
3864 {
3865 if (start_match > mb->start_subject + start_offset)
3866 {
3867 #ifdef SUPPORT_UNICODE
3868 if (utf)
3869 {
3870 while (start_match < end_subject && !WAS_NEWLINE(start_match))
3871 {
3872 start_match++;
3873 ACROSSCHAR(start_match < end_subject, start_match, start_match++);
3874 }
3875 }
3876 else
3877 #endif
3878 while (start_match < end_subject && !WAS_NEWLINE(start_match))
3879 start_match++;
3880
3881 /* If we have just passed a CR and the newline option is ANY or
3882 ANYCRLF, and we are now at a LF, advance the match position by one
3883 more code unit. */
3884
3885 if (start_match[-1] == CHAR_CR &&
3886 (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
3887 start_match < end_subject &&
3888 UCHAR21TEST(start_match) == CHAR_NL)
3889 start_match++;
3890 }
3891 }
3892
3893 /* If there's no first code unit or a requirement for a multiline line
3894 start, advance to a non-unique first code unit if any have been
3895 identified. The bitmap contains only 256 bits. When code units are 16 or
3896 32 bits wide, all code units greater than 254 set the 255 bit. */
3897
3898 else if (start_bits != NULL)
3899 {
3900 while (start_match < end_subject)
3901 {
3902 uint32_t c = UCHAR21TEST(start_match);
3903 #if PCRE2_CODE_UNIT_WIDTH != 8
3904 if (c > 255) c = 255;
3905 #endif
3906 if ((start_bits[c/8] & (1u << (c&7))) != 0) break;
3907 start_match++;
3908 }
3909
3910 /* See comment above in first_cu checking about the next line. */
3911
3912 if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3913 start_match >= mb->end_subject)
3914 break;
3915 }
3916 } /* End of first code unit handling */
3917
3918 /* Restore fudged end_subject */
3919
3920 end_subject = mb->end_subject;
3921
3922 /* The following two optimizations are disabled for partial matching. */
3923
3924 if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0)
3925 {
3926 PCRE2_SPTR p;
3927
3928 /* The minimum matching length is a lower bound; no actual string of that
3929 length may actually match the pattern. Although the value is, strictly,
3930 in characters, we treat it as code units to avoid spending too much time
3931 in this optimization. */
3932
3933 if (end_subject - start_match < re->minlength) goto NOMATCH_EXIT;
3934
3935 /* If req_cu is set, we know that that code unit must appear in the
3936 subject for the match to succeed. If the first code unit is set, req_cu
3937 must be later in the subject; otherwise the test starts at the match
3938 point. This optimization can save a huge amount of backtracking in
3939 patterns with nested unlimited repeats that aren't going to match.
3940 Writing separate code for cased/caseless versions makes it go faster, as
3941 does using an autoincrement and backing off on a match. As in the case of
3942 the first code unit, using memchr() in the 8-bit library gives a big
3943 speed up. Unlike the first_cu check above, we do not need to call
3944 memchr() twice in the caseless case because we only need to check for the
3945 presence of the character in either case, not find the first occurrence.
3946
3947 The search can be skipped if the code unit was found later than the
3948 current starting point in a previous iteration of the bumpalong loop.
3949
3950 HOWEVER: when the subject string is very, very long, searching to its end
3951 can take a long time, and give bad performance on quite ordinary
3952 patterns. This showed up when somebody was matching something like
3953 /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
3954 sufficiently long, but it's worth searching a lot more for unanchored
3955 patterns. */
3956
3957 p = start_match + (has_first_cu? 1:0);
3958 if (has_req_cu && p > req_cu_ptr)
3959 {
3960 PCRE2_SIZE check_length = end_subject - start_match;
3961
3962 if (check_length < REQ_CU_MAX ||
3963 (!anchored && check_length < REQ_CU_MAX * 1000))
3964 {
3965 if (req_cu != req_cu2) /* Caseless */
3966 {
3967 #if PCRE2_CODE_UNIT_WIDTH != 8
3968 while (p < end_subject)
3969 {
3970 uint32_t pp = UCHAR21INCTEST(p);
3971 if (pp == req_cu || pp == req_cu2) { p--; break; }
3972 }
3973 #else /* 8-bit code units */
3974 PCRE2_SPTR pp = p;
3975 p = memchr(pp, req_cu, end_subject - pp);
3976 if (p == NULL)
3977 {
3978 p = memchr(pp, req_cu2, end_subject - pp);
3979 if (p == NULL) p = end_subject;
3980 }
3981 #endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
3982 }
3983
3984 /* The caseful case */
3985
3986 else
3987 {
3988 #if PCRE2_CODE_UNIT_WIDTH != 8
3989 while (p < end_subject)
3990 {
3991 if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
3992 }
3993
3994 #else /* 8-bit code units */
3995 p = memchr(p, req_cu, end_subject - p);
3996 if (p == NULL) p = end_subject;
3997 #endif
3998 }
3999
4000 /* If we can't find the required code unit, break the matching loop,
4001 forcing a match failure. */
4002
4003 if (p >= end_subject) break;
4004
4005 /* If we have found the required code unit, save the point where we
4006 found it, so that we don't search again next time round the loop if
4007 the start hasn't passed this code unit yet. */
4008
4009 req_cu_ptr = p;
4010 }
4011 }
4012 }
4013 }
4014
4015 /* ------------ End of start of match optimizations ------------ */
4016
4017 /* Give no match if we have passed the bumpalong limit. */
4018
4019 if (start_match > bumpalong_limit) break;
4020
4021 /* OK, now we can do the business */
4022
4023 mb->start_used_ptr = start_match;
4024 mb->last_used_ptr = start_match;
4025 mb->recursive = NULL;
4026
4027 rc = internal_dfa_match(
4028 mb, /* fixed match data */
4029 mb->start_code, /* this subexpression's code */
4030 start_match, /* where we currently are */
4031 start_offset, /* start offset in subject */
4032 match_data->ovector, /* offset vector */
4033 (uint32_t)match_data->oveccount * 2, /* actual size of same */
4034 workspace, /* workspace vector */
4035 (int)wscount, /* size of same */
4036 0, /* function recurse level */
4037 base_recursion_workspace); /* initial workspace for recursion */
4038
4039 /* Anything other than "no match" means we are done, always; otherwise, carry
4040 on only if not anchored. */
4041
4042 if (rc != PCRE2_ERROR_NOMATCH || anchored)
4043 {
4044 if (rc == PCRE2_ERROR_PARTIAL && match_data->oveccount > 0)
4045 {
4046 match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject);
4047 match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject);
4048 }
4049 match_data->subject_length = length;
4050 match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject);
4051 match_data->rightchar = (PCRE2_SIZE)(mb->last_used_ptr - subject);
4052 match_data->startchar = (PCRE2_SIZE)(start_match - subject);
4053 match_data->rc = rc;
4054
4055 if (rc >= 0 &&(options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
4056 {
4057 length = CU2BYTES(length + was_zero_terminated);
4058 match_data->subject = match_data->memctl.malloc(length,
4059 match_data->memctl.memory_data);
4060 if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;
4061 memcpy((void *)match_data->subject, subject, length);
4062 match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
4063 }
4064 else
4065 {
4066 if (rc >= 0 || rc == PCRE2_ERROR_PARTIAL) match_data->subject = subject;
4067 }
4068 goto EXIT;
4069 }
4070
4071 /* Advance to the next subject character unless we are at the end of a line
4072 and firstline is set. */
4073
4074 if (firstline && IS_NEWLINE(start_match)) break;
4075 start_match++;
4076 #ifdef SUPPORT_UNICODE
4077 if (utf)
4078 {
4079 ACROSSCHAR(start_match < end_subject, start_match, start_match++);
4080 }
4081 #endif
4082 if (start_match > end_subject) break;
4083
4084 /* If we have just passed a CR and we are now at a LF, and the pattern does
4085 not contain any explicit matches for \r or \n, and the newline option is CRLF
4086 or ANY or ANYCRLF, advance the match position by one more character. */
4087
4088 if (UCHAR21TEST(start_match - 1) == CHAR_CR &&
4089 start_match < end_subject &&
4090 UCHAR21TEST(start_match) == CHAR_NL &&
4091 (re->flags & PCRE2_HASCRORLF) == 0 &&
4092 (mb->nltype == NLTYPE_ANY ||
4093 mb->nltype == NLTYPE_ANYCRLF ||
4094 mb->nllen == 2))
4095 start_match++;
4096
4097 } /* "Bumpalong" loop */
4098
4099 NOMATCH_EXIT:
4100 rc = PCRE2_ERROR_NOMATCH;
4101
4102 EXIT:
4103 while (rws->next != NULL)
4104 {
4105 RWS_anchor *next = rws->next;
4106 rws->next = next->next;
4107 mb->memctl.free(next, mb->memctl.memory_data);
4108 }
4109
4110 return rc;
4111 }
4112
4113 /* These #undefs are here to enable unity builds with CMake. */
4114
4115 #undef NLBLOCK /* Block containing newline information */
4116 #undef PSSTART /* Field containing processed string start */
4117 #undef PSEND /* Field containing processed string end */
4118
4119 /* End of pcre2_dfa_match.c */
4120