xref: /PHP-8.3/ext/pcre/pcre2lib/pcre2_dfa_match.c (revision c4e8f652)
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10           New API code Copyright (c) 2016-2022 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 
42 /* This module contains the external function pcre2_dfa_match(), which is an
43 alternative matching function that uses a sort of DFA algorithm (not a true
44 FSM). This is NOT Perl-compatible, but it has advantages in certain
45 applications. */
46 
47 
48 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49 the performance of his patterns greatly. I could not use it as it stood, as it
50 was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 test 7 to loop, and test 9 to crash with a segfault.
52 
53 The issue is the check for duplicate states, which is done by a simple linear
54 search up the state list. (Grep for "duplicate" below to find the code.) For
55 many patterns, there will never be many states active at one time, so a simple
56 linear search is fine. In patterns that have many active states, it might be a
57 bottleneck. The suggested code used an indexing scheme to remember which states
58 had previously been used for each character, and avoided the linear search when
59 it knew there was no chance of a duplicate. This was implemented when adding
60 states to the state lists.
61 
62 I wrote some thread-safe, not-limited code to try something similar at the time
63 of checking for duplicates (instead of when adding states), using index vectors
64 on the stack. It did give a 13% improvement with one specially constructed
65 pattern for certain subject strings, but on other strings and on many of the
66 simpler patterns in the test suite it did worse. The major problem, I think,
67 was the extra time to initialize the index. This had to be done for each call
68 of internal_dfa_match(). (The supplied patch used a static vector, initialized
69 only once - I suspect this was the cause of the problems with the tests.)
70 
71 Overall, I concluded that the gains in some cases did not outweigh the losses
72 in others, so I abandoned this code. */
73 
74 
75 #ifdef HAVE_CONFIG_H
76 #include "config.h"
77 #endif
78 
79 #define NLBLOCK mb             /* Block containing newline information */
80 #define PSSTART start_subject  /* Field containing processed string start */
81 #define PSEND   end_subject    /* Field containing processed string end */
82 
83 #include "pcre2_internal.h"
84 
/* Mask built from the PCRE2_xxx option bits listed below. NOTE(review):
presumably this is the set of options that pcre2_dfa_match() accepts from its
caller - confirm at the point of use. */

#define PUBLIC_DFA_MATCH_OPTIONS \
  (PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \
   PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \
   PCRE2_PARTIAL_SOFT|PCRE2_DFA_SHORTEST|PCRE2_DFA_RESTART| \
   PCRE2_COPY_MATCHED_SUBJECT)
90 
91 
92 /*************************************************
93 *      Code parameters and static tables         *
94 *************************************************/
95 
/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
into others, under special conditions. A gap of 20 between the blocks should be
enough. The resulting opcodes don't have to be less than 256 because they are
never stored, so we push them well clear of the normal opcodes. Each value
below is the base of one such block of 20. */

#define OP_PROP_EXTRA       300
#define OP_EXTUNI_EXTRA     320
#define OP_ANYNL_EXTRA      340
#define OP_HSPACE_EXTRA     360
#define OP_VSPACE_EXTRA     380
106 
107 
/* This table identifies those opcodes that are followed immediately by a
character that is to be tested in some way. This makes it possible to
centralize the loading of these characters. In the case of Type * etc, the
"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
small value.

Each entry is the offset from the opcode at which the character is to be found;
zero means that there is no such character. The entries are in opcode order, as
named by the comment on each row. The upto, minupto, and exact rows (and their
variants) use 1+IMM2_SIZE rather than 1; presumably an IMM2_SIZE-unit repeat
count sits between the opcode and the character - confirm against the opcode
definitions.

***NOTE*** If the start of this table is modified, the three tables that follow
must also be modified. */

static const uint8_t coptable[] = {
  0,                             /* End                                    */
  0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
  0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
  0, 0, 0,                       /* Any, AllAny, Anybyte                   */
  0, 0,                          /* \P, \p                                 */
  0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
  0,                             /* \X                                     */
  0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
  1,                             /* Char                                   */
  1,                             /* Chari                                  */
  1,                             /* not                                    */
  1,                             /* noti                                   */
  /* Positive single-char repeats                                          */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto                          */
  1+IMM2_SIZE,                   /* exact                                  */
  1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
  1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
  1+IMM2_SIZE,                   /* exact I                                */
  1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
  /* Negative single-char repeats - only for chars < 256                   */
  1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
  1+IMM2_SIZE,                   /* NOT exact                              */
  1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
  1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
  1+IMM2_SIZE,                   /* NOT exact I                            */
  1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
  /* Positive type repeats                                                 */
  1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
  1+IMM2_SIZE,                   /* Type exact                             */
  1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
  /* Character class & ref repeats                                         */
  0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
  0, 0,                          /* CRRANGE, CRMINRANGE                    */
  0, 0, 0, 0,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
  0,                             /* CLASS                                  */
  0,                             /* NCLASS                                 */
  0,                             /* XCLASS - variable length               */
  0,                             /* REF                                    */
  0,                             /* REFI                                   */
  0,                             /* DNREF                                  */
  0,                             /* DNREFI                                 */
  0,                             /* RECURSE                                */
  0,                             /* CALLOUT                                */
  0,                             /* CALLOUT_STR                            */
  0,                             /* Alt                                    */
  0,                             /* Ket                                    */
  0,                             /* KetRmax                                */
  0,                             /* KetRmin                                */
  0,                             /* KetRpos                                */
  0,                             /* Reverse                                */
  0,                             /* Assert                                 */
  0,                             /* Assert not                             */
  0,                             /* Assert behind                          */
  0,                             /* Assert behind not                      */
  0,                             /* NA assert                              */
  0,                             /* NA assert behind                       */
  0,                             /* ONCE                                   */
  0,                             /* SCRIPT_RUN                             */
  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
  0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
  0, 0,                          /* CREF, DNCREF                           */
  0, 0,                          /* RREF, DNRREF                           */
  0, 0,                          /* FALSE, TRUE                            */
  0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
  0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
  0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
  0, 0,                          /* COMMIT, COMMIT_ARG                     */
  0, 0, 0,                       /* FAIL, ACCEPT, ASSERT_ACCEPT            */
  0, 0, 0                        /* CLOSE, SKIPZERO, DEFINE                */
};
192 
/* This table identifies those opcodes that inspect a character. It is used to
remember the fact that a character could have been inspected when the end of
the subject is reached. An entry of 1 marks an opcode that inspects a
character and 0 one that does not; the rows are in opcode order, as named by
the comment on each row. ***NOTE*** If the start of this table is modified, the
two tables that follow must also be modified. */

static const uint8_t poptable[] = {
  0,                             /* End                                    */
  0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
  1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
  1, 1, 1,                       /* Any, AllAny, Anybyte                   */
  1, 1,                          /* \P, \p                                 */
  1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
  1,                             /* \X                                     */
  0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
  1,                             /* Char                                   */
  1,                             /* Chari                                  */
  1,                             /* not                                    */
  1,                             /* noti                                   */
  /* Positive single-char repeats                                          */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  1, 1, 1,                       /* upto, minupto, exact                   */
  1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
  1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
  1, 1, 1,                       /* upto I, minupto I, exact I             */
  1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
  /* Negative single-char repeats - only for chars < 256                   */
  1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
  1, 1, 1,                       /* NOT upto, minupto, exact               */
  1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
  1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
  1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
  1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
  /* Positive type repeats                                                 */
  1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
  1, 1, 1,                       /* Type upto, minupto, exact              */
  1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
  /* Character class & ref repeats                                         */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  1, 1,                          /* CRRANGE, CRMINRANGE                    */
  1, 1, 1, 1,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
  1,                             /* CLASS                                  */
  1,                             /* NCLASS                                 */
  1,                             /* XCLASS - variable length               */
  0,                             /* REF                                    */
  0,                             /* REFI                                   */
  0,                             /* DNREF                                  */
  0,                             /* DNREFI                                 */
  0,                             /* RECURSE                                */
  0,                             /* CALLOUT                                */
  0,                             /* CALLOUT_STR                            */
  0,                             /* Alt                                    */
  0,                             /* Ket                                    */
  0,                             /* KetRmax                                */
  0,                             /* KetRmin                                */
  0,                             /* KetRpos                                */
  0,                             /* Reverse                                */
  0,                             /* Assert                                 */
  0,                             /* Assert not                             */
  0,                             /* Assert behind                          */
  0,                             /* Assert behind not                      */
  0,                             /* NA assert                              */
  0,                             /* NA assert behind                       */
  0,                             /* ONCE                                   */
  0,                             /* SCRIPT_RUN                             */
  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
  0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
  0, 0,                          /* CREF, DNCREF                           */
  0, 0,                          /* RREF, DNRREF                           */
  0, 0,                          /* FALSE, TRUE                            */
  0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
  0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
  0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
  0, 0,                          /* COMMIT, COMMIT_ARG                     */
  0, 0, 0,                       /* FAIL, ACCEPT, ASSERT_ACCEPT            */
  0, 0, 0                        /* CLOSE, SKIPZERO, DEFINE                */
};
269 
/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
and \w. Both hold fourteen entries laid out in opcode order: six zero entries
for the opcodes that precede \D, then \D, \d, \S, \s, \W, \w, then OP_ANY and
OP_ALLANY.

This first table holds the ctype bit that is relevant to each of the six
character types, the same bit for the negated and the positive form, and zero
everywhere else. NOTE(review): presumably it is used as a mask on a character's
ctypes entry, the result then being combined with toptable2 - confirm at the
point of use. */

static const uint8_t toptable1[] = {
  0, 0, 0, 0, 0, 0,
  ctype_digit, ctype_digit,
  ctype_space, ctype_space,
  ctype_word,  ctype_word,
  0, 0                            /* OP_ANY, OP_ALLANY */
};
280 
/* Companion to toptable1, with the same fourteen-entry layout. Here only the
negated types (\D, \S, \W) carry their ctype bit, the positive types (\d, \s,
\w) are zero, and OP_ANY and OP_ALLANY are 1. NOTE(review): presumably this is
the value that flips the sense of the masked test for the negated types and
makes the two "any" opcodes always succeed - confirm at the point of use. */

static const uint8_t toptable2[] = {
  0, 0, 0, 0, 0, 0,
  ctype_digit, 0,
  ctype_space, 0,
  ctype_word,  0,
  1, 1                            /* OP_ANY, OP_ALLANY */
};
288 
289 
/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints.

The ADD_ACTIVE and ADD_NEW macros set only offset and count; the _DATA variants
set data as well. When the branches of a backward assertion are set up, each
one is added with its code offset negated and with data set to the distance
actually moved back minus the branch's own lookbehind length. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode (-ve has meaning) */
  int count;                      /* Count for repeats */
  int data;                       /* Some use extra data */
} stateblock;

/* The number of ints that one stateblock occupies in the working vector */

#define INTS_PER_STATEBLOCK  (int)(sizeof(stateblock)/sizeof(int))
302 
303 
/* Before version 10.32 the recursive calls of internal_dfa_match() were passed
local working space and output vectors that were created on the stack. This has
caused issues for some patterns, especially in small-stack environments such as
Windows. A new scheme is now in use which sets up a vector on the stack, but if
this is too small, heap memory is used, up to the heap_limit. The main
parameters are all numbers of ints because the workspace is a vector of ints.

The size of the starting stack vector, DFA_START_RWS_SIZE, is in bytes, and is
defined in pcre2_internal.h so as to be available to pcre2test when it is
finding the minimum heap requirement for a match. */

/* The number of ints that one PCRE2_SIZE ovector entry occupies */

#define OVEC_UNIT  (sizeof(PCRE2_SIZE)/sizeof(int))

#define RWS_BASE_SIZE   (DFA_START_RWS_SIZE/sizeof(int))  /* Stack vector */
#define RWS_RSIZE       1000                    /* Work size for recursion */
#define RWS_OVEC_RSIZE  (1000*OVEC_UNIT)        /* Ovector for recursion */
#define RWS_OVEC_OSIZE  (2*OVEC_UNIT)           /* Ovector in other cases */
321 
/* This structure is at the start of each workspace block. The blocks form a
chain through the next field: more_workspace() either moves on to an existing
next block or allocates a new one and links it in. */

typedef struct RWS_anchor {
  struct RWS_anchor *next;  /* Following block in the chain, or NULL */
  uint32_t size;  /* Number of ints in the whole block, this anchor included */
  uint32_t free;  /* Number of ints; more_workspace() resets it to
                     size - RWS_ANCHOR_SIZE */
} RWS_anchor;

/* The number of ints that the anchor itself occupies */

#define RWS_ANCHOR_SIZE (sizeof(RWS_anchor)/sizeof(int))
331 
332 
333 
334 /*************************************************
335 *               Process a callout                *
336 *************************************************/
337 
338 /* This function is called to perform a callout.
339 
340 Arguments:
341   code              current code pointer
342   offsets           points to current capture offsets
343   current_subject   start of current subject match
344   ptr               current position in subject
345   mb                the match block
346   extracode         extra code offset when called from condition
347   lengthptr         where to return the callout length
348 
349 Returns:            the return from the callout
350 */
351 
352 static int
do_callout_dfa(PCRE2_SPTR code,PCRE2_SIZE * offsets,PCRE2_SPTR current_subject,PCRE2_SPTR ptr,dfa_match_block * mb,PCRE2_SIZE extracode,PCRE2_SIZE * lengthptr)353 do_callout_dfa(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject,
354   PCRE2_SPTR ptr, dfa_match_block *mb, PCRE2_SIZE extracode,
355   PCRE2_SIZE *lengthptr)
356 {
357 pcre2_callout_block *cb = mb->cb;
358 
359 *lengthptr = (code[extracode] == OP_CALLOUT)?
360   (PCRE2_SIZE)PRIV(OP_lengths)[OP_CALLOUT] :
361   (PCRE2_SIZE)GET(code, 1 + 2*LINK_SIZE + extracode);
362 
363 if (mb->callout == NULL) return 0;    /* No callout provided */
364 
365 /* Fixed fields in the callout block are set once and for all at the start of
366 matching. */
367 
368 cb->offset_vector    = offsets;
369 cb->start_match      = (PCRE2_SIZE)(current_subject - mb->start_subject);
370 cb->current_position = (PCRE2_SIZE)(ptr - mb->start_subject);
371 cb->pattern_position = GET(code, 1 + extracode);
372 cb->next_item_length = GET(code, 1 + LINK_SIZE + extracode);
373 
374 if (code[extracode] == OP_CALLOUT)
375   {
376   cb->callout_number = code[1 + 2*LINK_SIZE + extracode];
377   cb->callout_string_offset = 0;
378   cb->callout_string = NULL;
379   cb->callout_string_length = 0;
380   }
381 else
382   {
383   cb->callout_number = 0;
384   cb->callout_string_offset = GET(code, 1 + 3*LINK_SIZE + extracode);
385   cb->callout_string = code + (1 + 4*LINK_SIZE + extracode) + 1;
386   cb->callout_string_length = *lengthptr - (1 + 4*LINK_SIZE) - 2;
387   }
388 
389 return (mb->callout)(cb, mb->callout_data);
390 }
391 
392 
393 
394 /*************************************************
395 *         Expand local workspace memory          *
396 *************************************************/
397 
398 /* This function is called when internal_dfa_match() is about to be called
399 recursively and there is insufficient working space left in the current
400 workspace block. If there's an existing next block, use it; otherwise get a new
401 block unless the heap limit is reached.
402 
403 Arguments:
404   rwsptr     pointer to block pointer (updated)
405   ovecsize   space needed for an ovector
406   mb         the match block
407 
408 Returns:     0 rwsptr has been updated
409             !0 an error code
410 */
411 
412 static int
more_workspace(RWS_anchor ** rwsptr,unsigned int ovecsize,dfa_match_block * mb)413 more_workspace(RWS_anchor **rwsptr, unsigned int ovecsize, dfa_match_block *mb)
414 {
415 RWS_anchor *rws = *rwsptr;
416 RWS_anchor *new;
417 
418 if (rws->next != NULL)
419   {
420   new = rws->next;
421   }
422 
423 /* Sizes in the RWS_anchor blocks are in units of sizeof(int), but
424 mb->heap_limit and mb->heap_used are in kibibytes. Play carefully, to avoid
425 overflow. */
426 
427 else
428   {
429   uint32_t newsize = (rws->size >= UINT32_MAX/2)? UINT32_MAX/2 : rws->size * 2;
430   uint32_t newsizeK = newsize/(1024/sizeof(int));
431 
432   if (newsizeK + mb->heap_used > mb->heap_limit)
433     newsizeK = (uint32_t)(mb->heap_limit - mb->heap_used);
434   newsize = newsizeK*(1024/sizeof(int));
435 
436   if (newsize < RWS_RSIZE + ovecsize + RWS_ANCHOR_SIZE)
437     return PCRE2_ERROR_HEAPLIMIT;
438   new = mb->memctl.malloc(newsize*sizeof(int), mb->memctl.memory_data);
439   if (new == NULL) return PCRE2_ERROR_NOMEMORY;
440   mb->heap_used += newsizeK;
441   new->next = NULL;
442   new->size = newsize;
443   rws->next = new;
444   }
445 
446 new->free = new->size - RWS_ANCHOR_SIZE;
447 *rwsptr = new;
448 return 0;
449 }
450 
451 
452 
453 /*************************************************
454 *     Match a Regular Expression - DFA engine    *
455 *************************************************/
456 
457 /* This internal function applies a compiled pattern to a subject string,
458 starting at a given point, using a DFA engine. This function is called from the
459 external one, possibly multiple times if the pattern is not anchored. The
460 function calls itself recursively for some kinds of subpattern.
461 
Arguments:
  mb                the match_data block with fixed information
  this_start_code   the opening bracket of this subexpression's code
  current_subject   where we currently are in the subject string
  start_offset      start offset in the subject string
  offsets           vector to contain the matching string offsets
  offsetcount       size of same
  workspace         vector of workspace
  wscount           size of same
  rlevel            function call recursion level
  RWS               the current block of working space for recursive calls
472 
473 Returns:            > 0 => number of match offset pairs placed in offsets
474                     = 0 => offsets overflowed; longest matches are present
475                      -1 => failed to match
476                    < -1 => some kind of unexpected problem
477 
478 The following macros are used for adding states to the two state vectors (one
479 for the current character, one for the following character). */
480 
/* Append a state to the active list, setting its offset and count. If the
list is already full, the enclosing function returns PCRE2_ERROR_DFA_WSSIZE.
The count of states is incremented in either case. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state++; \
    } \
  while (0)
489 
/* Append a state to the active list, setting its offset, count, and data. If
the list is already full, the enclosing function returns
PCRE2_ERROR_DFA_WSSIZE. The count of states is incremented in either case. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->data   = (z); \
    next_active_state++; \
    } \
  while (0)
499 
/* Append a state to the new list, setting its offset and count. If the list
is already full, the enclosing function returns PCRE2_ERROR_DFA_WSSIZE. The
count of states is incremented in either case. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state++; \
    } \
  while (0)
508 
/* Append a state to the new list, setting its offset, count, and data. If the
list is already full, the enclosing function returns PCRE2_ERROR_DFA_WSSIZE.
The count of states is incremented in either case. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->data   = (z); \
    next_new_state++; \
    } \
  while (0)
518 
519 /* And now, here is the code */
520 
521 static int
internal_dfa_match(dfa_match_block * mb,PCRE2_SPTR this_start_code,PCRE2_SPTR current_subject,PCRE2_SIZE start_offset,PCRE2_SIZE * offsets,uint32_t offsetcount,int * workspace,int wscount,uint32_t rlevel,int * RWS)522 internal_dfa_match(
523   dfa_match_block *mb,
524   PCRE2_SPTR this_start_code,
525   PCRE2_SPTR current_subject,
526   PCRE2_SIZE start_offset,
527   PCRE2_SIZE *offsets,
528   uint32_t offsetcount,
529   int *workspace,
530   int wscount,
531   uint32_t rlevel,
532   int *RWS)
533 {
534 stateblock *active_states, *new_states, *temp_states;
535 stateblock *next_active_state, *next_new_state;
536 const uint8_t *ctypes, *lcc, *fcc;
537 PCRE2_SPTR ptr;
538 PCRE2_SPTR end_code;
539 dfa_recursion_info new_recursive;
540 int active_count, new_count, match_count;
541 
542 /* Some fields in the mb block are frequently referenced, so we load them into
543 independent variables in the hope that this will perform better. */
544 
545 PCRE2_SPTR start_subject = mb->start_subject;
546 PCRE2_SPTR end_subject = mb->end_subject;
547 PCRE2_SPTR start_code = mb->start_code;
548 
549 #ifdef SUPPORT_UNICODE
550 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
551 BOOL utf_or_ucp = utf || (mb->poptions & PCRE2_UCP) != 0;
552 #else
553 BOOL utf = FALSE;
554 #endif
555 
556 BOOL reset_could_continue = FALSE;
557 
558 if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;
559 if (rlevel++ > mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
560 offsetcount &= (uint32_t)(-2);  /* Round down */
561 
562 wscount -= 2;
563 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
564           (2 * INTS_PER_STATEBLOCK);
565 
566 ctypes = mb->tables + ctypes_offset;
567 lcc = mb->tables + lcc_offset;
568 fcc = mb->tables + fcc_offset;
569 
570 match_count = PCRE2_ERROR_NOMATCH;   /* A negative number */
571 
572 active_states = (stateblock *)(workspace + 2);
573 next_new_state = new_states = active_states + wscount;
574 new_count = 0;
575 
/* The first thing in any (sub) pattern is a bracket of some sort. Push all
the alternative states onto the list, and find out where the end is. This
makes it possible to use this function recursively, when we want to stop at a
matching internal ket rather than at the end.

If we are dealing with a backward assertion we have to find out the maximum
amount to move back, and set up each alternative appropriately. */
583 
584 if (*this_start_code == OP_ASSERTBACK || *this_start_code == OP_ASSERTBACK_NOT)
585   {
586   size_t max_back = 0;
587   size_t gone_back;
588 
589   end_code = this_start_code;
590   do
591     {
592     size_t back = (size_t)GET(end_code, 2+LINK_SIZE);
593     if (back > max_back) max_back = back;
594     end_code += GET(end_code, 1);
595     }
596   while (*end_code == OP_ALT);
597 
598   /* If we can't go back the amount required for the longest lookbehind
599   pattern, go back as far as we can; some alternatives may still be viable. */
600 
601 #ifdef SUPPORT_UNICODE
602   /* In character mode we have to step back character by character */
603 
604   if (utf)
605     {
606     for (gone_back = 0; gone_back < max_back; gone_back++)
607       {
608       if (current_subject <= start_subject) break;
609       current_subject--;
610       ACROSSCHAR(current_subject > start_subject, current_subject,
611         current_subject--);
612       }
613     }
614   else
615 #endif
616 
617   /* In byte-mode we can do this quickly. */
618 
619     {
620     size_t current_offset = (size_t)(current_subject - start_subject);
621     gone_back = (current_offset < max_back)? current_offset : max_back;
622     current_subject -= gone_back;
623     }
624 
625   /* Save the earliest consulted character */
626 
627   if (current_subject < mb->start_used_ptr)
628     mb->start_used_ptr = current_subject;
629 
630   /* Now we can process the individual branches. There will be an OP_REVERSE at
631   the start of each branch, except when the length of the branch is zero. */
632 
633   end_code = this_start_code;
634   do
635     {
636     uint32_t revlen = (end_code[1+LINK_SIZE] == OP_REVERSE)? 1 + LINK_SIZE : 0;
637     size_t back = (revlen == 0)? 0 : (size_t)GET(end_code, 2+LINK_SIZE);
638     if (back <= gone_back)
639       {
640       int bstate = (int)(end_code - start_code + 1 + LINK_SIZE + revlen);
641       ADD_NEW_DATA(-bstate, 0, (int)(gone_back - back));
642       }
643     end_code += GET(end_code, 1);
644     }
645   while (*end_code == OP_ALT);
646  }
647 
648 /* This is the code for a "normal" subpattern (not a backward assertion). The
649 start of a whole pattern is always one of these. If we are at the top level,
650 we may be asked to restart matching from the same point that we reached for a
651 previous partial match. We still have to scan through the top-level branches to
652 find the end state. */
653 
654 else
655   {
656   end_code = this_start_code;
657 
658   /* Restarting */
659 
660   if (rlevel == 1 && (mb->moptions & PCRE2_DFA_RESTART) != 0)
661     {
662     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
663     new_count = workspace[1];
664     if (!workspace[0])
665       memcpy(new_states, active_states, (size_t)new_count * sizeof(stateblock));
666     }
667 
668   /* Not restarting */
669 
670   else
671     {
672     int length = 1 + LINK_SIZE +
673       ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
674         *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
675         ? IMM2_SIZE:0);
676     do
677       {
678       ADD_NEW((int)(end_code - start_code + length), 0);
679       end_code += GET(end_code, 1);
680       length = 1 + LINK_SIZE;
681       }
682     while (*end_code == OP_ALT);
683     }
684   }
685 
686 workspace[0] = 0;    /* Bit indicating which vector is current */
687 
688 /* Loop for scanning the subject */
689 
690 ptr = current_subject;
691 for (;;)
692   {
693   int i, j;
694   int clen, dlen;
695   uint32_t c, d;
696   int forced_fail = 0;
697   BOOL partial_newline = FALSE;
698   BOOL could_continue = reset_could_continue;
699   reset_could_continue = FALSE;
700 
701   if (ptr > mb->last_used_ptr) mb->last_used_ptr = ptr;
702 
703   /* Make the new state list into the active state list and empty the
704   new state list. */
705 
706   temp_states = active_states;
707   active_states = new_states;
708   new_states = temp_states;
709   active_count = new_count;
710   new_count = 0;
711 
712   workspace[0] ^= 1;              /* Remember for the restarting feature */
713   workspace[1] = active_count;
714 
715   /* Set the pointers for adding new states */
716 
717   next_active_state = active_states + active_count;
718   next_new_state = new_states;
719 
720   /* Load the current character from the subject outside the loop, as many
721   different states may want to look at it, and we assume that at least one
722   will. */
723 
724   if (ptr < end_subject)
725     {
726     clen = 1;        /* Number of data items in the character */
727 #ifdef SUPPORT_UNICODE
728     GETCHARLENTEST(c, ptr, clen);
729 #else
730     c = *ptr;
731 #endif  /* SUPPORT_UNICODE */
732     }
733   else
734     {
735     clen = 0;        /* This indicates the end of the subject */
736     c = NOTACHAR;    /* This value should never actually be used */
737     }
738 
739   /* Scan up the active states and act on each one. The result of an action
740   may be to add more states to the currently active list (e.g. on hitting a
741   parenthesis) or it may be to put states on the new list, for considering
742   when we move the character pointer on. */
743 
744   for (i = 0; i < active_count; i++)
745     {
746     stateblock *current_state = active_states + i;
747     BOOL caseless = FALSE;
748     PCRE2_SPTR code;
749     uint32_t codevalue;
750     int state_offset = current_state->offset;
751     int rrc;
752     int count;
753 
754     /* A negative offset is a special case meaning "hold off going to this
755     (negated) state until the number of characters in the data field have
756     been skipped". If the could_continue flag was passed over from a previous
757     state, arrange for it to be passed on. */
758 
759     if (state_offset < 0)
760       {
761       if (current_state->data > 0)
762         {
763         ADD_NEW_DATA(state_offset, current_state->count,
764           current_state->data - 1);
765         if (could_continue) reset_could_continue = TRUE;
766         continue;
767         }
768       else
769         {
770         current_state->offset = state_offset = -state_offset;
771         }
772       }
773 
774     /* Check for a duplicate state with the same count, and skip if found.
775     See the note at the head of this module about the possibility of improving
776     performance here. */
777 
778     for (j = 0; j < i; j++)
779       {
780       if (active_states[j].offset == state_offset &&
781           active_states[j].count == current_state->count)
782         goto NEXT_ACTIVE_STATE;
783       }
784 
785     /* The state offset is the offset to the opcode */
786 
787     code = start_code + state_offset;
788     codevalue = *code;
789 
790     /* If this opcode inspects a character, but we are at the end of the
791     subject, remember the fact for use when testing for a partial match. */
792 
793     if (clen == 0 && poptable[codevalue] != 0)
794       could_continue = TRUE;
795 
796     /* If this opcode is followed by an inline character, load it. It is
797     tempting to test for the presence of a subject character here, but that
798     is wrong, because sometimes zero repetitions of the subject are
799     permitted.
800 
801     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
802     argument that is not a data character - but is always one byte long because
803     the values are small. We have to take special action to deal with  \P, \p,
804     \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
805     these ones to new opcodes. */
806 
807     if (coptable[codevalue] > 0)
808       {
809       dlen = 1;
810 #ifdef SUPPORT_UNICODE
811       if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
812 #endif  /* SUPPORT_UNICODE */
813       d = code[coptable[codevalue]];
814       if (codevalue >= OP_TYPESTAR)
815         {
816         switch(d)
817           {
818           case OP_ANYBYTE: return PCRE2_ERROR_DFA_UITEM;
819           case OP_NOTPROP:
820           case OP_PROP: codevalue += OP_PROP_EXTRA; break;
821           case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
822           case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
823           case OP_NOT_HSPACE:
824           case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
825           case OP_NOT_VSPACE:
826           case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
827           default: break;
828           }
829         }
830       }
831     else
832       {
833       dlen = 0;         /* Not strictly necessary, but compilers moan */
834       d = NOTACHAR;     /* if these variables are not set. */
835       }
836 
837 
838     /* Now process the individual opcodes */
839 
840     switch (codevalue)
841       {
842 /* ========================================================================== */
843       /* These cases are never obeyed. This is a fudge that causes a compile-
844       time error if the vectors coptable or poptable, which are indexed by
845       opcode, are not the correct length. It seems to be the only way to do
846       such a check at compile time, as the sizeof() operator does not work
847       in the C preprocessor. */
848 
849       case OP_TABLE_LENGTH:
850       case OP_TABLE_LENGTH +
851         ((sizeof(coptable) == OP_TABLE_LENGTH) &&
852          (sizeof(poptable) == OP_TABLE_LENGTH)):
853       return 0;
854 
855 /* ========================================================================== */
856       /* Reached a closing bracket. If not at the end of the pattern, carry
857       on with the next opcode. For repeating opcodes, also add the repeat
858       state. Note that KETRPOS will always be encountered at the end of the
859       subpattern, because the possessive subpattern repeats are always handled
860       using recursive calls. Thus, it never adds any new states.
861 
862       At the end of the (sub)pattern, unless we have an empty string and
863       PCRE2_NOTEMPTY is set, or PCRE2_NOTEMPTY_ATSTART is set and we are at the
864       start of the subject, save the match data, shifting up all previous
865       matches so we always have the longest first. */
866 
867       case OP_KET:
868       case OP_KETRMIN:
869       case OP_KETRMAX:
870       case OP_KETRPOS:
871       if (code != end_code)
872         {
873         ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
874         if (codevalue != OP_KET)
875           {
876           ADD_ACTIVE(state_offset - (int)GET(code, 1), 0);
877           }
878         }
879       else
880         {
881         if (ptr > current_subject ||
882             ((mb->moptions & PCRE2_NOTEMPTY) == 0 &&
883               ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) == 0 ||
884                 current_subject > start_subject + mb->start_offset)))
885           {
886           if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
887             else if (match_count > 0 && ++match_count * 2 > (int)offsetcount)
888               match_count = 0;
889           count = ((match_count == 0)? (int)offsetcount : match_count * 2) - 2;
890           if (count > 0) (void)memmove(offsets + 2, offsets,
891             (size_t)count * sizeof(PCRE2_SIZE));
892           if (offsetcount >= 2)
893             {
894             offsets[0] = (PCRE2_SIZE)(current_subject - start_subject);
895             offsets[1] = (PCRE2_SIZE)(ptr - start_subject);
896             }
897           if ((mb->moptions & PCRE2_DFA_SHORTEST) != 0) return match_count;
898           }
899         }
900       break;
901 
902 /* ========================================================================== */
903       /* These opcodes add to the current list of states without looking
904       at the current character. */
905 
906       /*-----------------------------------------------------------------*/
907       case OP_ALT:
908       do { code += GET(code, 1); } while (*code == OP_ALT);
909       ADD_ACTIVE((int)(code - start_code), 0);
910       break;
911 
912       /*-----------------------------------------------------------------*/
913       case OP_BRA:
914       case OP_SBRA:
915       do
916         {
917         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
918         code += GET(code, 1);
919         }
920       while (*code == OP_ALT);
921       break;
922 
923       /*-----------------------------------------------------------------*/
924       case OP_CBRA:
925       case OP_SCBRA:
926       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE),  0);
927       code += GET(code, 1);
928       while (*code == OP_ALT)
929         {
930         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
931         code += GET(code, 1);
932         }
933       break;
934 
935       /*-----------------------------------------------------------------*/
936       case OP_BRAZERO:
937       case OP_BRAMINZERO:
938       ADD_ACTIVE(state_offset + 1, 0);
939       code += 1 + GET(code, 2);
940       while (*code == OP_ALT) code += GET(code, 1);
941       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
942       break;
943 
944       /*-----------------------------------------------------------------*/
945       case OP_SKIPZERO:
946       code += 1 + GET(code, 2);
947       while (*code == OP_ALT) code += GET(code, 1);
948       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
949       break;
950 
951       /*-----------------------------------------------------------------*/
952       case OP_CIRC:
953       if (ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0)
954         { ADD_ACTIVE(state_offset + 1, 0); }
955       break;
956 
957       /*-----------------------------------------------------------------*/
958       case OP_CIRCM:
959       if ((ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) ||
960           ((ptr != end_subject || (mb->poptions & PCRE2_ALT_CIRCUMFLEX) != 0 )
961             && WAS_NEWLINE(ptr)))
962         { ADD_ACTIVE(state_offset + 1, 0); }
963       break;
964 
965       /*-----------------------------------------------------------------*/
966       case OP_EOD:
967       if (ptr >= end_subject)
968         {
969         if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
970           return PCRE2_ERROR_PARTIAL;
971         else { ADD_ACTIVE(state_offset + 1, 0); }
972         }
973       break;
974 
975       /*-----------------------------------------------------------------*/
976       case OP_SOD:
977       if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
978       break;
979 
980       /*-----------------------------------------------------------------*/
981       case OP_SOM:
982       if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
983       break;
984 
985 
986 /* ========================================================================== */
987       /* These opcodes inspect the next subject character, and sometimes
988       the previous one as well, but do not have an argument. The variable
989       clen contains the length of the current character and is zero if we are
990       at the end of the subject. */
991 
992       /*-----------------------------------------------------------------*/
993       case OP_ANY:
994       if (clen > 0 && !IS_NEWLINE(ptr))
995         {
996         if (ptr + 1 >= mb->end_subject &&
997             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
998             NLBLOCK->nltype == NLTYPE_FIXED &&
999             NLBLOCK->nllen == 2 &&
1000             c == NLBLOCK->nl[0])
1001           {
1002           could_continue = partial_newline = TRUE;
1003           }
1004         else
1005           {
1006           ADD_NEW(state_offset + 1, 0);
1007           }
1008         }
1009       break;
1010 
1011       /*-----------------------------------------------------------------*/
1012       case OP_ALLANY:
1013       if (clen > 0)
1014         { ADD_NEW(state_offset + 1, 0); }
1015       break;
1016 
1017       /*-----------------------------------------------------------------*/
1018       case OP_EODN:
1019       if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - mb->nllen))
1020         {
1021         if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1022           return PCRE2_ERROR_PARTIAL;
1023         ADD_ACTIVE(state_offset + 1, 0);
1024         }
1025       break;
1026 
1027       /*-----------------------------------------------------------------*/
1028       case OP_DOLL:
1029       if ((mb->moptions & PCRE2_NOTEOL) == 0)
1030         {
1031         if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1032           could_continue = TRUE;
1033         else if (clen == 0 ||
1034             ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
1035                (ptr == end_subject - mb->nllen)
1036             ))
1037           { ADD_ACTIVE(state_offset + 1, 0); }
1038         else if (ptr + 1 >= mb->end_subject &&
1039                  (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1040                  NLBLOCK->nltype == NLTYPE_FIXED &&
1041                  NLBLOCK->nllen == 2 &&
1042                  c == NLBLOCK->nl[0])
1043           {
1044           if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1045             {
1046             reset_could_continue = TRUE;
1047             ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1048             }
1049           else could_continue = partial_newline = TRUE;
1050           }
1051         }
1052       break;
1053 
1054       /*-----------------------------------------------------------------*/
1055       case OP_DOLLM:
1056       if ((mb->moptions & PCRE2_NOTEOL) == 0)
1057         {
1058         if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1059           could_continue = TRUE;
1060         else if (clen == 0 ||
1061             ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
1062           { ADD_ACTIVE(state_offset + 1, 0); }
1063         else if (ptr + 1 >= mb->end_subject &&
1064                  (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1065                  NLBLOCK->nltype == NLTYPE_FIXED &&
1066                  NLBLOCK->nllen == 2 &&
1067                  c == NLBLOCK->nl[0])
1068           {
1069           if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1070             {
1071             reset_could_continue = TRUE;
1072             ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1073             }
1074           else could_continue = partial_newline = TRUE;
1075           }
1076         }
1077       else if (IS_NEWLINE(ptr))
1078         { ADD_ACTIVE(state_offset + 1, 0); }
1079       break;
1080 
1081       /*-----------------------------------------------------------------*/
1082 
1083       case OP_DIGIT:
1084       case OP_WHITESPACE:
1085       case OP_WORDCHAR:
1086       if (clen > 0 && c < 256 &&
1087             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
1088         { ADD_NEW(state_offset + 1, 0); }
1089       break;
1090 
1091       /*-----------------------------------------------------------------*/
1092       case OP_NOT_DIGIT:
1093       case OP_NOT_WHITESPACE:
1094       case OP_NOT_WORDCHAR:
1095       if (clen > 0 && (c >= 256 ||
1096             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
1097         { ADD_NEW(state_offset + 1, 0); }
1098       break;
1099 
1100       /*-----------------------------------------------------------------*/
1101       case OP_WORD_BOUNDARY:
1102       case OP_NOT_WORD_BOUNDARY:
1103         {
1104         int left_word, right_word;
1105 
1106         if (ptr > start_subject)
1107           {
1108           PCRE2_SPTR temp = ptr - 1;
1109           if (temp < mb->start_used_ptr) mb->start_used_ptr = temp;
1110 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1111           if (utf) { BACKCHAR(temp); }
1112 #endif
1113           GETCHARTEST(d, temp);
1114 #ifdef SUPPORT_UNICODE
1115           if ((mb->poptions & PCRE2_UCP) != 0)
1116             {
1117             if (d == '_') left_word = TRUE; else
1118               {
1119               uint32_t cat = UCD_CATEGORY(d);
1120               left_word = (cat == ucp_L || cat == ucp_N);
1121               }
1122             }
1123           else
1124 #endif
1125           left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1126           }
1127         else left_word = FALSE;
1128 
1129         if (clen > 0)
1130           {
1131           if (ptr >= mb->last_used_ptr)
1132             {
1133             PCRE2_SPTR temp = ptr + 1;
1134 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1135             if (utf) { FORWARDCHARTEST(temp, mb->end_subject); }
1136 #endif
1137             mb->last_used_ptr = temp;
1138             }
1139 #ifdef SUPPORT_UNICODE
1140           if ((mb->poptions & PCRE2_UCP) != 0)
1141             {
1142             if (c == '_') right_word = TRUE; else
1143               {
1144               uint32_t cat = UCD_CATEGORY(c);
1145               right_word = (cat == ucp_L || cat == ucp_N);
1146               }
1147             }
1148           else
1149 #endif
1150           right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1151           }
1152         else right_word = FALSE;
1153 
1154         if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1155           { ADD_ACTIVE(state_offset + 1, 0); }
1156         }
1157       break;
1158 
1159 
1160       /*-----------------------------------------------------------------*/
1161       /* Check the next character by Unicode property. We will get here only
1162       if the support is in the binary; otherwise a compile-time error occurs.
1163       */
1164 
1165 #ifdef SUPPORT_UNICODE
1166       case OP_PROP:
1167       case OP_NOTPROP:
1168       if (clen > 0)
1169         {
1170         BOOL OK;
1171         const uint32_t *cp;
1172         const ucd_record * prop = GET_UCD(c);
1173         switch(code[1])
1174           {
1175           case PT_ANY:
1176           OK = TRUE;
1177           break;
1178 
1179           case PT_LAMP:
1180           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1181                prop->chartype == ucp_Lt;
1182           break;
1183 
1184           case PT_GC:
1185           OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1186           break;
1187 
1188           case PT_PC:
1189           OK = prop->chartype == code[2];
1190           break;
1191 
1192           case PT_SC:
1193           OK = prop->script == code[2];
1194           break;
1195 
1196           case PT_SCX:
1197           OK = (prop->script == code[2] ||
1198                 MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[2]) != 0);
1199           break;
1200 
1201           /* These are specials for combination cases. */
1202 
1203           case PT_ALNUM:
1204           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1205                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1206           break;
1207 
1208           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1209           which means that Perl space and POSIX space are now identical. PCRE
1210           was changed at release 8.34. */
1211 
1212           case PT_SPACE:    /* Perl space */
1213           case PT_PXSPACE:  /* POSIX space */
1214           switch(c)
1215             {
1216             HSPACE_CASES:
1217             VSPACE_CASES:
1218             OK = TRUE;
1219             break;
1220 
1221             default:
1222             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1223             break;
1224             }
1225           break;
1226 
1227           case PT_WORD:
1228           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1229                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1230                c == CHAR_UNDERSCORE;
1231           break;
1232 
1233           case PT_CLIST:
1234           cp = PRIV(ucd_caseless_sets) + code[2];
1235           for (;;)
1236             {
1237             if (c < *cp) { OK = FALSE; break; }
1238             if (c == *cp++) { OK = TRUE; break; }
1239             }
1240           break;
1241 
1242           case PT_UCNC:
1243           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1244                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1245                c >= 0xe000;
1246           break;
1247 
1248           case PT_BIDICL:
1249           OK = UCD_BIDICLASS(c) == code[2];
1250           break;
1251 
1252           case PT_BOOL:
1253           OK = MAPBIT(PRIV(ucd_boolprop_sets) +
1254             UCD_BPROPS_PROP(prop), code[2]) != 0;
1255           break;
1256 
1257           /* Should never occur, but keep compilers from grumbling. */
1258 
1259           default:
1260           OK = codevalue != OP_PROP;
1261           break;
1262           }
1263 
1264         if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1265         }
1266       break;
1267 #endif
1268 
1269 
1270 
1271 /* ========================================================================== */
1272       /* These opcodes likewise inspect the subject character, but have an
1273       argument that is not a data character. It is one of these opcodes:
1274       OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1275       OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1276 
1277       case OP_TYPEPLUS:
1278       case OP_TYPEMINPLUS:
1279       case OP_TYPEPOSPLUS:
1280       count = current_state->count;  /* Already matched */
1281       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1282       if (clen > 0)
1283         {
1284         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1285             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1286             NLBLOCK->nltype == NLTYPE_FIXED &&
1287             NLBLOCK->nllen == 2 &&
1288             c == NLBLOCK->nl[0])
1289           {
1290           could_continue = partial_newline = TRUE;
1291           }
1292         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1293             (c < 256 &&
1294               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1295               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1296           {
1297           if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1298             {
1299             active_count--;            /* Remove non-match possibility */
1300             next_active_state--;
1301             }
1302           count++;
1303           ADD_NEW(state_offset, count);
1304           }
1305         }
1306       break;
1307 
1308       /*-----------------------------------------------------------------*/
1309       case OP_TYPEQUERY:
1310       case OP_TYPEMINQUERY:
1311       case OP_TYPEPOSQUERY:
1312       ADD_ACTIVE(state_offset + 2, 0);
1313       if (clen > 0)
1314         {
1315         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1316             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1317             NLBLOCK->nltype == NLTYPE_FIXED &&
1318             NLBLOCK->nllen == 2 &&
1319             c == NLBLOCK->nl[0])
1320           {
1321           could_continue = partial_newline = TRUE;
1322           }
1323         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1324             (c < 256 &&
1325               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1326               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1327           {
1328           if (codevalue == OP_TYPEPOSQUERY)
1329             {
1330             active_count--;            /* Remove non-match possibility */
1331             next_active_state--;
1332             }
1333           ADD_NEW(state_offset + 2, 0);
1334           }
1335         }
1336       break;
1337 
1338       /*-----------------------------------------------------------------*/
1339       case OP_TYPESTAR:
1340       case OP_TYPEMINSTAR:
1341       case OP_TYPEPOSSTAR:
1342       ADD_ACTIVE(state_offset + 2, 0);
1343       if (clen > 0)
1344         {
1345         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1346             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1347             NLBLOCK->nltype == NLTYPE_FIXED &&
1348             NLBLOCK->nllen == 2 &&
1349             c == NLBLOCK->nl[0])
1350           {
1351           could_continue = partial_newline = TRUE;
1352           }
1353         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1354             (c < 256 &&
1355               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1356               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1357           {
1358           if (codevalue == OP_TYPEPOSSTAR)
1359             {
1360             active_count--;            /* Remove non-match possibility */
1361             next_active_state--;
1362             }
1363           ADD_NEW(state_offset, 0);
1364           }
1365         }
1366       break;
1367 
1368       /*-----------------------------------------------------------------*/
1369       case OP_TYPEEXACT:
1370       count = current_state->count;  /* Number already matched */
1371       if (clen > 0)
1372         {
1373         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1374             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1375             NLBLOCK->nltype == NLTYPE_FIXED &&
1376             NLBLOCK->nllen == 2 &&
1377             c == NLBLOCK->nl[0])
1378           {
1379           could_continue = partial_newline = TRUE;
1380           }
1381         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1382             (c < 256 &&
1383               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1384               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1385           {
1386           if (++count >= (int)GET2(code, 1))
1387             { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1388           else
1389             { ADD_NEW(state_offset, count); }
1390           }
1391         }
1392       break;
1393 
1394       /*-----------------------------------------------------------------*/
1395       case OP_TYPEUPTO:
1396       case OP_TYPEMINUPTO:
1397       case OP_TYPEPOSUPTO:
1398       ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1399       count = current_state->count;  /* Number already matched */
1400       if (clen > 0)
1401         {
1402         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1403             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1404             NLBLOCK->nltype == NLTYPE_FIXED &&
1405             NLBLOCK->nllen == 2 &&
1406             c == NLBLOCK->nl[0])
1407           {
1408           could_continue = partial_newline = TRUE;
1409           }
1410         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1411             (c < 256 &&
1412               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1413               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1414           {
1415           if (codevalue == OP_TYPEPOSUPTO)
1416             {
1417             active_count--;           /* Remove non-match possibility */
1418             next_active_state--;
1419             }
1420           if (++count >= (int)GET2(code, 1))
1421             { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1422           else
1423             { ADD_NEW(state_offset, count); }
1424           }
1425         }
1426       break;
1427 
1428 /* ========================================================================== */
1429       /* These are virtual opcodes that are used when something like
1430       OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_HSPACE,
1431       OP_NOT_HSPACE, OP_VSPACE, or OP_NOT_VSPACE as its argument. It keeps the
1432       code above fast for the other cases. The argument is in the d variable. */
1433 
1434 #ifdef SUPPORT_UNICODE
1435       case OP_PROP_EXTRA + OP_TYPEPLUS:
1436       case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1437       case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1438       count = current_state->count;           /* Already matched */
1439       if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1440       if (clen > 0)
1441         {
1442         BOOL OK;
1443         const uint32_t *cp;
1444         const ucd_record * prop = GET_UCD(c);
1445         switch(code[2])
1446           {
1447           case PT_ANY:
1448           OK = TRUE;
1449           break;
1450 
1451           case PT_LAMP:
1452           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1453             prop->chartype == ucp_Lt;
1454           break;
1455 
1456           case PT_GC:
1457           OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1458           break;
1459 
1460           case PT_PC:
1461           OK = prop->chartype == code[3];
1462           break;
1463 
1464           case PT_SC:
1465           OK = prop->script == code[3];
1466           break;
1467 
1468           case PT_SCX:
1469           OK = (prop->script == code[3] ||
1470                 MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[3]) != 0);
1471           break;
1472 
1473           /* These are specials for combination cases. */
1474 
1475           case PT_ALNUM:
1476           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1477                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1478           break;
1479 
1480           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1481           which means that Perl space and POSIX space are now identical. PCRE
1482           was changed at release 8.34. */
1483 
1484           case PT_SPACE:    /* Perl space */
1485           case PT_PXSPACE:  /* POSIX space */
1486           switch(c)
1487             {
1488             HSPACE_CASES:
1489             VSPACE_CASES:
1490             OK = TRUE;
1491             break;
1492 
1493             default:
1494             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1495             break;
1496             }
1497           break;
1498 
1499           case PT_WORD:
1500           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1501                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1502                c == CHAR_UNDERSCORE;
1503           break;
1504 
1505           case PT_CLIST:
1506           cp = PRIV(ucd_caseless_sets) + code[3];
1507           for (;;)
1508             {
1509             if (c < *cp) { OK = FALSE; break; }
1510             if (c == *cp++) { OK = TRUE; break; }
1511             }
1512           break;
1513 
1514           case PT_UCNC:
1515           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1516                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1517                c >= 0xe000;
1518           break;
1519 
1520           case PT_BIDICL:
1521           OK = UCD_BIDICLASS(c) == code[3];
1522           break;
1523 
1524           case PT_BOOL:
1525           OK = MAPBIT(PRIV(ucd_boolprop_sets) +
1526             UCD_BPROPS_PROP(prop), code[3]) != 0;
1527           break;
1528 
1529           /* Should never occur, but keep compilers from grumbling. */
1530 
1531           default:
1532           OK = codevalue != OP_PROP;
1533           break;
1534           }
1535 
1536         if (OK == (d == OP_PROP))
1537           {
1538           if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1539             {
1540             active_count--;           /* Remove non-match possibility */
1541             next_active_state--;
1542             }
1543           count++;
1544           ADD_NEW(state_offset, count);
1545           }
1546         }
1547       break;
1548 
1549       /*-----------------------------------------------------------------*/
1550       case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1551       case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1552       case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1553       count = current_state->count;  /* Already matched */
1554       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1555       if (clen > 0)
1556         {
1557         int ncount = 0;
1558         if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1559           {
1560           active_count--;           /* Remove non-match possibility */
1561           next_active_state--;
1562           }
1563         (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1564           &ncount);
1565         count++;
1566         ADD_NEW_DATA(-state_offset, count, ncount);
1567         }
1568       break;
1569 #endif
1570 
1571       /*-----------------------------------------------------------------*/
1572       case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1573       case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1574       case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1575       count = current_state->count;  /* Already matched */
1576       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1577       if (clen > 0)
1578         {
1579         int ncount = 0;
1580         switch (c)
1581           {
1582           case CHAR_VT:
1583           case CHAR_FF:
1584           case CHAR_NEL:
1585 #ifndef EBCDIC
1586           case 0x2028:
1587           case 0x2029:
1588 #endif  /* Not EBCDIC */
1589           if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1590           goto ANYNL01;
1591 
1592           case CHAR_CR:
1593           if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1594           /* Fall through */
1595 
1596           ANYNL01:
1597           case CHAR_LF:
1598           if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1599             {
1600             active_count--;           /* Remove non-match possibility */
1601             next_active_state--;
1602             }
1603           count++;
1604           ADD_NEW_DATA(-state_offset, count, ncount);
1605           break;
1606 
1607           default:
1608           break;
1609           }
1610         }
1611       break;
1612 
1613       /*-----------------------------------------------------------------*/
1614       case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1615       case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1616       case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1617       count = current_state->count;  /* Already matched */
1618       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1619       if (clen > 0)
1620         {
1621         BOOL OK;
1622         switch (c)
1623           {
1624           VSPACE_CASES:
1625           OK = TRUE;
1626           break;
1627 
1628           default:
1629           OK = FALSE;
1630           break;
1631           }
1632 
1633         if (OK == (d == OP_VSPACE))
1634           {
1635           if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1636             {
1637             active_count--;           /* Remove non-match possibility */
1638             next_active_state--;
1639             }
1640           count++;
1641           ADD_NEW_DATA(-state_offset, count, 0);
1642           }
1643         }
1644       break;
1645 
1646       /*-----------------------------------------------------------------*/
1647       case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1648       case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1649       case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1650       count = current_state->count;  /* Already matched */
1651       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1652       if (clen > 0)
1653         {
1654         BOOL OK;
1655         switch (c)
1656           {
1657           HSPACE_CASES:
1658           OK = TRUE;
1659           break;
1660 
1661           default:
1662           OK = FALSE;
1663           break;
1664           }
1665 
1666         if (OK == (d == OP_HSPACE))
1667           {
1668           if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1669             {
1670             active_count--;           /* Remove non-match possibility */
1671             next_active_state--;
1672             }
1673           count++;
1674           ADD_NEW_DATA(-state_offset, count, 0);
1675           }
1676         }
1677       break;
1678 
1679       /*-----------------------------------------------------------------*/
1680 #ifdef SUPPORT_UNICODE
1681       case OP_PROP_EXTRA + OP_TYPEQUERY:
1682       case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1683       case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1684       count = 4;
1685       goto QS1;
1686 
1687       case OP_PROP_EXTRA + OP_TYPESTAR:
1688       case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1689       case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1690       count = 0;
1691 
1692       QS1:
1693 
1694       ADD_ACTIVE(state_offset + 4, 0);
1695       if (clen > 0)
1696         {
1697         BOOL OK;
1698         const uint32_t *cp;
1699         const ucd_record * prop = GET_UCD(c);
1700         switch(code[2])
1701           {
1702           case PT_ANY:
1703           OK = TRUE;
1704           break;
1705 
1706           case PT_LAMP:
1707           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1708             prop->chartype == ucp_Lt;
1709           break;
1710 
1711           case PT_GC:
1712           OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1713           break;
1714 
1715           case PT_PC:
1716           OK = prop->chartype == code[3];
1717           break;
1718 
1719           case PT_SC:
1720           OK = prop->script == code[3];
1721           break;
1722 
1723           case PT_SCX:
1724           OK = (prop->script == code[3] ||
1725                 MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[3]) != 0);
1726           break;
1727 
1728           /* These are specials for combination cases. */
1729 
1730           case PT_ALNUM:
1731           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1732                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1733           break;
1734 
1735           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1736           which means that Perl space and POSIX space are now identical. PCRE
1737           was changed at release 8.34. */
1738 
1739           case PT_SPACE:    /* Perl space */
1740           case PT_PXSPACE:  /* POSIX space */
1741           switch(c)
1742             {
1743             HSPACE_CASES:
1744             VSPACE_CASES:
1745             OK = TRUE;
1746             break;
1747 
1748             default:
1749             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1750             break;
1751             }
1752           break;
1753 
1754           case PT_WORD:
1755           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1756                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1757                c == CHAR_UNDERSCORE;
1758           break;
1759 
1760           case PT_CLIST:
1761           cp = PRIV(ucd_caseless_sets) + code[3];
1762           for (;;)
1763             {
1764             if (c < *cp) { OK = FALSE; break; }
1765             if (c == *cp++) { OK = TRUE; break; }
1766             }
1767           break;
1768 
1769           case PT_UCNC:
1770           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1771                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1772                c >= 0xe000;
1773           break;
1774 
1775           case PT_BIDICL:
1776           OK = UCD_BIDICLASS(c) == code[3];
1777           break;
1778 
1779           case PT_BOOL:
1780           OK = MAPBIT(PRIV(ucd_boolprop_sets) +
1781             UCD_BPROPS_PROP(prop), code[3]) != 0;
1782           break;
1783 
1784           /* Should never occur, but keep compilers from grumbling. */
1785 
1786           default:
1787           OK = codevalue != OP_PROP;
1788           break;
1789           }
1790 
1791         if (OK == (d == OP_PROP))
1792           {
1793           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1794               codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1795             {
1796             active_count--;           /* Remove non-match possibility */
1797             next_active_state--;
1798             }
1799           ADD_NEW(state_offset + count, 0);
1800           }
1801         }
1802       break;
1803 
1804       /*-----------------------------------------------------------------*/
1805       case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1806       case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1807       case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1808       count = 2;
1809       goto QS2;
1810 
1811       case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1812       case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1813       case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1814       count = 0;
1815 
1816       QS2:
1817 
1818       ADD_ACTIVE(state_offset + 2, 0);
1819       if (clen > 0)
1820         {
1821         int ncount = 0;
1822         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1823             codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1824           {
1825           active_count--;           /* Remove non-match possibility */
1826           next_active_state--;
1827           }
1828         (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1829           &ncount);
1830         ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1831         }
1832       break;
1833 #endif
1834 
1835       /*-----------------------------------------------------------------*/
1836       case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1837       case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1838       case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1839       count = 2;
1840       goto QS3;
1841 
1842       case OP_ANYNL_EXTRA + OP_TYPESTAR:
1843       case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1844       case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1845       count = 0;
1846 
1847       QS3:
1848       ADD_ACTIVE(state_offset + 2, 0);
1849       if (clen > 0)
1850         {
1851         int ncount = 0;
1852         switch (c)
1853           {
1854           case CHAR_VT:
1855           case CHAR_FF:
1856           case CHAR_NEL:
1857 #ifndef EBCDIC
1858           case 0x2028:
1859           case 0x2029:
1860 #endif  /* Not EBCDIC */
1861           if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1862           goto ANYNL02;
1863 
1864           case CHAR_CR:
1865           if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1866           /* Fall through */
1867 
1868           ANYNL02:
1869           case CHAR_LF:
1870           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1871               codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1872             {
1873             active_count--;           /* Remove non-match possibility */
1874             next_active_state--;
1875             }
1876           ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1877           break;
1878 
1879           default:
1880           break;
1881           }
1882         }
1883       break;
1884 
1885       /*-----------------------------------------------------------------*/
1886       case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1887       case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1888       case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1889       count = 2;
1890       goto QS4;
1891 
1892       case OP_VSPACE_EXTRA + OP_TYPESTAR:
1893       case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1894       case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1895       count = 0;
1896 
1897       QS4:
1898       ADD_ACTIVE(state_offset + 2, 0);
1899       if (clen > 0)
1900         {
1901         BOOL OK;
1902         switch (c)
1903           {
1904           VSPACE_CASES:
1905           OK = TRUE;
1906           break;
1907 
1908           default:
1909           OK = FALSE;
1910           break;
1911           }
1912         if (OK == (d == OP_VSPACE))
1913           {
1914           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1915               codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1916             {
1917             active_count--;           /* Remove non-match possibility */
1918             next_active_state--;
1919             }
1920           ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1921           }
1922         }
1923       break;
1924 
1925       /*-----------------------------------------------------------------*/
1926       case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1927       case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1928       case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1929       count = 2;
1930       goto QS5;
1931 
1932       case OP_HSPACE_EXTRA + OP_TYPESTAR:
1933       case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1934       case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1935       count = 0;
1936 
1937       QS5:
1938       ADD_ACTIVE(state_offset + 2, 0);
1939       if (clen > 0)
1940         {
1941         BOOL OK;
1942         switch (c)
1943           {
1944           HSPACE_CASES:
1945           OK = TRUE;
1946           break;
1947 
1948           default:
1949           OK = FALSE;
1950           break;
1951           }
1952 
1953         if (OK == (d == OP_HSPACE))
1954           {
1955           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1956               codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1957             {
1958             active_count--;           /* Remove non-match possibility */
1959             next_active_state--;
1960             }
1961           ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1962           }
1963         }
1964       break;
1965 
1966       /*-----------------------------------------------------------------*/
1967 #ifdef SUPPORT_UNICODE
1968       case OP_PROP_EXTRA + OP_TYPEEXACT:
1969       case OP_PROP_EXTRA + OP_TYPEUPTO:
1970       case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1971       case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1972       if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1973         { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1974       count = current_state->count;  /* Number already matched */
1975       if (clen > 0)
1976         {
1977         BOOL OK;
1978         const uint32_t *cp;
1979         const ucd_record * prop = GET_UCD(c);
1980         switch(code[1 + IMM2_SIZE + 1])
1981           {
1982           case PT_ANY:
1983           OK = TRUE;
1984           break;
1985 
1986           case PT_LAMP:
1987           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1988             prop->chartype == ucp_Lt;
1989           break;
1990 
1991           case PT_GC:
1992           OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1993           break;
1994 
1995           case PT_PC:
1996           OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1997           break;
1998 
1999           case PT_SC:
2000           OK = prop->script == code[1 + IMM2_SIZE + 2];
2001           break;
2002 
2003           case PT_SCX:
2004           OK = (prop->script == code[1 + IMM2_SIZE + 2] ||
2005                 MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop),
2006                   code[1 + IMM2_SIZE + 2]) != 0);
2007           break;
2008 
2009           /* These are specials for combination cases. */
2010 
2011           case PT_ALNUM:
2012           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2013                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
2014           break;
2015 
2016           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2017           which means that Perl space and POSIX space are now identical. PCRE
2018           was changed at release 8.34. */
2019 
2020           case PT_SPACE:    /* Perl space */
2021           case PT_PXSPACE:  /* POSIX space */
2022           switch(c)
2023             {
2024             HSPACE_CASES:
2025             VSPACE_CASES:
2026             OK = TRUE;
2027             break;
2028 
2029             default:
2030             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
2031             break;
2032             }
2033           break;
2034 
2035           case PT_WORD:
2036           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2037                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2038                c == CHAR_UNDERSCORE;
2039           break;
2040 
2041           case PT_CLIST:
2042           cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
2043           for (;;)
2044             {
2045             if (c < *cp) { OK = FALSE; break; }
2046             if (c == *cp++) { OK = TRUE; break; }
2047             }
2048           break;
2049 
2050           case PT_UCNC:
2051           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2052                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2053                c >= 0xe000;
2054           break;
2055 
2056           case PT_BIDICL:
2057           OK = UCD_BIDICLASS(c) == code[1 + IMM2_SIZE + 2];
2058           break;
2059 
2060           case PT_BOOL:
2061           OK = MAPBIT(PRIV(ucd_boolprop_sets) +
2062             UCD_BPROPS_PROP(prop), code[1 + IMM2_SIZE + 2]) != 0;
2063           break;
2064 
2065           /* Should never occur, but keep compilers from grumbling. */
2066 
2067           default:
2068           OK = codevalue != OP_PROP;
2069           break;
2070           }
2071 
2072         if (OK == (d == OP_PROP))
2073           {
2074           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
2075             {
2076             active_count--;           /* Remove non-match possibility */
2077             next_active_state--;
2078             }
2079           if (++count >= (int)GET2(code, 1))
2080             { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
2081           else
2082             { ADD_NEW(state_offset, count); }
2083           }
2084         }
2085       break;
2086 
2087       /*-----------------------------------------------------------------*/
2088       case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
2089       case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
2090       case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
2091       case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
2092       if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
2093         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2094       count = current_state->count;  /* Number already matched */
2095       if (clen > 0)
2096         {
2097         PCRE2_SPTR nptr;
2098         int ncount = 0;
2099         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
2100           {
2101           active_count--;           /* Remove non-match possibility */
2102           next_active_state--;
2103           }
2104         nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
2105           &ncount);
2106         if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2107             reset_could_continue = TRUE;
2108         if (++count >= (int)GET2(code, 1))
2109           { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2110         else
2111           { ADD_NEW_DATA(-state_offset, count, ncount); }
2112         }
2113       break;
2114 #endif
2115 
2116       /*-----------------------------------------------------------------*/
2117       case OP_ANYNL_EXTRA + OP_TYPEEXACT:
2118       case OP_ANYNL_EXTRA + OP_TYPEUPTO:
2119       case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
2120       case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
2121       if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
2122         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2123       count = current_state->count;  /* Number already matched */
2124       if (clen > 0)
2125         {
2126         int ncount = 0;
2127         switch (c)
2128           {
2129           case CHAR_VT:
2130           case CHAR_FF:
2131           case CHAR_NEL:
2132 #ifndef EBCDIC
2133           case 0x2028:
2134           case 0x2029:
2135 #endif  /* Not EBCDIC */
2136           if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2137           goto ANYNL03;
2138 
2139           case CHAR_CR:
2140           if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
2141           /* Fall through */
2142 
2143           ANYNL03:
2144           case CHAR_LF:
2145           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
2146             {
2147             active_count--;           /* Remove non-match possibility */
2148             next_active_state--;
2149             }
2150           if (++count >= (int)GET2(code, 1))
2151             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2152           else
2153             { ADD_NEW_DATA(-state_offset, count, ncount); }
2154           break;
2155 
2156           default:
2157           break;
2158           }
2159         }
2160       break;
2161 
2162       /*-----------------------------------------------------------------*/
2163       case OP_VSPACE_EXTRA + OP_TYPEEXACT:
2164       case OP_VSPACE_EXTRA + OP_TYPEUPTO:
2165       case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2166       case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2167       if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2168         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2169       count = current_state->count;  /* Number already matched */
2170       if (clen > 0)
2171         {
2172         BOOL OK;
2173         switch (c)
2174           {
2175           VSPACE_CASES:
2176           OK = TRUE;
2177           break;
2178 
2179           default:
2180           OK = FALSE;
2181           }
2182 
2183         if (OK == (d == OP_VSPACE))
2184           {
2185           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2186             {
2187             active_count--;           /* Remove non-match possibility */
2188             next_active_state--;
2189             }
2190           if (++count >= (int)GET2(code, 1))
2191             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2192           else
2193             { ADD_NEW_DATA(-state_offset, count, 0); }
2194           }
2195         }
2196       break;
2197 
2198       /*-----------------------------------------------------------------*/
2199       case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2200       case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2201       case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2202       case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2203       if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2204         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2205       count = current_state->count;  /* Number already matched */
2206       if (clen > 0)
2207         {
2208         BOOL OK;
2209         switch (c)
2210           {
2211           HSPACE_CASES:
2212           OK = TRUE;
2213           break;
2214 
2215           default:
2216           OK = FALSE;
2217           break;
2218           }
2219 
2220         if (OK == (d == OP_HSPACE))
2221           {
2222           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2223             {
2224             active_count--;           /* Remove non-match possibility */
2225             next_active_state--;
2226             }
2227           if (++count >= (int)GET2(code, 1))
2228             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2229           else
2230             { ADD_NEW_DATA(-state_offset, count, 0); }
2231           }
2232         }
2233       break;
2234 
2235 /* ========================================================================== */
2236       /* These opcodes are followed by a character that is usually compared
2237       to the current subject character; it is loaded into d. We still get
2238       here even if there is no subject character, because in some cases zero
2239       repetitions are permitted. */
2240 
2241       /*-----------------------------------------------------------------*/
2242       case OP_CHAR:
2243       if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2244       break;
2245 
2246       /*-----------------------------------------------------------------*/
2247       case OP_CHARI:
2248       if (clen == 0) break;
2249 
2250 #ifdef SUPPORT_UNICODE
2251       if (utf_or_ucp)
2252         {
2253         if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2254           {
2255           unsigned int othercase;
2256           if (c < 128)
2257             othercase = fcc[c];
2258           else
2259             othercase = UCD_OTHERCASE(c);
2260           if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2261           }
2262         }
2263       else
2264 #endif  /* SUPPORT_UNICODE */
2265       /* Not UTF or UCP mode */
2266         {
2267         if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2268           { ADD_NEW(state_offset + 2, 0); }
2269         }
2270       break;
2271 
2272 
2273 #ifdef SUPPORT_UNICODE
2274       /*-----------------------------------------------------------------*/
2275       /* This is a tricky one because it can match more than one character.
2276       Find out how many characters to skip, and then set up a negative state
2277       to wait for them to pass before continuing. */
2278 
2279       case OP_EXTUNI:
2280       if (clen > 0)
2281         {
2282         int ncount = 0;
2283         PCRE2_SPTR nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject,
2284           end_subject, utf, &ncount);
2285         if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2286             reset_could_continue = TRUE;
2287         ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2288         }
2289       break;
2290 #endif
2291 
2292       /*-----------------------------------------------------------------*/
2293       /* This is tricky, like EXTUNI, because it too can match more than one
2294       character (when CR is followed by LF). In this case, set up a negative
2295       state to wait for one character to pass before continuing. */
2296 
2297       case OP_ANYNL:
2298       if (clen > 0) switch(c)
2299         {
2300         case CHAR_VT:
2301         case CHAR_FF:
2302         case CHAR_NEL:
2303 #ifndef EBCDIC
2304         case 0x2028:
2305         case 0x2029:
2306 #endif  /* Not EBCDIC */
2307         if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2308         /* Fall through */
2309 
2310         case CHAR_LF:
2311         ADD_NEW(state_offset + 1, 0);
2312         break;
2313 
2314         case CHAR_CR:
2315         if (ptr + 1 >= end_subject)
2316           {
2317           ADD_NEW(state_offset + 1, 0);
2318           if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2319             reset_could_continue = TRUE;
2320           }
2321         else if (UCHAR21TEST(ptr + 1) == CHAR_LF)
2322           {
2323           ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2324           }
2325         else
2326           {
2327           ADD_NEW(state_offset + 1, 0);
2328           }
2329         break;
2330         }
2331       break;
2332 
2333       /*-----------------------------------------------------------------*/
2334       case OP_NOT_VSPACE:
2335       if (clen > 0) switch(c)
2336         {
2337         VSPACE_CASES:
2338         break;
2339 
2340         default:
2341         ADD_NEW(state_offset + 1, 0);
2342         break;
2343         }
2344       break;
2345 
2346       /*-----------------------------------------------------------------*/
2347       case OP_VSPACE:
2348       if (clen > 0) switch(c)
2349         {
2350         VSPACE_CASES:
2351         ADD_NEW(state_offset + 1, 0);
2352         break;
2353 
2354         default:
2355         break;
2356         }
2357       break;
2358 
2359       /*-----------------------------------------------------------------*/
2360       case OP_NOT_HSPACE:
2361       if (clen > 0) switch(c)
2362         {
2363         HSPACE_CASES:
2364         break;
2365 
2366         default:
2367         ADD_NEW(state_offset + 1, 0);
2368         break;
2369         }
2370       break;
2371 
2372       /*-----------------------------------------------------------------*/
2373       case OP_HSPACE:
2374       if (clen > 0) switch(c)
2375         {
2376         HSPACE_CASES:
2377         ADD_NEW(state_offset + 1, 0);
2378         break;
2379 
2380         default:
2381         break;
2382         }
2383       break;
2384 
2385       /*-----------------------------------------------------------------*/
2386       /* Match a negated single character casefully. */
2387 
2388       case OP_NOT:
2389       if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2390       break;
2391 
2392       /*-----------------------------------------------------------------*/
2393       /* Match a negated single character caselessly. */
2394 
2395       case OP_NOTI:
2396       if (clen > 0)
2397         {
2398         uint32_t otherd;
2399 #ifdef SUPPORT_UNICODE
2400         if (utf_or_ucp && d >= 128)
2401           otherd = UCD_OTHERCASE(d);
2402         else
2403 #endif  /* SUPPORT_UNICODE */
2404         otherd = TABLE_GET(d, fcc, d);
2405         if (c != d && c != otherd)
2406           { ADD_NEW(state_offset + dlen + 1, 0); }
2407         }
2408       break;
2409 
2410       /*-----------------------------------------------------------------*/
2411       case OP_PLUSI:
2412       case OP_MINPLUSI:
2413       case OP_POSPLUSI:
2414       case OP_NOTPLUSI:
2415       case OP_NOTMINPLUSI:
2416       case OP_NOTPOSPLUSI:
2417       caseless = TRUE;
2418       codevalue -= OP_STARI - OP_STAR;
2419 
2420       /* Fall through */
2421       case OP_PLUS:
2422       case OP_MINPLUS:
2423       case OP_POSPLUS:
2424       case OP_NOTPLUS:
2425       case OP_NOTMINPLUS:
2426       case OP_NOTPOSPLUS:
2427       count = current_state->count;  /* Already matched */
2428       if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2429       if (clen > 0)
2430         {
2431         uint32_t otherd = NOTACHAR;
2432         if (caseless)
2433           {
2434 #ifdef SUPPORT_UNICODE
2435           if (utf_or_ucp && d >= 128)
2436             otherd = UCD_OTHERCASE(d);
2437           else
2438 #endif  /* SUPPORT_UNICODE */
2439           otherd = TABLE_GET(d, fcc, d);
2440           }
2441         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2442           {
2443           if (count > 0 &&
2444               (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2445             {
2446             active_count--;             /* Remove non-match possibility */
2447             next_active_state--;
2448             }
2449           count++;
2450           ADD_NEW(state_offset, count);
2451           }
2452         }
2453       break;
2454 
2455       /*-----------------------------------------------------------------*/
2456       case OP_QUERYI:
2457       case OP_MINQUERYI:
2458       case OP_POSQUERYI:
2459       case OP_NOTQUERYI:
2460       case OP_NOTMINQUERYI:
2461       case OP_NOTPOSQUERYI:
2462       caseless = TRUE;
2463       codevalue -= OP_STARI - OP_STAR;
2464       /* Fall through */
2465       case OP_QUERY:
2466       case OP_MINQUERY:
2467       case OP_POSQUERY:
2468       case OP_NOTQUERY:
2469       case OP_NOTMINQUERY:
2470       case OP_NOTPOSQUERY:
2471       ADD_ACTIVE(state_offset + dlen + 1, 0);
2472       if (clen > 0)
2473         {
2474         uint32_t otherd = NOTACHAR;
2475         if (caseless)
2476           {
2477 #ifdef SUPPORT_UNICODE
2478           if (utf_or_ucp && d >= 128)
2479             otherd = UCD_OTHERCASE(d);
2480           else
2481 #endif  /* SUPPORT_UNICODE */
2482           otherd = TABLE_GET(d, fcc, d);
2483           }
2484         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2485           {
2486           if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2487             {
2488             active_count--;            /* Remove non-match possibility */
2489             next_active_state--;
2490             }
2491           ADD_NEW(state_offset + dlen + 1, 0);
2492           }
2493         }
2494       break;
2495 
2496       /*-----------------------------------------------------------------*/
2497       case OP_STARI:
2498       case OP_MINSTARI:
2499       case OP_POSSTARI:
2500       case OP_NOTSTARI:
2501       case OP_NOTMINSTARI:
2502       case OP_NOTPOSSTARI:
2503       caseless = TRUE;
2504       codevalue -= OP_STARI - OP_STAR;
2505       /* Fall through */
2506       case OP_STAR:
2507       case OP_MINSTAR:
2508       case OP_POSSTAR:
2509       case OP_NOTSTAR:
2510       case OP_NOTMINSTAR:
2511       case OP_NOTPOSSTAR:
2512       ADD_ACTIVE(state_offset + dlen + 1, 0);
2513       if (clen > 0)
2514         {
2515         uint32_t otherd = NOTACHAR;
2516         if (caseless)
2517           {
2518 #ifdef SUPPORT_UNICODE
2519           if (utf_or_ucp && d >= 128)
2520             otherd = UCD_OTHERCASE(d);
2521           else
2522 #endif  /* SUPPORT_UNICODE */
2523           otherd = TABLE_GET(d, fcc, d);
2524           }
2525         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2526           {
2527           if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2528             {
2529             active_count--;            /* Remove non-match possibility */
2530             next_active_state--;
2531             }
2532           ADD_NEW(state_offset, 0);
2533           }
2534         }
2535       break;
2536 
2537       /*-----------------------------------------------------------------*/
2538       case OP_EXACTI:
2539       case OP_NOTEXACTI:
2540       caseless = TRUE;
2541       codevalue -= OP_STARI - OP_STAR;
2542       /* Fall through */
2543       case OP_EXACT:
2544       case OP_NOTEXACT:
2545       count = current_state->count;  /* Number already matched */
2546       if (clen > 0)
2547         {
2548         uint32_t otherd = NOTACHAR;
2549         if (caseless)
2550           {
2551 #ifdef SUPPORT_UNICODE
2552           if (utf_or_ucp && d >= 128)
2553             otherd = UCD_OTHERCASE(d);
2554           else
2555 #endif  /* SUPPORT_UNICODE */
2556           otherd = TABLE_GET(d, fcc, d);
2557           }
2558         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2559           {
2560           if (++count >= (int)GET2(code, 1))
2561             { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2562           else
2563             { ADD_NEW(state_offset, count); }
2564           }
2565         }
2566       break;
2567 
2568       /*-----------------------------------------------------------------*/
2569       case OP_UPTOI:
2570       case OP_MINUPTOI:
2571       case OP_POSUPTOI:
2572       case OP_NOTUPTOI:
2573       case OP_NOTMINUPTOI:
2574       case OP_NOTPOSUPTOI:
2575       caseless = TRUE;
2576       codevalue -= OP_STARI - OP_STAR;
2577       /* Fall through */
2578       case OP_UPTO:
2579       case OP_MINUPTO:
2580       case OP_POSUPTO:
2581       case OP_NOTUPTO:
2582       case OP_NOTMINUPTO:
2583       case OP_NOTPOSUPTO:
2584       ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2585       count = current_state->count;  /* Number already matched */
2586       if (clen > 0)
2587         {
2588         uint32_t otherd = NOTACHAR;
2589         if (caseless)
2590           {
2591 #ifdef SUPPORT_UNICODE
2592           if (utf_or_ucp && d >= 128)
2593             otherd = UCD_OTHERCASE(d);
2594           else
2595 #endif  /* SUPPORT_UNICODE */
2596           otherd = TABLE_GET(d, fcc, d);
2597           }
2598         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2599           {
2600           if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2601             {
2602             active_count--;             /* Remove non-match possibility */
2603             next_active_state--;
2604             }
2605           if (++count >= (int)GET2(code, 1))
2606             { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2607           else
2608             { ADD_NEW(state_offset, count); }
2609           }
2610         }
2611       break;
2612 
2613 
2614 /* ========================================================================== */
2615       /* These are the class-handling opcodes */
2616 
2617       case OP_CLASS:
2618       case OP_NCLASS:
2619       case OP_XCLASS:
2620         {
2621         BOOL isinclass = FALSE;
2622         int next_state_offset;
2623         PCRE2_SPTR ecode;
2624 
2625         /* For a simple class, there is always just a 32-byte table, and we
2626         can set isinclass from it. */
2627 
2628         if (codevalue != OP_XCLASS)
2629           {
2630           ecode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
2631           if (clen > 0)
2632             {
2633             isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2634               ((((uint8_t *)(code + 1))[c/8] & (1u << (c&7))) != 0);
2635             }
2636           }
2637 
2638         /* An extended class may have a table or a list of single characters,
2639         ranges, or both, and it may be positive or negative. There's a
2640         function that sorts all this out. */
2641 
2642         else
2643          {
2644          ecode = code + GET(code, 1);
2645          if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2646          }
2647 
2648         /* At this point, isinclass is set for all kinds of class, and ecode
2649         points to the byte after the end of the class. If there is a
2650         quantifier, this is where it will be. */
2651 
2652         next_state_offset = (int)(ecode - start_code);
2653 
2654         switch (*ecode)
2655           {
2656           case OP_CRSTAR:
2657           case OP_CRMINSTAR:
2658           case OP_CRPOSSTAR:
2659           ADD_ACTIVE(next_state_offset + 1, 0);
2660           if (isinclass)
2661             {
2662             if (*ecode == OP_CRPOSSTAR)
2663               {
2664               active_count--;           /* Remove non-match possibility */
2665               next_active_state--;
2666               }
2667             ADD_NEW(state_offset, 0);
2668             }
2669           break;
2670 
2671           case OP_CRPLUS:
2672           case OP_CRMINPLUS:
2673           case OP_CRPOSPLUS:
2674           count = current_state->count;  /* Already matched */
2675           if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2676           if (isinclass)
2677             {
2678             if (count > 0 && *ecode == OP_CRPOSPLUS)
2679               {
2680               active_count--;           /* Remove non-match possibility */
2681               next_active_state--;
2682               }
2683             count++;
2684             ADD_NEW(state_offset, count);
2685             }
2686           break;
2687 
2688           case OP_CRQUERY:
2689           case OP_CRMINQUERY:
2690           case OP_CRPOSQUERY:
2691           ADD_ACTIVE(next_state_offset + 1, 0);
2692           if (isinclass)
2693             {
2694             if (*ecode == OP_CRPOSQUERY)
2695               {
2696               active_count--;           /* Remove non-match possibility */
2697               next_active_state--;
2698               }
2699             ADD_NEW(next_state_offset + 1, 0);
2700             }
2701           break;
2702 
2703           case OP_CRRANGE:
2704           case OP_CRMINRANGE:
2705           case OP_CRPOSRANGE:
2706           count = current_state->count;  /* Already matched */
2707           if (count >= (int)GET2(ecode, 1))
2708             { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2709           if (isinclass)
2710             {
2711             int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2712 
2713             if (*ecode == OP_CRPOSRANGE && count >= (int)GET2(ecode, 1))
2714               {
2715               active_count--;           /* Remove non-match possibility */
2716               next_active_state--;
2717               }
2718 
2719             if (++count >= max && max != 0)   /* Max 0 => no limit */
2720               { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2721             else
2722               { ADD_NEW(state_offset, count); }
2723             }
2724           break;
2725 
2726           default:
2727           if (isinclass) { ADD_NEW(next_state_offset, 0); }
2728           break;
2729           }
2730         }
2731       break;
2732 
2733 /* ========================================================================== */
2734       /* These are the opcodes for fancy brackets of various kinds. We have
2735       to use recursion in order to handle them. The "always failing" assertion
2736       (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2737       though the other "backtracking verbs" are not supported. */
2738 
2739       case OP_FAIL:
2740       forced_fail++;    /* Count FAILs for multiple states */
2741       break;
2742 
2743       case OP_ASSERT:
2744       case OP_ASSERT_NOT:
2745       case OP_ASSERTBACK:
2746       case OP_ASSERTBACK_NOT:
2747         {
2748         int rc;
2749         int *local_workspace;
2750         PCRE2_SIZE *local_offsets;
2751         PCRE2_SPTR endasscode = code + GET(code, 1);
2752         RWS_anchor *rws = (RWS_anchor *)RWS;
2753 
2754         if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2755           {
2756           rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2757           if (rc != 0) return rc;
2758           RWS = (int *)rws;
2759           }
2760 
2761         local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2762         local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2763         rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2764 
2765         while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2766 
2767         rc = internal_dfa_match(
2768           mb,                                   /* static match data */
2769           code,                                 /* this subexpression's code */
2770           ptr,                                  /* where we currently are */
2771           (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2772           local_offsets,                        /* offset vector */
2773           RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
2774           local_workspace,                      /* workspace vector */
2775           RWS_RSIZE,                            /* size of same */
2776           rlevel,                               /* function recursion level */
2777           RWS);                                 /* recursion workspace */
2778 
2779         rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2780 
2781         if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2782         if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2783             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2784         }
2785       break;
2786 
2787       /*-----------------------------------------------------------------*/
2788       case OP_COND:
2789       case OP_SCOND:
2790         {
2791         int codelink = (int)GET(code, 1);
2792         PCRE2_UCHAR condcode;
2793 
2794         /* Because of the way auto-callout works during compile, a callout item
2795         is inserted between OP_COND and an assertion condition. This does not
2796         happen for the other conditions. */
2797 
2798         if (code[LINK_SIZE + 1] == OP_CALLOUT
2799             || code[LINK_SIZE + 1] == OP_CALLOUT_STR)
2800           {
2801           PCRE2_SIZE callout_length;
2802           rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb,
2803             1 + LINK_SIZE, &callout_length);
2804           if (rrc < 0) return rrc;                 /* Abandon */
2805           if (rrc > 0) break;                      /* Fail this thread */
2806           code += callout_length;                  /* Skip callout data */
2807           }
2808 
2809         condcode = code[LINK_SIZE+1];
2810 
2811         /* Back reference conditions and duplicate named recursion conditions
2812         are not supported */
2813 
2814         if (condcode == OP_CREF || condcode == OP_DNCREF ||
2815             condcode == OP_DNRREF)
2816           return PCRE2_ERROR_DFA_UCOND;
2817 
2818         /* The DEFINE condition is always false, and the assertion (?!) is
2819         converted to OP_FAIL. */
2820 
2821         if (condcode == OP_FALSE || condcode == OP_FAIL)
2822           { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2823 
2824         /* There is also an always-true condition */
2825 
2826         else if (condcode == OP_TRUE)
2827           { ADD_ACTIVE(state_offset + LINK_SIZE + 2, 0); }
2828 
2829         /* The only supported version of OP_RREF is for the value RREF_ANY,
2830         which means "test if in any recursion". We can't test for specifically
2831         recursed groups. */
2832 
2833         else if (condcode == OP_RREF)
2834           {
2835           unsigned int value = GET2(code, LINK_SIZE + 2);
2836           if (value != RREF_ANY) return PCRE2_ERROR_DFA_UCOND;
2837           if (mb->recursive != NULL)
2838             { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2839           else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2840           }
2841 
2842         /* Otherwise, the condition is an assertion */
2843 
2844         else
2845           {
2846           int rc;
2847           int *local_workspace;
2848           PCRE2_SIZE *local_offsets;
2849           PCRE2_SPTR asscode = code + LINK_SIZE + 1;
2850           PCRE2_SPTR endasscode = asscode + GET(asscode, 1);
2851           RWS_anchor *rws = (RWS_anchor *)RWS;
2852 
2853           if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2854             {
2855             rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2856             if (rc != 0) return rc;
2857             RWS = (int *)rws;
2858             }
2859 
2860           local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2861           local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2862           rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2863 
2864           while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2865 
2866           rc = internal_dfa_match(
2867             mb,                                   /* fixed match data */
2868             asscode,                              /* this subexpression's code */
2869             ptr,                                  /* where we currently are */
2870             (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2871             local_offsets,                        /* offset vector */
2872             RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
2873             local_workspace,                      /* workspace vector */
2874             RWS_RSIZE,                            /* size of same */
2875             rlevel,                               /* function recursion level */
2876             RWS);                                 /* recursion workspace */
2877 
2878           rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2879 
2880           if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2881           if ((rc >= 0) ==
2882                 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2883             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2884           else
2885             { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2886           }
2887         }
2888       break;
2889 
2890       /*-----------------------------------------------------------------*/
2891       case OP_RECURSE:
2892         {
2893         int rc;
2894         int *local_workspace;
2895         PCRE2_SIZE *local_offsets;
2896         RWS_anchor *rws = (RWS_anchor *)RWS;
2897         dfa_recursion_info *ri;
2898         PCRE2_SPTR callpat = start_code + GET(code, 1);
2899         uint32_t recno = (callpat == mb->start_code)? 0 :
2900           GET2(callpat, 1 + LINK_SIZE);
2901 
2902         if (rws->free < RWS_RSIZE + RWS_OVEC_RSIZE)
2903           {
2904           rc = more_workspace(&rws, RWS_OVEC_RSIZE, mb);
2905           if (rc != 0) return rc;
2906           RWS = (int *)rws;
2907           }
2908 
2909         local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2910         local_workspace = ((int *)local_offsets) + RWS_OVEC_RSIZE;
2911         rws->free -= RWS_RSIZE + RWS_OVEC_RSIZE;
2912 
2913         /* Check for repeating a recursion without advancing the subject
2914         pointer. This should catch convoluted mutual recursions. (Some simple
2915         cases are caught at compile time.) */
2916 
2917         for (ri = mb->recursive; ri != NULL; ri = ri->prevrec)
2918           if (recno == ri->group_num && ptr == ri->subject_position)
2919             return PCRE2_ERROR_RECURSELOOP;
2920 
2921         /* Remember this recursion and where we started it so as to
2922         catch infinite loops. */
2923 
2924         new_recursive.group_num = recno;
2925         new_recursive.subject_position = ptr;
2926         new_recursive.prevrec = mb->recursive;
2927         mb->recursive = &new_recursive;
2928 
2929         rc = internal_dfa_match(
2930           mb,                                   /* fixed match data */
2931           callpat,                              /* this subexpression's code */
2932           ptr,                                  /* where we currently are */
2933           (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2934           local_offsets,                        /* offset vector */
2935           RWS_OVEC_RSIZE/OVEC_UNIT,             /* size of same */
2936           local_workspace,                      /* workspace vector */
2937           RWS_RSIZE,                            /* size of same */
2938           rlevel,                               /* function recursion level */
2939           RWS);                                 /* recursion workspace */
2940 
2941         rws->free += RWS_RSIZE + RWS_OVEC_RSIZE;
2942         mb->recursive = new_recursive.prevrec;  /* Done this recursion */
2943 
2944         /* Ran out of internal offsets */
2945 
2946         if (rc == 0) return PCRE2_ERROR_DFA_RECURSE;
2947 
2948         /* For each successful matched substring, set up the next state with a
2949         count of characters to skip before trying it. Note that the count is in
2950         characters, not bytes. */
2951 
2952         if (rc > 0)
2953           {
2954           for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2955             {
2956             PCRE2_SIZE charcount = local_offsets[rc+1] - local_offsets[rc];
2957 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
2958             if (utf)
2959               {
2960               PCRE2_SPTR p = start_subject + local_offsets[rc];
2961               PCRE2_SPTR pp = start_subject + local_offsets[rc+1];
2962               while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
2963               }
2964 #endif
2965             if (charcount > 0)
2966               {
2967               ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0,
2968                 (int)(charcount - 1));
2969               }
2970             else
2971               {
2972               ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2973               }
2974             }
2975           }
2976         else if (rc != PCRE2_ERROR_NOMATCH) return rc;
2977         }
2978       break;
2979 
2980       /*-----------------------------------------------------------------*/
2981       case OP_BRAPOS:
2982       case OP_SBRAPOS:
2983       case OP_CBRAPOS:
2984       case OP_SCBRAPOS:
2985       case OP_BRAPOSZERO:
2986         {
2987         int rc;
2988         int *local_workspace;
2989         PCRE2_SIZE *local_offsets;
2990         PCRE2_SIZE charcount, matched_count;
2991         PCRE2_SPTR local_ptr = ptr;
2992         RWS_anchor *rws = (RWS_anchor *)RWS;
2993         BOOL allow_zero;
2994 
2995         if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2996           {
2997           rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2998           if (rc != 0) return rc;
2999           RWS = (int *)rws;
3000           }
3001 
3002         local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
3003         local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
3004         rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
3005 
3006         if (codevalue == OP_BRAPOSZERO)
3007           {
3008           allow_zero = TRUE;
3009           codevalue = *(++code);  /* Codevalue will be one of above BRAs */
3010           }
3011         else allow_zero = FALSE;
3012 
3013         /* Loop to match the subpattern as many times as possible as if it were
3014         a complete pattern. */
3015 
3016         for (matched_count = 0;; matched_count++)
3017           {
3018           rc = internal_dfa_match(
3019             mb,                                   /* fixed match data */
3020             code,                                 /* this subexpression's code */
3021             local_ptr,                            /* where we currently are */
3022             (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
3023             local_offsets,                        /* offset vector */
3024             RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
3025             local_workspace,                      /* workspace vector */
3026             RWS_RSIZE,                            /* size of same */
3027             rlevel,                               /* function recursion level */
3028             RWS);                                 /* recursion workspace */
3029 
3030           /* Failed to match */
3031 
3032           if (rc < 0)
3033             {
3034             if (rc != PCRE2_ERROR_NOMATCH) return rc;
3035             break;
3036             }
3037 
3038           /* Matched: break the loop if zero characters matched. */
3039 
3040           charcount = local_offsets[1] - local_offsets[0];
3041           if (charcount == 0) break;
3042           local_ptr += charcount;    /* Advance temporary position ptr */
3043           }
3044 
3045         rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
3046 
3047         /* At this point we have matched the subpattern matched_count
3048         times, and local_ptr is pointing to the character after the end of the
3049         last match. */
3050 
3051         if (matched_count > 0 || allow_zero)
3052           {
3053           PCRE2_SPTR end_subpattern = code;
3054           int next_state_offset;
3055 
3056           do { end_subpattern += GET(end_subpattern, 1); }
3057             while (*end_subpattern == OP_ALT);
3058           next_state_offset =
3059             (int)(end_subpattern - start_code + LINK_SIZE + 1);
3060 
3061           /* Optimization: if there are no more active states, and there
3062           are no new states yet set up, then skip over the subject string
3063           right here, to save looping. Otherwise, set up the new state to swing
3064           into action when the end of the matched substring is reached. */
3065 
3066           if (i + 1 >= active_count && new_count == 0)
3067             {
3068             ptr = local_ptr;
3069             clen = 0;
3070             ADD_NEW(next_state_offset, 0);
3071             }
3072           else
3073             {
3074             PCRE2_SPTR p = ptr;
3075             PCRE2_SPTR pp = local_ptr;
3076             charcount = (PCRE2_SIZE)(pp - p);
3077 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3078             if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3079 #endif
3080             ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3081             }
3082           }
3083         }
3084       break;
3085 
3086       /*-----------------------------------------------------------------*/
3087       case OP_ONCE:
3088         {
3089         int rc;
3090         int *local_workspace;
3091         PCRE2_SIZE *local_offsets;
3092         RWS_anchor *rws = (RWS_anchor *)RWS;
3093 
3094         if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
3095           {
3096           rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
3097           if (rc != 0) return rc;
3098           RWS = (int *)rws;
3099           }
3100 
3101         local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
3102         local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
3103         rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
3104 
3105         rc = internal_dfa_match(
3106           mb,                                   /* fixed match data */
3107           code,                                 /* this subexpression's code */
3108           ptr,                                  /* where we currently are */
3109           (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
3110           local_offsets,                        /* offset vector */
3111           RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
3112           local_workspace,                      /* workspace vector */
3113           RWS_RSIZE,                            /* size of same */
3114           rlevel,                               /* function recursion level */
3115           RWS);                                 /* recursion workspace */
3116 
3117         rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
3118 
3119         if (rc >= 0)
3120           {
3121           PCRE2_SPTR end_subpattern = code;
3122           PCRE2_SIZE charcount = local_offsets[1] - local_offsets[0];
3123           int next_state_offset, repeat_state_offset;
3124 
3125           do { end_subpattern += GET(end_subpattern, 1); }
3126             while (*end_subpattern == OP_ALT);
3127           next_state_offset =
3128             (int)(end_subpattern - start_code + LINK_SIZE + 1);
3129 
3130           /* If the end of this subpattern is KETRMAX or KETRMIN, we must
3131           arrange for the repeat state also to be added to the relevant list.
3132           Calculate the offset, or set -1 for no repeat. */
3133 
3134           repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
3135                                  *end_subpattern == OP_KETRMIN)?
3136             (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
3137 
3138           /* If we have matched an empty string, add the next state at the
3139           current character pointer. This is important so that the duplicate
3140           checking kicks in, which is what breaks infinite loops that match an
3141           empty string. */
3142 
3143           if (charcount == 0)
3144             {
3145             ADD_ACTIVE(next_state_offset, 0);
3146             }
3147 
3148           /* Optimization: if there are no more active states, and there
3149           are no new states yet set up, then skip over the subject string
3150           right here, to save looping. Otherwise, set up the new state to swing
3151           into action when the end of the matched substring is reached. */
3152 
3153           else if (i + 1 >= active_count && new_count == 0)
3154             {
3155             ptr += charcount;
3156             clen = 0;
3157             ADD_NEW(next_state_offset, 0);
3158 
3159             /* If we are adding a repeat state at the new character position,
3160             we must fudge things so that it is the only current state.
3161             Otherwise, it might be a duplicate of one we processed before, and
3162             that would cause it to be skipped. */
3163 
3164             if (repeat_state_offset >= 0)
3165               {
3166               next_active_state = active_states;
3167               active_count = 0;
3168               i = -1;
3169               ADD_ACTIVE(repeat_state_offset, 0);
3170               }
3171             }
3172           else
3173             {
3174 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3175             if (utf)
3176               {
3177               PCRE2_SPTR p = start_subject + local_offsets[0];
3178               PCRE2_SPTR pp = start_subject + local_offsets[1];
3179               while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3180               }
3181 #endif
3182             ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3183             if (repeat_state_offset >= 0)
3184               { ADD_NEW_DATA(-repeat_state_offset, 0, (int)(charcount - 1)); }
3185             }
3186           }
3187         else if (rc != PCRE2_ERROR_NOMATCH) return rc;
3188         }
3189       break;
3190 
3191 
3192 /* ========================================================================== */
3193       /* Handle callouts */
3194 
3195       case OP_CALLOUT:
3196       case OP_CALLOUT_STR:
3197         {
3198         PCRE2_SIZE callout_length;
3199         rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb, 0,
3200           &callout_length);
3201         if (rrc < 0) return rrc;   /* Abandon */
3202         if (rrc == 0)
3203           { ADD_ACTIVE(state_offset + (int)callout_length, 0); }
3204         }
3205       break;
3206 
3207 
3208 /* ========================================================================== */
3209       default:        /* Unsupported opcode */
3210       return PCRE2_ERROR_DFA_UITEM;
3211       }
3212 
3213     NEXT_ACTIVE_STATE: continue;
3214 
3215     }      /* End of loop scanning active states */
3216 
3217   /* We have finished the processing at the current subject character. If no
3218   new states have been set for the next character, we have found all the
3219   matches that we are going to find. If partial matching has been requested,
3220   check for appropriate conditions.
3221 
3222   The "forced_fail" variable counts the number of (*F) encountered for the
3223   character. If it is equal to the original active_count (saved in
3224   workspace[1]) it means that (*F) was found on every active state. In this
3225   case we don't want to give a partial match.
3226 
3227   The "could_continue" variable is true if a state could have continued but
3228   for the fact that the end of the subject was reached. */
3229 
3230   if (new_count <= 0)
3231     {
3232     if (could_continue &&                            /* Some could go on, and */
3233         forced_fail != workspace[1] &&               /* Not all forced fail & */
3234         (                                            /* either... */
3235         (mb->moptions & PCRE2_PARTIAL_HARD) != 0      /* Hard partial */
3236         ||                                           /* or... */
3237         ((mb->moptions & PCRE2_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
3238          match_count < 0)                             /* no matches */
3239         ) &&                                         /* And... */
3240         (
3241         partial_newline ||                   /* Either partial NL */
3242           (                                  /* or ... */
3243           ptr >= end_subject &&              /* End of subject and */
3244             (                                  /* either */
3245             ptr > mb->start_used_ptr ||        /* Inspected non-empty string */
3246             mb->allowemptypartial              /* or pattern has lookbehind */
3247             )                                  /* or could match empty */
3248           )
3249         ))
3250       match_count = PCRE2_ERROR_PARTIAL;
3251     break;  /* Exit from loop along the subject string */
3252     }
3253 
3254   /* One or more states are active for the next character. */
3255 
3256   ptr += clen;    /* Advance to next subject character */
3257   }               /* Loop to move along the subject string */
3258 
3259 /* Control gets here from "break" a few lines above. If we have a match and
3260 PCRE2_ENDANCHORED is set, the match fails. */
3261 
3262 if (match_count >= 0 &&
3263     ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0 &&
3264     ptr < end_subject)
3265   match_count = PCRE2_ERROR_NOMATCH;
3266 
3267 return match_count;
3268 }
3269 
3270 
3271 
3272 /*************************************************
3273 *     Match a pattern using the DFA algorithm    *
3274 *************************************************/
3275 
3276 /* This function matches a compiled pattern to a subject string, using the
3277 alternate matching algorithm that finds all matches at once.
3278 
3279 Arguments:
3280   code          points to the compiled pattern
3281   subject       subject string
3282   length        length of subject string
3283   startoffset   where to start matching in the subject
3284   options       option bits
3285   match_data    points to a match data structure
3286   gcontext      points to a match context
3287   workspace     pointer to workspace
3288   wscount       size of workspace
3289 
3290 Returns:        > 0 => number of match offset pairs placed in offsets
3291                 = 0 => offsets overflowed; longest matches are present
3292                  -1 => failed to match
3293                < -1 => some kind of unexpected problem
3294 */
3295 
3296 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_dfa_match(const pcre2_code * code,PCRE2_SPTR subject,PCRE2_SIZE length,PCRE2_SIZE start_offset,uint32_t options,pcre2_match_data * match_data,pcre2_match_context * mcontext,int * workspace,PCRE2_SIZE wscount)3297 pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
3298   PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
3299   pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount)
3300 {
3301 int rc;
3302 int was_zero_terminated = 0;
3303 
3304 const pcre2_real_code *re = (const pcre2_real_code *)code;
3305 
3306 PCRE2_SPTR start_match;
3307 PCRE2_SPTR end_subject;
3308 PCRE2_SPTR bumpalong_limit;
3309 PCRE2_SPTR req_cu_ptr;
3310 
3311 BOOL utf, anchored, startline, firstline;
3312 BOOL has_first_cu = FALSE;
3313 BOOL has_req_cu = FALSE;
3314 
3315 #if PCRE2_CODE_UNIT_WIDTH == 8
3316 PCRE2_SPTR memchr_found_first_cu = NULL;
3317 PCRE2_SPTR memchr_found_first_cu2 = NULL;
3318 #endif
3319 
3320 PCRE2_UCHAR first_cu = 0;
3321 PCRE2_UCHAR first_cu2 = 0;
3322 PCRE2_UCHAR req_cu = 0;
3323 PCRE2_UCHAR req_cu2 = 0;
3324 
3325 const uint8_t *start_bits = NULL;
3326 
3327 /* We need to have mb pointing to a match block, because the IS_NEWLINE macro
3328 is used below, and it expects NLBLOCK to be defined as a pointer. */
3329 
3330 pcre2_callout_block cb;
3331 dfa_match_block actual_match_block;
3332 dfa_match_block *mb = &actual_match_block;
3333 
3334 /* Set up a starting block of memory for use during recursive calls to
3335 internal_dfa_match(). By putting this on the stack, it minimizes resource use
3336 in the case when it is not needed. If this is too small, more memory is
3337 obtained from the heap. At the start of each block is an anchor structure.*/
3338 
3339 int base_recursion_workspace[RWS_BASE_SIZE];
3340 RWS_anchor *rws = (RWS_anchor *)base_recursion_workspace;
3341 rws->next = NULL;
3342 rws->size = RWS_BASE_SIZE;
3343 rws->free = RWS_BASE_SIZE - RWS_ANCHOR_SIZE;
3344 
3345 /* Recognize NULL, length 0 as an empty string. */
3346 
3347 if (subject == NULL && length == 0) subject = (PCRE2_SPTR)"";
3348 
3349 /* Plausibility checks */
3350 
3351 if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
3352 if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL)
3353   return PCRE2_ERROR_NULL;
3354 
3355 if (length == PCRE2_ZERO_TERMINATED)
3356   {
3357   length = PRIV(strlen)(subject);
3358   was_zero_terminated = 1;
3359   }
3360 
3361 if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE;
3362 if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
3363 
3364 /* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same
3365 time. */
3366 
3367 if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
3368    ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
3369   return PCRE2_ERROR_BADOPTION;
3370 
3371 /* Invalid UTF support is not available for DFA matching. */
3372 
3373 if ((re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0)
3374   return PCRE2_ERROR_DFA_UINVALID_UTF;
3375 
3376 /* Check that the first field in the block is the magic number. If it is not,
3377 return with PCRE2_ERROR_BADMAGIC. */
3378 
3379 if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
3380 
3381 /* Check the code unit width. */
3382 
3383 if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
3384   return PCRE2_ERROR_BADMODE;
3385 
3386 /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
3387 options variable for this function. Users of PCRE2 who are not calling the
3388 function directly would like to have a way of setting these flags, in the same
3389 way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with
3390 constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
3391 (*NOTEMPTY_ATSTART) set bits in the pattern's "flag" function which can now be
3392 transferred to the options for this function. The bits are guaranteed to be
3393 adjacent, but do not have the same values. This bit of Boolean trickery assumes
3394 that the match-time bits are not more significant than the flag bits. If by
3395 accident this is not the case, a compile-time division by zero error will
3396 occur. */
3397 
3398 #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
3399 #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
3400 options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
3401 #undef FF
3402 #undef OO
3403 
3404 /* If restarting after a partial match, do some sanity checks on the contents
3405 of the workspace. */
3406 
3407 if ((options & PCRE2_DFA_RESTART) != 0)
3408   {
3409   if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3410     workspace[1] > (int)((wscount - 2)/INTS_PER_STATEBLOCK))
3411       return PCRE2_ERROR_DFA_BADRESTART;
3412   }
3413 
3414 /* Set some local values */
3415 
3416 utf = (re->overall_options & PCRE2_UTF) != 0;
3417 start_match = subject + start_offset;
3418 end_subject = subject + length;
3419 req_cu_ptr = start_match - 1;
3420 anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 ||
3421   (re->overall_options & PCRE2_ANCHORED) != 0;
3422 
3423 /* The "must be at the start of a line" flags are used in a loop when finding
3424 where to start. */
3425 
3426 startline = (re->flags & PCRE2_STARTLINE) != 0;
3427 firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;
3428 bumpalong_limit = end_subject;
3429 
3430 /* Initialize and set up the fixed fields in the callout block, with a pointer
3431 in the match block. */
3432 
3433 mb->cb = &cb;
3434 cb.version = 2;
3435 cb.subject = subject;
3436 cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
3437 cb.callout_flags = 0;
3438 cb.capture_top      = 1;      /* No capture support */
3439 cb.capture_last     = 0;
3440 cb.mark             = NULL;   /* No (*MARK) support */
3441 
3442 /* Get data from the match context, if present, and fill in the remaining
3443 fields in the match block. It is an error to set an offset limit without
3444 setting the flag at compile time. */
3445 
3446 if (mcontext == NULL)
3447   {
3448   mb->callout = NULL;
3449   mb->memctl = re->memctl;
3450   mb->match_limit = PRIV(default_match_context).match_limit;
3451   mb->match_limit_depth = PRIV(default_match_context).depth_limit;
3452   mb->heap_limit = PRIV(default_match_context).heap_limit;
3453   }
3454 else
3455   {
3456   if (mcontext->offset_limit != PCRE2_UNSET)
3457     {
3458     if ((re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
3459       return PCRE2_ERROR_BADOFFSETLIMIT;
3460     bumpalong_limit = subject + mcontext->offset_limit;
3461     }
3462   mb->callout = mcontext->callout;
3463   mb->callout_data = mcontext->callout_data;
3464   mb->memctl = mcontext->memctl;
3465   mb->match_limit = mcontext->match_limit;
3466   mb->match_limit_depth = mcontext->depth_limit;
3467   mb->heap_limit = mcontext->heap_limit;
3468   }
3469 
3470 if (mb->match_limit > re->limit_match)
3471   mb->match_limit = re->limit_match;
3472 
3473 if (mb->match_limit_depth > re->limit_depth)
3474   mb->match_limit_depth = re->limit_depth;
3475 
3476 if (mb->heap_limit > re->limit_heap)
3477   mb->heap_limit = re->limit_heap;
3478 
3479 mb->start_code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
3480   re->name_count * re->name_entry_size;
3481 mb->tables = re->tables;
3482 mb->start_subject = subject;
3483 mb->end_subject = end_subject;
3484 mb->start_offset = start_offset;
3485 mb->allowemptypartial = (re->max_lookbehind > 0) ||
3486   (re->flags & PCRE2_MATCH_EMPTY) != 0;
3487 mb->moptions = options;
3488 mb->poptions = re->overall_options;
3489 mb->match_call_count = 0;
3490 mb->heap_used = 0;
3491 
3492 /* Process the \R and newline settings. */
3493 
3494 mb->bsr_convention = re->bsr_convention;
3495 mb->nltype = NLTYPE_FIXED;
3496 switch(re->newline_convention)
3497   {
3498   case PCRE2_NEWLINE_CR:
3499   mb->nllen = 1;
3500   mb->nl[0] = CHAR_CR;
3501   break;
3502 
3503   case PCRE2_NEWLINE_LF:
3504   mb->nllen = 1;
3505   mb->nl[0] = CHAR_NL;
3506   break;
3507 
3508   case PCRE2_NEWLINE_NUL:
3509   mb->nllen = 1;
3510   mb->nl[0] = CHAR_NUL;
3511   break;
3512 
3513   case PCRE2_NEWLINE_CRLF:
3514   mb->nllen = 2;
3515   mb->nl[0] = CHAR_CR;
3516   mb->nl[1] = CHAR_NL;
3517   break;
3518 
3519   case PCRE2_NEWLINE_ANY:
3520   mb->nltype = NLTYPE_ANY;
3521   break;
3522 
3523   case PCRE2_NEWLINE_ANYCRLF:
3524   mb->nltype = NLTYPE_ANYCRLF;
3525   break;
3526 
3527   default: return PCRE2_ERROR_INTERNAL;
3528   }
3529 
3530 /* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
3531 we must also check that a starting offset does not point into the middle of a
3532 multiunit character. We check only the portion of the subject that is going to
3533 be inspected during matching - from the offset minus the maximum back reference
3534 to the given length. This saves time when a small part of a large subject is
3535 being matched by the use of a starting offset. Note that the maximum lookbehind
3536 is a number of characters, not code units. */
3537 
3538 #ifdef SUPPORT_UNICODE
3539 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
3540   {
3541   PCRE2_SPTR check_subject = start_match;  /* start_match includes offset */
3542 
3543   if (start_offset > 0)
3544     {
3545 #if PCRE2_CODE_UNIT_WIDTH != 32
3546     unsigned int i;
3547     if (start_match < end_subject && NOT_FIRSTCU(*start_match))
3548       return PCRE2_ERROR_BADUTFOFFSET;
3549     for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
3550       {
3551       check_subject--;
3552       while (check_subject > subject &&
3553 #if PCRE2_CODE_UNIT_WIDTH == 8
3554       (*check_subject & 0xc0) == 0x80)
3555 #else  /* 16-bit */
3556       (*check_subject & 0xfc00) == 0xdc00)
3557 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
3558         check_subject--;
3559       }
3560 #else   /* In the 32-bit library, one code unit equals one character. */
3561     check_subject -= re->max_lookbehind;
3562     if (check_subject < subject) check_subject = subject;
3563 #endif  /* PCRE2_CODE_UNIT_WIDTH != 32 */
3564     }
3565 
3566   /* Validate the relevant portion of the subject. After an error, adjust the
3567   offset to be an absolute offset in the whole string. */
3568 
3569   match_data->rc = PRIV(valid_utf)(check_subject,
3570     length - (PCRE2_SIZE)(check_subject - subject), &(match_data->startchar));
3571   if (match_data->rc != 0)
3572     {
3573     match_data->startchar += (PCRE2_SIZE)(check_subject - subject);
3574     return match_data->rc;
3575     }
3576   }
3577 #endif  /* SUPPORT_UNICODE */
3578 
3579 /* Set up the first code unit to match, if available. If there's no first code
3580 unit there may be a bitmap of possible first characters. */
3581 
3582 if ((re->flags & PCRE2_FIRSTSET) != 0)
3583   {
3584   has_first_cu = TRUE;
3585   first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
3586   if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
3587     {
3588     first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
3589 #ifdef SUPPORT_UNICODE
3590 #if PCRE2_CODE_UNIT_WIDTH == 8
3591     if (first_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
3592       first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3593 #else
3594     if (first_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
3595       first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3596 #endif
3597 #endif  /* SUPPORT_UNICODE */
3598     }
3599   }
3600 else
3601   if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
3602     start_bits = re->start_bitmap;
3603 
3604 /* There may be a "last known required code unit" set. */
3605 
3606 if ((re->flags & PCRE2_LASTSET) != 0)
3607   {
3608   has_req_cu = TRUE;
3609   req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
3610   if ((re->flags & PCRE2_LASTCASELESS) != 0)
3611     {
3612     req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
3613 #ifdef SUPPORT_UNICODE
3614 #if PCRE2_CODE_UNIT_WIDTH == 8
3615     if (req_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
3616       req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3617 #else
3618     if (req_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
3619       req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3620 #endif
3621 #endif  /* SUPPORT_UNICODE */
3622     }
3623   }
3624 
3625 /* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT,
3626 free the memory that was obtained. */
3627 
3628 if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0)
3629   {
3630   match_data->memctl.free((void *)match_data->subject,
3631     match_data->memctl.memory_data);
3632   match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT;
3633   }
3634 
3635 /* Fill in fields that are always returned in the match data. */
3636 
3637 match_data->code = re;
3638 match_data->subject = NULL;  /* Default for no match */
3639 match_data->mark = NULL;
3640 match_data->matchedby = PCRE2_MATCHEDBY_DFA_INTERPRETER;
3641 
3642 /* Call the main matching function, looping for a non-anchored regex after a
3643 failed match. If not restarting, perform certain optimizations at the start of
3644 a match. */
3645 
3646 for (;;)
3647   {
3648   /* ----------------- Start of match optimizations ---------------- */
3649 
3650   /* There are some optimizations that avoid running the match if a known
3651   starting point is not found, or if a known later code unit is not present.
3652   However, there is an option (settable at compile time) that disables
3653   these, for testing and for ensuring that all callouts do actually occur.
3654   The optimizations must also be avoided when restarting a DFA match. */
3655 
3656   if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
3657       (options & PCRE2_DFA_RESTART) == 0)
3658     {
3659     /* If firstline is TRUE, the start of the match is constrained to the first
3660     line of a multiline string. That is, the match must be before or at the
3661     first newline following the start of matching. Temporarily adjust
3662     end_subject so that we stop the optimization scans for a first code unit
3663     immediately after the first character of a newline (the first code unit can
3664     legitimately be a newline). If the match fails at the newline, later code
3665     breaks this loop. */
3666 
3667     if (firstline)
3668       {
3669       PCRE2_SPTR t = start_match;
3670 #ifdef SUPPORT_UNICODE
3671       if (utf)
3672         {
3673         while (t < end_subject && !IS_NEWLINE(t))
3674           {
3675           t++;
3676           ACROSSCHAR(t < end_subject, t, t++);
3677           }
3678         }
3679       else
3680 #endif
3681       while (t < end_subject && !IS_NEWLINE(t)) t++;
3682       end_subject = t;
3683       }
3684 
3685     /* Anchored: check the first code unit if one is recorded. This may seem
3686     pointless but it can help in detecting a no match case without scanning for
3687     the required code unit. */
3688 
3689     if (anchored)
3690       {
3691       if (has_first_cu || start_bits != NULL)
3692         {
3693         BOOL ok = start_match < end_subject;
3694         if (ok)
3695           {
3696           PCRE2_UCHAR c = UCHAR21TEST(start_match);
3697           ok = has_first_cu && (c == first_cu || c == first_cu2);
3698           if (!ok && start_bits != NULL)
3699             {
3700 #if PCRE2_CODE_UNIT_WIDTH != 8
3701             if (c > 255) c = 255;
3702 #endif
3703             ok = (start_bits[c/8] & (1u << (c&7))) != 0;
3704             }
3705           }
3706         if (!ok) break;
3707         }
3708       }
3709 
3710     /* Not anchored. Advance to a unique first code unit if there is one. */
3711 
3712     else
3713       {
3714       if (has_first_cu)
3715         {
3716         if (first_cu != first_cu2)  /* Caseless */
3717           {
3718           /* In 16-bit and 32_bit modes we have to do our own search, so can
3719           look for both cases at once. */
3720 
3721 #if PCRE2_CODE_UNIT_WIDTH != 8
3722           PCRE2_UCHAR smc;
3723           while (start_match < end_subject &&
3724                 (smc = UCHAR21TEST(start_match)) != first_cu &&
3725                  smc != first_cu2)
3726             start_match++;
3727 #else
3728           /* In 8-bit mode, the use of memchr() gives a big speed up, even
3729           though we have to call it twice in order to find the earliest
3730           occurrence of the code unit in either of its cases. Caching is used
3731           to remember the positions of previously found code units. This can
3732           make a huge difference when the strings are very long and only one
3733           case is actually present. */
3734 
3735           PCRE2_SPTR pp1 = NULL;
3736           PCRE2_SPTR pp2 = NULL;
3737           PCRE2_SIZE searchlength = end_subject - start_match;
3738 
3739           /* If we haven't got a previously found position for first_cu, or if
3740           the current starting position is later, we need to do a search. If
3741           the code unit is not found, set it to the end. */
3742 
3743           if (memchr_found_first_cu == NULL ||
3744               start_match > memchr_found_first_cu)
3745             {
3746             pp1 = memchr(start_match, first_cu, searchlength);
3747             memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1;
3748             }
3749 
3750           /* If the start is before a previously found position, use the
3751           previous position, or NULL if a previous search failed. */
3752 
3753           else pp1 = (memchr_found_first_cu == end_subject)? NULL :
3754             memchr_found_first_cu;
3755 
3756           /* Do the same thing for the other case. */
3757 
3758           if (memchr_found_first_cu2 == NULL ||
3759               start_match > memchr_found_first_cu2)
3760             {
3761             pp2 = memchr(start_match, first_cu2, searchlength);
3762             memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2;
3763             }
3764 
3765           else pp2 = (memchr_found_first_cu2 == end_subject)? NULL :
3766             memchr_found_first_cu2;
3767 
3768           /* Set the start to the end of the subject if neither case was found.
3769           Otherwise, use the earlier found point. */
3770 
3771           if (pp1 == NULL)
3772             start_match = (pp2 == NULL)? end_subject : pp2;
3773           else
3774             start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
3775 
3776 #endif  /* 8-bit handling */
3777           }
3778 
3779         /* The caseful case is much simpler. */
3780 
3781         else
3782           {
3783 #if PCRE2_CODE_UNIT_WIDTH != 8
3784           while (start_match < end_subject && UCHAR21TEST(start_match) !=
3785                  first_cu)
3786             start_match++;
3787 #else  /* 8-bit code units */
3788           start_match = memchr(start_match, first_cu, end_subject - start_match);
3789           if (start_match == NULL) start_match = end_subject;
3790 #endif
3791           }
3792 
3793         /* If we can't find the required code unit, having reached the true end
3794         of the subject, break the bumpalong loop, to force a match failure,
3795         except when doing partial matching, when we let the next cycle run at
3796         the end of the subject. To see why, consider the pattern /(?<=abc)def/,
3797         which partially matches "abc", even though the string does not contain
3798         the starting character "d". If we have not reached the true end of the
3799         subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
3800         we also let the cycle run, because the matching string is legitimately
3801         allowed to start with the first code unit of a newline. */
3802 
3803         if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3804             start_match >= mb->end_subject)
3805           break;
3806         }
3807 
3808       /* If there's no first code unit, advance to just after a linebreak for a
3809       multiline match if required. */
3810 
3811       else if (startline)
3812         {
3813         if (start_match > mb->start_subject + start_offset)
3814           {
3815 #ifdef SUPPORT_UNICODE
3816           if (utf)
3817             {
3818             while (start_match < end_subject && !WAS_NEWLINE(start_match))
3819               {
3820               start_match++;
3821               ACROSSCHAR(start_match < end_subject, start_match, start_match++);
3822               }
3823             }
3824           else
3825 #endif
3826           while (start_match < end_subject && !WAS_NEWLINE(start_match))
3827             start_match++;
3828 
3829           /* If we have just passed a CR and the newline option is ANY or
3830           ANYCRLF, and we are now at a LF, advance the match position by one
3831           more code unit. */
3832 
3833           if (start_match[-1] == CHAR_CR &&
3834                (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
3835                start_match < end_subject &&
3836                UCHAR21TEST(start_match) == CHAR_NL)
3837             start_match++;
3838           }
3839         }
3840 
3841       /* If there's no first code unit or a requirement for a multiline line
3842       start, advance to a non-unique first code unit if any have been
3843       identified. The bitmap contains only 256 bits. When code units are 16 or
3844       32 bits wide, all code units greater than 254 set the 255 bit. */
3845 
3846       else if (start_bits != NULL)
3847         {
3848         while (start_match < end_subject)
3849           {
3850           uint32_t c = UCHAR21TEST(start_match);
3851 #if PCRE2_CODE_UNIT_WIDTH != 8
3852           if (c > 255) c = 255;
3853 #endif
3854           if ((start_bits[c/8] & (1u << (c&7))) != 0) break;
3855           start_match++;
3856           }
3857 
3858         /* See comment above in first_cu checking about the next line. */
3859 
3860         if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3861             start_match >= mb->end_subject)
3862           break;
3863         }
3864       }  /* End of first code unit handling */
3865 
3866     /* Restore fudged end_subject */
3867 
3868     end_subject = mb->end_subject;
3869 
3870     /* The following two optimizations are disabled for partial matching. */
3871 
3872     if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0)
3873       {
3874       PCRE2_SPTR p;
3875 
3876       /* The minimum matching length is a lower bound; no actual string of that
3877       length may actually match the pattern. Although the value is, strictly,
3878       in characters, we treat it as code units to avoid spending too much time
3879       in this optimization. */
3880 
3881       if (end_subject - start_match < re->minlength) goto NOMATCH_EXIT;
3882 
3883       /* If req_cu is set, we know that that code unit must appear in the
3884       subject for the match to succeed. If the first code unit is set, req_cu
3885       must be later in the subject; otherwise the test starts at the match
3886       point. This optimization can save a huge amount of backtracking in
3887       patterns with nested unlimited repeats that aren't going to match.
3888       Writing separate code for cased/caseless versions makes it go faster, as
3889       does using an autoincrement and backing off on a match. As in the case of
3890       the first code unit, using memchr() in the 8-bit library gives a big
3891       speed up. Unlike the first_cu check above, we do not need to call
3892       memchr() twice in the caseless case because we only need to check for the
3893       presence of the character in either case, not find the first occurrence.
3894 
3895       The search can be skipped if the code unit was found later than the
3896       current starting point in a previous iteration of the bumpalong loop.
3897 
3898       HOWEVER: when the subject string is very, very long, searching to its end
3899       can take a long time, and give bad performance on quite ordinary
3900       patterns. This showed up when somebody was matching something like
3901       /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
3902       sufficiently long, but it's worth searching a lot more for unanchored
3903       patterns. */
3904 
3905       p = start_match + (has_first_cu? 1:0);
3906       if (has_req_cu && p > req_cu_ptr)
3907         {
3908         PCRE2_SIZE check_length = end_subject - start_match;
3909 
3910         if (check_length < REQ_CU_MAX ||
3911               (!anchored && check_length < REQ_CU_MAX * 1000))
3912           {
3913           if (req_cu != req_cu2)  /* Caseless */
3914             {
3915 #if PCRE2_CODE_UNIT_WIDTH != 8
3916             while (p < end_subject)
3917               {
3918               uint32_t pp = UCHAR21INCTEST(p);
3919               if (pp == req_cu || pp == req_cu2) { p--; break; }
3920               }
3921 #else  /* 8-bit code units */
3922             PCRE2_SPTR pp = p;
3923             p = memchr(pp, req_cu, end_subject - pp);
3924             if (p == NULL)
3925               {
3926               p = memchr(pp, req_cu2, end_subject - pp);
3927               if (p == NULL) p = end_subject;
3928               }
3929 #endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
3930             }
3931 
3932           /* The caseful case */
3933 
3934           else
3935             {
3936 #if PCRE2_CODE_UNIT_WIDTH != 8
3937             while (p < end_subject)
3938               {
3939               if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
3940               }
3941 
3942 #else  /* 8-bit code units */
3943             p = memchr(p, req_cu, end_subject - p);
3944             if (p == NULL) p = end_subject;
3945 #endif
3946             }
3947 
3948           /* If we can't find the required code unit, break the matching loop,
3949           forcing a match failure. */
3950 
3951           if (p >= end_subject) break;
3952 
3953           /* If we have found the required code unit, save the point where we
3954           found it, so that we don't search again next time round the loop if
3955           the start hasn't passed this code unit yet. */
3956 
3957           req_cu_ptr = p;
3958           }
3959         }
3960       }
3961     }
3962 
3963   /* ------------ End of start of match optimizations ------------ */
3964 
3965   /* Give no match if we have passed the bumpalong limit. */
3966 
3967   if (start_match > bumpalong_limit) break;
3968 
3969   /* OK, now we can do the business */
3970 
3971   mb->start_used_ptr = start_match;
3972   mb->last_used_ptr = start_match;
3973   mb->recursive = NULL;
3974 
3975   rc = internal_dfa_match(
3976     mb,                           /* fixed match data */
3977     mb->start_code,               /* this subexpression's code */
3978     start_match,                  /* where we currently are */
3979     start_offset,                 /* start offset in subject */
3980     match_data->ovector,          /* offset vector */
3981     (uint32_t)match_data->oveccount * 2,  /* actual size of same */
3982     workspace,                    /* workspace vector */
3983     (int)wscount,                 /* size of same */
3984     0,                            /* function recurse level */
3985     base_recursion_workspace);    /* initial workspace for recursion */
3986 
3987   /* Anything other than "no match" means we are done, always; otherwise, carry
3988   on only if not anchored. */
3989 
3990   if (rc != PCRE2_ERROR_NOMATCH || anchored)
3991     {
3992     if (rc == PCRE2_ERROR_PARTIAL && match_data->oveccount > 0)
3993       {
3994       match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject);
3995       match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject);
3996       }
3997     match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject);
3998     match_data->rightchar = (PCRE2_SIZE)( mb->last_used_ptr - subject);
3999     match_data->startchar = (PCRE2_SIZE)(start_match - subject);
4000     match_data->rc = rc;
4001 
4002     if (rc >= 0 &&(options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
4003       {
4004       length = CU2BYTES(length + was_zero_terminated);
4005       match_data->subject = match_data->memctl.malloc(length,
4006         match_data->memctl.memory_data);
4007       if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;
4008       memcpy((void *)match_data->subject, subject, length);
4009       match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
4010       }
4011     else
4012       {
4013       if (rc >= 0 || rc == PCRE2_ERROR_PARTIAL) match_data->subject = subject;
4014       }
4015     goto EXIT;
4016     }
4017 
4018   /* Advance to the next subject character unless we are at the end of a line
4019   and firstline is set. */
4020 
4021   if (firstline && IS_NEWLINE(start_match)) break;
4022   start_match++;
4023 #ifdef SUPPORT_UNICODE
4024   if (utf)
4025     {
4026     ACROSSCHAR(start_match < end_subject, start_match, start_match++);
4027     }
4028 #endif
4029   if (start_match > end_subject) break;
4030 
4031   /* If we have just passed a CR and we are now at a LF, and the pattern does
4032   not contain any explicit matches for \r or \n, and the newline option is CRLF
4033   or ANY or ANYCRLF, advance the match position by one more character. */
4034 
4035   if (UCHAR21TEST(start_match - 1) == CHAR_CR &&
4036       start_match < end_subject &&
4037       UCHAR21TEST(start_match) == CHAR_NL &&
4038       (re->flags & PCRE2_HASCRORLF) == 0 &&
4039         (mb->nltype == NLTYPE_ANY ||
4040          mb->nltype == NLTYPE_ANYCRLF ||
4041          mb->nllen == 2))
4042     start_match++;
4043 
4044   }   /* "Bumpalong" loop */
4045 
4046 NOMATCH_EXIT:
4047 rc = PCRE2_ERROR_NOMATCH;
4048 
4049 EXIT:
4050 while (rws->next != NULL)
4051   {
4052   RWS_anchor *next = rws->next;
4053   rws->next = next->next;
4054   mb->memctl.free(next, mb->memctl.memory_data);
4055   }
4056 
4057 return rc;
4058 }
4059 
4060 /* These #undefs are here to enable unity builds with CMake. */
4061 
4062 #undef NLBLOCK /* Block containing newline information */
4063 #undef PSSTART /* Field containing processed string start */
4064 #undef PSEND   /* Field containing processed string end */
4065 
4066 /* End of pcre2_dfa_match.c */
4067