1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10           New API code Copyright (c) 2016-2024 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 
42 /* This module contains mode-dependent macro and structure definitions. The
43 file is #included by pcre2_internal.h if PCRE2_CODE_UNIT_WIDTH is defined.
44 These mode-dependent items are kept in a separate file so that they can also be
45 #included multiple times for different code unit widths by pcre2test in order
46 to have access to the hidden structures at all supported widths.
47 
48 Some of the mode-dependent macros are required at different widths for
49 different parts of the pcre2test code (in particular, the included
50 pcre_printint.c file). We undefine them here so that they can be re-defined for
51 multiple inclusions. Not all of these are used in pcre2test, but it's easier
52 just to undefine them all. */
53 
54 #undef ACROSSCHAR
55 #undef BACKCHAR
56 #undef BYTES2CU
57 #undef CHMAX_255
58 #undef CU2BYTES
59 #undef FORWARDCHAR
60 #undef FORWARDCHARTEST
61 #undef GET
62 #undef GET2
63 #undef GETCHAR
64 #undef GETCHARINC
65 #undef GETCHARINCTEST
66 #undef GETCHARLEN
67 #undef GETCHARLENTEST
68 #undef GETCHARTEST
69 #undef GET_EXTRALEN
70 #undef HAS_EXTRALEN
71 #undef IMM2_SIZE
72 #undef MAX_255
73 #undef MAX_MARK
74 #undef MAX_PATTERN_SIZE
75 #undef MAX_UTF_SINGLE_CU
76 #undef NOT_FIRSTCU
77 #undef PUT
78 #undef PUT2
79 #undef PUT2INC
80 #undef PUTCHAR
81 #undef PUTINC
82 #undef TABLE_GET
83 
84 
85 
86 /* -------------------------- MACROS ----------------------------- */
87 
88 /* PCRE keeps offsets in its compiled code as at least 16-bit quantities
89 (always stored in big-endian order in 8-bit mode) by default. These are used,
90 for example, to link from the start of a subpattern to its alternatives and its
91 end. The use of 16 bits per offset limits the size of an 8-bit compiled regex
92 to around 64K, which is big enough for almost everybody. However, I received a
93 request for an even bigger limit. For this reason, and also to make the code
94 easier to maintain, the storing and loading of offsets from the compiled code
95 unit string is now handled by the macros that are defined here.
96 
97 The macros are controlled by the value of LINK_SIZE. This defaults to 2, but
98 values of 3 or 4 are also supported. */
99 
100 /* ------------------- 8-bit support  ------------------ */
101 
102 #if PCRE2_CODE_UNIT_WIDTH == 8
103 
104 #if LINK_SIZE == 2
/* LINK_SIZE == 2: a link occupies two code units, most significant first.
Every macro argument is parenthesized, so a pointer expression such as
"code + 1" can be passed as the base, and each expansion is one parenthesized
expression. GET widens each unit to unsigned int before shifting. */

#define PUT(a,n,d) \
  (((a)[(n)] = (PCRE2_UCHAR)((d) >> 8)), \
   ((a)[(n)+1] = (PCRE2_UCHAR)((d) & 255)))
#define GET(a,n) \
  ((unsigned int)(((unsigned int)((a)[(n)]) << 8) | \
                  (unsigned int)((a)[(n)+1])))
#define MAX_PATTERN_SIZE (1 << 16)
111 
112 #elif LINK_SIZE == 3
/* LINK_SIZE == 3: a link occupies three code units, most significant first.
Arguments are parenthesized so that expression arguments bind correctly, and
each expansion is one parenthesized expression. GET widens each unit to
unsigned int before shifting. */

#define PUT(a,n,d) \
  (((a)[(n)] = (PCRE2_UCHAR)((d) >> 16)), \
   ((a)[(n)+1] = (PCRE2_UCHAR)((d) >> 8)), \
   ((a)[(n)+2] = (PCRE2_UCHAR)((d) & 255)))
#define GET(a,n) \
  ((unsigned int)(((unsigned int)((a)[(n)]) << 16) | \
                  ((unsigned int)((a)[(n)+1]) << 8) | \
                  (unsigned int)((a)[(n)+2])))
#define MAX_PATTERN_SIZE (1 << 24)
120 
121 #elif LINK_SIZE == 4
/* LINK_SIZE == 4: a link occupies four code units, most significant first.
Arguments are parenthesized and each expansion is one parenthesized
expression. In GET every unit is widened to unsigned int before it is shifted:
shifting the promoted (signed) int left by 24 would overflow for a leading
unit of 0x80 or more. */

#define PUT(a,n,d) \
  (((a)[(n)] = (PCRE2_UCHAR)((d) >> 24)), \
   ((a)[(n)+1] = (PCRE2_UCHAR)((d) >> 16)), \
   ((a)[(n)+2] = (PCRE2_UCHAR)((d) >> 8)), \
   ((a)[(n)+3] = (PCRE2_UCHAR)((d) & 255)))
#define GET(a,n) \
  ((unsigned int)(((unsigned int)((a)[(n)]) << 24) | \
                  ((unsigned int)((a)[(n)+1]) << 16) | \
                  ((unsigned int)((a)[(n)+2]) << 8) | \
                  (unsigned int)((a)[(n)+3])))
#define MAX_PATTERN_SIZE (1 << 30)   /* Keep it positive */
130 
131 #else
132 #error LINK_SIZE must be 2, 3, or 4
133 #endif
134 
135 
136 /* ------------------- 16-bit support  ------------------ */
137 
138 #elif PCRE2_CODE_UNIT_WIDTH == 16
139 
140 #if LINK_SIZE == 2
141 #undef LINK_SIZE
142 #define LINK_SIZE 1
/* A link fits in a single 16-bit code unit. Arguments are parenthesized so
that a pointer expression can be used as the base. */

#define PUT(a,n,d) \
  ((a)[(n)] = (PCRE2_UCHAR)(d))
#define GET(a,n) \
  ((a)[(n)])
#define MAX_PATTERN_SIZE (1 << 16)
148 
149 #elif LINK_SIZE == 3 || LINK_SIZE == 4
150 #undef LINK_SIZE
151 #define LINK_SIZE 2
/* A link occupies two 16-bit code units, most significant first. Arguments
are parenthesized and each expansion is one parenthesized expression. GET
widens each unit to unsigned int before shifting, because a promoted (signed)
int shifted left by 16 would overflow for a leading unit of 0x8000 or more. */

#define PUT(a,n,d) \
  (((a)[(n)] = (PCRE2_UCHAR)((d) >> 16)), \
   ((a)[(n)+1] = (PCRE2_UCHAR)((d) & 65535)))
#define GET(a,n) \
  ((unsigned int)(((unsigned int)((a)[(n)]) << 16) | \
                  (unsigned int)((a)[(n)+1])))
#define MAX_PATTERN_SIZE (1 << 30)  /* Keep it positive */
158 
159 #else
160 #error LINK_SIZE must be 2, 3, or 4
161 #endif
162 
163 
164 /* ------------------- 32-bit support  ------------------ */
165 
166 #elif PCRE2_CODE_UNIT_WIDTH == 32
167 #undef LINK_SIZE
168 #define LINK_SIZE 1
/* A link fits in a single 32-bit code unit. Arguments are parenthesized so
that a pointer expression can be used as the base. */

#define PUT(a,n,d) \
  ((a)[(n)] = (d))
#define GET(a,n) \
  ((a)[(n)])
#define MAX_PATTERN_SIZE (1 << 30)  /* Keep it positive */
174 
175 #else
176 #error Unsupported compiling mode
177 #endif
178 
179 
180 /* --------------- Other mode-specific macros ----------------- */
181 
182 /* PCRE uses some other (at least) 16-bit quantities that do not change when
the size of offsets changes. These are used for repeat counts and for other
184 things such as capturing parenthesis numbers in back references.
185 
186 Define the number of code units required to hold a 16-bit count/offset, and
187 macros to load and store such a value. For reasons that I do not understand,
188 the expression in the 8-bit GET2 macro is treated by gcc as a signed
189 expression, even when a is declared as unsigned. It seems that any kind of
190 arithmetic results in a signed value. Hence the cast. */
191 
192 #if PCRE2_CODE_UNIT_WIDTH == 8
/* 8-bit mode: a 16-bit immediate takes two code units, high byte first. Both
macros parenthesize their arguments and expand to one parenthesized
expression, so they bind correctly inside larger expressions. */

#define IMM2_SIZE 2
#define GET2(a,n) ((unsigned int)(((a)[(n)] << 8) | (a)[(n)+1]))
#define PUT2(a,n,d) ((a)[(n)] = (d) >> 8, (a)[(n)+1] = (d) & 255)
196 
197 #else  /* Code units are 16 or 32 bits */
/* 16-bit and 32-bit modes: a 16-bit immediate fits in one code unit.
Arguments are parenthesized and each expansion is one parenthesized
expression. */

#define IMM2_SIZE 1
#define GET2(a,n) ((a)[(n)])
#define PUT2(a,n,d) ((a)[(n)] = (d))
201 #endif
202 
203 /* Other macros that are different for 8-bit mode. The MAX_255 macro checks
204 whether its argument, which is assumed to be one code unit, is less than 256.
205 The CHMAX_255 macro does not assume one code unit. The maximum length of a MARK
206 name must fit in one code unit; currently it is set to 255 or 65535. The
207 TABLE_GET macro is used to access elements of tables containing exactly 256
208 items. Its argument is a code unit. When code points can be greater than 255, a
209 check is needed before accessing these tables. */
210 
211 #if PCRE2_CODE_UNIT_WIDTH == 8
/* 8-bit code units: a single code unit is always below 256, so MAX_255 is
constant and TABLE_GET indexes the table directly, ignoring its default. A
MARK name length is limited to 255. */

#define MAX_MARK 255u
#define MAX_255(c) TRUE
#define TABLE_GET(c, table, default) ((table)[c])

/* CHMAX_255 takes a value that need not be a single code unit, so it needs a
real comparison only when Unicode support is compiled in. */

#ifndef SUPPORT_UNICODE
#define CHMAX_255(c) TRUE
#else
#define SUPPORT_WIDE_CHARS
#define CHMAX_255(c) ((c) <= 255u)
#endif  /* SUPPORT_UNICODE */
221 
222 #else  /* Code units are 16 or 32 bits */
/* 16-bit and 32-bit code units: values can exceed 255, so both range tests
are real comparisons and TABLE_GET yields the supplied default for anything
outside the 256-entry table. A MARK name length is limited to 65535. */

#define SUPPORT_WIDE_CHARS
#define MAX_MARK 65535u
#define MAX_255(c) ((c) <= 255u)
#define CHMAX_255(c) ((c) <= 255u)
#define TABLE_GET(c, table, default) (((c) <= 255u)? ((table)[c]):(default))
228 #endif
229 
230 
231 /* ----------------- Character-handling macros ----------------- */
232 
233 /* There is a proposed future special "UTF-21" mode, in which only the lowest
234 21 bits of a 32-bit character are interpreted as UTF, with the remaining 11
235 high-order bits available to the application for other uses. In preparation for
236 the future implementation of this mode, there are macros that load a data item
237 and, if in this special mode, mask it to 21 bits. These macros all have names
238 starting with UCHAR21. In all other modes, including the normal 32-bit
239 library, the macros all have the same simple definitions. When the new mode is
240 implemented, it is expected that these definitions will be varied appropriately
241 using #ifdef when compiling the library that supports the special mode. */
242 
/* Plain loads of one code unit. The INC forms yield the unit at eptr and then
advance eptr by one; the TEST forms are currently identical to the plain
ones. */

#define UCHAR21(eptr)        ((eptr)[0])
#define UCHAR21TEST(eptr)    ((eptr)[0])
#define UCHAR21INC(eptr)     (*((eptr)++))
#define UCHAR21INCTEST(eptr) (*((eptr)++))
247 
248 /* When UTF encoding is being used, a character is no longer just a single
249 byte in 8-bit mode or a single short in 16-bit mode. The macros for character
250 handling generate simple sequences when used in the basic mode, and more
251 complicated ones for UTF characters. GETCHARLENTEST and other macros are not
252 used when UTF is not supported. To make sure they can never even appear when
253 UTF support is omitted, we don't even define them. */
254 
255 #ifndef SUPPORT_UNICODE
256 
/* Without Unicode support every character is exactly one code unit, so these
are plain loads and stores. The pointer argument is parenthesized so that an
expression such as "ptr + 1" is dereferenced as a whole. Each GETCHAR-style
macro expands to a complete statement, including its terminating semicolon.
GETCHARLEN never modifies len. The UTF-only macros are deliberately left
undefined. */

/* #define MAX_UTF_SINGLE_CU */
/* #define HAS_EXTRALEN(c) */
/* #define GET_EXTRALEN(c) */
/* #define NOT_FIRSTCU(c) */
#define GETCHAR(c, eptr) c = *(eptr);
#define GETCHARTEST(c, eptr) c = *(eptr);
#define GETCHARINC(c, eptr) c = *(eptr)++;
#define GETCHARINCTEST(c, eptr) c = *(eptr)++;
#define GETCHARLEN(c, eptr, len) c = *(eptr);
#define PUTCHAR(c, p) (*(p) = (c), 1)
/* #define GETCHARLENTEST(c, eptr, len) */
/* #define BACKCHAR(eptr) */
/* #define FORWARDCHAR(eptr) */
/* #define FORWARDCHARTEST(eptr,end) */
/* #define ACROSSCHAR(condition, eptr, action) */
272 
273 #else   /* SUPPORT_UNICODE */
274 
275 /* ------------------- 8-bit support  ------------------ */
276 
277 #if PCRE2_CODE_UNIT_WIDTH == 8
#define MAYBE_UTF_MULTI          /* UTF chars may use multiple code units */

/* Highest code point that UTF-8 encodes in a single code unit. */

#define MAX_UTF_SINGLE_CU 127

/* Tests whether the code unit c starts a character that needs additional
code units to decode. The test itself is delegated to HASUTF8EXTRALEN. */

#define HAS_EXTRALEN(c) HASUTF8EXTRALEN(c)

/* Yields the number of additional code units, looked up in utf8_table4 from
the low six bits of c. Only meaningful when HAS_EXTRALEN(c) is TRUE; otherwise
the result is undefined. */

#define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3fu])

/* TRUE when c is a continuation unit (bit pattern 10xxxxxx) rather than the
first code unit of a UTF sequence. */

#define NOT_FIRSTCU(c) (((c) & 0xc0u) == 0x80u)
297 
/* UTF-8 character fetch macros. Each loads the first code unit into c and, if
that unit is 0xc0 or above, hands over to the matching GETUTF8 helper to
assemble the full code point. The pointer argument is parenthesized where it
is dereferenced so that an expression such as "ptr - 1" is read as a whole.

NOTE: every macro here expands to two statements and supplies its own
terminating semicolon, so an invocation must not be used as the unbraced body
of an if, else or loop. */

/* Get the next UTF-8 character, not advancing the pointer. This is called when
we know we are in UTF-8 mode. */

#define GETCHAR(c, eptr) \
  c = *(eptr); \
  if (c >= 0xc0u) GETUTF8(c, eptr);

/* Get the next UTF-8 character, testing for UTF-8 mode (the "utf" variable
must be in scope), and not advancing the pointer. */

#define GETCHARTEST(c, eptr) \
  c = *(eptr); \
  if (utf && c >= 0xc0u) GETUTF8(c, eptr);

/* Get the next UTF-8 character, advancing the pointer. This is called when we
know we are in UTF-8 mode. The pointer argument must be a modifiable lvalue. */

#define GETCHARINC(c, eptr) \
  c = *(eptr)++; \
  if (c >= 0xc0u) GETUTF8INC(c, eptr);

/* Get the next character, testing for UTF-8 mode, and advancing the pointer.
This is called when we don't know if we are in UTF-8 mode. */

#define GETCHARINCTEST(c, eptr) \
  c = *(eptr)++; \
  if (utf && c >= 0xc0u) GETUTF8INC(c, eptr);

/* Get the next UTF-8 character, not advancing the pointer, incrementing length
if there are extra bytes. This is called when we know we are in UTF-8 mode. */

#define GETCHARLEN(c, eptr, len) \
  c = *(eptr); \
  if (c >= 0xc0u) GETUTF8LEN(c, eptr, len);

/* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the
pointer, incrementing length if there are extra bytes. This is called when we
do not know if we are in UTF-8 mode. */

#define GETCHARLENTEST(c, eptr, len) \
  c = *(eptr); \
  if (utf && c >= 0xc0u) GETUTF8LEN(c, eptr, len);
340 
/* If the pointer is not at the start of a character, move it back until
it is. This is called only in UTF-8 mode - we don't put a test within the macro
because almost all calls are already within a block of UTF-8 only code.

The pointer argument is parenthesized in every position, so an lvalue
expression such as "*pp" is stepped correctly instead of having the ++ or --
bind to only part of it. None of these macros supplies a terminating
semicolon. */

#define BACKCHAR(eptr) while((*(eptr) & 0xc0u) == 0x80u) (eptr)--

/* Same as above, just in the other direction. FORWARDCHARTEST additionally
stops once the pointer reaches end. */
#define FORWARDCHAR(eptr) while((*(eptr) & 0xc0u) == 0x80u) (eptr)++
#define FORWARDCHARTEST(eptr,end) \
  while((eptr) < (end) && (*(eptr) & 0xc0u) == 0x80u) (eptr)++

/* Same as above, but it allows a fully customizable form: the caller supplies
the extra loop condition and the statement that is repeated while eptr points
at a continuation unit. */
#define ACROSSCHAR(condition, eptr, action) \
  while((condition) && ((*(eptr)) & 0xc0u) == 0x80u) action
354 
/* Deposit a character into memory, returning the number of code units. When
"utf" (which must be in scope) is set and c is above MAX_UTF_SINGLE_CU, the
encoding is delegated to ord2utf; otherwise c is stored as one code unit.
Arguments are parenthesized so that expression arguments bind correctly. Note
that c and p can each be evaluated more than once, so they should not have
side effects. */

#define PUTCHAR(c, p) ((utf && (c) > MAX_UTF_SINGLE_CU)? \
  PRIV(ord2utf)((c),(p)) : (*(p) = (c), 1))
359 
360 
361 /* ------------------- 16-bit support  ------------------ */
362 
363 #elif PCRE2_CODE_UNIT_WIDTH == 16
#define MAYBE_UTF_MULTI          /* UTF chars may use multiple code units */

/* Highest code point that UTF-16 encodes in a single code unit. */

#define MAX_UTF_SINGLE_CU 65535

/* Tests whether the code unit c is a high surrogate (0xd800-0xdbff), that is,
whether one more code unit is needed to decode the character. */

#define HAS_EXTRALEN(c) (((c) & 0xfc00u) == 0xd800u)

/* Yields the number of additional code units, which is always 1 in UTF-16.
Only meaningful when HAS_EXTRALEN(c) is TRUE. */

#define GET_EXTRALEN(c) 1

/* TRUE when c is a low surrogate (0xdc00-0xdfff) rather than the first code
unit of a UTF sequence. */

#define NOT_FIRSTCU(c) (((c) & 0xfc00u) == 0xdc00u)
383 
/* UTF-16 character fetch macros. Each GETCHAR-style macro loads the first
code unit into c and, if it is a high surrogate, uses one of the GETUTF16
base macros to combine it with the following low surrogate. Pointer and value
arguments are parenthesized wherever they are read, so an expression such as
"ptr + 1" is dereferenced as a whole.

NOTE: the GETCHAR-style macros expand to two statements and supply their own
terminating semicolon, so an invocation must not be used as the unbraced body
of an if, else or loop. */

/* Base macro to pick up the low surrogate of a UTF-16 character, not
advancing the pointer. On entry c holds the high surrogate and eptr points at
it. */

#define GETUTF16(c, eptr) \
   { c = ((((c) & 0x3ffu) << 10) | ((eptr)[1] & 0x3ffu)) + 0x10000u; }

/* Get the next UTF-16 character, not advancing the pointer. This is called when
we know we are in UTF-16 mode. */

#define GETCHAR(c, eptr) \
  c = *(eptr); \
  if (((c) & 0xfc00u) == 0xd800u) GETUTF16(c, eptr);

/* Get the next UTF-16 character, testing for UTF-16 mode (the "utf" variable
must be in scope), and not advancing the pointer. */

#define GETCHARTEST(c, eptr) \
  c = *(eptr); \
  if (utf && ((c) & 0xfc00u) == 0xd800u) GETUTF16(c, eptr);

/* Base macro to pick up the low surrogate of a UTF-16 character, advancing
the pointer. On entry c holds the high surrogate and eptr already points at
the low surrogate. */

#define GETUTF16INC(c, eptr) \
   { c = ((((c) & 0x3ffu) << 10) | (*(eptr)++ & 0x3ffu)) + 0x10000u; }

/* Get the next UTF-16 character, advancing the pointer. This is called when we
know we are in UTF-16 mode. The pointer argument must be a modifiable lvalue. */

#define GETCHARINC(c, eptr) \
  c = *(eptr)++; \
  if (((c) & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr);

/* Get the next character, testing for UTF-16 mode, and advancing the pointer.
This is called when we don't know if we are in UTF-16 mode. */

#define GETCHARINCTEST(c, eptr) \
  c = *(eptr)++; \
  if (utf && ((c) & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr);

/* Base macro to pick up the low surrogate of a UTF-16 character, not
advancing the pointer, incrementing the length. */

#define GETUTF16LEN(c, eptr, len) \
   { c = ((((c) & 0x3ffu) << 10) | ((eptr)[1] & 0x3ffu)) + 0x10000u; (len)++; }

/* Get the next UTF-16 character, not advancing the pointer, incrementing
length if there is a low surrogate. This is called when we know we are in
UTF-16 mode. */

#define GETCHARLEN(c, eptr, len) \
  c = *(eptr); \
  if (((c) & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len);

/* Get the next UTF-16 character, testing for UTF-16 mode, not advancing the
pointer, incrementing length if there is a low surrogate. This is called when
we do not know if we are in UTF-16 mode. */

#define GETCHARLENTEST(c, eptr, len) \
  c = *(eptr); \
  if (utf && ((c) & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len);
445 
/* If the pointer is not at the start of a character, move it back until
it is. This is called only in UTF-16 mode - we don't put a test within the
macro because almost all calls are already within a block of UTF-16 only
code. A single step is enough because a character is at most two code units.

The pointer argument is parenthesized in every position, so an lvalue
expression such as "*pp" is stepped correctly. These macros expand to a bare
"if" without a terminating semicolon, so do not follow an invocation directly
with "else". */

#define BACKCHAR(eptr) if ((*(eptr) & 0xfc00u) == 0xdc00u) (eptr)--

/* Same as above, just in the other direction. FORWARDCHARTEST additionally
requires the pointer to be below end. */
#define FORWARDCHAR(eptr) if ((*(eptr) & 0xfc00u) == 0xdc00u) (eptr)++
#define FORWARDCHARTEST(eptr,end) \
  if ((eptr) < (end) && (*(eptr) & 0xfc00u) == 0xdc00u) (eptr)++

/* Same as above, but it allows a fully customizable form: the caller supplies
the extra condition and the statement executed when eptr points at a low
surrogate. */
#define ACROSSCHAR(condition, eptr, action) \
  if ((condition) && ((*(eptr)) & 0xfc00u) == 0xdc00u) action
460 
/* Deposit a character into memory, returning the number of code units. When
"utf" (which must be in scope) is set and c is above MAX_UTF_SINGLE_CU, the
encoding is delegated to ord2utf; otherwise c is stored as one code unit.
Arguments are parenthesized so that expression arguments bind correctly. Note
that c and p can each be evaluated more than once, so they should not have
side effects. */

#define PUTCHAR(c, p) ((utf && (c) > MAX_UTF_SINGLE_CU)? \
  PRIV(ord2utf)((c),(p)) : (*(p) = (c), 1))
465 
466 
467 /* ------------------- 32-bit support  ------------------ */
468 
469 #else
470 
471 /* These are trivial for the 32-bit library, since all UTF-32 characters fit
472 into one PCRE2_UCHAR unit. */
473 
/* Every code point up to 0x10ffff is a single code unit, so there are never
any extra units and no code unit is ever a non-first unit. */

#define MAX_UTF_SINGLE_CU   (0x10ffffu)
#define HAS_EXTRALEN(c)     (0)
#define GET_EXTRALEN(c)     (0)
#define NOT_FIRSTCU(c)      (0)
478 
/* UTF-32 character fetch macros. One code unit is one character, so the
"known UTF" and "test for UTF" variants are identical plain loads, and the LEN
variants never touch len. Each macro expands to a complete statement,
including its terminating semicolon. */

/* Load the character at eptr without advancing the pointer. */

#define GETCHAR(c, eptr)              c = (eptr)[0];
#define GETCHARTEST(c, eptr)          c = (eptr)[0];

/* Load the character at eptr and advance the pointer by one unit. */

#define GETCHARINC(c, eptr)           c = *(eptr)++;
#define GETCHARINCTEST(c, eptr)       c = *(eptr)++;

/* Load the character at eptr without advancing the pointer; len is left
unchanged because every UTF-32 character has length 1. */

#define GETCHARLEN(c, eptr, len)      c = (eptr)[0];
#define GETCHARLENTEST(c, eptr, len)  c = (eptr)[0];
516 
/* Pointer-adjustment macros. In the other widths these move a pointer to a
character boundary; here they are all no-ops since all UTF-32 characters fit
into one PCRE2_UCHAR. None of the arguments is evaluated. */

#define BACKCHAR(eptr) do { } while (0)

/* Same as above, just in the other direction. */

#define FORWARDCHAR(eptr) do { } while (0)
#define FORWARDCHARTEST(eptr,end) do { } while (0)

/* Same as above, but it allows a fully customizable form. Neither the
condition nor the action is used. */

#define ACROSSCHAR(condition, eptr, action) do { } while (0)

/* Deposit a character into memory, returning the number of code units, which
is always 1. Arguments are parenthesized so that expression arguments bind
correctly. */

#define PUTCHAR(c, p) (*(p) = (c), 1)
538 
539 #endif  /* UTF-32 character handling */
540 #endif  /* SUPPORT_UNICODE */
541 
542 
543 /* Mode-dependent macros that have the same definition in all modes. */
544 
/* CU2BYTES and BYTES2CU convert between a count of code units and a count of
bytes for the current PCRE2_CODE_UNIT_WIDTH. PUTINC and PUT2INC store a link or
a 16-bit value with PUT/PUT2 and then advance the pointer a by LINK_SIZE or
IMM2_SIZE code units; a must be a modifiable lvalue. Both are wrapped in
parentheses so that the store and the increment stay together when the macro
is used inside a larger expression. */

#define CU2BYTES(x)     ((x)*((PCRE2_CODE_UNIT_WIDTH/8)))
#define BYTES2CU(x)     ((x)/((PCRE2_CODE_UNIT_WIDTH/8)))
#define PUTINC(a,n,d)   (PUT(a,n,d), (a) += LINK_SIZE)
#define PUT2INC(a,n,d)  (PUT2(a,n,d), (a) += IMM2_SIZE)
549 
550 
551 /* ----------------------- HIDDEN STRUCTURES ----------------------------- */
552 
553 /* NOTE: All these structures *must* start with a pcre2_memctl structure. The
554 code that uses them is simpler because it assumes this. */
555 
556 /* The real general context structure. At present it holds only data for custom
557 memory control. */
558 
559 typedef struct pcre2_real_general_context {
560   pcre2_memctl memctl;
561 } pcre2_real_general_context;
562 
563 /* The real compile context structure */
564 
565 typedef struct pcre2_real_compile_context {
566   pcre2_memctl memctl;
567   int (*stack_guard)(uint32_t, void *);
568   void *stack_guard_data;
569   const uint8_t *tables;
570   PCRE2_SIZE max_pattern_length;
571   PCRE2_SIZE max_pattern_compiled_length;
572   uint16_t bsr_convention;
573   uint16_t newline_convention;
574   uint32_t parens_nest_limit;
575   uint32_t extra_options;
576   uint32_t max_varlookbehind;
577 } pcre2_real_compile_context;
578 
579 /* The real match context structure. */
580 
581 typedef struct pcre2_real_match_context {
582   pcre2_memctl memctl;
583 #ifdef SUPPORT_JIT
584   pcre2_jit_callback jit_callback;
585   void *jit_callback_data;
586 #endif
587   int    (*callout)(pcre2_callout_block *, void *);
588   void    *callout_data;
589   int    (*substitute_callout)(pcre2_substitute_callout_block *, void *);
590   void    *substitute_callout_data;
591   PCRE2_SIZE offset_limit;
592   uint32_t heap_limit;
593   uint32_t match_limit;
594   uint32_t depth_limit;
595 } pcre2_real_match_context;
596 
597 /* The real convert context structure. */
598 
599 typedef struct pcre2_real_convert_context {
600   pcre2_memctl memctl;
601   uint32_t glob_separator;
602   uint32_t glob_escape;
603 } pcre2_real_convert_context;
604 
605 /* The real compiled code structure. The type for the blocksize field is
606 defined specially because it is required in pcre2_serialize_decode() when
607 copying the size from possibly unaligned memory into a variable of the same
608 type. Use a macro rather than a typedef to avoid compiler warnings when this
609 file is included multiple times by pcre2test. LOOKBEHIND_MAX specifies the
610 largest lookbehind that is supported. (OP_REVERSE and OP_VREVERSE in a pattern
611 have 16-bit arguments in 8-bit and 16-bit modes, so we need no more than a
612 16-bit field here.) */
613 
614 #undef  CODE_BLOCKSIZE_TYPE
615 #define CODE_BLOCKSIZE_TYPE PCRE2_SIZE
616 
617 #undef  LOOKBEHIND_MAX
618 #define LOOKBEHIND_MAX UINT16_MAX
619 
620 typedef struct pcre2_real_code {
621   pcre2_memctl memctl;            /* Memory control fields */
622   const uint8_t *tables;          /* The character tables */
623   void    *executable_jit;        /* Pointer to JIT code */
624   uint8_t  start_bitmap[32];      /* Bitmap for starting code unit < 256 */
625   CODE_BLOCKSIZE_TYPE blocksize;  /* Total (bytes) that was malloc-ed */
626   uint32_t magic_number;          /* Paranoid and endianness check */
627   uint32_t compile_options;       /* Options passed to pcre2_compile() */
628   uint32_t overall_options;       /* Options after processing the pattern */
629   uint32_t extra_options;         /* Taken from compile_context */
630   uint32_t flags;                 /* Various state flags */
631   uint32_t limit_heap;            /* Limit set in the pattern */
632   uint32_t limit_match;           /* Limit set in the pattern */
633   uint32_t limit_depth;           /* Limit set in the pattern */
634   uint32_t first_codeunit;        /* Starting code unit */
635   uint32_t last_codeunit;         /* This codeunit must be seen */
636   uint16_t bsr_convention;        /* What \R matches */
637   uint16_t newline_convention;    /* What is a newline? */
638   uint16_t max_lookbehind;        /* Longest lookbehind (characters) */
639   uint16_t minlength;             /* Minimum length of match */
640   uint16_t top_bracket;           /* Highest numbered group */
641   uint16_t top_backref;           /* Highest numbered back reference */
642   uint16_t name_entry_size;       /* Size (code units) of table entries */
643   uint16_t name_count;            /* Number of name entries in the table */
644 } pcre2_real_code;
645 
646 /* The real match data structure. Define ovector as large as it can ever
647 actually be so that array bound checkers don't grumble. Memory for this
648 structure is obtained by calling pcre2_match_data_create(), which sets the size
649 as the offset of ovector plus a pair of elements for each capturable string, so
650 the size varies from call to call. As the maximum number of capturing
651 subpatterns is 65535 we must allow for 65536 strings to include the overall
652 match. (See also the heapframe structure below.) */
653 
654 struct heapframe;  /* Forward reference */
655 
656 typedef struct pcre2_real_match_data {
657   pcre2_memctl     memctl;           /* Memory control fields */
658   const pcre2_real_code *code;       /* The pattern used for the match */
659   PCRE2_SPTR       subject;          /* The subject that was matched */
660   PCRE2_SPTR       mark;             /* Pointer to last mark */
661   struct heapframe *heapframes;      /* Backtracking frames heap memory */
662   PCRE2_SIZE       heapframes_size;  /* Malloc-ed size */
663   PCRE2_SIZE       subject_length;   /* Subject length */
664   PCRE2_SIZE       leftchar;         /* Offset to leftmost code unit */
665   PCRE2_SIZE       rightchar;        /* Offset to rightmost code unit */
666   PCRE2_SIZE       startchar;        /* Offset to starting code unit */
667   uint8_t          matchedby;        /* Type of match (normal, JIT, DFA) */
668   uint8_t          flags;            /* Various flags */
669   uint16_t         oveccount;        /* Number of pairs */
670   int              rc;               /* The return code from the match */
671   PCRE2_SIZE       ovector[131072];  /* Must be last in the structure */
672 } pcre2_real_match_data;
673 
674 
675 /* ----------------------- PRIVATE STRUCTURES ----------------------------- */
676 
677 /* These structures are not needed for pcre2test. */
678 
679 #ifndef PCRE2_PCRE2TEST
680 
681 /* Structures for checking for mutual function recursion when scanning compiled
682 or parsed code. */
683 
684 typedef struct recurse_check {
685   struct recurse_check *prev;
686   PCRE2_SPTR group;
687 } recurse_check;
688 
/* One link in a chain of groups, used when checking parsed code for mutual
recursion. prev points at the previous link in the chain. */

struct parsed_recurse_check {
  struct parsed_recurse_check *prev;
  uint32_t *groupptr;
};

typedef struct parsed_recurse_check parsed_recurse_check;
693 
694 /* Structure for building a cache when filling in pattern recursion offsets. */
695 
696 typedef struct recurse_cache {
697   PCRE2_SPTR group;
698   int groupnumber;
699 } recurse_cache;
700 
701 /* Structure for maintaining a chain of pointers to the currently incomplete
702 branches, for testing for left recursion while compiling. */
703 
704 typedef struct branch_chain {
705   struct branch_chain *outer;
706   PCRE2_UCHAR *current_branch;
707 } branch_chain;
708 
709 /* Structure for building a list of named groups during the first pass of
710 compiling. */
711 
712 typedef struct named_group {
713   PCRE2_SPTR   name;          /* Points to the name in the pattern */
714   uint32_t     number;        /* Group number */
715   uint16_t     length;        /* Length of the name */
716   uint16_t     isdup;         /* TRUE if a duplicate */
717 } named_group;
718 
719 /* Structure for passing "static" information around between the functions
720 doing the compiling, so that they are thread-safe. */
721 
722 typedef struct compile_block {
723   pcre2_real_compile_context *cx;  /* Points to the compile context */
724   const uint8_t *lcc;              /* Points to lower casing table */
725   const uint8_t *fcc;              /* Points to case-flipping table */
726   const uint8_t *cbits;            /* Points to character type table */
727   const uint8_t *ctypes;           /* Points to table of type maps */
728   PCRE2_SPTR start_workspace;      /* The start of working space */
729   PCRE2_SPTR start_code;           /* The start of the compiled code */
730   PCRE2_SPTR start_pattern;        /* The start of the pattern */
731   PCRE2_SPTR end_pattern;          /* The end of the pattern */
732   PCRE2_UCHAR *name_table;         /* The name/number table */
733   PCRE2_SIZE workspace_size;       /* Size of workspace */
734   PCRE2_SIZE small_ref_offset[10]; /* Offsets for \1 to \9 */
735   PCRE2_SIZE erroroffset;          /* Offset of error in pattern */
736   uint16_t names_found;            /* Number of entries so far */
737   uint16_t name_entry_size;        /* Size of each entry */
738   uint16_t parens_depth;           /* Depth of nested parentheses */
739   uint16_t assert_depth;           /* Depth of nested assertions */
740   named_group *named_groups;       /* Points to vector in pre-compile */
741   uint32_t named_group_list_size;  /* Number of entries in the list */
742   uint32_t external_options;       /* External (initial) options */
743   uint32_t external_flags;         /* External flag bits to be set */
744   uint32_t bracount;               /* Count of capturing parentheses */
745   uint32_t lastcapture;            /* Last capture encountered */
746   uint32_t *parsed_pattern;        /* Parsed pattern buffer */
747   uint32_t *parsed_pattern_end;    /* Parsed pattern should not get here */
748   uint32_t *groupinfo;             /* Group info vector */
749   uint32_t top_backref;            /* Maximum back reference */
750   uint32_t backref_map;            /* Bitmap of low back refs */
751   uint32_t nltype;                 /* Newline type */
752   uint32_t nllen;                  /* Newline string length */
753   uint32_t class_range_start;      /* Overall class range start */
754   uint32_t class_range_end;        /* Overall class range end */
755   PCRE2_UCHAR nl[4];               /* Newline string when fixed length */
756   uint32_t req_varyopt;            /* "After variable item" flag for reqbyte */
757   uint32_t max_varlookbehind;      /* Limit for variable lookbehinds */
758   int  max_lookbehind;             /* Maximum lookbehind encountered (characters) */
759   BOOL had_accept;                 /* (*ACCEPT) encountered */
760   BOOL had_pruneorskip;            /* (*PRUNE) or (*SKIP) encountered */
761   BOOL had_recurse;                /* Had a pattern recursion or subroutine call */
762   BOOL dupnames;                   /* Duplicate names exist */
763 } compile_block;
764 
765 /* Structure for keeping the properties of the in-memory stack used
766 by the JIT matcher. */
767 
768 typedef struct pcre2_real_jit_stack {
769   pcre2_memctl memctl;
770   void* stack;
771 } pcre2_real_jit_stack;
772 
773 /* Structure for items in a linked list that represents an explicit recursive
774 call within the pattern when running pcre2_dfa_match(). */
775 
776 typedef struct dfa_recursion_info {
777   struct dfa_recursion_info *prevrec;
778   PCRE2_SPTR subject_position;
779   PCRE2_SPTR last_used_ptr;
780   uint32_t group_num;
781 } dfa_recursion_info;
782 
783 /* Structure for "stack" frames that are used for remembering backtracking
784 positions during matching. As these are used in a vector, with the ovector item
785 being extended, the size of the structure must be a multiple of PCRE2_SIZE. The
786 only way to check this at compile time is to force an error by generating an
787 array with a negative size. By putting this in a typedef (which is never used),
788 we don't generate any code when all is well. */
789 
790 typedef struct heapframe {
791 
792   /* The first set of fields are variables that have to be preserved over calls
793   to RRMATCH(), but which do not need to be copied to new frames. */
794 
795   PCRE2_SPTR ecode;          /* The current position in the pattern */
796   PCRE2_SPTR temp_sptr[2];   /* Used for short-term PCRE_SPTR values */
797   PCRE2_SIZE length;         /* Used for character, string, or code lengths */
798   PCRE2_SIZE back_frame;     /* Amount to subtract on RRETURN */
799   PCRE2_SIZE temp_size;      /* Used for short-term PCRE2_SIZE values */
800   uint32_t rdepth;           /* Function "recursion" depth within pcre2_match() */
801   uint32_t group_frame_type; /* Type information for group frames */
802   uint32_t temp_32[4];       /* Used for short-term 32-bit or BOOL values */
803   uint8_t return_id;         /* Where to go on in internal "return" */
804   uint8_t op;                /* Processing opcode */
805 
806   /* At this point, the structure is 16-bit aligned. On most architectures
807   the alignment requirement for a pointer will ensure that the eptr field below
808   is 32-bit or 64-bit aligned. However, on m68k it is fine to have a pointer
809   that is 16-bit aligned. We must therefore ensure that what comes between here
810   and eptr is an odd multiple of 16 bits so as to get back into 32-bit
811   alignment. This happens naturally when PCRE2_UCHAR is 8 bits wide, but needs
812   fudges in the other cases. In the 32-bit case the padding comes first so that
813   the occu field itself is 32-bit aligned. Without the padding, this structure
814   is no longer a multiple of PCRE2_SIZE on m68k, and the check below fails. */
815 
816 #if PCRE2_CODE_UNIT_WIDTH == 8
817   PCRE2_UCHAR occu[6];       /* Used for other case code units */
818 #elif PCRE2_CODE_UNIT_WIDTH == 16
819   PCRE2_UCHAR occu[2];       /* Used for other case code units */
820   uint8_t unused[2];         /* Ensure 32-bit alignment (see above) */
821 #else
822   uint8_t unused[2];         /* Ensure 32-bit alignment (see above) */
823   PCRE2_UCHAR occu[1];       /* Used for other case code units */
824 #endif
825 
826   /* The rest have to be copied from the previous frame whenever a new frame
827   becomes current. The final field is specified as a large vector so that
828   runtime array bound checks don't catch references to it. However, for any
829   specific call to pcre2_match() the memory allocated for each frame structure
830   allows for exactly the right size ovector for the number of capturing
831   parentheses. (See also the comment for pcre2_real_match_data above.) */
832 
833   PCRE2_SPTR eptr;              /* MUST BE FIRST */
834   PCRE2_SPTR start_match;       /* Can be adjusted by \K */
835   PCRE2_SPTR mark;              /* Most recent mark on the success path */
836   PCRE2_SPTR recurse_last_used; /* Last character used at time of pattern recursion */
837   uint32_t current_recurse;     /* Group number of current (deepest) pattern recursion */
838   uint32_t capture_last;        /* Most recent capture */
839   PCRE2_SIZE last_group_offset; /* Saved offset to most recent group frame */
840   PCRE2_SIZE offset_top;        /* Offset after highest capture */
841   PCRE2_SIZE ovector[131072];   /* Must be last in the structure */
842 } heapframe;
843 
844 /* This typedef is a check that the size of the heapframe structure is a
845 multiple of PCRE2_SIZE. See various comments above. */
846 
847 typedef char check_heapframe_size[
848   ((sizeof(heapframe) % sizeof(PCRE2_SIZE)) == 0)? (+1):(-1)];
849 
850 /* Structure for computing the alignment of heapframe. */
851 
852 typedef struct heapframe_align {
853   char unalign;    /* Completely unalign the current offset */
854   heapframe frame; /* Offset is its alignment */
855 } heapframe_align;
856 
/* The minimum alignment, in bytes, that a heapframe needs. It is taken as the
offset of the frame member inside heapframe_align. */

#define HEAPFRAME_ALIGNMENT (offsetof(heapframe_align, frame))
860 
861 /* Structure for passing "static" information around between the functions
862 doing traditional NFA matching (pcre2_match() and friends). */
863 
864 typedef struct match_block {
865   pcre2_memctl memctl;            /* For general use */
866   uint32_t heap_limit;            /* As it says */
867   uint32_t match_limit;           /* As it says */
868   uint32_t match_limit_depth;     /* As it says */
869   uint32_t match_call_count;      /* Number of times a new frame is created */
870   BOOL hitend;                    /* Hit the end of the subject at some point */
871   BOOL hasthen;                   /* Pattern contains (*THEN) */
872   BOOL allowemptypartial;         /* Allow empty hard partial */
873   const uint8_t *lcc;             /* Points to lower casing table */
874   const uint8_t *fcc;             /* Points to case-flipping table */
875   const uint8_t *ctypes;          /* Points to table of type maps */
876   PCRE2_SIZE start_offset;        /* The start offset value */
877   PCRE2_SIZE end_offset_top;      /* Highwater mark at end of match */
878   uint16_t partial;               /* PARTIAL options */
879   uint16_t bsr_convention;        /* \R interpretation */
880   uint16_t name_count;            /* Number of names in name table */
881   uint16_t name_entry_size;       /* Size of entry in names table */
882   PCRE2_SPTR name_table;          /* Table of group names */
883   PCRE2_SPTR start_code;          /* For use in pattern recursion */
884   PCRE2_SPTR start_subject;       /* Start of the subject string */
885   PCRE2_SPTR check_subject;       /* Where UTF-checked from */
886   PCRE2_SPTR end_subject;         /* Usable end of the subject string */
887   PCRE2_SPTR true_end_subject;    /* Actual end of the subject string */
888   PCRE2_SPTR end_match_ptr;       /* Subject position at end match */
889   PCRE2_SPTR start_used_ptr;      /* Earliest consulted character */
890   PCRE2_SPTR last_used_ptr;       /* Latest consulted character */
891   PCRE2_SPTR mark;                /* Mark pointer to pass back on success */
892   PCRE2_SPTR nomatch_mark;        /* Mark pointer to pass back on failure */
893   PCRE2_SPTR verb_ecode_ptr;      /* For passing back info */
894   PCRE2_SPTR verb_skip_ptr;       /* For passing back a (*SKIP) name */
895   uint32_t verb_current_recurse;  /* Current recursion group when (*VERB) happens */
896   uint32_t moptions;              /* Match options */
897   uint32_t poptions;              /* Pattern options */
898   uint32_t skip_arg_count;        /* For counting SKIP_ARGs */
899   uint32_t ignore_skip_arg;       /* For re-run when SKIP arg name not found */
900   uint32_t nltype;                /* Newline type */
901   uint32_t nllen;                 /* Newline string length */
902   PCRE2_UCHAR nl[4];              /* Newline string when fixed */
903   pcre2_callout_block *cb;        /* Points to a callout block */
904   void  *callout_data;            /* To pass back to callouts */
905   int (*callout)(pcre2_callout_block *,void *);  /* Callout function or NULL */
906 } match_block;
907 
908 /* A similar structure is used for the same purpose by the DFA matching
909 functions. */
910 
911 typedef struct dfa_match_block {
912   pcre2_memctl memctl;            /* For general use */
913   PCRE2_SPTR start_code;          /* Start of the compiled pattern */
914   PCRE2_SPTR start_subject ;      /* Start of the subject string */
915   PCRE2_SPTR end_subject;         /* End of subject string */
916   PCRE2_SPTR start_used_ptr;      /* Earliest consulted character */
917   PCRE2_SPTR last_used_ptr;       /* Latest consulted character */
918   const uint8_t *tables;          /* Character tables */
919   PCRE2_SIZE start_offset;        /* The start offset value */
920   uint32_t heap_limit;            /* As it says */
921   PCRE2_SIZE heap_used;           /* As it says */
922   uint32_t match_limit;           /* As it says */
923   uint32_t match_limit_depth;     /* As it says */
924   uint32_t match_call_count;      /* Number of calls of internal function */
925   uint32_t moptions;              /* Match options */
926   uint32_t poptions;              /* Pattern options */
927   uint32_t nltype;                /* Newline type */
928   uint32_t nllen;                 /* Newline string length */
929   BOOL allowemptypartial;         /* Allow empty hard partial */
930   PCRE2_UCHAR nl[4];              /* Newline string when fixed */
931   uint16_t bsr_convention;        /* \R interpretation */
932   pcre2_callout_block *cb;        /* Points to a callout block */
933   void *callout_data;             /* To pass back to callouts */
934   int (*callout)(pcre2_callout_block *,void *);  /* Callout function or NULL */
935   dfa_recursion_info *recursive;  /* Linked list of pattern recursion data */
936 } dfa_match_block;
937 
938 #endif  /* PCRE2_PCRE2TEST */
939 
940 /* End of pcre2_intmodedep.h */
941