1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016-2021 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45
46 #include "pcre2_internal.h"
47
/* Number of pointer slots in the stack that pcre2_substitute() uses while it
expands nested ${name:+...} / ${name:-...} replacement text. Each nesting level
pushes two pointers (the position to resume at and the saved end pointer) after
a single "ptrstackptr >= PTR_STACK_SIZE" test, so this value must remain even. */
48 #define PTR_STACK_SIZE 20
49
/* The option bits that are handled by pcre2_substitute() itself. They are
saved separately and masked out of the options before pcre2_match() is called. */
50 #define SUBSTITUTE_OPTIONS \
51 (PCRE2_SUBSTITUTE_EXTENDED|PCRE2_SUBSTITUTE_GLOBAL| \
52 PCRE2_SUBSTITUTE_LITERAL|PCRE2_SUBSTITUTE_MATCHED| \
53 PCRE2_SUBSTITUTE_OVERFLOW_LENGTH|PCRE2_SUBSTITUTE_REPLACEMENT_ONLY| \
54 PCRE2_SUBSTITUTE_UNKNOWN_UNSET|PCRE2_SUBSTITUTE_UNSET_EMPTY)
55
56
57
58 /*************************************************
59 * Find end of substitute text *
60 *************************************************/
61
62 /* In extended mode, we recognize ${name:+set text:unset text} and similar
63 constructions. This requires the identification of unescaped : and }
64 characters. This function scans for such. It must deal with nested ${
65 constructions. The pointer to the text is updated, either to the required end
66 character, or to where an error was detected.
67
68 Arguments:
69 code points to the compiled expression (for options)
70 ptrptr points to the pointer to the start of the text (updated)
71 ptrend end of the whole string
72 last TRUE if the last expected string (only } recognized)
73
74 Returns: 0 on success
75 negative error code on failure
76 */
77
78 static int
find_text_end(const pcre2_code * code,PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,BOOL last)79 find_text_end(const pcre2_code *code, PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend,
80 BOOL last)
81 {
82 int rc = 0;
83 uint32_t nestlevel = 0;
84 BOOL literal = FALSE;
85 PCRE2_SPTR ptr = *ptrptr;
86
87 for (; ptr < ptrend; ptr++)
88 {
89 if (literal)
90 {
91 if (ptr[0] == CHAR_BACKSLASH && ptr < ptrend - 1 && ptr[1] == CHAR_E)
92 {
93 literal = FALSE;
94 ptr += 1;
95 }
96 }
97
98 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
99 {
100 if (nestlevel == 0) goto EXIT;
101 nestlevel--;
102 }
103
104 else if (*ptr == CHAR_COLON && !last && nestlevel == 0) goto EXIT;
105
106 else if (*ptr == CHAR_DOLLAR_SIGN)
107 {
108 if (ptr < ptrend - 1 && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
109 {
110 nestlevel++;
111 ptr += 1;
112 }
113 }
114
115 else if (*ptr == CHAR_BACKSLASH)
116 {
117 int erc;
118 int errorcode;
119 uint32_t ch;
120
121 if (ptr < ptrend - 1) switch (ptr[1])
122 {
123 case CHAR_L:
124 case CHAR_l:
125 case CHAR_U:
126 case CHAR_u:
127 ptr += 1;
128 continue;
129 }
130
131 ptr += 1; /* Must point after \ */
132 erc = PRIV(check_escape)(&ptr, ptrend, &ch, &errorcode,
133 code->overall_options, code->extra_options, FALSE, NULL);
134 ptr -= 1; /* Back to last code unit of escape */
135 if (errorcode != 0)
136 {
137 rc = errorcode;
138 goto EXIT;
139 }
140
141 switch(erc)
142 {
143 case 0: /* Data character */
144 case ESC_E: /* Isolated \E is ignored */
145 break;
146
147 case ESC_Q:
148 literal = TRUE;
149 break;
150
151 default:
152 rc = PCRE2_ERROR_BADREPESCAPE;
153 goto EXIT;
154 }
155 }
156 }
157
158 rc = PCRE2_ERROR_REPMISSINGBRACE; /* Terminator not found */
159
160 EXIT:
161 *ptrptr = ptr;
162 return rc;
163 }
164
165
166
167 /*************************************************
168 * Match and substitute *
169 *************************************************/
170
171 /* This function applies a compiled re to a subject string and creates a new
172 string with substitutions. The first 7 arguments are the same as for
173 pcre2_match(). Either string length may be PCRE2_ZERO_TERMINATED.
174
175 Arguments:
176 code points to the compiled expression
177 subject points to the subject string
178 length length of subject string (may contain binary zeros)
179 start_offset where to start in the subject string
180 options option bits
181 match_data points to a match_data block, or is NULL
  context         points to a PCRE2 match context
183 replacement points to the replacement string
184 rlength length of replacement string
185 buffer where to put the substituted string
186 blength points to length of buffer; updated to length of string
187
188 Returns: >= 0 number of substitutions made
189 < 0 an error code
190 PCRE2_ERROR_BADREPLACEMENT means invalid use of $
191 */
192
193 /* This macro checks for space in the buffer before copying into it. On
194 overflow, either give an error immediately, or keep on, accumulating the
195 length. */
196
/* CHECKMEMCPY(from, length) appends "length" code units read from "from" to
the output buffer. It relies on these names being in scope at the point of use:
buffer, buff_offset, lengthleft, extra_needed, overflowed, suboptions, and a
label NOROOM.

  - If the data fits, it is copied and buff_offset/lengthleft are adjusted.
  - If it does not fit and PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is not set, control
    jumps to NOROOM.
  - Otherwise overflowed is set and extra_needed starts counting the shortfall;
    from then on nothing is copied and each call just adds its length.

The body is wrapped in do { } while (0) so that the macro behaves as a single
statement (safe in an unbraced if/else), both arguments are parenthesized, and
"length" is evaluated exactly once. */

#define CHECKMEMCPY(from,length) \
  do \
    { \
    PCRE2_SIZE checkmemcpy_len = (length); \
    if (!overflowed && lengthleft < checkmemcpy_len) \
      { \
      if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \
      overflowed = TRUE; \
      extra_needed = checkmemcpy_len - lengthleft; \
      } \
    else if (overflowed) \
      { \
      extra_needed += checkmemcpy_len; \
      } \
    else \
      { \
      memcpy(buffer + buff_offset, (from), CU2BYTES(checkmemcpy_len)); \
      buff_offset += checkmemcpy_len; \
      lengthleft -= checkmemcpy_len; \
      } \
    } \
  while (0)
216
217 /* Here's the function */
218
219 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_substitute(const pcre2_code * code,PCRE2_SPTR subject,PCRE2_SIZE length,PCRE2_SIZE start_offset,uint32_t options,pcre2_match_data * match_data,pcre2_match_context * mcontext,PCRE2_SPTR replacement,PCRE2_SIZE rlength,PCRE2_UCHAR * buffer,PCRE2_SIZE * blength)220 pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
221 PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
222 pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength,
223 PCRE2_UCHAR *buffer, PCRE2_SIZE *blength)
224 {
225 int rc;
226 int subs;
227 int forcecase = 0;
228 int forcecasereset = 0;
229 uint32_t ovector_count;
230 uint32_t goptions = 0;
231 uint32_t suboptions;
232 pcre2_match_data *internal_match_data = NULL;
233 BOOL escaped_literal = FALSE;
234 BOOL overflowed = FALSE;
235 BOOL use_existing_match;
236 BOOL replacement_only;
237 #ifdef SUPPORT_UNICODE
238 BOOL utf = (code->overall_options & PCRE2_UTF) != 0;
239 BOOL ucp = (code->overall_options & PCRE2_UCP) != 0;
240 #endif
241 PCRE2_UCHAR temp[6];
242 PCRE2_SPTR ptr;
243 PCRE2_SPTR repend;
244 PCRE2_SIZE extra_needed = 0;
245 PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength;
246 PCRE2_SIZE *ovector;
247 PCRE2_SIZE ovecsave[3];
248 pcre2_substitute_callout_block scb;
249
250 /* General initialization */
251
252 buff_offset = 0;
253 lengthleft = buff_length = *blength;
254 *blength = PCRE2_UNSET;
255 ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET;
256
257 /* Partial matching is not valid. This must come after setting *blength to
258 PCRE2_UNSET, so as not to imply an offset in the replacement. */
259
260 if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0)
261 return PCRE2_ERROR_BADOPTION;
262
263 /* Validate length and find the end of the replacement. A NULL replacement of
264 zero length is interpreted as an empty string. */
265
266 if (replacement == NULL)
267 {
268 if (rlength != 0) return PCRE2_ERROR_NULL;
269 replacement = (PCRE2_SPTR)"";
270 }
271
272 if (rlength == PCRE2_ZERO_TERMINATED) rlength = PRIV(strlen)(replacement);
273 repend = replacement + rlength;
274
275 /* Check for using a match that has already happened. Note that the subject
276 pointer in the match data may be NULL after a no-match. */
277
278 use_existing_match = ((options & PCRE2_SUBSTITUTE_MATCHED) != 0);
279 replacement_only = ((options & PCRE2_SUBSTITUTE_REPLACEMENT_ONLY) != 0);
280
281 /* If starting from an existing match, there must be an externally provided
282 match data block. We create an internal match_data block in two cases: (a) an
283 external one is not supplied (and we are not starting from an existing match);
284 (b) an existing match is to be used for the first substitution. In the latter
285 case, we copy the existing match into the internal block. This ensures that no
286 changes are made to the existing match data block. */
287
288 if (match_data == NULL)
289 {
290 pcre2_general_context *gcontext;
291 if (use_existing_match) return PCRE2_ERROR_NULL;
292 gcontext = (mcontext == NULL)?
293 (pcre2_general_context *)code :
294 (pcre2_general_context *)mcontext;
295 match_data = internal_match_data =
296 pcre2_match_data_create_from_pattern(code, gcontext);
297 if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
298 }
299
300 else if (use_existing_match)
301 {
302 pcre2_general_context *gcontext = (mcontext == NULL)?
303 (pcre2_general_context *)code :
304 (pcre2_general_context *)mcontext;
305 int pairs = (code->top_bracket + 1 < match_data->oveccount)?
306 code->top_bracket + 1 : match_data->oveccount;
307 internal_match_data = pcre2_match_data_create(match_data->oveccount,
308 gcontext);
309 if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
310 memcpy(internal_match_data, match_data, offsetof(pcre2_match_data, ovector)
311 + 2*pairs*sizeof(PCRE2_SIZE));
312 match_data = internal_match_data;
313 }
314
315 /* Remember ovector details */
316
317 ovector = pcre2_get_ovector_pointer(match_data);
318 ovector_count = pcre2_get_ovector_count(match_data);
319
320 /* Fixed things in the callout block */
321
322 scb.version = 0;
323 scb.input = subject;
324 scb.output = (PCRE2_SPTR)buffer;
325 scb.ovector = ovector;
326
327 /* A NULL subject of zero length is treated as an empty string. */
328
329 if (subject == NULL)
330 {
331 if (length != 0) return PCRE2_ERROR_NULL;
332 subject = (PCRE2_SPTR)"";
333 }
334
335 /* Find length of zero-terminated subject */
336
337 if (length == PCRE2_ZERO_TERMINATED)
338 length = subject? PRIV(strlen)(subject) : 0;
339
340 /* Check UTF replacement string if necessary. */
341
342 #ifdef SUPPORT_UNICODE
343 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
344 {
345 rc = PRIV(valid_utf)(replacement, rlength, &(match_data->startchar));
346 if (rc != 0)
347 {
348 match_data->leftchar = 0;
349 goto EXIT;
350 }
351 }
352 #endif /* SUPPORT_UNICODE */
353
354 /* Save the substitute options and remove them from the match options. */
355
356 suboptions = options & SUBSTITUTE_OPTIONS;
357 options &= ~SUBSTITUTE_OPTIONS;
358
359 /* Error if the start match offset is greater than the length of the subject. */
360
361 if (start_offset > length)
362 {
363 match_data->leftchar = 0;
364 rc = PCRE2_ERROR_BADOFFSET;
365 goto EXIT;
366 }
367
368 /* Copy up to the start offset, unless only the replacement is required. */
369
370 if (!replacement_only) CHECKMEMCPY(subject, start_offset);
371
372 /* Loop for global substituting. If PCRE2_SUBSTITUTE_MATCHED is set, the first
373 match is taken from the match_data that was passed in. */
374
375 subs = 0;
376 do
377 {
378 PCRE2_SPTR ptrstack[PTR_STACK_SIZE];
379 uint32_t ptrstackptr = 0;
380
381 if (use_existing_match)
382 {
383 rc = match_data->rc;
384 use_existing_match = FALSE;
385 }
386 else rc = pcre2_match(code, subject, length, start_offset, options|goptions,
387 match_data, mcontext);
388
389 #ifdef SUPPORT_UNICODE
390 if (utf) options |= PCRE2_NO_UTF_CHECK; /* Only need to check once */
391 #endif
392
393 /* Any error other than no match returns the error code. No match when not
394 doing the special after-empty-match global rematch, or when at the end of the
395 subject, breaks the global loop. Otherwise, advance the starting point by one
396 character, copying it to the output, and try again. */
397
398 if (rc < 0)
399 {
400 PCRE2_SIZE save_start;
401
402 if (rc != PCRE2_ERROR_NOMATCH) goto EXIT;
403 if (goptions == 0 || start_offset >= length) break;
404
405 /* Advance by one code point. Then, if CRLF is a valid newline sequence and
406 we have advanced into the middle of it, advance one more code point. In
407 other words, do not start in the middle of CRLF, even if CR and LF on their
408 own are valid newlines. */
409
410 save_start = start_offset++;
411 if (subject[start_offset-1] == CHAR_CR &&
412 code->newline_convention != PCRE2_NEWLINE_CR &&
413 code->newline_convention != PCRE2_NEWLINE_LF &&
414 start_offset < length &&
415 subject[start_offset] == CHAR_LF)
416 start_offset++;
417
418 /* Otherwise, in UTF mode, advance past any secondary code points. */
419
420 else if ((code->overall_options & PCRE2_UTF) != 0)
421 {
422 #if PCRE2_CODE_UNIT_WIDTH == 8
423 while (start_offset < length && (subject[start_offset] & 0xc0) == 0x80)
424 start_offset++;
425 #elif PCRE2_CODE_UNIT_WIDTH == 16
426 while (start_offset < length &&
427 (subject[start_offset] & 0xfc00) == 0xdc00)
428 start_offset++;
429 #endif
430 }
431
432 /* Copy what we have advanced past (unless not required), reset the special
433 global options, and continue to the next match. */
434
435 fraglength = start_offset - save_start;
436 if (!replacement_only) CHECKMEMCPY(subject + save_start, fraglength);
437 goptions = 0;
438 continue;
439 }
440
441 /* Handle a successful match. Matches that use \K to end before they start
442 or start before the current point in the subject are not supported. */
443
444 if (ovector[1] < ovector[0] || ovector[0] < start_offset)
445 {
446 rc = PCRE2_ERROR_BADSUBSPATTERN;
447 goto EXIT;
448 }
449
450 /* Check for the same match as previous. This is legitimate after matching an
451 empty string that starts after the initial match offset. We have tried again
452 at the match point in case the pattern is one like /(?<=\G.)/ which can never
453 match at its starting point, so running the match achieves the bumpalong. If
454 we do get the same (null) match at the original match point, it isn't such a
455 pattern, so we now do the empty string magic. In all other cases, a repeat
456 match should never occur. */
457
458 if (ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1])
459 {
460 if (ovector[0] == ovector[1] && ovecsave[2] != start_offset)
461 {
462 goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
463 ovecsave[2] = start_offset;
464 continue; /* Back to the top of the loop */
465 }
466 rc = PCRE2_ERROR_INTERNAL_DUPMATCH;
467 goto EXIT;
468 }
469
470 /* Count substitutions with a paranoid check for integer overflow; surely no
471 real call to this function would ever hit this! */
472
473 if (subs == INT_MAX)
474 {
475 rc = PCRE2_ERROR_TOOMANYREPLACE;
476 goto EXIT;
477 }
478 subs++;
479
480 /* Copy the text leading up to the match (unless not required), and remember
481 where the insert begins and how many ovector pairs are set. */
482
483 if (rc == 0) rc = ovector_count;
484 fraglength = ovector[0] - start_offset;
485 if (!replacement_only) CHECKMEMCPY(subject + start_offset, fraglength);
486 scb.output_offsets[0] = buff_offset;
487 scb.oveccount = rc;
488
489 /* Process the replacement string. If the entire replacement is literal, just
490 copy it with length check. */
491
492 ptr = replacement;
493 if ((suboptions & PCRE2_SUBSTITUTE_LITERAL) != 0)
494 {
495 CHECKMEMCPY(ptr, rlength);
496 }
497
498 /* Within a non-literal replacement, which must be scanned character by
499 character, local literal mode can be set by \Q, but only in extended mode
500 when backslashes are being interpreted. In extended mode we must handle
501 nested substrings that are to be reprocessed. */
502
503 else for (;;)
504 {
505 uint32_t ch;
506 unsigned int chlen;
507
508 /* If at the end of a nested substring, pop the stack. */
509
510 if (ptr >= repend)
511 {
512 if (ptrstackptr == 0) break; /* End of replacement string */
513 repend = ptrstack[--ptrstackptr];
514 ptr = ptrstack[--ptrstackptr];
515 continue;
516 }
517
518 /* Handle the next character */
519
520 if (escaped_literal)
521 {
522 if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E)
523 {
524 escaped_literal = FALSE;
525 ptr += 2;
526 continue;
527 }
528 goto LOADLITERAL;
529 }
530
531 /* Not in literal mode. */
532
533 if (*ptr == CHAR_DOLLAR_SIGN)
534 {
535 int group, n;
536 uint32_t special = 0;
537 BOOL inparens;
538 BOOL star;
539 PCRE2_SIZE sublength;
540 PCRE2_SPTR text1_start = NULL;
541 PCRE2_SPTR text1_end = NULL;
542 PCRE2_SPTR text2_start = NULL;
543 PCRE2_SPTR text2_end = NULL;
544 PCRE2_UCHAR next;
545 PCRE2_UCHAR name[33];
546
547 if (++ptr >= repend) goto BAD;
548 if ((next = *ptr) == CHAR_DOLLAR_SIGN) goto LOADLITERAL;
549
550 group = -1;
551 n = 0;
552 inparens = FALSE;
553 star = FALSE;
554
555 if (next == CHAR_LEFT_CURLY_BRACKET)
556 {
557 if (++ptr >= repend) goto BAD;
558 next = *ptr;
559 inparens = TRUE;
560 }
561
562 if (next == CHAR_ASTERISK)
563 {
564 if (++ptr >= repend) goto BAD;
565 next = *ptr;
566 star = TRUE;
567 }
568
569 if (!star && next >= CHAR_0 && next <= CHAR_9)
570 {
571 group = next - CHAR_0;
572 while (++ptr < repend)
573 {
574 next = *ptr;
575 if (next < CHAR_0 || next > CHAR_9) break;
576 group = group * 10 + next - CHAR_0;
577
578 /* A check for a number greater than the hightest captured group
579 is sufficient here; no need for a separate overflow check. If unknown
580 groups are to be treated as unset, just skip over any remaining
581 digits and carry on. */
582
583 if (group > code->top_bracket)
584 {
585 if ((suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
586 {
587 while (++ptr < repend && *ptr >= CHAR_0 && *ptr <= CHAR_9);
588 break;
589 }
590 else
591 {
592 rc = PCRE2_ERROR_NOSUBSTRING;
593 goto PTREXIT;
594 }
595 }
596 }
597 }
598 else
599 {
600 const uint8_t *ctypes = code->tables + ctypes_offset;
601 while (MAX_255(next) && (ctypes[next] & ctype_word) != 0)
602 {
603 name[n++] = next;
604 if (n > 32) goto BAD;
605 if (++ptr >= repend) break;
606 next = *ptr;
607 }
608 if (n == 0) goto BAD;
609 name[n] = 0;
610 }
611
612 /* In extended mode we recognize ${name:+set text:unset text} and
613 ${name:-default text}. */
614
615 if (inparens)
616 {
617 if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
618 !star && ptr < repend - 2 && next == CHAR_COLON)
619 {
620 special = *(++ptr);
621 if (special != CHAR_PLUS && special != CHAR_MINUS)
622 {
623 rc = PCRE2_ERROR_BADSUBSTITUTION;
624 goto PTREXIT;
625 }
626
627 text1_start = ++ptr;
628 rc = find_text_end(code, &ptr, repend, special == CHAR_MINUS);
629 if (rc != 0) goto PTREXIT;
630 text1_end = ptr;
631
632 if (special == CHAR_PLUS && *ptr == CHAR_COLON)
633 {
634 text2_start = ++ptr;
635 rc = find_text_end(code, &ptr, repend, TRUE);
636 if (rc != 0) goto PTREXIT;
637 text2_end = ptr;
638 }
639 }
640
641 else
642 {
643 if (ptr >= repend || *ptr != CHAR_RIGHT_CURLY_BRACKET)
644 {
645 rc = PCRE2_ERROR_REPMISSINGBRACE;
646 goto PTREXIT;
647 }
648 }
649
650 ptr++;
651 }
652
653 /* Have found a syntactically correct group number or name, or *name.
654 Only *MARK is currently recognized. */
655
656 if (star)
657 {
658 if (PRIV(strcmp_c8)(name, STRING_MARK) == 0)
659 {
660 PCRE2_SPTR mark = pcre2_get_mark(match_data);
661 if (mark != NULL)
662 {
663 PCRE2_SPTR mark_start = mark;
664 while (*mark != 0) mark++;
665 fraglength = mark - mark_start;
666 CHECKMEMCPY(mark_start, fraglength);
667 }
668 }
669 else goto BAD;
670 }
671
672 /* Substitute the contents of a group. We don't use substring_copy
673 functions any more, in order to support case forcing. */
674
675 else
676 {
677 PCRE2_SPTR subptr, subptrend;
678
679 /* Find a number for a named group. In case there are duplicate names,
680 search for the first one that is set. If the name is not found when
681 PCRE2_SUBSTITUTE_UNKNOWN_EMPTY is set, set the group number to a
682 non-existent group. */
683
684 if (group < 0)
685 {
686 PCRE2_SPTR first, last, entry;
687 rc = pcre2_substring_nametable_scan(code, name, &first, &last);
688 if (rc == PCRE2_ERROR_NOSUBSTRING &&
689 (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
690 {
691 group = code->top_bracket + 1;
692 }
693 else
694 {
695 if (rc < 0) goto PTREXIT;
696 for (entry = first; entry <= last; entry += rc)
697 {
698 uint32_t ng = GET2(entry, 0);
699 if (ng < ovector_count)
700 {
701 if (group < 0) group = ng; /* First in ovector */
702 if (ovector[ng*2] != PCRE2_UNSET)
703 {
704 group = ng; /* First that is set */
705 break;
706 }
707 }
708 }
709
710 /* If group is still negative, it means we did not find a group
711 that is in the ovector. Just set the first group. */
712
713 if (group < 0) group = GET2(first, 0);
714 }
715 }
716
717 /* We now have a group that is identified by number. Find the length of
718 the captured string. If a group in a non-special substitution is unset
719 when PCRE2_SUBSTITUTE_UNSET_EMPTY is set, substitute nothing. */
720
721 rc = pcre2_substring_length_bynumber(match_data, group, &sublength);
722 if (rc < 0)
723 {
724 if (rc == PCRE2_ERROR_NOSUBSTRING &&
725 (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
726 {
727 rc = PCRE2_ERROR_UNSET;
728 }
729 if (rc != PCRE2_ERROR_UNSET) goto PTREXIT; /* Non-unset errors */
730 if (special == 0) /* Plain substitution */
731 {
732 if ((suboptions & PCRE2_SUBSTITUTE_UNSET_EMPTY) != 0) continue;
733 goto PTREXIT; /* Else error */
734 }
735 }
736
737 /* If special is '+' we have a 'set' and possibly an 'unset' text,
738 both of which are reprocessed when used. If special is '-' we have a
739 default text for when the group is unset; it must be reprocessed. */
740
741 if (special != 0)
742 {
743 if (special == CHAR_MINUS)
744 {
745 if (rc == 0) goto LITERAL_SUBSTITUTE;
746 text2_start = text1_start;
747 text2_end = text1_end;
748 }
749
750 if (ptrstackptr >= PTR_STACK_SIZE) goto BAD;
751 ptrstack[ptrstackptr++] = ptr;
752 ptrstack[ptrstackptr++] = repend;
753
754 if (rc == 0)
755 {
756 ptr = text1_start;
757 repend = text1_end;
758 }
759 else
760 {
761 ptr = text2_start;
762 repend = text2_end;
763 }
764 continue;
765 }
766
767 /* Otherwise we have a literal substitution of a group's contents. */
768
769 LITERAL_SUBSTITUTE:
770 subptr = subject + ovector[group*2];
771 subptrend = subject + ovector[group*2 + 1];
772
773 /* Substitute a literal string, possibly forcing alphabetic case. */
774
775 while (subptr < subptrend)
776 {
777 GETCHARINCTEST(ch, subptr);
778 if (forcecase != 0)
779 {
780 #ifdef SUPPORT_UNICODE
781 if (utf || ucp)
782 {
783 uint32_t type = UCD_CHARTYPE(ch);
784 if (PRIV(ucp_gentype)[type] == ucp_L &&
785 type != ((forcecase > 0)? ucp_Lu : ucp_Ll))
786 ch = UCD_OTHERCASE(ch);
787 }
788 else
789 #endif
790 {
791 if (((code->tables + cbits_offset +
792 ((forcecase > 0)? cbit_upper:cbit_lower)
793 )[ch/8] & (1u << (ch%8))) == 0)
794 ch = (code->tables + fcc_offset)[ch];
795 }
796 forcecase = forcecasereset;
797 }
798
799 #ifdef SUPPORT_UNICODE
800 if (utf) chlen = PRIV(ord2utf)(ch, temp); else
801 #endif
802 {
803 temp[0] = ch;
804 chlen = 1;
805 }
806 CHECKMEMCPY(temp, chlen);
807 }
808 }
809 }
810
811 /* Handle an escape sequence in extended mode. We can use check_escape()
812 to process \Q, \E, \c, \o, \x and \ followed by non-alphanumerics, but
813 the case-forcing escapes are not supported in pcre2_compile() so must be
814 recognized here. */
815
816 else if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
817 *ptr == CHAR_BACKSLASH)
818 {
819 int errorcode;
820
821 if (ptr < repend - 1) switch (ptr[1])
822 {
823 case CHAR_L:
824 forcecase = forcecasereset = -1;
825 ptr += 2;
826 continue;
827
828 case CHAR_l:
829 forcecase = -1;
830 forcecasereset = 0;
831 ptr += 2;
832 continue;
833
834 case CHAR_U:
835 forcecase = forcecasereset = 1;
836 ptr += 2;
837 continue;
838
839 case CHAR_u:
840 forcecase = 1;
841 forcecasereset = 0;
842 ptr += 2;
843 continue;
844
845 default:
846 break;
847 }
848
849 ptr++; /* Point after \ */
850 rc = PRIV(check_escape)(&ptr, repend, &ch, &errorcode,
851 code->overall_options, code->extra_options, FALSE, NULL);
852 if (errorcode != 0) goto BADESCAPE;
853
854 switch(rc)
855 {
856 case ESC_E:
857 forcecase = forcecasereset = 0;
858 continue;
859
860 case ESC_Q:
861 escaped_literal = TRUE;
862 continue;
863
864 case 0: /* Data character */
865 goto LITERAL;
866
867 default:
868 goto BADESCAPE;
869 }
870 }
871
872 /* Handle a literal code unit */
873
874 else
875 {
876 LOADLITERAL:
877 GETCHARINCTEST(ch, ptr); /* Get character value, increment pointer */
878
879 LITERAL:
880 if (forcecase != 0)
881 {
882 #ifdef SUPPORT_UNICODE
883 if (utf || ucp)
884 {
885 uint32_t type = UCD_CHARTYPE(ch);
886 if (PRIV(ucp_gentype)[type] == ucp_L &&
887 type != ((forcecase > 0)? ucp_Lu : ucp_Ll))
888 ch = UCD_OTHERCASE(ch);
889 }
890 else
891 #endif
892 {
893 if (((code->tables + cbits_offset +
894 ((forcecase > 0)? cbit_upper:cbit_lower)
895 )[ch/8] & (1u << (ch%8))) == 0)
896 ch = (code->tables + fcc_offset)[ch];
897 }
898 forcecase = forcecasereset;
899 }
900
901 #ifdef SUPPORT_UNICODE
902 if (utf) chlen = PRIV(ord2utf)(ch, temp); else
903 #endif
904 {
905 temp[0] = ch;
906 chlen = 1;
907 }
908 CHECKMEMCPY(temp, chlen);
909 } /* End handling a literal code unit */
910 } /* End of loop for scanning the replacement. */
911
912 /* The replacement has been copied to the output, or its size has been
913 remembered. Do the callout if there is one and we have done an actual
914 replacement. */
915
916 if (!overflowed && mcontext != NULL && mcontext->substitute_callout != NULL)
917 {
918 scb.subscount = subs;
919 scb.output_offsets[1] = buff_offset;
920 rc = mcontext->substitute_callout(&scb, mcontext->substitute_callout_data);
921
922 /* A non-zero return means cancel this substitution. Instead, copy the
923 matched string fragment. */
924
925 if (rc != 0)
926 {
927 PCRE2_SIZE newlength = scb.output_offsets[1] - scb.output_offsets[0];
928 PCRE2_SIZE oldlength = ovector[1] - ovector[0];
929
930 buff_offset -= newlength;
931 lengthleft += newlength;
932 if (!replacement_only) CHECKMEMCPY(subject + ovector[0], oldlength);
933
934 /* A negative return means do not do any more. */
935
936 if (rc < 0) suboptions &= (~PCRE2_SUBSTITUTE_GLOBAL);
937 }
938 }
939
940 /* Save the details of this match. See above for how this data is used. If we
941 matched an empty string, do the magic for global matches. Update the start
942 offset to point to the rest of the subject string. If we re-used an existing
943 match for the first match, switch to the internal match data block. */
944
945 ovecsave[0] = ovector[0];
946 ovecsave[1] = ovector[1];
947 ovecsave[2] = start_offset;
948
949 goptions = (ovector[0] != ovector[1] || ovector[0] > start_offset)? 0 :
950 PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART;
951 start_offset = ovector[1];
952 } while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0); /* Repeat "do" loop */
953
954 /* Copy the rest of the subject unless not required, and terminate the output
955 with a binary zero. */
956
957 if (!replacement_only)
958 {
959 fraglength = length - start_offset;
960 CHECKMEMCPY(subject + start_offset, fraglength);
961 }
962
963 temp[0] = 0;
964 CHECKMEMCPY(temp, 1);
965
966 /* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set,
967 and matching has carried on after a full buffer, in order to compute the length
968 needed. Otherwise, an overflow generates an immediate error return. */
969
970 if (overflowed)
971 {
972 rc = PCRE2_ERROR_NOMEMORY;
973 *blength = buff_length + extra_needed;
974 }
975
976 /* After a successful execution, return the number of substitutions and set the
977 length of buffer used, excluding the trailing zero. */
978
979 else
980 {
981 rc = subs;
982 *blength = buff_offset - 1;
983 }
984
985 EXIT:
986 if (internal_match_data != NULL) pcre2_match_data_free(internal_match_data);
987 else match_data->rc = rc;
988 return rc;
989
990 NOROOM:
991 rc = PCRE2_ERROR_NOMEMORY;
992 goto EXIT;
993
994 BAD:
995 rc = PCRE2_ERROR_BADREPLACEMENT;
996 goto PTREXIT;
997
998 BADESCAPE:
999 rc = PCRE2_ERROR_BADREPESCAPE;
1000
1001 PTREXIT:
1002 *blength = (PCRE2_SIZE)(ptr - replacement);
1003 goto EXIT;
1004 }
1005
1006 /* End of pcre2_substitute.c */
1007