1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016-2022 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45
46 #include "pcre2_internal.h"
47
/* Number of slots in the pointer stack that pcre2_substitute() uses while
reprocessing nested replacement text. Entering a nested substring pushes two
pointers (the resume position and the saved end pointer), so this value permits
PTR_STACK_SIZE/2 levels of nesting. */

#define PTR_STACK_SIZE 20

/* The option bits that are specific to pcre2_substitute(). They are saved in a
separate variable and cleared from the options word before it is passed on to
pcre2_match(). */

#define SUBSTITUTE_OPTIONS \
  (PCRE2_SUBSTITUTE_EXTENDED|PCRE2_SUBSTITUTE_GLOBAL| \
   PCRE2_SUBSTITUTE_LITERAL|PCRE2_SUBSTITUTE_MATCHED| \
   PCRE2_SUBSTITUTE_OVERFLOW_LENGTH|PCRE2_SUBSTITUTE_REPLACEMENT_ONLY| \
   PCRE2_SUBSTITUTE_UNKNOWN_UNSET|PCRE2_SUBSTITUTE_UNSET_EMPTY)
55
56
57
58 /*************************************************
59 * Find end of substitute text *
60 *************************************************/
61
62 /* In extended mode, we recognize ${name:+set text:unset text} and similar
63 constructions. This requires the identification of unescaped : and }
64 characters. This function scans for such. It must deal with nested ${
65 constructions. The pointer to the text is updated, either to the required end
66 character, or to where an error was detected.
67
68 Arguments:
69 code points to the compiled expression (for options)
70 ptrptr points to the pointer to the start of the text (updated)
71 ptrend end of the whole string
72 last TRUE if the last expected string (only } recognized)
73
74 Returns: 0 on success
75 negative error code on failure
76 */
77
78 static int
find_text_end(const pcre2_code * code,PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,BOOL last)79 find_text_end(const pcre2_code *code, PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend,
80 BOOL last)
81 {
82 int rc = 0;
83 uint32_t nestlevel = 0;
84 BOOL literal = FALSE;
85 PCRE2_SPTR ptr = *ptrptr;
86
87 for (; ptr < ptrend; ptr++)
88 {
89 if (literal)
90 {
91 if (ptr[0] == CHAR_BACKSLASH && ptr < ptrend - 1 && ptr[1] == CHAR_E)
92 {
93 literal = FALSE;
94 ptr += 1;
95 }
96 }
97
98 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
99 {
100 if (nestlevel == 0) goto EXIT;
101 nestlevel--;
102 }
103
104 else if (*ptr == CHAR_COLON && !last && nestlevel == 0) goto EXIT;
105
106 else if (*ptr == CHAR_DOLLAR_SIGN)
107 {
108 if (ptr < ptrend - 1 && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
109 {
110 nestlevel++;
111 ptr += 1;
112 }
113 }
114
115 else if (*ptr == CHAR_BACKSLASH)
116 {
117 int erc;
118 int errorcode;
119 uint32_t ch;
120
121 if (ptr < ptrend - 1) switch (ptr[1])
122 {
123 case CHAR_L:
124 case CHAR_l:
125 case CHAR_U:
126 case CHAR_u:
127 ptr += 1;
128 continue;
129 }
130
131 ptr += 1; /* Must point after \ */
132 erc = PRIV(check_escape)(&ptr, ptrend, &ch, &errorcode,
133 code->overall_options, code->extra_options, FALSE, NULL);
134 ptr -= 1; /* Back to last code unit of escape */
135 if (errorcode != 0)
136 {
137 rc = errorcode;
138 goto EXIT;
139 }
140
141 switch(erc)
142 {
143 case 0: /* Data character */
144 case ESC_E: /* Isolated \E is ignored */
145 break;
146
147 case ESC_Q:
148 literal = TRUE;
149 break;
150
151 default:
152 rc = PCRE2_ERROR_BADREPESCAPE;
153 goto EXIT;
154 }
155 }
156 }
157
158 rc = PCRE2_ERROR_REPMISSINGBRACE; /* Terminator not found */
159
160 EXIT:
161 *ptrptr = ptr;
162 return rc;
163 }
164
165
166
167 /*************************************************
168 * Match and substitute *
169 *************************************************/
170
171 /* This function applies a compiled re to a subject string and creates a new
172 string with substitutions. The first 7 arguments are the same as for
173 pcre2_match(). Either string length may be PCRE2_ZERO_TERMINATED.
174
175 Arguments:
176 code points to the compiled expression
177 subject points to the subject string
178 length length of subject string (may contain binary zeros)
179 start_offset where to start in the subject string
180 options option bits
181 match_data points to a match_data block, or is NULL
  context         points to a PCRE2 match context
183 replacement points to the replacement string
184 rlength length of replacement string
185 buffer where to put the substituted string
186 blength points to length of buffer; updated to length of string
187
188 Returns: >= 0 number of substitutions made
189 < 0 an error code
190 PCRE2_ERROR_BADREPLACEMENT means invalid use of $
191 */
192
193 /* This macro checks for space in the buffer before copying into it. On
194 overflow, either give an error immediately, or keep on, accumulating the
195 length. */
196
/* CHECKMEMCPY(from, length) appends "length" code units starting at "from" to
the output buffer if they fit. It relies on these names being in scope at the
point of use: buffer, buff_offset, lengthleft, overflowed, extra_needed,
suboptions, and a label NOROOM.

  - If there is room, the data is copied and the offsets are updated.
  - If there is no room and PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is not set, control
    jumps to NOROOM.
  - Otherwise "overflowed" is set, nothing more is copied, and extra_needed
    accumulates the number of additional code units that would be required.

The arguments are parenthesized so that any expression may be passed, and the
body is wrapped in do { } while (0) so that the macro behaves as a single
statement (for example before an "else"). Note that "length" is evaluated more
than once, so it must not have side effects. */

#define CHECKMEMCPY(from,length) \
  do \
    { \
    if (overflowed) \
      { \
      extra_needed += (length); \
      } \
    else if (lengthleft < (length)) \
      { \
      if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \
      overflowed = TRUE; \
      extra_needed = (length) - lengthleft; \
      } \
    else \
      { \
      memcpy(buffer + buff_offset, (from), CU2BYTES(length)); \
      buff_offset += (length); \
      lengthleft -= (length); \
      } \
    } \
  while (0)
216
217 /* Here's the function */
218
219 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_substitute(const pcre2_code * code,PCRE2_SPTR subject,PCRE2_SIZE length,PCRE2_SIZE start_offset,uint32_t options,pcre2_match_data * match_data,pcre2_match_context * mcontext,PCRE2_SPTR replacement,PCRE2_SIZE rlength,PCRE2_UCHAR * buffer,PCRE2_SIZE * blength)220 pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
221 PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
222 pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength,
223 PCRE2_UCHAR *buffer, PCRE2_SIZE *blength)
224 {
225 int rc;
226 int subs;
227 int forcecase = 0;
228 int forcecasereset = 0;
229 uint32_t ovector_count;
230 uint32_t goptions = 0;
231 uint32_t suboptions;
232 pcre2_match_data *internal_match_data = NULL;
233 BOOL escaped_literal = FALSE;
234 BOOL overflowed = FALSE;
235 BOOL use_existing_match;
236 BOOL replacement_only;
237 #ifdef SUPPORT_UNICODE
238 BOOL utf = (code->overall_options & PCRE2_UTF) != 0;
239 BOOL ucp = (code->overall_options & PCRE2_UCP) != 0;
240 #endif
241 PCRE2_UCHAR temp[6];
242 PCRE2_SPTR ptr;
243 PCRE2_SPTR repend;
244 PCRE2_SIZE extra_needed = 0;
245 PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength;
246 PCRE2_SIZE *ovector;
247 PCRE2_SIZE ovecsave[3];
248 pcre2_substitute_callout_block scb;
249
250 /* General initialization */
251
252 buff_offset = 0;
253 lengthleft = buff_length = *blength;
254 *blength = PCRE2_UNSET;
255 ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET;
256
257 /* Partial matching is not valid. This must come after setting *blength to
258 PCRE2_UNSET, so as not to imply an offset in the replacement. */
259
260 if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0)
261 return PCRE2_ERROR_BADOPTION;
262
263 /* Validate length and find the end of the replacement. A NULL replacement of
264 zero length is interpreted as an empty string. */
265
266 if (replacement == NULL)
267 {
268 if (rlength != 0) return PCRE2_ERROR_NULL;
269 replacement = (PCRE2_SPTR)"";
270 }
271
272 if (rlength == PCRE2_ZERO_TERMINATED) rlength = PRIV(strlen)(replacement);
273 repend = replacement + rlength;
274
275 /* Check for using a match that has already happened. Note that the subject
276 pointer in the match data may be NULL after a no-match. */
277
278 use_existing_match = ((options & PCRE2_SUBSTITUTE_MATCHED) != 0);
279 replacement_only = ((options & PCRE2_SUBSTITUTE_REPLACEMENT_ONLY) != 0);
280
281 /* If starting from an existing match, there must be an externally provided
282 match data block. We create an internal match_data block in two cases: (a) an
283 external one is not supplied (and we are not starting from an existing match);
284 (b) an existing match is to be used for the first substitution. In the latter
285 case, we copy the existing match into the internal block, except for any cached
286 heap frame size and pointer. This ensures that no changes are made to the
287 external match data block. */
288
289 if (match_data == NULL)
290 {
291 pcre2_general_context *gcontext;
292 if (use_existing_match) return PCRE2_ERROR_NULL;
293 gcontext = (mcontext == NULL)?
294 (pcre2_general_context *)code :
295 (pcre2_general_context *)mcontext;
296 match_data = internal_match_data =
297 pcre2_match_data_create_from_pattern(code, gcontext);
298 if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
299 }
300
301 else if (use_existing_match)
302 {
303 pcre2_general_context *gcontext = (mcontext == NULL)?
304 (pcre2_general_context *)code :
305 (pcre2_general_context *)mcontext;
306 int pairs = (code->top_bracket + 1 < match_data->oveccount)?
307 code->top_bracket + 1 : match_data->oveccount;
308 internal_match_data = pcre2_match_data_create(match_data->oveccount,
309 gcontext);
310 if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
311 memcpy(internal_match_data, match_data, offsetof(pcre2_match_data, ovector)
312 + 2*pairs*sizeof(PCRE2_SIZE));
313 internal_match_data->heapframes = NULL;
314 internal_match_data->heapframes_size = 0;
315 match_data = internal_match_data;
316 }
317
318 /* Remember ovector details */
319
320 ovector = pcre2_get_ovector_pointer(match_data);
321 ovector_count = pcre2_get_ovector_count(match_data);
322
323 /* Fixed things in the callout block */
324
325 scb.version = 0;
326 scb.input = subject;
327 scb.output = (PCRE2_SPTR)buffer;
328 scb.ovector = ovector;
329
330 /* A NULL subject of zero length is treated as an empty string. */
331
332 if (subject == NULL)
333 {
334 if (length != 0) return PCRE2_ERROR_NULL;
335 subject = (PCRE2_SPTR)"";
336 }
337
338 /* Find length of zero-terminated subject */
339
340 if (length == PCRE2_ZERO_TERMINATED)
341 length = subject? PRIV(strlen)(subject) : 0;
342
343 /* Check UTF replacement string if necessary. */
344
345 #ifdef SUPPORT_UNICODE
346 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
347 {
348 rc = PRIV(valid_utf)(replacement, rlength, &(match_data->startchar));
349 if (rc != 0)
350 {
351 match_data->leftchar = 0;
352 goto EXIT;
353 }
354 }
355 #endif /* SUPPORT_UNICODE */
356
357 /* Save the substitute options and remove them from the match options. */
358
359 suboptions = options & SUBSTITUTE_OPTIONS;
360 options &= ~SUBSTITUTE_OPTIONS;
361
362 /* Error if the start match offset is greater than the length of the subject. */
363
364 if (start_offset > length)
365 {
366 match_data->leftchar = 0;
367 rc = PCRE2_ERROR_BADOFFSET;
368 goto EXIT;
369 }
370
371 /* Copy up to the start offset, unless only the replacement is required. */
372
373 if (!replacement_only) CHECKMEMCPY(subject, start_offset);
374
375 /* Loop for global substituting. If PCRE2_SUBSTITUTE_MATCHED is set, the first
376 match is taken from the match_data that was passed in. */
377
378 subs = 0;
379 do
380 {
381 PCRE2_SPTR ptrstack[PTR_STACK_SIZE];
382 uint32_t ptrstackptr = 0;
383
384 if (use_existing_match)
385 {
386 rc = match_data->rc;
387 use_existing_match = FALSE;
388 }
389 else rc = pcre2_match(code, subject, length, start_offset, options|goptions,
390 match_data, mcontext);
391
392 #ifdef SUPPORT_UNICODE
393 if (utf) options |= PCRE2_NO_UTF_CHECK; /* Only need to check once */
394 #endif
395
396 /* Any error other than no match returns the error code. No match when not
397 doing the special after-empty-match global rematch, or when at the end of the
398 subject, breaks the global loop. Otherwise, advance the starting point by one
399 character, copying it to the output, and try again. */
400
401 if (rc < 0)
402 {
403 PCRE2_SIZE save_start;
404
405 if (rc != PCRE2_ERROR_NOMATCH) goto EXIT;
406 if (goptions == 0 || start_offset >= length) break;
407
408 /* Advance by one code point. Then, if CRLF is a valid newline sequence and
409 we have advanced into the middle of it, advance one more code point. In
410 other words, do not start in the middle of CRLF, even if CR and LF on their
411 own are valid newlines. */
412
413 save_start = start_offset++;
414 if (subject[start_offset-1] == CHAR_CR &&
415 code->newline_convention != PCRE2_NEWLINE_CR &&
416 code->newline_convention != PCRE2_NEWLINE_LF &&
417 start_offset < length &&
418 subject[start_offset] == CHAR_LF)
419 start_offset++;
420
421 /* Otherwise, in UTF mode, advance past any secondary code points. */
422
423 else if ((code->overall_options & PCRE2_UTF) != 0)
424 {
425 #if PCRE2_CODE_UNIT_WIDTH == 8
426 while (start_offset < length && (subject[start_offset] & 0xc0) == 0x80)
427 start_offset++;
428 #elif PCRE2_CODE_UNIT_WIDTH == 16
429 while (start_offset < length &&
430 (subject[start_offset] & 0xfc00) == 0xdc00)
431 start_offset++;
432 #endif
433 }
434
435 /* Copy what we have advanced past (unless not required), reset the special
436 global options, and continue to the next match. */
437
438 fraglength = start_offset - save_start;
439 if (!replacement_only) CHECKMEMCPY(subject + save_start, fraglength);
440 goptions = 0;
441 continue;
442 }
443
444 /* Handle a successful match. Matches that use \K to end before they start
445 or start before the current point in the subject are not supported. */
446
447 if (ovector[1] < ovector[0] || ovector[0] < start_offset)
448 {
449 rc = PCRE2_ERROR_BADSUBSPATTERN;
450 goto EXIT;
451 }
452
453 /* Check for the same match as previous. This is legitimate after matching an
454 empty string that starts after the initial match offset. We have tried again
455 at the match point in case the pattern is one like /(?<=\G.)/ which can never
456 match at its starting point, so running the match achieves the bumpalong. If
457 we do get the same (null) match at the original match point, it isn't such a
458 pattern, so we now do the empty string magic. In all other cases, a repeat
459 match should never occur. */
460
461 if (ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1])
462 {
463 if (ovector[0] == ovector[1] && ovecsave[2] != start_offset)
464 {
465 goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
466 ovecsave[2] = start_offset;
467 continue; /* Back to the top of the loop */
468 }
469 rc = PCRE2_ERROR_INTERNAL_DUPMATCH;
470 goto EXIT;
471 }
472
473 /* Count substitutions with a paranoid check for integer overflow; surely no
474 real call to this function would ever hit this! */
475
476 if (subs == INT_MAX)
477 {
478 rc = PCRE2_ERROR_TOOMANYREPLACE;
479 goto EXIT;
480 }
481 subs++;
482
483 /* Copy the text leading up to the match (unless not required), and remember
484 where the insert begins and how many ovector pairs are set. */
485
486 if (rc == 0) rc = ovector_count;
487 fraglength = ovector[0] - start_offset;
488 if (!replacement_only) CHECKMEMCPY(subject + start_offset, fraglength);
489 scb.output_offsets[0] = buff_offset;
490 scb.oveccount = rc;
491
492 /* Process the replacement string. If the entire replacement is literal, just
493 copy it with length check. */
494
495 ptr = replacement;
496 if ((suboptions & PCRE2_SUBSTITUTE_LITERAL) != 0)
497 {
498 CHECKMEMCPY(ptr, rlength);
499 }
500
501 /* Within a non-literal replacement, which must be scanned character by
502 character, local literal mode can be set by \Q, but only in extended mode
503 when backslashes are being interpreted. In extended mode we must handle
504 nested substrings that are to be reprocessed. */
505
506 else for (;;)
507 {
508 uint32_t ch;
509 unsigned int chlen;
510
511 /* If at the end of a nested substring, pop the stack. */
512
513 if (ptr >= repend)
514 {
515 if (ptrstackptr == 0) break; /* End of replacement string */
516 repend = ptrstack[--ptrstackptr];
517 ptr = ptrstack[--ptrstackptr];
518 continue;
519 }
520
521 /* Handle the next character */
522
523 if (escaped_literal)
524 {
525 if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E)
526 {
527 escaped_literal = FALSE;
528 ptr += 2;
529 continue;
530 }
531 goto LOADLITERAL;
532 }
533
534 /* Not in literal mode. */
535
536 if (*ptr == CHAR_DOLLAR_SIGN)
537 {
538 int group, n;
539 uint32_t special = 0;
540 BOOL inparens;
541 BOOL star;
542 PCRE2_SIZE sublength;
543 PCRE2_SPTR text1_start = NULL;
544 PCRE2_SPTR text1_end = NULL;
545 PCRE2_SPTR text2_start = NULL;
546 PCRE2_SPTR text2_end = NULL;
547 PCRE2_UCHAR next;
548 PCRE2_UCHAR name[33];
549
550 if (++ptr >= repend) goto BAD;
551 if ((next = *ptr) == CHAR_DOLLAR_SIGN) goto LOADLITERAL;
552
553 group = -1;
554 n = 0;
555 inparens = FALSE;
556 star = FALSE;
557
558 if (next == CHAR_LEFT_CURLY_BRACKET)
559 {
560 if (++ptr >= repend) goto BAD;
561 next = *ptr;
562 inparens = TRUE;
563 }
564
565 if (next == CHAR_ASTERISK)
566 {
567 if (++ptr >= repend) goto BAD;
568 next = *ptr;
569 star = TRUE;
570 }
571
572 if (!star && next >= CHAR_0 && next <= CHAR_9)
573 {
574 group = next - CHAR_0;
575 while (++ptr < repend)
576 {
577 next = *ptr;
578 if (next < CHAR_0 || next > CHAR_9) break;
579 group = group * 10 + next - CHAR_0;
580
581 /* A check for a number greater than the hightest captured group
582 is sufficient here; no need for a separate overflow check. If unknown
583 groups are to be treated as unset, just skip over any remaining
584 digits and carry on. */
585
586 if (group > code->top_bracket)
587 {
588 if ((suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
589 {
590 while (++ptr < repend && *ptr >= CHAR_0 && *ptr <= CHAR_9);
591 break;
592 }
593 else
594 {
595 rc = PCRE2_ERROR_NOSUBSTRING;
596 goto PTREXIT;
597 }
598 }
599 }
600 }
601 else
602 {
603 const uint8_t *ctypes = code->tables + ctypes_offset;
604 while (MAX_255(next) && (ctypes[next] & ctype_word) != 0)
605 {
606 name[n++] = next;
607 if (n > 32) goto BAD;
608 if (++ptr >= repend) break;
609 next = *ptr;
610 }
611 if (n == 0) goto BAD;
612 name[n] = 0;
613 }
614
615 /* In extended mode we recognize ${name:+set text:unset text} and
616 ${name:-default text}. */
617
618 if (inparens)
619 {
620 if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
621 !star && ptr < repend - 2 && next == CHAR_COLON)
622 {
623 special = *(++ptr);
624 if (special != CHAR_PLUS && special != CHAR_MINUS)
625 {
626 rc = PCRE2_ERROR_BADSUBSTITUTION;
627 goto PTREXIT;
628 }
629
630 text1_start = ++ptr;
631 rc = find_text_end(code, &ptr, repend, special == CHAR_MINUS);
632 if (rc != 0) goto PTREXIT;
633 text1_end = ptr;
634
635 if (special == CHAR_PLUS && *ptr == CHAR_COLON)
636 {
637 text2_start = ++ptr;
638 rc = find_text_end(code, &ptr, repend, TRUE);
639 if (rc != 0) goto PTREXIT;
640 text2_end = ptr;
641 }
642 }
643
644 else
645 {
646 if (ptr >= repend || *ptr != CHAR_RIGHT_CURLY_BRACKET)
647 {
648 rc = PCRE2_ERROR_REPMISSINGBRACE;
649 goto PTREXIT;
650 }
651 }
652
653 ptr++;
654 }
655
656 /* Have found a syntactically correct group number or name, or *name.
657 Only *MARK is currently recognized. */
658
659 if (star)
660 {
661 if (PRIV(strcmp_c8)(name, STRING_MARK) == 0)
662 {
663 PCRE2_SPTR mark = pcre2_get_mark(match_data);
664 if (mark != NULL)
665 {
666 PCRE2_SPTR mark_start = mark;
667 while (*mark != 0) mark++;
668 fraglength = mark - mark_start;
669 CHECKMEMCPY(mark_start, fraglength);
670 }
671 }
672 else goto BAD;
673 }
674
675 /* Substitute the contents of a group. We don't use substring_copy
676 functions any more, in order to support case forcing. */
677
678 else
679 {
680 PCRE2_SPTR subptr, subptrend;
681
682 /* Find a number for a named group. In case there are duplicate names,
683 search for the first one that is set. If the name is not found when
684 PCRE2_SUBSTITUTE_UNKNOWN_EMPTY is set, set the group number to a
685 non-existent group. */
686
687 if (group < 0)
688 {
689 PCRE2_SPTR first, last, entry;
690 rc = pcre2_substring_nametable_scan(code, name, &first, &last);
691 if (rc == PCRE2_ERROR_NOSUBSTRING &&
692 (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
693 {
694 group = code->top_bracket + 1;
695 }
696 else
697 {
698 if (rc < 0) goto PTREXIT;
699 for (entry = first; entry <= last; entry += rc)
700 {
701 uint32_t ng = GET2(entry, 0);
702 if (ng < ovector_count)
703 {
704 if (group < 0) group = ng; /* First in ovector */
705 if (ovector[ng*2] != PCRE2_UNSET)
706 {
707 group = ng; /* First that is set */
708 break;
709 }
710 }
711 }
712
713 /* If group is still negative, it means we did not find a group
714 that is in the ovector. Just set the first group. */
715
716 if (group < 0) group = GET2(first, 0);
717 }
718 }
719
720 /* We now have a group that is identified by number. Find the length of
721 the captured string. If a group in a non-special substitution is unset
722 when PCRE2_SUBSTITUTE_UNSET_EMPTY is set, substitute nothing. */
723
724 rc = pcre2_substring_length_bynumber(match_data, group, &sublength);
725 if (rc < 0)
726 {
727 if (rc == PCRE2_ERROR_NOSUBSTRING &&
728 (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
729 {
730 rc = PCRE2_ERROR_UNSET;
731 }
732 if (rc != PCRE2_ERROR_UNSET) goto PTREXIT; /* Non-unset errors */
733 if (special == 0) /* Plain substitution */
734 {
735 if ((suboptions & PCRE2_SUBSTITUTE_UNSET_EMPTY) != 0) continue;
736 goto PTREXIT; /* Else error */
737 }
738 }
739
740 /* If special is '+' we have a 'set' and possibly an 'unset' text,
741 both of which are reprocessed when used. If special is '-' we have a
742 default text for when the group is unset; it must be reprocessed. */
743
744 if (special != 0)
745 {
746 if (special == CHAR_MINUS)
747 {
748 if (rc == 0) goto LITERAL_SUBSTITUTE;
749 text2_start = text1_start;
750 text2_end = text1_end;
751 }
752
753 if (ptrstackptr >= PTR_STACK_SIZE) goto BAD;
754 ptrstack[ptrstackptr++] = ptr;
755 ptrstack[ptrstackptr++] = repend;
756
757 if (rc == 0)
758 {
759 ptr = text1_start;
760 repend = text1_end;
761 }
762 else
763 {
764 ptr = text2_start;
765 repend = text2_end;
766 }
767 continue;
768 }
769
770 /* Otherwise we have a literal substitution of a group's contents. */
771
772 LITERAL_SUBSTITUTE:
773 subptr = subject + ovector[group*2];
774 subptrend = subject + ovector[group*2 + 1];
775
776 /* Substitute a literal string, possibly forcing alphabetic case. */
777
778 while (subptr < subptrend)
779 {
780 GETCHARINCTEST(ch, subptr);
781 if (forcecase != 0)
782 {
783 #ifdef SUPPORT_UNICODE
784 if (utf || ucp)
785 {
786 uint32_t type = UCD_CHARTYPE(ch);
787 if (PRIV(ucp_gentype)[type] == ucp_L &&
788 type != ((forcecase > 0)? ucp_Lu : ucp_Ll))
789 ch = UCD_OTHERCASE(ch);
790 }
791 else
792 #endif
793 {
794 if (((code->tables + cbits_offset +
795 ((forcecase > 0)? cbit_upper:cbit_lower)
796 )[ch/8] & (1u << (ch%8))) == 0)
797 ch = (code->tables + fcc_offset)[ch];
798 }
799 forcecase = forcecasereset;
800 }
801
802 #ifdef SUPPORT_UNICODE
803 if (utf) chlen = PRIV(ord2utf)(ch, temp); else
804 #endif
805 {
806 temp[0] = ch;
807 chlen = 1;
808 }
809 CHECKMEMCPY(temp, chlen);
810 }
811 }
812 }
813
814 /* Handle an escape sequence in extended mode. We can use check_escape()
815 to process \Q, \E, \c, \o, \x and \ followed by non-alphanumerics, but
816 the case-forcing escapes are not supported in pcre2_compile() so must be
817 recognized here. */
818
819 else if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
820 *ptr == CHAR_BACKSLASH)
821 {
822 int errorcode;
823
824 if (ptr < repend - 1) switch (ptr[1])
825 {
826 case CHAR_L:
827 forcecase = forcecasereset = -1;
828 ptr += 2;
829 continue;
830
831 case CHAR_l:
832 forcecase = -1;
833 forcecasereset = 0;
834 ptr += 2;
835 continue;
836
837 case CHAR_U:
838 forcecase = forcecasereset = 1;
839 ptr += 2;
840 continue;
841
842 case CHAR_u:
843 forcecase = 1;
844 forcecasereset = 0;
845 ptr += 2;
846 continue;
847
848 default:
849 break;
850 }
851
852 ptr++; /* Point after \ */
853 rc = PRIV(check_escape)(&ptr, repend, &ch, &errorcode,
854 code->overall_options, code->extra_options, FALSE, NULL);
855 if (errorcode != 0) goto BADESCAPE;
856
857 switch(rc)
858 {
859 case ESC_E:
860 forcecase = forcecasereset = 0;
861 continue;
862
863 case ESC_Q:
864 escaped_literal = TRUE;
865 continue;
866
867 case 0: /* Data character */
868 goto LITERAL;
869
870 default:
871 goto BADESCAPE;
872 }
873 }
874
875 /* Handle a literal code unit */
876
877 else
878 {
879 LOADLITERAL:
880 GETCHARINCTEST(ch, ptr); /* Get character value, increment pointer */
881
882 LITERAL:
883 if (forcecase != 0)
884 {
885 #ifdef SUPPORT_UNICODE
886 if (utf || ucp)
887 {
888 uint32_t type = UCD_CHARTYPE(ch);
889 if (PRIV(ucp_gentype)[type] == ucp_L &&
890 type != ((forcecase > 0)? ucp_Lu : ucp_Ll))
891 ch = UCD_OTHERCASE(ch);
892 }
893 else
894 #endif
895 {
896 if (((code->tables + cbits_offset +
897 ((forcecase > 0)? cbit_upper:cbit_lower)
898 )[ch/8] & (1u << (ch%8))) == 0)
899 ch = (code->tables + fcc_offset)[ch];
900 }
901 forcecase = forcecasereset;
902 }
903
904 #ifdef SUPPORT_UNICODE
905 if (utf) chlen = PRIV(ord2utf)(ch, temp); else
906 #endif
907 {
908 temp[0] = ch;
909 chlen = 1;
910 }
911 CHECKMEMCPY(temp, chlen);
912 } /* End handling a literal code unit */
913 } /* End of loop for scanning the replacement. */
914
915 /* The replacement has been copied to the output, or its size has been
916 remembered. Do the callout if there is one and we have done an actual
917 replacement. */
918
919 if (!overflowed && mcontext != NULL && mcontext->substitute_callout != NULL)
920 {
921 scb.subscount = subs;
922 scb.output_offsets[1] = buff_offset;
923 rc = mcontext->substitute_callout(&scb, mcontext->substitute_callout_data);
924
925 /* A non-zero return means cancel this substitution. Instead, copy the
926 matched string fragment. */
927
928 if (rc != 0)
929 {
930 PCRE2_SIZE newlength = scb.output_offsets[1] - scb.output_offsets[0];
931 PCRE2_SIZE oldlength = ovector[1] - ovector[0];
932
933 buff_offset -= newlength;
934 lengthleft += newlength;
935 if (!replacement_only) CHECKMEMCPY(subject + ovector[0], oldlength);
936
937 /* A negative return means do not do any more. */
938
939 if (rc < 0) suboptions &= (~PCRE2_SUBSTITUTE_GLOBAL);
940 }
941 }
942
943 /* Save the details of this match. See above for how this data is used. If we
944 matched an empty string, do the magic for global matches. Update the start
945 offset to point to the rest of the subject string. If we re-used an existing
946 match for the first match, switch to the internal match data block. */
947
948 ovecsave[0] = ovector[0];
949 ovecsave[1] = ovector[1];
950 ovecsave[2] = start_offset;
951
952 goptions = (ovector[0] != ovector[1] || ovector[0] > start_offset)? 0 :
953 PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART;
954 start_offset = ovector[1];
955 } while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0); /* Repeat "do" loop */
956
957 /* Copy the rest of the subject unless not required, and terminate the output
958 with a binary zero. */
959
960 if (!replacement_only)
961 {
962 fraglength = length - start_offset;
963 CHECKMEMCPY(subject + start_offset, fraglength);
964 }
965
966 temp[0] = 0;
967 CHECKMEMCPY(temp, 1);
968
969 /* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set,
970 and matching has carried on after a full buffer, in order to compute the length
971 needed. Otherwise, an overflow generates an immediate error return. */
972
973 if (overflowed)
974 {
975 rc = PCRE2_ERROR_NOMEMORY;
976 *blength = buff_length + extra_needed;
977 }
978
979 /* After a successful execution, return the number of substitutions and set the
980 length of buffer used, excluding the trailing zero. */
981
982 else
983 {
984 rc = subs;
985 *blength = buff_offset - 1;
986 }
987
988 EXIT:
989 if (internal_match_data != NULL) pcre2_match_data_free(internal_match_data);
990 else match_data->rc = rc;
991 return rc;
992
993 NOROOM:
994 rc = PCRE2_ERROR_NOMEMORY;
995 goto EXIT;
996
997 BAD:
998 rc = PCRE2_ERROR_BADREPLACEMENT;
999 goto PTREXIT;
1000
1001 BADESCAPE:
1002 rc = PCRE2_ERROR_BADREPESCAPE;
1003
1004 PTREXIT:
1005 *blength = (PCRE2_SIZE)(ptr - replacement);
1006 goto EXIT;
1007 }
1008
1009 /* End of pcre2_substitute.c */
1010