1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016-2020 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45
46 #include "pcre2_internal.h"
47
/* Number of slots in the stack of saved replacement-string pointers that
pcre2_substitute() uses while reprocessing nested ${name:+...} / ${name:-...}
texts. Each nested text saves two pointers (the current position and the
current end), so this allows PTR_STACK_SIZE/2 levels of nesting. */

#define PTR_STACK_SIZE 20

/* All the option bits that are interpreted by pcre2_substitute() itself. They
are saved separately and removed from the options word before it is passed on
to pcre2_match(). */

#define SUBSTITUTE_OPTIONS \
  (PCRE2_SUBSTITUTE_EXTENDED|PCRE2_SUBSTITUTE_GLOBAL| \
   PCRE2_SUBSTITUTE_LITERAL|PCRE2_SUBSTITUTE_MATCHED| \
   PCRE2_SUBSTITUTE_OVERFLOW_LENGTH|PCRE2_SUBSTITUTE_REPLACEMENT_ONLY| \
   PCRE2_SUBSTITUTE_UNKNOWN_UNSET|PCRE2_SUBSTITUTE_UNSET_EMPTY)
55
56
57
58 /*************************************************
59 * Find end of substitute text *
60 *************************************************/
61
62 /* In extended mode, we recognize ${name:+set text:unset text} and similar
63 constructions. This requires the identification of unescaped : and }
64 characters. This function scans for such. It must deal with nested ${
65 constructions. The pointer to the text is updated, either to the required end
66 character, or to where an error was detected.
67
68 Arguments:
69 code points to the compiled expression (for options)
70 ptrptr points to the pointer to the start of the text (updated)
71 ptrend end of the whole string
72 last TRUE if the last expected string (only } recognized)
73
74 Returns: 0 on success
75 negative error code on failure
76 */
77
78 static int
find_text_end(const pcre2_code * code,PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,BOOL last)79 find_text_end(const pcre2_code *code, PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend,
80 BOOL last)
81 {
82 int rc = 0;
83 uint32_t nestlevel = 0;
84 BOOL literal = FALSE;
85 PCRE2_SPTR ptr = *ptrptr;
86
87 for (; ptr < ptrend; ptr++)
88 {
89 if (literal)
90 {
91 if (ptr[0] == CHAR_BACKSLASH && ptr < ptrend - 1 && ptr[1] == CHAR_E)
92 {
93 literal = FALSE;
94 ptr += 1;
95 }
96 }
97
98 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
99 {
100 if (nestlevel == 0) goto EXIT;
101 nestlevel--;
102 }
103
104 else if (*ptr == CHAR_COLON && !last && nestlevel == 0) goto EXIT;
105
106 else if (*ptr == CHAR_DOLLAR_SIGN)
107 {
108 if (ptr < ptrend - 1 && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
109 {
110 nestlevel++;
111 ptr += 1;
112 }
113 }
114
115 else if (*ptr == CHAR_BACKSLASH)
116 {
117 int erc;
118 int errorcode;
119 uint32_t ch;
120
121 if (ptr < ptrend - 1) switch (ptr[1])
122 {
123 case CHAR_L:
124 case CHAR_l:
125 case CHAR_U:
126 case CHAR_u:
127 ptr += 1;
128 continue;
129 }
130
131 ptr += 1; /* Must point after \ */
132 erc = PRIV(check_escape)(&ptr, ptrend, &ch, &errorcode,
133 code->overall_options, code->extra_options, FALSE, NULL);
134 ptr -= 1; /* Back to last code unit of escape */
135 if (errorcode != 0)
136 {
137 rc = errorcode;
138 goto EXIT;
139 }
140
141 switch(erc)
142 {
143 case 0: /* Data character */
144 case ESC_E: /* Isolated \E is ignored */
145 break;
146
147 case ESC_Q:
148 literal = TRUE;
149 break;
150
151 default:
152 rc = PCRE2_ERROR_BADREPESCAPE;
153 goto EXIT;
154 }
155 }
156 }
157
158 rc = PCRE2_ERROR_REPMISSINGBRACE; /* Terminator not found */
159
160 EXIT:
161 *ptrptr = ptr;
162 return rc;
163 }
164
165
166
167 /*************************************************
168 * Match and substitute *
169 *************************************************/
170
171 /* This function applies a compiled re to a subject string and creates a new
172 string with substitutions. The first 7 arguments are the same as for
173 pcre2_match(). Either string length may be PCRE2_ZERO_TERMINATED.
174
175 Arguments:
176 code points to the compiled expression
177 subject points to the subject string
178 length length of subject string (may contain binary zeros)
179 start_offset where to start in the subject string
180 options option bits
181 match_data points to a match_data block, or is NULL
  mcontext        points to a PCRE2 match context, or is NULL
183 replacement points to the replacement string
184 rlength length of replacement string
185 buffer where to put the substituted string
186 blength points to length of buffer; updated to length of string
187
188 Returns: >= 0 number of substitutions made
189 < 0 an error code
190 PCRE2_ERROR_BADREPLACEMENT means invalid use of $
191 */
192
193 /* This macro checks for space in the buffer before copying into it. On
194 overflow, either give an error immediately, or keep on, accumulating the
195 length. */
196
/* CHECKMEMCPY(from, length) appends "length" code units starting at "from" to
the output buffer. It is not self-contained: it uses the variables buffer,
buff_offset, lengthleft, overflowed, extra_needed and suboptions, and the
label NOROOM, all of which must exist in the function that invokes it.

If there is not enough room and PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is not set,
control jumps to NOROOM. If that option is set, nothing more is copied from
then on, and the number of additional code units that would have been needed is
accumulated in extra_needed.

The body is wrapped in do ... while (0) so that an invocation followed by a
semicolon is exactly one statement (and is therefore safe in an unbraced
if/else), and the arguments are parenthesized so that any expression may be
passed. Note that "length" is evaluated more than once, so it must not have
side effects. */

#define CHECKMEMCPY(from,length) \
  do \
    { \
    if (!overflowed && lengthleft < (length)) \
      { \
      if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \
      overflowed = TRUE; \
      extra_needed = (length) - lengthleft; \
      } \
    else if (overflowed) \
      { \
      extra_needed += (length); \
      } \
    else \
      { \
      memcpy(buffer + buff_offset, (from), CU2BYTES(length)); \
      buff_offset += (length); \
      lengthleft -= (length); \
      } \
    } \
  while (0)
216
217 /* Here's the function */
218
219 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_substitute(const pcre2_code * code,PCRE2_SPTR subject,PCRE2_SIZE length,PCRE2_SIZE start_offset,uint32_t options,pcre2_match_data * match_data,pcre2_match_context * mcontext,PCRE2_SPTR replacement,PCRE2_SIZE rlength,PCRE2_UCHAR * buffer,PCRE2_SIZE * blength)220 pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
221 PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
222 pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength,
223 PCRE2_UCHAR *buffer, PCRE2_SIZE *blength)
224 {
225 int rc;
226 int subs;
227 int forcecase = 0;
228 int forcecasereset = 0;
229 uint32_t ovector_count;
230 uint32_t goptions = 0;
231 uint32_t suboptions;
232 pcre2_match_data *internal_match_data = NULL;
233 BOOL escaped_literal = FALSE;
234 BOOL overflowed = FALSE;
235 BOOL use_existing_match;
236 BOOL replacement_only;
237 #ifdef SUPPORT_UNICODE
238 BOOL utf = (code->overall_options & PCRE2_UTF) != 0;
239 BOOL ucp = (code->overall_options & PCRE2_UCP) != 0;
240 #endif
241 PCRE2_UCHAR temp[6];
242 PCRE2_SPTR ptr;
243 PCRE2_SPTR repend;
244 PCRE2_SIZE extra_needed = 0;
245 PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength;
246 PCRE2_SIZE *ovector;
247 PCRE2_SIZE ovecsave[3];
248 pcre2_substitute_callout_block scb;
249
250 /* General initialization */
251
252 buff_offset = 0;
253 lengthleft = buff_length = *blength;
254 *blength = PCRE2_UNSET;
255 ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET;
256
257 /* Partial matching is not valid. This must come after setting *blength to
258 PCRE2_UNSET, so as not to imply an offset in the replacement. */
259
260 if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0)
261 return PCRE2_ERROR_BADOPTION;
262
263 /* Check for using a match that has already happened. Note that the subject
264 pointer in the match data may be NULL after a no-match. */
265
266 use_existing_match = ((options & PCRE2_SUBSTITUTE_MATCHED) != 0);
267 replacement_only = ((options & PCRE2_SUBSTITUTE_REPLACEMENT_ONLY) != 0);
268
269 /* If starting from an existing match, there must be an externally provided
270 match data block. We create an internal match_data block in two cases: (a) an
271 external one is not supplied (and we are not starting from an existing match);
272 (b) an existing match is to be used for the first substitution. In the latter
273 case, we copy the existing match into the internal block. This ensures that no
274 changes are made to the existing match data block. */
275
276 if (match_data == NULL)
277 {
278 pcre2_general_context *gcontext;
279 if (use_existing_match) return PCRE2_ERROR_NULL;
280 gcontext = (mcontext == NULL)?
281 (pcre2_general_context *)code :
282 (pcre2_general_context *)mcontext;
283 match_data = internal_match_data =
284 pcre2_match_data_create_from_pattern(code, gcontext);
285 if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
286 }
287
288 else if (use_existing_match)
289 {
290 pcre2_general_context *gcontext = (mcontext == NULL)?
291 (pcre2_general_context *)code :
292 (pcre2_general_context *)mcontext;
293 int pairs = (code->top_bracket + 1 < match_data->oveccount)?
294 code->top_bracket + 1 : match_data->oveccount;
295 internal_match_data = pcre2_match_data_create(match_data->oveccount,
296 gcontext);
297 if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
298 memcpy(internal_match_data, match_data, offsetof(pcre2_match_data, ovector)
299 + 2*pairs*sizeof(PCRE2_SIZE));
300 match_data = internal_match_data;
301 }
302
303 /* Remember ovector details */
304
305 ovector = pcre2_get_ovector_pointer(match_data);
306 ovector_count = pcre2_get_ovector_count(match_data);
307
308 /* Fixed things in the callout block */
309
310 scb.version = 0;
311 scb.input = subject;
312 scb.output = (PCRE2_SPTR)buffer;
313 scb.ovector = ovector;
314
315 /* Find lengths of zero-terminated strings and the end of the replacement. */
316
317 if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject);
318 if (rlength == PCRE2_ZERO_TERMINATED) rlength = PRIV(strlen)(replacement);
319 repend = replacement + rlength;
320
321 /* Check UTF replacement string if necessary. */
322
323 #ifdef SUPPORT_UNICODE
324 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
325 {
326 rc = PRIV(valid_utf)(replacement, rlength, &(match_data->startchar));
327 if (rc != 0)
328 {
329 match_data->leftchar = 0;
330 goto EXIT;
331 }
332 }
333 #endif /* SUPPORT_UNICODE */
334
335 /* Save the substitute options and remove them from the match options. */
336
337 suboptions = options & SUBSTITUTE_OPTIONS;
338 options &= ~SUBSTITUTE_OPTIONS;
339
340 /* Error if the start match offset is greater than the length of the subject. */
341
342 if (start_offset > length)
343 {
344 match_data->leftchar = 0;
345 rc = PCRE2_ERROR_BADOFFSET;
346 goto EXIT;
347 }
348
349 /* Copy up to the start offset, unless only the replacement is required. */
350
351 if (!replacement_only) CHECKMEMCPY(subject, start_offset);
352
353 /* Loop for global substituting. If PCRE2_SUBSTITUTE_MATCHED is set, the first
354 match is taken from the match_data that was passed in. */
355
356 subs = 0;
357 do
358 {
359 PCRE2_SPTR ptrstack[PTR_STACK_SIZE];
360 uint32_t ptrstackptr = 0;
361
362 if (use_existing_match)
363 {
364 rc = match_data->rc;
365 use_existing_match = FALSE;
366 }
367 else rc = pcre2_match(code, subject, length, start_offset, options|goptions,
368 match_data, mcontext);
369
370 #ifdef SUPPORT_UNICODE
371 if (utf) options |= PCRE2_NO_UTF_CHECK; /* Only need to check once */
372 #endif
373
374 /* Any error other than no match returns the error code. No match when not
375 doing the special after-empty-match global rematch, or when at the end of the
376 subject, breaks the global loop. Otherwise, advance the starting point by one
377 character, copying it to the output, and try again. */
378
379 if (rc < 0)
380 {
381 PCRE2_SIZE save_start;
382
383 if (rc != PCRE2_ERROR_NOMATCH) goto EXIT;
384 if (goptions == 0 || start_offset >= length) break;
385
386 /* Advance by one code point. Then, if CRLF is a valid newline sequence and
387 we have advanced into the middle of it, advance one more code point. In
388 other words, do not start in the middle of CRLF, even if CR and LF on their
389 own are valid newlines. */
390
391 save_start = start_offset++;
392 if (subject[start_offset-1] == CHAR_CR &&
393 code->newline_convention != PCRE2_NEWLINE_CR &&
394 code->newline_convention != PCRE2_NEWLINE_LF &&
395 start_offset < length &&
396 subject[start_offset] == CHAR_LF)
397 start_offset++;
398
399 /* Otherwise, in UTF mode, advance past any secondary code points. */
400
401 else if ((code->overall_options & PCRE2_UTF) != 0)
402 {
403 #if PCRE2_CODE_UNIT_WIDTH == 8
404 while (start_offset < length && (subject[start_offset] & 0xc0) == 0x80)
405 start_offset++;
406 #elif PCRE2_CODE_UNIT_WIDTH == 16
407 while (start_offset < length &&
408 (subject[start_offset] & 0xfc00) == 0xdc00)
409 start_offset++;
410 #endif
411 }
412
413 /* Copy what we have advanced past (unless not required), reset the special
414 global options, and continue to the next match. */
415
416 fraglength = start_offset - save_start;
417 if (!replacement_only) CHECKMEMCPY(subject + save_start, fraglength);
418 goptions = 0;
419 continue;
420 }
421
422 /* Handle a successful match. Matches that use \K to end before they start
423 or start before the current point in the subject are not supported. */
424
425 if (ovector[1] < ovector[0] || ovector[0] < start_offset)
426 {
427 rc = PCRE2_ERROR_BADSUBSPATTERN;
428 goto EXIT;
429 }
430
431 /* Check for the same match as previous. This is legitimate after matching an
432 empty string that starts after the initial match offset. We have tried again
433 at the match point in case the pattern is one like /(?<=\G.)/ which can never
434 match at its starting point, so running the match achieves the bumpalong. If
435 we do get the same (null) match at the original match point, it isn't such a
436 pattern, so we now do the empty string magic. In all other cases, a repeat
437 match should never occur. */
438
439 if (ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1])
440 {
441 if (ovector[0] == ovector[1] && ovecsave[2] != start_offset)
442 {
443 goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
444 ovecsave[2] = start_offset;
445 continue; /* Back to the top of the loop */
446 }
447 rc = PCRE2_ERROR_INTERNAL_DUPMATCH;
448 goto EXIT;
449 }
450
451 /* Count substitutions with a paranoid check for integer overflow; surely no
452 real call to this function would ever hit this! */
453
454 if (subs == INT_MAX)
455 {
456 rc = PCRE2_ERROR_TOOMANYREPLACE;
457 goto EXIT;
458 }
459 subs++;
460
461 /* Copy the text leading up to the match (unless not required), and remember
462 where the insert begins and how many ovector pairs are set. */
463
464 if (rc == 0) rc = ovector_count;
465 fraglength = ovector[0] - start_offset;
466 if (!replacement_only) CHECKMEMCPY(subject + start_offset, fraglength);
467 scb.output_offsets[0] = buff_offset;
468 scb.oveccount = rc;
469
470 /* Process the replacement string. If the entire replacement is literal, just
471 copy it with length check. */
472
473 ptr = replacement;
474 if ((suboptions & PCRE2_SUBSTITUTE_LITERAL) != 0)
475 {
476 CHECKMEMCPY(ptr, rlength);
477 }
478
479 /* Within a non-literal replacement, which must be scanned character by
480 character, local literal mode can be set by \Q, but only in extended mode
481 when backslashes are being interpreted. In extended mode we must handle
482 nested substrings that are to be reprocessed. */
483
484 else for (;;)
485 {
486 uint32_t ch;
487 unsigned int chlen;
488
489 /* If at the end of a nested substring, pop the stack. */
490
491 if (ptr >= repend)
492 {
493 if (ptrstackptr == 0) break; /* End of replacement string */
494 repend = ptrstack[--ptrstackptr];
495 ptr = ptrstack[--ptrstackptr];
496 continue;
497 }
498
499 /* Handle the next character */
500
501 if (escaped_literal)
502 {
503 if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E)
504 {
505 escaped_literal = FALSE;
506 ptr += 2;
507 continue;
508 }
509 goto LOADLITERAL;
510 }
511
512 /* Not in literal mode. */
513
514 if (*ptr == CHAR_DOLLAR_SIGN)
515 {
516 int group, n;
517 uint32_t special = 0;
518 BOOL inparens;
519 BOOL star;
520 PCRE2_SIZE sublength;
521 PCRE2_SPTR text1_start = NULL;
522 PCRE2_SPTR text1_end = NULL;
523 PCRE2_SPTR text2_start = NULL;
524 PCRE2_SPTR text2_end = NULL;
525 PCRE2_UCHAR next;
526 PCRE2_UCHAR name[33];
527
528 if (++ptr >= repend) goto BAD;
529 if ((next = *ptr) == CHAR_DOLLAR_SIGN) goto LOADLITERAL;
530
531 group = -1;
532 n = 0;
533 inparens = FALSE;
534 star = FALSE;
535
536 if (next == CHAR_LEFT_CURLY_BRACKET)
537 {
538 if (++ptr >= repend) goto BAD;
539 next = *ptr;
540 inparens = TRUE;
541 }
542
543 if (next == CHAR_ASTERISK)
544 {
545 if (++ptr >= repend) goto BAD;
546 next = *ptr;
547 star = TRUE;
548 }
549
550 if (!star && next >= CHAR_0 && next <= CHAR_9)
551 {
552 group = next - CHAR_0;
553 while (++ptr < repend)
554 {
555 next = *ptr;
556 if (next < CHAR_0 || next > CHAR_9) break;
557 group = group * 10 + next - CHAR_0;
558
559 /* A check for a number greater than the hightest captured group
560 is sufficient here; no need for a separate overflow check. If unknown
561 groups are to be treated as unset, just skip over any remaining
562 digits and carry on. */
563
564 if (group > code->top_bracket)
565 {
566 if ((suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
567 {
568 while (++ptr < repend && *ptr >= CHAR_0 && *ptr <= CHAR_9);
569 break;
570 }
571 else
572 {
573 rc = PCRE2_ERROR_NOSUBSTRING;
574 goto PTREXIT;
575 }
576 }
577 }
578 }
579 else
580 {
581 const uint8_t *ctypes = code->tables + ctypes_offset;
582 while (MAX_255(next) && (ctypes[next] & ctype_word) != 0)
583 {
584 name[n++] = next;
585 if (n > 32) goto BAD;
586 if (++ptr >= repend) break;
587 next = *ptr;
588 }
589 if (n == 0) goto BAD;
590 name[n] = 0;
591 }
592
593 /* In extended mode we recognize ${name:+set text:unset text} and
594 ${name:-default text}. */
595
596 if (inparens)
597 {
598 if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
599 !star && ptr < repend - 2 && next == CHAR_COLON)
600 {
601 special = *(++ptr);
602 if (special != CHAR_PLUS && special != CHAR_MINUS)
603 {
604 rc = PCRE2_ERROR_BADSUBSTITUTION;
605 goto PTREXIT;
606 }
607
608 text1_start = ++ptr;
609 rc = find_text_end(code, &ptr, repend, special == CHAR_MINUS);
610 if (rc != 0) goto PTREXIT;
611 text1_end = ptr;
612
613 if (special == CHAR_PLUS && *ptr == CHAR_COLON)
614 {
615 text2_start = ++ptr;
616 rc = find_text_end(code, &ptr, repend, TRUE);
617 if (rc != 0) goto PTREXIT;
618 text2_end = ptr;
619 }
620 }
621
622 else
623 {
624 if (ptr >= repend || *ptr != CHAR_RIGHT_CURLY_BRACKET)
625 {
626 rc = PCRE2_ERROR_REPMISSINGBRACE;
627 goto PTREXIT;
628 }
629 }
630
631 ptr++;
632 }
633
634 /* Have found a syntactically correct group number or name, or *name.
635 Only *MARK is currently recognized. */
636
637 if (star)
638 {
639 if (PRIV(strcmp_c8)(name, STRING_MARK) == 0)
640 {
641 PCRE2_SPTR mark = pcre2_get_mark(match_data);
642 if (mark != NULL)
643 {
644 PCRE2_SPTR mark_start = mark;
645 while (*mark != 0) mark++;
646 fraglength = mark - mark_start;
647 CHECKMEMCPY(mark_start, fraglength);
648 }
649 }
650 else goto BAD;
651 }
652
653 /* Substitute the contents of a group. We don't use substring_copy
654 functions any more, in order to support case forcing. */
655
656 else
657 {
658 PCRE2_SPTR subptr, subptrend;
659
660 /* Find a number for a named group. In case there are duplicate names,
661 search for the first one that is set. If the name is not found when
662 PCRE2_SUBSTITUTE_UNKNOWN_EMPTY is set, set the group number to a
663 non-existent group. */
664
665 if (group < 0)
666 {
667 PCRE2_SPTR first, last, entry;
668 rc = pcre2_substring_nametable_scan(code, name, &first, &last);
669 if (rc == PCRE2_ERROR_NOSUBSTRING &&
670 (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
671 {
672 group = code->top_bracket + 1;
673 }
674 else
675 {
676 if (rc < 0) goto PTREXIT;
677 for (entry = first; entry <= last; entry += rc)
678 {
679 uint32_t ng = GET2(entry, 0);
680 if (ng < ovector_count)
681 {
682 if (group < 0) group = ng; /* First in ovector */
683 if (ovector[ng*2] != PCRE2_UNSET)
684 {
685 group = ng; /* First that is set */
686 break;
687 }
688 }
689 }
690
691 /* If group is still negative, it means we did not find a group
692 that is in the ovector. Just set the first group. */
693
694 if (group < 0) group = GET2(first, 0);
695 }
696 }
697
698 /* We now have a group that is identified by number. Find the length of
699 the captured string. If a group in a non-special substitution is unset
700 when PCRE2_SUBSTITUTE_UNSET_EMPTY is set, substitute nothing. */
701
702 rc = pcre2_substring_length_bynumber(match_data, group, &sublength);
703 if (rc < 0)
704 {
705 if (rc == PCRE2_ERROR_NOSUBSTRING &&
706 (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
707 {
708 rc = PCRE2_ERROR_UNSET;
709 }
710 if (rc != PCRE2_ERROR_UNSET) goto PTREXIT; /* Non-unset errors */
711 if (special == 0) /* Plain substitution */
712 {
713 if ((suboptions & PCRE2_SUBSTITUTE_UNSET_EMPTY) != 0) continue;
714 goto PTREXIT; /* Else error */
715 }
716 }
717
718 /* If special is '+' we have a 'set' and possibly an 'unset' text,
719 both of which are reprocessed when used. If special is '-' we have a
720 default text for when the group is unset; it must be reprocessed. */
721
722 if (special != 0)
723 {
724 if (special == CHAR_MINUS)
725 {
726 if (rc == 0) goto LITERAL_SUBSTITUTE;
727 text2_start = text1_start;
728 text2_end = text1_end;
729 }
730
731 if (ptrstackptr >= PTR_STACK_SIZE) goto BAD;
732 ptrstack[ptrstackptr++] = ptr;
733 ptrstack[ptrstackptr++] = repend;
734
735 if (rc == 0)
736 {
737 ptr = text1_start;
738 repend = text1_end;
739 }
740 else
741 {
742 ptr = text2_start;
743 repend = text2_end;
744 }
745 continue;
746 }
747
748 /* Otherwise we have a literal substitution of a group's contents. */
749
750 LITERAL_SUBSTITUTE:
751 subptr = subject + ovector[group*2];
752 subptrend = subject + ovector[group*2 + 1];
753
754 /* Substitute a literal string, possibly forcing alphabetic case. */
755
756 while (subptr < subptrend)
757 {
758 GETCHARINCTEST(ch, subptr);
759 if (forcecase != 0)
760 {
761 #ifdef SUPPORT_UNICODE
762 if (utf || ucp)
763 {
764 uint32_t type = UCD_CHARTYPE(ch);
765 if (PRIV(ucp_gentype)[type] == ucp_L &&
766 type != ((forcecase > 0)? ucp_Lu : ucp_Ll))
767 ch = UCD_OTHERCASE(ch);
768 }
769 else
770 #endif
771 {
772 if (((code->tables + cbits_offset +
773 ((forcecase > 0)? cbit_upper:cbit_lower)
774 )[ch/8] & (1u << (ch%8))) == 0)
775 ch = (code->tables + fcc_offset)[ch];
776 }
777 forcecase = forcecasereset;
778 }
779
780 #ifdef SUPPORT_UNICODE
781 if (utf) chlen = PRIV(ord2utf)(ch, temp); else
782 #endif
783 {
784 temp[0] = ch;
785 chlen = 1;
786 }
787 CHECKMEMCPY(temp, chlen);
788 }
789 }
790 }
791
792 /* Handle an escape sequence in extended mode. We can use check_escape()
793 to process \Q, \E, \c, \o, \x and \ followed by non-alphanumerics, but
794 the case-forcing escapes are not supported in pcre2_compile() so must be
795 recognized here. */
796
797 else if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
798 *ptr == CHAR_BACKSLASH)
799 {
800 int errorcode;
801
802 if (ptr < repend - 1) switch (ptr[1])
803 {
804 case CHAR_L:
805 forcecase = forcecasereset = -1;
806 ptr += 2;
807 continue;
808
809 case CHAR_l:
810 forcecase = -1;
811 forcecasereset = 0;
812 ptr += 2;
813 continue;
814
815 case CHAR_U:
816 forcecase = forcecasereset = 1;
817 ptr += 2;
818 continue;
819
820 case CHAR_u:
821 forcecase = 1;
822 forcecasereset = 0;
823 ptr += 2;
824 continue;
825
826 default:
827 break;
828 }
829
830 ptr++; /* Point after \ */
831 rc = PRIV(check_escape)(&ptr, repend, &ch, &errorcode,
832 code->overall_options, code->extra_options, FALSE, NULL);
833 if (errorcode != 0) goto BADESCAPE;
834
835 switch(rc)
836 {
837 case ESC_E:
838 forcecase = forcecasereset = 0;
839 continue;
840
841 case ESC_Q:
842 escaped_literal = TRUE;
843 continue;
844
845 case 0: /* Data character */
846 goto LITERAL;
847
848 default:
849 goto BADESCAPE;
850 }
851 }
852
853 /* Handle a literal code unit */
854
855 else
856 {
857 LOADLITERAL:
858 GETCHARINCTEST(ch, ptr); /* Get character value, increment pointer */
859
860 LITERAL:
861 if (forcecase != 0)
862 {
863 #ifdef SUPPORT_UNICODE
864 if (utf || ucp)
865 {
866 uint32_t type = UCD_CHARTYPE(ch);
867 if (PRIV(ucp_gentype)[type] == ucp_L &&
868 type != ((forcecase > 0)? ucp_Lu : ucp_Ll))
869 ch = UCD_OTHERCASE(ch);
870 }
871 else
872 #endif
873 {
874 if (((code->tables + cbits_offset +
875 ((forcecase > 0)? cbit_upper:cbit_lower)
876 )[ch/8] & (1u << (ch%8))) == 0)
877 ch = (code->tables + fcc_offset)[ch];
878 }
879 forcecase = forcecasereset;
880 }
881
882 #ifdef SUPPORT_UNICODE
883 if (utf) chlen = PRIV(ord2utf)(ch, temp); else
884 #endif
885 {
886 temp[0] = ch;
887 chlen = 1;
888 }
889 CHECKMEMCPY(temp, chlen);
890 } /* End handling a literal code unit */
891 } /* End of loop for scanning the replacement. */
892
893 /* The replacement has been copied to the output, or its size has been
894 remembered. Do the callout if there is one and we have done an actual
895 replacement. */
896
897 if (!overflowed && mcontext != NULL && mcontext->substitute_callout != NULL)
898 {
899 scb.subscount = subs;
900 scb.output_offsets[1] = buff_offset;
901 rc = mcontext->substitute_callout(&scb, mcontext->substitute_callout_data);
902
903 /* A non-zero return means cancel this substitution. Instead, copy the
904 matched string fragment. */
905
906 if (rc != 0)
907 {
908 PCRE2_SIZE newlength = scb.output_offsets[1] - scb.output_offsets[0];
909 PCRE2_SIZE oldlength = ovector[1] - ovector[0];
910
911 buff_offset -= newlength;
912 lengthleft += newlength;
913 if (!replacement_only) CHECKMEMCPY(subject + ovector[0], oldlength);
914
915 /* A negative return means do not do any more. */
916
917 if (rc < 0) suboptions &= (~PCRE2_SUBSTITUTE_GLOBAL);
918 }
919 }
920
921 /* Save the details of this match. See above for how this data is used. If we
922 matched an empty string, do the magic for global matches. Update the start
923 offset to point to the rest of the subject string. If we re-used an existing
924 match for the first match, switch to the internal match data block. */
925
926 ovecsave[0] = ovector[0];
927 ovecsave[1] = ovector[1];
928 ovecsave[2] = start_offset;
929
930 goptions = (ovector[0] != ovector[1] || ovector[0] > start_offset)? 0 :
931 PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART;
932 start_offset = ovector[1];
933 } while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0); /* Repeat "do" loop */
934
935 /* Copy the rest of the subject unless not required, and terminate the output
936 with a binary zero. */
937
938 if (!replacement_only)
939 {
940 fraglength = length - start_offset;
941 CHECKMEMCPY(subject + start_offset, fraglength);
942 }
943
944 temp[0] = 0;
945 CHECKMEMCPY(temp, 1);
946
947 /* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set,
948 and matching has carried on after a full buffer, in order to compute the length
949 needed. Otherwise, an overflow generates an immediate error return. */
950
951 if (overflowed)
952 {
953 rc = PCRE2_ERROR_NOMEMORY;
954 *blength = buff_length + extra_needed;
955 }
956
957 /* After a successful execution, return the number of substitutions and set the
958 length of buffer used, excluding the trailing zero. */
959
960 else
961 {
962 rc = subs;
963 *blength = buff_offset - 1;
964 }
965
966 EXIT:
967 if (internal_match_data != NULL) pcre2_match_data_free(internal_match_data);
968 else match_data->rc = rc;
969 return rc;
970
971 NOROOM:
972 rc = PCRE2_ERROR_NOMEMORY;
973 goto EXIT;
974
975 BAD:
976 rc = PCRE2_ERROR_BADREPLACEMENT;
977 goto PTREXIT;
978
979 BADESCAPE:
980 rc = PCRE2_ERROR_BADREPESCAPE;
981
982 PTREXIT:
983 *blength = (PCRE2_SIZE)(ptr - replacement);
984 goto EXIT;
985 }
986
987 /* End of pcre2_substitute.c */
988