1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10           New API code Copyright (c) 2016-2022 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45 
46 #include "pcre2_internal.h"
47 
/* Number of slots in the stack of saved replacement-string pointers that
pcre2_substitute() keeps while it reprocesses nested ${name:+...} and
${name:-...} texts. Each level of nesting pushes two slots (the resume position
and the saved end pointer). */

#define PTR_STACK_SIZE 20

/* Every option bit that is specific to pcre2_substitute(). These bits are
split off from the caller's options before the remainder is passed on to
pcre2_match(). */

#define SUBSTITUTE_OPTIONS ( \
  PCRE2_SUBSTITUTE_EXTENDED         | \
  PCRE2_SUBSTITUTE_GLOBAL           | \
  PCRE2_SUBSTITUTE_LITERAL          | \
  PCRE2_SUBSTITUTE_MATCHED          | \
  PCRE2_SUBSTITUTE_OVERFLOW_LENGTH  | \
  PCRE2_SUBSTITUTE_REPLACEMENT_ONLY | \
  PCRE2_SUBSTITUTE_UNKNOWN_UNSET    | \
  PCRE2_SUBSTITUTE_UNSET_EMPTY)
55 
56 
57 
58 /*************************************************
59 *           Find end of substitute text          *
60 *************************************************/
61 
62 /* In extended mode, we recognize ${name:+set text:unset text} and similar
63 constructions. This requires the identification of unescaped : and }
64 characters. This function scans for such. It must deal with nested ${
65 constructions. The pointer to the text is updated, either to the required end
66 character, or to where an error was detected.
67 
68 Arguments:
69   code      points to the compiled expression (for options)
70   ptrptr    points to the pointer to the start of the text (updated)
71   ptrend    end of the whole string
72   last      TRUE if the last expected string (only } recognized)
73 
74 Returns:    0 on success
75             negative error code on failure
76 */
77 
78 static int
find_text_end(const pcre2_code * code,PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,BOOL last)79 find_text_end(const pcre2_code *code, PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend,
80   BOOL last)
81 {
82 int rc = 0;
83 uint32_t nestlevel = 0;
84 BOOL literal = FALSE;
85 PCRE2_SPTR ptr = *ptrptr;
86 
87 for (; ptr < ptrend; ptr++)
88   {
89   if (literal)
90     {
91     if (ptr[0] == CHAR_BACKSLASH && ptr < ptrend - 1 && ptr[1] == CHAR_E)
92       {
93       literal = FALSE;
94       ptr += 1;
95       }
96     }
97 
98   else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
99     {
100     if (nestlevel == 0) goto EXIT;
101     nestlevel--;
102     }
103 
104   else if (*ptr == CHAR_COLON && !last && nestlevel == 0) goto EXIT;
105 
106   else if (*ptr == CHAR_DOLLAR_SIGN)
107     {
108     if (ptr < ptrend - 1 && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
109       {
110       nestlevel++;
111       ptr += 1;
112       }
113     }
114 
115   else if (*ptr == CHAR_BACKSLASH)
116     {
117     int erc;
118     int errorcode;
119     uint32_t ch;
120 
121     if (ptr < ptrend - 1) switch (ptr[1])
122       {
123       case CHAR_L:
124       case CHAR_l:
125       case CHAR_U:
126       case CHAR_u:
127       ptr += 1;
128       continue;
129       }
130 
131     ptr += 1;  /* Must point after \ */
132     erc = PRIV(check_escape)(&ptr, ptrend, &ch, &errorcode,
133       code->overall_options, code->extra_options, FALSE, NULL);
134     ptr -= 1;  /* Back to last code unit of escape */
135     if (errorcode != 0)
136       {
137       rc = errorcode;
138       goto EXIT;
139       }
140 
141     switch(erc)
142       {
143       case 0:      /* Data character */
144       case ESC_E:  /* Isolated \E is ignored */
145       break;
146 
147       case ESC_Q:
148       literal = TRUE;
149       break;
150 
151       default:
152       rc = PCRE2_ERROR_BADREPESCAPE;
153       goto EXIT;
154       }
155     }
156   }
157 
158 rc = PCRE2_ERROR_REPMISSINGBRACE;   /* Terminator not found */
159 
160 EXIT:
161 *ptrptr = ptr;
162 return rc;
163 }
164 
165 
166 
167 /*************************************************
168 *              Match and substitute              *
169 *************************************************/
170 
171 /* This function applies a compiled re to a subject string and creates a new
172 string with substitutions. The first 7 arguments are the same as for
173 pcre2_match(). Either string length may be PCRE2_ZERO_TERMINATED.
174 
175 Arguments:
176   code            points to the compiled expression
177   subject         points to the subject string
178   length          length of subject string (may contain binary zeros)
179   start_offset    where to start in the subject string
180   options         option bits
181   match_data      points to a match_data block, or is NULL
  mcontext        points to a PCRE2 match context, or is NULL
183   replacement     points to the replacement string
184   rlength         length of replacement string
185   buffer          where to put the substituted string
186   blength         points to length of buffer; updated to length of string
187 
188 Returns:          >= 0 number of substitutions made
189                   < 0 an error code
190                   PCRE2_ERROR_BADREPLACEMENT means invalid use of $
191 */
192 
/* This macro checks for space in the buffer before copying into it. On
overflow, either give an error immediately (by jumping to the NOROOM label),
or, if PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set, keep on, accumulating in
extra_needed the number of code units for which there was no room. Once
overflow has happened nothing more is copied.

The macro relies on these names being visible where it is expanded: buffer,
buff_offset, lengthleft, extra_needed, overflowed, suboptions, and the label
NOROOM. The length argument is evaluated more than once, so it must not have
side effects. The expansion is a single statement that needs a terminating
semicolon, which makes it safe to use as the body of an "if" that has an
"else". */

#define CHECKMEMCPY(from,length) \
  do \
    { \
    if (!overflowed && lengthleft < (length)) \
      { \
      if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \
      overflowed = TRUE; \
      extra_needed = (length) - lengthleft; \
      } \
    else if (overflowed) \
      { \
      extra_needed += (length); \
      } \
    else \
      { \
      memcpy(buffer + buff_offset, (from), CU2BYTES(length)); \
      buff_offset += (length); \
      lengthleft -= (length); \
      } \
    } \
  while (0)
216 
217 /* Here's the function */
218 
219 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_substitute(const pcre2_code * code,PCRE2_SPTR subject,PCRE2_SIZE length,PCRE2_SIZE start_offset,uint32_t options,pcre2_match_data * match_data,pcre2_match_context * mcontext,PCRE2_SPTR replacement,PCRE2_SIZE rlength,PCRE2_UCHAR * buffer,PCRE2_SIZE * blength)220 pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
221   PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
222   pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength,
223   PCRE2_UCHAR *buffer, PCRE2_SIZE *blength)
224 {
225 int rc;
226 int subs;
227 int forcecase = 0;
228 int forcecasereset = 0;
229 uint32_t ovector_count;
230 uint32_t goptions = 0;
231 uint32_t suboptions;
232 pcre2_match_data *internal_match_data = NULL;
233 BOOL escaped_literal = FALSE;
234 BOOL overflowed = FALSE;
235 BOOL use_existing_match;
236 BOOL replacement_only;
237 #ifdef SUPPORT_UNICODE
238 BOOL utf = (code->overall_options & PCRE2_UTF) != 0;
239 BOOL ucp = (code->overall_options & PCRE2_UCP) != 0;
240 #endif
241 PCRE2_UCHAR temp[6];
242 PCRE2_SPTR ptr;
243 PCRE2_SPTR repend;
244 PCRE2_SIZE extra_needed = 0;
245 PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength;
246 PCRE2_SIZE *ovector;
247 PCRE2_SIZE ovecsave[3];
248 pcre2_substitute_callout_block scb;
249 
250 /* General initialization */
251 
252 buff_offset = 0;
253 lengthleft = buff_length = *blength;
254 *blength = PCRE2_UNSET;
255 ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET;
256 
257 /* Partial matching is not valid. This must come after setting *blength to
258 PCRE2_UNSET, so as not to imply an offset in the replacement. */
259 
260 if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0)
261   return PCRE2_ERROR_BADOPTION;
262 
263 /* Validate length and find the end of the replacement. A NULL replacement of
264 zero length is interpreted as an empty string. */
265 
266 if (replacement == NULL)
267   {
268   if (rlength != 0) return PCRE2_ERROR_NULL;
269   replacement = (PCRE2_SPTR)"";
270   }
271 
272 if (rlength == PCRE2_ZERO_TERMINATED) rlength = PRIV(strlen)(replacement);
273 repend = replacement + rlength;
274 
275 /* Check for using a match that has already happened. Note that the subject
276 pointer in the match data may be NULL after a no-match. */
277 
278 use_existing_match = ((options & PCRE2_SUBSTITUTE_MATCHED) != 0);
279 replacement_only = ((options & PCRE2_SUBSTITUTE_REPLACEMENT_ONLY) != 0);
280 
281 /* If starting from an existing match, there must be an externally provided
282 match data block. We create an internal match_data block in two cases: (a) an
283 external one is not supplied (and we are not starting from an existing match);
284 (b) an existing match is to be used for the first substitution. In the latter
285 case, we copy the existing match into the internal block, except for any cached
286 heap frame size and pointer. This ensures that no changes are made to the
287 external match data block. */
288 
289 if (match_data == NULL)
290   {
291   pcre2_general_context *gcontext;
292   if (use_existing_match) return PCRE2_ERROR_NULL;
293   gcontext = (mcontext == NULL)?
294     (pcre2_general_context *)code :
295     (pcre2_general_context *)mcontext;
296   match_data = internal_match_data =
297     pcre2_match_data_create_from_pattern(code, gcontext);
298   if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
299   }
300 
301 else if (use_existing_match)
302   {
303   pcre2_general_context *gcontext = (mcontext == NULL)?
304     (pcre2_general_context *)code :
305     (pcre2_general_context *)mcontext;
306   int pairs = (code->top_bracket + 1 < match_data->oveccount)?
307     code->top_bracket + 1 : match_data->oveccount;
308   internal_match_data = pcre2_match_data_create(match_data->oveccount,
309     gcontext);
310   if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
311   memcpy(internal_match_data, match_data, offsetof(pcre2_match_data, ovector)
312     + 2*pairs*sizeof(PCRE2_SIZE));
313   internal_match_data->heapframes = NULL;
314   internal_match_data->heapframes_size = 0;
315   match_data = internal_match_data;
316   }
317 
318 /* Remember ovector details */
319 
320 ovector = pcre2_get_ovector_pointer(match_data);
321 ovector_count = pcre2_get_ovector_count(match_data);
322 
323 /* Fixed things in the callout block */
324 
325 scb.version = 0;
326 scb.input = subject;
327 scb.output = (PCRE2_SPTR)buffer;
328 scb.ovector = ovector;
329 
330 /* A NULL subject of zero length is treated as an empty string. */
331 
332 if (subject == NULL)
333   {
334   if (length != 0) return PCRE2_ERROR_NULL;
335   subject = (PCRE2_SPTR)"";
336   }
337 
338 /* Find length of zero-terminated subject */
339 
340 if (length == PCRE2_ZERO_TERMINATED)
341   length = subject? PRIV(strlen)(subject) : 0;
342 
343 /* Check UTF replacement string if necessary. */
344 
345 #ifdef SUPPORT_UNICODE
346 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
347   {
348   rc = PRIV(valid_utf)(replacement, rlength, &(match_data->startchar));
349   if (rc != 0)
350     {
351     match_data->leftchar = 0;
352     goto EXIT;
353     }
354   }
355 #endif  /* SUPPORT_UNICODE */
356 
357 /* Save the substitute options and remove them from the match options. */
358 
359 suboptions = options & SUBSTITUTE_OPTIONS;
360 options &= ~SUBSTITUTE_OPTIONS;
361 
362 /* Error if the start match offset is greater than the length of the subject. */
363 
364 if (start_offset > length)
365   {
366   match_data->leftchar = 0;
367   rc = PCRE2_ERROR_BADOFFSET;
368   goto EXIT;
369   }
370 
371 /* Copy up to the start offset, unless only the replacement is required. */
372 
373 if (!replacement_only) CHECKMEMCPY(subject, start_offset);
374 
375 /* Loop for global substituting. If PCRE2_SUBSTITUTE_MATCHED is set, the first
376 match is taken from the match_data that was passed in. */
377 
378 subs = 0;
379 do
380   {
381   PCRE2_SPTR ptrstack[PTR_STACK_SIZE];
382   uint32_t ptrstackptr = 0;
383 
384   if (use_existing_match)
385     {
386     rc = match_data->rc;
387     use_existing_match = FALSE;
388     }
389   else rc = pcre2_match(code, subject, length, start_offset, options|goptions,
390     match_data, mcontext);
391 
392 #ifdef SUPPORT_UNICODE
393   if (utf) options |= PCRE2_NO_UTF_CHECK;  /* Only need to check once */
394 #endif
395 
396   /* Any error other than no match returns the error code. No match when not
397   doing the special after-empty-match global rematch, or when at the end of the
398   subject, breaks the global loop. Otherwise, advance the starting point by one
399   character, copying it to the output, and try again. */
400 
401   if (rc < 0)
402     {
403     PCRE2_SIZE save_start;
404 
405     if (rc != PCRE2_ERROR_NOMATCH) goto EXIT;
406     if (goptions == 0 || start_offset >= length) break;
407 
408     /* Advance by one code point. Then, if CRLF is a valid newline sequence and
409     we have advanced into the middle of it, advance one more code point. In
410     other words, do not start in the middle of CRLF, even if CR and LF on their
411     own are valid newlines. */
412 
413     save_start = start_offset++;
414     if (subject[start_offset-1] == CHAR_CR &&
415         code->newline_convention != PCRE2_NEWLINE_CR &&
416         code->newline_convention != PCRE2_NEWLINE_LF &&
417         start_offset < length &&
418         subject[start_offset] == CHAR_LF)
419       start_offset++;
420 
421     /* Otherwise, in UTF mode, advance past any secondary code points. */
422 
423     else if ((code->overall_options & PCRE2_UTF) != 0)
424       {
425 #if PCRE2_CODE_UNIT_WIDTH == 8
426       while (start_offset < length && (subject[start_offset] & 0xc0) == 0x80)
427         start_offset++;
428 #elif PCRE2_CODE_UNIT_WIDTH == 16
429       while (start_offset < length &&
430             (subject[start_offset] & 0xfc00) == 0xdc00)
431         start_offset++;
432 #endif
433       }
434 
435     /* Copy what we have advanced past (unless not required), reset the special
436     global options, and continue to the next match. */
437 
438     fraglength = start_offset - save_start;
439     if (!replacement_only) CHECKMEMCPY(subject + save_start, fraglength);
440     goptions = 0;
441     continue;
442     }
443 
444   /* Handle a successful match. Matches that use \K to end before they start
445   or start before the current point in the subject are not supported. */
446 
447   if (ovector[1] < ovector[0] || ovector[0] < start_offset)
448     {
449     rc = PCRE2_ERROR_BADSUBSPATTERN;
450     goto EXIT;
451     }
452 
453   /* Check for the same match as previous. This is legitimate after matching an
454   empty string that starts after the initial match offset. We have tried again
455   at the match point in case the pattern is one like /(?<=\G.)/ which can never
456   match at its starting point, so running the match achieves the bumpalong. If
457   we do get the same (null) match at the original match point, it isn't such a
458   pattern, so we now do the empty string magic. In all other cases, a repeat
459   match should never occur. */
460 
461   if (ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1])
462     {
463     if (ovector[0] == ovector[1] && ovecsave[2] != start_offset)
464       {
465       goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
466       ovecsave[2] = start_offset;
467       continue;    /* Back to the top of the loop */
468       }
469     rc = PCRE2_ERROR_INTERNAL_DUPMATCH;
470     goto EXIT;
471     }
472 
473   /* Count substitutions with a paranoid check for integer overflow; surely no
474   real call to this function would ever hit this! */
475 
476   if (subs == INT_MAX)
477     {
478     rc = PCRE2_ERROR_TOOMANYREPLACE;
479     goto EXIT;
480     }
481   subs++;
482 
483   /* Copy the text leading up to the match (unless not required), and remember
484   where the insert begins and how many ovector pairs are set. */
485 
486   if (rc == 0) rc = ovector_count;
487   fraglength = ovector[0] - start_offset;
488   if (!replacement_only) CHECKMEMCPY(subject + start_offset, fraglength);
489   scb.output_offsets[0] = buff_offset;
490   scb.oveccount = rc;
491 
492   /* Process the replacement string. If the entire replacement is literal, just
493   copy it with length check. */
494 
495   ptr = replacement;
496   if ((suboptions & PCRE2_SUBSTITUTE_LITERAL) != 0)
497     {
498     CHECKMEMCPY(ptr, rlength);
499     }
500 
501   /* Within a non-literal replacement, which must be scanned character by
502   character, local literal mode can be set by \Q, but only in extended mode
503   when backslashes are being interpreted. In extended mode we must handle
504   nested substrings that are to be reprocessed. */
505 
506   else for (;;)
507     {
508     uint32_t ch;
509     unsigned int chlen;
510 
511     /* If at the end of a nested substring, pop the stack. */
512 
513     if (ptr >= repend)
514       {
515       if (ptrstackptr == 0) break;       /* End of replacement string */
516       repend = ptrstack[--ptrstackptr];
517       ptr = ptrstack[--ptrstackptr];
518       continue;
519       }
520 
521     /* Handle the next character */
522 
523     if (escaped_literal)
524       {
525       if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E)
526         {
527         escaped_literal = FALSE;
528         ptr += 2;
529         continue;
530         }
531       goto LOADLITERAL;
532       }
533 
534     /* Not in literal mode. */
535 
536     if (*ptr == CHAR_DOLLAR_SIGN)
537       {
538       int group, n;
539       uint32_t special = 0;
540       BOOL inparens;
541       BOOL star;
542       PCRE2_SIZE sublength;
543       PCRE2_SPTR text1_start = NULL;
544       PCRE2_SPTR text1_end = NULL;
545       PCRE2_SPTR text2_start = NULL;
546       PCRE2_SPTR text2_end = NULL;
547       PCRE2_UCHAR next;
548       PCRE2_UCHAR name[33];
549 
550       if (++ptr >= repend) goto BAD;
551       if ((next = *ptr) == CHAR_DOLLAR_SIGN) goto LOADLITERAL;
552 
553       group = -1;
554       n = 0;
555       inparens = FALSE;
556       star = FALSE;
557 
558       if (next == CHAR_LEFT_CURLY_BRACKET)
559         {
560         if (++ptr >= repend) goto BAD;
561         next = *ptr;
562         inparens = TRUE;
563         }
564 
565       if (next == CHAR_ASTERISK)
566         {
567         if (++ptr >= repend) goto BAD;
568         next = *ptr;
569         star = TRUE;
570         }
571 
572       if (!star && next >= CHAR_0 && next <= CHAR_9)
573         {
574         group = next - CHAR_0;
575         while (++ptr < repend)
576           {
577           next = *ptr;
578           if (next < CHAR_0 || next > CHAR_9) break;
579           group = group * 10 + next - CHAR_0;
580 
581           /* A check for a number greater than the hightest captured group
582           is sufficient here; no need for a separate overflow check. If unknown
583           groups are to be treated as unset, just skip over any remaining
584           digits and carry on. */
585 
586           if (group > code->top_bracket)
587             {
588             if ((suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
589               {
590               while (++ptr < repend && *ptr >= CHAR_0 && *ptr <= CHAR_9);
591               break;
592               }
593             else
594               {
595               rc = PCRE2_ERROR_NOSUBSTRING;
596               goto PTREXIT;
597               }
598             }
599           }
600         }
601       else
602         {
603         const uint8_t *ctypes = code->tables + ctypes_offset;
604         while (MAX_255(next) && (ctypes[next] & ctype_word) != 0)
605           {
606           name[n++] = next;
607           if (n > 32) goto BAD;
608           if (++ptr >= repend) break;
609           next = *ptr;
610           }
611         if (n == 0) goto BAD;
612         name[n] = 0;
613         }
614 
615       /* In extended mode we recognize ${name:+set text:unset text} and
616       ${name:-default text}. */
617 
618       if (inparens)
619         {
620         if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
621              !star && ptr < repend - 2 && next == CHAR_COLON)
622           {
623           special = *(++ptr);
624           if (special != CHAR_PLUS && special != CHAR_MINUS)
625             {
626             rc = PCRE2_ERROR_BADSUBSTITUTION;
627             goto PTREXIT;
628             }
629 
630           text1_start = ++ptr;
631           rc = find_text_end(code, &ptr, repend, special == CHAR_MINUS);
632           if (rc != 0) goto PTREXIT;
633           text1_end = ptr;
634 
635           if (special == CHAR_PLUS && *ptr == CHAR_COLON)
636             {
637             text2_start = ++ptr;
638             rc = find_text_end(code, &ptr, repend, TRUE);
639             if (rc != 0) goto PTREXIT;
640             text2_end = ptr;
641             }
642           }
643 
644         else
645           {
646           if (ptr >= repend || *ptr != CHAR_RIGHT_CURLY_BRACKET)
647             {
648             rc = PCRE2_ERROR_REPMISSINGBRACE;
649             goto PTREXIT;
650             }
651           }
652 
653         ptr++;
654         }
655 
656       /* Have found a syntactically correct group number or name, or *name.
657       Only *MARK is currently recognized. */
658 
659       if (star)
660         {
661         if (PRIV(strcmp_c8)(name, STRING_MARK) == 0)
662           {
663           PCRE2_SPTR mark = pcre2_get_mark(match_data);
664           if (mark != NULL)
665             {
666             PCRE2_SPTR mark_start = mark;
667             while (*mark != 0) mark++;
668             fraglength = mark - mark_start;
669             CHECKMEMCPY(mark_start, fraglength);
670             }
671           }
672         else goto BAD;
673         }
674 
675       /* Substitute the contents of a group. We don't use substring_copy
676       functions any more, in order to support case forcing. */
677 
678       else
679         {
680         PCRE2_SPTR subptr, subptrend;
681 
682         /* Find a number for a named group. In case there are duplicate names,
683         search for the first one that is set. If the name is not found when
684         PCRE2_SUBSTITUTE_UNKNOWN_EMPTY is set, set the group number to a
685         non-existent group. */
686 
687         if (group < 0)
688           {
689           PCRE2_SPTR first, last, entry;
690           rc = pcre2_substring_nametable_scan(code, name, &first, &last);
691           if (rc == PCRE2_ERROR_NOSUBSTRING &&
692               (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
693             {
694             group = code->top_bracket + 1;
695             }
696           else
697             {
698             if (rc < 0) goto PTREXIT;
699             for (entry = first; entry <= last; entry += rc)
700               {
701               uint32_t ng = GET2(entry, 0);
702               if (ng < ovector_count)
703                 {
704                 if (group < 0) group = ng;          /* First in ovector */
705                 if (ovector[ng*2] != PCRE2_UNSET)
706                   {
707                   group = ng;                       /* First that is set */
708                   break;
709                   }
710                 }
711               }
712 
713             /* If group is still negative, it means we did not find a group
714             that is in the ovector. Just set the first group. */
715 
716             if (group < 0) group = GET2(first, 0);
717             }
718           }
719 
720         /* We now have a group that is identified by number. Find the length of
721         the captured string. If a group in a non-special substitution is unset
722         when PCRE2_SUBSTITUTE_UNSET_EMPTY is set, substitute nothing. */
723 
724         rc = pcre2_substring_length_bynumber(match_data, group, &sublength);
725         if (rc < 0)
726           {
727           if (rc == PCRE2_ERROR_NOSUBSTRING &&
728               (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
729             {
730             rc = PCRE2_ERROR_UNSET;
731             }
732           if (rc != PCRE2_ERROR_UNSET) goto PTREXIT;  /* Non-unset errors */
733           if (special == 0)                           /* Plain substitution */
734             {
735             if ((suboptions & PCRE2_SUBSTITUTE_UNSET_EMPTY) != 0) continue;
736             goto PTREXIT;                             /* Else error */
737             }
738           }
739 
740         /* If special is '+' we have a 'set' and possibly an 'unset' text,
741         both of which are reprocessed when used. If special is '-' we have a
742         default text for when the group is unset; it must be reprocessed. */
743 
744         if (special != 0)
745           {
746           if (special == CHAR_MINUS)
747             {
748             if (rc == 0) goto LITERAL_SUBSTITUTE;
749             text2_start = text1_start;
750             text2_end = text1_end;
751             }
752 
753           if (ptrstackptr >= PTR_STACK_SIZE) goto BAD;
754           ptrstack[ptrstackptr++] = ptr;
755           ptrstack[ptrstackptr++] = repend;
756 
757           if (rc == 0)
758             {
759             ptr = text1_start;
760             repend = text1_end;
761             }
762           else
763             {
764             ptr = text2_start;
765             repend = text2_end;
766             }
767           continue;
768           }
769 
770         /* Otherwise we have a literal substitution of a group's contents. */
771 
772         LITERAL_SUBSTITUTE:
773         subptr = subject + ovector[group*2];
774         subptrend = subject + ovector[group*2 + 1];
775 
776         /* Substitute a literal string, possibly forcing alphabetic case. */
777 
778         while (subptr < subptrend)
779           {
780           GETCHARINCTEST(ch, subptr);
781           if (forcecase != 0)
782             {
783 #ifdef SUPPORT_UNICODE
784             if (utf || ucp)
785               {
786               uint32_t type = UCD_CHARTYPE(ch);
787               if (PRIV(ucp_gentype)[type] == ucp_L &&
788                   type != ((forcecase > 0)? ucp_Lu : ucp_Ll))
789                 ch = UCD_OTHERCASE(ch);
790               }
791             else
792 #endif
793               {
794               if (((code->tables + cbits_offset +
795                   ((forcecase > 0)? cbit_upper:cbit_lower)
796                   )[ch/8] & (1u << (ch%8))) == 0)
797                 ch = (code->tables + fcc_offset)[ch];
798               }
799             forcecase = forcecasereset;
800             }
801 
802 #ifdef SUPPORT_UNICODE
803           if (utf) chlen = PRIV(ord2utf)(ch, temp); else
804 #endif
805             {
806             temp[0] = ch;
807             chlen = 1;
808             }
809           CHECKMEMCPY(temp, chlen);
810           }
811         }
812       }
813 
814     /* Handle an escape sequence in extended mode. We can use check_escape()
815     to process \Q, \E, \c, \o, \x and \ followed by non-alphanumerics, but
816     the case-forcing escapes are not supported in pcre2_compile() so must be
817     recognized here. */
818 
819     else if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
820               *ptr == CHAR_BACKSLASH)
821       {
822       int errorcode;
823 
824       if (ptr < repend - 1) switch (ptr[1])
825         {
826         case CHAR_L:
827         forcecase = forcecasereset = -1;
828         ptr += 2;
829         continue;
830 
831         case CHAR_l:
832         forcecase = -1;
833         forcecasereset = 0;
834         ptr += 2;
835         continue;
836 
837         case CHAR_U:
838         forcecase = forcecasereset = 1;
839         ptr += 2;
840         continue;
841 
842         case CHAR_u:
843         forcecase = 1;
844         forcecasereset = 0;
845         ptr += 2;
846         continue;
847 
848         default:
849         break;
850         }
851 
852       ptr++;  /* Point after \ */
853       rc = PRIV(check_escape)(&ptr, repend, &ch, &errorcode,
854         code->overall_options, code->extra_options, FALSE, NULL);
855       if (errorcode != 0) goto BADESCAPE;
856 
857       switch(rc)
858         {
859         case ESC_E:
860         forcecase = forcecasereset = 0;
861         continue;
862 
863         case ESC_Q:
864         escaped_literal = TRUE;
865         continue;
866 
867         case 0:      /* Data character */
868         goto LITERAL;
869 
870         default:
871         goto BADESCAPE;
872         }
873       }
874 
875     /* Handle a literal code unit */
876 
877     else
878       {
879       LOADLITERAL:
880       GETCHARINCTEST(ch, ptr);    /* Get character value, increment pointer */
881 
882       LITERAL:
883       if (forcecase != 0)
884         {
885 #ifdef SUPPORT_UNICODE
886         if (utf || ucp)
887           {
888           uint32_t type = UCD_CHARTYPE(ch);
889           if (PRIV(ucp_gentype)[type] == ucp_L &&
890               type != ((forcecase > 0)? ucp_Lu : ucp_Ll))
891             ch = UCD_OTHERCASE(ch);
892           }
893         else
894 #endif
895           {
896           if (((code->tables + cbits_offset +
897               ((forcecase > 0)? cbit_upper:cbit_lower)
898               )[ch/8] & (1u << (ch%8))) == 0)
899             ch = (code->tables + fcc_offset)[ch];
900           }
901         forcecase = forcecasereset;
902         }
903 
904 #ifdef SUPPORT_UNICODE
905       if (utf) chlen = PRIV(ord2utf)(ch, temp); else
906 #endif
907         {
908         temp[0] = ch;
909         chlen = 1;
910         }
911       CHECKMEMCPY(temp, chlen);
912       } /* End handling a literal code unit */
913     }   /* End of loop for scanning the replacement. */
914 
915   /* The replacement has been copied to the output, or its size has been
916   remembered. Do the callout if there is one and we have done an actual
917   replacement. */
918 
919   if (!overflowed && mcontext != NULL && mcontext->substitute_callout != NULL)
920     {
921     scb.subscount = subs;
922     scb.output_offsets[1] = buff_offset;
923     rc = mcontext->substitute_callout(&scb, mcontext->substitute_callout_data);
924 
925     /* A non-zero return means cancel this substitution. Instead, copy the
926     matched string fragment. */
927 
928     if (rc != 0)
929       {
930       PCRE2_SIZE newlength = scb.output_offsets[1] - scb.output_offsets[0];
931       PCRE2_SIZE oldlength = ovector[1] - ovector[0];
932 
933       buff_offset -= newlength;
934       lengthleft += newlength;
935       if (!replacement_only) CHECKMEMCPY(subject + ovector[0], oldlength);
936 
937       /* A negative return means do not do any more. */
938 
939       if (rc < 0) suboptions &= (~PCRE2_SUBSTITUTE_GLOBAL);
940       }
941     }
942 
943   /* Save the details of this match. See above for how this data is used. If we
944   matched an empty string, do the magic for global matches. Update the start
945   offset to point to the rest of the subject string. If we re-used an existing
946   match for the first match, switch to the internal match data block. */
947 
948   ovecsave[0] = ovector[0];
949   ovecsave[1] = ovector[1];
950   ovecsave[2] = start_offset;
951 
952   goptions = (ovector[0] != ovector[1] || ovector[0] > start_offset)? 0 :
953     PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART;
954   start_offset = ovector[1];
955   } while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0);  /* Repeat "do" loop */
956 
957 /* Copy the rest of the subject unless not required, and terminate the output
958 with a binary zero. */
959 
960 if (!replacement_only)
961   {
962   fraglength = length - start_offset;
963   CHECKMEMCPY(subject + start_offset, fraglength);
964   }
965 
966 temp[0] = 0;
967 CHECKMEMCPY(temp, 1);
968 
969 /* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set,
970 and matching has carried on after a full buffer, in order to compute the length
971 needed. Otherwise, an overflow generates an immediate error return. */
972 
973 if (overflowed)
974   {
975   rc = PCRE2_ERROR_NOMEMORY;
976   *blength = buff_length + extra_needed;
977   }
978 
979 /* After a successful execution, return the number of substitutions and set the
980 length of buffer used, excluding the trailing zero. */
981 
982 else
983   {
984   rc = subs;
985   *blength = buff_offset - 1;
986   }
987 
988 EXIT:
989 if (internal_match_data != NULL) pcre2_match_data_free(internal_match_data);
990   else match_data->rc = rc;
991 return rc;
992 
993 NOROOM:
994 rc = PCRE2_ERROR_NOMEMORY;
995 goto EXIT;
996 
997 BAD:
998 rc = PCRE2_ERROR_BADREPLACEMENT;
999 goto PTREXIT;
1000 
1001 BADESCAPE:
1002 rc = PCRE2_ERROR_BADREPESCAPE;
1003 
1004 PTREXIT:
1005 *blength = (PCRE2_SIZE)(ptr - replacement);
1006 goto EXIT;
1007 }
1008 
1009 /* End of pcre2_substitute.c */
1010