1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10           New API code Copyright (c) 2016-2021 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45 
46 #include "pcre2_internal.h"
47 
/* Number of entries in the stack that pcre2_substitute() uses when it
reprocesses nested ${name:+...} / ${name:-...} texts. Each nesting level pushes
two entries (the resume pointer and the saved end pointer), so this allows 10
levels of nesting before PCRE2_ERROR_BADREPLACEMENT is given. */

#define PTR_STACK_SIZE 20

/* The option bits that belong to pcre2_substitute() itself. They are saved
separately and removed from the options word before it is passed on to
pcre2_match(). */

#define SUBSTITUTE_OPTIONS \
  (PCRE2_SUBSTITUTE_EXTENDED|PCRE2_SUBSTITUTE_GLOBAL| \
   PCRE2_SUBSTITUTE_LITERAL|PCRE2_SUBSTITUTE_MATCHED| \
   PCRE2_SUBSTITUTE_OVERFLOW_LENGTH|PCRE2_SUBSTITUTE_REPLACEMENT_ONLY| \
   PCRE2_SUBSTITUTE_UNKNOWN_UNSET|PCRE2_SUBSTITUTE_UNSET_EMPTY)
55 
56 
57 
58 /*************************************************
59 *           Find end of substitute text          *
60 *************************************************/
61 
62 /* In extended mode, we recognize ${name:+set text:unset text} and similar
63 constructions. This requires the identification of unescaped : and }
64 characters. This function scans for such. It must deal with nested ${
65 constructions. The pointer to the text is updated, either to the required end
66 character, or to where an error was detected.
67 
68 Arguments:
69   code      points to the compiled expression (for options)
70   ptrptr    points to the pointer to the start of the text (updated)
71   ptrend    end of the whole string
72   last      TRUE if the last expected string (only } recognized)
73 
74 Returns:    0 on success
75             negative error code on failure
76 */
77 
78 static int
find_text_end(const pcre2_code * code,PCRE2_SPTR * ptrptr,PCRE2_SPTR ptrend,BOOL last)79 find_text_end(const pcre2_code *code, PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend,
80   BOOL last)
81 {
82 int rc = 0;
83 uint32_t nestlevel = 0;
84 BOOL literal = FALSE;
85 PCRE2_SPTR ptr = *ptrptr;
86 
87 for (; ptr < ptrend; ptr++)
88   {
89   if (literal)
90     {
91     if (ptr[0] == CHAR_BACKSLASH && ptr < ptrend - 1 && ptr[1] == CHAR_E)
92       {
93       literal = FALSE;
94       ptr += 1;
95       }
96     }
97 
98   else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
99     {
100     if (nestlevel == 0) goto EXIT;
101     nestlevel--;
102     }
103 
104   else if (*ptr == CHAR_COLON && !last && nestlevel == 0) goto EXIT;
105 
106   else if (*ptr == CHAR_DOLLAR_SIGN)
107     {
108     if (ptr < ptrend - 1 && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
109       {
110       nestlevel++;
111       ptr += 1;
112       }
113     }
114 
115   else if (*ptr == CHAR_BACKSLASH)
116     {
117     int erc;
118     int errorcode;
119     uint32_t ch;
120 
121     if (ptr < ptrend - 1) switch (ptr[1])
122       {
123       case CHAR_L:
124       case CHAR_l:
125       case CHAR_U:
126       case CHAR_u:
127       ptr += 1;
128       continue;
129       }
130 
131     ptr += 1;  /* Must point after \ */
132     erc = PRIV(check_escape)(&ptr, ptrend, &ch, &errorcode,
133       code->overall_options, code->extra_options, FALSE, NULL);
134     ptr -= 1;  /* Back to last code unit of escape */
135     if (errorcode != 0)
136       {
137       rc = errorcode;
138       goto EXIT;
139       }
140 
141     switch(erc)
142       {
143       case 0:      /* Data character */
144       case ESC_E:  /* Isolated \E is ignored */
145       break;
146 
147       case ESC_Q:
148       literal = TRUE;
149       break;
150 
151       default:
152       rc = PCRE2_ERROR_BADREPESCAPE;
153       goto EXIT;
154       }
155     }
156   }
157 
158 rc = PCRE2_ERROR_REPMISSINGBRACE;   /* Terminator not found */
159 
160 EXIT:
161 *ptrptr = ptr;
162 return rc;
163 }
164 
165 
166 
167 /*************************************************
168 *              Match and substitute              *
169 *************************************************/
170 
171 /* This function applies a compiled re to a subject string and creates a new
172 string with substitutions. The first 7 arguments are the same as for
173 pcre2_match(). Either string length may be PCRE2_ZERO_TERMINATED.
174 
175 Arguments:
176   code            points to the compiled expression
177   subject         points to the subject string
178   length          length of subject string (may contain binary zeros)
179   start_offset    where to start in the subject string
180   options         option bits
181   match_data      points to a match_data block, or is NULL
  mcontext        points to a PCRE2 match context, or is NULL
183   replacement     points to the replacement string
184   rlength         length of replacement string
185   buffer          where to put the substituted string
186   blength         points to length of buffer; updated to length of string
187 
188 Returns:          >= 0 number of substitutions made
189                   < 0 an error code
190                   PCRE2_ERROR_BADREPLACEMENT means invalid use of $
191 */
192 
193 /* This macro checks for space in the buffer before copying into it. On
194 overflow, either give an error immediately, or keep on, accumulating the
195 length. */
196 
/* CHECKMEMCPY relies on these names being in scope where it is used: buffer,
buff_offset, lengthleft, extra_needed, overflowed, suboptions, and the label
NOROOM. Once overflowed is set nothing more is copied; only extra_needed is
advanced. The arguments are parenthesized so that an expression may be passed,
and the body is wrapped in do ... while (0) so that an invocation followed by a
semicolon is a single statement, even before an "else". Note that "length" is
evaluated more than once, so it must not have side effects. */

#define CHECKMEMCPY(from,length) \
  do \
    { \
    if (!overflowed && lengthleft < (length)) \
      { \
      if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \
      overflowed = TRUE; \
      extra_needed = (length) - lengthleft; \
      } \
    else if (overflowed) \
      { \
      extra_needed += (length); \
      }  \
    else \
      {  \
      memcpy(buffer + buff_offset, (from), CU2BYTES((length))); \
      buff_offset += (length); \
      lengthleft -= (length); \
      } \
    } \
  while (0)
216 
217 /* Here's the function */
218 
219 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_substitute(const pcre2_code * code,PCRE2_SPTR subject,PCRE2_SIZE length,PCRE2_SIZE start_offset,uint32_t options,pcre2_match_data * match_data,pcre2_match_context * mcontext,PCRE2_SPTR replacement,PCRE2_SIZE rlength,PCRE2_UCHAR * buffer,PCRE2_SIZE * blength)220 pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
221   PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
222   pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength,
223   PCRE2_UCHAR *buffer, PCRE2_SIZE *blength)
224 {
225 int rc;
226 int subs;
227 int forcecase = 0;
228 int forcecasereset = 0;
229 uint32_t ovector_count;
230 uint32_t goptions = 0;
231 uint32_t suboptions;
232 pcre2_match_data *internal_match_data = NULL;
233 BOOL escaped_literal = FALSE;
234 BOOL overflowed = FALSE;
235 BOOL use_existing_match;
236 BOOL replacement_only;
237 #ifdef SUPPORT_UNICODE
238 BOOL utf = (code->overall_options & PCRE2_UTF) != 0;
239 BOOL ucp = (code->overall_options & PCRE2_UCP) != 0;
240 #endif
241 PCRE2_UCHAR temp[6];
242 PCRE2_SPTR ptr;
243 PCRE2_SPTR repend;
244 PCRE2_SIZE extra_needed = 0;
245 PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength;
246 PCRE2_SIZE *ovector;
247 PCRE2_SIZE ovecsave[3];
248 pcre2_substitute_callout_block scb;
249 
250 /* General initialization */
251 
252 buff_offset = 0;
253 lengthleft = buff_length = *blength;
254 *blength = PCRE2_UNSET;
255 ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET;
256 
257 /* Partial matching is not valid. This must come after setting *blength to
258 PCRE2_UNSET, so as not to imply an offset in the replacement. */
259 
260 if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0)
261   return PCRE2_ERROR_BADOPTION;
262 
263 /* Validate length and find the end of the replacement. A NULL replacement of
264 zero length is interpreted as an empty string. */
265 
266 if (replacement == NULL)
267   {
268   if (rlength != 0) return PCRE2_ERROR_NULL;
269   replacement = (PCRE2_SPTR)"";
270   }
271 
272 if (rlength == PCRE2_ZERO_TERMINATED) rlength = PRIV(strlen)(replacement);
273 repend = replacement + rlength;
274 
275 /* Check for using a match that has already happened. Note that the subject
276 pointer in the match data may be NULL after a no-match. */
277 
278 use_existing_match = ((options & PCRE2_SUBSTITUTE_MATCHED) != 0);
279 replacement_only = ((options & PCRE2_SUBSTITUTE_REPLACEMENT_ONLY) != 0);
280 
281 /* If starting from an existing match, there must be an externally provided
282 match data block. We create an internal match_data block in two cases: (a) an
283 external one is not supplied (and we are not starting from an existing match);
284 (b) an existing match is to be used for the first substitution. In the latter
285 case, we copy the existing match into the internal block. This ensures that no
286 changes are made to the existing match data block. */
287 
288 if (match_data == NULL)
289   {
290   pcre2_general_context *gcontext;
291   if (use_existing_match) return PCRE2_ERROR_NULL;
292   gcontext = (mcontext == NULL)?
293     (pcre2_general_context *)code :
294     (pcre2_general_context *)mcontext;
295   match_data = internal_match_data =
296     pcre2_match_data_create_from_pattern(code, gcontext);
297   if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
298   }
299 
300 else if (use_existing_match)
301   {
302   pcre2_general_context *gcontext = (mcontext == NULL)?
303     (pcre2_general_context *)code :
304     (pcre2_general_context *)mcontext;
305   int pairs = (code->top_bracket + 1 < match_data->oveccount)?
306     code->top_bracket + 1 : match_data->oveccount;
307   internal_match_data = pcre2_match_data_create(match_data->oveccount,
308     gcontext);
309   if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
310   memcpy(internal_match_data, match_data, offsetof(pcre2_match_data, ovector)
311     + 2*pairs*sizeof(PCRE2_SIZE));
312   match_data = internal_match_data;
313   }
314 
315 /* Remember ovector details */
316 
317 ovector = pcre2_get_ovector_pointer(match_data);
318 ovector_count = pcre2_get_ovector_count(match_data);
319 
320 /* Fixed things in the callout block */
321 
322 scb.version = 0;
323 scb.input = subject;
324 scb.output = (PCRE2_SPTR)buffer;
325 scb.ovector = ovector;
326 
327 /* A NULL subject of zero length is treated as an empty string. */
328 
329 if (subject == NULL)
330   {
331   if (length != 0) return PCRE2_ERROR_NULL;
332   subject = (PCRE2_SPTR)"";
333   }
334 
335 /* Find length of zero-terminated subject */
336 
337 if (length == PCRE2_ZERO_TERMINATED)
338   length = subject? PRIV(strlen)(subject) : 0;
339 
340 /* Check UTF replacement string if necessary. */
341 
342 #ifdef SUPPORT_UNICODE
343 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
344   {
345   rc = PRIV(valid_utf)(replacement, rlength, &(match_data->startchar));
346   if (rc != 0)
347     {
348     match_data->leftchar = 0;
349     goto EXIT;
350     }
351   }
352 #endif  /* SUPPORT_UNICODE */
353 
354 /* Save the substitute options and remove them from the match options. */
355 
356 suboptions = options & SUBSTITUTE_OPTIONS;
357 options &= ~SUBSTITUTE_OPTIONS;
358 
359 /* Error if the start match offset is greater than the length of the subject. */
360 
361 if (start_offset > length)
362   {
363   match_data->leftchar = 0;
364   rc = PCRE2_ERROR_BADOFFSET;
365   goto EXIT;
366   }
367 
368 /* Copy up to the start offset, unless only the replacement is required. */
369 
370 if (!replacement_only) CHECKMEMCPY(subject, start_offset);
371 
372 /* Loop for global substituting. If PCRE2_SUBSTITUTE_MATCHED is set, the first
373 match is taken from the match_data that was passed in. */
374 
375 subs = 0;
376 do
377   {
378   PCRE2_SPTR ptrstack[PTR_STACK_SIZE];
379   uint32_t ptrstackptr = 0;
380 
381   if (use_existing_match)
382     {
383     rc = match_data->rc;
384     use_existing_match = FALSE;
385     }
386   else rc = pcre2_match(code, subject, length, start_offset, options|goptions,
387     match_data, mcontext);
388 
389 #ifdef SUPPORT_UNICODE
390   if (utf) options |= PCRE2_NO_UTF_CHECK;  /* Only need to check once */
391 #endif
392 
393   /* Any error other than no match returns the error code. No match when not
394   doing the special after-empty-match global rematch, or when at the end of the
395   subject, breaks the global loop. Otherwise, advance the starting point by one
396   character, copying it to the output, and try again. */
397 
398   if (rc < 0)
399     {
400     PCRE2_SIZE save_start;
401 
402     if (rc != PCRE2_ERROR_NOMATCH) goto EXIT;
403     if (goptions == 0 || start_offset >= length) break;
404 
405     /* Advance by one code point. Then, if CRLF is a valid newline sequence and
406     we have advanced into the middle of it, advance one more code point. In
407     other words, do not start in the middle of CRLF, even if CR and LF on their
408     own are valid newlines. */
409 
410     save_start = start_offset++;
411     if (subject[start_offset-1] == CHAR_CR &&
412         code->newline_convention != PCRE2_NEWLINE_CR &&
413         code->newline_convention != PCRE2_NEWLINE_LF &&
414         start_offset < length &&
415         subject[start_offset] == CHAR_LF)
416       start_offset++;
417 
418     /* Otherwise, in UTF mode, advance past any secondary code points. */
419 
420     else if ((code->overall_options & PCRE2_UTF) != 0)
421       {
422 #if PCRE2_CODE_UNIT_WIDTH == 8
423       while (start_offset < length && (subject[start_offset] & 0xc0) == 0x80)
424         start_offset++;
425 #elif PCRE2_CODE_UNIT_WIDTH == 16
426       while (start_offset < length &&
427             (subject[start_offset] & 0xfc00) == 0xdc00)
428         start_offset++;
429 #endif
430       }
431 
432     /* Copy what we have advanced past (unless not required), reset the special
433     global options, and continue to the next match. */
434 
435     fraglength = start_offset - save_start;
436     if (!replacement_only) CHECKMEMCPY(subject + save_start, fraglength);
437     goptions = 0;
438     continue;
439     }
440 
441   /* Handle a successful match. Matches that use \K to end before they start
442   or start before the current point in the subject are not supported. */
443 
444   if (ovector[1] < ovector[0] || ovector[0] < start_offset)
445     {
446     rc = PCRE2_ERROR_BADSUBSPATTERN;
447     goto EXIT;
448     }
449 
450   /* Check for the same match as previous. This is legitimate after matching an
451   empty string that starts after the initial match offset. We have tried again
452   at the match point in case the pattern is one like /(?<=\G.)/ which can never
453   match at its starting point, so running the match achieves the bumpalong. If
454   we do get the same (null) match at the original match point, it isn't such a
455   pattern, so we now do the empty string magic. In all other cases, a repeat
456   match should never occur. */
457 
458   if (ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1])
459     {
460     if (ovector[0] == ovector[1] && ovecsave[2] != start_offset)
461       {
462       goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
463       ovecsave[2] = start_offset;
464       continue;    /* Back to the top of the loop */
465       }
466     rc = PCRE2_ERROR_INTERNAL_DUPMATCH;
467     goto EXIT;
468     }
469 
470   /* Count substitutions with a paranoid check for integer overflow; surely no
471   real call to this function would ever hit this! */
472 
473   if (subs == INT_MAX)
474     {
475     rc = PCRE2_ERROR_TOOMANYREPLACE;
476     goto EXIT;
477     }
478   subs++;
479 
480   /* Copy the text leading up to the match (unless not required), and remember
481   where the insert begins and how many ovector pairs are set. */
482 
483   if (rc == 0) rc = ovector_count;
484   fraglength = ovector[0] - start_offset;
485   if (!replacement_only) CHECKMEMCPY(subject + start_offset, fraglength);
486   scb.output_offsets[0] = buff_offset;
487   scb.oveccount = rc;
488 
489   /* Process the replacement string. If the entire replacement is literal, just
490   copy it with length check. */
491 
492   ptr = replacement;
493   if ((suboptions & PCRE2_SUBSTITUTE_LITERAL) != 0)
494     {
495     CHECKMEMCPY(ptr, rlength);
496     }
497 
498   /* Within a non-literal replacement, which must be scanned character by
499   character, local literal mode can be set by \Q, but only in extended mode
500   when backslashes are being interpreted. In extended mode we must handle
501   nested substrings that are to be reprocessed. */
502 
503   else for (;;)
504     {
505     uint32_t ch;
506     unsigned int chlen;
507 
508     /* If at the end of a nested substring, pop the stack. */
509 
510     if (ptr >= repend)
511       {
512       if (ptrstackptr == 0) break;       /* End of replacement string */
513       repend = ptrstack[--ptrstackptr];
514       ptr = ptrstack[--ptrstackptr];
515       continue;
516       }
517 
518     /* Handle the next character */
519 
520     if (escaped_literal)
521       {
522       if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E)
523         {
524         escaped_literal = FALSE;
525         ptr += 2;
526         continue;
527         }
528       goto LOADLITERAL;
529       }
530 
531     /* Not in literal mode. */
532 
533     if (*ptr == CHAR_DOLLAR_SIGN)
534       {
535       int group, n;
536       uint32_t special = 0;
537       BOOL inparens;
538       BOOL star;
539       PCRE2_SIZE sublength;
540       PCRE2_SPTR text1_start = NULL;
541       PCRE2_SPTR text1_end = NULL;
542       PCRE2_SPTR text2_start = NULL;
543       PCRE2_SPTR text2_end = NULL;
544       PCRE2_UCHAR next;
545       PCRE2_UCHAR name[33];
546 
547       if (++ptr >= repend) goto BAD;
548       if ((next = *ptr) == CHAR_DOLLAR_SIGN) goto LOADLITERAL;
549 
550       group = -1;
551       n = 0;
552       inparens = FALSE;
553       star = FALSE;
554 
555       if (next == CHAR_LEFT_CURLY_BRACKET)
556         {
557         if (++ptr >= repend) goto BAD;
558         next = *ptr;
559         inparens = TRUE;
560         }
561 
562       if (next == CHAR_ASTERISK)
563         {
564         if (++ptr >= repend) goto BAD;
565         next = *ptr;
566         star = TRUE;
567         }
568 
569       if (!star && next >= CHAR_0 && next <= CHAR_9)
570         {
571         group = next - CHAR_0;
572         while (++ptr < repend)
573           {
574           next = *ptr;
575           if (next < CHAR_0 || next > CHAR_9) break;
576           group = group * 10 + next - CHAR_0;
577 
578           /* A check for a number greater than the hightest captured group
579           is sufficient here; no need for a separate overflow check. If unknown
580           groups are to be treated as unset, just skip over any remaining
581           digits and carry on. */
582 
583           if (group > code->top_bracket)
584             {
585             if ((suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
586               {
587               while (++ptr < repend && *ptr >= CHAR_0 && *ptr <= CHAR_9);
588               break;
589               }
590             else
591               {
592               rc = PCRE2_ERROR_NOSUBSTRING;
593               goto PTREXIT;
594               }
595             }
596           }
597         }
598       else
599         {
600         const uint8_t *ctypes = code->tables + ctypes_offset;
601         while (MAX_255(next) && (ctypes[next] & ctype_word) != 0)
602           {
603           name[n++] = next;
604           if (n > 32) goto BAD;
605           if (++ptr >= repend) break;
606           next = *ptr;
607           }
608         if (n == 0) goto BAD;
609         name[n] = 0;
610         }
611 
612       /* In extended mode we recognize ${name:+set text:unset text} and
613       ${name:-default text}. */
614 
615       if (inparens)
616         {
617         if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
618              !star && ptr < repend - 2 && next == CHAR_COLON)
619           {
620           special = *(++ptr);
621           if (special != CHAR_PLUS && special != CHAR_MINUS)
622             {
623             rc = PCRE2_ERROR_BADSUBSTITUTION;
624             goto PTREXIT;
625             }
626 
627           text1_start = ++ptr;
628           rc = find_text_end(code, &ptr, repend, special == CHAR_MINUS);
629           if (rc != 0) goto PTREXIT;
630           text1_end = ptr;
631 
632           if (special == CHAR_PLUS && *ptr == CHAR_COLON)
633             {
634             text2_start = ++ptr;
635             rc = find_text_end(code, &ptr, repend, TRUE);
636             if (rc != 0) goto PTREXIT;
637             text2_end = ptr;
638             }
639           }
640 
641         else
642           {
643           if (ptr >= repend || *ptr != CHAR_RIGHT_CURLY_BRACKET)
644             {
645             rc = PCRE2_ERROR_REPMISSINGBRACE;
646             goto PTREXIT;
647             }
648           }
649 
650         ptr++;
651         }
652 
653       /* Have found a syntactically correct group number or name, or *name.
654       Only *MARK is currently recognized. */
655 
656       if (star)
657         {
658         if (PRIV(strcmp_c8)(name, STRING_MARK) == 0)
659           {
660           PCRE2_SPTR mark = pcre2_get_mark(match_data);
661           if (mark != NULL)
662             {
663             PCRE2_SPTR mark_start = mark;
664             while (*mark != 0) mark++;
665             fraglength = mark - mark_start;
666             CHECKMEMCPY(mark_start, fraglength);
667             }
668           }
669         else goto BAD;
670         }
671 
672       /* Substitute the contents of a group. We don't use substring_copy
673       functions any more, in order to support case forcing. */
674 
675       else
676         {
677         PCRE2_SPTR subptr, subptrend;
678 
679         /* Find a number for a named group. In case there are duplicate names,
680         search for the first one that is set. If the name is not found when
681         PCRE2_SUBSTITUTE_UNKNOWN_EMPTY is set, set the group number to a
682         non-existent group. */
683 
684         if (group < 0)
685           {
686           PCRE2_SPTR first, last, entry;
687           rc = pcre2_substring_nametable_scan(code, name, &first, &last);
688           if (rc == PCRE2_ERROR_NOSUBSTRING &&
689               (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
690             {
691             group = code->top_bracket + 1;
692             }
693           else
694             {
695             if (rc < 0) goto PTREXIT;
696             for (entry = first; entry <= last; entry += rc)
697               {
698               uint32_t ng = GET2(entry, 0);
699               if (ng < ovector_count)
700                 {
701                 if (group < 0) group = ng;          /* First in ovector */
702                 if (ovector[ng*2] != PCRE2_UNSET)
703                   {
704                   group = ng;                       /* First that is set */
705                   break;
706                   }
707                 }
708               }
709 
710             /* If group is still negative, it means we did not find a group
711             that is in the ovector. Just set the first group. */
712 
713             if (group < 0) group = GET2(first, 0);
714             }
715           }
716 
717         /* We now have a group that is identified by number. Find the length of
718         the captured string. If a group in a non-special substitution is unset
719         when PCRE2_SUBSTITUTE_UNSET_EMPTY is set, substitute nothing. */
720 
721         rc = pcre2_substring_length_bynumber(match_data, group, &sublength);
722         if (rc < 0)
723           {
724           if (rc == PCRE2_ERROR_NOSUBSTRING &&
725               (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
726             {
727             rc = PCRE2_ERROR_UNSET;
728             }
729           if (rc != PCRE2_ERROR_UNSET) goto PTREXIT;  /* Non-unset errors */
730           if (special == 0)                           /* Plain substitution */
731             {
732             if ((suboptions & PCRE2_SUBSTITUTE_UNSET_EMPTY) != 0) continue;
733             goto PTREXIT;                             /* Else error */
734             }
735           }
736 
737         /* If special is '+' we have a 'set' and possibly an 'unset' text,
738         both of which are reprocessed when used. If special is '-' we have a
739         default text for when the group is unset; it must be reprocessed. */
740 
741         if (special != 0)
742           {
743           if (special == CHAR_MINUS)
744             {
745             if (rc == 0) goto LITERAL_SUBSTITUTE;
746             text2_start = text1_start;
747             text2_end = text1_end;
748             }
749 
750           if (ptrstackptr >= PTR_STACK_SIZE) goto BAD;
751           ptrstack[ptrstackptr++] = ptr;
752           ptrstack[ptrstackptr++] = repend;
753 
754           if (rc == 0)
755             {
756             ptr = text1_start;
757             repend = text1_end;
758             }
759           else
760             {
761             ptr = text2_start;
762             repend = text2_end;
763             }
764           continue;
765           }
766 
767         /* Otherwise we have a literal substitution of a group's contents. */
768 
769         LITERAL_SUBSTITUTE:
770         subptr = subject + ovector[group*2];
771         subptrend = subject + ovector[group*2 + 1];
772 
773         /* Substitute a literal string, possibly forcing alphabetic case. */
774 
775         while (subptr < subptrend)
776           {
777           GETCHARINCTEST(ch, subptr);
778           if (forcecase != 0)
779             {
780 #ifdef SUPPORT_UNICODE
781             if (utf || ucp)
782               {
783               uint32_t type = UCD_CHARTYPE(ch);
784               if (PRIV(ucp_gentype)[type] == ucp_L &&
785                   type != ((forcecase > 0)? ucp_Lu : ucp_Ll))
786                 ch = UCD_OTHERCASE(ch);
787               }
788             else
789 #endif
790               {
791               if (((code->tables + cbits_offset +
792                   ((forcecase > 0)? cbit_upper:cbit_lower)
793                   )[ch/8] & (1u << (ch%8))) == 0)
794                 ch = (code->tables + fcc_offset)[ch];
795               }
796             forcecase = forcecasereset;
797             }
798 
799 #ifdef SUPPORT_UNICODE
800           if (utf) chlen = PRIV(ord2utf)(ch, temp); else
801 #endif
802             {
803             temp[0] = ch;
804             chlen = 1;
805             }
806           CHECKMEMCPY(temp, chlen);
807           }
808         }
809       }
810 
811     /* Handle an escape sequence in extended mode. We can use check_escape()
812     to process \Q, \E, \c, \o, \x and \ followed by non-alphanumerics, but
813     the case-forcing escapes are not supported in pcre2_compile() so must be
814     recognized here. */
815 
816     else if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
817               *ptr == CHAR_BACKSLASH)
818       {
819       int errorcode;
820 
821       if (ptr < repend - 1) switch (ptr[1])
822         {
823         case CHAR_L:
824         forcecase = forcecasereset = -1;
825         ptr += 2;
826         continue;
827 
828         case CHAR_l:
829         forcecase = -1;
830         forcecasereset = 0;
831         ptr += 2;
832         continue;
833 
834         case CHAR_U:
835         forcecase = forcecasereset = 1;
836         ptr += 2;
837         continue;
838 
839         case CHAR_u:
840         forcecase = 1;
841         forcecasereset = 0;
842         ptr += 2;
843         continue;
844 
845         default:
846         break;
847         }
848 
849       ptr++;  /* Point after \ */
850       rc = PRIV(check_escape)(&ptr, repend, &ch, &errorcode,
851         code->overall_options, code->extra_options, FALSE, NULL);
852       if (errorcode != 0) goto BADESCAPE;
853 
854       switch(rc)
855         {
856         case ESC_E:
857         forcecase = forcecasereset = 0;
858         continue;
859 
860         case ESC_Q:
861         escaped_literal = TRUE;
862         continue;
863 
864         case 0:      /* Data character */
865         goto LITERAL;
866 
867         default:
868         goto BADESCAPE;
869         }
870       }
871 
872     /* Handle a literal code unit */
873 
874     else
875       {
876       LOADLITERAL:
877       GETCHARINCTEST(ch, ptr);    /* Get character value, increment pointer */
878 
879       LITERAL:
880       if (forcecase != 0)
881         {
882 #ifdef SUPPORT_UNICODE
883         if (utf || ucp)
884           {
885           uint32_t type = UCD_CHARTYPE(ch);
886           if (PRIV(ucp_gentype)[type] == ucp_L &&
887               type != ((forcecase > 0)? ucp_Lu : ucp_Ll))
888             ch = UCD_OTHERCASE(ch);
889           }
890         else
891 #endif
892           {
893           if (((code->tables + cbits_offset +
894               ((forcecase > 0)? cbit_upper:cbit_lower)
895               )[ch/8] & (1u << (ch%8))) == 0)
896             ch = (code->tables + fcc_offset)[ch];
897           }
898         forcecase = forcecasereset;
899         }
900 
901 #ifdef SUPPORT_UNICODE
902       if (utf) chlen = PRIV(ord2utf)(ch, temp); else
903 #endif
904         {
905         temp[0] = ch;
906         chlen = 1;
907         }
908       CHECKMEMCPY(temp, chlen);
909       } /* End handling a literal code unit */
910     }   /* End of loop for scanning the replacement. */
911 
912   /* The replacement has been copied to the output, or its size has been
913   remembered. Do the callout if there is one and we have done an actual
914   replacement. */
915 
916   if (!overflowed && mcontext != NULL && mcontext->substitute_callout != NULL)
917     {
918     scb.subscount = subs;
919     scb.output_offsets[1] = buff_offset;
920     rc = mcontext->substitute_callout(&scb, mcontext->substitute_callout_data);
921 
922     /* A non-zero return means cancel this substitution. Instead, copy the
923     matched string fragment. */
924 
925     if (rc != 0)
926       {
927       PCRE2_SIZE newlength = scb.output_offsets[1] - scb.output_offsets[0];
928       PCRE2_SIZE oldlength = ovector[1] - ovector[0];
929 
930       buff_offset -= newlength;
931       lengthleft += newlength;
932       if (!replacement_only) CHECKMEMCPY(subject + ovector[0], oldlength);
933 
934       /* A negative return means do not do any more. */
935 
936       if (rc < 0) suboptions &= (~PCRE2_SUBSTITUTE_GLOBAL);
937       }
938     }
939 
940   /* Save the details of this match. See above for how this data is used. If we
941   matched an empty string, do the magic for global matches. Update the start
942   offset to point to the rest of the subject string. If we re-used an existing
943   match for the first match, switch to the internal match data block. */
944 
945   ovecsave[0] = ovector[0];
946   ovecsave[1] = ovector[1];
947   ovecsave[2] = start_offset;
948 
949   goptions = (ovector[0] != ovector[1] || ovector[0] > start_offset)? 0 :
950     PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART;
951   start_offset = ovector[1];
952   } while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0);  /* Repeat "do" loop */
953 
954 /* Copy the rest of the subject unless not required, and terminate the output
955 with a binary zero. */
956 
957 if (!replacement_only)
958   {
959   fraglength = length - start_offset;
960   CHECKMEMCPY(subject + start_offset, fraglength);
961   }
962 
963 temp[0] = 0;
964 CHECKMEMCPY(temp, 1);
965 
966 /* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set,
967 and matching has carried on after a full buffer, in order to compute the length
968 needed. Otherwise, an overflow generates an immediate error return. */
969 
970 if (overflowed)
971   {
972   rc = PCRE2_ERROR_NOMEMORY;
973   *blength = buff_length + extra_needed;
974   }
975 
976 /* After a successful execution, return the number of substitutions and set the
977 length of buffer used, excluding the trailing zero. */
978 
979 else
980   {
981   rc = subs;
982   *blength = buff_offset - 1;
983   }
984 
985 EXIT:
986 if (internal_match_data != NULL) pcre2_match_data_free(internal_match_data);
987   else match_data->rc = rc;
988 return rc;
989 
990 NOROOM:
991 rc = PCRE2_ERROR_NOMEMORY;
992 goto EXIT;
993 
994 BAD:
995 rc = PCRE2_ERROR_BADREPLACEMENT;
996 goto PTREXIT;
997 
998 BADESCAPE:
999 rc = PCRE2_ERROR_BADREPESCAPE;
1000 
1001 PTREXIT:
1002 *blength = (PCRE2_SIZE)(ptr - replacement);
1003 goto EXIT;
1004 }
1005 
1006 /* End of pcre2_substitute.c */
1007