xref: /php-src/ext/pcre/pcre2lib/pcre2_substring.c (revision ae5beff6)
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10           New API code Copyright (c) 2016-2023 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45 
46 #include "pcre2_internal.h"
47 
48 
49 
50 /*************************************************
51 *   Copy named captured string to given buffer   *
52 *************************************************/
53 
54 /* This function copies a single captured substring into a given buffer,
55 identifying it by name. If the regex permits duplicate names, the first
56 substring that is set is chosen.
57 
58 Arguments:
59   match_data     points to the match data
60   stringname     the name of the required substring
61   buffer         where to put the substring
62   sizeptr        the size of the buffer, updated to the size of the substring
63 
64 Returns:         if successful: zero
65                  if not successful, a negative error code:
66                    (1) an error from nametable_scan()
67                    (2) an error from copy_bynumber()
68                    (3) PCRE2_ERROR_UNAVAILABLE: no group is in ovector
69                    (4) PCRE2_ERROR_UNSET: all named groups in ovector are unset
70 */
71 
72 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_substring_copy_byname(pcre2_match_data * match_data,PCRE2_SPTR stringname,PCRE2_UCHAR * buffer,PCRE2_SIZE * sizeptr)73 pcre2_substring_copy_byname(pcre2_match_data *match_data, PCRE2_SPTR stringname,
74   PCRE2_UCHAR *buffer, PCRE2_SIZE *sizeptr)
75 {
76 PCRE2_SPTR first, last, entry;
77 int failrc, entrysize;
78 if (match_data->matchedby == PCRE2_MATCHEDBY_DFA_INTERPRETER)
79   return PCRE2_ERROR_DFA_UFUNC;
80 entrysize = pcre2_substring_nametable_scan(match_data->code, stringname,
81   &first, &last);
82 if (entrysize < 0) return entrysize;
83 failrc = PCRE2_ERROR_UNAVAILABLE;
84 for (entry = first; entry <= last; entry += entrysize)
85   {
86   uint32_t n = GET2(entry, 0);
87   if (n < match_data->oveccount)
88     {
89     if (match_data->ovector[n*2] != PCRE2_UNSET)
90       return pcre2_substring_copy_bynumber(match_data, n, buffer, sizeptr);
91     failrc = PCRE2_ERROR_UNSET;
92     }
93   }
94 return failrc;
95 }
96 
97 
98 
99 /*************************************************
100 *  Copy numbered captured string to given buffer *
101 *************************************************/
102 
103 /* This function copies a single captured substring into a given buffer,
104 identifying it by number.
105 
106 Arguments:
107   match_data     points to the match data
108   stringnumber   the number of the required substring
109   buffer         where to put the substring
110   sizeptr        the size of the buffer, updated to the size of the substring
111 
112 Returns:         if successful: 0
113                  if not successful, a negative error code:
114                    PCRE2_ERROR_NOMEMORY: buffer too small
115                    PCRE2_ERROR_NOSUBSTRING: no such substring
116                    PCRE2_ERROR_UNAVAILABLE: ovector too small
117                    PCRE2_ERROR_UNSET: substring is not set
118 */
119 
120 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_substring_copy_bynumber(pcre2_match_data * match_data,uint32_t stringnumber,PCRE2_UCHAR * buffer,PCRE2_SIZE * sizeptr)121 pcre2_substring_copy_bynumber(pcre2_match_data *match_data,
122   uint32_t stringnumber, PCRE2_UCHAR *buffer, PCRE2_SIZE *sizeptr)
123 {
124 int rc;
125 PCRE2_SIZE size;
126 rc = pcre2_substring_length_bynumber(match_data, stringnumber, &size);
127 if (rc < 0) return rc;
128 if (size + 1 > *sizeptr) return PCRE2_ERROR_NOMEMORY;
129 memcpy(buffer, match_data->subject + match_data->ovector[stringnumber*2],
130   CU2BYTES(size));
131 buffer[size] = 0;
132 *sizeptr = size;
133 return 0;
134 }
135 
136 
137 
138 /*************************************************
139 *          Extract named captured string         *
140 *************************************************/
141 
142 /* This function copies a single captured substring, identified by name, into
143 new memory. If the regex permits duplicate names, the first substring that is
144 set is chosen.
145 
146 Arguments:
147   match_data     pointer to match_data
148   stringname     the name of the required substring
149   stringptr      where to put the pointer to the new memory
150   sizeptr        where to put the length of the substring
151 
152 Returns:         if successful: zero
153                  if not successful, a negative value:
154                    (1) an error from nametable_scan()
155                    (2) an error from get_bynumber()
156                    (3) PCRE2_ERROR_UNAVAILABLE: no group is in ovector
157                    (4) PCRE2_ERROR_UNSET: all named groups in ovector are unset
158 */
159 
160 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_substring_get_byname(pcre2_match_data * match_data,PCRE2_SPTR stringname,PCRE2_UCHAR ** stringptr,PCRE2_SIZE * sizeptr)161 pcre2_substring_get_byname(pcre2_match_data *match_data,
162   PCRE2_SPTR stringname, PCRE2_UCHAR **stringptr, PCRE2_SIZE *sizeptr)
163 {
164 PCRE2_SPTR first, last, entry;
165 int failrc, entrysize;
166 if (match_data->matchedby == PCRE2_MATCHEDBY_DFA_INTERPRETER)
167   return PCRE2_ERROR_DFA_UFUNC;
168 entrysize = pcre2_substring_nametable_scan(match_data->code, stringname,
169   &first, &last);
170 if (entrysize < 0) return entrysize;
171 failrc = PCRE2_ERROR_UNAVAILABLE;
172 for (entry = first; entry <= last; entry += entrysize)
173   {
174   uint32_t n = GET2(entry, 0);
175   if (n < match_data->oveccount)
176     {
177     if (match_data->ovector[n*2] != PCRE2_UNSET)
178       return pcre2_substring_get_bynumber(match_data, n, stringptr, sizeptr);
179     failrc = PCRE2_ERROR_UNSET;
180     }
181   }
182 return failrc;
183 }
184 
185 
186 
187 /*************************************************
188 *      Extract captured string to new memory     *
189 *************************************************/
190 
191 /* This function copies a single captured substring into a piece of new
192 memory.
193 
194 Arguments:
195   match_data     points to match data
196   stringnumber   the number of the required substring
197   stringptr      where to put a pointer to the new memory
198   sizeptr        where to put the size of the substring
199 
200 Returns:         if successful: 0
201                  if not successful, a negative error code:
202                    PCRE2_ERROR_NOMEMORY: failed to get memory
203                    PCRE2_ERROR_NOSUBSTRING: no such substring
204                    PCRE2_ERROR_UNAVAILABLE: ovector too small
205                    PCRE2_ERROR_UNSET: substring is not set
206 */
207 
208 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_substring_get_bynumber(pcre2_match_data * match_data,uint32_t stringnumber,PCRE2_UCHAR ** stringptr,PCRE2_SIZE * sizeptr)209 pcre2_substring_get_bynumber(pcre2_match_data *match_data,
210   uint32_t stringnumber, PCRE2_UCHAR **stringptr, PCRE2_SIZE *sizeptr)
211 {
212 int rc;
213 PCRE2_SIZE size;
214 PCRE2_UCHAR *yield;
215 rc = pcre2_substring_length_bynumber(match_data, stringnumber, &size);
216 if (rc < 0) return rc;
217 yield = PRIV(memctl_malloc)(sizeof(pcre2_memctl) +
218   (size + 1)*PCRE2_CODE_UNIT_WIDTH, (pcre2_memctl *)match_data);
219 if (yield == NULL) return PCRE2_ERROR_NOMEMORY;
220 yield = (PCRE2_UCHAR *)(((char *)yield) + sizeof(pcre2_memctl));
221 memcpy(yield, match_data->subject + match_data->ovector[stringnumber*2],
222   CU2BYTES(size));
223 yield[size] = 0;
224 *stringptr = yield;
225 *sizeptr = size;
226 return 0;
227 }
228 
229 
230 
231 /*************************************************
232 *       Free memory obtained by get_substring    *
233 *************************************************/
234 
235 /*
236 Argument:     the result of a previous pcre2_substring_get_byxxx()
237 Returns:      nothing
238 */
239 
240 PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_substring_free(PCRE2_UCHAR * string)241 pcre2_substring_free(PCRE2_UCHAR *string)
242 {
243 if (string != NULL)
244   {
245   pcre2_memctl *memctl = (pcre2_memctl *)((char *)string - sizeof(pcre2_memctl));
246   memctl->free(memctl, memctl->memory_data);
247   }
248 }
249 
250 
251 
252 /*************************************************
253 *         Get length of a named substring        *
254 *************************************************/
255 
256 /* This function returns the length of a named captured substring. If the regex
257 permits duplicate names, the first substring that is set is chosen.
258 
259 Arguments:
260   match_data      pointer to match data
261   stringname      the name of the required substring
262   sizeptr         where to put the length
263 
264 Returns:          0 if successful, else a negative error number
265 */
266 
267 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_substring_length_byname(pcre2_match_data * match_data,PCRE2_SPTR stringname,PCRE2_SIZE * sizeptr)268 pcre2_substring_length_byname(pcre2_match_data *match_data,
269   PCRE2_SPTR stringname, PCRE2_SIZE *sizeptr)
270 {
271 PCRE2_SPTR first, last, entry;
272 int failrc, entrysize;
273 if (match_data->matchedby == PCRE2_MATCHEDBY_DFA_INTERPRETER)
274   return PCRE2_ERROR_DFA_UFUNC;
275 entrysize = pcre2_substring_nametable_scan(match_data->code, stringname,
276   &first, &last);
277 if (entrysize < 0) return entrysize;
278 failrc = PCRE2_ERROR_UNAVAILABLE;
279 for (entry = first; entry <= last; entry += entrysize)
280   {
281   uint32_t n = GET2(entry, 0);
282   if (n < match_data->oveccount)
283     {
284     if (match_data->ovector[n*2] != PCRE2_UNSET)
285       return pcre2_substring_length_bynumber(match_data, n, sizeptr);
286     failrc = PCRE2_ERROR_UNSET;
287     }
288   }
289 return failrc;
290 }
291 
292 
293 
294 /*************************************************
295 *        Get length of a numbered substring      *
296 *************************************************/
297 
298 /* This function returns the length of a captured substring. If the start is
299 beyond the end (which can happen when \K is used in an assertion), it sets the
300 length to zero.
301 
302 Arguments:
303   match_data      pointer to match data
304   stringnumber    the number of the required substring
305   sizeptr         where to put the length, if not NULL
306 
307 Returns:         if successful: 0
308                  if not successful, a negative error code:
309                    PCRE2_ERROR_NOSUBSTRING: no such substring
310                    PCRE2_ERROR_UNAVAILABLE: ovector is too small
311                    PCRE2_ERROR_UNSET: substring is not set
312                    PCRE2_ERROR_INVALIDOFFSET: internal error, should not occur
313 */
314 
315 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_substring_length_bynumber(pcre2_match_data * match_data,uint32_t stringnumber,PCRE2_SIZE * sizeptr)316 pcre2_substring_length_bynumber(pcre2_match_data *match_data,
317   uint32_t stringnumber, PCRE2_SIZE *sizeptr)
318 {
319 PCRE2_SIZE left, right;
320 int count = match_data->rc;
321 if (count == PCRE2_ERROR_PARTIAL)
322   {
323   if (stringnumber > 0) return PCRE2_ERROR_PARTIAL;
324   count = 0;
325   }
326 else if (count < 0) return count;            /* Match failed */
327 
328 if (match_data->matchedby != PCRE2_MATCHEDBY_DFA_INTERPRETER)
329   {
330   if (stringnumber > match_data->code->top_bracket)
331     return PCRE2_ERROR_NOSUBSTRING;
332   if (stringnumber >= match_data->oveccount)
333     return PCRE2_ERROR_UNAVAILABLE;
334   if (match_data->ovector[stringnumber*2] == PCRE2_UNSET)
335     return PCRE2_ERROR_UNSET;
336   }
337 else  /* Matched using pcre2_dfa_match() */
338   {
339   if (stringnumber >= match_data->oveccount) return PCRE2_ERROR_UNAVAILABLE;
340   if (count != 0 && stringnumber >= (uint32_t)count) return PCRE2_ERROR_UNSET;
341   }
342 
343 left = match_data->ovector[stringnumber*2];
344 right = match_data->ovector[stringnumber*2+1];
345 if (left > match_data->subject_length || right > match_data->subject_length)
346   return PCRE2_ERROR_INVALIDOFFSET;
347 if (sizeptr != NULL) *sizeptr = (left > right)? 0 : right - left;
348 return 0;
349 }
350 
351 
352 
353 /*************************************************
354 *    Extract all captured strings to new memory  *
355 *************************************************/
356 
357 /* This function gets one chunk of memory and builds a list of pointers and all
358 the captured substrings in it. A NULL pointer is put on the end of the list.
359 The substrings are zero-terminated, but also, if the final argument is
360 non-NULL, a list of lengths is also returned. This allows binary data to be
361 handled.
362 
363 Arguments:
364   match_data     points to the match data
365   listptr        set to point to the list of pointers
366   lengthsptr     set to point to the list of lengths (may be NULL)
367 
368 Returns:         if successful: 0
369                  if not successful, a negative error code:
370                    PCRE2_ERROR_NOMEMORY: failed to get memory,
371                    or a match failure code
372 */
373 
374 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_substring_list_get(pcre2_match_data * match_data,PCRE2_UCHAR *** listptr,PCRE2_SIZE ** lengthsptr)375 pcre2_substring_list_get(pcre2_match_data *match_data, PCRE2_UCHAR ***listptr,
376   PCRE2_SIZE **lengthsptr)
377 {
378 int i, count, count2;
379 PCRE2_SIZE size;
380 PCRE2_SIZE *lensp;
381 pcre2_memctl *memp;
382 PCRE2_UCHAR **listp;
383 PCRE2_UCHAR *sp;
384 PCRE2_SIZE *ovector;
385 
386 if ((count = match_data->rc) < 0) return count;   /* Match failed */
387 if (count == 0) count = match_data->oveccount;    /* Ovector too small */
388 
389 count2 = 2*count;
390 ovector = match_data->ovector;
391 size = sizeof(pcre2_memctl) + sizeof(PCRE2_UCHAR *);      /* For final NULL */
392 if (lengthsptr != NULL) size += sizeof(PCRE2_SIZE) * count;  /* For lengths */
393 
394 for (i = 0; i < count2; i += 2)
395   {
396   size += sizeof(PCRE2_UCHAR *) + CU2BYTES(1);
397   if (ovector[i+1] > ovector[i]) size += CU2BYTES(ovector[i+1] - ovector[i]);
398   }
399 
400 memp = PRIV(memctl_malloc)(size, (pcre2_memctl *)match_data);
401 if (memp == NULL) return PCRE2_ERROR_NOMEMORY;
402 
403 *listptr = listp = (PCRE2_UCHAR **)((char *)memp + sizeof(pcre2_memctl));
404 lensp = (PCRE2_SIZE *)((char *)listp + sizeof(PCRE2_UCHAR *) * (count + 1));
405 
406 if (lengthsptr == NULL)
407   {
408   sp = (PCRE2_UCHAR *)lensp;
409   lensp = NULL;
410   }
411 else
412   {
413   *lengthsptr = lensp;
414   sp = (PCRE2_UCHAR *)((char *)lensp + sizeof(PCRE2_SIZE) * count);
415   }
416 
417 for (i = 0; i < count2; i += 2)
418   {
419   size = (ovector[i+1] > ovector[i])? (ovector[i+1] - ovector[i]) : 0;
420 
421   /* Size == 0 includes the case when the capture is unset. Avoid adding
422   PCRE2_UNSET to match_data->subject because it overflows, even though with
423   zero size calling memcpy() is harmless. */
424 
425   if (size != 0) memcpy(sp, match_data->subject + ovector[i], CU2BYTES(size));
426   *listp++ = sp;
427   if (lensp != NULL) *lensp++ = size;
428   sp += size;
429   *sp++ = 0;
430   }
431 
432 *listp = NULL;
433 return 0;
434 }
435 
436 
437 
438 /*************************************************
439 *   Free memory obtained by substring_list_get   *
440 *************************************************/
441 
442 /*
443 Argument:     the result of a previous pcre2_substring_list_get()
444 Returns:      nothing
445 */
446 
447 PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_substring_list_free(PCRE2_UCHAR ** list)448 pcre2_substring_list_free(PCRE2_UCHAR **list)
449 {
450 if (list != NULL)
451   {
452   pcre2_memctl *memctl = (pcre2_memctl *)((char *)list - sizeof(pcre2_memctl));
453   memctl->free(memctl, memctl->memory_data);
454   }
455 }
456 
457 
458 
459 /*************************************************
460 *     Find (multiple) entries for named string   *
461 *************************************************/
462 
463 /* This function scans the nametable for a given name, using binary chop. It
464 returns either two pointers to the entries in the table, or, if no pointers are
465 given, the number of a unique group with the given name. If duplicate names are
466 permitted, and the name is not unique, an error is generated.
467 
468 Arguments:
469   code        the compiled regex
470   stringname  the name whose entries required
471   firstptr    where to put the pointer to the first entry
472   lastptr     where to put the pointer to the last entry
473 
474 Returns:      PCRE2_ERROR_NOSUBSTRING if the name is not found
475               otherwise, if firstptr and lastptr are NULL:
476                 a group number for a unique substring
477                 else PCRE2_ERROR_NOUNIQUESUBSTRING
478               otherwise:
479                 the length of each entry, having set firstptr and lastptr
480 */
481 
482 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_substring_nametable_scan(const pcre2_code * code,PCRE2_SPTR stringname,PCRE2_SPTR * firstptr,PCRE2_SPTR * lastptr)483 pcre2_substring_nametable_scan(const pcre2_code *code, PCRE2_SPTR stringname,
484   PCRE2_SPTR *firstptr, PCRE2_SPTR *lastptr)
485 {
486 uint16_t bot = 0;
487 uint16_t top = code->name_count;
488 uint16_t entrysize = code->name_entry_size;
489 PCRE2_SPTR nametable = (PCRE2_SPTR)((char *)code + sizeof(pcre2_real_code));
490 
491 while (top > bot)
492   {
493   uint16_t mid = (top + bot) / 2;
494   PCRE2_SPTR entry = nametable + entrysize*mid;
495   int c = PRIV(strcmp)(stringname, entry + IMM2_SIZE);
496   if (c == 0)
497     {
498     PCRE2_SPTR first;
499     PCRE2_SPTR last;
500     PCRE2_SPTR lastentry;
501     lastentry = nametable + entrysize * (code->name_count - 1);
502     first = last = entry;
503     while (first > nametable)
504       {
505       if (PRIV(strcmp)(stringname, (first - entrysize + IMM2_SIZE)) != 0) break;
506       first -= entrysize;
507       }
508     while (last < lastentry)
509       {
510       if (PRIV(strcmp)(stringname, (last + entrysize + IMM2_SIZE)) != 0) break;
511       last += entrysize;
512       }
513     if (firstptr == NULL) return (first == last)?
514       (int)GET2(entry, 0) : PCRE2_ERROR_NOUNIQUESUBSTRING;
515     *firstptr = first;
516     *lastptr = last;
517     return entrysize;
518     }
519   if (c > 0) bot = mid + 1; else top = mid;
520   }
521 
522 return PCRE2_ERROR_NOSUBSTRING;
523 }
524 
525 
526 /*************************************************
527 *           Find number for named string         *
528 *************************************************/
529 
530 /* This function is a convenience wrapper for pcre2_substring_nametable_scan()
531 when it is known that names are unique. If there are duplicate names, it is not
532 defined which number is returned.
533 
534 Arguments:
535   code        the compiled regex
536   stringname  the name whose number is required
537 
538 Returns:      the number of the named parenthesis, or a negative number
539                 PCRE2_ERROR_NOSUBSTRING if not found
540                 PCRE2_ERROR_NOUNIQUESUBSTRING if not unique
541 */
542 
543 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_substring_number_from_name(const pcre2_code * code,PCRE2_SPTR stringname)544 pcre2_substring_number_from_name(const pcre2_code *code,
545   PCRE2_SPTR stringname)
546 {
547 return pcre2_substring_nametable_scan(code, stringname, NULL, NULL);
548 }
549 
550 /* End of pcre2_substring.c */
551