xref: /php-src/ext/pcre/pcre2lib/pcre2_convert.c (revision ae5beff6)
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10           New API code Copyright (c) 2016-2022 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45 
46 #include "pcre2_internal.h"
47 
/* The pattern-type option bits. The external conversion function insists that
exactly one of these is set in the options word. */

#define TYPE_OPTIONS \
  (PCRE2_CONVERT_GLOB|PCRE2_CONVERT_POSIX_BASIC|PCRE2_CONVERT_POSIX_EXTENDED)

/* Every option bit that the conversion function accepts; a bit outside this
mask causes the request to be rejected as a bad option. */

#define ALL_OPTIONS \
  (PCRE2_CONVERT_UTF|PCRE2_CONVERT_NO_UTF_CHECK| \
   PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR|PCRE2_CONVERT_GLOB_NO_STARSTAR| \
   TYPE_OPTIONS)

/* Number of code units in the on-stack buffer that the external conversion
function declares and uses as its initial output buffer. */

#define DUMMY_BUFFER_SIZE 100
57 
/* Generated pattern fragments, built by concatenating the STR_xxx string
macros so that they follow whatever character coding those macros use. */

#define STR_BACKSLASH_A  STR_BACKSLASH STR_A
#define STR_BACKSLASH_z  STR_BACKSLASH STR_z
#define STR_COLON_RIGHT_SQUARE_BRACKET  STR_COLON STR_RIGHT_SQUARE_BRACKET
#define STR_DOT_STAR_LOOKBEHIND \
  STR_DOT STR_ASTERISK STR_LEFT_PARENTHESIS STR_QUESTION_MARK \
  STR_LESS_THAN_SIGN STR_EQUALS_SIGN
#define STR_LOOKAHEAD_NOT_DOT \
  STR_LEFT_PARENTHESIS STR_QUESTION_MARK STR_EXCLAMATION_MARK \
  STR_BACKSLASH STR_DOT STR_RIGHT_PARENTHESIS
#define STR_QUERY_s \
  STR_LEFT_PARENTHESIS STR_QUESTION_MARK STR_s STR_RIGHT_PARENTHESIS
#define STR_STAR_NUL \
  STR_LEFT_PARENTHESIS STR_ASTERISK STR_N STR_U STR_L STR_RIGHT_PARENTHESIS

/* States for POSIX processing. The numerical order matters: convert_posix()
treats any state >= POSIX_CLASS_NOT_STARTED as "inside a bracket expression",
and any state < POSIX_NOT_BRACKET as "at the start of the regex or just after
an anchor". Do not reorder. */

enum {
  POSIX_START_REGEX,
  POSIX_ANCHORED,
  POSIX_NOT_BRACKET,
  POSIX_CLASS_NOT_STARTED,
  POSIX_CLASS_STARTING,
  POSIX_CLASS_STARTED
};
72 
/* Macro to append a zero-terminated character string to the output buffer,
one code unit at a time, checking for overflow before each store.

The macro is not self-contained. It relies on three variables of the calling
function: "s" (a char * scratch pointer, left pointing at the terminating
zero), "p" (the current output position, advanced by the macro) and "endp"
(the first position that must not be written). On overflow it executes a
"return PCRE2_ERROR_NOMEMORY" in the calling function, so it can only be used
inside a function that returns an int error code. */

#define PUTCHARS(string) \
  { \
  for (s = (char *)(string); *s != 0; s++) \
    { \
    if (p >= endp) return PCRE2_ERROR_NOMEMORY; \
    *p++ = *s; \
    } \
  }
83 
84 /* Literals that must be escaped: \ ? * + | . ^ $ { } [ ] ( ) */
85 
86 static const char *pcre2_escaped_literals =
87   STR_BACKSLASH STR_QUESTION_MARK STR_ASTERISK STR_PLUS
88   STR_VERTICAL_LINE STR_DOT STR_CIRCUMFLEX_ACCENT STR_DOLLAR_SIGN
89   STR_LEFT_CURLY_BRACKET STR_RIGHT_CURLY_BRACKET
90   STR_LEFT_SQUARE_BRACKET STR_RIGHT_SQUARE_BRACKET
91   STR_LEFT_PARENTHESIS STR_RIGHT_PARENTHESIS;
92 
93 /* Recognized escaped metacharacters in POSIX basic patterns. */
94 
95 static const char *posix_meta_escapes =
96   STR_LEFT_PARENTHESIS STR_RIGHT_PARENTHESIS
97   STR_LEFT_CURLY_BRACKET STR_RIGHT_CURLY_BRACKET
98   STR_1 STR_2 STR_3 STR_4 STR_5 STR_6 STR_7 STR_8 STR_9;
99 
100 
101 
102 /*************************************************
103 *           Convert a POSIX pattern              *
104 *************************************************/
105 
106 /* This function handles both basic and extended POSIX patterns.
107 
108 Arguments:
109   pattype        the pattern type
110   pattern        the pattern
111   plength        length in code units
112   utf            TRUE if UTF
113   use_buffer     where to put the output
114   use_length     length of use_buffer
115   bufflenptr     where to put the used length
116   dummyrun       TRUE if a dummy run
117   ccontext       the convert context
118 
119 Returns:         0 => success
120                 !0 => error code
121 */
122 
123 static int
convert_posix(uint32_t pattype,PCRE2_SPTR pattern,PCRE2_SIZE plength,BOOL utf,PCRE2_UCHAR * use_buffer,PCRE2_SIZE use_length,PCRE2_SIZE * bufflenptr,BOOL dummyrun,pcre2_convert_context * ccontext)124 convert_posix(uint32_t pattype, PCRE2_SPTR pattern, PCRE2_SIZE plength,
125   BOOL utf, PCRE2_UCHAR *use_buffer, PCRE2_SIZE use_length,
126   PCRE2_SIZE *bufflenptr, BOOL dummyrun, pcre2_convert_context *ccontext)
127 {
128 char *s;
129 PCRE2_SPTR posix = pattern;
130 PCRE2_UCHAR *p = use_buffer;
131 PCRE2_UCHAR *pp = p;
132 PCRE2_UCHAR *endp = p + use_length - 1;  /* Allow for trailing zero */
133 PCRE2_SIZE convlength = 0;
134 
135 uint32_t bracount = 0;
136 uint32_t posix_state = POSIX_START_REGEX;
137 uint32_t lastspecial = 0;
138 BOOL extended = (pattype & PCRE2_CONVERT_POSIX_EXTENDED) != 0;
139 BOOL nextisliteral = FALSE;
140 
141 (void)utf;       /* Not used when Unicode not supported */
142 (void)ccontext;  /* Not currently used */
143 
144 /* Initialize default for error offset as end of input. */
145 
146 *bufflenptr = plength;
147 PUTCHARS(STR_STAR_NUL);
148 
149 /* Now scan the input. */
150 
151 while (plength > 0)
152   {
153   uint32_t c, sc;
154   int clength = 1;
155 
156   /* Add in the length of the last item, then, if in the dummy run, pull the
157   pointer back to the start of the (temporary) buffer and then remember the
158   start of the next item. */
159 
160   convlength += p - pp;
161   if (dummyrun) p = use_buffer;
162   pp = p;
163 
164   /* Pick up the next character */
165 
166 #ifndef SUPPORT_UNICODE
167   c = *posix;
168 #else
169   GETCHARLENTEST(c, posix, clength);
170 #endif
171   posix += clength;
172   plength -= clength;
173 
174   sc = nextisliteral? 0 : c;
175   nextisliteral = FALSE;
176 
177   /* Handle a character within a class. */
178 
179   if (posix_state >= POSIX_CLASS_NOT_STARTED)
180     {
181     if (c == CHAR_RIGHT_SQUARE_BRACKET)
182       {
183       PUTCHARS(STR_RIGHT_SQUARE_BRACKET);
184       posix_state = POSIX_NOT_BRACKET;
185       }
186 
187     /* Not the end of the class */
188 
189     else
190       {
191       switch (posix_state)
192         {
193         case POSIX_CLASS_STARTED:
194         if (c <= 127 && islower(c)) break;  /* Remain in started state */
195         posix_state = POSIX_CLASS_NOT_STARTED;
196         if (c == CHAR_COLON  && plength > 0 &&
197             *posix == CHAR_RIGHT_SQUARE_BRACKET)
198           {
199           PUTCHARS(STR_COLON_RIGHT_SQUARE_BRACKET);
200           plength--;
201           posix++;
202           continue;    /* With next character after :] */
203           }
204         /* Fall through */
205 
206         case POSIX_CLASS_NOT_STARTED:
207         if (c == CHAR_LEFT_SQUARE_BRACKET)
208           posix_state = POSIX_CLASS_STARTING;
209         break;
210 
211         case POSIX_CLASS_STARTING:
212         if (c == CHAR_COLON) posix_state = POSIX_CLASS_STARTED;
213         break;
214         }
215 
216       if (c == CHAR_BACKSLASH) PUTCHARS(STR_BACKSLASH);
217       if (p + clength > endp) return PCRE2_ERROR_NOMEMORY;
218       memcpy(p, posix - clength, CU2BYTES(clength));
219       p += clength;
220       }
221     }
222 
223   /* Handle a character not within a class. */
224 
225   else switch(sc)
226     {
227     case CHAR_LEFT_SQUARE_BRACKET:
228     PUTCHARS(STR_LEFT_SQUARE_BRACKET);
229 
230 #ifdef NEVER
231     /* We could handle special cases [[:<:]] and [[:>:]] (which PCRE does
232     support) but they are not part of POSIX 1003.1. */
233 
234     if (plength >= 6)
235       {
236       if (posix[0] == CHAR_LEFT_SQUARE_BRACKET &&
237           posix[1] == CHAR_COLON &&
238           (posix[2] == CHAR_LESS_THAN_SIGN ||
239            posix[2] == CHAR_GREATER_THAN_SIGN) &&
240           posix[3] == CHAR_COLON &&
241           posix[4] == CHAR_RIGHT_SQUARE_BRACKET &&
242           posix[5] == CHAR_RIGHT_SQUARE_BRACKET)
243         {
244         if (p + 6 > endp) return PCRE2_ERROR_NOMEMORY;
245         memcpy(p, posix, CU2BYTES(6));
246         p += 6;
247         posix += 6;
248         plength -= 6;
249         continue;  /* With next character */
250         }
251       }
252 #endif
253 
254     /* Handle start of "normal" character classes */
255 
256     posix_state = POSIX_CLASS_NOT_STARTED;
257 
258     /* Handle ^ and ] as first characters */
259 
260     if (plength > 0)
261       {
262       if (*posix == CHAR_CIRCUMFLEX_ACCENT)
263         {
264         posix++;
265         plength--;
266         PUTCHARS(STR_CIRCUMFLEX_ACCENT);
267         }
268       if (plength > 0 && *posix == CHAR_RIGHT_SQUARE_BRACKET)
269         {
270         posix++;
271         plength--;
272         PUTCHARS(STR_RIGHT_SQUARE_BRACKET);
273         }
274       }
275     break;
276 
277     case CHAR_BACKSLASH:
278     if (plength == 0) return PCRE2_ERROR_END_BACKSLASH;
279     if (extended) nextisliteral = TRUE; else
280       {
281       if (*posix < 127 && strchr(posix_meta_escapes, *posix) != NULL)
282         {
283         if (isdigit(*posix)) PUTCHARS(STR_BACKSLASH);
284         if (p + 1 > endp) return PCRE2_ERROR_NOMEMORY;
285         lastspecial = *p++ = *posix++;
286         plength--;
287         }
288       else nextisliteral = TRUE;
289       }
290     break;
291 
292     case CHAR_RIGHT_PARENTHESIS:
293     if (!extended || bracount == 0) goto ESCAPE_LITERAL;
294     bracount--;
295     goto COPY_SPECIAL;
296 
297     case CHAR_LEFT_PARENTHESIS:
298     bracount++;
299     /* Fall through */
300 
301     case CHAR_QUESTION_MARK:
302     case CHAR_PLUS:
303     case CHAR_LEFT_CURLY_BRACKET:
304     case CHAR_RIGHT_CURLY_BRACKET:
305     case CHAR_VERTICAL_LINE:
306     if (!extended) goto ESCAPE_LITERAL;
307     /* Fall through */
308 
309     case CHAR_DOT:
310     case CHAR_DOLLAR_SIGN:
311     posix_state = POSIX_NOT_BRACKET;
312     COPY_SPECIAL:
313     lastspecial = c;
314     if (p + 1 > endp) return PCRE2_ERROR_NOMEMORY;
315     *p++ = c;
316     break;
317 
318     case CHAR_ASTERISK:
319     if (lastspecial != CHAR_ASTERISK)
320       {
321       if (!extended && (posix_state < POSIX_NOT_BRACKET ||
322           lastspecial == CHAR_LEFT_PARENTHESIS))
323         goto ESCAPE_LITERAL;
324       goto COPY_SPECIAL;
325       }
326     break;   /* Ignore second and subsequent asterisks */
327 
328     case CHAR_CIRCUMFLEX_ACCENT:
329     if (extended) goto COPY_SPECIAL;
330     if (posix_state == POSIX_START_REGEX ||
331         lastspecial == CHAR_LEFT_PARENTHESIS)
332       {
333       posix_state = POSIX_ANCHORED;
334       goto COPY_SPECIAL;
335       }
336     /* Fall through */
337 
338     default:
339     if (c < 128 && strchr(pcre2_escaped_literals, c) != NULL)
340       {
341       ESCAPE_LITERAL:
342       PUTCHARS(STR_BACKSLASH);
343       }
344     lastspecial = 0xff;  /* Indicates nothing special */
345     if (p + clength > endp) return PCRE2_ERROR_NOMEMORY;
346     memcpy(p, posix - clength, CU2BYTES(clength));
347     p += clength;
348     posix_state = POSIX_NOT_BRACKET;
349     break;
350     }
351   }
352 
353 if (posix_state >= POSIX_CLASS_NOT_STARTED)
354   return PCRE2_ERROR_MISSING_SQUARE_BRACKET;
355 convlength += p - pp;        /* Final segment */
356 *bufflenptr = convlength;
357 *p++ = 0;
358 return 0;
359 }
360 
361 
362 /*************************************************
363 *           Convert a glob pattern               *
364 *************************************************/
365 
366 /* Context for writing the output into a buffer. */
367 
368 typedef struct pcre2_output_context {
369   PCRE2_UCHAR *output;                  /* current output position */
370   PCRE2_SPTR output_end;                /* output end */
371   PCRE2_SIZE output_size;               /* size of the output */
372   uint8_t out_str[8];                   /* string copied to the output */
373 } pcre2_output_context;
374 
375 
376 /* Write a character into the output.
377 
378 Arguments:
379   out            output context
380   chr            the next character
381 */
382 
383 static void
convert_glob_write(pcre2_output_context * out,PCRE2_UCHAR chr)384 convert_glob_write(pcre2_output_context *out, PCRE2_UCHAR chr)
385 {
386 out->output_size++;
387 
388 if (out->output < out->output_end)
389   *out->output++ = chr;
390 }
391 
392 
393 /* Write a string into the output.
394 
395 Arguments:
396   out            output context
397   length         length of out->out_str
398 */
399 
400 static void
convert_glob_write_str(pcre2_output_context * out,PCRE2_SIZE length)401 convert_glob_write_str(pcre2_output_context *out, PCRE2_SIZE length)
402 {
403 uint8_t *out_str = out->out_str;
404 PCRE2_UCHAR *output = out->output;
405 PCRE2_SPTR output_end = out->output_end;
406 PCRE2_SIZE output_size = out->output_size;
407 
408 do
409   {
410   output_size++;
411 
412   if (output < output_end)
413     *output++ = *out_str++;
414   }
415 while (--length != 0);
416 
417 out->output = output;
418 out->output_size = output_size;
419 }
420 
421 
422 /* Prints the separator into the output.
423 
424 Arguments:
425   out            output context
426   separator      glob separator
427   with_escape    backslash is needed before separator
428 */
429 
430 static void
convert_glob_print_separator(pcre2_output_context * out,PCRE2_UCHAR separator,BOOL with_escape)431 convert_glob_print_separator(pcre2_output_context *out,
432   PCRE2_UCHAR separator, BOOL with_escape)
433 {
434 if (with_escape)
435   convert_glob_write(out, CHAR_BACKSLASH);
436 
437 convert_glob_write(out, separator);
438 }
439 
440 
441 /* Prints a wildcard into the output.
442 
443 Arguments:
444   out            output context
445   separator      glob separator
446   with_escape    backslash is needed before separator
447 */
448 
449 static void
convert_glob_print_wildcard(pcre2_output_context * out,PCRE2_UCHAR separator,BOOL with_escape)450 convert_glob_print_wildcard(pcre2_output_context *out,
451   PCRE2_UCHAR separator, BOOL with_escape)
452 {
453 out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET;
454 out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT;
455 convert_glob_write_str(out, 2);
456 
457 convert_glob_print_separator(out, separator, with_escape);
458 
459 convert_glob_write(out, CHAR_RIGHT_SQUARE_BRACKET);
460 }
461 
462 
463 /* Parse a posix class.
464 
465 Arguments:
466   from           starting point of scanning the range
467   pattern_end    end of pattern
468   out            output context
469 
470 Returns:  >0 => class index
471           0  => malformed class
472 */
473 
474 static int
convert_glob_parse_class(PCRE2_SPTR * from,PCRE2_SPTR pattern_end,pcre2_output_context * out)475 convert_glob_parse_class(PCRE2_SPTR *from, PCRE2_SPTR pattern_end,
476   pcre2_output_context *out)
477 {
478 static const char *posix_classes = "alnum:alpha:ascii:blank:cntrl:digit:"
479   "graph:lower:print:punct:space:upper:word:xdigit:";
480 PCRE2_SPTR start = *from + 1;
481 PCRE2_SPTR pattern = start;
482 const char *class_ptr;
483 PCRE2_UCHAR c;
484 int class_index;
485 
486 while (TRUE)
487   {
488   if (pattern >= pattern_end) return 0;
489 
490   c = *pattern++;
491 
492   if (c < CHAR_a || c > CHAR_z) break;
493   }
494 
495 if (c != CHAR_COLON || pattern >= pattern_end ||
496     *pattern != CHAR_RIGHT_SQUARE_BRACKET)
497   return 0;
498 
499 class_ptr = posix_classes;
500 class_index = 1;
501 
502 while (TRUE)
503   {
504   if (*class_ptr == CHAR_NUL) return 0;
505 
506   pattern = start;
507 
508   while (*pattern == (PCRE2_UCHAR) *class_ptr)
509     {
510     if (*pattern == CHAR_COLON)
511       {
512       pattern += 2;
513       start -= 2;
514 
515       do convert_glob_write(out, *start++); while (start < pattern);
516 
517       *from = pattern;
518       return class_index;
519       }
520     pattern++;
521     class_ptr++;
522     }
523 
524   while (*class_ptr != CHAR_COLON) class_ptr++;
525   class_ptr++;
526   class_index++;
527   }
528 }
529 
530 /* Checks whether the character is in the class.
531 
532 Arguments:
533   class_index    class index
534   c              character
535 
536 Returns:   !0 => character is found in the class
537             0 => otherwise
538 */
539 
540 static BOOL
convert_glob_char_in_class(int class_index,PCRE2_UCHAR c)541 convert_glob_char_in_class(int class_index, PCRE2_UCHAR c)
542 {
543 #if PCRE2_CODE_UNIT_WIDTH != 8
544 if (c > 0xff)
545   {
546   /* ctype functions are not sane for c > 0xff */
547   return 0;
548   }
549 #endif
550 
551 switch (class_index)
552   {
553   case 1: return isalnum(c);
554   case 2: return isalpha(c);
555   case 3: return 1;
556   case 4: return c == CHAR_HT || c == CHAR_SPACE;
557   case 5: return iscntrl(c);
558   case 6: return isdigit(c);
559   case 7: return isgraph(c);
560   case 8: return islower(c);
561   case 9: return isprint(c);
562   case 10: return ispunct(c);
563   case 11: return isspace(c);
564   case 12: return isupper(c);
565   case 13: return isalnum(c) || c == CHAR_UNDERSCORE;
566   default: return isxdigit(c);
567   }
568 }
569 
570 /* Parse a range of characters.
571 
572 Arguments:
573   from           starting point of scanning the range
574   pattern_end    end of pattern
575   out            output context
576   separator      glob separator
577   with_escape    backslash is needed before separator
578 
579 Returns:         0 => success
580                 !0 => error code
581 */
582 
583 static int
convert_glob_parse_range(PCRE2_SPTR * from,PCRE2_SPTR pattern_end,pcre2_output_context * out,BOOL utf,PCRE2_UCHAR separator,BOOL with_escape,PCRE2_UCHAR escape,BOOL no_wildsep)584 convert_glob_parse_range(PCRE2_SPTR *from, PCRE2_SPTR pattern_end,
585   pcre2_output_context *out, BOOL utf, PCRE2_UCHAR separator,
586   BOOL with_escape, PCRE2_UCHAR escape, BOOL no_wildsep)
587 {
588 BOOL is_negative = FALSE;
589 BOOL separator_seen = FALSE;
590 BOOL has_prev_c;
591 PCRE2_SPTR pattern = *from;
592 PCRE2_SPTR char_start = NULL;
593 uint32_t c, prev_c;
594 int len, class_index;
595 
596 (void)utf; /* Avoid compiler warning. */
597 
598 if (pattern >= pattern_end)
599   {
600   *from = pattern;
601   return PCRE2_ERROR_MISSING_SQUARE_BRACKET;
602   }
603 
604 if (*pattern == CHAR_EXCLAMATION_MARK
605     || *pattern == CHAR_CIRCUMFLEX_ACCENT)
606   {
607   pattern++;
608 
609   if (pattern >= pattern_end)
610     {
611     *from = pattern;
612     return PCRE2_ERROR_MISSING_SQUARE_BRACKET;
613     }
614 
615   is_negative = TRUE;
616 
617   out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET;
618   out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT;
619   len = 2;
620 
621   if (!no_wildsep)
622     {
623     if (with_escape)
624       {
625       out->out_str[len] = CHAR_BACKSLASH;
626       len++;
627       }
628     out->out_str[len] = (uint8_t) separator;
629     }
630 
631   convert_glob_write_str(out, len + 1);
632   }
633 else
634   convert_glob_write(out, CHAR_LEFT_SQUARE_BRACKET);
635 
636 has_prev_c = FALSE;
637 prev_c = 0;
638 
639 if (*pattern == CHAR_RIGHT_SQUARE_BRACKET)
640   {
641   out->out_str[0] = CHAR_BACKSLASH;
642   out->out_str[1] = CHAR_RIGHT_SQUARE_BRACKET;
643   convert_glob_write_str(out, 2);
644   has_prev_c = TRUE;
645   prev_c = CHAR_RIGHT_SQUARE_BRACKET;
646   pattern++;
647   }
648 
649 while (pattern < pattern_end)
650   {
651   char_start = pattern;
652   GETCHARINCTEST(c, pattern);
653 
654   if (c == CHAR_RIGHT_SQUARE_BRACKET)
655     {
656     convert_glob_write(out, c);
657 
658     if (!is_negative && !no_wildsep && separator_seen)
659       {
660       out->out_str[0] = CHAR_LEFT_PARENTHESIS;
661       out->out_str[1] = CHAR_QUESTION_MARK;
662       out->out_str[2] = CHAR_LESS_THAN_SIGN;
663       out->out_str[3] = CHAR_EXCLAMATION_MARK;
664       convert_glob_write_str(out, 4);
665 
666       convert_glob_print_separator(out, separator, with_escape);
667       convert_glob_write(out, CHAR_RIGHT_PARENTHESIS);
668       }
669 
670     *from = pattern;
671     return 0;
672     }
673 
674   if (pattern >= pattern_end) break;
675 
676   if (c == CHAR_LEFT_SQUARE_BRACKET && *pattern == CHAR_COLON)
677     {
678     *from = pattern;
679     class_index = convert_glob_parse_class(from, pattern_end, out);
680 
681     if (class_index != 0)
682       {
683       pattern = *from;
684 
685       has_prev_c = FALSE;
686       prev_c = 0;
687 
688       if (!is_negative &&
689           convert_glob_char_in_class (class_index, separator))
690         separator_seen = TRUE;
691       continue;
692       }
693     }
694   else if (c == CHAR_MINUS && has_prev_c &&
695            *pattern != CHAR_RIGHT_SQUARE_BRACKET)
696     {
697     convert_glob_write(out, CHAR_MINUS);
698 
699     char_start = pattern;
700     GETCHARINCTEST(c, pattern);
701 
702     if (pattern >= pattern_end) break;
703 
704     if (escape != 0 && c == escape)
705       {
706       char_start = pattern;
707       GETCHARINCTEST(c, pattern);
708       }
709     else if (c == CHAR_LEFT_SQUARE_BRACKET && *pattern == CHAR_COLON)
710       {
711       *from = pattern;
712       return PCRE2_ERROR_CONVERT_SYNTAX;
713       }
714 
715     if (prev_c > c)
716       {
717       *from = pattern;
718       return PCRE2_ERROR_CONVERT_SYNTAX;
719       }
720 
721     if (prev_c < separator && separator < c) separator_seen = TRUE;
722 
723     has_prev_c = FALSE;
724     prev_c = 0;
725     }
726   else
727     {
728     if (escape != 0 && c == escape)
729       {
730       char_start = pattern;
731       GETCHARINCTEST(c, pattern);
732 
733       if (pattern >= pattern_end) break;
734       }
735 
736     has_prev_c = TRUE;
737     prev_c = c;
738     }
739 
740   if (c == CHAR_LEFT_SQUARE_BRACKET || c == CHAR_RIGHT_SQUARE_BRACKET ||
741       c == CHAR_BACKSLASH || c == CHAR_MINUS)
742     convert_glob_write(out, CHAR_BACKSLASH);
743 
744   if (c == separator) separator_seen = TRUE;
745 
746   do convert_glob_write(out, *char_start++); while (char_start < pattern);
747   }
748 
749 *from = pattern;
750 return PCRE2_ERROR_MISSING_SQUARE_BRACKET;
751 }
752 
753 
754 /* Prints a (*COMMIT) into the output.
755 
756 Arguments:
757   out            output context
758 */
759 
760 static void
convert_glob_print_commit(pcre2_output_context * out)761 convert_glob_print_commit(pcre2_output_context *out)
762 {
763 out->out_str[0] = CHAR_LEFT_PARENTHESIS;
764 out->out_str[1] = CHAR_ASTERISK;
765 out->out_str[2] = CHAR_C;
766 out->out_str[3] = CHAR_O;
767 out->out_str[4] = CHAR_M;
768 out->out_str[5] = CHAR_M;
769 out->out_str[6] = CHAR_I;
770 out->out_str[7] = CHAR_T;
771 convert_glob_write_str(out, 8);
772 convert_glob_write(out, CHAR_RIGHT_PARENTHESIS);
773 }
774 
775 
776 /* Bash glob converter.
777 
778 Arguments:
779   pattype        the pattern type
780   pattern        the pattern
781   plength        length in code units
782   utf            TRUE if UTF
783   use_buffer     where to put the output
784   use_length     length of use_buffer
785   bufflenptr     where to put the used length
786   dummyrun       TRUE if a dummy run
787   ccontext       the convert context
788 
789 Returns:         0 => success
790                 !0 => error code
791 */
792 
793 static int
convert_glob(uint32_t options,PCRE2_SPTR pattern,PCRE2_SIZE plength,BOOL utf,PCRE2_UCHAR * use_buffer,PCRE2_SIZE use_length,PCRE2_SIZE * bufflenptr,BOOL dummyrun,pcre2_convert_context * ccontext)794 convert_glob(uint32_t options, PCRE2_SPTR pattern, PCRE2_SIZE plength,
795   BOOL utf, PCRE2_UCHAR *use_buffer, PCRE2_SIZE use_length,
796   PCRE2_SIZE *bufflenptr, BOOL dummyrun, pcre2_convert_context *ccontext)
797 {
798 pcre2_output_context out;
799 PCRE2_SPTR pattern_start = pattern;
800 PCRE2_SPTR pattern_end = pattern + plength;
801 PCRE2_UCHAR separator = ccontext->glob_separator;
802 PCRE2_UCHAR escape = ccontext->glob_escape;
803 PCRE2_UCHAR c;
804 BOOL no_wildsep = (options & PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR) != 0;
805 BOOL no_starstar = (options & PCRE2_CONVERT_GLOB_NO_STARSTAR) != 0;
806 BOOL in_atomic = FALSE;
807 BOOL after_starstar = FALSE;
808 BOOL no_slash_z = FALSE;
809 BOOL with_escape, is_start, after_separator;
810 int result = 0;
811 
812 (void)utf; /* Avoid compiler warning. */
813 
814 #ifdef SUPPORT_UNICODE
815 if (utf && (separator >= 128 || escape >= 128))
816   {
817   /* Currently only ASCII characters are supported. */
818   *bufflenptr = 0;
819   return PCRE2_ERROR_CONVERT_SYNTAX;
820   }
821 #endif
822 
823 with_escape = strchr(pcre2_escaped_literals, separator) != NULL;
824 
825 /* Initialize default for error offset as end of input. */
826 out.output = use_buffer;
827 out.output_end = use_buffer + use_length;
828 out.output_size = 0;
829 
830 out.out_str[0] = CHAR_LEFT_PARENTHESIS;
831 out.out_str[1] = CHAR_QUESTION_MARK;
832 out.out_str[2] = CHAR_s;
833 out.out_str[3] = CHAR_RIGHT_PARENTHESIS;
834 convert_glob_write_str(&out, 4);
835 
836 is_start = TRUE;
837 
838 if (pattern < pattern_end && pattern[0] == CHAR_ASTERISK)
839   {
840   if (no_wildsep)
841     is_start = FALSE;
842   else if (!no_starstar && pattern + 1 < pattern_end &&
843            pattern[1] == CHAR_ASTERISK)
844     is_start = FALSE;
845   }
846 
847 if (is_start)
848   {
849   out.out_str[0] = CHAR_BACKSLASH;
850   out.out_str[1] = CHAR_A;
851   convert_glob_write_str(&out, 2);
852   }
853 
854 while (pattern < pattern_end)
855   {
856   c = *pattern++;
857 
858   if (c == CHAR_ASTERISK)
859     {
860     is_start = pattern == pattern_start + 1;
861 
862     if (in_atomic)
863       {
864       convert_glob_write(&out, CHAR_RIGHT_PARENTHESIS);
865       in_atomic = FALSE;
866       }
867 
868     if (!no_starstar && pattern < pattern_end && *pattern == CHAR_ASTERISK)
869       {
870       after_separator = is_start || (pattern[-2] == separator);
871 
872       do pattern++; while (pattern < pattern_end &&
873                            *pattern == CHAR_ASTERISK);
874 
875       if (pattern >= pattern_end)
876         {
877         no_slash_z = TRUE;
878         break;
879         }
880 
881       after_starstar = TRUE;
882 
883       if (after_separator && escape != 0 && *pattern == escape &&
884           pattern + 1 < pattern_end && pattern[1] == separator)
885         pattern++;
886 
887       if (is_start)
888         {
889         if (*pattern != separator) continue;
890 
891         out.out_str[0] = CHAR_LEFT_PARENTHESIS;
892         out.out_str[1] = CHAR_QUESTION_MARK;
893         out.out_str[2] = CHAR_COLON;
894         out.out_str[3] = CHAR_BACKSLASH;
895         out.out_str[4] = CHAR_A;
896         out.out_str[5] = CHAR_VERTICAL_LINE;
897         convert_glob_write_str(&out, 6);
898 
899         convert_glob_print_separator(&out, separator, with_escape);
900         convert_glob_write(&out, CHAR_RIGHT_PARENTHESIS);
901 
902         pattern++;
903         continue;
904         }
905 
906       convert_glob_print_commit(&out);
907 
908       if (!after_separator || *pattern != separator)
909         {
910         out.out_str[0] = CHAR_DOT;
911         out.out_str[1] = CHAR_ASTERISK;
912         out.out_str[2] = CHAR_QUESTION_MARK;
913         convert_glob_write_str(&out, 3);
914         continue;
915         }
916 
917       out.out_str[0] = CHAR_LEFT_PARENTHESIS;
918       out.out_str[1] = CHAR_QUESTION_MARK;
919       out.out_str[2] = CHAR_COLON;
920       out.out_str[3] = CHAR_DOT;
921       out.out_str[4] = CHAR_ASTERISK;
922       out.out_str[5] = CHAR_QUESTION_MARK;
923 
924       convert_glob_write_str(&out, 6);
925 
926       convert_glob_print_separator(&out, separator, with_escape);
927 
928       out.out_str[0] = CHAR_RIGHT_PARENTHESIS;
929       out.out_str[1] = CHAR_QUESTION_MARK;
930       out.out_str[2] = CHAR_QUESTION_MARK;
931       convert_glob_write_str(&out, 3);
932 
933       pattern++;
934       continue;
935       }
936 
937     if (pattern < pattern_end && *pattern == CHAR_ASTERISK)
938       {
939       do pattern++; while (pattern < pattern_end &&
940                            *pattern == CHAR_ASTERISK);
941       }
942 
943     if (no_wildsep)
944       {
945       if (pattern >= pattern_end)
946         {
947         no_slash_z = TRUE;
948         break;
949         }
950 
951       /* Start check must be after the end check. */
952       if (is_start) continue;
953       }
954 
955     if (!is_start)
956       {
957       if (after_starstar)
958         {
959         out.out_str[0] = CHAR_LEFT_PARENTHESIS;
960         out.out_str[1] = CHAR_QUESTION_MARK;
961         out.out_str[2] = CHAR_GREATER_THAN_SIGN;
962         convert_glob_write_str(&out, 3);
963         in_atomic = TRUE;
964         }
965       else
966         convert_glob_print_commit(&out);
967       }
968 
969     if (no_wildsep)
970       convert_glob_write(&out, CHAR_DOT);
971     else
972       convert_glob_print_wildcard(&out, separator, with_escape);
973 
974     out.out_str[0] = CHAR_ASTERISK;
975     out.out_str[1] = CHAR_QUESTION_MARK;
976     if (pattern >= pattern_end)
977       out.out_str[1] = CHAR_PLUS;
978     convert_glob_write_str(&out, 2);
979     continue;
980     }
981 
982   if (c == CHAR_QUESTION_MARK)
983     {
984     if (no_wildsep)
985       convert_glob_write(&out, CHAR_DOT);
986     else
987       convert_glob_print_wildcard(&out, separator, with_escape);
988     continue;
989     }
990 
991   if (c == CHAR_LEFT_SQUARE_BRACKET)
992     {
993     result = convert_glob_parse_range(&pattern, pattern_end,
994       &out, utf, separator, with_escape, escape, no_wildsep);
995     if (result != 0) break;
996     continue;
997     }
998 
999   if (escape != 0 && c == escape)
1000     {
1001     if (pattern >= pattern_end)
1002       {
1003       result = PCRE2_ERROR_CONVERT_SYNTAX;
1004       break;
1005       }
1006     c = *pattern++;
1007     }
1008 
1009   if (c < 128 && strchr(pcre2_escaped_literals, c) != NULL)
1010     convert_glob_write(&out, CHAR_BACKSLASH);
1011 
1012   convert_glob_write(&out, c);
1013   }
1014 
1015 if (result == 0)
1016   {
1017   if (!no_slash_z)
1018     {
1019     out.out_str[0] = CHAR_BACKSLASH;
1020     out.out_str[1] = CHAR_z;
1021     convert_glob_write_str(&out, 2);
1022     }
1023 
1024   if (in_atomic)
1025     convert_glob_write(&out, CHAR_RIGHT_PARENTHESIS);
1026 
1027   convert_glob_write(&out, CHAR_NUL);
1028 
1029   if (!dummyrun && out.output_size != (PCRE2_SIZE) (out.output - use_buffer))
1030     result = PCRE2_ERROR_NOMEMORY;
1031   }
1032 
1033 if (result != 0)
1034   {
1035   *bufflenptr = pattern - pattern_start;
1036   return result;
1037   }
1038 
1039 *bufflenptr = out.output_size - 1;
1040 return 0;
1041 }
1042 
1043 
1044 /*************************************************
1045 *                Convert pattern                 *
1046 *************************************************/
1047 
1048 /* This is the external-facing function for converting other forms of pattern
1049 into PCRE2 regular expression patterns. On error, the bufflenptr argument is
1050 used to return an offset in the original pattern.
1051 
1052 Arguments:
1053   pattern     the input pattern
1054   plength     length of input, or PCRE2_ZERO_TERMINATED
1055   options     options bits
1056   buffptr     pointer to pointer to output buffer
1057   bufflenptr  pointer to length of output buffer
1058   ccontext    convert context or NULL
1059 
1060 Returns:      0 for success, else an error code (+ve or -ve)
1061 */
1062 
1063 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_pattern_convert(PCRE2_SPTR pattern,PCRE2_SIZE plength,uint32_t options,PCRE2_UCHAR ** buffptr,PCRE2_SIZE * bufflenptr,pcre2_convert_context * ccontext)1064 pcre2_pattern_convert(PCRE2_SPTR pattern, PCRE2_SIZE plength, uint32_t options,
1065   PCRE2_UCHAR **buffptr, PCRE2_SIZE *bufflenptr,
1066   pcre2_convert_context *ccontext)
1067 {
1068 int i, rc;
1069 PCRE2_UCHAR dummy_buffer[DUMMY_BUFFER_SIZE];
1070 PCRE2_UCHAR *use_buffer = dummy_buffer;
1071 PCRE2_SIZE use_length = DUMMY_BUFFER_SIZE;
1072 BOOL utf = (options & PCRE2_CONVERT_UTF) != 0;
1073 uint32_t pattype = options & TYPE_OPTIONS;
1074 
1075 if (pattern == NULL || bufflenptr == NULL) return PCRE2_ERROR_NULL;
1076 
1077 if ((options & ~ALL_OPTIONS) != 0 ||        /* Undefined bit set */
1078     (pattype & (~pattype+1)) != pattype ||  /* More than one type set */
1079     pattype == 0)                           /* No type set */
1080   {
1081   *bufflenptr = 0;                          /* Error offset */
1082   return PCRE2_ERROR_BADOPTION;
1083   }
1084 
1085 if (plength == PCRE2_ZERO_TERMINATED) plength = PRIV(strlen)(pattern);
1086 if (ccontext == NULL) ccontext =
1087   (pcre2_convert_context *)(&PRIV(default_convert_context));
1088 
1089 /* Check UTF if required. */
1090 
1091 #ifndef SUPPORT_UNICODE
1092 if (utf)
1093   {
1094   *bufflenptr = 0;  /* Error offset */
1095   return PCRE2_ERROR_UNICODE_NOT_SUPPORTED;
1096   }
1097 #else
1098 if (utf && (options & PCRE2_CONVERT_NO_UTF_CHECK) == 0)
1099   {
1100   PCRE2_SIZE erroroffset;
1101   rc = PRIV(valid_utf)(pattern, plength, &erroroffset);
1102   if (rc != 0)
1103     {
1104     *bufflenptr = erroroffset;
1105     return rc;
1106     }
1107   }
1108 #endif
1109 
1110 /* If buffptr is not NULL, and what it points to is not NULL, we are being
1111 provided with a buffer and a length, so set them as the buffer to use. */
1112 
1113 if (buffptr != NULL && *buffptr != NULL)
1114   {
1115   use_buffer = *buffptr;
1116   use_length = *bufflenptr;
1117   }
1118 
1119 /* Call an individual converter, either just once (if a buffer was provided or
1120 just the length is needed), or twice (if a memory allocation is required). */
1121 
1122 for (i = 0; i < 2; i++)
1123   {
1124   PCRE2_UCHAR *allocated;
1125   BOOL dummyrun = buffptr == NULL || *buffptr == NULL;
1126 
1127   switch(pattype)
1128     {
1129     case PCRE2_CONVERT_GLOB:
1130     rc = convert_glob(options & ~PCRE2_CONVERT_GLOB, pattern, plength, utf,
1131       use_buffer, use_length, bufflenptr, dummyrun, ccontext);
1132     break;
1133 
1134     case PCRE2_CONVERT_POSIX_BASIC:
1135     case PCRE2_CONVERT_POSIX_EXTENDED:
1136     rc = convert_posix(pattype, pattern, plength, utf, use_buffer, use_length,
1137       bufflenptr, dummyrun, ccontext);
1138     break;
1139 
1140     default:
1141     *bufflenptr = 0;  /* Error offset */
1142     return PCRE2_ERROR_INTERNAL;
1143     }
1144 
1145   if (rc != 0 ||           /* Error */
1146       buffptr == NULL ||   /* Just the length is required */
1147       *buffptr != NULL)    /* Buffer was provided or allocated */
1148     return rc;
1149 
1150   /* Allocate memory for the buffer, with hidden space for an allocator at
1151   the start. The next time round the loop runs the conversion for real. */
1152 
1153   allocated = PRIV(memctl_malloc)(sizeof(pcre2_memctl) +
1154     (*bufflenptr + 1)*PCRE2_CODE_UNIT_WIDTH, (pcre2_memctl *)ccontext);
1155   if (allocated == NULL) return PCRE2_ERROR_NOMEMORY;
1156   *buffptr = (PCRE2_UCHAR *)(((char *)allocated) + sizeof(pcre2_memctl));
1157 
1158   use_buffer = *buffptr;
1159   use_length = *bufflenptr + 1;
1160   }
1161 
1162 /* Control should never get here. */
1163 
1164 return PCRE2_ERROR_INTERNAL;
1165 }
1166 
1167 
1168 /*************************************************
1169 *            Free converted pattern              *
1170 *************************************************/
1171 
1172 /* This frees a converted pattern that was put in newly-allocated memory.
1173 
1174 Argument:   the converted pattern
1175 Returns:    nothing
1176 */
1177 
1178 PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_converted_pattern_free(PCRE2_UCHAR * converted)1179 pcre2_converted_pattern_free(PCRE2_UCHAR *converted)
1180 {
1181 if (converted != NULL)
1182   {
1183   pcre2_memctl *memctl =
1184     (pcre2_memctl *)((char *)converted - sizeof(pcre2_memctl));
1185   memctl->free(memctl, memctl->memory_data);
1186   }
1187 }
1188 
1189 /* End of pcre2_convert.c */
1190