xref: /PHP-8.2/ext/pcre/pcre2lib/pcre2_convert.c (revision c4e8f652)
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10           New API code Copyright (c) 2016-2022 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45 
46 #include "pcre2_internal.h"
47 
/* The pattern-type bits. The caller must select exactly one of them. */

#define TYPE_OPTIONS (PCRE2_CONVERT_GLOB| \
  PCRE2_CONVERT_POSIX_BASIC|PCRE2_CONVERT_POSIX_EXTENDED)

/* Every option bit that is accepted; any other bit is rejected as a bad
option. */

#define ALL_OPTIONS (PCRE2_CONVERT_UTF|PCRE2_CONVERT_NO_UTF_CHECK| \
  PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR| \
  PCRE2_CONVERT_GLOB_NO_STARSTAR| \
  TYPE_OPTIONS)

/* Size, in code units, of the on-stack scratch buffer that is used as the
initial output buffer. */

#define DUMMY_BUFFER_SIZE 100

/* Generated pattern fragments */

#define STR_BACKSLASH_A STR_BACKSLASH STR_A
#define STR_BACKSLASH_z STR_BACKSLASH STR_z
#define STR_COLON_RIGHT_SQUARE_BRACKET STR_COLON STR_RIGHT_SQUARE_BRACKET
#define STR_DOT_STAR_LOOKBEHIND STR_DOT STR_ASTERISK STR_LEFT_PARENTHESIS STR_QUESTION_MARK STR_LESS_THAN_SIGN STR_EQUALS_SIGN
#define STR_LOOKAHEAD_NOT_DOT STR_LEFT_PARENTHESIS STR_QUESTION_MARK STR_EXCLAMATION_MARK STR_BACKSLASH STR_DOT STR_RIGHT_PARENTHESIS
#define STR_QUERY_s STR_LEFT_PARENTHESIS STR_QUESTION_MARK STR_s STR_RIGHT_PARENTHESIS
#define STR_STAR_NUL STR_LEFT_PARENTHESIS STR_ASTERISK STR_N STR_U STR_L STR_RIGHT_PARENTHESIS

/* States for POSIX processing. The numerical order matters: every state from
POSIX_CLASS_NOT_STARTED upwards means "inside a bracket expression", and the
states below POSIX_NOT_BRACKET mean "nothing has been matched yet". */

enum { POSIX_START_REGEX, POSIX_ANCHORED, POSIX_NOT_BRACKET,
       POSIX_CLASS_NOT_STARTED, POSIX_CLASS_STARTING, POSIX_CLASS_STARTED };

/* Macro to add a character string to the output buffer, checking for overflow.
It relies on the enclosing function having the variables "s" (char *), "p" (the
current output pointer) and "endp" (the output limit), and it makes that
function return PCRE2_ERROR_NOMEMORY when the string does not fit. The body is
wrapped in do ... while (0) so that "PUTCHARS(x);" is a single statement that
can safely be used as the unbraced body of an if that has an else. */

#define PUTCHARS(string) \
  do \
    { \
    for (s = (char *)(string); *s != 0; s++) \
      { \
      if (p >= endp) return PCRE2_ERROR_NOMEMORY; \
      *p++ = *s; \
      } \
    } \
  while (0)
83 
84 /* Literals that must be escaped: \ ? * + | . ^ $ { } [ ] ( ) */
85 
86 static const char *pcre2_escaped_literals =
87   STR_BACKSLASH STR_QUESTION_MARK STR_ASTERISK STR_PLUS
88   STR_VERTICAL_LINE STR_DOT STR_CIRCUMFLEX_ACCENT STR_DOLLAR_SIGN
89   STR_LEFT_CURLY_BRACKET STR_RIGHT_CURLY_BRACKET
90   STR_LEFT_SQUARE_BRACKET STR_RIGHT_SQUARE_BRACKET
91   STR_LEFT_PARENTHESIS STR_RIGHT_PARENTHESIS;
92 
93 /* Recognized escaped metacharacters in POSIX basic patterns. */
94 
95 static const char *posix_meta_escapes =
96   STR_LEFT_PARENTHESIS STR_RIGHT_PARENTHESIS
97   STR_LEFT_CURLY_BRACKET STR_RIGHT_CURLY_BRACKET
98   STR_1 STR_2 STR_3 STR_4 STR_5 STR_6 STR_7 STR_8 STR_9;
99 
100 
101 
102 /*************************************************
103 *           Convert a POSIX pattern              *
104 *************************************************/
105 
106 /* This function handles both basic and extended POSIX patterns.
107 
108 Arguments:
109   pattype        the pattern type
110   pattern        the pattern
111   plength        length in code units
112   utf            TRUE if UTF
113   use_buffer     where to put the output
114   use_length     length of use_buffer
115   bufflenptr     where to put the used length
116   dummyrun       TRUE if a dummy run
117   ccontext       the convert context
118 
119 Returns:         0 => success
120                 !0 => error code
121 */
122 
123 static int
convert_posix(uint32_t pattype,PCRE2_SPTR pattern,PCRE2_SIZE plength,BOOL utf,PCRE2_UCHAR * use_buffer,PCRE2_SIZE use_length,PCRE2_SIZE * bufflenptr,BOOL dummyrun,pcre2_convert_context * ccontext)124 convert_posix(uint32_t pattype, PCRE2_SPTR pattern, PCRE2_SIZE plength,
125   BOOL utf, PCRE2_UCHAR *use_buffer, PCRE2_SIZE use_length,
126   PCRE2_SIZE *bufflenptr, BOOL dummyrun, pcre2_convert_context *ccontext)
127 {
128 char *s;
129 PCRE2_SPTR posix = pattern;
130 PCRE2_UCHAR *p = use_buffer;
131 PCRE2_UCHAR *pp = p;
132 PCRE2_UCHAR *endp = p + use_length - 1;  /* Allow for trailing zero */
133 PCRE2_SIZE convlength = 0;
134 
135 uint32_t bracount = 0;
136 uint32_t posix_state = POSIX_START_REGEX;
137 uint32_t lastspecial = 0;
138 BOOL extended = (pattype & PCRE2_CONVERT_POSIX_EXTENDED) != 0;
139 BOOL nextisliteral = FALSE;
140 
141 (void)utf;       /* Not used when Unicode not supported */
142 (void)ccontext;  /* Not currently used */
143 
144 /* Initialize default for error offset as end of input. */
145 
146 *bufflenptr = plength;
147 PUTCHARS(STR_STAR_NUL);
148 
149 /* Now scan the input. */
150 
151 while (plength > 0)
152   {
153   uint32_t c, sc;
154   int clength = 1;
155 
156   /* Add in the length of the last item, then, if in the dummy run, pull the
157   pointer back to the start of the (temporary) buffer and then remember the
158   start of the next item. */
159 
160   convlength += p - pp;
161   if (dummyrun) p = use_buffer;
162   pp = p;
163 
164   /* Pick up the next character */
165 
166 #ifndef SUPPORT_UNICODE
167   c = *posix;
168 #else
169   GETCHARLENTEST(c, posix, clength);
170 #endif
171   posix += clength;
172   plength -= clength;
173 
174   sc = nextisliteral? 0 : c;
175   nextisliteral = FALSE;
176 
177   /* Handle a character within a class. */
178 
179   if (posix_state >= POSIX_CLASS_NOT_STARTED)
180     {
181     if (c == CHAR_RIGHT_SQUARE_BRACKET)
182       {
183       PUTCHARS(STR_RIGHT_SQUARE_BRACKET);
184       posix_state = POSIX_NOT_BRACKET;
185       }
186 
187     /* Not the end of the class */
188 
189     else
190       {
191       switch (posix_state)
192         {
193         case POSIX_CLASS_STARTED:
194         if (c <= 127 && islower(c)) break;  /* Remain in started state */
195         posix_state = POSIX_CLASS_NOT_STARTED;
196         if (c == CHAR_COLON  && plength > 0 &&
197             *posix == CHAR_RIGHT_SQUARE_BRACKET)
198           {
199           PUTCHARS(STR_COLON_RIGHT_SQUARE_BRACKET);
200           plength--;
201           posix++;
202           continue;    /* With next character after :] */
203           }
204         /* Fall through */
205 
206         case POSIX_CLASS_NOT_STARTED:
207         if (c == CHAR_LEFT_SQUARE_BRACKET)
208           posix_state = POSIX_CLASS_STARTING;
209         break;
210 
211         case POSIX_CLASS_STARTING:
212         if (c == CHAR_COLON) posix_state = POSIX_CLASS_STARTED;
213         break;
214         }
215 
216       if (c == CHAR_BACKSLASH) PUTCHARS(STR_BACKSLASH);
217       if (p + clength > endp) return PCRE2_ERROR_NOMEMORY;
218       memcpy(p, posix - clength, CU2BYTES(clength));
219       p += clength;
220       }
221     }
222 
223   /* Handle a character not within a class. */
224 
225   else switch(sc)
226     {
227     case CHAR_LEFT_SQUARE_BRACKET:
228     PUTCHARS(STR_LEFT_SQUARE_BRACKET);
229 
230 #ifdef NEVER
231     /* We could handle special cases [[:<:]] and [[:>:]] (which PCRE does
232     support) but they are not part of POSIX 1003.1. */
233 
234     if (plength >= 6)
235       {
236       if (posix[0] == CHAR_LEFT_SQUARE_BRACKET &&
237           posix[1] == CHAR_COLON &&
238           (posix[2] == CHAR_LESS_THAN_SIGN ||
239            posix[2] == CHAR_GREATER_THAN_SIGN) &&
240           posix[3] == CHAR_COLON &&
241           posix[4] == CHAR_RIGHT_SQUARE_BRACKET &&
242           posix[5] == CHAR_RIGHT_SQUARE_BRACKET)
243         {
244         if (p + 6 > endp) return PCRE2_ERROR_NOMEMORY;
245         memcpy(p, posix, CU2BYTES(6));
246         p += 6;
247         posix += 6;
248         plength -= 6;
249         continue;  /* With next character */
250         }
251       }
252 #endif
253 
254     /* Handle start of "normal" character classes */
255 
256     posix_state = POSIX_CLASS_NOT_STARTED;
257 
258     /* Handle ^ and ] as first characters */
259 
260     if (plength > 0)
261       {
262       if (*posix == CHAR_CIRCUMFLEX_ACCENT)
263         {
264         posix++;
265         plength--;
266         PUTCHARS(STR_CIRCUMFLEX_ACCENT);
267         }
268       if (plength > 0 && *posix == CHAR_RIGHT_SQUARE_BRACKET)
269         {
270         posix++;
271         plength--;
272         PUTCHARS(STR_RIGHT_SQUARE_BRACKET);
273         }
274       }
275     break;
276 
277     case CHAR_BACKSLASH:
278     if (plength == 0) return PCRE2_ERROR_END_BACKSLASH;
279     if (extended) nextisliteral = TRUE; else
280       {
281       if (*posix < 127 && strchr(posix_meta_escapes, *posix) != NULL)
282         {
283         if (isdigit(*posix)) PUTCHARS(STR_BACKSLASH);
284         if (p + 1 > endp) return PCRE2_ERROR_NOMEMORY;
285         lastspecial = *p++ = *posix++;
286         plength--;
287         }
288       else nextisliteral = TRUE;
289       }
290     break;
291 
292     case CHAR_RIGHT_PARENTHESIS:
293     if (!extended || bracount == 0) goto ESCAPE_LITERAL;
294     bracount--;
295     goto COPY_SPECIAL;
296 
297     case CHAR_LEFT_PARENTHESIS:
298     bracount++;
299     /* Fall through */
300 
301     case CHAR_QUESTION_MARK:
302     case CHAR_PLUS:
303     case CHAR_LEFT_CURLY_BRACKET:
304     case CHAR_RIGHT_CURLY_BRACKET:
305     case CHAR_VERTICAL_LINE:
306     if (!extended) goto ESCAPE_LITERAL;
307     /* Fall through */
308 
309     case CHAR_DOT:
310     case CHAR_DOLLAR_SIGN:
311     posix_state = POSIX_NOT_BRACKET;
312     COPY_SPECIAL:
313     lastspecial = c;
314     if (p + 1 > endp) return PCRE2_ERROR_NOMEMORY;
315     *p++ = c;
316     break;
317 
318     case CHAR_ASTERISK:
319     if (lastspecial != CHAR_ASTERISK)
320       {
321       if (!extended && (posix_state < POSIX_NOT_BRACKET ||
322           lastspecial == CHAR_LEFT_PARENTHESIS))
323         goto ESCAPE_LITERAL;
324       goto COPY_SPECIAL;
325       }
326     break;   /* Ignore second and subsequent asterisks */
327 
328     case CHAR_CIRCUMFLEX_ACCENT:
329     if (extended) goto COPY_SPECIAL;
330     if (posix_state == POSIX_START_REGEX ||
331         lastspecial == CHAR_LEFT_PARENTHESIS)
332       {
333       posix_state = POSIX_ANCHORED;
334       goto COPY_SPECIAL;
335       }
336     /* Fall through */
337 
338     default:
339     if (c < 128 && strchr(pcre2_escaped_literals, c) != NULL)
340       {
341       ESCAPE_LITERAL:
342       PUTCHARS(STR_BACKSLASH);
343       }
344     lastspecial = 0xff;  /* Indicates nothing special */
345     if (p + clength > endp) return PCRE2_ERROR_NOMEMORY;
346     memcpy(p, posix - clength, CU2BYTES(clength));
347     p += clength;
348     posix_state = POSIX_NOT_BRACKET;
349     break;
350     }
351   }
352 
353 if (posix_state >= POSIX_CLASS_NOT_STARTED)
354   return PCRE2_ERROR_MISSING_SQUARE_BRACKET;
355 convlength += p - pp;        /* Final segment */
356 *bufflenptr = convlength;
357 *p++ = 0;
358 return 0;
359 }
360 
361 
362 /*************************************************
363 *           Convert a glob pattern               *
364 *************************************************/
365 
366 /* Context for writing the output into a buffer. */
367 
368 typedef struct pcre2_output_context {
369   PCRE2_UCHAR *output;                  /* current output position */
370   PCRE2_SPTR output_end;                /* output end */
371   PCRE2_SIZE output_size;               /* size of the output */
372   uint8_t out_str[8];                   /* string copied to the output */
373 } pcre2_output_context;
374 
375 
376 /* Write a character into the output.
377 
378 Arguments:
379   out            output context
380   chr            the next character
381 */
382 
383 static void
convert_glob_write(pcre2_output_context * out,PCRE2_UCHAR chr)384 convert_glob_write(pcre2_output_context *out, PCRE2_UCHAR chr)
385 {
386 out->output_size++;
387 
388 if (out->output < out->output_end)
389   *out->output++ = chr;
390 }
391 
392 
393 /* Write a string into the output.
394 
395 Arguments:
396   out            output context
397   length         length of out->out_str
398 */
399 
400 static void
convert_glob_write_str(pcre2_output_context * out,PCRE2_SIZE length)401 convert_glob_write_str(pcre2_output_context *out, PCRE2_SIZE length)
402 {
403 uint8_t *out_str = out->out_str;
404 PCRE2_UCHAR *output = out->output;
405 PCRE2_SPTR output_end = out->output_end;
406 PCRE2_SIZE output_size = out->output_size;
407 
408 do
409   {
410   output_size++;
411 
412   if (output < output_end)
413     *output++ = *out_str++;
414   }
415 while (--length != 0);
416 
417 out->output = output;
418 out->output_size = output_size;
419 }
420 
421 
422 /* Prints the separator into the output.
423 
424 Arguments:
425   out            output context
426   separator      glob separator
427   with_escape    backslash is needed before separator
428 */
429 
430 static void
convert_glob_print_separator(pcre2_output_context * out,PCRE2_UCHAR separator,BOOL with_escape)431 convert_glob_print_separator(pcre2_output_context *out,
432   PCRE2_UCHAR separator, BOOL with_escape)
433 {
434 if (with_escape)
435   convert_glob_write(out, CHAR_BACKSLASH);
436 
437 convert_glob_write(out, separator);
438 }
439 
440 
441 /* Prints a wildcard into the output.
442 
443 Arguments:
444   out            output context
445   separator      glob separator
446   with_escape    backslash is needed before separator
447 */
448 
449 static void
convert_glob_print_wildcard(pcre2_output_context * out,PCRE2_UCHAR separator,BOOL with_escape)450 convert_glob_print_wildcard(pcre2_output_context *out,
451   PCRE2_UCHAR separator, BOOL with_escape)
452 {
453 out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET;
454 out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT;
455 convert_glob_write_str(out, 2);
456 
457 convert_glob_print_separator(out, separator, with_escape);
458 
459 convert_glob_write(out, CHAR_RIGHT_SQUARE_BRACKET);
460 }
461 
462 
463 /* Parse a posix class.
464 
465 Arguments:
466   from           starting point of scanning the range
467   pattern_end    end of pattern
468   out            output context
469 
470 Returns:  >0 => class index
471           0  => malformed class
472 */
473 
474 static int
convert_glob_parse_class(PCRE2_SPTR * from,PCRE2_SPTR pattern_end,pcre2_output_context * out)475 convert_glob_parse_class(PCRE2_SPTR *from, PCRE2_SPTR pattern_end,
476   pcre2_output_context *out)
477 {
478 static const char *posix_classes = "alnum:alpha:ascii:blank:cntrl:digit:"
479   "graph:lower:print:punct:space:upper:word:xdigit:";
480 PCRE2_SPTR start = *from + 1;
481 PCRE2_SPTR pattern = start;
482 const char *class_ptr;
483 PCRE2_UCHAR c;
484 int class_index;
485 
486 while (TRUE)
487   {
488   if (pattern >= pattern_end) return 0;
489 
490   c = *pattern++;
491 
492   if (c < CHAR_a || c > CHAR_z) break;
493   }
494 
495 if (c != CHAR_COLON || pattern >= pattern_end ||
496     *pattern != CHAR_RIGHT_SQUARE_BRACKET)
497   return 0;
498 
499 class_ptr = posix_classes;
500 class_index = 1;
501 
502 while (TRUE)
503   {
504   if (*class_ptr == CHAR_NUL) return 0;
505 
506   pattern = start;
507 
508   while (*pattern == (PCRE2_UCHAR) *class_ptr)
509     {
510     if (*pattern == CHAR_COLON)
511       {
512       pattern += 2;
513       start -= 2;
514 
515       do convert_glob_write(out, *start++); while (start < pattern);
516 
517       *from = pattern;
518       return class_index;
519       }
520     pattern++;
521     class_ptr++;
522     }
523 
524   while (*class_ptr != CHAR_COLON) class_ptr++;
525   class_ptr++;
526   class_index++;
527   }
528 }
529 
530 /* Checks whether the character is in the class.
531 
532 Arguments:
533   class_index    class index
534   c              character
535 
536 Returns:   !0 => character is found in the class
537             0 => otherwise
538 */
539 
540 static BOOL
convert_glob_char_in_class(int class_index,PCRE2_UCHAR c)541 convert_glob_char_in_class(int class_index, PCRE2_UCHAR c)
542 {
543 switch (class_index)
544   {
545   case 1: return isalnum(c);
546   case 2: return isalpha(c);
547   case 3: return 1;
548   case 4: return c == CHAR_HT || c == CHAR_SPACE;
549   case 5: return iscntrl(c);
550   case 6: return isdigit(c);
551   case 7: return isgraph(c);
552   case 8: return islower(c);
553   case 9: return isprint(c);
554   case 10: return ispunct(c);
555   case 11: return isspace(c);
556   case 12: return isupper(c);
557   case 13: return isalnum(c) || c == CHAR_UNDERSCORE;
558   default: return isxdigit(c);
559   }
560 }
561 
562 /* Parse a range of characters.
563 
564 Arguments:
565   from           starting point of scanning the range
566   pattern_end    end of pattern
567   out            output context
568   separator      glob separator
569   with_escape    backslash is needed before separator
570 
571 Returns:         0 => success
572                 !0 => error code
573 */
574 
575 static int
convert_glob_parse_range(PCRE2_SPTR * from,PCRE2_SPTR pattern_end,pcre2_output_context * out,BOOL utf,PCRE2_UCHAR separator,BOOL with_escape,PCRE2_UCHAR escape,BOOL no_wildsep)576 convert_glob_parse_range(PCRE2_SPTR *from, PCRE2_SPTR pattern_end,
577   pcre2_output_context *out, BOOL utf, PCRE2_UCHAR separator,
578   BOOL with_escape, PCRE2_UCHAR escape, BOOL no_wildsep)
579 {
580 BOOL is_negative = FALSE;
581 BOOL separator_seen = FALSE;
582 BOOL has_prev_c;
583 PCRE2_SPTR pattern = *from;
584 PCRE2_SPTR char_start = NULL;
585 uint32_t c, prev_c;
586 int len, class_index;
587 
588 (void)utf; /* Avoid compiler warning. */
589 
590 if (pattern >= pattern_end)
591   {
592   *from = pattern;
593   return PCRE2_ERROR_MISSING_SQUARE_BRACKET;
594   }
595 
596 if (*pattern == CHAR_EXCLAMATION_MARK
597     || *pattern == CHAR_CIRCUMFLEX_ACCENT)
598   {
599   pattern++;
600 
601   if (pattern >= pattern_end)
602     {
603     *from = pattern;
604     return PCRE2_ERROR_MISSING_SQUARE_BRACKET;
605     }
606 
607   is_negative = TRUE;
608 
609   out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET;
610   out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT;
611   len = 2;
612 
613   if (!no_wildsep)
614     {
615     if (with_escape)
616       {
617       out->out_str[len] = CHAR_BACKSLASH;
618       len++;
619       }
620     out->out_str[len] = (uint8_t) separator;
621     }
622 
623   convert_glob_write_str(out, len + 1);
624   }
625 else
626   convert_glob_write(out, CHAR_LEFT_SQUARE_BRACKET);
627 
628 has_prev_c = FALSE;
629 prev_c = 0;
630 
631 if (*pattern == CHAR_RIGHT_SQUARE_BRACKET)
632   {
633   out->out_str[0] = CHAR_BACKSLASH;
634   out->out_str[1] = CHAR_RIGHT_SQUARE_BRACKET;
635   convert_glob_write_str(out, 2);
636   has_prev_c = TRUE;
637   prev_c = CHAR_RIGHT_SQUARE_BRACKET;
638   pattern++;
639   }
640 
641 while (pattern < pattern_end)
642   {
643   char_start = pattern;
644   GETCHARINCTEST(c, pattern);
645 
646   if (c == CHAR_RIGHT_SQUARE_BRACKET)
647     {
648     convert_glob_write(out, c);
649 
650     if (!is_negative && !no_wildsep && separator_seen)
651       {
652       out->out_str[0] = CHAR_LEFT_PARENTHESIS;
653       out->out_str[1] = CHAR_QUESTION_MARK;
654       out->out_str[2] = CHAR_LESS_THAN_SIGN;
655       out->out_str[3] = CHAR_EXCLAMATION_MARK;
656       convert_glob_write_str(out, 4);
657 
658       convert_glob_print_separator(out, separator, with_escape);
659       convert_glob_write(out, CHAR_RIGHT_PARENTHESIS);
660       }
661 
662     *from = pattern;
663     return 0;
664     }
665 
666   if (pattern >= pattern_end) break;
667 
668   if (c == CHAR_LEFT_SQUARE_BRACKET && *pattern == CHAR_COLON)
669     {
670     *from = pattern;
671     class_index = convert_glob_parse_class(from, pattern_end, out);
672 
673     if (class_index != 0)
674       {
675       pattern = *from;
676 
677       has_prev_c = FALSE;
678       prev_c = 0;
679 
680       if (!is_negative &&
681           convert_glob_char_in_class (class_index, separator))
682         separator_seen = TRUE;
683       continue;
684       }
685     }
686   else if (c == CHAR_MINUS && has_prev_c &&
687            *pattern != CHAR_RIGHT_SQUARE_BRACKET)
688     {
689     convert_glob_write(out, CHAR_MINUS);
690 
691     char_start = pattern;
692     GETCHARINCTEST(c, pattern);
693 
694     if (pattern >= pattern_end) break;
695 
696     if (escape != 0 && c == escape)
697       {
698       char_start = pattern;
699       GETCHARINCTEST(c, pattern);
700       }
701     else if (c == CHAR_LEFT_SQUARE_BRACKET && *pattern == CHAR_COLON)
702       {
703       *from = pattern;
704       return PCRE2_ERROR_CONVERT_SYNTAX;
705       }
706 
707     if (prev_c > c)
708       {
709       *from = pattern;
710       return PCRE2_ERROR_CONVERT_SYNTAX;
711       }
712 
713     if (prev_c < separator && separator < c) separator_seen = TRUE;
714 
715     has_prev_c = FALSE;
716     prev_c = 0;
717     }
718   else
719     {
720     if (escape != 0 && c == escape)
721       {
722       char_start = pattern;
723       GETCHARINCTEST(c, pattern);
724 
725       if (pattern >= pattern_end) break;
726       }
727 
728     has_prev_c = TRUE;
729     prev_c = c;
730     }
731 
732   if (c == CHAR_LEFT_SQUARE_BRACKET || c == CHAR_RIGHT_SQUARE_BRACKET ||
733       c == CHAR_BACKSLASH || c == CHAR_MINUS)
734     convert_glob_write(out, CHAR_BACKSLASH);
735 
736   if (c == separator) separator_seen = TRUE;
737 
738   do convert_glob_write(out, *char_start++); while (char_start < pattern);
739   }
740 
741 *from = pattern;
742 return PCRE2_ERROR_MISSING_SQUARE_BRACKET;
743 }
744 
745 
746 /* Prints a (*COMMIT) into the output.
747 
748 Arguments:
749   out            output context
750 */
751 
752 static void
convert_glob_print_commit(pcre2_output_context * out)753 convert_glob_print_commit(pcre2_output_context *out)
754 {
755 out->out_str[0] = CHAR_LEFT_PARENTHESIS;
756 out->out_str[1] = CHAR_ASTERISK;
757 out->out_str[2] = CHAR_C;
758 out->out_str[3] = CHAR_O;
759 out->out_str[4] = CHAR_M;
760 out->out_str[5] = CHAR_M;
761 out->out_str[6] = CHAR_I;
762 out->out_str[7] = CHAR_T;
763 convert_glob_write_str(out, 8);
764 convert_glob_write(out, CHAR_RIGHT_PARENTHESIS);
765 }
766 
767 
768 /* Bash glob converter.
769 
770 Arguments:
771   pattype        the pattern type
772   pattern        the pattern
773   plength        length in code units
774   utf            TRUE if UTF
775   use_buffer     where to put the output
776   use_length     length of use_buffer
777   bufflenptr     where to put the used length
778   dummyrun       TRUE if a dummy run
779   ccontext       the convert context
780 
781 Returns:         0 => success
782                 !0 => error code
783 */
784 
785 static int
convert_glob(uint32_t options,PCRE2_SPTR pattern,PCRE2_SIZE plength,BOOL utf,PCRE2_UCHAR * use_buffer,PCRE2_SIZE use_length,PCRE2_SIZE * bufflenptr,BOOL dummyrun,pcre2_convert_context * ccontext)786 convert_glob(uint32_t options, PCRE2_SPTR pattern, PCRE2_SIZE plength,
787   BOOL utf, PCRE2_UCHAR *use_buffer, PCRE2_SIZE use_length,
788   PCRE2_SIZE *bufflenptr, BOOL dummyrun, pcre2_convert_context *ccontext)
789 {
790 pcre2_output_context out;
791 PCRE2_SPTR pattern_start = pattern;
792 PCRE2_SPTR pattern_end = pattern + plength;
793 PCRE2_UCHAR separator = ccontext->glob_separator;
794 PCRE2_UCHAR escape = ccontext->glob_escape;
795 PCRE2_UCHAR c;
796 BOOL no_wildsep = (options & PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR) != 0;
797 BOOL no_starstar = (options & PCRE2_CONVERT_GLOB_NO_STARSTAR) != 0;
798 BOOL in_atomic = FALSE;
799 BOOL after_starstar = FALSE;
800 BOOL no_slash_z = FALSE;
801 BOOL with_escape, is_start, after_separator;
802 int result = 0;
803 
804 (void)utf; /* Avoid compiler warning. */
805 
806 #ifdef SUPPORT_UNICODE
807 if (utf && (separator >= 128 || escape >= 128))
808   {
809   /* Currently only ASCII characters are supported. */
810   *bufflenptr = 0;
811   return PCRE2_ERROR_CONVERT_SYNTAX;
812   }
813 #endif
814 
815 with_escape = strchr(pcre2_escaped_literals, separator) != NULL;
816 
817 /* Initialize default for error offset as end of input. */
818 out.output = use_buffer;
819 out.output_end = use_buffer + use_length;
820 out.output_size = 0;
821 
822 out.out_str[0] = CHAR_LEFT_PARENTHESIS;
823 out.out_str[1] = CHAR_QUESTION_MARK;
824 out.out_str[2] = CHAR_s;
825 out.out_str[3] = CHAR_RIGHT_PARENTHESIS;
826 convert_glob_write_str(&out, 4);
827 
828 is_start = TRUE;
829 
830 if (pattern < pattern_end && pattern[0] == CHAR_ASTERISK)
831   {
832   if (no_wildsep)
833     is_start = FALSE;
834   else if (!no_starstar && pattern + 1 < pattern_end &&
835            pattern[1] == CHAR_ASTERISK)
836     is_start = FALSE;
837   }
838 
839 if (is_start)
840   {
841   out.out_str[0] = CHAR_BACKSLASH;
842   out.out_str[1] = CHAR_A;
843   convert_glob_write_str(&out, 2);
844   }
845 
846 while (pattern < pattern_end)
847   {
848   c = *pattern++;
849 
850   if (c == CHAR_ASTERISK)
851     {
852     is_start = pattern == pattern_start + 1;
853 
854     if (in_atomic)
855       {
856       convert_glob_write(&out, CHAR_RIGHT_PARENTHESIS);
857       in_atomic = FALSE;
858       }
859 
860     if (!no_starstar && pattern < pattern_end && *pattern == CHAR_ASTERISK)
861       {
862       after_separator = is_start || (pattern[-2] == separator);
863 
864       do pattern++; while (pattern < pattern_end &&
865                            *pattern == CHAR_ASTERISK);
866 
867       if (pattern >= pattern_end)
868         {
869         no_slash_z = TRUE;
870         break;
871         }
872 
873       after_starstar = TRUE;
874 
875       if (after_separator && escape != 0 && *pattern == escape &&
876           pattern + 1 < pattern_end && pattern[1] == separator)
877         pattern++;
878 
879       if (is_start)
880         {
881         if (*pattern != separator) continue;
882 
883         out.out_str[0] = CHAR_LEFT_PARENTHESIS;
884         out.out_str[1] = CHAR_QUESTION_MARK;
885         out.out_str[2] = CHAR_COLON;
886         out.out_str[3] = CHAR_BACKSLASH;
887         out.out_str[4] = CHAR_A;
888         out.out_str[5] = CHAR_VERTICAL_LINE;
889         convert_glob_write_str(&out, 6);
890 
891         convert_glob_print_separator(&out, separator, with_escape);
892         convert_glob_write(&out, CHAR_RIGHT_PARENTHESIS);
893 
894         pattern++;
895         continue;
896         }
897 
898       convert_glob_print_commit(&out);
899 
900       if (!after_separator || *pattern != separator)
901         {
902         out.out_str[0] = CHAR_DOT;
903         out.out_str[1] = CHAR_ASTERISK;
904         out.out_str[2] = CHAR_QUESTION_MARK;
905         convert_glob_write_str(&out, 3);
906         continue;
907         }
908 
909       out.out_str[0] = CHAR_LEFT_PARENTHESIS;
910       out.out_str[1] = CHAR_QUESTION_MARK;
911       out.out_str[2] = CHAR_COLON;
912       out.out_str[3] = CHAR_DOT;
913       out.out_str[4] = CHAR_ASTERISK;
914       out.out_str[5] = CHAR_QUESTION_MARK;
915 
916       convert_glob_write_str(&out, 6);
917 
918       convert_glob_print_separator(&out, separator, with_escape);
919 
920       out.out_str[0] = CHAR_RIGHT_PARENTHESIS;
921       out.out_str[1] = CHAR_QUESTION_MARK;
922       out.out_str[2] = CHAR_QUESTION_MARK;
923       convert_glob_write_str(&out, 3);
924 
925       pattern++;
926       continue;
927       }
928 
929     if (pattern < pattern_end && *pattern == CHAR_ASTERISK)
930       {
931       do pattern++; while (pattern < pattern_end &&
932                            *pattern == CHAR_ASTERISK);
933       }
934 
935     if (no_wildsep)
936       {
937       if (pattern >= pattern_end)
938         {
939         no_slash_z = TRUE;
940         break;
941         }
942 
943       /* Start check must be after the end check. */
944       if (is_start) continue;
945       }
946 
947     if (!is_start)
948       {
949       if (after_starstar)
950         {
951         out.out_str[0] = CHAR_LEFT_PARENTHESIS;
952         out.out_str[1] = CHAR_QUESTION_MARK;
953         out.out_str[2] = CHAR_GREATER_THAN_SIGN;
954         convert_glob_write_str(&out, 3);
955         in_atomic = TRUE;
956         }
957       else
958         convert_glob_print_commit(&out);
959       }
960 
961     if (no_wildsep)
962       convert_glob_write(&out, CHAR_DOT);
963     else
964       convert_glob_print_wildcard(&out, separator, with_escape);
965 
966     out.out_str[0] = CHAR_ASTERISK;
967     out.out_str[1] = CHAR_QUESTION_MARK;
968     if (pattern >= pattern_end)
969       out.out_str[1] = CHAR_PLUS;
970     convert_glob_write_str(&out, 2);
971     continue;
972     }
973 
974   if (c == CHAR_QUESTION_MARK)
975     {
976     if (no_wildsep)
977       convert_glob_write(&out, CHAR_DOT);
978     else
979       convert_glob_print_wildcard(&out, separator, with_escape);
980     continue;
981     }
982 
983   if (c == CHAR_LEFT_SQUARE_BRACKET)
984     {
985     result = convert_glob_parse_range(&pattern, pattern_end,
986       &out, utf, separator, with_escape, escape, no_wildsep);
987     if (result != 0) break;
988     continue;
989     }
990 
991   if (escape != 0 && c == escape)
992     {
993     if (pattern >= pattern_end)
994       {
995       result = PCRE2_ERROR_CONVERT_SYNTAX;
996       break;
997       }
998     c = *pattern++;
999     }
1000 
1001   if (c < 128 && strchr(pcre2_escaped_literals, c) != NULL)
1002     convert_glob_write(&out, CHAR_BACKSLASH);
1003 
1004   convert_glob_write(&out, c);
1005   }
1006 
1007 if (result == 0)
1008   {
1009   if (!no_slash_z)
1010     {
1011     out.out_str[0] = CHAR_BACKSLASH;
1012     out.out_str[1] = CHAR_z;
1013     convert_glob_write_str(&out, 2);
1014     }
1015 
1016   if (in_atomic)
1017     convert_glob_write(&out, CHAR_RIGHT_PARENTHESIS);
1018 
1019   convert_glob_write(&out, CHAR_NUL);
1020 
1021   if (!dummyrun && out.output_size != (PCRE2_SIZE) (out.output - use_buffer))
1022     result = PCRE2_ERROR_NOMEMORY;
1023   }
1024 
1025 if (result != 0)
1026   {
1027   *bufflenptr = pattern - pattern_start;
1028   return result;
1029   }
1030 
1031 *bufflenptr = out.output_size - 1;
1032 return 0;
1033 }
1034 
1035 
1036 /*************************************************
1037 *                Convert pattern                 *
1038 *************************************************/
1039 
1040 /* This is the external-facing function for converting other forms of pattern
1041 into PCRE2 regular expression patterns. On error, the bufflenptr argument is
1042 used to return an offset in the original pattern.
1043 
1044 Arguments:
1045   pattern     the input pattern
1046   plength     length of input, or PCRE2_ZERO_TERMINATED
1047   options     options bits
1048   buffptr     pointer to pointer to output buffer
1049   bufflenptr  pointer to length of output buffer
1050   ccontext    convert context or NULL
1051 
1052 Returns:      0 for success, else an error code (+ve or -ve)
1053 */
1054 
1055 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_pattern_convert(PCRE2_SPTR pattern,PCRE2_SIZE plength,uint32_t options,PCRE2_UCHAR ** buffptr,PCRE2_SIZE * bufflenptr,pcre2_convert_context * ccontext)1056 pcre2_pattern_convert(PCRE2_SPTR pattern, PCRE2_SIZE plength, uint32_t options,
1057   PCRE2_UCHAR **buffptr, PCRE2_SIZE *bufflenptr,
1058   pcre2_convert_context *ccontext)
1059 {
1060 int i, rc;
1061 PCRE2_UCHAR dummy_buffer[DUMMY_BUFFER_SIZE];
1062 PCRE2_UCHAR *use_buffer = dummy_buffer;
1063 PCRE2_SIZE use_length = DUMMY_BUFFER_SIZE;
1064 BOOL utf = (options & PCRE2_CONVERT_UTF) != 0;
1065 uint32_t pattype = options & TYPE_OPTIONS;
1066 
1067 if (pattern == NULL || bufflenptr == NULL) return PCRE2_ERROR_NULL;
1068 
1069 if ((options & ~ALL_OPTIONS) != 0 ||        /* Undefined bit set */
1070     (pattype & (~pattype+1)) != pattype ||  /* More than one type set */
1071     pattype == 0)                           /* No type set */
1072   {
1073   *bufflenptr = 0;                          /* Error offset */
1074   return PCRE2_ERROR_BADOPTION;
1075   }
1076 
1077 if (plength == PCRE2_ZERO_TERMINATED) plength = PRIV(strlen)(pattern);
1078 if (ccontext == NULL) ccontext =
1079   (pcre2_convert_context *)(&PRIV(default_convert_context));
1080 
1081 /* Check UTF if required. */
1082 
1083 #ifndef SUPPORT_UNICODE
1084 if (utf)
1085   {
1086   *bufflenptr = 0;  /* Error offset */
1087   return PCRE2_ERROR_UNICODE_NOT_SUPPORTED;
1088   }
1089 #else
1090 if (utf && (options & PCRE2_CONVERT_NO_UTF_CHECK) == 0)
1091   {
1092   PCRE2_SIZE erroroffset;
1093   rc = PRIV(valid_utf)(pattern, plength, &erroroffset);
1094   if (rc != 0)
1095     {
1096     *bufflenptr = erroroffset;
1097     return rc;
1098     }
1099   }
1100 #endif
1101 
1102 /* If buffptr is not NULL, and what it points to is not NULL, we are being
1103 provided with a buffer and a length, so set them as the buffer to use. */
1104 
1105 if (buffptr != NULL && *buffptr != NULL)
1106   {
1107   use_buffer = *buffptr;
1108   use_length = *bufflenptr;
1109   }
1110 
1111 /* Call an individual converter, either just once (if a buffer was provided or
1112 just the length is needed), or twice (if a memory allocation is required). */
1113 
1114 for (i = 0; i < 2; i++)
1115   {
1116   PCRE2_UCHAR *allocated;
1117   BOOL dummyrun = buffptr == NULL || *buffptr == NULL;
1118 
1119   switch(pattype)
1120     {
1121     case PCRE2_CONVERT_GLOB:
1122     rc = convert_glob(options & ~PCRE2_CONVERT_GLOB, pattern, plength, utf,
1123       use_buffer, use_length, bufflenptr, dummyrun, ccontext);
1124     break;
1125 
1126     case PCRE2_CONVERT_POSIX_BASIC:
1127     case PCRE2_CONVERT_POSIX_EXTENDED:
1128     rc = convert_posix(pattype, pattern, plength, utf, use_buffer, use_length,
1129       bufflenptr, dummyrun, ccontext);
1130     break;
1131 
1132     default:
1133     *bufflenptr = 0;  /* Error offset */
1134     return PCRE2_ERROR_INTERNAL;
1135     }
1136 
1137   if (rc != 0 ||           /* Error */
1138       buffptr == NULL ||   /* Just the length is required */
1139       *buffptr != NULL)    /* Buffer was provided or allocated */
1140     return rc;
1141 
1142   /* Allocate memory for the buffer, with hidden space for an allocator at
1143   the start. The next time round the loop runs the conversion for real. */
1144 
1145   allocated = PRIV(memctl_malloc)(sizeof(pcre2_memctl) +
1146     (*bufflenptr + 1)*PCRE2_CODE_UNIT_WIDTH, (pcre2_memctl *)ccontext);
1147   if (allocated == NULL) return PCRE2_ERROR_NOMEMORY;
1148   *buffptr = (PCRE2_UCHAR *)(((char *)allocated) + sizeof(pcre2_memctl));
1149 
1150   use_buffer = *buffptr;
1151   use_length = *bufflenptr + 1;
1152   }
1153 
1154 /* Control should never get here. */
1155 
1156 return PCRE2_ERROR_INTERNAL;
1157 }
1158 
1159 
1160 /*************************************************
1161 *            Free converted pattern              *
1162 *************************************************/
1163 
1164 /* This frees a converted pattern that was put in newly-allocated memory.
1165 
1166 Argument:   the converted pattern
1167 Returns:    nothing
1168 */
1169 
1170 PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_converted_pattern_free(PCRE2_UCHAR * converted)1171 pcre2_converted_pattern_free(PCRE2_UCHAR *converted)
1172 {
1173 if (converted != NULL)
1174   {
1175   pcre2_memctl *memctl =
1176     (pcre2_memctl *)((char *)converted - sizeof(pcre2_memctl));
1177   memctl->free(memctl, memctl->memory_data);
1178   }
1179 }
1180 
1181 /* End of pcre2_convert.c */
1182