1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016-2022 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 #ifdef HAVE_CONFIG_H
43 #include "config.h"
44 #endif
45
46 #include "pcre2_internal.h"
47
/* The pattern-type bits. pcre2_pattern_convert() insists that exactly one of
them is set. */

#define TYPE_OPTIONS \
  (PCRE2_CONVERT_GLOB | \
   PCRE2_CONVERT_POSIX_BASIC | \
   PCRE2_CONVERT_POSIX_EXTENDED)

/* Every option bit that pcre2_pattern_convert() accepts; anything outside
this mask is rejected as a bad option. */

#define ALL_OPTIONS \
  (TYPE_OPTIONS | \
   PCRE2_CONVERT_UTF | \
   PCRE2_CONVERT_NO_UTF_CHECK | \
   PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR | \
   PCRE2_CONVERT_GLOB_NO_STARSTAR)

/* Size, in code units, of the on-stack buffer that is used when the caller
does not supply an output buffer. */

#define DUMMY_BUFFER_SIZE 100
57
/* Fragments of generated pattern text, each assembled from the single
character STR_xxx string macros by literal concatenation. */

/* \A */
#define STR_BACKSLASH_A \
  STR_BACKSLASH STR_A

/* \z */
#define STR_BACKSLASH_z \
  STR_BACKSLASH STR_z

/* :] */
#define STR_COLON_RIGHT_SQUARE_BRACKET \
  STR_COLON STR_RIGHT_SQUARE_BRACKET

/* .*(?<= */
#define STR_DOT_STAR_LOOKBEHIND \
  STR_DOT STR_ASTERISK \
  STR_LEFT_PARENTHESIS STR_QUESTION_MARK STR_LESS_THAN_SIGN STR_EQUALS_SIGN

/* (?!\.) */
#define STR_LOOKAHEAD_NOT_DOT \
  STR_LEFT_PARENTHESIS STR_QUESTION_MARK STR_EXCLAMATION_MARK \
  STR_BACKSLASH STR_DOT \
  STR_RIGHT_PARENTHESIS

/* (?s) */
#define STR_QUERY_s \
  STR_LEFT_PARENTHESIS STR_QUESTION_MARK STR_s STR_RIGHT_PARENTHESIS

/* (*NUL) */
#define STR_STAR_NUL \
  STR_LEFT_PARENTHESIS STR_ASTERISK STR_N STR_U STR_L STR_RIGHT_PARENTHESIS
67
/* States for POSIX processing. The numeric order is significant: the
converter tests "state >= POSIX_CLASS_NOT_STARTED" to mean "inside a bracket
expression" and "state < POSIX_NOT_BRACKET" to mean "nothing but an optional
anchor has been seen so far". */

enum {
  POSIX_START_REGEX       = 0,  /* Nothing processed yet */
  POSIX_ANCHORED          = 1,  /* An anchoring ^ has just been copied */
  POSIX_NOT_BRACKET       = 2,  /* Outside [...], after some other item */
  POSIX_CLASS_NOT_STARTED = 3,  /* Inside [...], not within a [: item */
  POSIX_CLASS_STARTING    = 4,  /* Inside [...], a nested [ was just seen */
  POSIX_CLASS_STARTED     = 5   /* Inside [...], after a nested [: */
};
72
/* Macro to add a character string to the output buffer, checking for
overflow. It relies on three variables of the calling function: "s" (a char *
scratch cursor), "p" (the current output position) and "endp" (the position
that must not be written to). If the string does not fit, the macro makes the
CALLING FUNCTION return PCRE2_ERROR_NOMEMORY.

The body is wrapped in do ... while (0) so that "PUTCHARS(x);" is a single
statement and can be used safely as the unbraced body of an if/else. */

#define PUTCHARS(string) \
  do \
    { \
    for (s = (char *)(string); *s != 0; s++) \
      { \
      if (p >= endp) return PCRE2_ERROR_NOMEMORY; \
      *p++ = *s; \
      } \
    } \
  while (0)
83
84 /* Literals that must be escaped: \ ? * + | . ^ $ { } [ ] ( ) */
85
86 static const char *pcre2_escaped_literals =
87 STR_BACKSLASH STR_QUESTION_MARK STR_ASTERISK STR_PLUS
88 STR_VERTICAL_LINE STR_DOT STR_CIRCUMFLEX_ACCENT STR_DOLLAR_SIGN
89 STR_LEFT_CURLY_BRACKET STR_RIGHT_CURLY_BRACKET
90 STR_LEFT_SQUARE_BRACKET STR_RIGHT_SQUARE_BRACKET
91 STR_LEFT_PARENTHESIS STR_RIGHT_PARENTHESIS;
92
93 /* Recognized escaped metacharacters in POSIX basic patterns. */
94
95 static const char *posix_meta_escapes =
96 STR_LEFT_PARENTHESIS STR_RIGHT_PARENTHESIS
97 STR_LEFT_CURLY_BRACKET STR_RIGHT_CURLY_BRACKET
98 STR_1 STR_2 STR_3 STR_4 STR_5 STR_6 STR_7 STR_8 STR_9;
99
100
101
102 /*************************************************
103 * Convert a POSIX pattern *
104 *************************************************/
105
106 /* This function handles both basic and extended POSIX patterns.
107
108 Arguments:
109 pattype the pattern type
110 pattern the pattern
111 plength length in code units
112 utf TRUE if UTF
113 use_buffer where to put the output
114 use_length length of use_buffer
115 bufflenptr where to put the used length
116 dummyrun TRUE if a dummy run
117 ccontext the convert context
118
119 Returns: 0 => success
120 !0 => error code
121 */
122
123 static int
convert_posix(uint32_t pattype,PCRE2_SPTR pattern,PCRE2_SIZE plength,BOOL utf,PCRE2_UCHAR * use_buffer,PCRE2_SIZE use_length,PCRE2_SIZE * bufflenptr,BOOL dummyrun,pcre2_convert_context * ccontext)124 convert_posix(uint32_t pattype, PCRE2_SPTR pattern, PCRE2_SIZE plength,
125 BOOL utf, PCRE2_UCHAR *use_buffer, PCRE2_SIZE use_length,
126 PCRE2_SIZE *bufflenptr, BOOL dummyrun, pcre2_convert_context *ccontext)
127 {
128 char *s;
129 PCRE2_SPTR posix = pattern;
130 PCRE2_UCHAR *p = use_buffer;
131 PCRE2_UCHAR *pp = p;
132 PCRE2_UCHAR *endp = p + use_length - 1; /* Allow for trailing zero */
133 PCRE2_SIZE convlength = 0;
134
135 uint32_t bracount = 0;
136 uint32_t posix_state = POSIX_START_REGEX;
137 uint32_t lastspecial = 0;
138 BOOL extended = (pattype & PCRE2_CONVERT_POSIX_EXTENDED) != 0;
139 BOOL nextisliteral = FALSE;
140
141 (void)utf; /* Not used when Unicode not supported */
142 (void)ccontext; /* Not currently used */
143
144 /* Initialize default for error offset as end of input. */
145
146 *bufflenptr = plength;
147 PUTCHARS(STR_STAR_NUL);
148
149 /* Now scan the input. */
150
151 while (plength > 0)
152 {
153 uint32_t c, sc;
154 int clength = 1;
155
156 /* Add in the length of the last item, then, if in the dummy run, pull the
157 pointer back to the start of the (temporary) buffer and then remember the
158 start of the next item. */
159
160 convlength += p - pp;
161 if (dummyrun) p = use_buffer;
162 pp = p;
163
164 /* Pick up the next character */
165
166 #ifndef SUPPORT_UNICODE
167 c = *posix;
168 #else
169 GETCHARLENTEST(c, posix, clength);
170 #endif
171 posix += clength;
172 plength -= clength;
173
174 sc = nextisliteral? 0 : c;
175 nextisliteral = FALSE;
176
177 /* Handle a character within a class. */
178
179 if (posix_state >= POSIX_CLASS_NOT_STARTED)
180 {
181 if (c == CHAR_RIGHT_SQUARE_BRACKET)
182 {
183 PUTCHARS(STR_RIGHT_SQUARE_BRACKET);
184 posix_state = POSIX_NOT_BRACKET;
185 }
186
187 /* Not the end of the class */
188
189 else
190 {
191 switch (posix_state)
192 {
193 case POSIX_CLASS_STARTED:
194 if (c <= 127 && islower(c)) break; /* Remain in started state */
195 posix_state = POSIX_CLASS_NOT_STARTED;
196 if (c == CHAR_COLON && plength > 0 &&
197 *posix == CHAR_RIGHT_SQUARE_BRACKET)
198 {
199 PUTCHARS(STR_COLON_RIGHT_SQUARE_BRACKET);
200 plength--;
201 posix++;
202 continue; /* With next character after :] */
203 }
204 /* Fall through */
205
206 case POSIX_CLASS_NOT_STARTED:
207 if (c == CHAR_LEFT_SQUARE_BRACKET)
208 posix_state = POSIX_CLASS_STARTING;
209 break;
210
211 case POSIX_CLASS_STARTING:
212 if (c == CHAR_COLON) posix_state = POSIX_CLASS_STARTED;
213 break;
214 }
215
216 if (c == CHAR_BACKSLASH) PUTCHARS(STR_BACKSLASH);
217 if (p + clength > endp) return PCRE2_ERROR_NOMEMORY;
218 memcpy(p, posix - clength, CU2BYTES(clength));
219 p += clength;
220 }
221 }
222
223 /* Handle a character not within a class. */
224
225 else switch(sc)
226 {
227 case CHAR_LEFT_SQUARE_BRACKET:
228 PUTCHARS(STR_LEFT_SQUARE_BRACKET);
229
230 #ifdef NEVER
231 /* We could handle special cases [[:<:]] and [[:>:]] (which PCRE does
232 support) but they are not part of POSIX 1003.1. */
233
234 if (plength >= 6)
235 {
236 if (posix[0] == CHAR_LEFT_SQUARE_BRACKET &&
237 posix[1] == CHAR_COLON &&
238 (posix[2] == CHAR_LESS_THAN_SIGN ||
239 posix[2] == CHAR_GREATER_THAN_SIGN) &&
240 posix[3] == CHAR_COLON &&
241 posix[4] == CHAR_RIGHT_SQUARE_BRACKET &&
242 posix[5] == CHAR_RIGHT_SQUARE_BRACKET)
243 {
244 if (p + 6 > endp) return PCRE2_ERROR_NOMEMORY;
245 memcpy(p, posix, CU2BYTES(6));
246 p += 6;
247 posix += 6;
248 plength -= 6;
249 continue; /* With next character */
250 }
251 }
252 #endif
253
254 /* Handle start of "normal" character classes */
255
256 posix_state = POSIX_CLASS_NOT_STARTED;
257
258 /* Handle ^ and ] as first characters */
259
260 if (plength > 0)
261 {
262 if (*posix == CHAR_CIRCUMFLEX_ACCENT)
263 {
264 posix++;
265 plength--;
266 PUTCHARS(STR_CIRCUMFLEX_ACCENT);
267 }
268 if (plength > 0 && *posix == CHAR_RIGHT_SQUARE_BRACKET)
269 {
270 posix++;
271 plength--;
272 PUTCHARS(STR_RIGHT_SQUARE_BRACKET);
273 }
274 }
275 break;
276
277 case CHAR_BACKSLASH:
278 if (plength == 0) return PCRE2_ERROR_END_BACKSLASH;
279 if (extended) nextisliteral = TRUE; else
280 {
281 if (*posix < 127 && strchr(posix_meta_escapes, *posix) != NULL)
282 {
283 if (isdigit(*posix)) PUTCHARS(STR_BACKSLASH);
284 if (p + 1 > endp) return PCRE2_ERROR_NOMEMORY;
285 lastspecial = *p++ = *posix++;
286 plength--;
287 }
288 else nextisliteral = TRUE;
289 }
290 break;
291
292 case CHAR_RIGHT_PARENTHESIS:
293 if (!extended || bracount == 0) goto ESCAPE_LITERAL;
294 bracount--;
295 goto COPY_SPECIAL;
296
297 case CHAR_LEFT_PARENTHESIS:
298 bracount++;
299 /* Fall through */
300
301 case CHAR_QUESTION_MARK:
302 case CHAR_PLUS:
303 case CHAR_LEFT_CURLY_BRACKET:
304 case CHAR_RIGHT_CURLY_BRACKET:
305 case CHAR_VERTICAL_LINE:
306 if (!extended) goto ESCAPE_LITERAL;
307 /* Fall through */
308
309 case CHAR_DOT:
310 case CHAR_DOLLAR_SIGN:
311 posix_state = POSIX_NOT_BRACKET;
312 COPY_SPECIAL:
313 lastspecial = c;
314 if (p + 1 > endp) return PCRE2_ERROR_NOMEMORY;
315 *p++ = c;
316 break;
317
318 case CHAR_ASTERISK:
319 if (lastspecial != CHAR_ASTERISK)
320 {
321 if (!extended && (posix_state < POSIX_NOT_BRACKET ||
322 lastspecial == CHAR_LEFT_PARENTHESIS))
323 goto ESCAPE_LITERAL;
324 goto COPY_SPECIAL;
325 }
326 break; /* Ignore second and subsequent asterisks */
327
328 case CHAR_CIRCUMFLEX_ACCENT:
329 if (extended) goto COPY_SPECIAL;
330 if (posix_state == POSIX_START_REGEX ||
331 lastspecial == CHAR_LEFT_PARENTHESIS)
332 {
333 posix_state = POSIX_ANCHORED;
334 goto COPY_SPECIAL;
335 }
336 /* Fall through */
337
338 default:
339 if (c < 128 && strchr(pcre2_escaped_literals, c) != NULL)
340 {
341 ESCAPE_LITERAL:
342 PUTCHARS(STR_BACKSLASH);
343 }
344 lastspecial = 0xff; /* Indicates nothing special */
345 if (p + clength > endp) return PCRE2_ERROR_NOMEMORY;
346 memcpy(p, posix - clength, CU2BYTES(clength));
347 p += clength;
348 posix_state = POSIX_NOT_BRACKET;
349 break;
350 }
351 }
352
353 if (posix_state >= POSIX_CLASS_NOT_STARTED)
354 return PCRE2_ERROR_MISSING_SQUARE_BRACKET;
355 convlength += p - pp; /* Final segment */
356 *bufflenptr = convlength;
357 *p++ = 0;
358 return 0;
359 }
360
361
362 /*************************************************
363 * Convert a glob pattern *
364 *************************************************/
365
366 /* Context for writing the output into a buffer. */
367
368 typedef struct pcre2_output_context {
369 PCRE2_UCHAR *output; /* current output position */
370 PCRE2_SPTR output_end; /* output end */
371 PCRE2_SIZE output_size; /* size of the output */
372 uint8_t out_str[8]; /* string copied to the output */
373 } pcre2_output_context;
374
375
376 /* Write a character into the output.
377
378 Arguments:
379 out output context
380 chr the next character
381 */
382
383 static void
convert_glob_write(pcre2_output_context * out,PCRE2_UCHAR chr)384 convert_glob_write(pcre2_output_context *out, PCRE2_UCHAR chr)
385 {
386 out->output_size++;
387
388 if (out->output < out->output_end)
389 *out->output++ = chr;
390 }
391
392
393 /* Write a string into the output.
394
395 Arguments:
396 out output context
397 length length of out->out_str
398 */
399
400 static void
convert_glob_write_str(pcre2_output_context * out,PCRE2_SIZE length)401 convert_glob_write_str(pcre2_output_context *out, PCRE2_SIZE length)
402 {
403 uint8_t *out_str = out->out_str;
404 PCRE2_UCHAR *output = out->output;
405 PCRE2_SPTR output_end = out->output_end;
406 PCRE2_SIZE output_size = out->output_size;
407
408 do
409 {
410 output_size++;
411
412 if (output < output_end)
413 *output++ = *out_str++;
414 }
415 while (--length != 0);
416
417 out->output = output;
418 out->output_size = output_size;
419 }
420
421
422 /* Prints the separator into the output.
423
424 Arguments:
425 out output context
426 separator glob separator
427 with_escape backslash is needed before separator
428 */
429
430 static void
convert_glob_print_separator(pcre2_output_context * out,PCRE2_UCHAR separator,BOOL with_escape)431 convert_glob_print_separator(pcre2_output_context *out,
432 PCRE2_UCHAR separator, BOOL with_escape)
433 {
434 if (with_escape)
435 convert_glob_write(out, CHAR_BACKSLASH);
436
437 convert_glob_write(out, separator);
438 }
439
440
441 /* Prints a wildcard into the output.
442
443 Arguments:
444 out output context
445 separator glob separator
446 with_escape backslash is needed before separator
447 */
448
449 static void
convert_glob_print_wildcard(pcre2_output_context * out,PCRE2_UCHAR separator,BOOL with_escape)450 convert_glob_print_wildcard(pcre2_output_context *out,
451 PCRE2_UCHAR separator, BOOL with_escape)
452 {
453 out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET;
454 out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT;
455 convert_glob_write_str(out, 2);
456
457 convert_glob_print_separator(out, separator, with_escape);
458
459 convert_glob_write(out, CHAR_RIGHT_SQUARE_BRACKET);
460 }
461
462
463 /* Parse a posix class.
464
465 Arguments:
466 from starting point of scanning the range
467 pattern_end end of pattern
468 out output context
469
470 Returns: >0 => class index
471 0 => malformed class
472 */
473
474 static int
convert_glob_parse_class(PCRE2_SPTR * from,PCRE2_SPTR pattern_end,pcre2_output_context * out)475 convert_glob_parse_class(PCRE2_SPTR *from, PCRE2_SPTR pattern_end,
476 pcre2_output_context *out)
477 {
478 static const char *posix_classes = "alnum:alpha:ascii:blank:cntrl:digit:"
479 "graph:lower:print:punct:space:upper:word:xdigit:";
480 PCRE2_SPTR start = *from + 1;
481 PCRE2_SPTR pattern = start;
482 const char *class_ptr;
483 PCRE2_UCHAR c;
484 int class_index;
485
486 while (TRUE)
487 {
488 if (pattern >= pattern_end) return 0;
489
490 c = *pattern++;
491
492 if (c < CHAR_a || c > CHAR_z) break;
493 }
494
495 if (c != CHAR_COLON || pattern >= pattern_end ||
496 *pattern != CHAR_RIGHT_SQUARE_BRACKET)
497 return 0;
498
499 class_ptr = posix_classes;
500 class_index = 1;
501
502 while (TRUE)
503 {
504 if (*class_ptr == CHAR_NUL) return 0;
505
506 pattern = start;
507
508 while (*pattern == (PCRE2_UCHAR) *class_ptr)
509 {
510 if (*pattern == CHAR_COLON)
511 {
512 pattern += 2;
513 start -= 2;
514
515 do convert_glob_write(out, *start++); while (start < pattern);
516
517 *from = pattern;
518 return class_index;
519 }
520 pattern++;
521 class_ptr++;
522 }
523
524 while (*class_ptr != CHAR_COLON) class_ptr++;
525 class_ptr++;
526 class_index++;
527 }
528 }
529
530 /* Checks whether the character is in the class.
531
532 Arguments:
533 class_index class index
534 c character
535
536 Returns: !0 => character is found in the class
537 0 => otherwise
538 */
539
540 static BOOL
convert_glob_char_in_class(int class_index,PCRE2_UCHAR c)541 convert_glob_char_in_class(int class_index, PCRE2_UCHAR c)
542 {
543 #if PCRE2_CODE_UNIT_WIDTH != 8
544 if (c > 0xff)
545 {
546 /* ctype functions are not sane for c > 0xff */
547 return 0;
548 }
549 #endif
550
551 switch (class_index)
552 {
553 case 1: return isalnum(c);
554 case 2: return isalpha(c);
555 case 3: return 1;
556 case 4: return c == CHAR_HT || c == CHAR_SPACE;
557 case 5: return iscntrl(c);
558 case 6: return isdigit(c);
559 case 7: return isgraph(c);
560 case 8: return islower(c);
561 case 9: return isprint(c);
562 case 10: return ispunct(c);
563 case 11: return isspace(c);
564 case 12: return isupper(c);
565 case 13: return isalnum(c) || c == CHAR_UNDERSCORE;
566 default: return isxdigit(c);
567 }
568 }
569
570 /* Parse a range of characters.
571
572 Arguments:
573 from starting point of scanning the range
574 pattern_end end of pattern
575 out output context
576 separator glob separator
577 with_escape backslash is needed before separator
578
579 Returns: 0 => success
580 !0 => error code
581 */
582
583 static int
convert_glob_parse_range(PCRE2_SPTR * from,PCRE2_SPTR pattern_end,pcre2_output_context * out,BOOL utf,PCRE2_UCHAR separator,BOOL with_escape,PCRE2_UCHAR escape,BOOL no_wildsep)584 convert_glob_parse_range(PCRE2_SPTR *from, PCRE2_SPTR pattern_end,
585 pcre2_output_context *out, BOOL utf, PCRE2_UCHAR separator,
586 BOOL with_escape, PCRE2_UCHAR escape, BOOL no_wildsep)
587 {
588 BOOL is_negative = FALSE;
589 BOOL separator_seen = FALSE;
590 BOOL has_prev_c;
591 PCRE2_SPTR pattern = *from;
592 PCRE2_SPTR char_start = NULL;
593 uint32_t c, prev_c;
594 int len, class_index;
595
596 (void)utf; /* Avoid compiler warning. */
597
598 if (pattern >= pattern_end)
599 {
600 *from = pattern;
601 return PCRE2_ERROR_MISSING_SQUARE_BRACKET;
602 }
603
604 if (*pattern == CHAR_EXCLAMATION_MARK
605 || *pattern == CHAR_CIRCUMFLEX_ACCENT)
606 {
607 pattern++;
608
609 if (pattern >= pattern_end)
610 {
611 *from = pattern;
612 return PCRE2_ERROR_MISSING_SQUARE_BRACKET;
613 }
614
615 is_negative = TRUE;
616
617 out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET;
618 out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT;
619 len = 2;
620
621 if (!no_wildsep)
622 {
623 if (with_escape)
624 {
625 out->out_str[len] = CHAR_BACKSLASH;
626 len++;
627 }
628 out->out_str[len] = (uint8_t) separator;
629 }
630
631 convert_glob_write_str(out, len + 1);
632 }
633 else
634 convert_glob_write(out, CHAR_LEFT_SQUARE_BRACKET);
635
636 has_prev_c = FALSE;
637 prev_c = 0;
638
639 if (*pattern == CHAR_RIGHT_SQUARE_BRACKET)
640 {
641 out->out_str[0] = CHAR_BACKSLASH;
642 out->out_str[1] = CHAR_RIGHT_SQUARE_BRACKET;
643 convert_glob_write_str(out, 2);
644 has_prev_c = TRUE;
645 prev_c = CHAR_RIGHT_SQUARE_BRACKET;
646 pattern++;
647 }
648
649 while (pattern < pattern_end)
650 {
651 char_start = pattern;
652 GETCHARINCTEST(c, pattern);
653
654 if (c == CHAR_RIGHT_SQUARE_BRACKET)
655 {
656 convert_glob_write(out, c);
657
658 if (!is_negative && !no_wildsep && separator_seen)
659 {
660 out->out_str[0] = CHAR_LEFT_PARENTHESIS;
661 out->out_str[1] = CHAR_QUESTION_MARK;
662 out->out_str[2] = CHAR_LESS_THAN_SIGN;
663 out->out_str[3] = CHAR_EXCLAMATION_MARK;
664 convert_glob_write_str(out, 4);
665
666 convert_glob_print_separator(out, separator, with_escape);
667 convert_glob_write(out, CHAR_RIGHT_PARENTHESIS);
668 }
669
670 *from = pattern;
671 return 0;
672 }
673
674 if (pattern >= pattern_end) break;
675
676 if (c == CHAR_LEFT_SQUARE_BRACKET && *pattern == CHAR_COLON)
677 {
678 *from = pattern;
679 class_index = convert_glob_parse_class(from, pattern_end, out);
680
681 if (class_index != 0)
682 {
683 pattern = *from;
684
685 has_prev_c = FALSE;
686 prev_c = 0;
687
688 if (!is_negative &&
689 convert_glob_char_in_class (class_index, separator))
690 separator_seen = TRUE;
691 continue;
692 }
693 }
694 else if (c == CHAR_MINUS && has_prev_c &&
695 *pattern != CHAR_RIGHT_SQUARE_BRACKET)
696 {
697 convert_glob_write(out, CHAR_MINUS);
698
699 char_start = pattern;
700 GETCHARINCTEST(c, pattern);
701
702 if (pattern >= pattern_end) break;
703
704 if (escape != 0 && c == escape)
705 {
706 char_start = pattern;
707 GETCHARINCTEST(c, pattern);
708 }
709 else if (c == CHAR_LEFT_SQUARE_BRACKET && *pattern == CHAR_COLON)
710 {
711 *from = pattern;
712 return PCRE2_ERROR_CONVERT_SYNTAX;
713 }
714
715 if (prev_c > c)
716 {
717 *from = pattern;
718 return PCRE2_ERROR_CONVERT_SYNTAX;
719 }
720
721 if (prev_c < separator && separator < c) separator_seen = TRUE;
722
723 has_prev_c = FALSE;
724 prev_c = 0;
725 }
726 else
727 {
728 if (escape != 0 && c == escape)
729 {
730 char_start = pattern;
731 GETCHARINCTEST(c, pattern);
732
733 if (pattern >= pattern_end) break;
734 }
735
736 has_prev_c = TRUE;
737 prev_c = c;
738 }
739
740 if (c == CHAR_LEFT_SQUARE_BRACKET || c == CHAR_RIGHT_SQUARE_BRACKET ||
741 c == CHAR_BACKSLASH || c == CHAR_MINUS)
742 convert_glob_write(out, CHAR_BACKSLASH);
743
744 if (c == separator) separator_seen = TRUE;
745
746 do convert_glob_write(out, *char_start++); while (char_start < pattern);
747 }
748
749 *from = pattern;
750 return PCRE2_ERROR_MISSING_SQUARE_BRACKET;
751 }
752
753
754 /* Prints a (*COMMIT) into the output.
755
756 Arguments:
757 out output context
758 */
759
760 static void
convert_glob_print_commit(pcre2_output_context * out)761 convert_glob_print_commit(pcre2_output_context *out)
762 {
763 out->out_str[0] = CHAR_LEFT_PARENTHESIS;
764 out->out_str[1] = CHAR_ASTERISK;
765 out->out_str[2] = CHAR_C;
766 out->out_str[3] = CHAR_O;
767 out->out_str[4] = CHAR_M;
768 out->out_str[5] = CHAR_M;
769 out->out_str[6] = CHAR_I;
770 out->out_str[7] = CHAR_T;
771 convert_glob_write_str(out, 8);
772 convert_glob_write(out, CHAR_RIGHT_PARENTHESIS);
773 }
774
775
776 /* Bash glob converter.
777
778 Arguments:
779 pattype the pattern type
780 pattern the pattern
781 plength length in code units
782 utf TRUE if UTF
783 use_buffer where to put the output
784 use_length length of use_buffer
785 bufflenptr where to put the used length
786 dummyrun TRUE if a dummy run
787 ccontext the convert context
788
789 Returns: 0 => success
790 !0 => error code
791 */
792
793 static int
convert_glob(uint32_t options,PCRE2_SPTR pattern,PCRE2_SIZE plength,BOOL utf,PCRE2_UCHAR * use_buffer,PCRE2_SIZE use_length,PCRE2_SIZE * bufflenptr,BOOL dummyrun,pcre2_convert_context * ccontext)794 convert_glob(uint32_t options, PCRE2_SPTR pattern, PCRE2_SIZE plength,
795 BOOL utf, PCRE2_UCHAR *use_buffer, PCRE2_SIZE use_length,
796 PCRE2_SIZE *bufflenptr, BOOL dummyrun, pcre2_convert_context *ccontext)
797 {
798 pcre2_output_context out;
799 PCRE2_SPTR pattern_start = pattern;
800 PCRE2_SPTR pattern_end = pattern + plength;
801 PCRE2_UCHAR separator = ccontext->glob_separator;
802 PCRE2_UCHAR escape = ccontext->glob_escape;
803 PCRE2_UCHAR c;
804 BOOL no_wildsep = (options & PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR) != 0;
805 BOOL no_starstar = (options & PCRE2_CONVERT_GLOB_NO_STARSTAR) != 0;
806 BOOL in_atomic = FALSE;
807 BOOL after_starstar = FALSE;
808 BOOL no_slash_z = FALSE;
809 BOOL with_escape, is_start, after_separator;
810 int result = 0;
811
812 (void)utf; /* Avoid compiler warning. */
813
814 #ifdef SUPPORT_UNICODE
815 if (utf && (separator >= 128 || escape >= 128))
816 {
817 /* Currently only ASCII characters are supported. */
818 *bufflenptr = 0;
819 return PCRE2_ERROR_CONVERT_SYNTAX;
820 }
821 #endif
822
823 with_escape = strchr(pcre2_escaped_literals, separator) != NULL;
824
825 /* Initialize default for error offset as end of input. */
826 out.output = use_buffer;
827 out.output_end = use_buffer + use_length;
828 out.output_size = 0;
829
830 out.out_str[0] = CHAR_LEFT_PARENTHESIS;
831 out.out_str[1] = CHAR_QUESTION_MARK;
832 out.out_str[2] = CHAR_s;
833 out.out_str[3] = CHAR_RIGHT_PARENTHESIS;
834 convert_glob_write_str(&out, 4);
835
836 is_start = TRUE;
837
838 if (pattern < pattern_end && pattern[0] == CHAR_ASTERISK)
839 {
840 if (no_wildsep)
841 is_start = FALSE;
842 else if (!no_starstar && pattern + 1 < pattern_end &&
843 pattern[1] == CHAR_ASTERISK)
844 is_start = FALSE;
845 }
846
847 if (is_start)
848 {
849 out.out_str[0] = CHAR_BACKSLASH;
850 out.out_str[1] = CHAR_A;
851 convert_glob_write_str(&out, 2);
852 }
853
854 while (pattern < pattern_end)
855 {
856 c = *pattern++;
857
858 if (c == CHAR_ASTERISK)
859 {
860 is_start = pattern == pattern_start + 1;
861
862 if (in_atomic)
863 {
864 convert_glob_write(&out, CHAR_RIGHT_PARENTHESIS);
865 in_atomic = FALSE;
866 }
867
868 if (!no_starstar && pattern < pattern_end && *pattern == CHAR_ASTERISK)
869 {
870 after_separator = is_start || (pattern[-2] == separator);
871
872 do pattern++; while (pattern < pattern_end &&
873 *pattern == CHAR_ASTERISK);
874
875 if (pattern >= pattern_end)
876 {
877 no_slash_z = TRUE;
878 break;
879 }
880
881 after_starstar = TRUE;
882
883 if (after_separator && escape != 0 && *pattern == escape &&
884 pattern + 1 < pattern_end && pattern[1] == separator)
885 pattern++;
886
887 if (is_start)
888 {
889 if (*pattern != separator) continue;
890
891 out.out_str[0] = CHAR_LEFT_PARENTHESIS;
892 out.out_str[1] = CHAR_QUESTION_MARK;
893 out.out_str[2] = CHAR_COLON;
894 out.out_str[3] = CHAR_BACKSLASH;
895 out.out_str[4] = CHAR_A;
896 out.out_str[5] = CHAR_VERTICAL_LINE;
897 convert_glob_write_str(&out, 6);
898
899 convert_glob_print_separator(&out, separator, with_escape);
900 convert_glob_write(&out, CHAR_RIGHT_PARENTHESIS);
901
902 pattern++;
903 continue;
904 }
905
906 convert_glob_print_commit(&out);
907
908 if (!after_separator || *pattern != separator)
909 {
910 out.out_str[0] = CHAR_DOT;
911 out.out_str[1] = CHAR_ASTERISK;
912 out.out_str[2] = CHAR_QUESTION_MARK;
913 convert_glob_write_str(&out, 3);
914 continue;
915 }
916
917 out.out_str[0] = CHAR_LEFT_PARENTHESIS;
918 out.out_str[1] = CHAR_QUESTION_MARK;
919 out.out_str[2] = CHAR_COLON;
920 out.out_str[3] = CHAR_DOT;
921 out.out_str[4] = CHAR_ASTERISK;
922 out.out_str[5] = CHAR_QUESTION_MARK;
923
924 convert_glob_write_str(&out, 6);
925
926 convert_glob_print_separator(&out, separator, with_escape);
927
928 out.out_str[0] = CHAR_RIGHT_PARENTHESIS;
929 out.out_str[1] = CHAR_QUESTION_MARK;
930 out.out_str[2] = CHAR_QUESTION_MARK;
931 convert_glob_write_str(&out, 3);
932
933 pattern++;
934 continue;
935 }
936
937 if (pattern < pattern_end && *pattern == CHAR_ASTERISK)
938 {
939 do pattern++; while (pattern < pattern_end &&
940 *pattern == CHAR_ASTERISK);
941 }
942
943 if (no_wildsep)
944 {
945 if (pattern >= pattern_end)
946 {
947 no_slash_z = TRUE;
948 break;
949 }
950
951 /* Start check must be after the end check. */
952 if (is_start) continue;
953 }
954
955 if (!is_start)
956 {
957 if (after_starstar)
958 {
959 out.out_str[0] = CHAR_LEFT_PARENTHESIS;
960 out.out_str[1] = CHAR_QUESTION_MARK;
961 out.out_str[2] = CHAR_GREATER_THAN_SIGN;
962 convert_glob_write_str(&out, 3);
963 in_atomic = TRUE;
964 }
965 else
966 convert_glob_print_commit(&out);
967 }
968
969 if (no_wildsep)
970 convert_glob_write(&out, CHAR_DOT);
971 else
972 convert_glob_print_wildcard(&out, separator, with_escape);
973
974 out.out_str[0] = CHAR_ASTERISK;
975 out.out_str[1] = CHAR_QUESTION_MARK;
976 if (pattern >= pattern_end)
977 out.out_str[1] = CHAR_PLUS;
978 convert_glob_write_str(&out, 2);
979 continue;
980 }
981
982 if (c == CHAR_QUESTION_MARK)
983 {
984 if (no_wildsep)
985 convert_glob_write(&out, CHAR_DOT);
986 else
987 convert_glob_print_wildcard(&out, separator, with_escape);
988 continue;
989 }
990
991 if (c == CHAR_LEFT_SQUARE_BRACKET)
992 {
993 result = convert_glob_parse_range(&pattern, pattern_end,
994 &out, utf, separator, with_escape, escape, no_wildsep);
995 if (result != 0) break;
996 continue;
997 }
998
999 if (escape != 0 && c == escape)
1000 {
1001 if (pattern >= pattern_end)
1002 {
1003 result = PCRE2_ERROR_CONVERT_SYNTAX;
1004 break;
1005 }
1006 c = *pattern++;
1007 }
1008
1009 if (c < 128 && strchr(pcre2_escaped_literals, c) != NULL)
1010 convert_glob_write(&out, CHAR_BACKSLASH);
1011
1012 convert_glob_write(&out, c);
1013 }
1014
1015 if (result == 0)
1016 {
1017 if (!no_slash_z)
1018 {
1019 out.out_str[0] = CHAR_BACKSLASH;
1020 out.out_str[1] = CHAR_z;
1021 convert_glob_write_str(&out, 2);
1022 }
1023
1024 if (in_atomic)
1025 convert_glob_write(&out, CHAR_RIGHT_PARENTHESIS);
1026
1027 convert_glob_write(&out, CHAR_NUL);
1028
1029 if (!dummyrun && out.output_size != (PCRE2_SIZE) (out.output - use_buffer))
1030 result = PCRE2_ERROR_NOMEMORY;
1031 }
1032
1033 if (result != 0)
1034 {
1035 *bufflenptr = pattern - pattern_start;
1036 return result;
1037 }
1038
1039 *bufflenptr = out.output_size - 1;
1040 return 0;
1041 }
1042
1043
1044 /*************************************************
1045 * Convert pattern *
1046 *************************************************/
1047
1048 /* This is the external-facing function for converting other forms of pattern
1049 into PCRE2 regular expression patterns. On error, the bufflenptr argument is
1050 used to return an offset in the original pattern.
1051
1052 Arguments:
1053 pattern the input pattern
1054 plength length of input, or PCRE2_ZERO_TERMINATED
1055 options options bits
1056 buffptr pointer to pointer to output buffer
1057 bufflenptr pointer to length of output buffer
1058 ccontext convert context or NULL
1059
1060 Returns: 0 for success, else an error code (+ve or -ve)
1061 */
1062
1063 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_pattern_convert(PCRE2_SPTR pattern,PCRE2_SIZE plength,uint32_t options,PCRE2_UCHAR ** buffptr,PCRE2_SIZE * bufflenptr,pcre2_convert_context * ccontext)1064 pcre2_pattern_convert(PCRE2_SPTR pattern, PCRE2_SIZE plength, uint32_t options,
1065 PCRE2_UCHAR **buffptr, PCRE2_SIZE *bufflenptr,
1066 pcre2_convert_context *ccontext)
1067 {
1068 int i, rc;
1069 PCRE2_UCHAR dummy_buffer[DUMMY_BUFFER_SIZE];
1070 PCRE2_UCHAR *use_buffer = dummy_buffer;
1071 PCRE2_SIZE use_length = DUMMY_BUFFER_SIZE;
1072 BOOL utf = (options & PCRE2_CONVERT_UTF) != 0;
1073 uint32_t pattype = options & TYPE_OPTIONS;
1074
1075 if (pattern == NULL || bufflenptr == NULL) return PCRE2_ERROR_NULL;
1076
1077 if ((options & ~ALL_OPTIONS) != 0 || /* Undefined bit set */
1078 (pattype & (~pattype+1)) != pattype || /* More than one type set */
1079 pattype == 0) /* No type set */
1080 {
1081 *bufflenptr = 0; /* Error offset */
1082 return PCRE2_ERROR_BADOPTION;
1083 }
1084
1085 if (plength == PCRE2_ZERO_TERMINATED) plength = PRIV(strlen)(pattern);
1086 if (ccontext == NULL) ccontext =
1087 (pcre2_convert_context *)(&PRIV(default_convert_context));
1088
1089 /* Check UTF if required. */
1090
1091 #ifndef SUPPORT_UNICODE
1092 if (utf)
1093 {
1094 *bufflenptr = 0; /* Error offset */
1095 return PCRE2_ERROR_UNICODE_NOT_SUPPORTED;
1096 }
1097 #else
1098 if (utf && (options & PCRE2_CONVERT_NO_UTF_CHECK) == 0)
1099 {
1100 PCRE2_SIZE erroroffset;
1101 rc = PRIV(valid_utf)(pattern, plength, &erroroffset);
1102 if (rc != 0)
1103 {
1104 *bufflenptr = erroroffset;
1105 return rc;
1106 }
1107 }
1108 #endif
1109
1110 /* If buffptr is not NULL, and what it points to is not NULL, we are being
1111 provided with a buffer and a length, so set them as the buffer to use. */
1112
1113 if (buffptr != NULL && *buffptr != NULL)
1114 {
1115 use_buffer = *buffptr;
1116 use_length = *bufflenptr;
1117 }
1118
1119 /* Call an individual converter, either just once (if a buffer was provided or
1120 just the length is needed), or twice (if a memory allocation is required). */
1121
1122 for (i = 0; i < 2; i++)
1123 {
1124 PCRE2_UCHAR *allocated;
1125 BOOL dummyrun = buffptr == NULL || *buffptr == NULL;
1126
1127 switch(pattype)
1128 {
1129 case PCRE2_CONVERT_GLOB:
1130 rc = convert_glob(options & ~PCRE2_CONVERT_GLOB, pattern, plength, utf,
1131 use_buffer, use_length, bufflenptr, dummyrun, ccontext);
1132 break;
1133
1134 case PCRE2_CONVERT_POSIX_BASIC:
1135 case PCRE2_CONVERT_POSIX_EXTENDED:
1136 rc = convert_posix(pattype, pattern, plength, utf, use_buffer, use_length,
1137 bufflenptr, dummyrun, ccontext);
1138 break;
1139
1140 default:
1141 *bufflenptr = 0; /* Error offset */
1142 return PCRE2_ERROR_INTERNAL;
1143 }
1144
1145 if (rc != 0 || /* Error */
1146 buffptr == NULL || /* Just the length is required */
1147 *buffptr != NULL) /* Buffer was provided or allocated */
1148 return rc;
1149
1150 /* Allocate memory for the buffer, with hidden space for an allocator at
1151 the start. The next time round the loop runs the conversion for real. */
1152
1153 allocated = PRIV(memctl_malloc)(sizeof(pcre2_memctl) +
1154 (*bufflenptr + 1)*PCRE2_CODE_UNIT_WIDTH, (pcre2_memctl *)ccontext);
1155 if (allocated == NULL) return PCRE2_ERROR_NOMEMORY;
1156 *buffptr = (PCRE2_UCHAR *)(((char *)allocated) + sizeof(pcre2_memctl));
1157
1158 use_buffer = *buffptr;
1159 use_length = *bufflenptr + 1;
1160 }
1161
1162 /* Control should never get here. */
1163
1164 return PCRE2_ERROR_INTERNAL;
1165 }
1166
1167
1168 /*************************************************
1169 * Free converted pattern *
1170 *************************************************/
1171
1172 /* This frees a converted pattern that was put in newly-allocated memory.
1173
1174 Argument: the converted pattern
1175 Returns: nothing
1176 */
1177
1178 PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_converted_pattern_free(PCRE2_UCHAR * converted)1179 pcre2_converted_pattern_free(PCRE2_UCHAR *converted)
1180 {
1181 if (converted != NULL)
1182 {
1183 pcre2_memctl *memctl =
1184 (pcre2_memctl *)((char *)converted - sizeof(pcre2_memctl));
1185 memctl->free(memctl, memctl->memory_data);
1186 }
1187 }
1188
1189 /* End of pcre2_convert.c */
1190