xref: /PHP-7.2/ext/mbstring/oniguruma/src/regparse.c (revision 0d993573)
1 /**********************************************************************
2   regparse.c -  Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2017  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 #include "regparse.h"
30 #include "st.h"
31 
32 #ifdef DEBUG_NODE_FREE
33 #include <stdio.h>
34 #endif
35 
36 #define WARN_BUFSIZE    256
37 
38 #define CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
39 
40 
41 OnigSyntaxType OnigSyntaxRuby = {
42   (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
43      ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
44      ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_O_BRACE_OCTAL |
45      ONIG_SYN_OP_ESC_CONTROL_CHARS |
46      ONIG_SYN_OP_ESC_C_CONTROL )
47    & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
48   , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
49       ONIG_SYN_OP2_OPTION_RUBY |
50       ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
51       ONIG_SYN_OP2_ESC_G_SUBEXP_CALL |
52       ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY  |
53       ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
54       ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
55       ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
56       ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
57       ONIG_SYN_OP2_ESC_H_XDIGIT )
58   , ( SYN_GNU_REGEX_BV |
59       ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
60       ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
61       ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
62       ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
63       ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY |
64       ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
65       ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
66   , ONIG_OPTION_NONE
67   ,
68   {
69       (OnigCodePoint )'\\'                       /* esc */
70     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.'  */
71     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*'  */
72     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
73     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
74     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
75   }
76 };
77 
78 OnigSyntaxType*  OnigDefaultSyntax = ONIG_SYNTAX_RUBY;
79 
/* Warning callback that ignores its message; serves as the "no handler"
 * sentinel for the warning hooks below. */
extern void
onig_null_warn(const char *s ARG_UNUSED)
{
}
81 
82 #ifdef DEFAULT_WARN_FUNCTION
83 static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION;
84 #else
85 static OnigWarnFunc onig_warn = onig_null_warn;
86 #endif
87 
88 #ifdef DEFAULT_VERB_WARN_FUNCTION
89 static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION;
90 #else
91 static OnigWarnFunc onig_verb_warn = onig_null_warn;
92 #endif
93 
/* Install f as the callback that onig_warning() forwards to. */
extern void
onig_set_warn_func(OnigWarnFunc f)
{
  onig_warn = f;
}
98 
/* Install f as the verbose-warning callback. */
extern void
onig_set_verb_warn_func(OnigWarnFunc f)
{
  onig_verb_warn = f;
}
103 
104 extern void
onig_warning(const char * s)105 onig_warning(const char* s)
106 {
107   if (onig_warn == onig_null_warn) return ;
108 
109   (*onig_warn)(s);
110 }
111 
#define DEFAULT_MAX_CAPTURE_NUM   32767

/* Maximum number of capture groups; 0 disables the check performed in
 * scan_env_add_mem_entry(). */
static int MaxCaptureNum = DEFAULT_MAX_CAPTURE_NUM;

/*
 * Change the capture-group limit.
 * Returns 0 on success, or -1 (leaving the limit untouched) when num is
 * negative.
 */
extern int
onig_set_capture_num_limit(int num)
{
  if (num >= 0) {
    MaxCaptureNum = num;
    return 0;
  }

  return -1;
}
124 
125 static unsigned int ParseDepthLimit = DEFAULT_PARSE_DEPTH_LIMIT;
126 
127 extern unsigned int
onig_get_parse_depth_limit(void)128 onig_get_parse_depth_limit(void)
129 {
130   return ParseDepthLimit;
131 }
132 
133 extern int
onig_set_parse_depth_limit(unsigned int depth)134 onig_set_parse_depth_limit(unsigned int depth)
135 {
136   if (depth == 0)
137     ParseDepthLimit = DEFAULT_PARSE_DEPTH_LIMIT;
138   else
139     ParseDepthLimit = depth;
140   return 0;
141 }
142 
143 
144 static void
bbuf_free(BBuf * bbuf)145 bbuf_free(BBuf* bbuf)
146 {
147   if (IS_NOT_NULL(bbuf)) {
148     if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p);
149     xfree(bbuf);
150   }
151 }
152 
153 static int
bbuf_clone(BBuf ** rto,BBuf * from)154 bbuf_clone(BBuf** rto, BBuf* from)
155 {
156   int r;
157   BBuf *to;
158 
159   *rto = to = (BBuf* )xmalloc(sizeof(BBuf));
160   CHECK_NULL_RETURN_MEMERR(to);
161   r = BBUF_INIT(to, from->alloc);
162   if (r != 0) return r;
163   to->used = from->used;
164   xmemcpy(to->p, from->p, from->used);
165   return 0;
166 }
167 
/* Convert a relative back-reference number into an absolute group number,
 * counted from the groups seen so far in env. */
#define BACKREF_REL_TO_ABS(rel_no, env) \
  ((env)->num_mem + 1 + (rel_no))

/* Clear flag bits f in v when `negative` is true, set them otherwise.
 * The whole expansion is parenthesized so it is safe inside a larger
 * expression. */
#define ONOFF(v,f,negative)    ((negative) ? ((v) &= ~(f)) : ((v) |= (f)))
172 
/* First code point of the multi-byte range: 0x80 when the encoding's
 * minimum character length is one byte, 0 otherwise. */
#define MBCODE_START_POS(enc) \
  (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) <= 1 ? 0x80 : 0)

/* Add the range [MBCODE_START_POS(enc), max code point] to pbuf. */
#define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \
  add_code_range_to_buf(pbuf, MBCODE_START_POS(enc), ~((OnigCodePoint )0))

/* For encodings that are not single-byte, add the whole multi-byte range
 * to mbuf.  Uses the caller's local `r` and returns from the calling
 * function when the add fails. */
#define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\
  if (! ONIGENC_IS_SINGLEBYTE(enc)) {\
    r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\
    if (r != 0) return r;\
  }\
} while (0)
185 
186 
/* Set `empty` to 1 when every word of bitset bs is zero, else to 0. */
#define BITSET_IS_EMPTY(bs,empty) do {\
  int i = 0;\
  empty = 1;\
  while (i < (int )BITSET_SIZE) {\
    if ((bs)[i] != 0) {\
      empty = 0; break;\
    }\
    i++;\
  }\
} while (0)
196 
197 static void
bitset_set_range(BitSetRef bs,int from,int to)198 bitset_set_range(BitSetRef bs, int from, int to)
199 {
200   int i;
201   for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) {
202     BITSET_SET_BIT(bs, i);
203   }
204 }
205 
#if 0
/* Disabled helper: turns on every bit of bs. */
static void
bitset_set_all(BitSetRef bs)
{
  int word = 0;

  while (word < BITSET_SIZE) {
    bs[word] = ~((Bits )0);
    word++;
  }
}
#endif
214 
215 static void
bitset_invert(BitSetRef bs)216 bitset_invert(BitSetRef bs)
217 {
218   int i;
219   for (i = 0; i < (int )BITSET_SIZE; i++) { bs[i] = ~(bs[i]); }
220 }
221 
222 static void
bitset_invert_to(BitSetRef from,BitSetRef to)223 bitset_invert_to(BitSetRef from, BitSetRef to)
224 {
225   int i;
226   for (i = 0; i < (int )BITSET_SIZE; i++) { to[i] = ~(from[i]); }
227 }
228 
229 static void
bitset_and(BitSetRef dest,BitSetRef bs)230 bitset_and(BitSetRef dest, BitSetRef bs)
231 {
232   int i;
233   for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] &= bs[i]; }
234 }
235 
236 static void
bitset_or(BitSetRef dest,BitSetRef bs)237 bitset_or(BitSetRef dest, BitSetRef bs)
238 {
239   int i;
240   for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] |= bs[i]; }
241 }
242 
243 static void
bitset_copy(BitSetRef dest,BitSetRef bs)244 bitset_copy(BitSetRef dest, BitSetRef bs)
245 {
246   int i;
247   for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] = bs[i]; }
248 }
249 
250 extern int
onig_strncmp(const UChar * s1,const UChar * s2,int n)251 onig_strncmp(const UChar* s1, const UChar* s2, int n)
252 {
253   int x;
254 
255   while (n-- > 0) {
256     x = *s2++ - *s1++;
257     if (x) return x;
258   }
259   return 0;
260 }
261 
262 extern void
onig_strcpy(UChar * dest,const UChar * src,const UChar * end)263 onig_strcpy(UChar* dest, const UChar* src, const UChar* end)
264 {
265   int len = end - src;
266   if (len > 0) {
267     xmemcpy(dest, src, len);
268     dest[len] = (UChar )0;
269   }
270 }
271 
#ifdef USE_NAMED_GROUP
/*
 * Duplicate the bytes [s, end) into a fresh allocation, followed by
 * ONIGENC_MBC_MINLEN(enc) zero bytes as terminator.
 * Returns NULL when the allocation fails.
 */
static UChar*
strdup_with_null(OnigEncoding enc, UChar *s, UChar *end)
{
  int body_len, nul_len;
  UChar *copy, *tail, *stop;

  body_len = end - s;
  nul_len  = ONIGENC_MBC_MINLEN(enc);

  copy = (UChar* )xmalloc(body_len + nul_len);
  CHECK_NULL_RETURN(copy);
  xmemcpy(copy, s, body_len);

  stop = copy + body_len + nul_len;
  for (tail = copy + body_len; tail < stop; tail++) {
    *tail = (UChar )0;
  }

  return copy;
}
#endif
292 
/* scan pattern methods */
/*
 * These macros operate on locals of the enclosing function: `p` (current
 * position), `end` (end of pattern), `enc` (encoding) and, for the
 * variants that support PUNFETCH, `pfetch_prev` (declared by
 * PFETCH_READY).  Every macro that advances `p` clamps it to `end`, so a
 * character whose encoded length exceeds the remaining input cannot push
 * `p` past the buffer.
 */
#define PEND_VALUE   0

#define PFETCH_READY  UChar* pfetch_prev
#define PEND         (p < end ?  0 : 1)
#define PUNFETCH     p = pfetch_prev
/* Skip one character, remembering the previous position. */
#define PINC       do { \
  pfetch_prev = p; \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
  if(UNEXPECTED(p > end)) p = end; \
} while (0)
/* Read one character into c and advance, remembering the previous
 * position. */
#define PFETCH(c)  do { \
  c = ONIGENC_MBC_TO_CODE(enc, p, end); \
  pfetch_prev = p; \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
  if(UNEXPECTED(p > end)) p = end; \
} while (0)

/* "_S" variants: same as above but without saving pfetch_prev. */
#define PINC_S     do { \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
  if(UNEXPECTED(p > end)) p = end; \
} while (0)
#define PFETCH_S(c) do { \
  c = ONIGENC_MBC_TO_CODE(enc, p, end); \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
  if(UNEXPECTED(p > end)) p = end; \
} while (0)

/* Look at the next character without consuming it; PEND_VALUE at end. */
#define PPEEK        (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)
#define PPEEK_IS(c)  (PPEEK == (OnigCodePoint )(c))
322 
323 static UChar*
strcat_capa(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa)324 strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end,
325 	      int capa)
326 {
327   UChar* r;
328 
329   if (dest)
330     r = (UChar* )xrealloc(dest, capa + 1);
331   else
332     r = (UChar* )xmalloc(capa + 1);
333 
334   CHECK_NULL_RETURN(r);
335   onig_strcpy(r + (dest_end - dest), src, src_end);
336   return r;
337 }
338 
339 /* dest on static area */
340 static UChar*
strcat_capa_from_static(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa)341 strcat_capa_from_static(UChar* dest, UChar* dest_end,
342 			const UChar* src, const UChar* src_end, int capa)
343 {
344   UChar* r;
345 
346   r = (UChar* )xmalloc(capa + 1);
347   CHECK_NULL_RETURN(r);
348   onig_strcpy(r, dest, dest_end);
349   onig_strcpy(r + (dest_end - dest), src, src_end);
350   return r;
351 }
352 
353 
354 #ifdef USE_ST_LIBRARY
355 
356 typedef struct {
357   UChar* s;
358   UChar* end;
359 } st_str_end_key;
360 
361 static int
str_end_cmp(st_str_end_key * x,st_str_end_key * y)362 str_end_cmp(st_str_end_key* x, st_str_end_key* y)
363 {
364   UChar *p, *q;
365   int c;
366 
367   if ((x->end - x->s) != (y->end - y->s))
368     return 1;
369 
370   p = x->s;
371   q = y->s;
372   while (p < x->end) {
373     c = (int )*p - (int )*q;
374     if (c != 0) return c;
375 
376     p++; q++;
377   }
378 
379   return 0;
380 }
381 
382 static int
str_end_hash(st_str_end_key * x)383 str_end_hash(st_str_end_key* x)
384 {
385   UChar *p;
386   int val = 0;
387 
388   p = x->s;
389   while (p < x->end) {
390     val = val * 997 + (int )*p++;
391   }
392 
393   return val + (val >> 5);
394 }
395 
396 extern hash_table_type*
onig_st_init_strend_table_with_size(int size)397 onig_st_init_strend_table_with_size(int size)
398 {
399   static struct st_hash_type hashType = {
400     str_end_cmp,
401     str_end_hash,
402   };
403 
404   return (hash_table_type* )
405            onig_st_init_table_with_size(&hashType, size);
406 }
407 
408 extern int
onig_st_lookup_strend(hash_table_type * table,const UChar * str_key,const UChar * end_key,hash_data_type * value)409 onig_st_lookup_strend(hash_table_type* table, const UChar* str_key,
410 		      const UChar* end_key, hash_data_type *value)
411 {
412   st_str_end_key key;
413 
414   key.s   = (UChar* )str_key;
415   key.end = (UChar* )end_key;
416 
417   return onig_st_lookup(table, (st_data_t )(&key), value);
418 }
419 
420 extern int
onig_st_insert_strend(hash_table_type * table,const UChar * str_key,const UChar * end_key,hash_data_type value)421 onig_st_insert_strend(hash_table_type* table, const UChar* str_key,
422 		      const UChar* end_key, hash_data_type value)
423 {
424   st_str_end_key* key;
425   int result;
426 
427   key = (st_str_end_key* )xmalloc(sizeof(st_str_end_key));
428   key->s   = (UChar* )str_key;
429   key->end = (UChar* )end_key;
430   result = onig_st_insert(table, (st_data_t )key, value);
431   if (result) {
432     xfree(key);
433   }
434   return result;
435 }
436 
437 #endif /* USE_ST_LIBRARY */
438 
439 
440 #ifdef USE_NAMED_GROUP
441 
442 #define INIT_NAME_BACKREFS_ALLOC_NUM   8
443 
444 typedef struct {
445   UChar* name;
446   int    name_len;   /* byte length */
447   int    back_num;   /* number of backrefs */
448   int    back_alloc;
449   int    back_ref1;
450   int*   back_refs;
451 } NameEntry;
452 
453 #ifdef USE_ST_LIBRARY
454 
455 typedef st_table  NameTable;
456 typedef st_data_t HashDataType;   /* 1.6 st.h doesn't define st_data_t type */
457 
458 #define NAMEBUF_SIZE    24
459 #define NAMEBUF_SIZE_1  25
460 
461 #ifdef ONIG_DEBUG
462 static int
i_print_name_entry(UChar * key,NameEntry * e,void * arg)463 i_print_name_entry(UChar* key, NameEntry* e, void* arg)
464 {
465   int i;
466   FILE* fp = (FILE* )arg;
467 
468   fprintf(fp, "%s: ", e->name);
469   if (e->back_num == 0)
470     fputs("-", fp);
471   else if (e->back_num == 1)
472     fprintf(fp, "%d", e->back_ref1);
473   else {
474     for (i = 0; i < e->back_num; i++) {
475       if (i > 0) fprintf(fp, ", ");
476       fprintf(fp, "%d", e->back_refs[i]);
477     }
478   }
479   fputs("\n", fp);
480   return ST_CONTINUE;
481 }
482 
483 extern int
onig_print_names(FILE * fp,regex_t * reg)484 onig_print_names(FILE* fp, regex_t* reg)
485 {
486   NameTable* t = (NameTable* )reg->name_table;
487 
488   if (IS_NOT_NULL(t)) {
489     fprintf(fp, "name table\n");
490     onig_st_foreach(t, i_print_name_entry, (HashDataType )fp);
491     fputs("\n", fp);
492   }
493   return 0;
494 }
495 #endif /* ONIG_DEBUG */
496 
497 static int
i_free_name_entry(UChar * key,NameEntry * e,void * arg ARG_UNUSED)498 i_free_name_entry(UChar* key, NameEntry* e, void* arg ARG_UNUSED)
499 {
500   xfree(e->name);
501   if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
502   xfree(key);
503   xfree(e);
504   return ST_DELETE;
505 }
506 
507 static int
names_clear(regex_t * reg)508 names_clear(regex_t* reg)
509 {
510   NameTable* t = (NameTable* )reg->name_table;
511 
512   if (IS_NOT_NULL(t)) {
513     onig_st_foreach(t, i_free_name_entry, 0);
514   }
515   return 0;
516 }
517 
518 extern int
onig_names_free(regex_t * reg)519 onig_names_free(regex_t* reg)
520 {
521   int r;
522   NameTable* t;
523 
524   r = names_clear(reg);
525   if (r) return r;
526 
527   t = (NameTable* )reg->name_table;
528   if (IS_NOT_NULL(t)) onig_st_free_table(t);
529   reg->name_table = (void* )NULL;
530   return 0;
531 }
532 
533 static NameEntry*
name_find(regex_t * reg,const UChar * name,const UChar * name_end)534 name_find(regex_t* reg, const UChar* name, const UChar* name_end)
535 {
536   NameEntry* e;
537   NameTable* t = (NameTable* )reg->name_table;
538 
539   e = (NameEntry* )NULL;
540   if (IS_NOT_NULL(t)) {
541     onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e)));
542   }
543   return e;
544 }
545 
546 typedef struct {
547   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*);
548   regex_t* reg;
549   void* arg;
550   int ret;
551   OnigEncoding enc;
552 } INamesArg;
553 
554 static int
i_names(UChar * key ARG_UNUSED,NameEntry * e,INamesArg * arg)555 i_names(UChar* key ARG_UNUSED, NameEntry* e, INamesArg* arg)
556 {
557   int r = (*(arg->func))(e->name,
558                          e->name + e->name_len,
559                          e->back_num,
560                          (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
561                          arg->reg, arg->arg);
562   if (r != 0) {
563     arg->ret = r;
564     return ST_STOP;
565   }
566   return ST_CONTINUE;
567 }
568 
569 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)570 onig_foreach_name(regex_t* reg,
571   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
572 {
573   INamesArg narg;
574   NameTable* t = (NameTable* )reg->name_table;
575 
576   narg.ret = 0;
577   if (IS_NOT_NULL(t)) {
578     narg.func = func;
579     narg.reg  = reg;
580     narg.arg  = arg;
581     narg.enc  = reg->enc; /* should be pattern encoding. */
582     onig_st_foreach(t, i_names, (HashDataType )&narg);
583   }
584   return narg.ret;
585 }
586 
587 static int
i_renumber_name(UChar * key ARG_UNUSED,NameEntry * e,GroupNumRemap * map)588 i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumRemap* map)
589 {
590   int i;
591 
592   if (e->back_num > 1) {
593     for (i = 0; i < e->back_num; i++) {
594       e->back_refs[i] = map[e->back_refs[i]].new_val;
595     }
596   }
597   else if (e->back_num == 1) {
598     e->back_ref1 = map[e->back_ref1].new_val;
599   }
600 
601   return ST_CONTINUE;
602 }
603 
604 extern int
onig_renumber_name_table(regex_t * reg,GroupNumRemap * map)605 onig_renumber_name_table(regex_t* reg, GroupNumRemap* map)
606 {
607   NameTable* t = (NameTable* )reg->name_table;
608 
609   if (IS_NOT_NULL(t)) {
610     onig_st_foreach(t, i_renumber_name, (HashDataType )map);
611   }
612   return 0;
613 }
614 
615 
616 extern int
onig_number_of_names(regex_t * reg)617 onig_number_of_names(regex_t* reg)
618 {
619   NameTable* t = (NameTable* )reg->name_table;
620 
621   if (IS_NOT_NULL(t))
622     return t->num_entries;
623   else
624     return 0;
625 }
626 
627 #else  /* USE_ST_LIBRARY */
628 
629 #define INIT_NAMES_ALLOC_NUM    8
630 
631 typedef struct {
632   NameEntry* e;
633   int        num;
634   int        alloc;
635 } NameTable;
636 
637 #ifdef ONIG_DEBUG
638 extern int
onig_print_names(FILE * fp,regex_t * reg)639 onig_print_names(FILE* fp, regex_t* reg)
640 {
641   int i, j;
642   NameEntry* e;
643   NameTable* t = (NameTable* )reg->name_table;
644 
645   if (IS_NOT_NULL(t) && t->num > 0) {
646     fprintf(fp, "name table\n");
647     for (i = 0; i < t->num; i++) {
648       e = &(t->e[i]);
649       fprintf(fp, "%s: ", e->name);
650       if (e->back_num == 0) {
651         fputs("-", fp);
652       }
653       else if (e->back_num == 1) {
654         fprintf(fp, "%d", e->back_ref1);
655       }
656       else {
657         for (j = 0; j < e->back_num; j++) {
658           if (j > 0) fprintf(fp, ", ");
659           fprintf(fp, "%d", e->back_refs[j]);
660         }
661       }
662       fputs("\n", fp);
663     }
664     fputs("\n", fp);
665   }
666   return 0;
667 }
668 #endif
669 
670 static int
names_clear(regex_t * reg)671 names_clear(regex_t* reg)
672 {
673   int i;
674   NameEntry* e;
675   NameTable* t = (NameTable* )reg->name_table;
676 
677   if (IS_NOT_NULL(t)) {
678     for (i = 0; i < t->num; i++) {
679       e = &(t->e[i]);
680       if (IS_NOT_NULL(e->name)) {
681         xfree(e->name);
682         e->name       = NULL;
683         e->name_len   = 0;
684         e->back_num   = 0;
685         e->back_alloc = 0;
686         if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
687         e->back_refs = (int* )NULL;
688       }
689     }
690     if (IS_NOT_NULL(t->e)) {
691       xfree(t->e);
692       t->e = NULL;
693     }
694     t->num = 0;
695   }
696   return 0;
697 }
698 
699 extern int
onig_names_free(regex_t * reg)700 onig_names_free(regex_t* reg)
701 {
702   int r;
703   NameTable* t;
704 
705   r = names_clear(reg);
706   if (r) return r;
707 
708   t = (NameTable* )reg->name_table;
709   if (IS_NOT_NULL(t)) xfree(t);
710   reg->name_table = NULL;
711   return 0;
712 }
713 
714 static NameEntry*
name_find(regex_t * reg,UChar * name,UChar * name_end)715 name_find(regex_t* reg, UChar* name, UChar* name_end)
716 {
717   int i, len;
718   NameEntry* e;
719   NameTable* t = (NameTable* )reg->name_table;
720 
721   if (IS_NOT_NULL(t)) {
722     len = name_end - name;
723     for (i = 0; i < t->num; i++) {
724       e = &(t->e[i]);
725       if (len == e->name_len && onig_strncmp(name, e->name, len) == 0)
726         return e;
727     }
728   }
729   return (NameEntry* )NULL;
730 }
731 
732 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)733 onig_foreach_name(regex_t* reg,
734   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
735 {
736   int i, r;
737   NameEntry* e;
738   NameTable* t = (NameTable* )reg->name_table;
739 
740   if (IS_NOT_NULL(t)) {
741     for (i = 0; i < t->num; i++) {
742       e = &(t->e[i]);
743       r = (*func)(e->name, e->name + e->name_len, e->back_num,
744                   (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
745                   reg, arg);
746       if (r != 0) return r;
747     }
748   }
749   return 0;
750 }
751 
752 extern int
onig_number_of_names(regex_t * reg)753 onig_number_of_names(regex_t* reg)
754 {
755   NameTable* t = (NameTable* )reg->name_table;
756 
757   if (IS_NOT_NULL(t))
758     return t->num;
759   else
760     return 0;
761 }
762 
763 #endif /* else USE_ST_LIBRARY */
764 
765 static int
name_add(regex_t * reg,UChar * name,UChar * name_end,int backref,ScanEnv * env)766 name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env)
767 {
768   int alloc;
769   NameEntry* e;
770   NameTable* t = (NameTable* )reg->name_table;
771 
772   if (name_end - name <= 0)
773     return ONIGERR_EMPTY_GROUP_NAME;
774 
775   e = name_find(reg, name, name_end);
776   if (IS_NULL(e)) {
777 #ifdef USE_ST_LIBRARY
778     if (IS_NULL(t)) {
779       t = onig_st_init_strend_table_with_size(5);
780       reg->name_table = (void* )t;
781     }
782     e = (NameEntry* )xmalloc(sizeof(NameEntry));
783     CHECK_NULL_RETURN_MEMERR(e);
784 
785     e->name = strdup_with_null(reg->enc, name, name_end);
786     if (IS_NULL(e->name)) {
787       xfree(e);  return ONIGERR_MEMORY;
788     }
789     onig_st_insert_strend(t, e->name, (e->name + (name_end - name)),
790                           (HashDataType )e);
791 
792     e->name_len   = name_end - name;
793     e->back_num   = 0;
794     e->back_alloc = 0;
795     e->back_refs  = (int* )NULL;
796 
797 #else
798 
799     if (IS_NULL(t)) {
800       alloc = INIT_NAMES_ALLOC_NUM;
801       t = (NameTable* )xmalloc(sizeof(NameTable));
802       CHECK_NULL_RETURN_MEMERR(t);
803       t->e     = NULL;
804       t->alloc = 0;
805       t->num   = 0;
806 
807       t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc);
808       if (IS_NULL(t->e)) {
809         xfree(t);
810         return ONIGERR_MEMORY;
811       }
812       t->alloc = alloc;
813       reg->name_table = t;
814       goto clear;
815     }
816     else if (t->num == t->alloc) {
817       int i;
818 
819       alloc = t->alloc * 2;
820       t->e = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc);
821       CHECK_NULL_RETURN_MEMERR(t->e);
822       t->alloc = alloc;
823 
824     clear:
825       for (i = t->num; i < t->alloc; i++) {
826         t->e[i].name       = NULL;
827         t->e[i].name_len   = 0;
828         t->e[i].back_num   = 0;
829         t->e[i].back_alloc = 0;
830         t->e[i].back_refs  = (int* )NULL;
831       }
832     }
833     e = &(t->e[t->num]);
834     t->num++;
835     e->name = strdup_with_null(reg->enc, name, name_end);
836     if (IS_NULL(e->name)) return ONIGERR_MEMORY;
837     e->name_len = name_end - name;
838 #endif
839   }
840 
841   if (e->back_num >= 1 &&
842       ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) {
843     onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME,
844                                    name, name_end);
845     return ONIGERR_MULTIPLEX_DEFINED_NAME;
846   }
847 
848   e->back_num++;
849   if (e->back_num == 1) {
850     e->back_ref1 = backref;
851   }
852   else {
853     if (e->back_num == 2) {
854       alloc = INIT_NAME_BACKREFS_ALLOC_NUM;
855       e->back_refs = (int* )xmalloc(sizeof(int) * alloc);
856       CHECK_NULL_RETURN_MEMERR(e->back_refs);
857       e->back_alloc = alloc;
858       e->back_refs[0] = e->back_ref1;
859       e->back_refs[1] = backref;
860     }
861     else {
862       if (e->back_num > e->back_alloc) {
863         alloc = e->back_alloc * 2;
864         e->back_refs = (int* )xrealloc(e->back_refs, sizeof(int) * alloc);
865         CHECK_NULL_RETURN_MEMERR(e->back_refs);
866         e->back_alloc = alloc;
867       }
868       e->back_refs[e->back_num - 1] = backref;
869     }
870   }
871 
872   return 0;
873 }
874 
875 extern int
onig_name_to_group_numbers(regex_t * reg,const UChar * name,const UChar * name_end,int ** nums)876 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
877 			   const UChar* name_end, int** nums)
878 {
879   NameEntry* e = name_find(reg, name, name_end);
880 
881   if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE;
882 
883   switch (e->back_num) {
884   case 0:
885     break;
886   case 1:
887     *nums = &(e->back_ref1);
888     break;
889   default:
890     *nums = e->back_refs;
891     break;
892   }
893   return e->back_num;
894 }
895 
896 extern int
onig_name_to_backref_number(regex_t * reg,const UChar * name,const UChar * name_end,OnigRegion * region)897 onig_name_to_backref_number(regex_t* reg, const UChar* name,
898 			    const UChar* name_end, OnigRegion *region)
899 {
900   int i, n, *nums;
901 
902   n = onig_name_to_group_numbers(reg, name, name_end, &nums);
903   if (n < 0)
904     return n;
905   else if (n == 0)
906     return ONIGERR_PARSER_BUG;
907   else if (n == 1)
908     return nums[0];
909   else {
910     if (IS_NOT_NULL(region)) {
911       for (i = n - 1; i >= 0; i--) {
912         if (region->beg[nums[i]] != ONIG_REGION_NOTPOS)
913           return nums[i];
914       }
915     }
916     return nums[n - 1];
917   }
918 }
919 
920 #else /* USE_NAMED_GROUP */
921 
922 extern int
onig_name_to_group_numbers(regex_t * reg,const UChar * name,const UChar * name_end,int ** nums)923 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
924 			   const UChar* name_end, int** nums)
925 {
926   return ONIG_NO_SUPPORT_CONFIG;
927 }
928 
929 extern int
onig_name_to_backref_number(regex_t * reg,const UChar * name,const UChar * name_end,OnigRegion * region)930 onig_name_to_backref_number(regex_t* reg, const UChar* name,
931 			    const UChar* name_end, OnigRegion* region)
932 {
933   return ONIG_NO_SUPPORT_CONFIG;
934 }
935 
936 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)937 onig_foreach_name(regex_t* reg,
938   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
939 {
940   return ONIG_NO_SUPPORT_CONFIG;
941 }
942 
943 extern int
onig_number_of_names(regex_t * reg)944 onig_number_of_names(regex_t* reg)
945 {
946   return 0;
947 }
948 #endif /* else USE_NAMED_GROUP */
949 
950 extern int
onig_noname_group_capture_is_active(regex_t * reg)951 onig_noname_group_capture_is_active(regex_t* reg)
952 {
953   if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_DONT_CAPTURE_GROUP))
954     return 0;
955 
956 #ifdef USE_NAMED_GROUP
957   if (onig_number_of_names(reg) > 0 &&
958       IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
959       !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
960     return 0;
961   }
962 #endif
963 
964   return 1;
965 }
966 
967 
968 #define INIT_SCANENV_MEMNODES_ALLOC_SIZE   16
969 
970 static void
scan_env_clear(ScanEnv * env)971 scan_env_clear(ScanEnv* env)
972 {
973   int i;
974 
975   BIT_STATUS_CLEAR(env->capture_history);
976   BIT_STATUS_CLEAR(env->bt_mem_start);
977   BIT_STATUS_CLEAR(env->bt_mem_end);
978   BIT_STATUS_CLEAR(env->backrefed_mem);
979   env->error      = (UChar* )NULL;
980   env->error_end  = (UChar* )NULL;
981   env->num_call   = 0;
982   env->num_mem    = 0;
983 #ifdef USE_NAMED_GROUP
984   env->num_named  = 0;
985 #endif
986   env->mem_alloc         = 0;
987   env->mem_nodes_dynamic = (Node** )NULL;
988 
989   for (i = 0; i < SCANENV_MEMNODES_SIZE; i++)
990     env->mem_nodes_static[i] = NULL_NODE;
991 
992 #ifdef USE_COMBINATION_EXPLOSION_CHECK
993   env->num_comb_exp_check  = 0;
994   env->comb_exp_max_regnum = 0;
995   env->curr_max_regnum     = 0;
996   env->has_recursion       = 0;
997 #endif
998   env->parse_depth         = 0;
999 }
1000 
1001 static int
scan_env_add_mem_entry(ScanEnv * env)1002 scan_env_add_mem_entry(ScanEnv* env)
1003 {
1004   int i, need, alloc;
1005   Node** p;
1006 
1007   need = env->num_mem + 1;
1008   if (need > MaxCaptureNum && MaxCaptureNum != 0)
1009     return ONIGERR_TOO_MANY_CAPTURES;
1010 
1011   if (need >= SCANENV_MEMNODES_SIZE) {
1012     if (env->mem_alloc <= need) {
1013       if (IS_NULL(env->mem_nodes_dynamic)) {
1014         alloc = INIT_SCANENV_MEMNODES_ALLOC_SIZE;
1015         p = (Node** )xmalloc(sizeof(Node*) * alloc);
1016         xmemcpy(p, env->mem_nodes_static,
1017                 sizeof(Node*) * SCANENV_MEMNODES_SIZE);
1018       }
1019       else {
1020         alloc = env->mem_alloc * 2;
1021         p = (Node** )xrealloc(env->mem_nodes_dynamic, sizeof(Node*) * alloc);
1022       }
1023       CHECK_NULL_RETURN_MEMERR(p);
1024 
1025       for (i = env->num_mem + 1; i < alloc; i++)
1026         p[i] = NULL_NODE;
1027 
1028       env->mem_nodes_dynamic = p;
1029       env->mem_alloc = alloc;
1030     }
1031   }
1032 
1033   env->num_mem++;
1034   return env->num_mem;
1035 }
1036 
1037 static int
scan_env_set_mem_node(ScanEnv * env,int num,Node * node)1038 scan_env_set_mem_node(ScanEnv* env, int num, Node* node)
1039 {
1040   if (env->num_mem >= num)
1041     SCANENV_MEM_NODES(env)[num] = node;
1042   else
1043     return ONIGERR_PARSER_BUG;
1044   return 0;
1045 }
1046 
1047 extern void
onig_node_free(Node * node)1048 onig_node_free(Node* node)
1049 {
1050  start:
1051   if (IS_NULL(node)) return ;
1052 
1053 #ifdef DEBUG_NODE_FREE
1054   fprintf(stderr, "onig_node_free: %p\n", node);
1055 #endif
1056 
1057   switch (NTYPE(node)) {
1058   case NT_STR:
1059     if (NSTR(node)->capa != 0 &&
1060         IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
1061       xfree(NSTR(node)->s);
1062     }
1063     break;
1064 
1065   case NT_LIST:
1066   case NT_ALT:
1067     onig_node_free(NCAR(node));
1068     {
1069       Node* next_node = NCDR(node);
1070 
1071       xfree(node);
1072       node = next_node;
1073       goto start;
1074     }
1075     break;
1076 
1077   case NT_CCLASS:
1078     {
1079       CClassNode* cc = NCCLASS(node);
1080 
1081       if (IS_NCCLASS_SHARE(cc)) return ;
1082       if (cc->mbuf)
1083         bbuf_free(cc->mbuf);
1084     }
1085     break;
1086 
1087   case NT_QTFR:
1088     if (NQTFR(node)->target)
1089       onig_node_free(NQTFR(node)->target);
1090     break;
1091 
1092   case NT_ENCLOSE:
1093     if (NENCLOSE(node)->target)
1094       onig_node_free(NENCLOSE(node)->target);
1095     break;
1096 
1097   case NT_BREF:
1098     if (IS_NOT_NULL(NBREF(node)->back_dynamic))
1099       xfree(NBREF(node)->back_dynamic);
1100     break;
1101 
1102   case NT_ANCHOR:
1103     if (NANCHOR(node)->target)
1104       onig_node_free(NANCHOR(node)->target);
1105     break;
1106   }
1107 
1108   xfree(node);
1109 }
1110 
1111 static Node*
node_new(void)1112 node_new(void)
1113 {
1114   Node* node;
1115 
1116   node = (Node* )xmalloc(sizeof(Node));
1117   /* xmemset(node, 0, sizeof(Node)); */
1118 #ifdef DEBUG_NODE_FREE
1119   fprintf(stderr, "node_new: %p\n", node);
1120 #endif
1121   return node;
1122 }
1123 
1124 
1125 static void
initialize_cclass(CClassNode * cc)1126 initialize_cclass(CClassNode* cc)
1127 {
1128   BITSET_CLEAR(cc->bs);
1129   /* cc->base.flags = 0; */
1130   cc->flags = 0;
1131   cc->mbuf  = NULL;
1132 }
1133 
1134 static Node*
node_new_cclass(void)1135 node_new_cclass(void)
1136 {
1137   Node* node = node_new();
1138   CHECK_NULL_RETURN(node);
1139 
1140   SET_NTYPE(node, NT_CCLASS);
1141   initialize_cclass(NCCLASS(node));
1142   return node;
1143 }
1144 
1145 static Node*
node_new_ctype(int type,int not)1146 node_new_ctype(int type, int not)
1147 {
1148   Node* node = node_new();
1149   CHECK_NULL_RETURN(node);
1150 
1151   SET_NTYPE(node, NT_CTYPE);
1152   NCTYPE(node)->ctype = type;
1153   NCTYPE(node)->not   = not;
1154   return node;
1155 }
1156 
1157 static Node*
node_new_anychar(void)1158 node_new_anychar(void)
1159 {
1160   Node* node = node_new();
1161   CHECK_NULL_RETURN(node);
1162 
1163   SET_NTYPE(node, NT_CANY);
1164   return node;
1165 }
1166 
1167 static Node*
node_new_list(Node * left,Node * right)1168 node_new_list(Node* left, Node* right)
1169 {
1170   Node* node = node_new();
1171   CHECK_NULL_RETURN(node);
1172 
1173   SET_NTYPE(node, NT_LIST);
1174   NCAR(node)  = left;
1175   NCDR(node) = right;
1176   return node;
1177 }
1178 
1179 extern Node*
onig_node_new_list(Node * left,Node * right)1180 onig_node_new_list(Node* left, Node* right)
1181 {
1182   return node_new_list(left, right);
1183 }
1184 
1185 extern Node*
onig_node_list_add(Node * list,Node * x)1186 onig_node_list_add(Node* list, Node* x)
1187 {
1188   Node *n;
1189 
1190   n = onig_node_new_list(x, NULL);
1191   if (IS_NULL(n)) return NULL_NODE;
1192 
1193   if (IS_NOT_NULL(list)) {
1194     while (IS_NOT_NULL(NCDR(list)))
1195       list = NCDR(list);
1196 
1197     NCDR(list) = n;
1198   }
1199 
1200   return n;
1201 }
1202 
1203 extern Node*
onig_node_new_alt(Node * left,Node * right)1204 onig_node_new_alt(Node* left, Node* right)
1205 {
1206   Node* node = node_new();
1207   CHECK_NULL_RETURN(node);
1208 
1209   SET_NTYPE(node, NT_ALT);
1210   NCAR(node)  = left;
1211   NCDR(node) = right;
1212   return node;
1213 }
1214 
1215 extern Node*
onig_node_new_anchor(int type)1216 onig_node_new_anchor(int type)
1217 {
1218   Node* node = node_new();
1219   CHECK_NULL_RETURN(node);
1220 
1221   SET_NTYPE(node, NT_ANCHOR);
1222   NANCHOR(node)->type     = type;
1223   NANCHOR(node)->target   = NULL;
1224   NANCHOR(node)->char_len = -1;
1225   return node;
1226 }
1227 
1228 static Node*
node_new_backref(int back_num,int * backrefs,int by_name,int exist_level,int nest_level,ScanEnv * env)1229 node_new_backref(int back_num, int* backrefs, int by_name,
1230 #ifdef USE_BACKREF_WITH_LEVEL
1231 		 int exist_level, int nest_level,
1232 #endif
1233 		 ScanEnv* env)
1234 {
1235   int i;
1236   Node* node = node_new();
1237 
1238   CHECK_NULL_RETURN(node);
1239 
1240   SET_NTYPE(node, NT_BREF);
1241   NBREF(node)->state    = 0;
1242   NBREF(node)->back_num = back_num;
1243   NBREF(node)->back_dynamic = (int* )NULL;
1244   if (by_name != 0)
1245     NBREF(node)->state |= NST_NAME_REF;
1246 
1247 #ifdef USE_BACKREF_WITH_LEVEL
1248   if (exist_level != 0) {
1249     NBREF(node)->state |= NST_NEST_LEVEL;
1250     NBREF(node)->nest_level  = nest_level;
1251   }
1252 #endif
1253 
1254   for (i = 0; i < back_num; i++) {
1255     if (backrefs[i] <= env->num_mem &&
1256         IS_NULL(SCANENV_MEM_NODES(env)[backrefs[i]])) {
1257       NBREF(node)->state |= NST_RECURSION;   /* /...(\1).../ */
1258       break;
1259     }
1260   }
1261 
1262   if (back_num <= NODE_BACKREFS_SIZE) {
1263     for (i = 0; i < back_num; i++)
1264       NBREF(node)->back_static[i] = backrefs[i];
1265   }
1266   else {
1267     int* p = (int* )xmalloc(sizeof(int) * back_num);
1268     if (IS_NULL(p)) {
1269       onig_node_free(node);
1270       return NULL;
1271     }
1272     NBREF(node)->back_dynamic = p;
1273     for (i = 0; i < back_num; i++)
1274       p[i] = backrefs[i];
1275   }
1276   return node;
1277 }
1278 
#ifdef USE_SUBEXP_CALL
/*
 * Create an NT_CALL node for the name range [name, name_end) and group
 * number `gnum` (call by number if gnum != 0).  The target is left
 * unresolved (NULL_NODE).  Returns NULL when node_new() yields NULL.
 */
static Node*
node_new_call(UChar* name, UChar* name_end, int gnum)
{
  Node* call = node_new();

  if (IS_NOT_NULL(call)) {
    SET_NTYPE(call, NT_CALL);
    NCALL(call)->group_num = gnum;
    NCALL(call)->name_end  = name_end;
    NCALL(call)->name      = name;
    NCALL(call)->target    = NULL_NODE;
    NCALL(call)->state     = 0;
  }
  return call;
}
#endif
1295 
1296 static Node*
node_new_quantifier(int lower,int upper,int by_number)1297 node_new_quantifier(int lower, int upper, int by_number)
1298 {
1299   Node* node = node_new();
1300   CHECK_NULL_RETURN(node);
1301 
1302   SET_NTYPE(node, NT_QTFR);
1303   NQTFR(node)->state  = 0;
1304   NQTFR(node)->target = NULL;
1305   NQTFR(node)->lower  = lower;
1306   NQTFR(node)->upper  = upper;
1307   NQTFR(node)->greedy = 1;
1308   NQTFR(node)->target_empty_info = NQ_TARGET_ISNOT_EMPTY;
1309   NQTFR(node)->head_exact        = NULL_NODE;
1310   NQTFR(node)->next_head_exact   = NULL_NODE;
1311   NQTFR(node)->is_refered        = 0;
1312   if (by_number != 0)
1313     NQTFR(node)->state |= NST_BY_NUMBER;
1314 
1315 #ifdef USE_COMBINATION_EXPLOSION_CHECK
1316   NQTFR(node)->comb_exp_check_num = 0;
1317 #endif
1318 
1319   return node;
1320 }
1321 
1322 static Node*
node_new_enclose(int type)1323 node_new_enclose(int type)
1324 {
1325   Node* node = node_new();
1326   CHECK_NULL_RETURN(node);
1327 
1328   SET_NTYPE(node, NT_ENCLOSE);
1329   NENCLOSE(node)->type      = type;
1330   NENCLOSE(node)->state     =  0;
1331   NENCLOSE(node)->regnum    =  0;
1332   NENCLOSE(node)->option    =  0;
1333   NENCLOSE(node)->target    = NULL;
1334   NENCLOSE(node)->call_addr = -1;
1335   NENCLOSE(node)->opt_count =  0;
1336   return node;
1337 }
1338 
1339 extern Node*
onig_node_new_enclose(int type)1340 onig_node_new_enclose(int type)
1341 {
1342   return node_new_enclose(type);
1343 }
1344 
1345 static Node*
node_new_enclose_memory(OnigOptionType option,int is_named)1346 node_new_enclose_memory(OnigOptionType option, int is_named)
1347 {
1348   Node* node = node_new_enclose(ENCLOSE_MEMORY);
1349   CHECK_NULL_RETURN(node);
1350   if (is_named != 0)
1351     SET_ENCLOSE_STATUS(node, NST_NAMED_GROUP);
1352 
1353 #ifdef USE_SUBEXP_CALL
1354   NENCLOSE(node)->option = option;
1355 #endif
1356   return node;
1357 }
1358 
1359 static Node*
node_new_option(OnigOptionType option)1360 node_new_option(OnigOptionType option)
1361 {
1362   Node* node = node_new_enclose(ENCLOSE_OPTION);
1363   CHECK_NULL_RETURN(node);
1364   NENCLOSE(node)->option = option;
1365   return node;
1366 }
1367 
1368 extern int
onig_node_str_cat(Node * node,const UChar * s,const UChar * end)1369 onig_node_str_cat(Node* node, const UChar* s, const UChar* end)
1370 {
1371   int addlen = end - s;
1372 
1373   if (addlen > 0) {
1374     int len  = NSTR(node)->end - NSTR(node)->s;
1375 
1376     if (NSTR(node)->capa > 0 || (len + addlen > NODE_STR_BUF_SIZE - 1)) {
1377       UChar* p;
1378       int capa = len + addlen + NODE_STR_MARGIN;
1379 
1380       if (capa <= NSTR(node)->capa) {
1381         onig_strcpy(NSTR(node)->s + len, s, end);
1382       }
1383       else {
1384         if (NSTR(node)->s == NSTR(node)->buf)
1385           p = strcat_capa_from_static(NSTR(node)->s, NSTR(node)->end,
1386                                       s, end, capa);
1387         else
1388           p = strcat_capa(NSTR(node)->s, NSTR(node)->end, s, end, capa);
1389 
1390         CHECK_NULL_RETURN_MEMERR(p);
1391         NSTR(node)->s    = p;
1392         NSTR(node)->capa = capa;
1393       }
1394     }
1395     else {
1396       onig_strcpy(NSTR(node)->s + len, s, end);
1397     }
1398     NSTR(node)->end = NSTR(node)->s + len + addlen;
1399   }
1400 
1401   return 0;
1402 }
1403 
1404 extern int
onig_node_str_set(Node * node,const UChar * s,const UChar * end)1405 onig_node_str_set(Node* node, const UChar* s, const UChar* end)
1406 {
1407   onig_node_str_clear(node);
1408   return onig_node_str_cat(node, s, end);
1409 }
1410 
1411 static int
node_str_cat_char(Node * node,UChar c)1412 node_str_cat_char(Node* node, UChar c)
1413 {
1414   UChar s[1];
1415 
1416   s[0] = c;
1417   return onig_node_str_cat(node, s, s + 1);
1418 }
1419 
1420 extern void
onig_node_conv_to_str_node(Node * node,int flag)1421 onig_node_conv_to_str_node(Node* node, int flag)
1422 {
1423   SET_NTYPE(node, NT_STR);
1424   NSTR(node)->flag = flag;
1425   NSTR(node)->capa = 0;
1426   NSTR(node)->s    = NSTR(node)->buf;
1427   NSTR(node)->end  = NSTR(node)->buf;
1428 }
1429 
1430 extern void
onig_node_str_clear(Node * node)1431 onig_node_str_clear(Node* node)
1432 {
1433   if (NSTR(node)->capa != 0 &&
1434       IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
1435     xfree(NSTR(node)->s);
1436   }
1437 
1438   NSTR(node)->capa = 0;
1439   NSTR(node)->flag = 0;
1440   NSTR(node)->s    = NSTR(node)->buf;
1441   NSTR(node)->end  = NSTR(node)->buf;
1442 }
1443 
1444 static Node*
node_new_str(const UChar * s,const UChar * end)1445 node_new_str(const UChar* s, const UChar* end)
1446 {
1447   Node* node = node_new();
1448   CHECK_NULL_RETURN(node);
1449 
1450   SET_NTYPE(node, NT_STR);
1451   NSTR(node)->capa = 0;
1452   NSTR(node)->flag = 0;
1453   NSTR(node)->s    = NSTR(node)->buf;
1454   NSTR(node)->end  = NSTR(node)->buf;
1455   if (onig_node_str_cat(node, s, end)) {
1456     onig_node_free(node);
1457     return NULL;
1458   }
1459   return node;
1460 }
1461 
1462 extern Node*
onig_node_new_str(const UChar * s,const UChar * end)1463 onig_node_new_str(const UChar* s, const UChar* end)
1464 {
1465   return node_new_str(s, end);
1466 }
1467 
1468 static Node*
node_new_str_raw(UChar * s,UChar * end)1469 node_new_str_raw(UChar* s, UChar* end)
1470 {
1471   Node* node = node_new_str(s, end);
1472   NSTRING_SET_RAW(node);
1473   return node;
1474 }
1475 
1476 static Node*
node_new_empty(void)1477 node_new_empty(void)
1478 {
1479   return node_new_str(NULL, NULL);
1480 }
1481 
1482 static Node*
node_new_str_raw_char(UChar c)1483 node_new_str_raw_char(UChar c)
1484 {
1485   UChar p[1];
1486 
1487   p[0] = c;
1488   return node_new_str_raw(p, p + 1);
1489 }
1490 
1491 static Node*
str_node_split_last_char(StrNode * sn,OnigEncoding enc)1492 str_node_split_last_char(StrNode* sn, OnigEncoding enc)
1493 {
1494   const UChar *p;
1495   Node* n = NULL_NODE;
1496 
1497   if (sn->end > sn->s) {
1498     p = onigenc_get_prev_char_head(enc, sn->s, sn->end);
1499     if (p && p > sn->s) { /* can be split. */
1500       n = node_new_str(p, sn->end);
1501       if ((sn->flag & NSTR_RAW) != 0)
1502         NSTRING_SET_RAW(n);
1503 
1504       sn->end = (UChar* )p;
1505     }
1506   }
1507   return n;
1508 }
1509 
1510 static int
str_node_can_be_split(StrNode * sn,OnigEncoding enc)1511 str_node_can_be_split(StrNode* sn, OnigEncoding enc)
1512 {
1513   if (sn->end > sn->s) {
1514     return ((enclen(enc, sn->s) < sn->end - sn->s)  ?  1 : 0);
1515   }
1516   return 0;
1517 }
1518 
#ifdef USE_PAD_TO_SHORT_BYTE_CHAR
/*
 * Shift the contents of `sn` right by `num` bytes and fill the freed
 * leading bytes with `val`.
 *
 * Returns 0.  The function is declared int but previously fell off the
 * end without returning a value, so any caller reading the result got an
 * indeterminate value.
 *
 * NOTE(review): the string is staged through a NODE_STR_BUF_SIZE stack
 * buffer and written back in place; this presumably relies on the caller
 * guaranteeing that the string plus padding fits -- confirm at call sites.
 */
static int
node_str_head_pad(StrNode* sn, int num, UChar val)
{
  UChar buf[NODE_STR_BUF_SIZE];
  int i, len;

  len = sn->end - sn->s;
  onig_strcpy(buf, sn->s, sn->end);
  onig_strcpy(&(sn->s[num]), buf, buf + len);
  sn->end += num;

  for (i = 0; i < num; i++) {
    sn->s[i] = val;
  }
  return 0;
}
#endif
1536 
1537 extern int
onig_scan_unsigned_number(UChar ** src,const UChar * end,OnigEncoding enc)1538 onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc)
1539 {
1540   unsigned int num, val;
1541   OnigCodePoint c;
1542   UChar* p = *src;
1543   PFETCH_READY;
1544 
1545   num = 0;
1546   while (!PEND) {
1547     PFETCH(c);
1548     if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
1549       val = (unsigned int )DIGITVAL(c);
1550       if ((INT_MAX_LIMIT - val) / 10UL < num)
1551         return -1;  /* overflow */
1552 
1553       num = num * 10 + val;
1554     }
1555     else {
1556       PUNFETCH;
1557       break;
1558     }
1559   }
1560   *src = p;
1561   return num;
1562 }
1563 
1564 static int
scan_unsigned_hexadecimal_number(UChar ** src,UChar * end,int maxlen,OnigEncoding enc)1565 scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int maxlen,
1566 				 OnigEncoding enc)
1567 {
1568   OnigCodePoint c;
1569   unsigned int num, val;
1570   UChar* p = *src;
1571   PFETCH_READY;
1572 
1573   num = 0;
1574   while (! PEND && maxlen-- != 0) {
1575     PFETCH(c);
1576     if (ONIGENC_IS_CODE_XDIGIT(enc, c)) {
1577       val = (unsigned int )XDIGITVAL(enc,c);
1578       if ((INT_MAX_LIMIT - val) / 16UL < num)
1579         return -1;  /* overflow */
1580 
1581       num = (num << 4) + XDIGITVAL(enc,c);
1582     }
1583     else {
1584       PUNFETCH;
1585       break;
1586     }
1587   }
1588   *src = p;
1589   return num;
1590 }
1591 
1592 static int
scan_unsigned_octal_number(UChar ** src,UChar * end,int maxlen,OnigEncoding enc)1593 scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen,
1594 			   OnigEncoding enc)
1595 {
1596   OnigCodePoint c;
1597   unsigned int num, val;
1598   UChar* p = *src;
1599   PFETCH_READY;
1600 
1601   num = 0;
1602   while (!PEND && maxlen-- != 0) {
1603     PFETCH(c);
1604     if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8') {
1605       val = ODIGITVAL(c);
1606       if ((INT_MAX_LIMIT - val) / 8UL < num)
1607         return -1;  /* overflow */
1608 
1609       num = (num << 3) + val;
1610     }
1611     else {
1612       PUNFETCH;
1613       break;
1614     }
1615   }
1616   *src = p;
1617   return num;
1618 }
1619 
1620 
/* Write one code point (SIZE_CODE_POINT bytes) at byte offset `pos` of
   `bbuf`; `code` must be an lvalue because its address is taken. */
#define BBUF_WRITE_CODE_POINT(bbuf,pos,code) \
  BBUF_WRITE(bbuf, pos, &(code), SIZE_CODE_POINT)
1623 
1624 /* data format:
1625      [n][from-1][to-1][from-2][to-2] ... [from-n][to-n]
1626      (all data size is OnigCodePoint)
1627  */
1628 static int
new_code_range(BBuf ** pbuf)1629 new_code_range(BBuf** pbuf)
1630 {
1631 #define INIT_MULTI_BYTE_RANGE_SIZE  (SIZE_CODE_POINT * 5)
1632   int r;
1633   OnigCodePoint n;
1634   BBuf* bbuf;
1635 
1636   bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf));
1637   CHECK_NULL_RETURN_MEMERR(*pbuf);
1638   r = BBUF_INIT(*pbuf, INIT_MULTI_BYTE_RANGE_SIZE);
1639   if (r) return r;
1640 
1641   n = 0;
1642   BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1643   return 0;
1644 }
1645 
1646 static int
add_code_range_to_buf(BBuf ** pbuf,OnigCodePoint from,OnigCodePoint to)1647 add_code_range_to_buf(BBuf** pbuf, OnigCodePoint from, OnigCodePoint to)
1648 {
1649   int r, inc_n, pos;
1650   int low, high, bound, x;
1651   OnigCodePoint n, *data;
1652   BBuf* bbuf;
1653 
1654   if (from > to) {
1655     n = from; from = to; to = n;
1656   }
1657 
1658   if (IS_NULL(*pbuf)) {
1659     r = new_code_range(pbuf);
1660     if (r) return r;
1661     bbuf = *pbuf;
1662     n = 0;
1663   }
1664   else {
1665     bbuf = *pbuf;
1666     GET_CODE_POINT(n, bbuf->p);
1667   }
1668   data = (OnigCodePoint* )(bbuf->p);
1669   data++;
1670 
1671   for (low = 0, bound = n; low < bound; ) {
1672     x = (low + bound) >> 1;
1673     if (from > data[x*2 + 1])
1674       low = x + 1;
1675     else
1676       bound = x;
1677   }
1678 
1679   high = (to == ~((OnigCodePoint )0)) ? n : low;
1680   for (bound = n; high < bound; ) {
1681     x = (high + bound) >> 1;
1682     if (to + 1 >= data[x*2])
1683       high = x + 1;
1684     else
1685       bound = x;
1686   }
1687 
1688   inc_n = low + 1 - high;
1689   if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM)
1690     return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES;
1691 
1692   if (inc_n != 1) {
1693     if (from > data[low*2])
1694       from = data[low*2];
1695     if (to < data[(high - 1)*2 + 1])
1696       to = data[(high - 1)*2 + 1];
1697   }
1698 
1699   if (inc_n != 0 && (OnigCodePoint )high < n) {
1700     int from_pos = SIZE_CODE_POINT * (1 + high * 2);
1701     int to_pos   = SIZE_CODE_POINT * (1 + (low + 1) * 2);
1702     int size = (n - high) * 2 * SIZE_CODE_POINT;
1703 
1704     if (inc_n > 0) {
1705       BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size);
1706     }
1707     else {
1708       BBUF_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos);
1709     }
1710   }
1711 
1712   pos = SIZE_CODE_POINT * (1 + low * 2);
1713   BBUF_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2);
1714   BBUF_WRITE_CODE_POINT(bbuf, pos, from);
1715   BBUF_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to);
1716   n += inc_n;
1717   BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1718 
1719   return 0;
1720 }
1721 
1722 static int
add_code_range(BBuf ** pbuf,ScanEnv * env,OnigCodePoint from,OnigCodePoint to)1723 add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
1724 {
1725   if (from > to) {
1726     if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
1727       return 0;
1728     else
1729       return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
1730   }
1731 
1732   return add_code_range_to_buf(pbuf, from, to);
1733 }
1734 
1735 static int
not_code_range_buf(OnigEncoding enc,BBuf * bbuf,BBuf ** pbuf)1736 not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf)
1737 {
1738   int r, i, n;
1739   OnigCodePoint pre, from, *data, to = 0;
1740 
1741   *pbuf = (BBuf* )NULL;
1742   if (IS_NULL(bbuf)) {
1743   set_all:
1744     return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1745   }
1746 
1747   data = (OnigCodePoint* )(bbuf->p);
1748   GET_CODE_POINT(n, data);
1749   data++;
1750   if (n <= 0) goto set_all;
1751 
1752   r = 0;
1753   pre = MBCODE_START_POS(enc);
1754   for (i = 0; i < n; i++) {
1755     from = data[i*2];
1756     to   = data[i*2+1];
1757     if (pre <= from - 1) {
1758       r = add_code_range_to_buf(pbuf, pre, from - 1);
1759       if (r != 0) return r;
1760     }
1761     if (to == ~((OnigCodePoint )0)) break;
1762     pre = to + 1;
1763   }
1764   if (to < ~((OnigCodePoint )0)) {
1765     r = add_code_range_to_buf(pbuf, to + 1, ~((OnigCodePoint )0));
1766   }
1767   return r;
1768 }
1769 
/* Exchange two (buffer, not-flag) pairs; all four arguments must be lvalues. */
#define SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2) do {\
  BBuf *swap_tmp_buf; \
  int  swap_tmp_not; \
  swap_tmp_buf = bbuf1; bbuf1 = bbuf2; bbuf2 = swap_tmp_buf; \
  swap_tmp_not = not1;  not1  = not2;  not2  = swap_tmp_not; \
} while (0)
1776 
1777 static int
or_code_range_buf(OnigEncoding enc,BBuf * bbuf1,int not1,BBuf * bbuf2,int not2,BBuf ** pbuf)1778 or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1,
1779                   BBuf* bbuf2, int not2, BBuf** pbuf)
1780 {
1781   int r;
1782   OnigCodePoint i, n1, *data1;
1783   OnigCodePoint from, to;
1784 
1785   *pbuf = (BBuf* )NULL;
1786   if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) {
1787     if (not1 != 0 || not2 != 0)
1788       return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1789     return 0;
1790   }
1791 
1792   r = 0;
1793   if (IS_NULL(bbuf2))
1794     SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1795 
1796   if (IS_NULL(bbuf1)) {
1797     if (not1 != 0) {
1798       return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1799     }
1800     else {
1801       if (not2 == 0) {
1802         return bbuf_clone(pbuf, bbuf2);
1803       }
1804       else {
1805         return not_code_range_buf(enc, bbuf2, pbuf);
1806       }
1807     }
1808   }
1809 
1810   if (not1 != 0)
1811     SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1812 
1813   data1 = (OnigCodePoint* )(bbuf1->p);
1814   GET_CODE_POINT(n1, data1);
1815   data1++;
1816 
1817   if (not2 == 0 && not1 == 0) { /* 1 OR 2 */
1818     r = bbuf_clone(pbuf, bbuf2);
1819   }
1820   else if (not1 == 0) { /* 1 OR (not 2) */
1821     r = not_code_range_buf(enc, bbuf2, pbuf);
1822   }
1823   if (r != 0) return r;
1824 
1825   for (i = 0; i < n1; i++) {
1826     from = data1[i*2];
1827     to   = data1[i*2+1];
1828     r = add_code_range_to_buf(pbuf, from, to);
1829     if (r != 0) return r;
1830   }
1831   return 0;
1832 }
1833 
1834 static int
and_code_range1(BBuf ** pbuf,OnigCodePoint from1,OnigCodePoint to1,OnigCodePoint * data,int n)1835 and_code_range1(BBuf** pbuf, OnigCodePoint from1, OnigCodePoint to1,
1836 	        OnigCodePoint* data, int n)
1837 {
1838   int i, r;
1839   OnigCodePoint from2, to2;
1840 
1841   for (i = 0; i < n; i++) {
1842     from2 = data[i*2];
1843     to2   = data[i*2+1];
1844     if (from2 < from1) {
1845       if (to2 < from1) continue;
1846       else {
1847         from1 = to2 + 1;
1848       }
1849     }
1850     else if (from2 <= to1) {
1851       if (to2 < to1) {
1852         if (from1 <= from2 - 1) {
1853           r = add_code_range_to_buf(pbuf, from1, from2-1);
1854           if (r != 0) return r;
1855         }
1856         from1 = to2 + 1;
1857       }
1858       else {
1859         to1 = from2 - 1;
1860       }
1861     }
1862     else {
1863       from1 = from2;
1864     }
1865     if (from1 > to1) break;
1866   }
1867   if (from1 <= to1) {
1868     r = add_code_range_to_buf(pbuf, from1, to1);
1869     if (r != 0) return r;
1870   }
1871   return 0;
1872 }
1873 
1874 static int
and_code_range_buf(BBuf * bbuf1,int not1,BBuf * bbuf2,int not2,BBuf ** pbuf)1875 and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf)
1876 {
1877   int r;
1878   OnigCodePoint i, j, n1, n2, *data1, *data2;
1879   OnigCodePoint from, to, from1, to1, from2, to2;
1880 
1881   *pbuf = (BBuf* )NULL;
1882   if (IS_NULL(bbuf1)) {
1883     if (not1 != 0 && IS_NOT_NULL(bbuf2)) /* not1 != 0 -> not2 == 0 */
1884       return bbuf_clone(pbuf, bbuf2);
1885     return 0;
1886   }
1887   else if (IS_NULL(bbuf2)) {
1888     if (not2 != 0)
1889       return bbuf_clone(pbuf, bbuf1);
1890     return 0;
1891   }
1892 
1893   if (not1 != 0)
1894     SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1895 
1896   data1 = (OnigCodePoint* )(bbuf1->p);
1897   data2 = (OnigCodePoint* )(bbuf2->p);
1898   GET_CODE_POINT(n1, data1);
1899   GET_CODE_POINT(n2, data2);
1900   data1++;
1901   data2++;
1902 
1903   if (not2 == 0 && not1 == 0) { /* 1 AND 2 */
1904     for (i = 0; i < n1; i++) {
1905       from1 = data1[i*2];
1906       to1   = data1[i*2+1];
1907       for (j = 0; j < n2; j++) {
1908         from2 = data2[j*2];
1909         to2   = data2[j*2+1];
1910         if (from2 > to1) break;
1911         if (to2 < from1) continue;
1912         from = MAX(from1, from2);
1913         to   = MIN(to1, to2);
1914         r = add_code_range_to_buf(pbuf, from, to);
1915         if (r != 0) return r;
1916       }
1917     }
1918   }
1919   else if (not1 == 0) { /* 1 AND (not 2) */
1920     for (i = 0; i < n1; i++) {
1921       from1 = data1[i*2];
1922       to1   = data1[i*2+1];
1923       r = and_code_range1(pbuf, from1, to1, data2, n2);
1924       if (r != 0) return r;
1925     }
1926   }
1927 
1928   return 0;
1929 }
1930 
1931 static int
and_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)1932 and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
1933 {
1934   int r, not1, not2;
1935   BBuf *buf1, *buf2, *pbuf;
1936   BitSetRef bsr1, bsr2;
1937   BitSet bs1, bs2;
1938 
1939   not1 = IS_NCCLASS_NOT(dest);
1940   bsr1 = dest->bs;
1941   buf1 = dest->mbuf;
1942   not2 = IS_NCCLASS_NOT(cc);
1943   bsr2 = cc->bs;
1944   buf2 = cc->mbuf;
1945 
1946   if (not1 != 0) {
1947     bitset_invert_to(bsr1, bs1);
1948     bsr1 = bs1;
1949   }
1950   if (not2 != 0) {
1951     bitset_invert_to(bsr2, bs2);
1952     bsr2 = bs2;
1953   }
1954   bitset_and(bsr1, bsr2);
1955   if (bsr1 != dest->bs) {
1956     bitset_copy(dest->bs, bsr1);
1957     bsr1 = dest->bs;
1958   }
1959   if (not1 != 0) {
1960     bitset_invert(dest->bs);
1961   }
1962 
1963   if (! ONIGENC_IS_SINGLEBYTE(enc)) {
1964     if (not1 != 0 && not2 != 0) {
1965       r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf);
1966     }
1967     else {
1968       r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf);
1969       if (r == 0 && not1 != 0) {
1970         BBuf *tbuf;
1971         r = not_code_range_buf(enc, pbuf, &tbuf);
1972         if (r != 0) {
1973           bbuf_free(pbuf);
1974           return r;
1975         }
1976         bbuf_free(pbuf);
1977         pbuf = tbuf;
1978       }
1979     }
1980     if (r != 0) return r;
1981 
1982     dest->mbuf = pbuf;
1983     bbuf_free(buf1);
1984     return r;
1985   }
1986   return 0;
1987 }
1988 
1989 static int
or_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)1990 or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
1991 {
1992   int r, not1, not2;
1993   BBuf *buf1, *buf2, *pbuf;
1994   BitSetRef bsr1, bsr2;
1995   BitSet bs1, bs2;
1996 
1997   not1 = IS_NCCLASS_NOT(dest);
1998   bsr1 = dest->bs;
1999   buf1 = dest->mbuf;
2000   not2 = IS_NCCLASS_NOT(cc);
2001   bsr2 = cc->bs;
2002   buf2 = cc->mbuf;
2003 
2004   if (not1 != 0) {
2005     bitset_invert_to(bsr1, bs1);
2006     bsr1 = bs1;
2007   }
2008   if (not2 != 0) {
2009     bitset_invert_to(bsr2, bs2);
2010     bsr2 = bs2;
2011   }
2012   bitset_or(bsr1, bsr2);
2013   if (bsr1 != dest->bs) {
2014     bitset_copy(dest->bs, bsr1);
2015     bsr1 = dest->bs;
2016   }
2017   if (not1 != 0) {
2018     bitset_invert(dest->bs);
2019   }
2020 
2021   if (! ONIGENC_IS_SINGLEBYTE(enc)) {
2022     if (not1 != 0 && not2 != 0) {
2023       r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf);
2024     }
2025     else {
2026       r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf);
2027       if (r == 0 && not1 != 0) {
2028         BBuf *tbuf;
2029         r = not_code_range_buf(enc, pbuf, &tbuf);
2030         if (r != 0) {
2031           bbuf_free(pbuf);
2032           return r;
2033         }
2034         bbuf_free(pbuf);
2035         pbuf = tbuf;
2036       }
2037     }
2038     if (r != 0) return r;
2039 
2040     dest->mbuf = pbuf;
2041     bbuf_free(buf1);
2042     return r;
2043   }
2044   else
2045     return 0;
2046 }
2047 
2048 static OnigCodePoint
conv_backslash_value(OnigCodePoint c,ScanEnv * env)2049 conv_backslash_value(OnigCodePoint c, ScanEnv* env)
2050 {
2051   if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) {
2052     switch (c) {
2053     case 'n': return '\n';
2054     case 't': return '\t';
2055     case 'r': return '\r';
2056     case 'f': return '\f';
2057     case 'a': return '\007';
2058     case 'b': return '\010';
2059     case 'e': return '\033';
2060     case 'v':
2061       if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB))
2062 	return '\v';
2063       break;
2064 
2065     default:
2066       break;
2067     }
2068   }
2069   return c;
2070 }
2071 
2072 static int
is_invalid_quantifier_target(Node * node)2073 is_invalid_quantifier_target(Node* node)
2074 {
2075   switch (NTYPE(node)) {
2076   case NT_ANCHOR:
2077     return 1;
2078     break;
2079 
2080   case NT_ENCLOSE:
2081     /* allow enclosed elements */
2082     /* return is_invalid_quantifier_target(NENCLOSE(node)->target); */
2083     break;
2084 
2085   case NT_LIST:
2086     do {
2087       if (! is_invalid_quantifier_target(NCAR(node))) return 0;
2088     } while (IS_NOT_NULL(node = NCDR(node)));
2089     return 0;
2090     break;
2091 
2092   case NT_ALT:
2093     do {
2094       if (is_invalid_quantifier_target(NCAR(node))) return 1;
2095     } while (IS_NOT_NULL(node = NCDR(node)));
2096     break;
2097 
2098   default:
2099     break;
2100   }
2101   return 0;
2102 }
2103 
2104 /* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */
2105 static int
popular_quantifier_num(QtfrNode * q)2106 popular_quantifier_num(QtfrNode* q)
2107 {
2108   if (q->greedy) {
2109     if (q->lower == 0) {
2110       if (q->upper == 1) return 0;
2111       else if (IS_REPEAT_INFINITE(q->upper)) return 1;
2112     }
2113     else if (q->lower == 1) {
2114       if (IS_REPEAT_INFINITE(q->upper)) return 2;
2115     }
2116   }
2117   else {
2118     if (q->lower == 0) {
2119       if (q->upper == 1) return 3;
2120       else if (IS_REPEAT_INFINITE(q->upper)) return 4;
2121     }
2122     else if (q->lower == 1) {
2123       if (IS_REPEAT_INFINITE(q->upper)) return 5;
2124     }
2125   }
2126   return -1;
2127 }
2128 
2129 
/* Actions for collapsing a quantifier applied directly to another quantifier. */
enum ReduceType {
  RQ_ASIS = 0, /* as is */
  RQ_DEL  = 1, /* delete parent */
  RQ_A    = 2, /* to '*'    */
  RQ_AQ   = 3, /* to '*?'   */
  RQ_QQ   = 4, /* to '??'   */
  RQ_P_QQ = 5, /* to '+)??' */
  RQ_PQ_Q = 6  /* to '+?)?' */
};
2139 
2140 static enum ReduceType ReduceTypeTable[6][6] = {
2141   {RQ_DEL,  RQ_A,    RQ_A,   RQ_QQ,   RQ_AQ,   RQ_ASIS}, /* '?'  */
2142   {RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL},  /* '*'  */
2143   {RQ_A,    RQ_A,    RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL},  /* '+'  */
2144   {RQ_DEL,  RQ_AQ,   RQ_AQ,  RQ_DEL,  RQ_AQ,   RQ_AQ},   /* '??' */
2145   {RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_DEL,  RQ_DEL,  RQ_DEL},  /* '*?' */
2146   {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ,   RQ_AQ,   RQ_DEL}   /* '+?' */
2147 };
2148 
2149 extern void
onig_reduce_nested_quantifier(Node * pnode,Node * cnode)2150 onig_reduce_nested_quantifier(Node* pnode, Node* cnode)
2151 {
2152   int pnum, cnum;
2153   QtfrNode *p, *c;
2154 
2155   p = NQTFR(pnode);
2156   c = NQTFR(cnode);
2157   pnum = popular_quantifier_num(p);
2158   cnum = popular_quantifier_num(c);
2159   if (pnum < 0 || cnum < 0) return ;
2160 
2161   switch(ReduceTypeTable[cnum][pnum]) {
2162   case RQ_DEL:
2163     *pnode = *cnode;
2164     break;
2165   case RQ_A:
2166     p->target = c->target;
2167     p->lower  = 0;  p->upper = REPEAT_INFINITE;  p->greedy = 1;
2168     break;
2169   case RQ_AQ:
2170     p->target = c->target;
2171     p->lower  = 0;  p->upper = REPEAT_INFINITE;  p->greedy = 0;
2172     break;
2173   case RQ_QQ:
2174     p->target = c->target;
2175     p->lower  = 0;  p->upper = 1;  p->greedy = 0;
2176     break;
2177   case RQ_P_QQ:
2178     p->target = cnode;
2179     p->lower  = 0;  p->upper = 1;  p->greedy = 0;
2180     c->lower  = 1;  c->upper = REPEAT_INFINITE;  c->greedy = 1;
2181     return ;
2182     break;
2183   case RQ_PQ_Q:
2184     p->target = cnode;
2185     p->lower  = 0;  p->upper = 1;  p->greedy = 1;
2186     c->lower  = 1;  c->upper = REPEAT_INFINITE;  c->greedy = 0;
2187     return ;
2188     break;
2189   case RQ_ASIS:
2190     p->target = cnode;
2191     return ;
2192     break;
2193   }
2194 
2195   c->target = NULL_NODE;
2196   onig_node_free(cnode);
2197 }
2198 
2199 
/* Token kinds stored in OnigToken.type. */
enum TokenSyms {
  TK_EOT                = 0,   /* end of token */
  TK_RAW_BYTE           = 1,
  TK_CHAR               = 2,
  TK_STRING             = 3,
  TK_CODE_POINT         = 4,
  TK_ANYCHAR            = 5,
  TK_CHAR_TYPE          = 6,
  TK_BACKREF            = 7,
  TK_CALL               = 8,
  TK_ANCHOR             = 9,
  TK_OP_REPEAT          = 10,
  TK_INTERVAL           = 11,
  TK_ANYCHAR_ANYTIME    = 12,  /* SQL '%' == .* */
  TK_ALT                = 13,
  TK_SUBEXP_OPEN        = 14,
  TK_SUBEXP_CLOSE       = 15,
  TK_CC_OPEN            = 16,
  TK_QUOTE_OPEN         = 17,
  TK_CHAR_PROPERTY      = 18,  /* \p{...}, \P{...} */
  /* in cc */
  TK_CC_CLOSE           = 19,
  TK_CC_RANGE           = 20,
  TK_POSIX_BRACKET_OPEN = 21,
  TK_CC_AND             = 22,  /* && */
  TK_CC_CC_OPEN         = 23   /* [ */
};
2227 
2228 typedef struct {
2229   enum TokenSyms type;
2230   int escaped;
2231   int base;   /* is number: 8, 16 (used in [....]) */
2232   UChar* backp;
2233   union {
2234     UChar* s;
2235     int   c;
2236     OnigCodePoint code;
2237     int   anchor;
2238     int   subtype;
2239     struct {
2240       int lower;
2241       int upper;
2242       int greedy;
2243       int possessive;
2244     } repeat;
2245     struct {
2246       int  num;
2247       int  ref1;
2248       int* refs;
2249       int  by_name;
2250 #ifdef USE_BACKREF_WITH_LEVEL
2251       int  exist_level;
2252       int  level;   /* \k<name+n> */
2253 #endif
2254     } backref;
2255     struct {
2256       UChar* name;
2257       UChar* name_end;
2258       int    gnum;
2259     } call;
2260     struct {
2261       int ctype;
2262       int not;
2263     } prop;
2264   } u;
2265 } OnigToken;
2266 
2267 
2268 static int
fetch_range_quantifier(UChar ** src,UChar * end,OnigToken * tok,ScanEnv * env)2269 fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env)
2270 {
2271   int low, up, syn_allow, non_low = 0;
2272   int r = 0;
2273   OnigCodePoint c;
2274   OnigEncoding enc = env->enc;
2275   UChar* p = *src;
2276   PFETCH_READY;
2277 
2278   syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL);
2279 
2280   if (PEND) {
2281     if (syn_allow)
2282       return 1;  /* "....{" : OK! */
2283     else
2284       return ONIGERR_END_PATTERN_AT_LEFT_BRACE;  /* "....{" syntax error */
2285   }
2286 
2287   if (! syn_allow) {
2288     c = PPEEK;
2289     if (c == ')' || c == '(' || c == '|') {
2290       return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
2291     }
2292   }
2293 
2294   low = onig_scan_unsigned_number(&p, end, env->enc);
2295   if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2296   if (low > ONIG_MAX_REPEAT_NUM)
2297     return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2298 
2299   if (p == *src) { /* can't read low */
2300     if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) {
2301       /* allow {,n} as {0,n} */
2302       low = 0;
2303       non_low = 1;
2304     }
2305     else
2306       goto invalid;
2307   }
2308 
2309   if (PEND) goto invalid;
2310   PFETCH(c);
2311   if (c == ',') {
2312     UChar* prev = p;
2313     up = onig_scan_unsigned_number(&p, end, env->enc);
2314     if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2315     if (up > ONIG_MAX_REPEAT_NUM)
2316       return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2317 
2318     if (p == prev) {
2319       if (non_low != 0)
2320         goto invalid;
2321       up = REPEAT_INFINITE;  /* {n,} : {n,infinite} */
2322     }
2323   }
2324   else {
2325     if (non_low != 0)
2326       goto invalid;
2327 
2328     PUNFETCH;
2329     up = low;  /* {n} : exact n times */
2330     r = 2;     /* fixed */
2331   }
2332 
2333   if (PEND) goto invalid;
2334   PFETCH(c);
2335   if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) {
2336     if (c != MC_ESC(env->syntax)) goto invalid;
2337     PFETCH(c);
2338   }
2339   if (c != '}') goto invalid;
2340 
2341   if (!IS_REPEAT_INFINITE(up) && low > up) {
2342     return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE;
2343   }
2344 
2345   tok->type = TK_INTERVAL;
2346   tok->u.repeat.lower = low;
2347   tok->u.repeat.upper = up;
2348   *src = p;
2349   return r; /* 0: normal {n,m}, 2: fixed {n} */
2350 
2351  invalid:
2352   if (syn_allow) {
2353     /* *src = p; */ /* !!! Don't do this line !!! */
2354     return 1;  /* OK */
2355   }
2356   else
2357     return ONIGERR_INVALID_REPEAT_RANGE_PATTERN;
2358 }
2359 
2360 /* \M-, \C-, \c, or \... */
2361 static int
fetch_escaped_value(UChar ** src,UChar * end,ScanEnv * env,OnigCodePoint * val)2362 fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val)
2363 {
2364   int v;
2365   OnigCodePoint c;
2366   OnigEncoding enc = env->enc;
2367   UChar* p = *src;
2368 
2369   if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2370 
2371   PFETCH_S(c);
2372   switch (c) {
2373   case 'M':
2374     if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) {
2375       if (PEND) return ONIGERR_END_PATTERN_AT_META;
2376       PFETCH_S(c);
2377       if (c != '-') return ONIGERR_META_CODE_SYNTAX;
2378       if (PEND) return ONIGERR_END_PATTERN_AT_META;
2379       PFETCH_S(c);
2380       if (c == MC_ESC(env->syntax)) {
2381         v = fetch_escaped_value(&p, end, env, &c);
2382         if (v < 0) return v;
2383       }
2384       c = ((c & 0xff) | 0x80);
2385     }
2386     else
2387       goto backslash;
2388     break;
2389 
2390   case 'C':
2391     if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) {
2392       if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
2393       PFETCH_S(c);
2394       if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX;
2395       goto control;
2396     }
2397     else
2398       goto backslash;
2399 
2400   case 'c':
2401     if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) {
2402     control:
2403       if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
2404       PFETCH_S(c);
2405       if (c == '?') {
2406         c = 0177;
2407       }
2408       else {
2409         if (c == MC_ESC(env->syntax)) {
2410           v = fetch_escaped_value(&p, end, env, &c);
2411           if (v < 0) return v;
2412         }
2413         c &= 0x9f;
2414       }
2415       break;
2416     }
2417     /* fall through */
2418 
2419   default:
2420     {
2421     backslash:
2422       c = conv_backslash_value(c, env);
2423     }
2424     break;
2425   }
2426 
2427   *src = p;
2428   *val = c;
2429   return 0;
2430 }
2431 
2432 static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env);
2433 
2434 static OnigCodePoint
get_name_end_code_point(OnigCodePoint start)2435 get_name_end_code_point(OnigCodePoint start)
2436 {
2437   switch (start) {
2438   case '<':  return (OnigCodePoint )'>'; break;
2439   case '\'': return (OnigCodePoint )'\''; break;
2440   default:
2441     break;
2442   }
2443 
2444   return (OnigCodePoint )0;
2445 }
2446 
2447 #ifdef USE_NAMED_GROUP
2448 #ifdef USE_BACKREF_WITH_LEVEL
2449 /*
2450    \k<name+n>, \k<name-n>
2451    \k<num+n>,  \k<num-n>
2452    \k<-num+n>, \k<-num-n>
2453 */
2454 static int
fetch_name_with_level(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,int * rlevel)2455 fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
2456 		      UChar** rname_end, ScanEnv* env,
2457 		      int* rback_num, int* rlevel)
2458 {
2459   int r, sign, is_num, exist_level;
2460   OnigCodePoint end_code;
2461   OnigCodePoint c = 0;
2462   OnigEncoding enc = env->enc;
2463   UChar *name_end;
2464   UChar *pnum_head;
2465   UChar *p = *src;
2466   PFETCH_READY;
2467 
2468   *rback_num = 0;
2469   is_num = exist_level = 0;
2470   sign = 1;
2471   pnum_head = *src;
2472 
2473   end_code = get_name_end_code_point(start_code);
2474 
2475   name_end = end;
2476   r = 0;
2477   if (PEND) {
2478     return ONIGERR_EMPTY_GROUP_NAME;
2479   }
2480   else {
2481     PFETCH(c);
2482     if (c == end_code)
2483       return ONIGERR_EMPTY_GROUP_NAME;
2484 
2485     if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2486       is_num = 1;
2487     }
2488     else if (c == '-') {
2489       is_num = 2;
2490       sign = -1;
2491       pnum_head = p;
2492     }
2493     else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2494       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2495     }
2496   }
2497 
2498   while (!PEND) {
2499     name_end = p;
2500     PFETCH(c);
2501     if (c == end_code || c == ')' || c == '+' || c == '-') {
2502       if (is_num == 2) 	r = ONIGERR_INVALID_GROUP_NAME;
2503       break;
2504     }
2505 
2506     if (is_num != 0) {
2507       if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2508         is_num = 1;
2509       }
2510       else {
2511         r = ONIGERR_INVALID_GROUP_NAME;
2512         is_num = 0;
2513       }
2514     }
2515     else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2516       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2517     }
2518   }
2519 
2520   if (r == 0 && c != end_code) {
2521     if (c == '+' || c == '-') {
2522       int level;
2523       int flag = (c == '-' ? -1 : 1);
2524 
2525       if (PEND) {
2526         r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2527         goto end;
2528       }
2529       PFETCH(c);
2530       if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto err;
2531       PUNFETCH;
2532       level = onig_scan_unsigned_number(&p, end, enc);
2533       if (level < 0) return ONIGERR_TOO_BIG_NUMBER;
2534       *rlevel = (level * flag);
2535       exist_level = 1;
2536 
2537       if (!PEND) {
2538         PFETCH(c);
2539         if (c == end_code)
2540           goto end;
2541       }
2542     }
2543 
2544   err:
2545     r = ONIGERR_INVALID_GROUP_NAME;
2546     name_end = end;
2547   }
2548 
2549  end:
2550   if (r == 0) {
2551     if (is_num != 0) {
2552       *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2553       if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2554       else if (*rback_num == 0) goto err;
2555 
2556       *rback_num *= sign;
2557     }
2558 
2559     *rname_end = name_end;
2560     *src = p;
2561     return (exist_level ? 1 : 0);
2562   }
2563   else {
2564     onig_scan_env_set_error_string(env, r, *src, name_end);
2565     return r;
2566   }
2567 }
2568 #endif /* USE_BACKREF_WITH_LEVEL */
2569 
2570 /*
2571   ref: 0 -> define name    (don't allow number name)
2572        1 -> reference name (allow number name)
2573 */
2574 static int
fetch_name(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,int ref)2575 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
2576 	   UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
2577 {
2578   int r, is_num, sign;
2579   OnigCodePoint end_code;
2580   OnigCodePoint c = 0;
2581   OnigEncoding enc = env->enc;
2582   UChar *name_end;
2583   UChar *pnum_head;
2584   UChar *p = *src;
2585 
2586   *rback_num = 0;
2587 
2588   end_code = get_name_end_code_point(start_code);
2589 
2590   name_end = end;
2591   pnum_head = *src;
2592   r = 0;
2593   is_num = 0;
2594   sign = 1;
2595   if (PEND) {
2596     return ONIGERR_EMPTY_GROUP_NAME;
2597   }
2598   else {
2599     PFETCH_S(c);
2600     if (c == end_code)
2601       return ONIGERR_EMPTY_GROUP_NAME;
2602 
2603     if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2604       if (ref == 1)
2605         is_num = 1;
2606       else {
2607         r = ONIGERR_INVALID_GROUP_NAME;
2608         is_num = 0;
2609       }
2610     }
2611     else if (c == '-') {
2612       if (ref == 1) {
2613         is_num = 2;
2614         sign = -1;
2615         pnum_head = p;
2616       }
2617       else {
2618         r = ONIGERR_INVALID_GROUP_NAME;
2619         is_num = 0;
2620       }
2621     }
2622     else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2623       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2624     }
2625   }
2626 
2627   if (r == 0) {
2628     while (!PEND) {
2629       name_end = p;
2630       PFETCH_S(c);
2631       if (c == end_code || c == ')') {
2632         if (is_num == 2) 	r = ONIGERR_INVALID_GROUP_NAME;
2633         break;
2634       }
2635 
2636       if (is_num != 0) {
2637         if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2638           is_num = 1;
2639         }
2640         else {
2641           if (!ONIGENC_IS_CODE_WORD(enc, c))
2642             r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2643           else
2644             r = ONIGERR_INVALID_GROUP_NAME;
2645           is_num = 0;
2646         }
2647       }
2648       else {
2649         if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2650           r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2651         }
2652       }
2653     }
2654 
2655     if (c != end_code) {
2656       r = ONIGERR_INVALID_GROUP_NAME;
2657       name_end = end;
2658     }
2659 
2660     if (is_num != 0) {
2661       *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2662       if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2663       else if (*rback_num == 0) {
2664         r = ONIGERR_INVALID_GROUP_NAME;
2665         goto err;
2666       }
2667 
2668       *rback_num *= sign;
2669     }
2670 
2671     *rname_end = name_end;
2672     *src = p;
2673     return 0;
2674   }
2675   else {
2676     while (!PEND) {
2677       name_end = p;
2678       PFETCH_S(c);
2679       if (c == end_code || c == ')')
2680         break;
2681     }
2682     if (PEND)
2683       name_end = end;
2684 
2685   err:
2686     onig_scan_env_set_error_string(env, r, *src, name_end);
2687     return r;
2688   }
2689 }
2690 #else
2691 static int
fetch_name(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,int ref)2692 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
2693 	   UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
2694 {
2695   int r, is_num, sign;
2696   OnigCodePoint end_code;
2697   OnigCodePoint c = 0;
2698   UChar *name_end;
2699   OnigEncoding enc = env->enc;
2700   UChar *pnum_head;
2701   UChar *p = *src;
2702   PFETCH_READY;
2703 
2704   *rback_num = 0;
2705 
2706   end_code = get_name_end_code_point(start_code);
2707 
2708   *rname_end = name_end = end;
2709   r = 0;
2710   pnum_head = *src;
2711   is_num = 0;
2712   sign = 1;
2713 
2714   if (PEND) {
2715     return ONIGERR_EMPTY_GROUP_NAME;
2716   }
2717   else {
2718     PFETCH(c);
2719     if (c == end_code)
2720       return ONIGERR_EMPTY_GROUP_NAME;
2721 
2722     if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2723       is_num = 1;
2724     }
2725     else if (c == '-') {
2726       is_num = 2;
2727       sign = -1;
2728       pnum_head = p;
2729     }
2730     else {
2731       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2732     }
2733   }
2734 
2735   while (!PEND) {
2736     name_end = p;
2737 
2738     PFETCH(c);
2739     if (c == end_code || c == ')') break;
2740     if (! ONIGENC_IS_CODE_DIGIT(enc, c))
2741       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2742   }
2743   if (r == 0 && c != end_code) {
2744     r = ONIGERR_INVALID_GROUP_NAME;
2745     name_end = end;
2746   }
2747 
2748   if (r == 0) {
2749     *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2750     if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2751     else if (*rback_num == 0) {
2752       r = ONIGERR_INVALID_GROUP_NAME;
2753       goto err;
2754     }
2755     *rback_num *= sign;
2756 
2757     *rname_end = name_end;
2758     *src = p;
2759     return 0;
2760   }
2761   else {
2762   err:
2763     onig_scan_env_set_error_string(env, r, *src, name_end);
2764     return r;
2765   }
2766 }
2767 #endif /* USE_NAMED_GROUP */
2768 
2769 static void
CC_ESC_WARN(ScanEnv * env,UChar * c)2770 CC_ESC_WARN(ScanEnv* env, UChar *c)
2771 {
2772   if (onig_warn == onig_null_warn) return ;
2773 
2774   if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) &&
2775       IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) {
2776     UChar buf[WARN_BUFSIZE];
2777     onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
2778 		env->pattern, env->pattern_end,
2779                 (UChar* )"character class has '%s' without escape", c);
2780     (*onig_warn)((char* )buf);
2781   }
2782 }
2783 
2784 static void
CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv * env,UChar * c)2785 CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c)
2786 {
2787   if (onig_warn == onig_null_warn) return ;
2788 
2789   if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) {
2790     UChar buf[WARN_BUFSIZE];
2791     onig_snprintf_with_pattern(buf, WARN_BUFSIZE, (env)->enc,
2792 		(env)->pattern, (env)->pattern_end,
2793 		(UChar* )"regular expression has '%s' without escape", c);
2794     (*onig_warn)((char* )buf);
2795   }
2796 }
2797 
2798 static UChar*
find_str_position(OnigCodePoint s[],int n,UChar * from,UChar * to,UChar ** next,OnigEncoding enc)2799 find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to,
2800 		  UChar **next, OnigEncoding enc)
2801 {
2802   int i;
2803   OnigCodePoint x;
2804   UChar *q;
2805   UChar *p = from;
2806 
2807   while (p < to) {
2808     x = ONIGENC_MBC_TO_CODE(enc, p, to);
2809     q = p + enclen(enc, p);
2810     if (x == s[0]) {
2811       for (i = 1; i < n && q < to; i++) {
2812         x = ONIGENC_MBC_TO_CODE(enc, q, to);
2813         if (x != s[i]) break;
2814         q += enclen(enc, q);
2815       }
2816       if (i >= n) {
2817         if (IS_NOT_NULL(next))
2818           *next = q;
2819         return p;
2820       }
2821     }
2822     p = q;
2823   }
2824   return NULL_UCHARP;
2825 }
2826 
2827 static int
str_exist_check_with_esc(OnigCodePoint s[],int n,UChar * from,UChar * to,OnigCodePoint bad,OnigEncoding enc,OnigSyntaxType * syn)2828 str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to,
2829 		 OnigCodePoint bad, OnigEncoding enc, OnigSyntaxType* syn)
2830 {
2831   int i, in_esc;
2832   OnigCodePoint x;
2833   UChar *q;
2834   UChar *p = from;
2835 
2836   in_esc = 0;
2837   while (p < to) {
2838     if (in_esc) {
2839       in_esc = 0;
2840       p += enclen(enc, p);
2841     }
2842     else {
2843       x = ONIGENC_MBC_TO_CODE(enc, p, to);
2844       q = p + enclen(enc, p);
2845       if (x == s[0]) {
2846         for (i = 1; i < n && q < to; i++) {
2847           x = ONIGENC_MBC_TO_CODE(enc, q, to);
2848           if (x != s[i]) break;
2849           q += enclen(enc, q);
2850         }
2851         if (i >= n) return 1;
2852         p += enclen(enc, p);
2853       }
2854       else {
2855         x = ONIGENC_MBC_TO_CODE(enc, p, to);
2856         if (x == bad) return 0;
2857         else if (x == MC_ESC(syn)) in_esc = 1;
2858         p = q;
2859       }
2860     }
2861   }
2862   return 0;
2863 }
2864 
2865 static int
fetch_token_in_cc(OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)2866 fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
2867 {
2868   int num;
2869   OnigCodePoint c, c2;
2870   OnigSyntaxType* syn = env->syntax;
2871   OnigEncoding enc = env->enc;
2872   UChar* prev;
2873   UChar* p = *src;
2874   PFETCH_READY;
2875 
2876   if (PEND) {
2877     tok->type = TK_EOT;
2878     return tok->type;
2879   }
2880 
2881   PFETCH(c);
2882   tok->type = TK_CHAR;
2883   tok->base = 0;
2884   tok->u.c  = c;
2885   tok->escaped = 0;
2886 
2887   if (c == ']') {
2888     tok->type = TK_CC_CLOSE;
2889   }
2890   else if (c == '-') {
2891     tok->type = TK_CC_RANGE;
2892   }
2893   else if (c == MC_ESC(syn)) {
2894     if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC))
2895       goto end;
2896 
2897     if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2898 
2899     PFETCH(c);
2900     tok->escaped = 1;
2901     tok->u.c = c;
2902     switch (c) {
2903     case 'w':
2904       tok->type = TK_CHAR_TYPE;
2905       tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
2906       tok->u.prop.not   = 0;
2907       break;
2908     case 'W':
2909       tok->type = TK_CHAR_TYPE;
2910       tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
2911       tok->u.prop.not   = 1;
2912       break;
2913     case 'd':
2914       tok->type = TK_CHAR_TYPE;
2915       tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
2916       tok->u.prop.not   = 0;
2917       break;
2918     case 'D':
2919       tok->type = TK_CHAR_TYPE;
2920       tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
2921       tok->u.prop.not   = 1;
2922       break;
2923     case 's':
2924       tok->type = TK_CHAR_TYPE;
2925       tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
2926       tok->u.prop.not   = 0;
2927       break;
2928     case 'S':
2929       tok->type = TK_CHAR_TYPE;
2930       tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
2931       tok->u.prop.not   = 1;
2932       break;
2933     case 'h':
2934       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
2935       tok->type = TK_CHAR_TYPE;
2936       tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
2937       tok->u.prop.not   = 0;
2938       break;
2939     case 'H':
2940       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
2941       tok->type = TK_CHAR_TYPE;
2942       tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
2943       tok->u.prop.not   = 1;
2944       break;
2945 
2946     case 'p':
2947     case 'P':
2948       if (PEND) break;
2949 
2950       c2 = PPEEK;
2951       if (c2 == '{' &&
2952           IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
2953         PINC;
2954         tok->type = TK_CHAR_PROPERTY;
2955         tok->u.prop.not = (c == 'P' ? 1 : 0);
2956 
2957         if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
2958           PFETCH(c2);
2959           if (c2 == '^') {
2960             tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
2961           }
2962           else
2963             PUNFETCH;
2964         }
2965       }
2966       break;
2967 
2968     case 'o':
2969       if (PEND) break;
2970 
2971       prev = p;
2972       if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) {
2973         PINC;
2974         num = scan_unsigned_octal_number(&p, end, 11, enc);
2975         if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
2976         if (!PEND) {
2977           c2 = PPEEK;
2978           if (ONIGENC_IS_CODE_DIGIT(enc, c2))
2979             return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
2980         }
2981 
2982         if (p > prev + enclen(enc, prev) && !PEND && (PPEEK_IS('}'))) {
2983           PINC;
2984           tok->type   = TK_CODE_POINT;
2985           tok->base   = 8;
2986           tok->u.code = (OnigCodePoint )num;
2987         }
2988         else {
2989           /* can't read nothing or invalid format */
2990           p = prev;
2991         }
2992       }
2993       break;
2994 
2995     case 'x':
2996       if (PEND) break;
2997 
2998       prev = p;
2999       if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3000         PINC;
3001         num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
3002         if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3003         if (!PEND) {
3004           c2 = PPEEK;
3005           if (ONIGENC_IS_CODE_XDIGIT(enc, c2))
3006             return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3007         }
3008 
3009         if (p > prev + enclen(enc, prev) && !PEND && (PPEEK_IS('}'))) {
3010           PINC;
3011           tok->type   = TK_CODE_POINT;
3012           tok->base   = 16;
3013           tok->u.code = (OnigCodePoint )num;
3014         }
3015         else {
3016           /* can't read nothing or invalid format */
3017           p = prev;
3018         }
3019       }
3020       else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3021         num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
3022         if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3023         if (p == prev) {  /* can't read nothing. */
3024           num = 0; /* but, it's not error */
3025         }
3026         tok->type = TK_RAW_BYTE;
3027         tok->base = 16;
3028         tok->u.c  = num;
3029       }
3030       break;
3031 
3032     case 'u':
3033       if (PEND) break;
3034 
3035       prev = p;
3036       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3037         num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
3038         if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3039         if (p == prev) {  /* can't read nothing. */
3040           num = 0; /* but, it's not error */
3041         }
3042         tok->type   = TK_CODE_POINT;
3043         tok->base   = 16;
3044         tok->u.code = (OnigCodePoint )num;
3045       }
3046       break;
3047 
3048     case '0':
3049     case '1': case '2': case '3': case '4': case '5': case '6': case '7':
3050       if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
3051         PUNFETCH;
3052         prev = p;
3053         num = scan_unsigned_octal_number(&p, end, 3, enc);
3054         if (num < 0 || num >= 256) return ONIGERR_TOO_BIG_NUMBER;
3055         if (p == prev) {  /* can't read nothing. */
3056           num = 0; /* but, it's not error */
3057         }
3058         tok->type = TK_RAW_BYTE;
3059         tok->base = 8;
3060         tok->u.c  = num;
3061       }
3062       break;
3063 
3064     default:
3065       PUNFETCH;
3066       num = fetch_escaped_value(&p, end, env, &c2);
3067       if (num < 0) return num;
3068       if (tok->u.c != c2) {
3069         tok->u.code = c2;
3070         tok->type   = TK_CODE_POINT;
3071       }
3072       break;
3073     }
3074   }
3075   else if (c == '[') {
3076     if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) {
3077       OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' };
3078       tok->backp = p; /* point at '[' is read */
3079       PINC;
3080       if (str_exist_check_with_esc(send, 2, p, end,
3081                                    (OnigCodePoint )']', enc, syn)) {
3082         tok->type = TK_POSIX_BRACKET_OPEN;
3083       }
3084       else {
3085         PUNFETCH;
3086         goto cc_in_cc;
3087       }
3088     }
3089     else {
3090     cc_in_cc:
3091       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) {
3092         tok->type = TK_CC_CC_OPEN;
3093       }
3094       else {
3095         CC_ESC_WARN(env, (UChar* )"[");
3096       }
3097     }
3098   }
3099   else if (c == '&') {
3100     if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) &&
3101         !PEND && (PPEEK_IS('&'))) {
3102       PINC;
3103       tok->type = TK_CC_AND;
3104     }
3105   }
3106 
3107  end:
3108   *src = p;
3109   return tok->type;
3110 }
3111 
3112 static int
fetch_token(OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)3113 fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
3114 {
3115   int r, num;
3116   OnigCodePoint c;
3117   OnigEncoding enc = env->enc;
3118   OnigSyntaxType* syn = env->syntax;
3119   UChar* prev;
3120   UChar* p = *src;
3121   PFETCH_READY;
3122 
3123  start:
3124   if (PEND) {
3125     tok->type = TK_EOT;
3126     return tok->type;
3127   }
3128 
3129   tok->type  = TK_STRING;
3130   tok->base  = 0;
3131   tok->backp = p;
3132 
3133   PFETCH(c);
3134   if (IS_MC_ESC_CODE(c, syn)) {
3135     if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
3136 
3137     tok->backp = p;
3138     PFETCH(c);
3139 
3140     tok->u.c = c;
3141     tok->escaped = 1;
3142     switch (c) {
3143     case '*':
3144       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break;
3145       tok->type = TK_OP_REPEAT;
3146       tok->u.repeat.lower = 0;
3147       tok->u.repeat.upper = REPEAT_INFINITE;
3148       goto greedy_check;
3149       break;
3150 
3151     case '+':
3152       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break;
3153       tok->type = TK_OP_REPEAT;
3154       tok->u.repeat.lower = 1;
3155       tok->u.repeat.upper = REPEAT_INFINITE;
3156       goto greedy_check;
3157       break;
3158 
3159     case '?':
3160       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break;
3161       tok->type = TK_OP_REPEAT;
3162       tok->u.repeat.lower = 0;
3163       tok->u.repeat.upper = 1;
3164     greedy_check:
3165       if (!PEND && PPEEK_IS('?') &&
3166           IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) {
3167         PFETCH(c);
3168         tok->u.repeat.greedy     = 0;
3169         tok->u.repeat.possessive = 0;
3170       }
3171       else {
3172       possessive_check:
3173         if (!PEND && PPEEK_IS('+') &&
3174             ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) &&
3175               tok->type != TK_INTERVAL)  ||
3176              (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) &&
3177               tok->type == TK_INTERVAL))) {
3178           PFETCH(c);
3179           tok->u.repeat.greedy     = 1;
3180           tok->u.repeat.possessive = 1;
3181         }
3182         else {
3183           tok->u.repeat.greedy     = 1;
3184           tok->u.repeat.possessive = 0;
3185         }
3186       }
3187       break;
3188 
3189     case '{':
3190       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break;
3191       r = fetch_range_quantifier(&p, end, tok, env);
3192       if (r < 0) return r;  /* error */
3193       if (r == 0) goto greedy_check;
3194       else if (r == 2) { /* {n} */
3195         if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3196           goto possessive_check;
3197 
3198         goto greedy_check;
3199       }
3200       /* r == 1 : normal char */
3201       break;
3202 
3203     case '|':
3204       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break;
3205       tok->type = TK_ALT;
3206       break;
3207 
3208     case '(':
3209       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3210       tok->type = TK_SUBEXP_OPEN;
3211       break;
3212 
3213     case ')':
3214       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3215       tok->type = TK_SUBEXP_CLOSE;
3216       break;
3217 
3218     case 'w':
3219       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3220       tok->type = TK_CHAR_TYPE;
3221       tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3222       tok->u.prop.not   = 0;
3223       break;
3224 
3225     case 'W':
3226       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3227       tok->type = TK_CHAR_TYPE;
3228       tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3229       tok->u.prop.not   = 1;
3230       break;
3231 
3232     case 'b':
3233       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3234       tok->type = TK_ANCHOR;
3235       tok->u.anchor = ANCHOR_WORD_BOUND;
3236       break;
3237 
3238     case 'B':
3239       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3240       tok->type = TK_ANCHOR;
3241       tok->u.anchor = ANCHOR_NOT_WORD_BOUND;
3242       break;
3243 
3244 #ifdef USE_WORD_BEGIN_END
3245     case '<':
3246       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3247       tok->type = TK_ANCHOR;
3248       tok->u.anchor = ANCHOR_WORD_BEGIN;
3249       break;
3250 
3251     case '>':
3252       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3253       tok->type = TK_ANCHOR;
3254       tok->u.anchor = ANCHOR_WORD_END;
3255       break;
3256 #endif
3257 
3258     case 's':
3259       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3260       tok->type = TK_CHAR_TYPE;
3261       tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3262       tok->u.prop.not   = 0;
3263       break;
3264 
3265     case 'S':
3266       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3267       tok->type = TK_CHAR_TYPE;
3268       tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3269       tok->u.prop.not   = 1;
3270       break;
3271 
3272     case 'd':
3273       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3274       tok->type = TK_CHAR_TYPE;
3275       tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3276       tok->u.prop.not   = 0;
3277       break;
3278 
3279     case 'D':
3280       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3281       tok->type = TK_CHAR_TYPE;
3282       tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3283       tok->u.prop.not   = 1;
3284       break;
3285 
3286     case 'h':
3287       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3288       tok->type = TK_CHAR_TYPE;
3289       tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3290       tok->u.prop.not   = 0;
3291       break;
3292 
3293     case 'H':
3294       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3295       tok->type = TK_CHAR_TYPE;
3296       tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3297       tok->u.prop.not   = 1;
3298       break;
3299 
3300     case 'A':
3301       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3302     begin_buf:
3303       tok->type = TK_ANCHOR;
3304       tok->u.subtype = ANCHOR_BEGIN_BUF;
3305       break;
3306 
3307     case 'Z':
3308       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3309       tok->type = TK_ANCHOR;
3310       tok->u.subtype = ANCHOR_SEMI_END_BUF;
3311       break;
3312 
3313     case 'z':
3314       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3315     end_buf:
3316       tok->type = TK_ANCHOR;
3317       tok->u.subtype = ANCHOR_END_BUF;
3318       break;
3319 
3320     case 'G':
3321       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break;
3322       tok->type = TK_ANCHOR;
3323       tok->u.subtype = ANCHOR_BEGIN_POSITION;
3324       break;
3325 
3326     case '`':
3327       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3328       goto begin_buf;
3329       break;
3330 
3331     case '\'':
3332       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3333       goto end_buf;
3334       break;
3335 
3336     case 'o':
3337       if (PEND) break;
3338 
3339       prev = p;
3340       if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) {
3341         PINC;
3342         num = scan_unsigned_octal_number(&p, end, 11, enc);
3343         if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3344         if (!PEND) {
3345           if (ONIGENC_IS_CODE_DIGIT(enc, PPEEK))
3346             return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3347         }
3348 
3349         if ((p > prev + enclen(enc, prev)) && !PEND && PPEEK_IS('}')) {
3350           PINC;
3351           tok->type   = TK_CODE_POINT;
3352           tok->u.code = (OnigCodePoint )num;
3353         }
3354         else {
3355           /* can't read nothing or invalid format */
3356           p = prev;
3357         }
3358       }
3359       break;
3360 
3361     case 'x':
3362       if (PEND) break;
3363 
3364       prev = p;
3365       if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3366         PINC;
3367         num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
3368         if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3369         if (!PEND) {
3370           if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK))
3371             return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3372         }
3373 
3374         if ((p > prev + enclen(enc, prev)) && !PEND && PPEEK_IS('}')) {
3375           PINC;
3376           tok->type   = TK_CODE_POINT;
3377           tok->u.code = (OnigCodePoint )num;
3378         }
3379         else {
3380           /* can't read nothing or invalid format */
3381           p = prev;
3382         }
3383       }
3384       else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3385         num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
3386         if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3387         if (p == prev) {  /* can't read nothing. */
3388           num = 0; /* but, it's not error */
3389         }
3390         tok->type = TK_RAW_BYTE;
3391         tok->base = 16;
3392         tok->u.c  = num;
3393       }
3394       break;
3395 
3396     case 'u':
3397       if (PEND) break;
3398 
3399       prev = p;
3400       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3401         num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
3402         if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3403         if (p == prev) {  /* can't read nothing. */
3404           num = 0; /* but, it's not error */
3405         }
3406         tok->type   = TK_CODE_POINT;
3407         tok->base   = 16;
3408         tok->u.code = (OnigCodePoint )num;
3409       }
3410       break;
3411 
3412     case '1': case '2': case '3': case '4':
3413     case '5': case '6': case '7': case '8': case '9':
3414       PUNFETCH;
3415       prev = p;
3416       num = onig_scan_unsigned_number(&p, end, enc);
3417       if (num < 0 || num > ONIG_MAX_BACKREF_NUM) {
3418         goto skip_backref;
3419       }
3420 
3421       if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) &&
3422           (num <= env->num_mem || num <= 9)) { /* This spec. from GNU regex */
3423         if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3424           if (num > env->num_mem || IS_NULL(SCANENV_MEM_NODES(env)[num]))
3425             return ONIGERR_INVALID_BACKREF;
3426         }
3427 
3428         tok->type = TK_BACKREF;
3429         tok->u.backref.num     = 1;
3430         tok->u.backref.ref1    = num;
3431         tok->u.backref.by_name = 0;
3432 #ifdef USE_BACKREF_WITH_LEVEL
3433         tok->u.backref.exist_level = 0;
3434 #endif
3435         break;
3436       }
3437 
3438     skip_backref:
3439       if (c == '8' || c == '9') {
3440         /* normal char */
3441         p = prev; PINC;
3442         break;
3443       }
3444 
3445       p = prev;
3446       /* fall through */
3447     case '0':
3448       if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
3449         prev = p;
3450         num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc);
3451         if (num < 0 || num >= 256) return ONIGERR_TOO_BIG_NUMBER;
3452         if (p == prev) {  /* can't read nothing. */
3453           num = 0; /* but, it's not error */
3454         }
3455         tok->type = TK_RAW_BYTE;
3456         tok->base = 8;
3457         tok->u.c  = num;
3458       }
3459       else if (c != '0') {
3460         PINC;
3461       }
3462       break;
3463 
3464 #ifdef USE_NAMED_GROUP
3465     case 'k':
3466       if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) {
3467         PFETCH(c);
3468         if (c == '<' || c == '\'') {
3469           UChar* name_end;
3470           int* backs;
3471           int back_num;
3472 
3473           prev = p;
3474 
3475 #ifdef USE_BACKREF_WITH_LEVEL
3476           name_end = NULL_UCHARP; /* no need. escape gcc warning. */
3477           r = fetch_name_with_level((OnigCodePoint )c, &p, end, &name_end,
3478                                     env, &back_num, &tok->u.backref.level);
3479           if (r == 1) tok->u.backref.exist_level = 1;
3480           else        tok->u.backref.exist_level = 0;
3481 #else
3482           r = fetch_name(&p, end, &name_end, env, &back_num, 1);
3483 #endif
3484           if (r < 0) return r;
3485 
3486           if (back_num != 0) {
3487             if (back_num < 0) {
3488               back_num = BACKREF_REL_TO_ABS(back_num, env);
3489               if (back_num <= 0)
3490                 return ONIGERR_INVALID_BACKREF;
3491             }
3492 
3493             if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3494               if (back_num > env->num_mem ||
3495                   IS_NULL(SCANENV_MEM_NODES(env)[back_num]))
3496                 return ONIGERR_INVALID_BACKREF;
3497             }
3498             tok->type = TK_BACKREF;
3499             tok->u.backref.by_name = 0;
3500             tok->u.backref.num  = 1;
3501             tok->u.backref.ref1 = back_num;
3502           }
3503           else {
3504             num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs);
3505             if (num <= 0) {
3506               onig_scan_env_set_error_string(env,
3507                         ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end);
3508               return ONIGERR_UNDEFINED_NAME_REFERENCE;
3509             }
3510             if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3511               int i;
3512               for (i = 0; i < num; i++) {
3513                 if (backs[i] > env->num_mem ||
3514                     IS_NULL(SCANENV_MEM_NODES(env)[backs[i]]))
3515                   return ONIGERR_INVALID_BACKREF;
3516               }
3517             }
3518 
3519             tok->type = TK_BACKREF;
3520             tok->u.backref.by_name = 1;
3521             if (num == 1) {
3522               tok->u.backref.num  = 1;
3523               tok->u.backref.ref1 = backs[0];
3524             }
3525             else {
3526               tok->u.backref.num  = num;
3527               tok->u.backref.refs = backs;
3528             }
3529           }
3530         }
3531         else
3532           PUNFETCH;
3533       }
3534       break;
3535 #endif
3536 
3537 #ifdef USE_SUBEXP_CALL
3538     case 'g':
3539       if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) {
3540         PFETCH(c);
3541         if (c == '<' || c == '\'') {
3542           int gnum;
3543           UChar* name_end;
3544 
3545           prev = p;
3546           r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &gnum, 1);
3547           if (r < 0) return r;
3548 
3549           tok->type = TK_CALL;
3550           tok->u.call.name     = prev;
3551           tok->u.call.name_end = name_end;
3552           tok->u.call.gnum     = gnum;
3553         }
3554         else
3555           PUNFETCH;
3556       }
3557       break;
3558 #endif
3559 
3560     case 'Q':
3561       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) {
3562         tok->type = TK_QUOTE_OPEN;
3563       }
3564       break;
3565 
3566     case 'p':
3567     case 'P':
3568       if (!PEND && PPEEK_IS('{') &&
3569           IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
3570         PINC;
3571         tok->type = TK_CHAR_PROPERTY;
3572         tok->u.prop.not = (c == 'P' ? 1 : 0);
3573 
3574         if (!PEND &&
3575             IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
3576           PFETCH(c);
3577           if (c == '^') {
3578             tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
3579           }
3580           else
3581             PUNFETCH;
3582         }
3583       }
3584       break;
3585 
3586     default:
3587       {
3588         OnigCodePoint c2;
3589 
3590         PUNFETCH;
3591         num = fetch_escaped_value(&p, end, env, &c2);
3592         if (num < 0) return num;
3593         /* set_raw: */
3594         if (tok->u.c != c2) {
3595           tok->type = TK_CODE_POINT;
3596           tok->u.code = c2;
3597         }
3598         else { /* string */
3599           int len;
3600           SAFE_ENC_LEN(enc, tok->backp, end, len);
3601           p = tok->backp + len;
3602         }
3603       }
3604       break;
3605     }
3606   }
3607   else {
3608     tok->u.c = c;
3609     tok->escaped = 0;
3610 
3611 #ifdef USE_VARIABLE_META_CHARS
3612     if ((c != ONIG_INEFFECTIVE_META_CHAR) &&
3613         IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) {
3614       if (c == MC_ANYCHAR(syn))
3615         goto any_char;
3616       else if (c == MC_ANYTIME(syn))
3617         goto anytime;
3618       else if (c == MC_ZERO_OR_ONE_TIME(syn))
3619         goto zero_or_one_time;
3620       else if (c == MC_ONE_OR_MORE_TIME(syn))
3621         goto one_or_more_time;
3622       else if (c == MC_ANYCHAR_ANYTIME(syn)) {
3623         tok->type = TK_ANYCHAR_ANYTIME;
3624         goto out;
3625       }
3626     }
3627 #endif
3628 
3629     switch (c) {
3630     case '.':
3631       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break;
3632 #ifdef USE_VARIABLE_META_CHARS
3633     any_char:
3634 #endif
3635       tok->type = TK_ANYCHAR;
3636       break;
3637 
3638     case '*':
3639       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break;
3640 #ifdef USE_VARIABLE_META_CHARS
3641     anytime:
3642 #endif
3643       tok->type = TK_OP_REPEAT;
3644       tok->u.repeat.lower = 0;
3645       tok->u.repeat.upper = REPEAT_INFINITE;
3646       goto greedy_check;
3647       break;
3648 
3649     case '+':
3650       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break;
3651 #ifdef USE_VARIABLE_META_CHARS
3652     one_or_more_time:
3653 #endif
3654       tok->type = TK_OP_REPEAT;
3655       tok->u.repeat.lower = 1;
3656       tok->u.repeat.upper = REPEAT_INFINITE;
3657       goto greedy_check;
3658       break;
3659 
3660     case '?':
3661       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break;
3662 #ifdef USE_VARIABLE_META_CHARS
3663     zero_or_one_time:
3664 #endif
3665       tok->type = TK_OP_REPEAT;
3666       tok->u.repeat.lower = 0;
3667       tok->u.repeat.upper = 1;
3668       goto greedy_check;
3669       break;
3670 
3671     case '{':
3672       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break;
3673       r = fetch_range_quantifier(&p, end, tok, env);
3674       if (r < 0) return r;  /* error */
3675       if (r == 0) goto greedy_check;
3676       else if (r == 2) { /* {n} */
3677         if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3678           goto possessive_check;
3679 
3680         goto greedy_check;
3681       }
3682       /* r == 1 : normal char */
3683       break;
3684 
3685     case '|':
3686       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break;
3687       tok->type = TK_ALT;
3688       break;
3689 
3690     case '(':
3691       if (!PEND && PPEEK_IS('?') &&
3692           IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
3693         PINC;
3694         if (!PEND && PPEEK_IS('#')) {
3695           PFETCH(c);
3696           while (1) {
3697             if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
3698             PFETCH(c);
3699             if (c == MC_ESC(syn)) {
3700               if (!PEND) PFETCH(c);
3701             }
3702             else {
3703               if (c == ')') break;
3704             }
3705           }
3706           goto start;
3707         }
3708         PUNFETCH;
3709       }
3710 
3711       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3712       tok->type = TK_SUBEXP_OPEN;
3713       break;
3714 
3715     case ')':
3716       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3717       tok->type = TK_SUBEXP_CLOSE;
3718       break;
3719 
3720     case '^':
3721       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
3722       tok->type = TK_ANCHOR;
3723       tok->u.subtype = (IS_SINGLELINE(env->option)
3724 			? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE);
3725       break;
3726 
3727     case '$':
3728       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
3729       tok->type = TK_ANCHOR;
3730       tok->u.subtype = (IS_SINGLELINE(env->option)
3731 			? ANCHOR_SEMI_END_BUF : ANCHOR_END_LINE);
3732       break;
3733 
3734     case '[':
3735       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break;
3736       tok->type = TK_CC_OPEN;
3737       break;
3738 
3739     case ']':
3740       if (*src > env->pattern)   /* /].../ is allowed. */
3741         CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]");
3742       break;
3743 
3744     case '#':
3745       if (IS_EXTEND(env->option)) {
3746         while (!PEND) {
3747           PFETCH(c);
3748           if (ONIGENC_IS_CODE_NEWLINE(enc, c))
3749             break;
3750         }
3751         goto start;
3752         break;
3753       }
3754       break;
3755 
3756     case ' ': case '\t': case '\n': case '\r': case '\f':
3757       if (IS_EXTEND(env->option))
3758         goto start;
3759       break;
3760 
3761     default:
3762       /* string */
3763       break;
3764     }
3765   }
3766 
3767 #ifdef USE_VARIABLE_META_CHARS
3768  out:
3769 #endif
3770   *src = p;
3771   return tok->type;
3772 }
3773 
3774 static int
add_ctype_to_cc_by_range(CClassNode * cc,int ctype ARG_UNUSED,int not,OnigEncoding enc ARG_UNUSED,OnigCodePoint sb_out,const OnigCodePoint mbr[])3775 add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not,
3776 			 OnigEncoding enc ARG_UNUSED,
3777                          OnigCodePoint sb_out, const OnigCodePoint mbr[])
3778 {
3779   int i, r;
3780   OnigCodePoint j;
3781 
3782   int n = ONIGENC_CODE_RANGE_NUM(mbr);
3783 
3784   if (not == 0) {
3785     for (i = 0; i < n; i++) {
3786       for (j  = ONIGENC_CODE_RANGE_FROM(mbr, i);
3787            j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) {
3788         if (j >= sb_out) {
3789           if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) {
3790             r = add_code_range_to_buf(&(cc->mbuf), j,
3791                                       ONIGENC_CODE_RANGE_TO(mbr, i));
3792             if (r != 0) return r;
3793             i++;
3794           }
3795 
3796           goto sb_end;
3797         }
3798         BITSET_SET_BIT(cc->bs, j);
3799       }
3800     }
3801 
3802   sb_end:
3803     for ( ; i < n; i++) {
3804       r = add_code_range_to_buf(&(cc->mbuf),
3805                                 ONIGENC_CODE_RANGE_FROM(mbr, i),
3806                                 ONIGENC_CODE_RANGE_TO(mbr, i));
3807       if (r != 0) return r;
3808     }
3809   }
3810   else {
3811     OnigCodePoint prev = 0;
3812 
3813     for (i = 0; i < n; i++) {
3814       for (j = prev; j < ONIGENC_CODE_RANGE_FROM(mbr, i); j++) {
3815         if (j >= sb_out) {
3816           goto sb_end2;
3817         }
3818         BITSET_SET_BIT(cc->bs, j);
3819       }
3820       prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
3821     }
3822     for (j = prev; j < sb_out; j++) {
3823       BITSET_SET_BIT(cc->bs, j);
3824     }
3825 
3826   sb_end2:
3827     prev = sb_out;
3828 
3829     for (i = 0; i < n; i++) {
3830       if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) {
3831         r = add_code_range_to_buf(&(cc->mbuf), prev,
3832                                   ONIGENC_CODE_RANGE_FROM(mbr, i) - 1);
3833         if (r != 0) return r;
3834       }
3835       prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
3836     }
3837     if (prev < 0x7fffffff) {
3838       r = add_code_range_to_buf(&(cc->mbuf), prev, 0x7fffffff);
3839       if (r != 0) return r;
3840     }
3841   }
3842 
3843   return 0;
3844 }
3845 
3846 static int
add_ctype_to_cc(CClassNode * cc,int ctype,int not,ScanEnv * env)3847 add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env)
3848 {
3849   int c, r;
3850   const OnigCodePoint *ranges;
3851   OnigCodePoint sb_out;
3852   OnigEncoding enc = env->enc;
3853 
3854   r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges);
3855   if (r == 0) {
3856     return add_ctype_to_cc_by_range(cc, ctype, not, env->enc, sb_out, ranges);
3857   }
3858   else if (r != ONIG_NO_SUPPORT_CONFIG) {
3859     return r;
3860   }
3861 
3862   r = 0;
3863   switch (ctype) {
3864   case ONIGENC_CTYPE_ALPHA:
3865   case ONIGENC_CTYPE_BLANK:
3866   case ONIGENC_CTYPE_CNTRL:
3867   case ONIGENC_CTYPE_DIGIT:
3868   case ONIGENC_CTYPE_LOWER:
3869   case ONIGENC_CTYPE_PUNCT:
3870   case ONIGENC_CTYPE_SPACE:
3871   case ONIGENC_CTYPE_UPPER:
3872   case ONIGENC_CTYPE_XDIGIT:
3873   case ONIGENC_CTYPE_ASCII:
3874   case ONIGENC_CTYPE_ALNUM:
3875     if (not != 0) {
3876       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3877         if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3878           BITSET_SET_BIT(cc->bs, c);
3879       }
3880       ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3881     }
3882     else {
3883       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3884         if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3885           BITSET_SET_BIT(cc->bs, c);
3886       }
3887     }
3888     break;
3889 
3890   case ONIGENC_CTYPE_GRAPH:
3891   case ONIGENC_CTYPE_PRINT:
3892     if (not != 0) {
3893       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3894         if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3895           BITSET_SET_BIT(cc->bs, c);
3896       }
3897     }
3898     else {
3899       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3900         if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3901           BITSET_SET_BIT(cc->bs, c);
3902       }
3903       ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3904     }
3905     break;
3906 
3907   case ONIGENC_CTYPE_WORD:
3908     if (not == 0) {
3909       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3910         if (IS_CODE_SB_WORD(enc, c)) BITSET_SET_BIT(cc->bs, c);
3911       }
3912       ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3913     }
3914     else {
3915       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3916         if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) /* check invalid code point */
3917             && ! ONIGENC_IS_CODE_WORD(enc, c))
3918           BITSET_SET_BIT(cc->bs, c);
3919       }
3920     }
3921     break;
3922 
3923   default:
3924     return ONIGERR_PARSER_BUG;
3925     break;
3926   }
3927 
3928   return r;
3929 }
3930 
3931 static int
parse_posix_bracket(CClassNode * cc,UChar ** src,UChar * end,ScanEnv * env)3932 parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env)
3933 {
3934 #define POSIX_BRACKET_CHECK_LIMIT_LENGTH  20
3935 #define POSIX_BRACKET_NAME_MIN_LEN         4
3936 
3937   static PosixBracketEntryType PBS[] = {
3938     { (UChar* )"alnum",  ONIGENC_CTYPE_ALNUM,  5 },
3939     { (UChar* )"alpha",  ONIGENC_CTYPE_ALPHA,  5 },
3940     { (UChar* )"blank",  ONIGENC_CTYPE_BLANK,  5 },
3941     { (UChar* )"cntrl",  ONIGENC_CTYPE_CNTRL,  5 },
3942     { (UChar* )"digit",  ONIGENC_CTYPE_DIGIT,  5 },
3943     { (UChar* )"graph",  ONIGENC_CTYPE_GRAPH,  5 },
3944     { (UChar* )"lower",  ONIGENC_CTYPE_LOWER,  5 },
3945     { (UChar* )"print",  ONIGENC_CTYPE_PRINT,  5 },
3946     { (UChar* )"punct",  ONIGENC_CTYPE_PUNCT,  5 },
3947     { (UChar* )"space",  ONIGENC_CTYPE_SPACE,  5 },
3948     { (UChar* )"upper",  ONIGENC_CTYPE_UPPER,  5 },
3949     { (UChar* )"xdigit", ONIGENC_CTYPE_XDIGIT, 6 },
3950     { (UChar* )"ascii",  ONIGENC_CTYPE_ASCII,  5 },
3951     { (UChar* )"word",   ONIGENC_CTYPE_WORD,   4 },
3952     { (UChar* )NULL,     -1, 0 }
3953   };
3954 
3955   PosixBracketEntryType *pb;
3956   int not, i, r;
3957   OnigCodePoint c;
3958   OnigEncoding enc = env->enc;
3959   UChar *p = *src;
3960 
3961   if (PPEEK_IS('^')) {
3962     PINC_S;
3963     not = 1;
3964   }
3965   else
3966     not = 0;
3967 
3968   if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MIN_LEN + 3)
3969     goto not_posix_bracket;
3970 
3971   for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
3972     if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) {
3973       p = (UChar* )onigenc_step(enc, p, end, pb->len);
3974       if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0)
3975         return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
3976 
3977       r = add_ctype_to_cc(cc, pb->ctype, not, env);
3978       if (r != 0) return r;
3979 
3980       PINC_S; PINC_S;
3981       *src = p;
3982       return 0;
3983     }
3984   }
3985 
3986  not_posix_bracket:
3987   c = 0;
3988   i = 0;
3989   while (!PEND && ((c = PPEEK) != ':') && c != ']') {
3990     PINC_S;
3991     if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break;
3992   }
3993   if (c == ':' && ! PEND) {
3994     PINC_S;
3995     if (! PEND) {
3996       PFETCH_S(c);
3997       if (c == ']')
3998         return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
3999     }
4000   }
4001 
4002   return 1;  /* 1: is not POSIX bracket, but no error. */
4003 }
4004 
4005 static int
fetch_char_property_to_ctype(UChar ** src,UChar * end,ScanEnv * env)4006 fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env)
4007 {
4008   int r;
4009   OnigCodePoint c;
4010   OnigEncoding enc = env->enc;
4011   UChar *prev, *start, *p = *src;
4012 
4013   r = 0;
4014   start = prev = p;
4015 
4016   while (!PEND) {
4017     prev = p;
4018     PFETCH_S(c);
4019     if (c == '}') {
4020       r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev);
4021       if (r < 0) break;
4022 
4023       *src = p;
4024       return r;
4025     }
4026     else if (c == '(' || c == ')' || c == '{' || c == '|') {
4027       r = ONIGERR_INVALID_CHAR_PROPERTY_NAME;
4028       break;
4029     }
4030   }
4031 
4032   onig_scan_env_set_error_string(env, r, *src, prev);
4033   return r;
4034 }
4035 
4036 static int
parse_char_property(Node ** np,OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)4037 parse_char_property(Node** np, OnigToken* tok, UChar** src, UChar* end,
4038 		    ScanEnv* env)
4039 {
4040   int r, ctype;
4041   CClassNode* cc;
4042 
4043   ctype = fetch_char_property_to_ctype(src, end, env);
4044   if (ctype < 0) return ctype;
4045 
4046   *np = node_new_cclass();
4047   CHECK_NULL_RETURN_MEMERR(*np);
4048   cc = NCCLASS(*np);
4049   r = add_ctype_to_cc(cc, ctype, 0, env);
4050   if (r != 0) return r;
4051   if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
4052 
4053   return 0;
4054 }
4055 
4056 
/* Parser state while reading the items of a character class. */
enum CCSTATE {
  CCS_VALUE    = 0,  /* a value is pending and has not been stored yet */
  CCS_RANGE    = 1,  /* a range operator was read; its end value is awaited */
  CCS_COMPLETE = 2,  /* a range has just been completed */
  CCS_START    = 3   /* initial state, nothing read yet */
};
4063 
/* Kind of the most recently read character-class item. */
enum CCVALTYPE {
  CCV_SB         = 0,  /* single value that is stored in the bit set */
  CCV_CODE_POINT = 1,  /* single value that is stored as a code range */
  CCV_CLASS      = 2   /* a whole class/type; there is no pending value */
};
4069 
4070 static int
next_state_class(CClassNode * cc,OnigCodePoint * vs,enum CCVALTYPE * type,enum CCSTATE * state,ScanEnv * env)4071 next_state_class(CClassNode* cc, OnigCodePoint* vs, enum CCVALTYPE* type,
4072 		 enum CCSTATE* state, ScanEnv* env)
4073 {
4074   int r;
4075 
4076   if (*state == CCS_RANGE)
4077     return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE;
4078 
4079   if (*state == CCS_VALUE && *type != CCV_CLASS) {
4080     if (*type == CCV_SB)
4081       BITSET_SET_BIT(cc->bs, (int )(*vs));
4082     else if (*type == CCV_CODE_POINT) {
4083       r = add_code_range(&(cc->mbuf), env, *vs, *vs);
4084       if (r < 0) return r;
4085     }
4086   }
4087 
4088   if (*state != CCS_START)
4089     *state = CCS_VALUE;
4090 
4091   *type  = CCV_CLASS;
4092   return 0;
4093 }
4094 
4095 static int
next_state_val(CClassNode * cc,OnigCodePoint * from,OnigCodePoint to,int * from_israw,int to_israw,enum CCVALTYPE intype,enum CCVALTYPE * type,enum CCSTATE * state,ScanEnv * env)4096 next_state_val(CClassNode* cc, OnigCodePoint *from, OnigCodePoint to,
4097 	       int* from_israw, int to_israw,
4098 	       enum CCVALTYPE intype, enum CCVALTYPE* type,
4099 	       enum CCSTATE* state, ScanEnv* env)
4100 {
4101   int r;
4102 
4103   switch (*state) {
4104   case CCS_VALUE:
4105     if (*type == CCV_SB) {
4106       if (*from > 0xff)
4107           return ONIGERR_INVALID_CODE_POINT_VALUE;
4108 
4109       BITSET_SET_BIT(cc->bs, (int )(*from));
4110     }
4111     else if (*type == CCV_CODE_POINT) {
4112       r = add_code_range(&(cc->mbuf), env, *from, *from);
4113       if (r < 0) return r;
4114     }
4115     break;
4116 
4117   case CCS_RANGE:
4118     if (intype == *type) {
4119       if (intype == CCV_SB) {
4120         if (*from > 0xff || to > 0xff)
4121           return ONIGERR_INVALID_CODE_POINT_VALUE;
4122 
4123         if (*from > to) {
4124           if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
4125             goto ccs_range_end;
4126           else
4127             return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
4128         }
4129         bitset_set_range(cc->bs, (int )*from, (int )to);
4130       }
4131       else {
4132         r = add_code_range(&(cc->mbuf), env, *from, to);
4133         if (r < 0) return r;
4134       }
4135     }
4136     else {
4137       if (*from > to) {
4138         if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
4139           goto ccs_range_end;
4140         else
4141           return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
4142       }
4143       bitset_set_range(cc->bs, (int )*from, (int )(to < 0xff ? to : 0xff));
4144       r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*from, to);
4145       if (r < 0) return r;
4146     }
4147   ccs_range_end:
4148     *state = CCS_COMPLETE;
4149     break;
4150 
4151   case CCS_COMPLETE:
4152   case CCS_START:
4153     *state = CCS_VALUE;
4154     break;
4155 
4156   default:
4157     break;
4158   }
4159 
4160   *from_israw = to_israw;
4161   *from       = to;
4162   *type       = intype;
4163   return 0;
4164 }
4165 
4166 static int
code_exist_check(OnigCodePoint c,UChar * from,UChar * end,int ignore_escaped,ScanEnv * env)4167 code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped,
4168 		 ScanEnv* env)
4169 {
4170   int in_esc;
4171   OnigCodePoint code;
4172   OnigEncoding enc = env->enc;
4173   UChar* p = from;
4174 
4175   in_esc = 0;
4176   while (! PEND) {
4177     if (ignore_escaped && in_esc) {
4178       in_esc = 0;
4179     }
4180     else {
4181       PFETCH_S(code);
4182       if (code == c) return 1;
4183       if (code == MC_ESC(env->syntax)) in_esc = 1;
4184     }
4185   }
4186   return 0;
4187 }
4188 
4189 static int
parse_char_class(Node ** np,OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)4190 parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
4191 		 ScanEnv* env)
4192 {
4193   int r, neg, len, fetched, and_start;
4194   OnigCodePoint v, vs;
4195   UChar *p;
4196   Node* node;
4197   CClassNode *cc, *prev_cc;
4198   CClassNode work_cc;
4199 
4200   enum CCSTATE state;
4201   enum CCVALTYPE val_type, in_type;
4202   int val_israw, in_israw;
4203 
4204   *np = NULL_NODE;
4205   env->parse_depth++;
4206   if (env->parse_depth > ParseDepthLimit)
4207     return ONIGERR_PARSE_DEPTH_LIMIT_OVER;
4208   prev_cc = (CClassNode* )NULL;
4209   r = fetch_token_in_cc(tok, src, end, env);
4210   if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) {
4211     neg = 1;
4212     r = fetch_token_in_cc(tok, src, end, env);
4213   }
4214   else {
4215     neg = 0;
4216   }
4217 
4218   if (r < 0) return r;
4219   if (r == TK_CC_CLOSE) {
4220     if (! code_exist_check((OnigCodePoint )']',
4221                            *src, env->pattern_end, 1, env))
4222       return ONIGERR_EMPTY_CHAR_CLASS;
4223 
4224     CC_ESC_WARN(env, (UChar* )"]");
4225     r = tok->type = TK_CHAR;  /* allow []...] */
4226   }
4227 
4228   *np = node = node_new_cclass();
4229   CHECK_NULL_RETURN_MEMERR(node);
4230   cc = NCCLASS(node);
4231 
4232   and_start = 0;
4233   state = CCS_START;
4234   p = *src;
4235   while (r != TK_CC_CLOSE) {
4236     fetched = 0;
4237     switch (r) {
4238     case TK_CHAR:
4239     any_char_in:
4240       len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c);
4241       if (len > 1) {
4242         in_type = CCV_CODE_POINT;
4243       }
4244       else if (len < 0) {
4245         r = len;
4246         goto err;
4247       }
4248       else {
4249         /* sb_char: */
4250         in_type = CCV_SB;
4251       }
4252       v = (OnigCodePoint )tok->u.c;
4253       in_israw = 0;
4254       goto val_entry2;
4255       break;
4256 
4257     case TK_RAW_BYTE:
4258       /* tok->base != 0 : octal or hexadec. */
4259       if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) {
4260         UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4261         UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN;
4262         UChar* psave = p;
4263         int i, base = tok->base;
4264 
4265         buf[0] = tok->u.c;
4266         for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) {
4267           r = fetch_token_in_cc(tok, &p, end, env);
4268           if (r < 0) goto err;
4269           if (r != TK_RAW_BYTE || tok->base != base) {
4270             fetched = 1;
4271             break;
4272           }
4273           buf[i] = tok->u.c;
4274         }
4275 
4276         if (i < ONIGENC_MBC_MINLEN(env->enc)) {
4277           r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4278           goto err;
4279         }
4280 
4281         len = enclen(env->enc, buf);
4282         if (i < len) {
4283           r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4284           goto err;
4285         }
4286         else if (i > len) { /* fetch back */
4287           p = psave;
4288           for (i = 1; i < len; i++) {
4289             r = fetch_token_in_cc(tok, &p, end, env);
4290           }
4291           fetched = 0;
4292         }
4293 
4294         if (i == 1) {
4295           v = (OnigCodePoint )buf[0];
4296           goto raw_single;
4297         }
4298         else {
4299           v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe);
4300           in_type = CCV_CODE_POINT;
4301         }
4302       }
4303       else {
4304         v = (OnigCodePoint )tok->u.c;
4305       raw_single:
4306         in_type = CCV_SB;
4307       }
4308       in_israw = 1;
4309       goto val_entry2;
4310       break;
4311 
4312     case TK_CODE_POINT:
4313       v = tok->u.code;
4314       in_israw = 1;
4315     val_entry:
4316       len = ONIGENC_CODE_TO_MBCLEN(env->enc, v);
4317       if (len < 0) {
4318         r = len;
4319         goto err;
4320       }
4321       in_type = (len == 1 ? CCV_SB : CCV_CODE_POINT);
4322     val_entry2:
4323       r = next_state_val(cc, &vs, v, &val_israw, in_israw, in_type, &val_type,
4324 			 &state, env);
4325       if (r != 0) goto err;
4326       break;
4327 
4328     case TK_POSIX_BRACKET_OPEN:
4329       r = parse_posix_bracket(cc, &p, end, env);
4330       if (r < 0) goto err;
4331       if (r == 1) {  /* is not POSIX bracket */
4332         CC_ESC_WARN(env, (UChar* )"[");
4333         p = tok->backp;
4334         v = (OnigCodePoint )tok->u.c;
4335         in_israw = 0;
4336         goto val_entry;
4337       }
4338       goto next_class;
4339       break;
4340 
4341     case TK_CHAR_TYPE:
4342       r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, env);
4343       if (r != 0) return r;
4344 
4345     next_class:
4346       r = next_state_class(cc, &vs, &val_type, &state, env);
4347       if (r != 0) goto err;
4348       break;
4349 
4350     case TK_CHAR_PROPERTY:
4351       {
4352         int ctype;
4353 
4354         ctype = fetch_char_property_to_ctype(&p, end, env);
4355         if (ctype < 0) return ctype;
4356         r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, env);
4357         if (r != 0) return r;
4358         goto next_class;
4359       }
4360       break;
4361 
4362     case TK_CC_RANGE:
4363       if (state == CCS_VALUE) {
4364         r = fetch_token_in_cc(tok, &p, end, env);
4365         if (r < 0) goto err;
4366         fetched = 1;
4367         if (r == TK_CC_CLOSE) { /* allow [x-] */
4368         range_end_val:
4369           v = (OnigCodePoint )'-';
4370           in_israw = 0;
4371           goto val_entry;
4372         }
4373         else if (r == TK_CC_AND) {
4374           CC_ESC_WARN(env, (UChar* )"-");
4375           goto range_end_val;
4376         }
4377         state = CCS_RANGE;
4378       }
4379       else if (state == CCS_START) {
4380         /* [-xa] is allowed */
4381         v = (OnigCodePoint )tok->u.c;
4382         in_israw = 0;
4383 
4384         r = fetch_token_in_cc(tok, &p, end, env);
4385         if (r < 0) goto err;
4386         fetched = 1;
4387         /* [--x] or [a&&-x] is warned. */
4388         if (r == TK_CC_RANGE || and_start != 0)
4389           CC_ESC_WARN(env, (UChar* )"-");
4390 
4391         goto val_entry;
4392       }
4393       else if (state == CCS_RANGE) {
4394         CC_ESC_WARN(env, (UChar* )"-");
4395         goto any_char_in;  /* [!--x] is allowed */
4396       }
4397       else { /* CCS_COMPLETE */
4398         r = fetch_token_in_cc(tok, &p, end, env);
4399         if (r < 0) goto err;
4400         fetched = 1;
4401         if (r == TK_CC_CLOSE) goto range_end_val; /* allow [a-b-] */
4402         else if (r == TK_CC_AND) {
4403           CC_ESC_WARN(env, (UChar* )"-");
4404           goto range_end_val;
4405         }
4406 
4407         if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) {
4408           CC_ESC_WARN(env, (UChar* )"-");
4409           goto range_end_val;   /* [0-9-a] is allowed as [0-9\-a] */
4410         }
4411         r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
4412         goto err;
4413       }
4414       break;
4415 
4416     case TK_CC_CC_OPEN: /* [ */
4417       {
4418         Node *anode;
4419         CClassNode* acc;
4420 
4421         r = parse_char_class(&anode, tok, &p, end, env);
4422         if (r != 0) {
4423           onig_node_free(anode);
4424           goto cc_open_err;
4425         }
4426         acc = NCCLASS(anode);
4427         r = or_cclass(cc, acc, env->enc);
4428 
4429         onig_node_free(anode);
4430       cc_open_err:
4431         if (r != 0) goto err;
4432       }
4433       break;
4434 
4435     case TK_CC_AND: /* && */
4436       {
4437         if (state == CCS_VALUE) {
4438           r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
4439                              &val_type, &state, env);
4440           if (r != 0) goto err;
4441         }
4442         /* initialize local variables */
4443         and_start = 1;
4444         state = CCS_START;
4445 
4446         if (IS_NOT_NULL(prev_cc)) {
4447           r = and_cclass(prev_cc, cc, env->enc);
4448           if (r != 0) goto err;
4449           bbuf_free(cc->mbuf);
4450         }
4451         else {
4452           prev_cc = cc;
4453           cc = &work_cc;
4454         }
4455         initialize_cclass(cc);
4456       }
4457       break;
4458 
4459     case TK_EOT:
4460       r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS;
4461       goto err;
4462       break;
4463     default:
4464       r = ONIGERR_PARSER_BUG;
4465       goto err;
4466       break;
4467     }
4468 
4469     if (fetched)
4470       r = tok->type;
4471     else {
4472       r = fetch_token_in_cc(tok, &p, end, env);
4473       if (r < 0) goto err;
4474     }
4475   }
4476 
4477   if (state == CCS_VALUE) {
4478     r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
4479 		       &val_type, &state, env);
4480     if (r != 0) goto err;
4481   }
4482 
4483   if (IS_NOT_NULL(prev_cc)) {
4484     r = and_cclass(prev_cc, cc, env->enc);
4485     if (r != 0) goto err;
4486     bbuf_free(cc->mbuf);
4487     cc = prev_cc;
4488   }
4489 
4490   if (neg != 0)
4491     NCCLASS_SET_NOT(cc);
4492   else
4493     NCCLASS_CLEAR_NOT(cc);
4494   if (IS_NCCLASS_NOT(cc) &&
4495       IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) {
4496     int is_empty;
4497 
4498     is_empty = (IS_NULL(cc->mbuf) ? 1 : 0);
4499     if (is_empty != 0)
4500       BITSET_IS_EMPTY(cc->bs, is_empty);
4501 
4502     if (is_empty == 0) {
4503 #define NEWLINE_CODE    0x0a
4504 
4505       if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) {
4506         if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1)
4507           BITSET_SET_BIT(cc->bs, NEWLINE_CODE);
4508         else
4509           add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE);
4510       }
4511     }
4512   }
4513   *src = p;
4514   env->parse_depth--;
4515   return 0;
4516 
4517  err:
4518   if (cc != NCCLASS(*np))
4519     bbuf_free(cc->mbuf);
4520   return r;
4521 }
4522 
4523 static int parse_subexp(Node** top, OnigToken* tok, int term,
4524 			UChar** src, UChar* end, ScanEnv* env);
4525 
4526 static int
parse_enclose(Node ** np,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)4527 parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
4528 	      ScanEnv* env)
4529 {
4530   int r, num;
4531   Node *target;
4532   OnigOptionType option;
4533   OnigCodePoint c;
4534   OnigEncoding enc = env->enc;
4535 
4536 #ifdef USE_NAMED_GROUP
4537   int list_capture;
4538 #endif
4539 
4540   UChar* p = *src;
4541   PFETCH_READY;
4542 
4543   *np = NULL;
4544   if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
4545 
4546   option = env->option;
4547   if (PPEEK_IS('?') &&
4548       IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
4549     PINC;
4550     if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
4551 
4552     PFETCH(c);
4553     switch (c) {
4554     case ':':   /* (?:...) grouping only */
4555     group:
4556       r = fetch_token(tok, &p, end, env);
4557       if (r < 0) return r;
4558       r = parse_subexp(np, tok, term, &p, end, env);
4559       if (r < 0) return r;
4560       *src = p;
4561       return 1; /* group */
4562       break;
4563 
4564     case '=':
4565       *np = onig_node_new_anchor(ANCHOR_PREC_READ);
4566       break;
4567     case '!':  /*         preceding read */
4568       *np = onig_node_new_anchor(ANCHOR_PREC_READ_NOT);
4569       break;
4570     case '>':            /* (?>...) stop backtrack */
4571       *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
4572       break;
4573 
4574 #ifdef USE_NAMED_GROUP
4575     case '\'':
4576       if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4577         goto named_group1;
4578       }
4579       else
4580         return ONIGERR_UNDEFINED_GROUP_OPTION;
4581       break;
4582 #endif
4583 
4584     case '<':   /* look behind (?<=...), (?<!...) */
4585       if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
4586       PFETCH(c);
4587       if (c == '=')
4588         *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND);
4589       else if (c == '!')
4590         *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND_NOT);
4591 #ifdef USE_NAMED_GROUP
4592       else {
4593         if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4594           UChar *name;
4595           UChar *name_end;
4596 
4597           PUNFETCH;
4598           c = '<';
4599 
4600         named_group1:
4601           list_capture = 0;
4602 
4603         named_group2:
4604           name = p;
4605           r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0);
4606           if (r < 0) return r;
4607 
4608           num = scan_env_add_mem_entry(env);
4609           if (num < 0) return num;
4610           if (list_capture != 0 && num >= (int )BIT_STATUS_BITS_NUM)
4611             return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
4612 
4613           r = name_add(env->reg, name, name_end, num, env);
4614           if (r != 0) return r;
4615           *np = node_new_enclose_memory(env->option, 1);
4616           CHECK_NULL_RETURN_MEMERR(*np);
4617           NENCLOSE(*np)->regnum = num;
4618           if (list_capture != 0)
4619             BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
4620           env->num_named++;
4621         }
4622         else {
4623           return ONIGERR_UNDEFINED_GROUP_OPTION;
4624         }
4625       }
4626 #else
4627       else {
4628         return ONIGERR_UNDEFINED_GROUP_OPTION;
4629       }
4630 #endif
4631       break;
4632 
4633     case '@':
4634       if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) {
4635 #ifdef USE_NAMED_GROUP
4636         if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4637           PFETCH(c);
4638           if (c == '<' || c == '\'') {
4639             list_capture = 1;
4640             goto named_group2; /* (?@<name>...) */
4641           }
4642           PUNFETCH;
4643         }
4644 #endif
4645         *np = node_new_enclose_memory(env->option, 0);
4646         CHECK_NULL_RETURN_MEMERR(*np);
4647         num = scan_env_add_mem_entry(env);
4648         if (num < 0) {
4649           return num;
4650         }
4651         else if (num >= (int )BIT_STATUS_BITS_NUM) {
4652           return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
4653         }
4654         NENCLOSE(*np)->regnum = num;
4655         BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
4656       }
4657       else {
4658         return ONIGERR_UNDEFINED_GROUP_OPTION;
4659       }
4660       break;
4661 
4662 #ifdef USE_POSIXLINE_OPTION
4663     case 'p':
4664 #endif
4665     case '-': case 'i': case 'm': case 's': case 'x':
4666       {
4667         int neg = 0;
4668 
4669         while (1) {
4670           switch (c) {
4671           case ':':
4672           case ')':
4673             break;
4674 
4675           case '-':  neg = 1; break;
4676           case 'x':  ONOFF(option, ONIG_OPTION_EXTEND,     neg); break;
4677           case 'i':  ONOFF(option, ONIG_OPTION_IGNORECASE, neg); break;
4678           case 's':
4679             if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
4680               ONOFF(option, ONIG_OPTION_MULTILINE,  neg);
4681             }
4682             else
4683               return ONIGERR_UNDEFINED_GROUP_OPTION;
4684             break;
4685 
4686           case 'm':
4687             if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
4688               ONOFF(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 1 : 0));
4689             }
4690             else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) {
4691               ONOFF(option, ONIG_OPTION_MULTILINE,  neg);
4692             }
4693             else
4694               return ONIGERR_UNDEFINED_GROUP_OPTION;
4695             break;
4696 #ifdef USE_POSIXLINE_OPTION
4697           case 'p':
4698             ONOFF(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg);
4699             break;
4700 #endif
4701           default:
4702             return ONIGERR_UNDEFINED_GROUP_OPTION;
4703           }
4704 
4705           if (c == ')') {
4706             *np = node_new_option(option);
4707             CHECK_NULL_RETURN_MEMERR(*np);
4708             *src = p;
4709             return 2; /* option only */
4710           }
4711           else if (c == ':') {
4712             OnigOptionType prev = env->option;
4713 
4714             env->option     = option;
4715             r = fetch_token(tok, &p, end, env);
4716             if (r < 0) return r;
4717             r = parse_subexp(&target, tok, term, &p, end, env);
4718             env->option = prev;
4719             if (r < 0) {
4720               onig_node_free(target);
4721               return r;
4722             }
4723             *np = node_new_option(option);
4724             CHECK_NULL_RETURN_MEMERR(*np);
4725             NENCLOSE(*np)->target = target;
4726             *src = p;
4727             return 0;
4728           }
4729 
4730           if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
4731           PFETCH(c);
4732         }
4733       }
4734       break;
4735 
4736     default:
4737       return ONIGERR_UNDEFINED_GROUP_OPTION;
4738     }
4739   }
4740   else {
4741     if (ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_DONT_CAPTURE_GROUP))
4742       goto group;
4743 
4744     *np = node_new_enclose_memory(env->option, 0);
4745     CHECK_NULL_RETURN_MEMERR(*np);
4746     num = scan_env_add_mem_entry(env);
4747     if (num < 0) return num;
4748     NENCLOSE(*np)->regnum = num;
4749   }
4750 
4751   CHECK_NULL_RETURN_MEMERR(*np);
4752   r = fetch_token(tok, &p, end, env);
4753   if (r < 0) return r;
4754   r = parse_subexp(&target, tok, term, &p, end, env);
4755   if (r < 0) {
4756     onig_node_free(target);
4757     return r;
4758   }
4759 
4760   if (NTYPE(*np) == NT_ANCHOR)
4761     NANCHOR(*np)->target = target;
4762   else {
4763     NENCLOSE(*np)->target = target;
4764     if (NENCLOSE(*np)->type == ENCLOSE_MEMORY) {
4765       /* Don't move this to previous of parse_subexp() */
4766       r = scan_env_set_mem_node(env, NENCLOSE(*np)->regnum, *np);
4767       if (r != 0) return r;
4768     }
4769   }
4770 
4771   *src = p;
4772   return 0;
4773 }
4774 
4775 static const char* PopularQStr[] = {
4776   "?", "*", "+", "??", "*?", "+?"
4777 };
4778 
4779 static const char* ReduceQStr[] = {
4780   "", "", "*", "*?", "??", "+ and ??", "+? and ?"
4781 };
4782 
4783 static int
set_quantifier(Node * qnode,Node * target,int group,ScanEnv * env)4784 set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env)
4785 {
4786   QtfrNode* qn;
4787 
4788   qn = NQTFR(qnode);
4789   if (qn->lower == 1 && qn->upper == 1) {
4790     return 1;
4791   }
4792 
4793   switch (NTYPE(target)) {
4794   case NT_STR:
4795     if (! group) {
4796       StrNode* sn = NSTR(target);
4797       if (str_node_can_be_split(sn, env->enc)) {
4798         Node* n = str_node_split_last_char(sn, env->enc);
4799         if (IS_NOT_NULL(n)) {
4800           qn->target = n;
4801           return 2;
4802         }
4803       }
4804     }
4805     break;
4806 
4807   case NT_QTFR:
4808     { /* check redundant double repeat. */
4809       /* verbose warn (?:.?)? etc... but not warn (.?)? etc... */
4810       QtfrNode* qnt   = NQTFR(target);
4811       int nestq_num   = popular_quantifier_num(qn);
4812       int targetq_num = popular_quantifier_num(qnt);
4813 
4814 #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR
4815       if (!IS_QUANTIFIER_BY_NUMBER(qn) && !IS_QUANTIFIER_BY_NUMBER(qnt) &&
4816           IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) {
4817         UChar buf[WARN_BUFSIZE];
4818 
4819         switch(ReduceTypeTable[targetq_num][nestq_num]) {
4820         case RQ_ASIS:
4821           break;
4822 
4823         case RQ_DEL:
4824           if (onig_verb_warn != onig_null_warn) {
4825             onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4826                                   env->pattern, env->pattern_end,
4827                                   (UChar* )"redundant nested repeat operator");
4828             (*onig_verb_warn)((char* )buf);
4829           }
4830           goto warn_exit;
4831           break;
4832 
4833         default:
4834           if (onig_verb_warn != onig_null_warn) {
4835             onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4836                                        env->pattern, env->pattern_end,
4837             (UChar* )"nested repeat operator %s and %s was replaced with '%s'",
4838             PopularQStr[targetq_num], PopularQStr[nestq_num],
4839             ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]);
4840             (*onig_verb_warn)((char* )buf);
4841           }
4842           goto warn_exit;
4843           break;
4844         }
4845       }
4846 
4847     warn_exit:
4848 #endif
4849       if (targetq_num >= 0) {
4850         if (nestq_num >= 0) {
4851           onig_reduce_nested_quantifier(qnode, target);
4852           goto q_exit;
4853         }
4854         else if (targetq_num == 1 || targetq_num == 2) { /* * or + */
4855           /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */
4856           if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) {
4857             qn->upper = (qn->lower == 0 ? 1 : qn->lower);
4858           }
4859         }
4860       }
4861     }
4862     break;
4863 
4864   default:
4865     break;
4866   }
4867 
4868   qn->target = target;
4869  q_exit:
4870   return 0;
4871 }
4872 
4873 
4874 #ifndef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
4875 static int
clear_not_flag_cclass(CClassNode * cc,OnigEncoding enc)4876 clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc)
4877 {
4878   BBuf *tbuf;
4879   int r;
4880 
4881   if (IS_NCCLASS_NOT(cc)) {
4882     bitset_invert(cc->bs);
4883 
4884     if (! ONIGENC_IS_SINGLEBYTE(enc)) {
4885       r = not_code_range_buf(enc, cc->mbuf, &tbuf);
4886       if (r != 0) return r;
4887 
4888       bbuf_free(cc->mbuf);
4889       cc->mbuf = tbuf;
4890     }
4891 
4892     NCCLASS_CLEAR_NOT(cc);
4893   }
4894 
4895   return 0;
4896 }
4897 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
4898 
4899 typedef struct {
4900   ScanEnv*    env;
4901   CClassNode* cc;
4902   Node*       alt_root;
4903   Node**      ptail;
4904 } IApplyCaseFoldArg;
4905 
4906 static int
i_apply_case_fold(OnigCodePoint from,OnigCodePoint to[],int to_len,void * arg)4907 i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[],
4908 		  int to_len, void* arg)
4909 {
4910   IApplyCaseFoldArg* iarg;
4911   ScanEnv* env;
4912   CClassNode* cc;
4913   BitSetRef bs;
4914 
4915   iarg = (IApplyCaseFoldArg* )arg;
4916   env = iarg->env;
4917   cc  = iarg->cc;
4918   bs = cc->bs;
4919 
4920   if (to_len == 1) {
4921     int is_in = onig_is_code_in_cc(env->enc, from, cc);
4922 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
4923     if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) ||
4924         (is_in == 0 &&  IS_NCCLASS_NOT(cc))) {
4925       if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
4926         add_code_range(&(cc->mbuf), env, *to, *to);
4927       }
4928       else {
4929         BITSET_SET_BIT(bs, *to);
4930       }
4931     }
4932 #else
4933     if (is_in != 0) {
4934       if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
4935         if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc);
4936         add_code_range(&(cc->mbuf), env, *to, *to);
4937       }
4938       else {
4939         if (IS_NCCLASS_NOT(cc)) {
4940           BITSET_CLEAR_BIT(bs, *to);
4941         }
4942         else
4943           BITSET_SET_BIT(bs, *to);
4944       }
4945     }
4946 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
4947   }
4948   else {
4949     int r, i, len;
4950     UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4951     Node *snode = NULL_NODE;
4952 
4953     if (onig_is_code_in_cc(env->enc, from, cc)
4954 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
4955 	&& !IS_NCCLASS_NOT(cc)
4956 #endif
4957         ) {
4958       for (i = 0; i < to_len; i++) {
4959         len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf);
4960         if (i == 0) {
4961           snode = onig_node_new_str(buf, buf + len);
4962           CHECK_NULL_RETURN_MEMERR(snode);
4963 
4964           /* char-class expanded multi-char only
4965              compare with string folded at match time. */
4966           NSTRING_SET_AMBIG(snode);
4967         }
4968         else {
4969           r = onig_node_str_cat(snode, buf, buf + len);
4970           if (r < 0) {
4971             onig_node_free(snode);
4972             return r;
4973           }
4974         }
4975       }
4976 
4977       *(iarg->ptail) = onig_node_new_alt(snode, NULL_NODE);
4978       CHECK_NULL_RETURN_MEMERR(*(iarg->ptail));
4979       iarg->ptail = &(NCDR((*(iarg->ptail))));
4980     }
4981   }
4982 
4983   return 0;
4984 }
4985 
4986 static int
parse_exp(Node ** np,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)4987 parse_exp(Node** np, OnigToken* tok, int term,
4988 	  UChar** src, UChar* end, ScanEnv* env)
4989 {
4990   int r, len, group = 0;
4991   Node* qn;
4992   Node** targetp;
4993 
4994   *np = NULL;
4995   if (tok->type == (enum TokenSyms )term)
4996     goto end_of_token;
4997 
4998   switch (tok->type) {
4999   case TK_ALT:
5000   case TK_EOT:
5001   end_of_token:
5002   *np = node_new_empty();
5003   return tok->type;
5004   break;
5005 
5006   case TK_SUBEXP_OPEN:
5007     r = parse_enclose(np, tok, TK_SUBEXP_CLOSE, src, end, env);
5008     if (r < 0) return r;
5009     if (r == 1) group = 1;
5010     else if (r == 2) { /* option only */
5011       Node* target;
5012       OnigOptionType prev = env->option;
5013 
5014       env->option = NENCLOSE(*np)->option;
5015       r = fetch_token(tok, src, end, env);
5016       if (r < 0) return r;
5017       r = parse_subexp(&target, tok, term, src, end, env);
5018       env->option = prev;
5019       if (r < 0) {
5020         onig_node_free(target);
5021         return r;
5022       }
5023       NENCLOSE(*np)->target = target;
5024       return tok->type;
5025     }
5026     break;
5027 
5028   case TK_SUBEXP_CLOSE:
5029     if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP))
5030       return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS;
5031 
5032     if (tok->escaped) goto tk_raw_byte;
5033     else goto tk_byte;
5034     break;
5035 
5036   case TK_STRING:
5037   tk_byte:
5038     {
5039       *np = node_new_str(tok->backp, *src);
5040       CHECK_NULL_RETURN_MEMERR(*np);
5041 
5042       while (1) {
5043         r = fetch_token(tok, src, end, env);
5044         if (r < 0) return r;
5045         if (r != TK_STRING) break;
5046 
5047         r = onig_node_str_cat(*np, tok->backp, *src);
5048         if (r < 0) return r;
5049       }
5050 
5051     string_end:
5052       targetp = np;
5053       goto repeat;
5054     }
5055     break;
5056 
5057   case TK_RAW_BYTE:
5058   tk_raw_byte:
5059     {
5060       *np = node_new_str_raw_char((UChar )tok->u.c);
5061       CHECK_NULL_RETURN_MEMERR(*np);
5062       len = 1;
5063       while (1) {
5064         if (len >= ONIGENC_MBC_MINLEN(env->enc)) {
5065           if (len == enclen(env->enc, NSTR(*np)->s)) {//should not enclen_end()
5066             r = fetch_token(tok, src, end, env);
5067             NSTRING_CLEAR_RAW(*np);
5068             goto string_end;
5069           }
5070         }
5071 
5072         r = fetch_token(tok, src, end, env);
5073         if (r < 0) return r;
5074         if (r != TK_RAW_BYTE) {
5075           /* Don't use this, it is wrong for little endian encodings. */
5076 #ifdef USE_PAD_TO_SHORT_BYTE_CHAR
5077           int rem;
5078           if (len < ONIGENC_MBC_MINLEN(env->enc)) {
5079             rem = ONIGENC_MBC_MINLEN(env->enc) - len;
5080             (void )node_str_head_pad(NSTR(*np), rem, (UChar )0);
5081             if (len + rem == enclen(env->enc, NSTR(*np)->s)) {
5082               NSTRING_CLEAR_RAW(*np);
5083               goto string_end;
5084             }
5085           }
5086 #endif
5087           return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
5088         }
5089 
5090         r = node_str_cat_char(*np, (UChar )tok->u.c);
5091         if (r < 0) return r;
5092 
5093         len++;
5094       }
5095     }
5096     break;
5097 
5098   case TK_CODE_POINT:
5099     {
5100       UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
5101       int num = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf);
5102       if (num < 0) return num;
5103 #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
5104       *np = node_new_str_raw(buf, buf + num);
5105 #else
5106       *np = node_new_str(buf, buf + num);
5107 #endif
5108       CHECK_NULL_RETURN_MEMERR(*np);
5109     }
5110     break;
5111 
5112   case TK_QUOTE_OPEN:
5113     {
5114       OnigCodePoint end_op[2];
5115       UChar *qstart, *qend, *nextp;
5116 
5117       end_op[0] = (OnigCodePoint )MC_ESC(env->syntax);
5118       end_op[1] = (OnigCodePoint )'E';
5119       qstart = *src;
5120       qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc);
5121       if (IS_NULL(qend)) {
5122         nextp = qend = end;
5123       }
5124       *np = node_new_str(qstart, qend);
5125       CHECK_NULL_RETURN_MEMERR(*np);
5126       *src = nextp;
5127     }
5128     break;
5129 
5130   case TK_CHAR_TYPE:
5131     {
5132       switch (tok->u.prop.ctype) {
5133       case ONIGENC_CTYPE_WORD:
5134         *np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not);
5135         CHECK_NULL_RETURN_MEMERR(*np);
5136         break;
5137 
5138       case ONIGENC_CTYPE_SPACE:
5139       case ONIGENC_CTYPE_DIGIT:
5140       case ONIGENC_CTYPE_XDIGIT:
5141         {
5142           CClassNode* cc;
5143 
5144           *np = node_new_cclass();
5145           CHECK_NULL_RETURN_MEMERR(*np);
5146           cc = NCCLASS(*np);
5147           add_ctype_to_cc(cc, tok->u.prop.ctype, 0, env);
5148           if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
5149         }
5150         break;
5151 
5152       default:
5153         return ONIGERR_PARSER_BUG;
5154         break;
5155       }
5156     }
5157     break;
5158 
5159   case TK_CHAR_PROPERTY:
5160     r = parse_char_property(np, tok, src, end, env);
5161     if (r != 0) return r;
5162     break;
5163 
5164   case TK_CC_OPEN:
5165     {
5166       CClassNode* cc;
5167 
5168       r = parse_char_class(np, tok, src, end, env);
5169       if (r != 0) return r;
5170 
5171       cc = NCCLASS(*np);
5172       if (IS_IGNORECASE(env->option)) {
5173         IApplyCaseFoldArg iarg;
5174 
5175         iarg.env      = env;
5176         iarg.cc       = cc;
5177         iarg.alt_root = NULL_NODE;
5178         iarg.ptail    = &(iarg.alt_root);
5179 
5180         r = ONIGENC_APPLY_ALL_CASE_FOLD(env->enc, env->case_fold_flag,
5181                                         i_apply_case_fold, &iarg);
5182         if (r != 0) {
5183           onig_node_free(iarg.alt_root);
5184           return r;
5185         }
5186         if (IS_NOT_NULL(iarg.alt_root)) {
5187           Node* work = onig_node_new_alt(*np, iarg.alt_root);
5188           if (IS_NULL(work)) {
5189             onig_node_free(iarg.alt_root);
5190             return ONIGERR_MEMORY;
5191           }
5192           *np = work;
5193         }
5194       }
5195     }
5196     break;
5197 
5198   case TK_ANYCHAR:
5199     *np = node_new_anychar();
5200     CHECK_NULL_RETURN_MEMERR(*np);
5201     break;
5202 
5203   case TK_ANYCHAR_ANYTIME:
5204     *np = node_new_anychar();
5205     CHECK_NULL_RETURN_MEMERR(*np);
5206     qn = node_new_quantifier(0, REPEAT_INFINITE, 0);
5207     CHECK_NULL_RETURN_MEMERR(qn);
5208     NQTFR(qn)->target = *np;
5209     *np = qn;
5210     break;
5211 
5212   case TK_BACKREF:
5213     len = tok->u.backref.num;
5214     *np = node_new_backref(len,
5215                   (len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)),
5216                   tok->u.backref.by_name,
5217 #ifdef USE_BACKREF_WITH_LEVEL
5218 			   tok->u.backref.exist_level,
5219 			   tok->u.backref.level,
5220 #endif
5221 			   env);
5222     CHECK_NULL_RETURN_MEMERR(*np);
5223     break;
5224 
5225 #ifdef USE_SUBEXP_CALL
5226   case TK_CALL:
5227     {
5228       int gnum = tok->u.call.gnum;
5229 
5230       if (gnum < 0) {
5231         gnum = BACKREF_REL_TO_ABS(gnum, env);
5232         if (gnum <= 0)
5233           return ONIGERR_INVALID_BACKREF;
5234       }
5235       *np = node_new_call(tok->u.call.name, tok->u.call.name_end, gnum);
5236       CHECK_NULL_RETURN_MEMERR(*np);
5237       env->num_call++;
5238     }
5239     break;
5240 #endif
5241 
5242   case TK_ANCHOR:
5243     *np = onig_node_new_anchor(tok->u.anchor);
5244     break;
5245 
5246   case TK_OP_REPEAT:
5247   case TK_INTERVAL:
5248     if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) {
5249       if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS))
5250         return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED;
5251       else
5252         *np = node_new_empty();
5253     }
5254     else {
5255       goto tk_byte;
5256     }
5257     break;
5258 
5259   default:
5260     return ONIGERR_PARSER_BUG;
5261     break;
5262   }
5263 
5264   {
5265     targetp = np;
5266 
5267   re_entry:
5268     r = fetch_token(tok, src, end, env);
5269     if (r < 0) return r;
5270 
5271   repeat:
5272     if (r == TK_OP_REPEAT || r == TK_INTERVAL) {
5273       if (is_invalid_quantifier_target(*targetp))
5274         return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID;
5275 
5276       qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper,
5277                                (r == TK_INTERVAL ? 1 : 0));
5278       CHECK_NULL_RETURN_MEMERR(qn);
5279       NQTFR(qn)->greedy = tok->u.repeat.greedy;
5280       r = set_quantifier(qn, *targetp, group, env);
5281       if (r < 0) {
5282         onig_node_free(qn);
5283         return r;
5284       }
5285 
5286       if (tok->u.repeat.possessive != 0) {
5287         Node* en;
5288         en = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
5289         if (IS_NULL(en)) {
5290           onig_node_free(qn);
5291           return ONIGERR_MEMORY;
5292         }
5293         NENCLOSE(en)->target = qn;
5294         qn = en;
5295       }
5296 
5297       if (r == 0) {
5298         *targetp = qn;
5299       }
5300       else if (r == 1) {
5301         onig_node_free(qn);
5302       }
5303       else if (r == 2) { /* split case: /abc+/ */
5304         Node *tmp;
5305 
5306         *targetp = node_new_list(*targetp, NULL);
5307         if (IS_NULL(*targetp)) {
5308           onig_node_free(qn);
5309           return ONIGERR_MEMORY;
5310         }
5311         tmp = NCDR(*targetp) = node_new_list(qn, NULL);
5312         if (IS_NULL(tmp)) {
5313           onig_node_free(qn);
5314           return ONIGERR_MEMORY;
5315         }
5316         targetp = &(NCAR(tmp));
5317       }
5318       goto re_entry;
5319     }
5320   }
5321 
5322   return r;
5323 }
5324 
5325 static int
parse_branch(Node ** top,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)5326 parse_branch(Node** top, OnigToken* tok, int term,
5327 	     UChar** src, UChar* end, ScanEnv* env)
5328 {
5329   int r;
5330   Node *node, **headp;
5331 
5332   *top = NULL;
5333   r = parse_exp(&node, tok, term, src, end, env);
5334   if (r < 0) {
5335     onig_node_free(node);
5336     return r;
5337   }
5338 
5339   if (r == TK_EOT || r == term || r == TK_ALT) {
5340     *top = node;
5341   }
5342   else {
5343     *top  = node_new_list(node, NULL);
5344     headp = &(NCDR(*top));
5345     while (r != TK_EOT && r != term && r != TK_ALT) {
5346       r = parse_exp(&node, tok, term, src, end, env);
5347       if (r < 0) {
5348         onig_node_free(node);
5349         return r;
5350       }
5351 
5352       if (NTYPE(node) == NT_LIST) {
5353         *headp = node;
5354         while (IS_NOT_NULL(NCDR(node))) node = NCDR(node);
5355         headp = &(NCDR(node));
5356       }
5357       else {
5358         *headp = node_new_list(node, NULL);
5359         headp = &(NCDR(*headp));
5360       }
5361     }
5362   }
5363 
5364   return r;
5365 }
5366 
5367 /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */
5368 static int
parse_subexp(Node ** top,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)5369 parse_subexp(Node** top, OnigToken* tok, int term,
5370 	     UChar** src, UChar* end, ScanEnv* env)
5371 {
5372   int r;
5373   Node *node, **headp;
5374 
5375   *top = NULL;
5376   env->parse_depth++;
5377   if (env->parse_depth > ParseDepthLimit)
5378     return ONIGERR_PARSE_DEPTH_LIMIT_OVER;
5379   r = parse_branch(&node, tok, term, src, end, env);
5380   if (r < 0) {
5381     onig_node_free(node);
5382     return r;
5383   }
5384 
5385   if (r == term) {
5386     *top = node;
5387   }
5388   else if (r == TK_ALT) {
5389     *top  = onig_node_new_alt(node, NULL);
5390     headp = &(NCDR(*top));
5391     while (r == TK_ALT) {
5392       r = fetch_token(tok, src, end, env);
5393       if (r < 0) return r;
5394       r = parse_branch(&node, tok, term, src, end, env);
5395       if (r < 0) {
5396         onig_node_free(node);
5397         return r;
5398       }
5399       *headp = onig_node_new_alt(node, NULL);
5400       headp = &(NCDR(*headp));
5401     }
5402 
5403     if (tok->type != (enum TokenSyms )term)
5404       goto err;
5405   }
5406   else {
5407     onig_node_free(node);
5408   err:
5409     if (term == TK_SUBEXP_CLOSE)
5410       return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
5411     else
5412       return ONIGERR_PARSER_BUG;
5413   }
5414 
5415   env->parse_depth--;
5416   return r;
5417 }
5418 
5419 static int
parse_regexp(Node ** top,UChar ** src,UChar * end,ScanEnv * env)5420 parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env)
5421 {
5422   int r;
5423   OnigToken tok;
5424 
5425   r = fetch_token(&tok, src, end, env);
5426   if (r < 0) return r;
5427   r = parse_subexp(top, &tok, TK_EOT, src, end, env);
5428   if (r < 0) return r;
5429   return 0;
5430 }
5431 
5432 extern int
onig_parse_make_tree(Node ** root,const UChar * pattern,const UChar * end,regex_t * reg,ScanEnv * env)5433 onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end,
5434 		     regex_t* reg, ScanEnv* env)
5435 {
5436   int r;
5437   UChar* p;
5438 
5439 #ifdef USE_NAMED_GROUP
5440   names_clear(reg);
5441 #endif
5442 
5443   scan_env_clear(env);
5444   env->option         = reg->options;
5445   env->case_fold_flag = reg->case_fold_flag;
5446   env->enc            = reg->enc;
5447   env->syntax         = reg->syntax;
5448   env->pattern        = (UChar* )pattern;
5449   env->pattern_end    = (UChar* )end;
5450   env->reg            = reg;
5451 
5452   *root = NULL;
5453 
5454   if (! ONIGENC_IS_VALID_MBC_STRING(env->enc, pattern, end))
5455     return ONIGERR_INVALID_WIDE_CHAR_VALUE;
5456 
5457   p = (UChar* )pattern;
5458   r = parse_regexp(root, &p, (UChar* )end, env);
5459   reg->num_mem = env->num_mem;
5460   return r;
5461 }
5462 
5463 extern void
onig_scan_env_set_error_string(ScanEnv * env,int ecode ARG_UNUSED,UChar * arg,UChar * arg_end)5464 onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED,
5465 				UChar* arg, UChar* arg_end)
5466 {
5467   env->error     = arg;
5468   env->error_end = arg_end;
5469 }
5470