xref: /PHP-5.4/ext/mbstring/oniguruma/regparse.c (revision 75adcc8d)
1 /**********************************************************************
2   regparse.c -  Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2007  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "regparse.h"
31 
32 #define WARN_BUFSIZE    256
33 
34 OnigSyntaxType OnigSyntaxRuby = {
35   (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
36      ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
37      ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS |
38      ONIG_SYN_OP_ESC_C_CONTROL )
39    & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
40   , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
41       ONIG_SYN_OP2_OPTION_RUBY |
42       ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
43       ONIG_SYN_OP2_ESC_G_SUBEXP_CALL |
44       ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
45       ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
46       ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
47       ONIG_SYN_OP2_ESC_H_XDIGIT )
48   , ( SYN_GNU_REGEX_BV |
49       ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
50       ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
51       ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
52       ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
53       ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY |
54       ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
55       ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
56   , ONIG_OPTION_NONE
57 };
58 
59 OnigSyntaxType*  OnigDefaultSyntax = ONIG_SYNTAX_RUBY;
60 
/* Warning callback that discards its message; used as the default sink. */
extern void
onig_null_warn(const char* s)
{
  (void )s;  /* intentionally ignored */
}
62 
63 #ifdef RUBY_PLATFORM
/* Forward a message to rb_warn(); the text is passed as a "%s" argument,
 * never as the format string itself. */
extern void
onig_rb_warn(const char *s)
{
  rb_warn("%s", s);
}
69 
/* Forward a message to rb_warning(); the text is passed as a "%s" argument,
 * never as the format string itself. */
extern void
onig_rb_warning(const char *s)
{
  rb_warning("%s", s);
}
75 #endif
76 
77 #ifdef DEFAULT_WARN_FUNCTION
78 static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION;
79 #else
80 static OnigWarnFunc onig_warn = onig_null_warn;
81 #endif
82 
83 #ifdef DEFAULT_VERB_WARN_FUNCTION
84 static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION;
85 #else
86 static OnigWarnFunc onig_verb_warn = onig_null_warn;
87 #endif
88 
/* Install f as the warning callback stored in onig_warn. */
extern void
onig_set_warn_func(OnigWarnFunc f)
{
  onig_warn = f;
}
93 
/* Install f as the verbose-warning callback stored in onig_verb_warn. */
extern void
onig_set_verb_warn_func(OnigWarnFunc f)
{
  onig_verb_warn = f;
}
98 
99 static void
bbuf_free(BBuf * bbuf)100 bbuf_free(BBuf* bbuf)
101 {
102   if (IS_NOT_NULL(bbuf)) {
103     if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p);
104     xfree(bbuf);
105   }
106 }
107 
108 static int
bbuf_clone(BBuf ** rto,BBuf * from)109 bbuf_clone(BBuf** rto, BBuf* from)
110 {
111   int r;
112   BBuf *to;
113 
114   *rto = to = (BBuf* )xmalloc(sizeof(BBuf));
115   CHECK_NULL_RETURN_VAL(to, ONIGERR_MEMORY);
116   r = BBUF_INIT(to, from->alloc);
117   if (r != 0) return r;
118   to->used = from->used;
119   xmemcpy(to->p, from->p, from->used);
120   return 0;
121 }
122 
/* Clear the bits f in v when `negative` is non-zero, otherwise set them.
 * The whole expansion is parenthesized so it can be used safely as an
 * operand of a larger expression. */
#define ONOFF(v,f,negative)    ((negative) ? ((v) &= ~(f)) : ((v) |= (f)))
124 
/* Lowest code point of the multi-byte range: 0 when the encoding's minimum
 * character length exceeds one byte, 0x80 otherwise. */
#define MBCODE_START_POS(enc) \
  (OnigCodePoint )((ONIGENC_MBC_MINLEN(enc) > 1) ? 0 : 0x80)

/* Add the range [MBCODE_START_POS(enc), all-ones code point] to pbuf. */
#define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \
  add_code_range_to_buf(pbuf, MBCODE_START_POS(enc), ~((OnigCodePoint )0))

/* For non-single-byte encodings, add the whole multi-byte range to mbuf.
 * Uses the caller's local `r` and returns from the enclosing function
 * when the add fails. */
#define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\
  if (! ONIGENC_IS_SINGLEBYTE(enc)) {\
    r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\
    if (r != 0) return r;\
  }\
} while (0)
137 
138 
/* Set `empty` to 1 when every word of bs is zero, otherwise to 0. */
#define BITSET_IS_EMPTY(bs,empty) do {\
  int i = 0;\
  empty = 1;\
  while (i < BITSET_SIZE) {\
    if ((bs)[i] != 0) {\
      empty = 0; break;\
    }\
    i++;\
  }\
} while (0)
148 
149 static void
bitset_set_range(BitSetRef bs,int from,int to)150 bitset_set_range(BitSetRef bs, int from, int to)
151 {
152   int i;
153   for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) {
154     BITSET_SET_BIT(bs, i);
155   }
156 }
157 
#if 0
/* Disabled: turn on every bit of bs. */
static void
bitset_set_all(BitSetRef bs)
{
  int w = 0;

  while (w < BITSET_SIZE) {
    bs[w] = ~((Bits )0);
    w++;
  }
}
#endif
168 
169 static void
bitset_invert(BitSetRef bs)170 bitset_invert(BitSetRef bs)
171 {
172   int i;
173   for (i = 0; i < BITSET_SIZE; i++) {
174     bs[i] = ~(bs[i]);
175   }
176 }
177 
178 static void
bitset_invert_to(BitSetRef from,BitSetRef to)179 bitset_invert_to(BitSetRef from, BitSetRef to)
180 {
181   int i;
182   for (i = 0; i < BITSET_SIZE; i++) {
183     to[i] = ~(from[i]);
184   }
185 }
186 
187 static void
bitset_and(BitSetRef dest,BitSetRef bs)188 bitset_and(BitSetRef dest, BitSetRef bs)
189 {
190   int i;
191   for (i = 0; i < BITSET_SIZE; i++) {
192     dest[i] &= bs[i];
193   }
194 }
195 
196 static void
bitset_or(BitSetRef dest,BitSetRef bs)197 bitset_or(BitSetRef dest, BitSetRef bs)
198 {
199   int i;
200   for (i = 0; i < BITSET_SIZE; i++) {
201     dest[i] |= bs[i];
202   }
203 }
204 
205 static void
bitset_copy(BitSetRef dest,BitSetRef bs)206 bitset_copy(BitSetRef dest, BitSetRef bs)
207 {
208   int i;
209   for (i = 0; i < BITSET_SIZE; i++) {
210     dest[i] = bs[i];
211   }
212 }
213 
214 extern int
onig_strncmp(const UChar * s1,const UChar * s2,int n)215 onig_strncmp(const UChar* s1, const UChar* s2, int n)
216 {
217   int x;
218 
219   while (n-- > 0) {
220     x = *s2++ - *s1++;
221     if (x) return x;
222   }
223   return 0;
224 }
225 
226 static void
k_strcpy(UChar * dest,const UChar * src,const UChar * end)227 k_strcpy(UChar* dest, const UChar* src, const UChar* end)
228 {
229   int len = end - src;
230   if (len > 0) {
231     xmemcpy(dest, src, len);
232     dest[len] = (UChar )0;
233   }
234 }
235 
236 static UChar*
strdup_with_null(OnigEncoding enc,UChar * s,UChar * end)237 strdup_with_null(OnigEncoding enc, UChar* s, UChar* end)
238 {
239   int slen, term_len, i;
240   UChar *r;
241 
242   slen = end - s;
243   term_len = ONIGENC_MBC_MINLEN(enc);
244 
245   r = (UChar* )xmalloc(slen + term_len);
246   CHECK_NULL_RETURN(r);
247   xmemcpy(r, s, slen);
248 
249   for (i = 0; i < term_len; i++)
250     r[slen + i] = (UChar )0;
251 
252   return r;
253 }
254 
255 
/*
 * Pattern scanning helpers.  All of them operate on the caller's locals
 * `p` (cursor), `end` (end of pattern) and `enc` (encoding); PFETCH_READY
 * declares the `pfetch_prev` local that remembers the previous cursor.
 */
#define PEND_VALUE   0

#define PFETCH_READY  UChar* pfetch_prev
/* 1 when the cursor has reached the end of the pattern, else 0 */
#define PEND         (p < end ?  0 : 1)
/* step the cursor back to where it was before the last PFETCH/PINC;
 * parenthesized so it stays a single expression */
#define PUNFETCH     (p = pfetch_prev)
/* skip one character */
#define PINC       do { \
  pfetch_prev = p; \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)
/* read one character into c and advance past it */
#define PFETCH(c)  do { \
  c = ONIGENC_MBC_TO_CODE(enc, p, end); \
  pfetch_prev = p; \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)

/* code point at the cursor without advancing; PEND_VALUE at the end */
#define PPEEK        (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)
/* argument is parenthesized so the cast covers a whole expression */
#define PPEEK_IS(c)  (PPEEK == (OnigCodePoint )(c))
274 
275 static UChar*
k_strcat_capa(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa)276 k_strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end,
277 	      int capa)
278 {
279   UChar* r;
280 
281   if (dest)
282     r = (UChar* )xrealloc(dest, capa + 1);
283   else
284     r = (UChar* )xmalloc(capa + 1);
285 
286   CHECK_NULL_RETURN(r);
287   k_strcpy(r + (dest_end - dest), src, src_end);
288   return r;
289 }
290 
291 /* dest on static area */
292 static UChar*
strcat_capa_from_static(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa)293 strcat_capa_from_static(UChar* dest, UChar* dest_end,
294 			const UChar* src, const UChar* src_end, int capa)
295 {
296   UChar* r;
297 
298   r = (UChar* )xmalloc(capa + 1);
299   CHECK_NULL_RETURN(r);
300   k_strcpy(r, dest, dest_end);
301   k_strcpy(r + (dest_end - dest), src, src_end);
302   return r;
303 }
304 
305 #ifdef USE_NAMED_GROUP
306 
307 #define INIT_NAME_BACKREFS_ALLOC_NUM   8
308 
309 typedef struct {
310   UChar* name;
311   int    name_len;   /* byte length */
312   int    back_num;   /* number of backrefs */
313   int    back_alloc;
314   int    back_ref1;
315   int*   back_refs;
316 } NameEntry;
317 
318 #ifdef USE_ST_HASH_TABLE
319 
320 #include "st.h"
321 
322 typedef struct {
323   unsigned char* s;
324   unsigned char* end;
325 } st_strend_key;
326 
327 static int strend_cmp(st_strend_key*, st_strend_key*);
328 static int strend_hash(st_strend_key*);
329 
330 static struct st_hash_type type_strend_hash = {
331   strend_cmp,
332   strend_hash,
333 };
334 
335 static st_table*
onig_st_init_strend_table_with_size(int size)336 onig_st_init_strend_table_with_size(int size)
337 {
338     return onig_st_init_table_with_size(&type_strend_hash, size);
339 }
340 
341 static int
onig_st_lookup_strend(st_table * table,const UChar * str_key,const UChar * end_key,st_data_t * value)342 onig_st_lookup_strend(st_table *table, const UChar* str_key, const UChar* end_key, st_data_t *value)
343 {
344     st_strend_key key;
345 
346     key.s   = (unsigned char* )str_key;
347     key.end = (unsigned char* )end_key;
348 
349     return onig_st_lookup(table, (st_data_t )(&key), value);
350 }
351 
352 static int
onig_st_insert_strend(st_table * table,const UChar * str_key,const UChar * end_key,st_data_t value)353 onig_st_insert_strend(st_table *table, const UChar* str_key, const UChar* end_key, st_data_t value)
354 {
355   st_strend_key* key;
356   int result;
357 
358   key = (st_strend_key* )xmalloc(sizeof(st_strend_key));
359   key->s   = (unsigned char* )str_key;
360   key->end = (unsigned char* )end_key;
361   result = onig_st_insert(table, (st_data_t )key, value);
362   if (result) {
363     xfree(key);
364   }
365   return result;
366 }
367 
368 static int
strend_cmp(st_strend_key * x,st_strend_key * y)369 strend_cmp(st_strend_key* x, st_strend_key* y)
370 {
371   unsigned char *p, *q;
372   int c;
373 
374   if ((x->end - x->s) != (y->end - y->s))
375     return 1;
376 
377   p = x->s;
378   q = y->s;
379   while (p < x->end) {
380     c = (int )*p - (int )*q;
381     if (c != 0) return c;
382 
383     p++; q++;
384   }
385 
386   return 0;
387 }
388 
389 static int
strend_hash(st_strend_key * x)390 strend_hash(st_strend_key* x)
391 {
392   int val;
393   unsigned char *p;
394 
395   val = 0;
396   p = x->s;
397   while (p < x->end) {
398     val = val * 997 + (int )*p++;
399   }
400 
401   return val + (val >> 5);
402 }
403 
404 typedef st_table  NameTable;
405 typedef st_data_t HashDataType;   /* 1.6 st.h doesn't define st_data_t type */
406 
407 #define NAMEBUF_SIZE    24
408 #define NAMEBUF_SIZE_1  25
409 
410 #ifdef ONIG_DEBUG
411 static int
i_print_name_entry(UChar * key,NameEntry * e,void * arg)412 i_print_name_entry(UChar* key, NameEntry* e, void* arg)
413 {
414   int i;
415   FILE* fp = (FILE* )arg;
416 
417   fprintf(fp, "%s: ", e->name);
418   if (e->back_num == 0)
419     fputs("-", fp);
420   else if (e->back_num == 1)
421     fprintf(fp, "%d", e->back_ref1);
422   else {
423     for (i = 0; i < e->back_num; i++) {
424       if (i > 0) fprintf(fp, ", ");
425       fprintf(fp, "%d", e->back_refs[i]);
426     }
427   }
428   fputs("\n", fp);
429   return ST_CONTINUE;
430 }
431 
432 extern int
onig_print_names(FILE * fp,regex_t * reg)433 onig_print_names(FILE* fp, regex_t* reg)
434 {
435   NameTable* t = (NameTable* )reg->name_table;
436 
437   if (IS_NOT_NULL(t)) {
438     fprintf(fp, "name table\n");
439     onig_st_foreach(t, i_print_name_entry, (HashDataType )fp);
440     fputs("\n", fp);
441   }
442   return 0;
443 }
444 #endif
445 
446 static int
i_free_name_entry(UChar * key,NameEntry * e,void * arg)447 i_free_name_entry(UChar* key, NameEntry* e, void* arg)
448 {
449   xfree(e->name);
450   if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
451   xfree(key);
452   xfree(e);
453   return ST_DELETE;
454 }
455 
456 static int
names_clear(regex_t * reg)457 names_clear(regex_t* reg)
458 {
459   NameTable* t = (NameTable* )reg->name_table;
460 
461   if (IS_NOT_NULL(t)) {
462     onig_st_foreach(t, i_free_name_entry, 0);
463   }
464   return 0;
465 }
466 
467 extern int
onig_names_free(regex_t * reg)468 onig_names_free(regex_t* reg)
469 {
470   int r;
471   NameTable* t;
472 
473   r = names_clear(reg);
474   if (r) return r;
475 
476   t = (NameTable* )reg->name_table;
477   if (IS_NOT_NULL(t)) onig_st_free_table(t);
478   reg->name_table = (void* )NULL;
479   return 0;
480 }
481 
482 static NameEntry*
name_find(regex_t * reg,const UChar * name,const UChar * name_end)483 name_find(regex_t* reg, const UChar* name, const UChar* name_end)
484 {
485   NameEntry* e;
486   NameTable* t = (NameTable* )reg->name_table;
487 
488   e = (NameEntry* )NULL;
489   if (IS_NOT_NULL(t)) {
490     onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e)));
491   }
492   return e;
493 }
494 
495 typedef struct {
496   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*);
497   regex_t* reg;
498   void* arg;
499   int ret;
500   OnigEncoding enc;
501 } INamesArg;
502 
503 static int
i_names(UChar * key,NameEntry * e,INamesArg * arg)504 i_names(UChar* key, NameEntry* e, INamesArg* arg)
505 {
506   int r = (*(arg->func))(e->name,
507                    /*e->name + onigenc_str_bytelen_null(arg->enc, e->name), */
508                          e->name + e->name_len,
509                          e->back_num,
510 			 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
511 			 arg->reg, arg->arg);
512   if (r != 0) {
513     arg->ret = r;
514     return ST_STOP;
515   }
516   return ST_CONTINUE;
517 }
518 
519 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)520 onig_foreach_name(regex_t* reg,
521 	   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*),
522 	   void* arg)
523 {
524   INamesArg narg;
525   NameTable* t = (NameTable* )reg->name_table;
526 
527   narg.ret = 0;
528   if (IS_NOT_NULL(t)) {
529     narg.func = func;
530     narg.reg  = reg;
531     narg.arg  = arg;
532     narg.enc  = reg->enc; /* should be pattern encoding. */
533     onig_st_foreach(t, i_names, (HashDataType )&narg);
534   }
535   return narg.ret;
536 }
537 
538 static int
i_renumber_name(UChar * key,NameEntry * e,GroupNumRemap * map)539 i_renumber_name(UChar* key, NameEntry* e, GroupNumRemap* map)
540 {
541   int i;
542 
543   if (e->back_num > 1) {
544     for (i = 0; i < e->back_num; i++) {
545       e->back_refs[i] = map[e->back_refs[i]].new_val;
546     }
547   }
548   else if (e->back_num == 1) {
549     e->back_ref1 = map[e->back_ref1].new_val;
550   }
551 
552   return ST_CONTINUE;
553 }
554 
555 extern int
onig_renumber_name_table(regex_t * reg,GroupNumRemap * map)556 onig_renumber_name_table(regex_t* reg, GroupNumRemap* map)
557 {
558   NameTable* t = (NameTable* )reg->name_table;
559 
560   if (IS_NOT_NULL(t)) {
561     onig_st_foreach(t, i_renumber_name, (HashDataType )map);
562   }
563   return 0;
564 }
565 
566 
567 extern int
onig_number_of_names(regex_t * reg)568 onig_number_of_names(regex_t* reg)
569 {
570   NameTable* t = (NameTable* )reg->name_table;
571 
572   if (IS_NOT_NULL(t))
573     return t->num_entries;
574   else
575     return 0;
576 }
577 
578 #else  /* USE_ST_HASH_TABLE */
579 
580 #define INIT_NAMES_ALLOC_NUM    8
581 
582 typedef struct {
583   NameEntry* e;
584   int        num;
585   int        alloc;
586 } NameTable;
587 
588 
589 #ifdef ONIG_DEBUG
590 extern int
onig_print_names(FILE * fp,regex_t * reg)591 onig_print_names(FILE* fp, regex_t* reg)
592 {
593   int i, j;
594   NameEntry* e;
595   NameTable* t = (NameTable* )reg->name_table;
596 
597   if (IS_NOT_NULL(t) && t->num > 0) {
598     fprintf(fp, "name table\n");
599     for (i = 0; i < t->num; i++) {
600       e = &(t->e[i]);
601       fprintf(fp, "%s: ", e->name);
602       if (e->back_num == 0) {
603 	fputs("-", fp);
604       }
605       else if (e->back_num == 1) {
606 	fprintf(fp, "%d", e->back_ref1);
607       }
608       else {
609 	for (j = 0; j < e->back_num; j++) {
610 	  if (j > 0) fprintf(fp, ", ");
611 	  fprintf(fp, "%d", e->back_refs[j]);
612 	}
613       }
614       fputs("\n", fp);
615     }
616     fputs("\n", fp);
617   }
618   return 0;
619 }
620 #endif
621 
622 static int
names_clear(regex_t * reg)623 names_clear(regex_t* reg)
624 {
625   int i;
626   NameEntry* e;
627   NameTable* t = (NameTable* )reg->name_table;
628 
629   if (IS_NOT_NULL(t)) {
630     for (i = 0; i < t->num; i++) {
631       e = &(t->e[i]);
632       if (IS_NOT_NULL(e->name)) {
633 	xfree(e->name);
634 	e->name       = NULL;
635 	e->name_len   = 0;
636 	e->back_num   = 0;
637 	e->back_alloc = 0;
638 	if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
639 	e->back_refs = (int* )NULL;
640       }
641     }
642     if (IS_NOT_NULL(t->e)) {
643       xfree(t->e);
644       t->e = NULL;
645     }
646     t->num = 0;
647   }
648   return 0;
649 }
650 
651 extern int
onig_names_free(regex_t * reg)652 onig_names_free(regex_t* reg)
653 {
654   int r;
655   NameTable* t;
656 
657   r = names_clear(reg);
658   if (r) return r;
659 
660   t = (NameTable* )reg->name_table;
661   if (IS_NOT_NULL(t)) xfree(t);
662   reg->name_table = NULL;
663   return 0;
664 }
665 
666 static NameEntry*
name_find(regex_t * reg,UChar * name,UChar * name_end)667 name_find(regex_t* reg, UChar* name, UChar* name_end)
668 {
669   int i, len;
670   NameEntry* e;
671   NameTable* t = (NameTable* )reg->name_table;
672 
673   if (IS_NOT_NULL(t)) {
674     len = name_end - name;
675     for (i = 0; i < t->num; i++) {
676       e = &(t->e[i]);
677       if (len == e->name_len && onig_strncmp(name, e->name, len) == 0)
678 	return e;
679     }
680   }
681   return (NameEntry* )NULL;
682 }
683 
684 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)685 onig_foreach_name(regex_t* reg,
686 	   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*),
687 	   void* arg)
688 {
689   int i, r;
690   NameEntry* e;
691   NameTable* t = (NameTable* )reg->name_table;
692 
693   if (IS_NOT_NULL(t)) {
694     for (i = 0; i < t->num; i++) {
695       e = &(t->e[i]);
696       r = (*func)(e->name, e->name + e->name_len, e->back_num,
697 		  (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
698 		  reg, arg);
699       if (r != 0) return r;
700     }
701   }
702   return 0;
703 }
704 
705 extern int
onig_number_of_names(regex_t * reg)706 onig_number_of_names(regex_t* reg)
707 {
708   NameTable* t = (NameTable* )reg->name_table;
709 
710   if (IS_NOT_NULL(t))
711     return t->num;
712   else
713     return 0;
714 }
715 
716 #endif /* else USE_ST_HASH_TABLE */
717 
718 static int
name_add(regex_t * reg,UChar * name,UChar * name_end,int backref,ScanEnv * env)719 name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env)
720 {
721   int alloc;
722   NameEntry* e;
723   NameTable* t = (NameTable* )reg->name_table;
724 
725   if (name_end - name <= 0)
726     return ONIGERR_EMPTY_GROUP_NAME;
727 
728   e = name_find(reg, name, name_end);
729   if (IS_NULL(e)) {
730 #ifdef USE_ST_HASH_TABLE
731     if (IS_NULL(t)) {
732       t = onig_st_init_strend_table_with_size(5);
733       reg->name_table = (void* )t;
734     }
735     e = (NameEntry* )xmalloc(sizeof(NameEntry));
736     CHECK_NULL_RETURN_VAL(e, ONIGERR_MEMORY);
737 
738     e->name = strdup_with_null(reg->enc, name, name_end);
739     if (IS_NULL(e->name)) return ONIGERR_MEMORY;
740     onig_st_insert_strend(t, e->name, (e->name + (name_end - name)),
741                           (HashDataType )e);
742 
743     e->name_len   = name_end - name;
744     e->back_num   = 0;
745     e->back_alloc = 0;
746     e->back_refs  = (int* )NULL;
747 
748 #else
749 
750     if (IS_NULL(t)) {
751       alloc = INIT_NAMES_ALLOC_NUM;
752       t = (NameTable* )xmalloc(sizeof(NameTable));
753       CHECK_NULL_RETURN_VAL(t, ONIGERR_MEMORY);
754       t->e     = NULL;
755       t->alloc = 0;
756       t->num   = 0;
757 
758       t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc);
759       if (IS_NULL(t->e)) {
760 	xfree(t);
761 	return ONIGERR_MEMORY;
762       }
763       t->alloc = alloc;
764       reg->name_table = t;
765       goto clear;
766     }
767     else if (t->num == t->alloc) {
768       int i;
769 
770       alloc = t->alloc * 2;
771       t->e = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc);
772       CHECK_NULL_RETURN_VAL(t->e, ONIGERR_MEMORY);
773       t->alloc = alloc;
774 
775     clear:
776       for (i = t->num; i < t->alloc; i++) {
777 	t->e[i].name       = NULL;
778 	t->e[i].name_len   = 0;
779 	t->e[i].back_num   = 0;
780 	t->e[i].back_alloc = 0;
781 	t->e[i].back_refs  = (int* )NULL;
782       }
783     }
784     e = &(t->e[t->num]);
785     t->num++;
786     e->name = strdup_with_null(reg->enc, name, name_end);
787     e->name_len = name_end - name;
788 #endif
789   }
790 
791   if (e->back_num >= 1 &&
792       ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) {
793     onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME,
794 				    name, name_end);
795     return ONIGERR_MULTIPLEX_DEFINED_NAME;
796   }
797 
798   e->back_num++;
799   if (e->back_num == 1) {
800     e->back_ref1 = backref;
801   }
802   else {
803     if (e->back_num == 2) {
804       alloc = INIT_NAME_BACKREFS_ALLOC_NUM;
805       e->back_refs = (int* )xmalloc(sizeof(int) * alloc);
806       CHECK_NULL_RETURN_VAL(e->back_refs, ONIGERR_MEMORY);
807       e->back_alloc = alloc;
808       e->back_refs[0] = e->back_ref1;
809       e->back_refs[1] = backref;
810     }
811     else {
812       if (e->back_num > e->back_alloc) {
813 	alloc = e->back_alloc * 2;
814 	e->back_refs = (int* )xrealloc(e->back_refs, sizeof(int) * alloc);
815 	CHECK_NULL_RETURN_VAL(e->back_refs, ONIGERR_MEMORY);
816 	e->back_alloc = alloc;
817       }
818       e->back_refs[e->back_num - 1] = backref;
819     }
820   }
821 
822   return 0;
823 }
824 
825 extern int
onig_name_to_group_numbers(regex_t * reg,const UChar * name,const UChar * name_end,int ** nums)826 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
827 			   const UChar* name_end, int** nums)
828 {
829   NameEntry* e;
830 
831   e = name_find(reg, name, name_end);
832   if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE;
833 
834   switch (e->back_num) {
835   case 0:
836     break;
837   case 1:
838     *nums = &(e->back_ref1);
839     break;
840   default:
841     *nums = e->back_refs;
842     break;
843   }
844   return e->back_num;
845 }
846 
847 extern int
onig_name_to_backref_number(regex_t * reg,const UChar * name,const UChar * name_end,OnigRegion * region)848 onig_name_to_backref_number(regex_t* reg, const UChar* name,
849 			    const UChar* name_end, OnigRegion *region)
850 {
851   int i, n, *nums;
852 
853   n = onig_name_to_group_numbers(reg, name, name_end, &nums);
854   if (n < 0)
855     return n;
856   else if (n == 0)
857     return ONIGERR_PARSER_BUG;
858   else if (n == 1)
859     return nums[0];
860   else {
861     if (IS_NOT_NULL(region)) {
862       for (i = n - 1; i >= 0; i--) {
863 	if (region->beg[nums[i]] != ONIG_REGION_NOTPOS)
864 	  return nums[i];
865       }
866     }
867     return nums[n - 1];
868   }
869 }
870 
871 #else /* USE_NAMED_GROUP */
872 
873 extern int
onig_name_to_group_numbers(regex_t * reg,const UChar * name,const UChar * name_end,int ** nums)874 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
875 			   const UChar* name_end, int** nums)
876 {
877   return ONIG_NO_SUPPORT_CONFIG;
878 }
879 
880 extern int
onig_name_to_backref_number(regex_t * reg,const UChar * name,const UChar * name_end,OnigRegion * region)881 onig_name_to_backref_number(regex_t* reg, const UChar* name,
882 			    const UChar* name_end, OnigRegion* region)
883 {
884   return ONIG_NO_SUPPORT_CONFIG;
885 }
886 
887 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)888 onig_foreach_name(regex_t* reg,
889 	   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*),
890 	   void* arg)
891 {
892   return ONIG_NO_SUPPORT_CONFIG;
893 }
894 
895 extern int
onig_number_of_names(regex_t * reg)896 onig_number_of_names(regex_t* reg)
897 {
898   return 0;
899 }
900 #endif /* else USE_NAMED_GROUP */
901 
902 extern int
onig_noname_group_capture_is_active(regex_t * reg)903 onig_noname_group_capture_is_active(regex_t* reg)
904 {
905   if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_DONT_CAPTURE_GROUP))
906     return 0;
907 
908 #ifdef USE_NAMED_GROUP
909   if (onig_number_of_names(reg) > 0 &&
910       IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
911       !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
912     return 0;
913   }
914 #endif
915 
916   return 1;
917 }
918 
919 
920 #define INIT_SCANENV_MEMNODES_ALLOC_SIZE   16
921 
922 static void
scan_env_clear(ScanEnv * env)923 scan_env_clear(ScanEnv* env)
924 {
925   int i;
926 
927   BIT_STATUS_CLEAR(env->capture_history);
928   BIT_STATUS_CLEAR(env->bt_mem_start);
929   BIT_STATUS_CLEAR(env->bt_mem_end);
930   BIT_STATUS_CLEAR(env->backrefed_mem);
931   env->error             = (UChar* )NULL;
932   env->error_end         = (UChar* )NULL;
933   env->num_call          = 0;
934   env->num_mem           = 0;
935 #ifdef USE_NAMED_GROUP
936   env->num_named         = 0;
937 #endif
938   env->mem_alloc         = 0;
939   env->mem_nodes_dynamic = (Node** )NULL;
940 
941   for (i = 0; i < SCANENV_MEMNODES_SIZE; i++)
942     env->mem_nodes_static[i] = NULL_NODE;
943 
944 #ifdef USE_COMBINATION_EXPLOSION_CHECK
945   env->num_comb_exp_check  = 0;
946   env->comb_exp_max_regnum = 0;
947   env->curr_max_regnum     = 0;
948   env->has_recursion       = 0;
949 #endif
950 }
951 
952 static int
scan_env_add_mem_entry(ScanEnv * env)953 scan_env_add_mem_entry(ScanEnv* env)
954 {
955   int i, need, alloc;
956   Node** p;
957 
958   need = env->num_mem + 1;
959   if (need >= SCANENV_MEMNODES_SIZE) {
960     if (env->mem_alloc <= need) {
961       if (IS_NULL(env->mem_nodes_dynamic)) {
962 	alloc = INIT_SCANENV_MEMNODES_ALLOC_SIZE;
963 	p = (Node** )xmalloc(sizeof(Node*) * alloc);
964 	xmemcpy(p, env->mem_nodes_static,
965 		sizeof(Node*) * SCANENV_MEMNODES_SIZE);
966       }
967       else {
968 	alloc = env->mem_alloc * 2;
969 	p = (Node** )xrealloc(env->mem_nodes_dynamic, sizeof(Node*) * alloc);
970       }
971       CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY);
972 
973       for (i = env->num_mem + 1; i < alloc; i++)
974 	p[i] = NULL_NODE;
975 
976       env->mem_nodes_dynamic = p;
977       env->mem_alloc = alloc;
978     }
979   }
980 
981   env->num_mem++;
982   return env->num_mem;
983 }
984 
985 static int
scan_env_set_mem_node(ScanEnv * env,int num,Node * node)986 scan_env_set_mem_node(ScanEnv* env, int num, Node* node)
987 {
988   if (env->num_mem >= num)
989     SCANENV_MEM_NODES(env)[num] = node;
990   else
991     return ONIGERR_PARSER_BUG;
992   return 0;
993 }
994 
995 
996 #ifdef USE_RECYCLE_NODE
997 typedef struct _FreeNode {
998   struct _FreeNode* next;
999 } FreeNode;
1000 
1001 static FreeNode* FreeNodeList = (FreeNode* )NULL;
1002 #endif
1003 
1004 extern void
onig_node_free(Node * node)1005 onig_node_free(Node* node)
1006 {
1007  start:
1008   if (IS_NULL(node)) return ;
1009 
1010   switch (NTYPE(node)) {
1011   case N_STRING:
1012     if (IS_NOT_NULL(NSTRING(node).s) && NSTRING(node).s != NSTRING(node).buf) {
1013       xfree(NSTRING(node).s);
1014     }
1015     break;
1016 
1017   case N_LIST:
1018   case N_ALT:
1019     onig_node_free(NCONS(node).left);
1020     /* onig_node_free(NCONS(node).right); */
1021     {
1022       Node* next_node = NCONS(node).right;
1023 
1024 #ifdef USE_RECYCLE_NODE
1025       {
1026 	FreeNode* n = (FreeNode* )node;
1027 
1028         THREAD_ATOMIC_START;
1029 	n->next = FreeNodeList;
1030 	FreeNodeList = n;
1031         THREAD_ATOMIC_END;
1032       }
1033 #else
1034       xfree(node);
1035 #endif
1036 
1037       node = next_node;
1038       goto start;
1039     }
1040     break;
1041 
1042   case N_CCLASS:
1043     {
1044       CClassNode* cc = &(NCCLASS(node));
1045 
1046       if (IS_CCLASS_SHARE(cc))
1047         return ;
1048 
1049       if (cc->mbuf)
1050         bbuf_free(cc->mbuf);
1051     }
1052     break;
1053 
1054   case N_QUANTIFIER:
1055     if (NQUANTIFIER(node).target)
1056       onig_node_free(NQUANTIFIER(node).target);
1057     break;
1058 
1059   case N_EFFECT:
1060     if (NEFFECT(node).target)
1061       onig_node_free(NEFFECT(node).target);
1062     break;
1063 
1064   case N_BACKREF:
1065     if (IS_NOT_NULL(NBACKREF(node).back_dynamic))
1066       xfree(NBACKREF(node).back_dynamic);
1067     break;
1068 
1069   case N_ANCHOR:
1070     if (NANCHOR(node).target)
1071       onig_node_free(NANCHOR(node).target);
1072     break;
1073   }
1074 
1075 #ifdef USE_RECYCLE_NODE
1076   {
1077     FreeNode* n = (FreeNode* )node;
1078 
1079     THREAD_ATOMIC_START;
1080     n->next = FreeNodeList;
1081     FreeNodeList = n;
1082     THREAD_ATOMIC_END;
1083   }
1084 #else
1085   xfree(node);
1086 #endif
1087 }
1088 
1089 #ifdef USE_RECYCLE_NODE
1090 extern int
onig_free_node_list(void)1091 onig_free_node_list(void)
1092 {
1093   FreeNode* n;
1094 
1095   /* THREAD_ATOMIC_START; */
1096   while (IS_NOT_NULL(FreeNodeList)) {
1097     n = FreeNodeList;
1098     FreeNodeList = FreeNodeList->next;
1099     xfree(n);
1100   }
1101   /* THREAD_ATOMIC_END; */
1102   return 0;
1103 }
1104 #endif
1105 
1106 static Node*
node_new(void)1107 node_new(void)
1108 {
1109   Node* node;
1110 
1111 #ifdef USE_RECYCLE_NODE
1112   THREAD_ATOMIC_START;
1113   if (IS_NOT_NULL(FreeNodeList)) {
1114     node = (Node* )FreeNodeList;
1115     FreeNodeList = FreeNodeList->next;
1116     THREAD_ATOMIC_END;
1117     return node;
1118   }
1119   THREAD_ATOMIC_END;
1120 #endif
1121 
1122   node = (Node* )xmalloc(sizeof(Node));
1123   return node;
1124 }
1125 
1126 
1127 static void
initialize_cclass(CClassNode * cc)1128 initialize_cclass(CClassNode* cc)
1129 {
1130   BITSET_CLEAR(cc->bs);
1131   cc->flags = 0;
1132   cc->mbuf  = NULL;
1133 }
1134 
1135 static Node*
node_new_cclass(void)1136 node_new_cclass(void)
1137 {
1138   Node* node = node_new();
1139   CHECK_NULL_RETURN(node);
1140   node->type = N_CCLASS;
1141 
1142   initialize_cclass(&(NCCLASS(node)));
1143   return node;
1144 }
1145 
1146 static Node*
node_new_cclass_by_codepoint_range(int not,const OnigCodePoint sbr[],const OnigCodePoint mbr[])1147 node_new_cclass_by_codepoint_range(int not,
1148                    const OnigCodePoint sbr[], const OnigCodePoint mbr[])
1149 {
1150   CClassNode* cc;
1151   int n, i, j;
1152 
1153   Node* node = node_new();
1154   CHECK_NULL_RETURN(node);
1155   node->type = N_CCLASS;
1156 
1157   cc = &(NCCLASS(node));
1158   cc->flags = 0;
1159   if (not != 0) CCLASS_SET_NOT(cc);
1160 
1161   BITSET_CLEAR(cc->bs);
1162   if (IS_NOT_NULL(sbr)) {
1163     n = ONIGENC_CODE_RANGE_NUM(sbr);
1164     for (i = 0; i < n; i++) {
1165       for (j  = ONIGENC_CODE_RANGE_FROM(sbr, i);
1166            j <= (int )ONIGENC_CODE_RANGE_TO(sbr, i); j++) {
1167         BITSET_SET_BIT(cc->bs, j);
1168       }
1169     }
1170   }
1171 
1172   if (IS_NULL(mbr)) {
1173   is_null:
1174     cc->mbuf = NULL;
1175   }
1176   else {
1177     BBuf* bbuf;
1178 
1179     n = ONIGENC_CODE_RANGE_NUM(mbr);
1180     if (n == 0) goto is_null;
1181 
1182     bbuf = (BBuf* )xmalloc(sizeof(BBuf));
1183     CHECK_NULL_RETURN_VAL(bbuf, NULL);
1184     bbuf->alloc = n + 1;
1185     bbuf->used  = n + 1;
1186     bbuf->p     = (UChar* )((void* )mbr);
1187 
1188     cc->mbuf = bbuf;
1189   }
1190 
1191   return node;
1192 }
1193 
1194 static Node*
node_new_ctype(int type)1195 node_new_ctype(int type)
1196 {
1197   Node* node = node_new();
1198   CHECK_NULL_RETURN(node);
1199   node->type = N_CTYPE;
1200   NCTYPE(node).type = type;
1201   return node;
1202 }
1203 
1204 static Node*
node_new_anychar(void)1205 node_new_anychar(void)
1206 {
1207   Node* node = node_new();
1208   CHECK_NULL_RETURN(node);
1209   node->type = N_ANYCHAR;
1210   return node;
1211 }
1212 
1213 static Node*
node_new_list(Node * left,Node * right)1214 node_new_list(Node* left, Node* right)
1215 {
1216   Node* node = node_new();
1217   CHECK_NULL_RETURN(node);
1218   node->type = N_LIST;
1219   NCONS(node).left  = left;
1220   NCONS(node).right = right;
1221   return node;
1222 }
1223 
1224 extern Node*
onig_node_new_list(Node * left,Node * right)1225 onig_node_new_list(Node* left, Node* right)
1226 {
1227   return node_new_list(left, right);
1228 }
1229 
1230 static Node*
node_new_alt(Node * left,Node * right)1231 node_new_alt(Node* left, Node* right)
1232 {
1233   Node* node = node_new();
1234   CHECK_NULL_RETURN(node);
1235   node->type = N_ALT;
1236   NCONS(node).left  = left;
1237   NCONS(node).right = right;
1238   return node;
1239 }
1240 
1241 extern Node*
onig_node_new_anchor(int type)1242 onig_node_new_anchor(int type)
1243 {
1244   Node* node = node_new();
1245   CHECK_NULL_RETURN(node);
1246   node->type = N_ANCHOR;
1247   NANCHOR(node).type     = type;
1248   NANCHOR(node).target   = NULL;
1249   NANCHOR(node).char_len = -1;
1250   return node;
1251 }
1252 
1253 static Node*
node_new_backref(int back_num,int * backrefs,int by_name,int exist_level,int nest_level,ScanEnv * env)1254 node_new_backref(int back_num, int* backrefs, int by_name,
1255 #ifdef USE_BACKREF_AT_LEVEL
1256 		 int exist_level, int nest_level,
1257 #endif
1258 		 ScanEnv* env)
1259 {
1260   int i;
1261   Node* node = node_new();
1262 
1263   CHECK_NULL_RETURN(node);
1264   node->type = N_BACKREF;
1265   NBACKREF(node).state    = 0;
1266   NBACKREF(node).back_num = back_num;
1267   NBACKREF(node).back_dynamic = (int* )NULL;
1268   if (by_name != 0)
1269     NBACKREF(node).state |= NST_NAME_REF;
1270 
1271 #ifdef USE_BACKREF_AT_LEVEL
1272   if (exist_level != 0) {
1273     NBACKREF(node).state |= NST_NEST_LEVEL;
1274     NBACKREF(node).nest_level  = nest_level;
1275   }
1276 #endif
1277 
1278   for (i = 0; i < back_num; i++) {
1279     if (backrefs[i] <= env->num_mem &&
1280 	IS_NULL(SCANENV_MEM_NODES(env)[backrefs[i]])) {
1281       NBACKREF(node).state |= NST_RECURSION;   /* /...(\1).../ */
1282       break;
1283     }
1284   }
1285 
1286   if (back_num <= NODE_BACKREFS_SIZE) {
1287     for (i = 0; i < back_num; i++)
1288       NBACKREF(node).back_static[i] = backrefs[i];
1289   }
1290   else {
1291     int* p = (int* )xmalloc(sizeof(int) * back_num);
1292     if (IS_NULL(p)) {
1293       onig_node_free(node);
1294       return NULL;
1295     }
1296     NBACKREF(node).back_dynamic = p;
1297     for (i = 0; i < back_num; i++)
1298       p[i] = backrefs[i];
1299   }
1300   return node;
1301 }
1302 
#ifdef USE_SUBEXP_CALL
/* Create an N_CALL node for the name spanning [name, name_end).
   The name pointers are stored, not copied.  The call starts with no
   target and ref_num set to CALLNODE_REFNUM_UNDEF.
   Returns NULL if no node could be obtained. */
static Node*
node_new_call(UChar* name, UChar* name_end)
{
  Node* call;

  call = node_new();
  CHECK_NULL_RETURN(call);

  call->type = N_CALL;
  NCALL(call).name     = name;
  NCALL(call).name_end = name_end;
  NCALL(call).target   = NULL_NODE;
  NCALL(call).ref_num  = CALLNODE_REFNUM_UNDEF;
  NCALL(call).state    = 0;
  return call;
}
#endif
1319 
1320 static Node*
node_new_quantifier(int lower,int upper,int by_number)1321 node_new_quantifier(int lower, int upper, int by_number)
1322 {
1323   Node* node = node_new();
1324   CHECK_NULL_RETURN(node);
1325   node->type = N_QUANTIFIER;
1326   NQUANTIFIER(node).state  = 0;
1327   NQUANTIFIER(node).target = NULL;
1328   NQUANTIFIER(node).lower  = lower;
1329   NQUANTIFIER(node).upper  = upper;
1330   NQUANTIFIER(node).greedy = 1;
1331   NQUANTIFIER(node).target_empty_info = NQ_TARGET_ISNOT_EMPTY;
1332   NQUANTIFIER(node).head_exact        = NULL_NODE;
1333   NQUANTIFIER(node).next_head_exact   = NULL_NODE;
1334   NQUANTIFIER(node).is_refered        = 0;
1335   if (by_number != 0)
1336     NQUANTIFIER(node).state |= NST_BY_NUMBER;
1337 
1338 #ifdef USE_COMBINATION_EXPLOSION_CHECK
1339   NQUANTIFIER(node).comb_exp_check_num = 0;
1340 #endif
1341 
1342   return node;
1343 }
1344 
1345 static Node*
node_new_effect(int type)1346 node_new_effect(int type)
1347 {
1348   Node* node = node_new();
1349   CHECK_NULL_RETURN(node);
1350   node->type = N_EFFECT;
1351   NEFFECT(node).type      = type;
1352   NEFFECT(node).state     =  0;
1353   NEFFECT(node).regnum    =  0;
1354   NEFFECT(node).option    =  0;
1355   NEFFECT(node).target    = NULL;
1356   NEFFECT(node).call_addr = -1;
1357   NEFFECT(node).opt_count =  0;
1358   return node;
1359 }
1360 
1361 extern Node*
onig_node_new_effect(int type)1362 onig_node_new_effect(int type)
1363 {
1364   return node_new_effect(type);
1365 }
1366 
1367 static Node*
node_new_effect_memory(OnigOptionType option,int is_named)1368 node_new_effect_memory(OnigOptionType option, int is_named)
1369 {
1370   Node* node = node_new_effect(EFFECT_MEMORY);
1371   CHECK_NULL_RETURN(node);
1372   if (is_named != 0)
1373     SET_EFFECT_STATUS(node, NST_NAMED_GROUP);
1374 
1375 #ifdef USE_SUBEXP_CALL
1376   NEFFECT(node).option = option;
1377 #endif
1378   return node;
1379 }
1380 
1381 static Node*
node_new_option(OnigOptionType option)1382 node_new_option(OnigOptionType option)
1383 {
1384   Node* node = node_new_effect(EFFECT_OPTION);
1385   CHECK_NULL_RETURN(node);
1386   NEFFECT(node).option = option;
1387   return node;
1388 }
1389 
/*
 * Append the bytes [s, end) to the string held by an N_STRING node.
 *
 * While NSTRING(node).capa is 0 and the combined length does not exceed
 * NODE_STR_BUF_SIZE - 1, the bytes are copied directly after the current
 * contents at NSTRING(node).s.  Otherwise the string is kept in a buffer
 * with capacity len + addlen + NODE_STR_MARGIN, obtained from one of the
 * strcat helpers below.
 *
 * Returns 0 on success (also when there is nothing to append) and
 * ONIGERR_MEMORY when a helper returns NULL.
 */
extern int
onig_node_str_cat(Node* node, const UChar* s, const UChar* end)
{
  int addlen = end - s;

  if (addlen > 0) {
    int len  = NSTRING(node).end - NSTRING(node).s;

    /* capa > 0 means a sized buffer is already in use */
    if (NSTRING(node).capa > 0 || (len + addlen > NODE_STR_BUF_SIZE - 1)) {
      UChar* p;
      int capa = len + addlen + NODE_STR_MARGIN;

      if (capa <= NSTRING(node).capa) {
	/* the recorded capacity already covers the result: copy in place */
	k_strcpy(NSTRING(node).s + len, s, end);
      }
      else {
	/* The string still lives in the node's own buf[]; the helper
	   presumably allocates a fresh buffer and copies both parts
	   (helper body not visible here). */
	if (NSTRING(node).s == NSTRING(node).buf)
	  p = strcat_capa_from_static(NSTRING(node).s, NSTRING(node).end,
				      s, end, capa);
	else
	  p = k_strcat_capa(NSTRING(node).s, NSTRING(node).end, s, end, capa);

	CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY);
	NSTRING(node).s    = p;
	NSTRING(node).capa = capa;
      }
    }
    else {
      k_strcpy(NSTRING(node).s + len, s, end);
    }
    /* s may have changed above, so recompute end from it */
    NSTRING(node).end = NSTRING(node).s + len + addlen;
  }

  return 0;
}
1425 
1426 static int
node_str_cat_char(Node * node,UChar c)1427 node_str_cat_char(Node* node, UChar c)
1428 {
1429   UChar s[1];
1430 
1431   s[0] = c;
1432   return onig_node_str_cat(node, s, s + 1);
1433 }
1434 
1435 extern void
onig_node_conv_to_str_node(Node * node,int flag)1436 onig_node_conv_to_str_node(Node* node, int flag)
1437 {
1438   node->type = N_STRING;
1439 
1440   NSTRING(node).flag = flag;
1441   NSTRING(node).capa = 0;
1442   NSTRING(node).s    = NSTRING(node).buf;
1443   NSTRING(node).end  = NSTRING(node).buf;
1444 }
1445 
1446 extern void
onig_node_str_clear(Node * node)1447 onig_node_str_clear(Node* node)
1448 {
1449   if (NSTRING(node).capa != 0 &&
1450       IS_NOT_NULL(NSTRING(node).s) && NSTRING(node).s != NSTRING(node).buf) {
1451     xfree(NSTRING(node).s);
1452   }
1453 
1454   NSTRING(node).capa = 0;
1455   NSTRING(node).flag = 0;
1456   NSTRING(node).s    = NSTRING(node).buf;
1457   NSTRING(node).end  = NSTRING(node).buf;
1458 }
1459 
1460 static Node*
node_new_str(const UChar * s,const UChar * end)1461 node_new_str(const UChar* s, const UChar* end)
1462 {
1463   Node* node = node_new();
1464   CHECK_NULL_RETURN(node);
1465 
1466   node->type = N_STRING;
1467   NSTRING(node).capa = 0;
1468   NSTRING(node).flag = 0;
1469   NSTRING(node).s    = NSTRING(node).buf;
1470   NSTRING(node).end  = NSTRING(node).buf;
1471   if (onig_node_str_cat(node, s, end)) {
1472     onig_node_free(node);
1473     return NULL;
1474   }
1475   return node;
1476 }
1477 
1478 extern Node*
onig_node_new_str(const UChar * s,const UChar * end)1479 onig_node_new_str(const UChar* s, const UChar* end)
1480 {
1481   return node_new_str(s, end);
1482 }
1483 
#ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
/* Create a string node for [s, end) and mark it raw (NSTRING_SET_RAW).
   Returns NULL when node_new_str() fails; the raw flag is only applied
   to a node that actually exists. */
static Node*
node_new_str_raw(UChar* s, UChar* end)
{
  Node* node = node_new_str(s, end);
  /* node_new_str() returns NULL on allocation failure; do not touch it */
  CHECK_NULL_RETURN(node);
  NSTRING_SET_RAW(node);
  return node;
}
#endif
1493 
1494 static Node*
node_new_empty(void)1495 node_new_empty(void)
1496 {
1497   return node_new_str(NULL, NULL);
1498 }
1499 
1500 static Node*
node_new_str_char(UChar c)1501 node_new_str_char(UChar c)
1502 {
1503   UChar p[1];
1504 
1505   p[0] = c;
1506   return node_new_str(p, p + 1);
1507 }
1508 
1509 static Node*
str_node_split_last_char(StrNode * sn,OnigEncoding enc)1510 str_node_split_last_char(StrNode* sn, OnigEncoding enc)
1511 {
1512   const UChar *p;
1513   Node* n = NULL_NODE;
1514 
1515   if (sn->end > sn->s) {
1516     p = onigenc_get_prev_char_head(enc, sn->s, sn->end);
1517     if (p && p > sn->s) { /* can be splitted. */
1518       n = node_new_str(p, sn->end);
1519       if ((sn->flag & NSTR_RAW) != 0)
1520 	NSTRING_SET_RAW(n);
1521       sn->end = (UChar* )p;
1522     }
1523   }
1524   return n;
1525 }
1526 
1527 static int
str_node_can_be_split(StrNode * sn,OnigEncoding enc)1528 str_node_can_be_split(StrNode* sn, OnigEncoding enc)
1529 {
1530   if (sn->end > sn->s) {
1531     return ((enc_len(enc, sn->s) < sn->end - sn->s)  ?  1 : 0);
1532   }
1533   return 0;
1534 }
1535 
#ifdef USE_PAD_TO_SHORT_BYTE_CHAR
/*
 * Insert num copies of val in front of the string held by sn.
 *
 * The current contents are staged in a local buffer, copied back num
 * bytes further in, and the first num bytes are then filled with val.
 *
 * Returns 0.  (The function is declared int but previously fell off the
 * end without returning a value.)
 *
 * NOTE(review): nothing here checks that the string fits in the local
 * NODE_STR_BUF_SIZE buffer or that sn->s has room for num extra bytes;
 * callers are assumed to guarantee both -- confirm.
 */
static int
node_str_head_pad(StrNode* sn, int num, UChar val)
{
  UChar buf[NODE_STR_BUF_SIZE];
  int i, len;

  len = sn->end - sn->s;
  onig_strcpy(buf, sn->s, sn->end);
  onig_strcpy(&(sn->s[num]), buf, buf + len);
  sn->end += num;

  for (i = 0; i < num; i++) {
    sn->s[i] = val;
  }
  return 0;
}
#endif
1553 
1554 extern int
onig_scan_unsigned_number(UChar ** src,const UChar * end,OnigEncoding enc)1555 onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc)
1556 {
1557   unsigned int num, val;
1558   OnigCodePoint c;
1559   UChar* p = *src;
1560   PFETCH_READY;
1561 
1562   num = 0;
1563   while (!PEND) {
1564     PFETCH(c);
1565     if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
1566       val = (unsigned int )DIGITVAL(c);
1567       if ((INT_MAX_LIMIT - val) / 10UL < num)
1568 	return -1;  /* overflow */
1569 
1570       num = num * 10 + val;
1571     }
1572     else {
1573       PUNFETCH;
1574       break;
1575     }
1576   }
1577   *src = p;
1578   return num;
1579 }
1580 
1581 static int
scan_unsigned_hexadecimal_number(UChar ** src,UChar * end,int maxlen,OnigEncoding enc)1582 scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int maxlen,
1583 				 OnigEncoding enc)
1584 {
1585   OnigCodePoint c;
1586   unsigned int num, val;
1587   UChar* p = *src;
1588   PFETCH_READY;
1589 
1590   num = 0;
1591   while (!PEND && maxlen-- != 0) {
1592     PFETCH(c);
1593     if (ONIGENC_IS_CODE_XDIGIT(enc, c)) {
1594       val = (unsigned int )XDIGITVAL(enc,c);
1595       if ((INT_MAX_LIMIT - val) / 16UL < num)
1596 	return -1;  /* overflow */
1597 
1598       num = (num << 4) + XDIGITVAL(enc,c);
1599     }
1600     else {
1601       PUNFETCH;
1602       break;
1603     }
1604   }
1605   *src = p;
1606   return num;
1607 }
1608 
1609 static int
scan_unsigned_octal_number(UChar ** src,UChar * end,int maxlen,OnigEncoding enc)1610 scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen,
1611 			   OnigEncoding enc)
1612 {
1613   OnigCodePoint c;
1614   unsigned int num, val;
1615   UChar* p = *src;
1616   PFETCH_READY;
1617 
1618   num = 0;
1619   while (!PEND && maxlen-- != 0) {
1620     PFETCH(c);
1621     if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8') {
1622       val = ODIGITVAL(c);
1623       if ((INT_MAX_LIMIT - val) / 8UL < num)
1624 	return -1;  /* overflow */
1625 
1626       num = (num << 3) + val;
1627     }
1628     else {
1629       PUNFETCH;
1630       break;
1631     }
1632   }
1633   *src = p;
1634   return num;
1635 }
1636 
1637 
/* Write one code point (SIZE_CODE_POINT bytes) into bbuf at byte offset
   pos.  code must be an lvalue because its address is taken. */
#define BBUF_WRITE_CODE_POINT(bbuf,pos,code) \
    BBUF_WRITE(bbuf, pos, &(code), SIZE_CODE_POINT)
1640 
1641 /* data format:
1642      [n][from-1][to-1][from-2][to-2] ... [from-n][to-n]
1643      (all data size is OnigCodePoint)
1644  */
1645 static int
new_code_range(BBuf ** pbuf)1646 new_code_range(BBuf** pbuf)
1647 {
1648 #define INIT_MULTI_BYTE_RANGE_SIZE  (SIZE_CODE_POINT * 5)
1649   int r;
1650   OnigCodePoint n;
1651   BBuf* bbuf;
1652 
1653   bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf));
1654   CHECK_NULL_RETURN_VAL(*pbuf, ONIGERR_MEMORY);
1655   r = BBUF_INIT(*pbuf, INIT_MULTI_BYTE_RANGE_SIZE);
1656   if (r) return r;
1657 
1658   n = 0;
1659   BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1660   return 0;
1661 }
1662 
/*
 * Insert the code point range [from, to] into the range buffer *pbuf,
 * merging it with every stored range it overlaps or touches.
 *
 * *pbuf is created on demand when it is NULL.  from and to are swapped
 * when given in the wrong order.  Two binary searches locate the stored
 * ranges affected by the new one; the tail of the buffer is then shifted
 * and the (possibly widened) range is written in place.
 *
 * Returns 0 on success, ONIGERR_TOO_MANY_MULTI_BYTE_RANGES when the range
 * count would exceed ONIG_MAX_MULTI_BYTE_RANGES_NUM, or the error from
 * new_code_range().  NOTE(review): the BBUF_* macros presumably return an
 * error from this function themselves on allocation failure -- confirm
 * against their definitions.
 */
static int
add_code_range_to_buf(BBuf** pbuf, OnigCodePoint from, OnigCodePoint to)
{
  int r, inc_n, pos;
  int low, high, bound, x;
  OnigCodePoint n, *data;
  BBuf* bbuf;

  if (from > to) {
    n = from; from = to; to = n;
  }

  if (IS_NULL(*pbuf)) {
    r = new_code_range(pbuf);
    if (r) return r;
    bbuf = *pbuf;
    n = 0;
  }
  else {
    bbuf = *pbuf;
    GET_CODE_POINT(n, bbuf->p);
  }
  /* skip the leading count; data[2*i] / data[2*i+1] are from / to */
  data = (OnigCodePoint* )(bbuf->p);
  data++;

  /* low: index of the first stored range whose end is >= from */
  for (low = 0, bound = n; low < bound; ) {
    x = (low + bound) >> 1;
    if (from > data[x*2 + 1])
      low = x + 1;
    else
      bound = x;
  }

  /* high: one past the last stored range that starts at or before
     to + 1, i.e. that overlaps or directly follows the new range */
  for (high = low, bound = n; high < bound; ) {
    x = (high + bound) >> 1;
    if (to >= data[x*2] - 1)
      high = x + 1;
    else
      bound = x;
  }

  /* net change in the number of stored ranges: +1 for a pure insert,
     0 when one range is replaced, negative when several are merged */
  inc_n = low + 1 - high;
  if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM)
    return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES;

  if (inc_n != 1) {
    /* merging: widen the new range to cover the ranges it absorbs */
    if (from > data[low*2])
      from = data[low*2];
    if (to < data[(high - 1)*2 + 1])
      to = data[(high - 1)*2 + 1];
  }

  if (inc_n != 0 && (OnigCodePoint )high < n) {
    /* move the ranges after the affected span to their new position */
    int from_pos = SIZE_CODE_POINT * (1 + high * 2);
    int to_pos   = SIZE_CODE_POINT * (1 + (low + 1) * 2);
    int size = (n - high) * 2 * SIZE_CODE_POINT;

    if (inc_n > 0) {
      BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size);
    }
    else {
      BBUF_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos);
    }
  }

  /* write the range at slot low, then the updated count at offset 0 */
  pos = SIZE_CODE_POINT * (1 + low * 2);
  BBUF_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2);
  BBUF_WRITE_CODE_POINT(bbuf, pos, from);
  BBUF_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to);
  n += inc_n;
  BBUF_WRITE_CODE_POINT(bbuf, 0, n);

  return 0;
}
1737 
1738 static int
add_code_range(BBuf ** pbuf,ScanEnv * env,OnigCodePoint from,OnigCodePoint to)1739 add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
1740 {
1741   if (from > to) {
1742     if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
1743       return 0;
1744     else
1745       return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
1746   }
1747 
1748   return add_code_range_to_buf(pbuf, from, to);
1749 }
1750 
/*
 * Build in *pbuf the complement of the range list held by bbuf.
 *
 * *pbuf is reset to NULL first.  A NULL bbuf, or one holding no ranges,
 * yields SET_ALL_MULTI_BYTE_RANGE().  Otherwise every gap between
 * MBCODE_START_POS(enc) and the largest OnigCodePoint value that is not
 * covered by a stored range is added to *pbuf.
 *
 * Returns 0 on success or the error from add_code_range_to_buf(); on
 * error *pbuf may hold a partially built buffer.
 */
static int
not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf)
{
  int r, i, n;
  OnigCodePoint pre, from, *data, to = 0;

  *pbuf = (BBuf* )NULL;
  if (IS_NULL(bbuf)) {
  set_all:
    return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
  }

  data = (OnigCodePoint* )(bbuf->p);
  GET_CODE_POINT(n, data);
  data++;
  if (n <= 0) goto set_all;

  r = 0;
  /* pre is the first code point not yet accounted for */
  pre = MBCODE_START_POS(enc);
  for (i = 0; i < n; i++) {
    from = data[i*2];
    to   = data[i*2+1];
    if (pre <= from - 1) {
      /* gap before this range */
      r = add_code_range_to_buf(pbuf, pre, from - 1);
      if (r != 0) return r;
    }
    /* range reaches the maximum value: to + 1 would wrap around */
    if (to == ~((OnigCodePoint )0)) break;
    pre = to + 1;
  }
  if (to < ~((OnigCodePoint )0)) {
    /* gap after the last range */
    r = add_code_range_to_buf(pbuf, to + 1, ~((OnigCodePoint )0));
  }
  return r;
}
1785 
/* Exchange two (buffer, negation flag) pairs.  All four arguments must
   be assignable lvalues; each is evaluated more than once. */
#define SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2) do {\
  BBuf *tbuf; \
  int  tnot; \
  tnot = not1;  not1  = not2;  not2  = tnot; \
  tbuf = bbuf1; bbuf1 = bbuf2; bbuf2 = tbuf; \
} while (0)
1792 
/*
 * Build in *pbuf the union of two range lists, each optionally negated
 * (not1 / not2 non-zero).
 *
 * *pbuf is reset to NULL first; bbuf1 and bbuf2 are only read.  A NULL
 * buffer stands for an empty range list.
 *
 * Returns 0 on success or the error from the helper that failed; on error
 * *pbuf may hold a partially built buffer.
 */
static int
or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1,
                  BBuf* bbuf2, int not2, BBuf** pbuf)
{
  int r;
  OnigCodePoint i, n1, *data1;
  OnigCodePoint from, to;

  *pbuf = (BBuf* )NULL;
  if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) {
    /* both empty: a negated empty set is everything, else nothing */
    if (not1 != 0 || not2 != 0)
      return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
    return 0;
  }

  r = 0;
  /* arrange for a NULL buffer, if there is one, to be bbuf1 */
  if (IS_NULL(bbuf2))
    SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);

  if (IS_NULL(bbuf1)) {
    if (not1 != 0) {
      return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
    }
    else {
      /* empty OR x == x */
      if (not2 == 0) {
	return bbuf_clone(pbuf, bbuf2);
      }
      else {
	return not_code_range_buf(enc, bbuf2, pbuf);
      }
    }
  }

  /* arrange for a negated operand, if there is one, to be bbuf2 */
  if (not1 != 0)
    SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);

  data1 = (OnigCodePoint* )(bbuf1->p);
  GET_CODE_POINT(n1, data1);
  data1++;

  /* start from operand 2 (or its complement) ... */
  if (not2 == 0 && not1 == 0) { /* 1 OR 2 */
    r = bbuf_clone(pbuf, bbuf2);
  }
  else if (not1 == 0) { /* 1 OR (not 2) */
    r = not_code_range_buf(enc, bbuf2, pbuf);
  }
  if (r != 0) return r;

  /* ... then add every range of operand 1 */
  for (i = 0; i < n1; i++) {
    from = data1[i*2];
    to   = data1[i*2+1];
    r = add_code_range_to_buf(pbuf, from, to);
    if (r != 0) return r;
  }
  return 0;
}
1849 
/*
 * Helper for "1 AND (not 2)": add to *pbuf the parts of [from1, to1]
 * left over after taking the n ranges in data into account.
 *
 * data holds n (from, to) pairs.  Returns 0 on success or the error from
 * add_code_range_to_buf().
 *
 * NOTE(review): when a range in data starts beyond to1, the final else
 * branch sets from1 = from2, which makes from1 > to1; the loop then
 * breaks and the remaining [from1, to1] piece is not added.  Confirm
 * whether that is intended.
 */
static int
and_code_range1(BBuf** pbuf, OnigCodePoint from1, OnigCodePoint to1,
	        OnigCodePoint* data, int n)
{
  int i, r;
  OnigCodePoint from2, to2;

  for (i = 0; i < n; i++) {
    from2 = data[i*2];
    to2   = data[i*2+1];
    if (from2 < from1) {
      /* range 2 starts before the remaining piece */
      if (to2 < from1) continue;
      else {
	/* it overlaps the head: cut the head off */
	from1 = to2 + 1;
      }
    }
    else if (from2 <= to1) {
      /* range 2 starts inside the remaining piece */
      if (to2 < to1) {
	/* emit the part in front of range 2, continue after it */
	if (from1 <= from2 - 1) {
	  r = add_code_range_to_buf(pbuf, from1, from2-1);
	  if (r != 0) return r;
	}
	from1 = to2 + 1;
      }
      else {
	/* range 2 covers the tail: cut the tail off */
	to1 = from2 - 1;
      }
    }
    else {
      from1 = from2;
    }
    if (from1 > to1) break;
  }
  if (from1 <= to1) {
    r = add_code_range_to_buf(pbuf, from1, to1);
    if (r != 0) return r;
  }
  return 0;
}
1889 
/*
 * Build in *pbuf the intersection of two range lists, each optionally
 * negated (not1 / not2 non-zero).
 *
 * *pbuf is reset to NULL first; bbuf1 and bbuf2 are only read.  A NULL
 * buffer stands for an empty range list.  The case where both operands
 * are negated is not handled here: when both buffers are non-NULL it
 * leaves *pbuf NULL (the callers in this file route that case to
 * or_code_range_buf instead).
 *
 * Returns 0 on success or the error from the helper that failed; on error
 * *pbuf may hold a partially built buffer.
 */
static int
and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf)
{
  int r;
  OnigCodePoint i, j, n1, n2, *data1, *data2;
  OnigCodePoint from, to, from1, to1, from2, to2;

  *pbuf = (BBuf* )NULL;
  if (IS_NULL(bbuf1)) {
    /* (not empty) AND 2 == 2 ; empty AND anything == empty */
    if (not1 != 0 && IS_NOT_NULL(bbuf2)) /* not1 != 0 -> not2 == 0 */
      return bbuf_clone(pbuf, bbuf2);
    return 0;
  }
  else if (IS_NULL(bbuf2)) {
    /* 1 AND (not empty) == 1 */
    if (not2 != 0)
      return bbuf_clone(pbuf, bbuf1);
    return 0;
  }

  /* arrange for a negated operand, if there is one, to be bbuf2 */
  if (not1 != 0)
    SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);

  data1 = (OnigCodePoint* )(bbuf1->p);
  data2 = (OnigCodePoint* )(bbuf2->p);
  GET_CODE_POINT(n1, data1);
  GET_CODE_POINT(n2, data2);
  data1++;
  data2++;

  if (not2 == 0 && not1 == 0) { /* 1 AND 2 */
    /* add the overlap of every pair of ranges */
    for (i = 0; i < n1; i++) {
      from1 = data1[i*2];
      to1   = data1[i*2+1];
      for (j = 0; j < n2; j++) {
	from2 = data2[j*2];
	to2   = data2[j*2+1];
	if (from2 > to1) break;
	if (to2 < from1) continue;
	from = MAX(from1, from2);
	to   = MIN(to1, to2);
	r = add_code_range_to_buf(pbuf, from, to);
	if (r != 0) return r;
      }
    }
  }
  else if (not1 == 0) { /* 1 AND (not 2) */
    for (i = 0; i < n1; i++) {
      from1 = data1[i*2];
      to1   = data1[i*2+1];
      r = and_code_range1(pbuf, from1, to1, data2, n2);
      if (r != 0) return r;
    }
  }

  return 0;
}
1946 
1947 static int
and_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)1948 and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
1949 {
1950   int r, not1, not2;
1951   BBuf *buf1, *buf2, *pbuf;
1952   BitSetRef bsr1, bsr2;
1953   BitSet bs1, bs2;
1954 
1955   not1 = IS_CCLASS_NOT(dest);
1956   bsr1 = dest->bs;
1957   buf1 = dest->mbuf;
1958   not2 = IS_CCLASS_NOT(cc);
1959   bsr2 = cc->bs;
1960   buf2 = cc->mbuf;
1961 
1962   if (not1 != 0) {
1963     bitset_invert_to(bsr1, bs1);
1964     bsr1 = bs1;
1965   }
1966   if (not2 != 0) {
1967     bitset_invert_to(bsr2, bs2);
1968     bsr2 = bs2;
1969   }
1970   bitset_and(bsr1, bsr2);
1971   if (bsr1 != dest->bs) {
1972     bitset_copy(dest->bs, bsr1);
1973     bsr1 = dest->bs;
1974   }
1975   if (not1 != 0) {
1976     bitset_invert(dest->bs);
1977   }
1978 
1979   if (! ONIGENC_IS_SINGLEBYTE(enc)) {
1980     if (not1 != 0 && not2 != 0) {
1981       r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf);
1982     }
1983     else {
1984       r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf);
1985       if (r == 0 && not1 != 0) {
1986 	BBuf *tbuf;
1987 	r = not_code_range_buf(enc, pbuf, &tbuf);
1988 	if (r != 0) {
1989 	  bbuf_free(pbuf);
1990 	  return r;
1991 	}
1992 	bbuf_free(pbuf);
1993 	pbuf = tbuf;
1994       }
1995     }
1996     if (r != 0) return r;
1997 
1998     dest->mbuf = pbuf;
1999     bbuf_free(buf1);
2000     return r;
2001   }
2002   return 0;
2003 }
2004 
2005 static int
or_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)2006 or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
2007 {
2008   int r, not1, not2;
2009   BBuf *buf1, *buf2, *pbuf;
2010   BitSetRef bsr1, bsr2;
2011   BitSet bs1, bs2;
2012 
2013   not1 = IS_CCLASS_NOT(dest);
2014   bsr1 = dest->bs;
2015   buf1 = dest->mbuf;
2016   not2 = IS_CCLASS_NOT(cc);
2017   bsr2 = cc->bs;
2018   buf2 = cc->mbuf;
2019 
2020   if (not1 != 0) {
2021     bitset_invert_to(bsr1, bs1);
2022     bsr1 = bs1;
2023   }
2024   if (not2 != 0) {
2025     bitset_invert_to(bsr2, bs2);
2026     bsr2 = bs2;
2027   }
2028   bitset_or(bsr1, bsr2);
2029   if (bsr1 != dest->bs) {
2030     bitset_copy(dest->bs, bsr1);
2031     bsr1 = dest->bs;
2032   }
2033   if (not1 != 0) {
2034     bitset_invert(dest->bs);
2035   }
2036 
2037   if (! ONIGENC_IS_SINGLEBYTE(enc)) {
2038     if (not1 != 0 && not2 != 0) {
2039       r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf);
2040     }
2041     else {
2042       r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf);
2043       if (r == 0 && not1 != 0) {
2044 	BBuf *tbuf;
2045 	r = not_code_range_buf(enc, pbuf, &tbuf);
2046 	if (r != 0) {
2047 	  bbuf_free(pbuf);
2048 	  return r;
2049 	}
2050 	bbuf_free(pbuf);
2051 	pbuf = tbuf;
2052       }
2053     }
2054     if (r != 0) return r;
2055 
2056     dest->mbuf = pbuf;
2057     bbuf_free(buf1);
2058     return r;
2059   }
2060   else
2061     return 0;
2062 }
2063 
2064 static int
conv_backslash_value(int c,ScanEnv * env)2065 conv_backslash_value(int c, ScanEnv* env)
2066 {
2067   if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) {
2068     switch (c) {
2069     case 'n':  return '\n';
2070     case 't':  return '\t';
2071     case 'r':  return '\r';
2072     case 'f':  return '\f';
2073     case 'a':  return '\007';
2074     case 'b':  return '\010';
2075     case 'e':  return '\033';
2076     case 'v':
2077       if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB))
2078 	return '\v';
2079       break;
2080 
2081     default:
2082       break;
2083     }
2084   }
2085   return c;
2086 }
2087 
/*
 * Report whether node is something a quantifier must not be applied to.
 *
 * Returns 1 for an anchor, for an option effect whose target is invalid,
 * and for an alternation with at least one invalid branch; returns 0
 * otherwise.
 *
 * NOTE(review): the N_LIST case returns 0 both when a valid element is
 * found and when the loop runs to completion, so a list made only of
 * anchors (the "(?:\G\A)*" example) is still reported as valid.  Confirm
 * whether that is intended.
 */
static int
is_invalid_quantifier_target(Node* node)
{
  switch (NTYPE(node)) {
  case N_ANCHOR:
    return 1;
    break;

  case N_EFFECT:
    /* only option effects are looked through */
    if (NEFFECT(node).type == EFFECT_OPTION)
      return is_invalid_quantifier_target(NEFFECT(node).target);
    break;

  case N_LIST: /* ex. (?:\G\A)* */
    do {
      if (! is_invalid_quantifier_target(NCONS(node).left)) return 0;
    } while (IS_NOT_NULL(node = NCONS(node).right));
    return 0;
    break;

  case N_ALT:  /* ex. (?:abc|\A)* */
    do {
      if (is_invalid_quantifier_target(NCONS(node).left)) return 1;
    } while (IS_NOT_NULL(node = NCONS(node).right));
    break;

  default:
    break;
  }
  return 0;
}
2119 
2120 /* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */
2121 static int
popular_quantifier_num(QuantifierNode * qf)2122 popular_quantifier_num(QuantifierNode* qf)
2123 {
2124   if (qf->greedy) {
2125     if (qf->lower == 0) {
2126       if (qf->upper == 1) return 0;
2127       else if (IS_REPEAT_INFINITE(qf->upper)) return 1;
2128     }
2129     else if (qf->lower == 1) {
2130       if (IS_REPEAT_INFINITE(qf->upper)) return 2;
2131     }
2132   }
2133   else {
2134     if (qf->lower == 0) {
2135       if (qf->upper == 1) return 3;
2136       else if (IS_REPEAT_INFINITE(qf->upper)) return 4;
2137     }
2138     else if (qf->lower == 1) {
2139       if (IS_REPEAT_INFINITE(qf->upper)) return 5;
2140     }
2141   }
2142   return -1;
2143 }
2144 
2145 
/* Actions onig_reduce_nested_quantifier() can take when one quantifier
   is applied directly to another; selected through ReduceTypeTable. */
enum ReduceType {
  RQ_ASIS = 0, /* as is */
  RQ_DEL  = 1, /* delete parent */
  RQ_A,        /* to '*'    */
  RQ_AQ,       /* to '*?'   */
  RQ_QQ,       /* to '??'   */
  RQ_P_QQ,     /* to '+)??' */
  RQ_PQ_Q      /* to '+?)?' */
};
2155 
/* Reduction to apply for a pair of nested quantifiers.  Indexed as
   [child][parent] (see onig_reduce_nested_quantifier), both indices being
   values returned by popular_quantifier_num(); the trailing comment on
   each row names the child form. */
static enum ReduceType ReduceTypeTable[6][6] = {
  {RQ_DEL,  RQ_A,    RQ_A,   RQ_QQ,   RQ_AQ,   RQ_ASIS}, /* '?'  */
  {RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL},  /* '*'  */
  {RQ_A,    RQ_A,    RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL},  /* '+'  */
  {RQ_DEL,  RQ_AQ,   RQ_AQ,  RQ_DEL,  RQ_AQ,   RQ_AQ},   /* '??' */
  {RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_DEL,  RQ_DEL,  RQ_DEL},  /* '*?' */
  {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ,   RQ_AQ,   RQ_DEL}   /* '+?' */
};
2164 
2165 extern void
onig_reduce_nested_quantifier(Node * pnode,Node * cnode)2166 onig_reduce_nested_quantifier(Node* pnode, Node* cnode)
2167 {
2168   int pnum, cnum;
2169   QuantifierNode *p, *c;
2170 
2171   p = &(NQUANTIFIER(pnode));
2172   c = &(NQUANTIFIER(cnode));
2173   pnum = popular_quantifier_num(p);
2174   cnum = popular_quantifier_num(c);
2175 
2176   switch(ReduceTypeTable[cnum][pnum]) {
2177   case RQ_DEL:
2178     *p = *c;
2179     break;
2180   case RQ_A:
2181     p->target = c->target;
2182     p->lower  = 0;  p->upper = REPEAT_INFINITE;  p->greedy = 1;
2183     break;
2184   case RQ_AQ:
2185     p->target = c->target;
2186     p->lower  = 0;  p->upper = REPEAT_INFINITE;  p->greedy = 0;
2187     break;
2188   case RQ_QQ:
2189     p->target = c->target;
2190     p->lower  = 0;  p->upper = 1;  p->greedy = 0;
2191     break;
2192   case RQ_P_QQ:
2193     p->target = cnode;
2194     p->lower  = 0;  p->upper = 1;  p->greedy = 0;
2195     c->lower  = 1;  c->upper = REPEAT_INFINITE;  c->greedy = 1;
2196     return ;
2197     break;
2198   case RQ_PQ_Q:
2199     p->target = cnode;
2200     p->lower  = 0;  p->upper = 1;  p->greedy = 1;
2201     c->lower  = 1;  c->upper = REPEAT_INFINITE;  c->greedy = 0;
2202     return ;
2203     break;
2204   case RQ_ASIS:
2205     p->target = cnode;
2206     return ;
2207     break;
2208   }
2209 
2210   c->target = NULL_NODE;
2211   onig_node_free(cnode);
2212 }
2213 
2214 
/* Kinds of token stored in OnigToken.type.  The entries after the
   "in cc" marker are grouped separately for use inside a character
   class. */
enum TokenSyms {
  TK_EOT      = 0,   /* end of token */
  TK_RAW_BYTE = 1,
  TK_CHAR,
  TK_STRING,
  TK_CODE_POINT,
  TK_ANYCHAR,
  TK_CHAR_TYPE,
  TK_BACKREF,
  TK_CALL,
  TK_ANCHOR,
  TK_OP_REPEAT,
  TK_INTERVAL,
  TK_ANYCHAR_ANYTIME,  /* SQL '%' == .* */
  TK_ALT,
  TK_SUBEXP_OPEN,
  TK_SUBEXP_CLOSE,
  TK_CC_OPEN,
  TK_QUOTE_OPEN,
  TK_CHAR_PROPERTY,    /* \p{...}, \P{...} */
  /* in cc */
  TK_CC_CLOSE,
  TK_CC_RANGE,
  TK_POSIX_BRACKET_OPEN,
  TK_CC_AND,             /* && */
  TK_CC_CC_OPEN          /* [ */
};
2242 
/* One token of the pattern.  type selects the kind; the member of u
   that is meaningful presumably depends on type (e.g. u.repeat is filled
   in for TK_INTERVAL). */
typedef struct {
  enum TokenSyms type;
  int escaped;
  int base;   /* is number: 8, 16 (used in [....]) */
  UChar* backp;
  union {
    UChar* s;
    int   c;
    OnigCodePoint code;
    int   anchor;
    int   subtype;
    /* repeat range of a quantifier token */
    struct {
      int lower;
      int upper;
      int greedy;
      int possessive;
    } repeat;
    /* back reference data */
    struct {
      int  num;
      int  ref1;
      int* refs;
      int  by_name;
#ifdef USE_BACKREF_AT_LEVEL
      int  exist_level;
      int  level;   /* \k<name+n> */
#endif
    } backref;
    /* name span [name, name_end) of a subexpression call */
    struct {
      UChar* name;
      UChar* name_end;
    } call;
    /* character property token: non-zero not presumably marks the
       negated form -- confirm against the tokenizer */
    struct {
      int not;
    } prop;
  } u;
} OnigToken;
2279 
2280 
2281 static int
fetch_range_quantifier(UChar ** src,UChar * end,OnigToken * tok,ScanEnv * env)2282 fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env)
2283 {
2284   int low, up, syn_allow, non_low = 0;
2285   int r = 0;
2286   OnigCodePoint c;
2287   OnigEncoding enc = env->enc;
2288   UChar* p = *src;
2289   PFETCH_READY;
2290 
2291   syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL);
2292 
2293   if (PEND) {
2294     if (syn_allow)
2295       return 1;  /* "....{" : OK! */
2296     else
2297       return ONIGERR_END_PATTERN_AT_LEFT_BRACE;  /* "....{" syntax error */
2298   }
2299 
2300   if (! syn_allow) {
2301     c = PPEEK;
2302     if (c == ')' || c == '(' || c == '|') {
2303       return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
2304     }
2305   }
2306 
2307   low = onig_scan_unsigned_number(&p, end, env->enc);
2308   if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2309   if (low > ONIG_MAX_REPEAT_NUM)
2310     return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2311 
2312   if (p == *src) { /* can't read low */
2313     if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) {
2314       /* allow {,n} as {0,n} */
2315       low = 0;
2316       non_low = 1;
2317     }
2318     else
2319       goto invalid;
2320   }
2321 
2322   if (PEND) goto invalid;
2323   PFETCH(c);
2324   if (c == ',') {
2325     UChar* prev = p;
2326     up = onig_scan_unsigned_number(&p, end, env->enc);
2327     if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2328     if (up > ONIG_MAX_REPEAT_NUM)
2329       return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2330 
2331     if (p == prev) {
2332       if (non_low != 0)
2333 	goto invalid;
2334       up = REPEAT_INFINITE;  /* {n,} : {n,infinite} */
2335     }
2336   }
2337   else {
2338     if (non_low != 0)
2339       goto invalid;
2340 
2341     PUNFETCH;
2342     up = low;  /* {n} : exact n times */
2343     r = 2;     /* fixed */
2344   }
2345 
2346   if (PEND) goto invalid;
2347   PFETCH(c);
2348   if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) {
2349     if (c != MC_ESC(enc)) goto invalid;
2350     PFETCH(c);
2351   }
2352   if (c != '}') goto invalid;
2353 
2354   if (!IS_REPEAT_INFINITE(up) && low > up) {
2355     return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE;
2356   }
2357 
2358   tok->type = TK_INTERVAL;
2359   tok->u.repeat.lower = low;
2360   tok->u.repeat.upper = up;
2361   *src = p;
2362   return r; /* 0: normal {n,m}, 2: fixed {n} */
2363 
2364  invalid:
2365   if (syn_allow)
2366     return 1;  /* OK */
2367   else
2368     return ONIGERR_INVALID_REPEAT_RANGE_PATTERN;
2369 }
2370 
2371 /* \M-, \C-, \c, or \... */
2372 static int
fetch_escaped_value(UChar ** src,UChar * end,ScanEnv * env)2373 fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env)
2374 {
2375   int v;
2376   OnigCodePoint c;
2377   OnigEncoding enc = env->enc;
2378   UChar* p = *src;
2379   PFETCH_READY;
2380 
2381   if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2382 
2383   PFETCH(c);
2384   switch (c) {
2385   case 'M':
2386     if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) {
2387       if (PEND) return ONIGERR_END_PATTERN_AT_META;
2388       PFETCH(c);
2389       if (c != '-') return ONIGERR_META_CODE_SYNTAX;
2390       if (PEND) return ONIGERR_END_PATTERN_AT_META;
2391       PFETCH(c);
2392       if (c == MC_ESC(enc)) {
2393 	v = fetch_escaped_value(&p, end, env);
2394 	if (v < 0) return v;
2395         c = (OnigCodePoint )v;
2396       }
2397       c = ((c & 0xff) | 0x80);
2398     }
2399     else
2400       goto backslash;
2401     break;
2402 
2403   case 'C':
2404     if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) {
2405       if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
2406       PFETCH(c);
2407       if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX;
2408       goto control;
2409     }
2410     else
2411       goto backslash;
2412 
2413   case 'c':
2414     if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) {
2415     control:
2416       if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
2417       PFETCH(c);
2418       if (c == '?') {
2419 	c = 0177;
2420       }
2421       else {
2422         if (c == MC_ESC(enc)) {
2423           v = fetch_escaped_value(&p, end, env);
2424           if (v < 0) return v;
2425           c = (OnigCodePoint )v;
2426         }
2427 	c &= 0x9f;
2428       }
2429       break;
2430     }
2431     /* fall through */
2432 
2433   default:
2434     {
2435     backslash:
2436       c = conv_backslash_value(c, env);
2437     }
2438     break;
2439   }
2440 
2441   *src = p;
2442   return c;
2443 }
2444 
2445 static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env);
2446 
2447 #ifdef USE_NAMED_GROUP
2448 #ifdef USE_BACKREF_AT_LEVEL
2449 /*
2450    \k<name+n>, \k<name-n>
2451 */
2452 static int
fetch_name_with_level(UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * level)2453 fetch_name_with_level(UChar** src, UChar* end, UChar** rname_end
2454 		      , ScanEnv* env, int* level)
2455 {
2456   int r, exist_level = 0;
2457   OnigCodePoint c = 0;
2458   OnigCodePoint first_code;
2459   OnigEncoding enc = env->enc;
2460   UChar *name_end;
2461   UChar *p = *src;
2462   PFETCH_READY;
2463 
2464   name_end = end;
2465   r = 0;
2466   if (PEND) {
2467     return ONIGERR_EMPTY_GROUP_NAME;
2468   }
2469   else {
2470     PFETCH(c);
2471     first_code = c;
2472     if (c == '>')
2473       return ONIGERR_EMPTY_GROUP_NAME;
2474 
2475     if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2476       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2477     }
2478   }
2479 
2480   while (!PEND) {
2481     name_end = p;
2482     PFETCH(c);
2483     if (c == '>' || c == ')' || c == '+' || c == '-') break;
2484 
2485     if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2486       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2487     }
2488   }
2489 
2490   if (c != '>') {
2491     if (c == '+' || c == '-') {
2492       int num;
2493       int flag = (c == '-' ? -1 : 1);
2494 
2495       PFETCH(c);
2496       if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto err;
2497       PUNFETCH;
2498       num = onig_scan_unsigned_number(&p, end, enc);
2499       if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
2500       *level = (num * flag);
2501       exist_level = 1;
2502 
2503       PFETCH(c);
2504       if (c == '>')
2505 	goto first_check;
2506     }
2507 
2508   err:
2509     r = ONIGERR_INVALID_GROUP_NAME;
2510     name_end = end;
2511   }
2512   else {
2513   first_check:
2514     if (ONIGENC_IS_CODE_ASCII(first_code) &&
2515         ONIGENC_IS_CODE_UPPER(enc, first_code))
2516       r = ONIGERR_INVALID_GROUP_NAME;
2517   }
2518 
2519   if (r == 0) {
2520     *rname_end = name_end;
2521     *src = p;
2522     return (exist_level ? 1 : 0);
2523   }
2524   else {
2525     onig_scan_env_set_error_string(env, r, *src, name_end);
2526     return r;
2527   }
2528 }
2529 #endif /* USE_BACKREF_AT_LEVEL */
2530 
2531 /*
2532   def: 0 -> define name    (don't allow number name)
2533        1 -> reference name (allow number name)
2534 */
2535 static int
fetch_name(UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int ref)2536 fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref)
2537 {
2538   int r, is_num;
2539   OnigCodePoint c = 0;
2540   OnigCodePoint first_code;
2541   OnigEncoding enc = env->enc;
2542   UChar *name_end;
2543   UChar *p = *src;
2544   PFETCH_READY;
2545 
2546   name_end = end;
2547   r = 0;
2548   is_num = 0;
2549   if (PEND) {
2550     return ONIGERR_EMPTY_GROUP_NAME;
2551   }
2552   else {
2553     PFETCH(c);
2554     first_code = c;
2555     if (c == '>')
2556       return ONIGERR_EMPTY_GROUP_NAME;
2557 
2558     if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2559       if (ref == 1)
2560 	is_num = 1;
2561       else {
2562 	r = ONIGERR_INVALID_GROUP_NAME;
2563       }
2564     }
2565     else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2566       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2567     }
2568   }
2569 
2570   while (!PEND) {
2571     name_end = p;
2572     PFETCH(c);
2573     if (c == '>' || c == ')') break;
2574 
2575     if (is_num == 1) {
2576       if (! ONIGENC_IS_CODE_DIGIT(enc, c)) {
2577 	if (!ONIGENC_IS_CODE_WORD(enc, c))
2578 	  r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2579 	else
2580 	  r = ONIGERR_INVALID_GROUP_NAME;
2581       }
2582     }
2583     else {
2584       if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2585         r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2586       }
2587     }
2588   }
2589 
2590   if (c != '>') {
2591     r = ONIGERR_INVALID_GROUP_NAME;
2592     name_end = end;
2593   }
2594   else {
2595     if (ONIGENC_IS_CODE_ASCII(first_code) &&
2596         ONIGENC_IS_CODE_UPPER(enc, first_code))
2597       r = ONIGERR_INVALID_GROUP_NAME;
2598   }
2599 
2600   if (r == 0) {
2601     *rname_end = name_end;
2602     *src = p;
2603     return 0;
2604   }
2605   else {
2606     onig_scan_env_set_error_string(env, r, *src, name_end);
2607     return r;
2608   }
2609 }
2610 #else
2611 static int
fetch_name(UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int ref)2612 fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref)
2613 {
2614   int r, len;
2615   OnigCodePoint c = 0;
2616   UChar *name_end;
2617   OnigEncoding enc = env->enc;
2618   UChar *p = *src;
2619   PFETCH_READY;
2620 
2621   r = 0;
2622   while (!PEND) {
2623     name_end = p;
2624     if (enc_len(enc, p) > 1)
2625       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2626 
2627     PFETCH(c);
2628     if (c == '>' || c == ')') break;
2629     if (! ONIGENC_IS_CODE_DIGIT(enc, c))
2630       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2631   }
2632   if (c != '>') {
2633     r = ONIGERR_INVALID_GROUP_NAME;
2634     name_end = end;
2635   }
2636 
2637   if (r == 0) {
2638     *rname_end = name_end;
2639     *src = p;
2640     return 0;
2641   }
2642   else {
2643   err:
2644     onig_scan_env_set_error_string(env, r, *src, name_end);
2645     return r;
2646   }
2647 }
2648 #endif
2649 
2650 static void
CC_ESC_WARN(ScanEnv * env,UChar * c)2651 CC_ESC_WARN(ScanEnv* env, UChar *c)
2652 {
2653   if (onig_warn == onig_null_warn) return ;
2654 
2655   if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) &&
2656       IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) {
2657     UChar buf[WARN_BUFSIZE];
2658     onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
2659 		env->pattern, env->pattern_end,
2660                 (UChar* )"character class has '%s' without escape", c);
2661     (*onig_warn)((char* )buf);
2662   }
2663 }
2664 
2665 static void
CCEND_ESC_WARN(ScanEnv * env,UChar * c)2666 CCEND_ESC_WARN(ScanEnv* env, UChar* c)
2667 {
2668   if (onig_warn == onig_null_warn) return ;
2669 
2670   if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) {
2671     UChar buf[WARN_BUFSIZE];
2672     onig_snprintf_with_pattern(buf, WARN_BUFSIZE, (env)->enc,
2673 		(env)->pattern, (env)->pattern_end,
2674 		(UChar* )"regular expression has '%s' without escape", c);
2675     (*onig_warn)((char* )buf);
2676   }
2677 }
2678 
2679 static UChar*
find_str_position(OnigCodePoint s[],int n,UChar * from,UChar * to,UChar ** next,OnigEncoding enc)2680 find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to,
2681 		  UChar **next, OnigEncoding enc)
2682 {
2683   int i;
2684   OnigCodePoint x;
2685   UChar *q;
2686   UChar *p = from;
2687 
2688   while (p < to) {
2689     x = ONIGENC_MBC_TO_CODE(enc, p, to);
2690     q = p + enc_len(enc, p);
2691     if (x == s[0]) {
2692       for (i = 1; i < n && q < to; i++) {
2693 	x = ONIGENC_MBC_TO_CODE(enc, q, to);
2694 	if (x != s[i]) break;
2695 	q += enc_len(enc, q);
2696       }
2697       if (i >= n) {
2698 	if (IS_NOT_NULL(next))
2699 	  *next = q;
2700 	return p;
2701       }
2702     }
2703     p = q;
2704   }
2705   return NULL_UCHARP;
2706 }
2707 
2708 static int
str_exist_check_with_esc(OnigCodePoint s[],int n,UChar * from,UChar * to,OnigCodePoint bad,OnigEncoding enc)2709 str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to,
2710 			 OnigCodePoint bad, OnigEncoding enc)
2711 {
2712   int i, in_esc;
2713   OnigCodePoint x;
2714   UChar *q;
2715   UChar *p = from;
2716 
2717   in_esc = 0;
2718   while (p < to) {
2719     if (in_esc) {
2720       in_esc = 0;
2721       p += enc_len(enc, p);
2722     }
2723     else {
2724       x = ONIGENC_MBC_TO_CODE(enc, p, to);
2725       q = p + enc_len(enc, p);
2726       if (x == s[0]) {
2727 	for (i = 1; i < n && q < to; i++) {
2728 	  x = ONIGENC_MBC_TO_CODE(enc, q, to);
2729 	  if (x != s[i]) break;
2730 	  q += enc_len(enc, q);
2731 	}
2732 	if (i >= n) return 1;
2733 	p += enc_len(enc, p);
2734       }
2735       else {
2736 	x = ONIGENC_MBC_TO_CODE(enc, p, to);
2737 	if (x == bad) return 0;
2738 	else if (x == MC_ESC(enc)) in_esc = 1;
2739 	p = q;
2740       }
2741     }
2742   }
2743   return 0;
2744 }
2745 
2746 static int
fetch_token_in_cc(OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)2747 fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
2748 {
2749   int num;
2750   OnigCodePoint c, c2;
2751   OnigSyntaxType* syn = env->syntax;
2752   OnigEncoding enc = env->enc;
2753   UChar* prev;
2754   UChar* p = *src;
2755   PFETCH_READY;
2756 
2757   if (PEND) {
2758     tok->type = TK_EOT;
2759     return tok->type;
2760   }
2761 
2762   PFETCH(c);
2763   tok->type = TK_CHAR;
2764   tok->base = 0;
2765   tok->u.c  = c;
2766   tok->escaped = 0;
2767 
2768   if (c == ']') {
2769     tok->type = TK_CC_CLOSE;
2770   }
2771   else if (c == '-') {
2772     tok->type = TK_CC_RANGE;
2773   }
2774   else if (c == MC_ESC(enc)) {
2775     if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC))
2776       goto end;
2777 
2778     if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2779 
2780     PFETCH(c);
2781     tok->escaped = 1;
2782     tok->u.c = c;
2783     switch (c) {
2784     case 'w':
2785       tok->type = TK_CHAR_TYPE;
2786       tok->u.subtype = CTYPE_WORD;
2787       break;
2788     case 'W':
2789       tok->type = TK_CHAR_TYPE;
2790       tok->u.subtype = CTYPE_NOT_WORD;
2791       break;
2792     case 'd':
2793       tok->type = TK_CHAR_TYPE;
2794       tok->u.subtype = CTYPE_DIGIT;
2795       break;
2796     case 'D':
2797       tok->type = TK_CHAR_TYPE;
2798       tok->u.subtype = CTYPE_NOT_DIGIT;
2799       break;
2800     case 's':
2801       tok->type = TK_CHAR_TYPE;
2802       tok->u.subtype = CTYPE_WHITE_SPACE;
2803       break;
2804     case 'S':
2805       tok->type = TK_CHAR_TYPE;
2806       tok->u.subtype = CTYPE_NOT_WHITE_SPACE;
2807       break;
2808     case 'h':
2809       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
2810       tok->type = TK_CHAR_TYPE;
2811       tok->u.subtype = CTYPE_XDIGIT;
2812       break;
2813     case 'H':
2814       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
2815       tok->type = TK_CHAR_TYPE;
2816       tok->u.subtype = CTYPE_NOT_XDIGIT;
2817       break;
2818 
2819     case 'p':
2820     case 'P':
2821       c2 = PPEEK;
2822       if (c2 == '{' &&
2823 	  IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
2824 	PINC;
2825 	tok->type = TK_CHAR_PROPERTY;
2826 	tok->u.prop.not = (c == 'P' ? 1 : 0);
2827 
2828 	if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
2829 	  PFETCH(c2);
2830 	  if (c2 == '^') {
2831 	    tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
2832 	  }
2833 	  else
2834 	    PUNFETCH;
2835 	}
2836       }
2837       break;
2838 
2839     case 'x':
2840       if (PEND) break;
2841 
2842       prev = p;
2843       if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
2844 	PINC;
2845 	num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
2846 	if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
2847 	if (!PEND) {
2848           c2 = PPEEK;
2849           if (ONIGENC_IS_CODE_XDIGIT(enc, c2))
2850             return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
2851         }
2852 
2853 	if (p > prev + enc_len(enc, prev) && !PEND && (PPEEK_IS('}'))) {
2854 	  PINC;
2855 	  tok->type   = TK_CODE_POINT;
2856 	  tok->base   = 16;
2857 	  tok->u.code = (OnigCodePoint )num;
2858 	}
2859 	else {
2860 	  /* can't read nothing or invalid format */
2861 	  p = prev;
2862 	}
2863       }
2864       else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
2865 	num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
2866 	if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
2867 	if (p == prev) {  /* can't read nothing. */
2868 	  num = 0; /* but, it's not error */
2869 	}
2870 	tok->type = TK_RAW_BYTE;
2871 	tok->base = 16;
2872 	tok->u.c  = num;
2873       }
2874       break;
2875 
2876     case 'u':
2877       if (PEND) break;
2878 
2879       prev = p;
2880       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
2881 	num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
2882 	if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
2883 	if (p == prev) {  /* can't read nothing. */
2884 	  num = 0; /* but, it's not error */
2885 	}
2886 	tok->type   = TK_CODE_POINT;
2887 	tok->base   = 16;
2888 	tok->u.code = (OnigCodePoint )num;
2889       }
2890       break;
2891 
2892     case '0':
2893     case '1': case '2': case '3': case '4': case '5': case '6': case '7':
2894       if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
2895 	PUNFETCH;
2896 	prev = p;
2897 	num = scan_unsigned_octal_number(&p, end, 3, enc);
2898 	if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
2899 	if (p == prev) {  /* can't read nothing. */
2900 	  num = 0; /* but, it's not error */
2901 	}
2902 	tok->type = TK_RAW_BYTE;
2903 	tok->base = 8;
2904 	tok->u.c  = num;
2905       }
2906       break;
2907 
2908     default:
2909       PUNFETCH;
2910       num = fetch_escaped_value(&p, end, env);
2911       if (num < 0) return num;
2912       if (tok->u.c != num) {
2913 	tok->u.code = (OnigCodePoint )num;
2914 	tok->type   = TK_CODE_POINT;
2915       }
2916       break;
2917     }
2918   }
2919   else if (c == '[') {
2920     if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) {
2921       OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' };
2922       tok->backp = p; /* point at '[' is readed */
2923       PINC;
2924       if (str_exist_check_with_esc(send, 2, p, end,
2925                                    (OnigCodePoint )']', enc)) {
2926 	tok->type = TK_POSIX_BRACKET_OPEN;
2927       }
2928       else {
2929 	PUNFETCH;
2930 	goto cc_in_cc;
2931       }
2932     }
2933     else {
2934     cc_in_cc:
2935       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) {
2936 	tok->type = TK_CC_CC_OPEN;
2937       }
2938       else {
2939 	CC_ESC_WARN(env, (UChar* )"[");
2940       }
2941     }
2942   }
2943   else if (c == '&') {
2944     if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) &&
2945 	!PEND && (PPEEK_IS('&'))) {
2946       PINC;
2947       tok->type = TK_CC_AND;
2948     }
2949   }
2950 
2951  end:
2952   *src = p;
2953   return tok->type;
2954 }
2955 
2956 static int
fetch_token(OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)2957 fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
2958 {
2959   int r, num;
2960   OnigCodePoint c;
2961   OnigEncoding enc = env->enc;
2962   OnigSyntaxType* syn = env->syntax;
2963   UChar* prev;
2964   UChar* p = *src;
2965   PFETCH_READY;
2966 
2967  start:
2968   if (PEND) {
2969     tok->type = TK_EOT;
2970     return tok->type;
2971   }
2972 
2973   tok->type  = TK_STRING;
2974   tok->base  = 0;
2975   tok->backp = p;
2976 
2977   PFETCH(c);
2978   if (IS_MC_ESC_CODE(c, enc, syn)) {
2979     if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2980 
2981     tok->backp = p;
2982     PFETCH(c);
2983 
2984     tok->u.c = c;
2985     tok->escaped = 1;
2986     switch (c) {
2987     case '*':
2988       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break;
2989       tok->type = TK_OP_REPEAT;
2990       tok->u.repeat.lower = 0;
2991       tok->u.repeat.upper = REPEAT_INFINITE;
2992       goto greedy_check;
2993       break;
2994 
2995     case '+':
2996       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break;
2997       tok->type = TK_OP_REPEAT;
2998       tok->u.repeat.lower = 1;
2999       tok->u.repeat.upper = REPEAT_INFINITE;
3000       goto greedy_check;
3001       break;
3002 
3003     case '?':
3004       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break;
3005       tok->type = TK_OP_REPEAT;
3006       tok->u.repeat.lower = 0;
3007       tok->u.repeat.upper = 1;
3008     greedy_check:
3009       if (!PEND && PPEEK_IS('?') &&
3010 	  IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) {
3011 	PFETCH(c);
3012 	tok->u.repeat.greedy     = 0;
3013 	tok->u.repeat.possessive = 0;
3014       }
3015       else {
3016       possessive_check:
3017 	if (!PEND && PPEEK_IS('+') &&
3018 	    ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) &&
3019 	      tok->type != TK_INTERVAL)  ||
3020 	     (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) &&
3021 	      tok->type == TK_INTERVAL))) {
3022 	  PFETCH(c);
3023 	  tok->u.repeat.greedy     = 1;
3024 	  tok->u.repeat.possessive = 1;
3025 	}
3026 	else {
3027 	  tok->u.repeat.greedy     = 1;
3028 	  tok->u.repeat.possessive = 0;
3029 	}
3030       }
3031       break;
3032 
3033     case '{':
3034       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break;
3035       r = fetch_range_quantifier(&p, end, tok, env);
3036       if (r < 0) return r;  /* error */
3037       if (r == 0) goto greedy_check;
3038       else if (r == 2) { /* {n} */
3039 	if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3040 	  goto possessive_check;
3041 
3042 	goto greedy_check;
3043       }
3044       /* r == 1 : normal char */
3045       break;
3046 
3047     case '|':
3048       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break;
3049       tok->type = TK_ALT;
3050       break;
3051 
3052     case '(':
3053       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3054       tok->type = TK_SUBEXP_OPEN;
3055       break;
3056 
3057     case ')':
3058       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3059       tok->type = TK_SUBEXP_CLOSE;
3060       break;
3061 
3062     case 'w':
3063       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3064       tok->type = TK_CHAR_TYPE;
3065       tok->u.subtype = CTYPE_WORD;
3066       break;
3067 
3068     case 'W':
3069       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3070       tok->type = TK_CHAR_TYPE;
3071       tok->u.subtype = CTYPE_NOT_WORD;
3072       break;
3073 
3074     case 'b':
3075       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3076       tok->type = TK_ANCHOR;
3077       tok->u.anchor = ANCHOR_WORD_BOUND;
3078       break;
3079 
3080     case 'B':
3081       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3082       tok->type = TK_ANCHOR;
3083       tok->u.anchor = ANCHOR_NOT_WORD_BOUND;
3084       break;
3085 
3086 #ifdef USE_WORD_BEGIN_END
3087     case '<':
3088       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3089       tok->type = TK_ANCHOR;
3090       tok->u.anchor = ANCHOR_WORD_BEGIN;
3091       break;
3092 
3093     case '>':
3094       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3095       tok->type = TK_ANCHOR;
3096       tok->u.anchor = ANCHOR_WORD_END;
3097       break;
3098 #endif
3099 
3100     case 's':
3101       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3102       tok->type = TK_CHAR_TYPE;
3103       tok->u.subtype = CTYPE_WHITE_SPACE;
3104       break;
3105 
3106     case 'S':
3107       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3108       tok->type = TK_CHAR_TYPE;
3109       tok->u.subtype = CTYPE_NOT_WHITE_SPACE;
3110       break;
3111 
3112     case 'd':
3113       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3114       tok->type = TK_CHAR_TYPE;
3115       tok->u.subtype = CTYPE_DIGIT;
3116       break;
3117 
3118     case 'D':
3119       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3120       tok->type = TK_CHAR_TYPE;
3121       tok->u.subtype = CTYPE_NOT_DIGIT;
3122       break;
3123 
3124     case 'h':
3125       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3126       tok->type = TK_CHAR_TYPE;
3127       tok->u.subtype = CTYPE_XDIGIT;
3128       break;
3129 
3130     case 'H':
3131       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3132       tok->type = TK_CHAR_TYPE;
3133       tok->u.subtype = CTYPE_NOT_XDIGIT;
3134       break;
3135 
3136     case 'A':
3137       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3138     begin_buf:
3139       tok->type = TK_ANCHOR;
3140       tok->u.subtype = ANCHOR_BEGIN_BUF;
3141       break;
3142 
3143     case 'Z':
3144       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3145       tok->type = TK_ANCHOR;
3146       tok->u.subtype = ANCHOR_SEMI_END_BUF;
3147       break;
3148 
3149     case 'z':
3150       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3151     end_buf:
3152       tok->type = TK_ANCHOR;
3153       tok->u.subtype = ANCHOR_END_BUF;
3154       break;
3155 
3156     case 'G':
3157       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break;
3158       tok->type = TK_ANCHOR;
3159       tok->u.subtype = ANCHOR_BEGIN_POSITION;
3160       break;
3161 
3162     case '`':
3163       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3164       goto begin_buf;
3165       break;
3166 
3167     case '\'':
3168       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3169       goto end_buf;
3170       break;
3171 
3172     case 'x':
3173       if (PEND) break;
3174 
3175       prev = p;
3176       if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3177 	PINC;
3178 	num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
3179 	if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3180 	if (!PEND) {
3181           if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK))
3182             return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3183         }
3184 
3185 	if ((p > prev + enc_len(enc, prev)) && !PEND && PPEEK_IS('}')) {
3186 	  PINC;
3187 	  tok->type   = TK_CODE_POINT;
3188 	  tok->u.code = (OnigCodePoint )num;
3189 	}
3190 	else {
3191 	  /* can't read nothing or invalid format */
3192 	  p = prev;
3193 	}
3194       }
3195       else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3196 	num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
3197 	if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3198 	if (p == prev) {  /* can't read nothing. */
3199 	  num = 0; /* but, it's not error */
3200 	}
3201 	tok->type = TK_RAW_BYTE;
3202 	tok->base = 16;
3203 	tok->u.c  = num;
3204       }
3205       break;
3206 
3207     case 'u':
3208       if (PEND) break;
3209 
3210       prev = p;
3211       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3212 	num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
3213 	if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3214 	if (p == prev) {  /* can't read nothing. */
3215 	  num = 0; /* but, it's not error */
3216 	}
3217 	tok->type   = TK_CODE_POINT;
3218 	tok->base   = 16;
3219 	tok->u.code = (OnigCodePoint )num;
3220       }
3221       break;
3222 
3223     case '1': case '2': case '3': case '4':
3224     case '5': case '6': case '7': case '8': case '9':
3225       PUNFETCH;
3226       prev = p;
3227       num = onig_scan_unsigned_number(&p, end, enc);
3228       if (num < 0 || num > ONIG_MAX_BACKREF_NUM) {
3229         goto skip_backref;
3230       }
3231 
3232       if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) &&
3233 	  (num <= env->num_mem || num <= 9)) { /* This spec. from GNU regex */
3234 	if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3235 	  if (num > env->num_mem || IS_NULL(SCANENV_MEM_NODES(env)[num]))
3236 	    return ONIGERR_INVALID_BACKREF;
3237 	}
3238 
3239 	tok->type = TK_BACKREF;
3240 	tok->u.backref.num     = 1;
3241 	tok->u.backref.ref1    = num;
3242 	tok->u.backref.by_name = 0;
3243 #ifdef USE_BACKREF_AT_LEVEL
3244 	tok->u.backref.exist_level = 0;
3245 #endif
3246 	break;
3247       }
3248 
3249     skip_backref:
3250       if (c == '8' || c == '9') {
3251 	/* normal char */
3252 	p = prev; PINC;
3253 	break;
3254       }
3255 
3256       p = prev;
3257       /* fall through */
3258     case '0':
3259       if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
3260 	prev = p;
3261 	num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc);
3262 	if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3263 	if (p == prev) {  /* can't read nothing. */
3264 	  num = 0; /* but, it's not error */
3265 	}
3266 	tok->type = TK_RAW_BYTE;
3267 	tok->base = 8;
3268 	tok->u.c  = num;
3269       }
3270       else if (c != '0') {
3271 	PINC;
3272       }
3273       break;
3274 
3275 #ifdef USE_NAMED_GROUP
3276     case 'k':
3277       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) {
3278 	PFETCH(c);
3279 	if (c == '<') {
3280 	  UChar* name_end;
3281 	  int* backs;
3282 
3283 	  prev = p;
3284 
3285 #ifdef USE_BACKREF_AT_LEVEL
3286 	  name_end = NULL_UCHARP; /* no need. escape gcc warning. */
3287 	  r = fetch_name_with_level(&p, end, &name_end, env, &tok->u.backref.level);
3288 	  if (r == 1) tok->u.backref.exist_level = 1;
3289 	  else        tok->u.backref.exist_level = 0;
3290 #else
3291 	  r = fetch_name(&p, end, &name_end, env, 1);
3292 #endif
3293 	  if (r < 0) return r;
3294 
3295 	  num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs);
3296 	  if (num <= 0) {
3297 	    onig_scan_env_set_error_string(env,
3298 			    ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end);
3299 	    return ONIGERR_UNDEFINED_NAME_REFERENCE;
3300 	  }
3301 	  if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3302 	    int i;
3303 	    for (i = 0; i < num; i++) {
3304 	      if (backs[i] > env->num_mem ||
3305 		  IS_NULL(SCANENV_MEM_NODES(env)[backs[i]]))
3306 		return ONIGERR_INVALID_BACKREF;
3307 	    }
3308 	  }
3309 
3310 	  tok->type = TK_BACKREF;
3311 	  tok->u.backref.by_name = 1;
3312 	  if (num == 1) {
3313 	    tok->u.backref.num  = 1;
3314 	    tok->u.backref.ref1 = backs[0];
3315 	  }
3316 	  else {
3317 	    tok->u.backref.num  = num;
3318 	    tok->u.backref.refs = backs;
3319 	  }
3320 	}
3321 	else
3322 	  PUNFETCH;
3323       }
3324       break;
3325 #endif
3326 
3327 #ifdef USE_SUBEXP_CALL
3328     case 'g':
3329       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) {
3330 	PFETCH(c);
3331 	if (c == '<') {
3332 	  UChar* name_end;
3333 
3334 	  prev = p;
3335 	  r = fetch_name(&p, end, &name_end, env, 1);
3336 	  if (r < 0) return r;
3337 
3338 	  tok->type = TK_CALL;
3339 	  tok->u.call.name     = prev;
3340 	  tok->u.call.name_end = name_end;
3341 	}
3342 	else
3343 	  PUNFETCH;
3344       }
3345       break;
3346 #endif
3347 
3348     case 'Q':
3349       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) {
3350 	tok->type = TK_QUOTE_OPEN;
3351       }
3352       break;
3353 
3354     case 'p':
3355     case 'P':
3356       if (PPEEK_IS('{') &&
3357 	  IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
3358 	PINC;
3359 	tok->type = TK_CHAR_PROPERTY;
3360 	tok->u.prop.not = (c == 'P' ? 1 : 0);
3361 
3362 	if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
3363 	  PFETCH(c);
3364 	  if (c == '^') {
3365 	    tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
3366 	  }
3367 	  else
3368 	    PUNFETCH;
3369 	}
3370       }
3371       break;
3372 
3373     default:
3374       PUNFETCH;
3375       num = fetch_escaped_value(&p, end, env);
3376       if (num < 0) return num;
3377       /* set_raw: */
3378       if (tok->u.c != num) {
3379 	tok->type = TK_CODE_POINT;
3380 	tok->u.code = (OnigCodePoint )num;
3381       }
3382       else { /* string */
3383 	p = tok->backp + enc_len(enc, tok->backp);
3384       }
3385       break;
3386     }
3387   }
3388   else {
3389     tok->u.c = c;
3390     tok->escaped = 0;
3391 
3392 #ifdef USE_VARIABLE_META_CHARS
3393     if ((c != ONIG_INEFFECTIVE_META_CHAR) &&
3394 	IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) {
3395       if (c == MC_ANYCHAR(enc))
3396 	goto any_char;
3397       else if (c == MC_ANYTIME(enc))
3398 	goto anytime;
3399       else if (c == MC_ZERO_OR_ONE_TIME(enc))
3400 	goto zero_or_one_time;
3401       else if (c == MC_ONE_OR_MORE_TIME(enc))
3402 	goto one_or_more_time;
3403       else if (c == MC_ANYCHAR_ANYTIME(enc)) {
3404 	tok->type = TK_ANYCHAR_ANYTIME;
3405 	goto out;
3406       }
3407     }
3408 #endif
3409 
3410     switch (c) {
3411     case '.':
3412       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break;
3413 #ifdef USE_VARIABLE_META_CHARS
3414     any_char:
3415 #endif
3416       tok->type = TK_ANYCHAR;
3417       break;
3418 
3419     case '*':
3420       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break;
3421 #ifdef USE_VARIABLE_META_CHARS
3422     anytime:
3423 #endif
3424       tok->type = TK_OP_REPEAT;
3425       tok->u.repeat.lower = 0;
3426       tok->u.repeat.upper = REPEAT_INFINITE;
3427       goto greedy_check;
3428       break;
3429 
3430     case '+':
3431       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break;
3432 #ifdef USE_VARIABLE_META_CHARS
3433     one_or_more_time:
3434 #endif
3435       tok->type = TK_OP_REPEAT;
3436       tok->u.repeat.lower = 1;
3437       tok->u.repeat.upper = REPEAT_INFINITE;
3438       goto greedy_check;
3439       break;
3440 
3441     case '?':
3442       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break;
3443 #ifdef USE_VARIABLE_META_CHARS
3444     zero_or_one_time:
3445 #endif
3446       tok->type = TK_OP_REPEAT;
3447       tok->u.repeat.lower = 0;
3448       tok->u.repeat.upper = 1;
3449       goto greedy_check;
3450       break;
3451 
3452     case '{':
3453       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break;
3454       r = fetch_range_quantifier(&p, end, tok, env);
3455       if (r < 0) return r;  /* error */
3456       if (r == 0) goto greedy_check;
3457       else if (r == 2) { /* {n} */
3458 	if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3459 	  goto possessive_check;
3460 
3461 	goto greedy_check;
3462       }
3463       /* r == 1 : normal char */
3464       break;
3465 
3466     case '|':
3467       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break;
3468       tok->type = TK_ALT;
3469       break;
3470 
3471     case '(':
3472       if (PPEEK_IS('?') &&
3473           IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
3474         PINC;
3475         if (PPEEK_IS('#')) {
3476           PFETCH(c);
3477           while (1) {
3478             if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
3479             PFETCH(c);
3480             if (c == MC_ESC(enc)) {
3481               if (!PEND) PFETCH(c);
3482             }
3483             else {
3484               if (c == ')') break;
3485             }
3486           }
3487           goto start;
3488         }
3489         PUNFETCH;
3490       }
3491 
3492       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3493       tok->type = TK_SUBEXP_OPEN;
3494       break;
3495 
3496     case ')':
3497       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3498       tok->type = TK_SUBEXP_CLOSE;
3499       break;
3500 
3501     case '^':
3502       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
3503       tok->type = TK_ANCHOR;
3504       tok->u.subtype = (IS_SINGLELINE(env->option)
3505 			? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE);
3506       break;
3507 
3508     case '$':
3509       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
3510       tok->type = TK_ANCHOR;
3511       tok->u.subtype = (IS_SINGLELINE(env->option)
3512 			? ANCHOR_SEMI_END_BUF : ANCHOR_END_LINE);
3513       break;
3514 
3515     case '[':
3516       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break;
3517       tok->type = TK_CC_OPEN;
3518       break;
3519 
3520     case ']':
3521       if (*src > env->pattern)   /* /].../ is allowed. */
3522 	CCEND_ESC_WARN(env, (UChar* )"]");
3523       break;
3524 
3525     case '#':
3526       if (IS_EXTEND(env->option)) {
3527 	while (!PEND) {
3528 	  PFETCH(c);
3529 	  if (ONIGENC_IS_CODE_NEWLINE(enc, c))
3530 	    break;
3531 	}
3532 	goto start;
3533 	break;
3534       }
3535       break;
3536 
3537     case ' ': case '\t': case '\n': case '\r': case '\f':
3538       if (IS_EXTEND(env->option))
3539 	goto start;
3540       break;
3541 
3542     default:
3543       /* string */
3544       break;
3545     }
3546   }
3547 
3548 #ifdef USE_VARIABLE_META_CHARS
3549  out:
3550 #endif
3551   *src = p;
3552   return tok->type;
3553 }
3554 
3555 static int
add_ctype_to_cc_by_range(CClassNode * cc,int ctype,int not,OnigEncoding enc,const OnigCodePoint sbr[],const OnigCodePoint mbr[])3556 add_ctype_to_cc_by_range(CClassNode* cc, int ctype, int not, OnigEncoding enc,
3557                          const OnigCodePoint sbr[], const OnigCodePoint mbr[])
3558 {
3559   int i, r;
3560   OnigCodePoint j;
3561 
3562   int nsb = ONIGENC_CODE_RANGE_NUM(sbr);
3563   int nmb = ONIGENC_CODE_RANGE_NUM(mbr);
3564 
3565   if (not == 0) {
3566     for (i = 0; i < nsb; i++) {
3567       for (j  = ONIGENC_CODE_RANGE_FROM(sbr, i);
3568            j <= ONIGENC_CODE_RANGE_TO(sbr, i); j++) {
3569         BITSET_SET_BIT(cc->bs, j);
3570       }
3571     }
3572 
3573     for (i = 0; i < nmb; i++) {
3574       r = add_code_range_to_buf(&(cc->mbuf),
3575                                 ONIGENC_CODE_RANGE_FROM(mbr, i),
3576                                 ONIGENC_CODE_RANGE_TO(mbr, i));
3577       if (r != 0) return r;
3578     }
3579   }
3580   else {
3581     OnigCodePoint prev = 0;
3582 
3583     if (ONIGENC_MBC_MINLEN(enc) == 1) {
3584       for (i = 0; i < nsb; i++) {
3585         for (j = prev;
3586              j < ONIGENC_CODE_RANGE_FROM(sbr, i); j++) {
3587           BITSET_SET_BIT(cc->bs, j);
3588         }
3589         prev = ONIGENC_CODE_RANGE_TO(sbr, i) + 1;
3590       }
3591       if (prev < 0x7f) {
3592         for (j = prev; j < 0x7f; j++) {
3593           BITSET_SET_BIT(cc->bs, j);
3594         }
3595       }
3596 
3597       prev = 0x80;
3598     }
3599 
3600     for (i = 0; i < nmb; i++) {
3601       if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) {
3602 	r = add_code_range_to_buf(&(cc->mbuf), prev,
3603                                   ONIGENC_CODE_RANGE_FROM(mbr, i) - 1);
3604 	if (r != 0) return r;
3605       }
3606       prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
3607     }
3608     if (prev < 0x7fffffff) {
3609       r = add_code_range_to_buf(&(cc->mbuf), prev, 0x7fffffff);
3610       if (r != 0) return r;
3611     }
3612   }
3613 
3614   return 0;
3615 }
3616 
3617 static int
add_ctype_to_cc(CClassNode * cc,int ctype,int not,ScanEnv * env)3618 add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env)
3619 {
3620   int c, r;
3621   const OnigCodePoint *sbr, *mbr;
3622   OnigEncoding enc = env->enc;
3623 
3624   r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sbr, &mbr);
3625   if (r == 0) {
3626     return add_ctype_to_cc_by_range(cc, ctype, not, env->enc, sbr, mbr);
3627   }
3628   else if (r != ONIG_NO_SUPPORT_CONFIG) {
3629     return r;
3630   }
3631 
3632   r = 0;
3633   switch (ctype) {
3634   case ONIGENC_CTYPE_ALPHA:
3635   case ONIGENC_CTYPE_BLANK:
3636   case ONIGENC_CTYPE_CNTRL:
3637   case ONIGENC_CTYPE_DIGIT:
3638   case ONIGENC_CTYPE_LOWER:
3639   case ONIGENC_CTYPE_PUNCT:
3640   case ONIGENC_CTYPE_SPACE:
3641   case ONIGENC_CTYPE_UPPER:
3642   case ONIGENC_CTYPE_XDIGIT:
3643   case ONIGENC_CTYPE_ASCII:
3644   case ONIGENC_CTYPE_ALNUM:
3645     if (not != 0) {
3646       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3647 	if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3648 	  BITSET_SET_BIT(cc->bs, c);
3649       }
3650       ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3651     }
3652     else {
3653       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3654 	if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3655 	  BITSET_SET_BIT(cc->bs, c);
3656       }
3657     }
3658     break;
3659 
3660   case ONIGENC_CTYPE_GRAPH:
3661   case ONIGENC_CTYPE_PRINT:
3662     if (not != 0) {
3663       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3664 	if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3665 	  BITSET_SET_BIT(cc->bs, c);
3666       }
3667     }
3668     else {
3669       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3670 	if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3671 	  BITSET_SET_BIT(cc->bs, c);
3672       }
3673       ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3674     }
3675     break;
3676 
3677   case ONIGENC_CTYPE_WORD:
3678     if (not == 0) {
3679       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3680 	if (ONIGENC_IS_CODE_SB_WORD(enc, c)) BITSET_SET_BIT(cc->bs, c);
3681       }
3682       ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3683     }
3684     else {
3685       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3686         if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0)  /* 0: invalid code point */
3687 	    && ! ONIGENC_IS_CODE_WORD(enc, c))
3688 	  BITSET_SET_BIT(cc->bs, c);
3689       }
3690     }
3691     break;
3692 
3693   default:
3694     return ONIGERR_PARSER_BUG;
3695     break;
3696   }
3697 
3698   return r;
3699 }
3700 
3701 static int
parse_ctype_to_enc_ctype(int pctype,int * not)3702 parse_ctype_to_enc_ctype(int pctype, int* not)
3703 {
3704   int ctype;
3705 
3706   switch (pctype) {
3707   case CTYPE_WORD:
3708     ctype = ONIGENC_CTYPE_WORD;
3709     *not = 0;
3710     break;
3711   case CTYPE_NOT_WORD:
3712     ctype = ONIGENC_CTYPE_WORD;
3713     *not = 1;
3714     break;
3715   case CTYPE_WHITE_SPACE:
3716     ctype = ONIGENC_CTYPE_SPACE;
3717     *not = 0;
3718     break;
3719   case CTYPE_NOT_WHITE_SPACE:
3720     ctype = ONIGENC_CTYPE_SPACE;
3721     *not = 1;
3722     break;
3723   case CTYPE_DIGIT:
3724     ctype = ONIGENC_CTYPE_DIGIT;
3725     *not = 0;
3726     break;
3727   case CTYPE_NOT_DIGIT:
3728     ctype = ONIGENC_CTYPE_DIGIT;
3729     *not = 1;
3730     break;
3731   case CTYPE_XDIGIT:
3732     ctype = ONIGENC_CTYPE_XDIGIT;
3733     *not = 0;
3734     break;
3735   case CTYPE_NOT_XDIGIT:
3736     ctype = ONIGENC_CTYPE_XDIGIT;
3737     *not = 1;
3738     break;
3739   default:
3740     return ONIGERR_PARSER_BUG;
3741     break;
3742   }
3743   return ctype;
3744 }
3745 
3746 typedef struct {
3747   UChar    *name;
3748   int       ctype;
3749   short int len;
3750 } PosixBracketEntryType;
3751 
3752 static int
parse_posix_bracket(CClassNode * cc,UChar ** src,UChar * end,ScanEnv * env)3753 parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env)
3754 {
3755 #define POSIX_BRACKET_CHECK_LIMIT_LENGTH  20
3756 #define POSIX_BRACKET_NAME_MAX_LEN         6
3757 
3758   static PosixBracketEntryType PBS[] = {
3759     { (UChar* )"alnum",  ONIGENC_CTYPE_ALNUM,  5 },
3760     { (UChar* )"alpha",  ONIGENC_CTYPE_ALPHA,  5 },
3761     { (UChar* )"blank",  ONIGENC_CTYPE_BLANK,  5 },
3762     { (UChar* )"cntrl",  ONIGENC_CTYPE_CNTRL,  5 },
3763     { (UChar* )"digit",  ONIGENC_CTYPE_DIGIT,  5 },
3764     { (UChar* )"graph",  ONIGENC_CTYPE_GRAPH,  5 },
3765     { (UChar* )"lower",  ONIGENC_CTYPE_LOWER,  5 },
3766     { (UChar* )"print",  ONIGENC_CTYPE_PRINT,  5 },
3767     { (UChar* )"punct",  ONIGENC_CTYPE_PUNCT,  5 },
3768     { (UChar* )"space",  ONIGENC_CTYPE_SPACE,  5 },
3769     { (UChar* )"upper",  ONIGENC_CTYPE_UPPER,  5 },
3770     { (UChar* )"xdigit", ONIGENC_CTYPE_XDIGIT, 6 },
3771     { (UChar* )"ascii",  ONIGENC_CTYPE_ASCII,  5 },
3772     { (UChar* )NULL, -1, 0 }
3773   };
3774 
3775   PosixBracketEntryType *pb;
3776   int not, i, r;
3777   OnigCodePoint c;
3778   OnigEncoding enc = env->enc;
3779   UChar *p = *src;
3780   PFETCH_READY;
3781 
3782   if (PPEEK_IS('^')) {
3783     PINC;
3784     not = 1;
3785   }
3786   else
3787     not = 0;
3788 
3789   if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MAX_LEN + 2)
3790     goto not_posix_bracket;
3791 
3792   for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
3793     if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) {
3794       p = (UChar* )onigenc_step(enc, p, end, pb->len);
3795       if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0)
3796 	return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
3797 
3798       r = add_ctype_to_cc(cc, pb->ctype, not, env);
3799       if (r != 0) return r;
3800 
3801       PINC; PINC;
3802       *src = p;
3803       return 0;
3804     }
3805   }
3806 
3807  not_posix_bracket:
3808   c = 0;
3809   i = 0;
3810   while (!PEND && ((c = PPEEK) != ':') && c != ']') {
3811     PINC;
3812     if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break;
3813   }
3814   if (c == ':' && ! PEND) {
3815     PINC;
3816     if (! PEND) {
3817       PFETCH(c);
3818       if (c == ']')
3819 	return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
3820     }
3821   }
3822 
3823   return 1;   /* 1: is not POSIX bracket, but no error. */
3824 }
3825 
3826 static int
property_name_to_ctype(UChar * p,UChar * end,OnigEncoding enc)3827 property_name_to_ctype(UChar* p, UChar* end, OnigEncoding enc)
3828 {
3829   static PosixBracketEntryType PBS[] = {
3830     { (UChar* )"Alnum",  ONIGENC_CTYPE_ALNUM,  5 },
3831     { (UChar* )"Alpha",  ONIGENC_CTYPE_ALPHA,  5 },
3832     { (UChar* )"Blank",  ONIGENC_CTYPE_BLANK,  5 },
3833     { (UChar* )"Cntrl",  ONIGENC_CTYPE_CNTRL,  5 },
3834     { (UChar* )"Digit",  ONIGENC_CTYPE_DIGIT,  5 },
3835     { (UChar* )"Graph",  ONIGENC_CTYPE_GRAPH,  5 },
3836     { (UChar* )"Lower",  ONIGENC_CTYPE_LOWER,  5 },
3837     { (UChar* )"Print",  ONIGENC_CTYPE_PRINT,  5 },
3838     { (UChar* )"Punct",  ONIGENC_CTYPE_PUNCT,  5 },
3839     { (UChar* )"Space",  ONIGENC_CTYPE_SPACE,  5 },
3840     { (UChar* )"Upper",  ONIGENC_CTYPE_UPPER,  5 },
3841     { (UChar* )"XDigit", ONIGENC_CTYPE_XDIGIT, 6 },
3842     { (UChar* )"ASCII",  ONIGENC_CTYPE_ASCII,  5 },
3843     { (UChar* )NULL, -1, 0 }
3844   };
3845 
3846   PosixBracketEntryType *pb;
3847   int len;
3848 
3849   len = onigenc_strlen(enc, p, end);
3850   for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
3851     if (len == pb->len &&
3852         onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0)
3853       return pb->ctype;
3854   }
3855 
3856   return -1;
3857 }
3858 
3859 static int
fetch_char_property_to_ctype(UChar ** src,UChar * end,ScanEnv * env)3860 fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env)
3861 {
3862   int ctype;
3863   OnigCodePoint c;
3864   OnigEncoding enc = env->enc;
3865   UChar *prev, *start, *p = *src;
3866   PFETCH_READY;
3867 
3868   /* 'IsXXXX' => 'XXXX' */
3869   if (!PEND &&
3870       IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_CHAR_PROPERTY_PREFIX_IS)) {
3871     c = PPEEK;
3872     if (c == 'I') {
3873       PINC;
3874       if (! PEND) {
3875 	c = PPEEK;
3876 	if (c == 's')
3877 	  PINC;
3878 	else
3879 	  PUNFETCH;
3880       }
3881     }
3882   }
3883 
3884   start = prev = p;
3885 
3886   while (!PEND) {
3887     prev = p;
3888     PFETCH(c);
3889     if (c == '}') {
3890       ctype = property_name_to_ctype(start, prev, enc);
3891       if (ctype < 0) break;
3892 
3893       *src = p;
3894       return ctype;
3895     }
3896     else if (c == '(' || c == ')' || c == '{' || c == '|')
3897       break;
3898   }
3899 
3900   onig_scan_env_set_error_string(env, ONIGERR_INVALID_CHAR_PROPERTY_NAME,
3901 				 *src, prev);
3902   return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
3903 }
3904 
3905 static int
parse_char_property(Node ** np,OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)3906 parse_char_property(Node** np, OnigToken* tok, UChar** src, UChar* end,
3907 		    ScanEnv* env)
3908 {
3909   int r, ctype;
3910   CClassNode* cc;
3911 
3912   ctype = fetch_char_property_to_ctype(src, end, env);
3913   if (ctype < 0) return ctype;
3914 
3915   *np = node_new_cclass();
3916   CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
3917   cc = &(NCCLASS(*np));
3918   r = add_ctype_to_cc(cc, ctype, 0, env);
3919   if (r != 0) return r;
3920   if (tok->u.prop.not != 0) CCLASS_SET_NOT(cc);
3921 
3922   return 0;
3923 }
3924 
3925 
/* State of the character class parser between two items. */
enum CCSTATE {
  CCS_VALUE    = 0,  /* an item was read; a single value may be pending */
  CCS_RANGE    = 1,  /* a range operator was read after a value */
  CCS_COMPLETE = 2,  /* a range has just been completed */
  CCS_START    = 3   /* nothing read yet (start of class or after "&&") */
};
3932 
/* Kind of the most recent item read inside a character class. */
enum CCVALTYPE {
  CCV_SB         = 0,  /* value that is stored as a bit in the bitset */
  CCV_CODE_POINT = 1,  /* value that is stored as a code range */
  CCV_CLASS      = 2   /* a whole class (ctype, property, POSIX bracket) */
};
3938 
3939 static int
next_state_class(CClassNode * cc,OnigCodePoint * vs,enum CCVALTYPE * type,enum CCSTATE * state,ScanEnv * env)3940 next_state_class(CClassNode* cc, OnigCodePoint* vs, enum CCVALTYPE* type,
3941 		 enum CCSTATE* state, ScanEnv* env)
3942 {
3943   int r;
3944 
3945   if (*state == CCS_RANGE)
3946     return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE;
3947 
3948   if (*state == CCS_VALUE && *type != CCV_CLASS) {
3949     if (*type == CCV_SB)
3950       BITSET_SET_BIT(cc->bs, (int )(*vs));
3951     else if (*type == CCV_CODE_POINT) {
3952       r = add_code_range(&(cc->mbuf), env, *vs, *vs);
3953       if (r < 0) return r;
3954     }
3955   }
3956 
3957   *state = CCS_VALUE;
3958   *type  = CCV_CLASS;
3959   return 0;
3960 }
3961 
3962 static int
next_state_val(CClassNode * cc,OnigCodePoint * vs,OnigCodePoint v,int * vs_israw,int v_israw,enum CCVALTYPE intype,enum CCVALTYPE * type,enum CCSTATE * state,ScanEnv * env)3963 next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v,
3964 	       int* vs_israw, int v_israw,
3965 	       enum CCVALTYPE intype, enum CCVALTYPE* type,
3966 	       enum CCSTATE* state, ScanEnv* env)
3967 {
3968   int r;
3969 
3970   switch (*state) {
3971   case CCS_VALUE:
3972     if (*type == CCV_SB)
3973       BITSET_SET_BIT(cc->bs, (int )(*vs));
3974     else if (*type == CCV_CODE_POINT) {
3975       r = add_code_range(&(cc->mbuf), env, *vs, *vs);
3976       if (r < 0) return r;
3977     }
3978     break;
3979 
3980   case CCS_RANGE:
3981     if (intype == *type) {
3982       if (intype == CCV_SB) {
3983         if (*vs > 0xff || v > 0xff)
3984           return ONIGERR_INVALID_WIDE_CHAR_VALUE;
3985 
3986 	if (*vs > v) {
3987 	  if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
3988 	    goto ccs_range_end;
3989 	  else
3990 	    return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
3991 	}
3992 	bitset_set_range(cc->bs, (int )*vs, (int )v);
3993       }
3994       else {
3995 	r = add_code_range(&(cc->mbuf), env, *vs, v);
3996 	if (r < 0) return r;
3997       }
3998     }
3999     else {
4000 #if 0
4001       if (intype == CCV_CODE_POINT && *type == CCV_SB) {
4002 #endif
4003 	if (*vs > v) {
4004 	  if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
4005 	    goto ccs_range_end;
4006 	  else
4007 	    return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
4008 	}
4009 	bitset_set_range(cc->bs, (int )*vs, (int )(v < 0xff ? v : 0xff));
4010 	r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v);
4011 	if (r < 0) return r;
4012 #if 0
4013       }
4014       else
4015 	return ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE;
4016 #endif
4017     }
4018   ccs_range_end:
4019     *state = CCS_COMPLETE;
4020     break;
4021 
4022   case CCS_COMPLETE:
4023   case CCS_START:
4024     *state = CCS_VALUE;
4025     break;
4026 
4027   default:
4028     break;
4029   }
4030 
4031   *vs_israw = v_israw;
4032   *vs       = v;
4033   *type     = intype;
4034   return 0;
4035 }
4036 
4037 static int
code_exist_check(OnigCodePoint c,UChar * from,UChar * end,int ignore_escaped,OnigEncoding enc)4038 code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped,
4039 		 OnigEncoding enc)
4040 {
4041   int in_esc;
4042   OnigCodePoint code;
4043   UChar* p = from;
4044   PFETCH_READY;
4045 
4046   in_esc = 0;
4047   while (! PEND) {
4048     if (ignore_escaped && in_esc) {
4049       in_esc = 0;
4050     }
4051     else {
4052       PFETCH(code);
4053       if (code == c) return 1;
4054       if (code == MC_ESC(enc)) in_esc = 1;
4055     }
4056   }
4057   return 0;
4058 }
4059 
4060 static int
parse_char_class(Node ** np,OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)4061 parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
4062 		 ScanEnv* env)
4063 {
4064   int r, neg, len, fetched, and_start;
4065   OnigCodePoint v, vs;
4066   UChar *p;
4067   Node* node;
4068   CClassNode *cc, *prev_cc;
4069   CClassNode work_cc;
4070 
4071   enum CCSTATE state;
4072   enum CCVALTYPE val_type, in_type;
4073   int val_israw, in_israw;
4074 
4075   prev_cc = (CClassNode* )NULL;
4076   *np = NULL_NODE;
4077   r = fetch_token_in_cc(tok, src, end, env);
4078   if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) {
4079     neg = 1;
4080     r = fetch_token_in_cc(tok, src, end, env);
4081   }
4082   else {
4083     neg = 0;
4084   }
4085 
4086   if (r < 0) return r;
4087   if (r == TK_CC_CLOSE) {
4088     if (! code_exist_check((OnigCodePoint )']',
4089                            *src, env->pattern_end, 1, env->enc))
4090       return ONIGERR_EMPTY_CHAR_CLASS;
4091 
4092     CC_ESC_WARN(env, (UChar* )"]");
4093     r = tok->type = TK_CHAR;  /* allow []...] */
4094   }
4095 
4096   *np = node = node_new_cclass();
4097   CHECK_NULL_RETURN_VAL(node, ONIGERR_MEMORY);
4098   cc = &(NCCLASS(node));
4099 
4100   and_start = 0;
4101   state = CCS_START;
4102   p = *src;
4103   while (r != TK_CC_CLOSE) {
4104     fetched = 0;
4105     switch (r) {
4106     case TK_CHAR:
4107       len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c);
4108       if (len > 1) {
4109 	in_type = CCV_CODE_POINT;
4110       }
4111       else {
4112       sb_char:
4113 	in_type = CCV_SB;
4114       }
4115       v = (OnigCodePoint )tok->u.c;
4116       in_israw = 0;
4117       goto val_entry2;
4118       break;
4119 
4120     case TK_RAW_BYTE:
4121       /* tok->base != 0 : octal or hexadec. */
4122       if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) {
4123 	UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4124 	UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN;
4125 	UChar* psave = p;
4126 	int i, base = tok->base;
4127 
4128 	buf[0] = tok->u.c;
4129 	for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) {
4130 	  r = fetch_token_in_cc(tok, &p, end, env);
4131 	  if (r < 0) goto err;
4132 	  if (r != TK_RAW_BYTE || tok->base != base) {
4133 	    fetched = 1;
4134 	    break;
4135 	  }
4136 	  buf[i] = tok->u.c;
4137 	}
4138 
4139 	if (i < ONIGENC_MBC_MINLEN(env->enc)) {
4140 	  r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4141 	  goto err;
4142 	}
4143 
4144 	len = enc_len(env->enc, buf);
4145 	if (i < len) {
4146 	  r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4147 	  goto err;
4148 	}
4149 	else if (i > len) { /* fetch back */
4150 	  p = psave;
4151 	  for (i = 1; i < len; i++) {
4152 	    r = fetch_token_in_cc(tok, &p, end, env);
4153 	  }
4154 	  fetched = 0;
4155 	}
4156 
4157 	if (i == 1) {
4158 	  v = (OnigCodePoint )buf[0];
4159 	  goto raw_single;
4160 	}
4161 	else {
4162 	  v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe);
4163 	  in_type = CCV_CODE_POINT;
4164 	}
4165       }
4166       else {
4167 	v = (OnigCodePoint )tok->u.c;
4168       raw_single:
4169 	in_type = CCV_SB;
4170       }
4171       in_israw = 1;
4172       goto val_entry2;
4173       break;
4174 
4175     case TK_CODE_POINT:
4176       v = tok->u.code;
4177       in_israw = 1;
4178     val_entry:
4179       len = ONIGENC_CODE_TO_MBCLEN(env->enc, v);
4180       if (len < 0) {
4181 	r = len;
4182 	goto err;
4183       }
4184       in_type = (len == 1 ? CCV_SB : CCV_CODE_POINT);
4185     val_entry2:
4186       r = next_state_val(cc, &vs, v, &val_israw, in_israw, in_type, &val_type,
4187 			 &state, env);
4188       if (r != 0) goto err;
4189       break;
4190 
4191     case TK_POSIX_BRACKET_OPEN:
4192       r = parse_posix_bracket(cc, &p, end, env);
4193       if (r < 0) goto err;
4194       if (r == 1) {  /* is not POSIX bracket */
4195 	CC_ESC_WARN(env, (UChar* )"[");
4196 	p = tok->backp;
4197 	v = (OnigCodePoint )tok->u.c;
4198 	in_israw = 0;
4199 	goto val_entry;
4200       }
4201       goto next_class;
4202       break;
4203 
4204     case TK_CHAR_TYPE:
4205       {
4206 	int ctype, not;
4207 	ctype = parse_ctype_to_enc_ctype(tok->u.subtype, &not);
4208 	r = add_ctype_to_cc(cc, ctype, not, env);
4209 	if (r != 0) return r;
4210       }
4211 
4212     next_class:
4213       r = next_state_class(cc, &vs, &val_type, &state, env);
4214       if (r != 0) goto err;
4215       break;
4216 
4217     case TK_CHAR_PROPERTY:
4218       {
4219 	int ctype;
4220 
4221 	ctype = fetch_char_property_to_ctype(&p, end, env);
4222 	if (ctype < 0) return ctype;
4223 	r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, env);
4224 	if (r != 0) return r;
4225 	goto next_class;
4226       }
4227       break;
4228 
4229     case TK_CC_RANGE:
4230       if (state == CCS_VALUE) {
4231 	r = fetch_token_in_cc(tok, &p, end, env);
4232 	if (r < 0) goto err;
4233 	fetched = 1;
4234 	if (r == TK_CC_CLOSE) { /* allow [x-] */
4235 	range_end_val:
4236 	  v = (OnigCodePoint )'-';
4237 	  in_israw = 0;
4238 	  goto val_entry;
4239 	}
4240 	else if (r == TK_CC_AND) {
4241 	  CC_ESC_WARN(env, (UChar* )"-");
4242 	  goto range_end_val;
4243 	}
4244 	state = CCS_RANGE;
4245       }
4246       else if (state == CCS_START) {
4247 	/* [-xa] is allowed */
4248 	v = (OnigCodePoint )tok->u.c;
4249 	in_israw = 0;
4250 
4251 	r = fetch_token_in_cc(tok, &p, end, env);
4252 	if (r < 0) goto err;
4253 	fetched = 1;
4254 	/* [--x] or [a&&-x] is warned. */
4255 	if (r == TK_CC_RANGE || and_start != 0)
4256 	  CC_ESC_WARN(env, (UChar* )"-");
4257 
4258 	goto val_entry;
4259       }
4260       else if (state == CCS_RANGE) {
4261 	CC_ESC_WARN(env, (UChar* )"-");
4262 	goto sb_char;  /* [!--x] is allowed */
4263       }
4264       else { /* CCS_COMPLETE */
4265 	r = fetch_token_in_cc(tok, &p, end, env);
4266 	if (r < 0) goto err;
4267 	fetched = 1;
4268 	if (r == TK_CC_CLOSE) goto range_end_val; /* allow [a-b-] */
4269 	else if (r == TK_CC_AND) {
4270 	  CC_ESC_WARN(env, (UChar* )"-");
4271 	  goto range_end_val;
4272 	}
4273 
4274 	if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) {
4275 	  CC_ESC_WARN(env, (UChar* )"-");
4276 	  goto sb_char;   /* [0-9-a] is allowed as [0-9\-a] */
4277 	}
4278 	r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
4279 	goto err;
4280       }
4281       break;
4282 
4283     case TK_CC_CC_OPEN: /* [ */
4284       {
4285 	Node *anode;
4286 	CClassNode* acc;
4287 
4288 	r = parse_char_class(&anode, tok, &p, end, env);
4289 	if (r != 0) goto cc_open_err;
4290 	acc = &(NCCLASS(anode));
4291 	r = or_cclass(cc, acc, env->enc);
4292 
4293 	onig_node_free(anode);
4294       cc_open_err:
4295 	if (r != 0) goto err;
4296       }
4297       break;
4298 
4299     case TK_CC_AND: /* && */
4300       {
4301 	if (state == CCS_VALUE) {
4302 	  r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
4303 			     &val_type, &state, env);
4304 	  if (r != 0) goto err;
4305 	}
4306 	/* initialize local variables */
4307 	and_start = 1;
4308 	state = CCS_START;
4309 
4310 	if (IS_NOT_NULL(prev_cc)) {
4311 	  r = and_cclass(prev_cc, cc, env->enc);
4312 	  if (r != 0) goto err;
4313 	  bbuf_free(cc->mbuf);
4314 	}
4315 	else {
4316 	  prev_cc = cc;
4317 	  cc = &work_cc;
4318 	}
4319 	initialize_cclass(cc);
4320       }
4321       break;
4322 
4323     case TK_EOT:
4324       r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS;
4325       goto err;
4326       break;
4327     default:
4328       r = ONIGERR_PARSER_BUG;
4329       goto err;
4330       break;
4331     }
4332 
4333     if (fetched)
4334       r = tok->type;
4335     else {
4336       r = fetch_token_in_cc(tok, &p, end, env);
4337       if (r < 0) goto err;
4338     }
4339   }
4340 
4341   if (state == CCS_VALUE) {
4342     r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
4343 		       &val_type, &state, env);
4344     if (r != 0) goto err;
4345   }
4346 
4347   if (IS_NOT_NULL(prev_cc)) {
4348     r = and_cclass(prev_cc, cc, env->enc);
4349     if (r != 0) goto err;
4350     bbuf_free(cc->mbuf);
4351     cc = prev_cc;
4352   }
4353 
4354   if (neg != 0)
4355     CCLASS_SET_NOT(cc);
4356   else
4357     CCLASS_CLEAR_NOT(cc);
4358   if (IS_CCLASS_NOT(cc) &&
4359       IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) {
4360     int is_empty;
4361 
4362     is_empty = (IS_NULL(cc->mbuf) ? 1 : 0);
4363     if (is_empty != 0)
4364       BITSET_IS_EMPTY(cc->bs, is_empty);
4365 
4366     if (is_empty == 0) {
4367 #define NEWLINE_CODE    0x0a
4368 
4369       if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) {
4370         if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1)
4371           BITSET_SET_BIT(cc->bs, NEWLINE_CODE);
4372         else
4373           add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE);
4374       }
4375     }
4376   }
4377   *src = p;
4378   return 0;
4379 
4380  err:
4381   if (cc != &(NCCLASS(*np)))
4382     bbuf_free(cc->mbuf);
4383   onig_node_free(*np);
4384   return r;
4385 }
4386 
4387 static int parse_subexp(Node** top, OnigToken* tok, int term,
4388 			UChar** src, UChar* end, ScanEnv* env);
4389 
4390 static int
parse_effect(Node ** np,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)4391 parse_effect(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
4392 	     ScanEnv* env)
4393 {
4394   int r, num;
4395   int list_capture;
4396   Node *target;
4397   OnigOptionType option;
4398   OnigEncoding enc = env->enc;
4399   OnigCodePoint c;
4400   UChar* p = *src;
4401   PFETCH_READY;
4402 
4403   *np = NULL;
4404   if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
4405 
4406   option = env->option;
4407   if (PPEEK_IS('?') &&
4408       IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
4409     PINC;
4410     if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
4411 
4412     PFETCH(c);
4413     switch (c) {
4414     case ':':   /* (?:...) grouping only */
4415     group:
4416       r = fetch_token(tok, &p, end, env);
4417       if (r < 0) return r;
4418       r = parse_subexp(np, tok, term, &p, end, env);
4419       if (r < 0) return r;
4420       *src = p;
4421       return 1; /* group */
4422       break;
4423 
4424     case '=':
4425       *np = onig_node_new_anchor(ANCHOR_PREC_READ);
4426       break;
4427     case '!':  /*         preceding read */
4428       *np = onig_node_new_anchor(ANCHOR_PREC_READ_NOT);
4429       break;
4430     case '>':            /* (?>...) stop backtrack */
4431       *np = node_new_effect(EFFECT_STOP_BACKTRACK);
4432       break;
4433 
4434     case '<':   /* look behind (?<=...), (?<!...) */
4435       PFETCH(c);
4436       if (c == '=')
4437 	*np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND);
4438       else if (c == '!')
4439 	*np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND_NOT);
4440 #ifdef USE_NAMED_GROUP
4441       else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4442 	UChar *name;
4443 	UChar *name_end;
4444 
4445 	PUNFETCH;
4446 	list_capture = 0;
4447 
4448       named_group:
4449 	name = p;
4450 	r = fetch_name(&p, end, &name_end, env, 0);
4451 	if (r < 0) return r;
4452 
4453 	num = scan_env_add_mem_entry(env);
4454 	if (num < 0) return num;
4455 	if (list_capture != 0 && num >= BIT_STATUS_BITS_NUM)
4456 	  return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
4457 
4458 	r = name_add(env->reg, name, name_end, num, env);
4459 	if (r != 0) return r;
4460 	*np = node_new_effect_memory(env->option, 1);
4461 	CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4462 	NEFFECT(*np).regnum = num;
4463 	if (list_capture != 0)
4464 	  BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
4465 	env->num_named++;
4466       }
4467 #endif
4468       else
4469 	return ONIGERR_UNDEFINED_GROUP_OPTION;
4470       break;
4471 
4472     case '@':
4473       if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) {
4474 #ifdef USE_NAMED_GROUP
4475 	if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4476 	  PFETCH(c);
4477 	  if (c == '<') {
4478 	    list_capture = 1;
4479 	    goto named_group; /* (?@<name>...) */
4480 	  }
4481 	  PUNFETCH;
4482 	}
4483 #endif
4484 	*np = node_new_effect_memory(env->option, 0);
4485 	CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4486 	num = scan_env_add_mem_entry(env);
4487 	if (num < 0) {
4488 	  onig_node_free(*np);
4489 	  return num;
4490 	}
4491 	else if (num >= BIT_STATUS_BITS_NUM) {
4492 	  onig_node_free(*np);
4493 	  return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
4494 	}
4495 	NEFFECT(*np).regnum = num;
4496 	BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
4497       }
4498       else {
4499 	return ONIGERR_UNDEFINED_GROUP_OPTION;
4500       }
4501       break;
4502 
4503 #ifdef USE_POSIXLINE_OPTION
4504     case 'p':
4505 #endif
4506     case '-': case 'i': case 'm': case 's': case 'x':
4507       {
4508 	int neg = 0;
4509 
4510 	while (1) {
4511 	  switch (c) {
4512 	  case ':':
4513 	  case ')':
4514 	  break;
4515 
4516 	  case '-':  neg = 1; break;
4517 	  case 'x':  ONOFF(option, ONIG_OPTION_EXTEND,     neg); break;
4518 	  case 'i':  ONOFF(option, ONIG_OPTION_IGNORECASE, neg); break;
4519 	  case 's':
4520 	    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
4521 	      ONOFF(option, ONIG_OPTION_MULTILINE,  neg);
4522 	    }
4523 	    else
4524 	      return ONIGERR_UNDEFINED_GROUP_OPTION;
4525 	    break;
4526 
4527 	  case 'm':
4528 	    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
4529 	      ONOFF(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 1 : 0));
4530 	    }
4531 	    else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) {
4532 	      ONOFF(option, ONIG_OPTION_MULTILINE,  neg);
4533 	    }
4534 	    else
4535 	      return ONIGERR_UNDEFINED_GROUP_OPTION;
4536 	    break;
4537 #ifdef USE_POSIXLINE_OPTION
4538 	  case 'p':
4539 	    ONOFF(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg);
4540 	    break;
4541 #endif
4542 	  default:
4543 	    return ONIGERR_UNDEFINED_GROUP_OPTION;
4544 	  }
4545 
4546 	  if (c == ')') {
4547 	    *np = node_new_option(option);
4548 	    CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4549 	    *src = p;
4550 	    return 2; /* option only */
4551 	  }
4552 	  else if (c == ':') {
4553 	    OnigOptionType prev = env->option;
4554 
4555 	    env->option     = option;
4556 	    r = fetch_token(tok, &p, end, env);
4557 	    if (r < 0) return r;
4558 	    r = parse_subexp(&target, tok, term, &p, end, env);
4559 	    env->option = prev;
4560 	    if (r < 0) return r;
4561 	    *np = node_new_option(option);
4562 	    CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4563 	    NEFFECT(*np).target = target;
4564 	    *src = p;
4565 	    return 0;
4566 	  }
4567 
4568 	  if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
4569 	  PFETCH(c);
4570 	}
4571       }
4572       break;
4573 
4574     default:
4575       return ONIGERR_UNDEFINED_GROUP_OPTION;
4576     }
4577   }
4578   else {
4579     if (ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_DONT_CAPTURE_GROUP))
4580       goto group;
4581 
4582     *np = node_new_effect_memory(env->option, 0);
4583     CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4584     num = scan_env_add_mem_entry(env);
4585     if (num < 0) return num;
4586     NEFFECT(*np).regnum = num;
4587   }
4588 
4589   CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4590   r = fetch_token(tok, &p, end, env);
4591   if (r < 0) return r;
4592   r = parse_subexp(&target, tok, term, &p, end, env);
4593   if (r < 0) return r;
4594 
4595   if (NTYPE(*np) == N_ANCHOR)
4596     NANCHOR(*np).target = target;
4597   else {
4598     NEFFECT(*np).target = target;
4599     if (NEFFECT(*np).type == EFFECT_MEMORY) {
4600       /* Don't move this to previous of parse_subexp() */
4601       r = scan_env_set_mem_node(env, NEFFECT(*np).regnum, *np);
4602       if (r != 0) return r;
4603     }
4604   }
4605 
4606   *src = p;
4607   return 0;
4608 }
4609 
/* Spellings of the "popular" quantifiers.  set_quantifier() indexes this
 * table with the value returned by popular_quantifier_num() when it builds
 * the nested-repeat warning text. */
static const char* PopularQStr[] = {
  "?",
  "*",
  "+",
  "??",
  "*?",
  "+?"
};
4613 
/* Replacement text reported by the nested-repeat warning in
 * set_quantifier(); indexed by an entry of ReduceTypeTable.
 * NOTE(review): the two leading empty strings presumably stand for RQ_ASIS
 * and RQ_DEL, which set_quantifier() handles before formatting -- confirm
 * against the enum definition. */
static const char* ReduceQStr[] = {
  "",
  "",
  "*",
  "*?",
  "??",
  "+ and ??",
  "+? and ?"
};
4617 
4618 static int
set_quantifier(Node * qnode,Node * target,int group,ScanEnv * env)4619 set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env)
4620 {
4621   QuantifierNode* qn;
4622 
4623   qn = &(NQUANTIFIER(qnode));
4624   if (qn->lower == 1 && qn->upper == 1) {
4625     return 1;
4626   }
4627 
4628   switch (NTYPE(target)) {
4629   case N_STRING:
4630     if (! group) {
4631       StrNode* sn = &(NSTRING(target));
4632       if (str_node_can_be_split(sn, env->enc)) {
4633 	Node* n = str_node_split_last_char(sn, env->enc);
4634 	if (IS_NOT_NULL(n)) {
4635 	  qn->target = n;
4636 	  return 2;
4637 	}
4638       }
4639     }
4640     break;
4641 
4642   case N_QUANTIFIER:
4643     { /* check redundant double repeat. */
4644       /* verbose warn (?:.?)? etc... but not warn (.?)? etc... */
4645       QuantifierNode* qnt = &(NQUANTIFIER(target));
4646       int nestq_num   = popular_quantifier_num(qn);
4647       int targetq_num = popular_quantifier_num(qnt);
4648 
4649 #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR
4650       if (!IS_QUANTIFIER_BY_NUMBER(qn) && !IS_QUANTIFIER_BY_NUMBER(qnt) &&
4651 	  IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) {
4652         UChar buf[WARN_BUFSIZE];
4653 
4654         switch(ReduceTypeTable[targetq_num][nestq_num]) {
4655         case RQ_ASIS:
4656           break;
4657 
4658         case RQ_DEL:
4659           if (onig_verb_warn != onig_null_warn) {
4660             onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4661                                  env->pattern, env->pattern_end,
4662                                  (UChar* )"redundant nested repeat operator");
4663             (*onig_verb_warn)((char* )buf);
4664           }
4665           goto warn_exit;
4666           break;
4667 
4668         default:
4669           if (onig_verb_warn != onig_null_warn) {
4670             onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4671                                        env->pattern, env->pattern_end,
4672             (UChar* )"nested repeat operator %s and %s was replaced with '%s'",
4673             PopularQStr[targetq_num], PopularQStr[nestq_num],
4674             ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]);
4675             (*onig_verb_warn)((char* )buf);
4676           }
4677           goto warn_exit;
4678           break;
4679         }
4680       }
4681 
4682     warn_exit:
4683 #endif
4684       if (targetq_num >= 0) {
4685 	if (nestq_num >= 0) {
4686 	  onig_reduce_nested_quantifier(qnode, target);
4687 	  goto q_exit;
4688 	}
4689 	else if (targetq_num == 1 || targetq_num == 2) { /* * or + */
4690 	  /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */
4691 	  if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) {
4692 	    qn->upper = (qn->lower == 0 ? 1 : qn->lower);
4693 	  }
4694 	}
4695       }
4696     }
4697     break;
4698 
4699   default:
4700     break;
4701   }
4702 
4703   qn->target = target;
4704  q_exit:
4705   return 0;
4706 }
4707 
4708 #ifdef USE_SHARED_CCLASS_TABLE
4709 
4710 #define THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS     8
4711 
4712 /* for ctype node hash table */
4713 
4714 typedef struct {
4715   OnigEncoding enc;
4716   int not;
4717   int type;
4718 } type_cclass_key;
4719 
/* Comparison callback for the shared ctype class table.
 * Returns 0 when all three key fields match and 1 otherwise. */
static int type_cclass_cmp(type_cclass_key* x, type_cclass_key* y)
{
  int same = (x->type == y->type) &&
             (x->enc  == y->enc)  &&
             (x->not  == y->not);

  return same ? 0 : 1;
}
4727 
/*
 * Hash callback for the shared ctype class table.
 *
 * Folds the raw bytes of key->enc and key->type into a multiplicative hash
 * and then mixes in key->not.  The accumulation is carried out in unsigned
 * arithmetic: multiplying by 997 for every byte of a pointer exceeds the
 * range of int, and signed overflow is undefined, whereas unsigned
 * arithmetic wraps.  The final mixing step is written so that the result
 * matches what wrapped signed arithmetic yields on two's-complement targets.
 */
static int type_cclass_hash(type_cclass_key* key)
{
  unsigned int i, acc;
  int val;
  const unsigned char *p;

  acc = 0;

  p = (const unsigned char* )&(key->enc);
  for (i = 0; i < sizeof(key->enc); i++) {
    acc = acc * 997u + (unsigned int )*p++;
  }

  p = (const unsigned char* )(&key->type);
  for (i = 0; i < sizeof(key->type); i++) {
    acc = acc * 997u + (unsigned int )*p++;
  }

  acc += (unsigned int )key->not;

  /* Reinterpret as int only for the shift, then add without overflow. */
  val = (int )acc;
  return (int )(acc + (unsigned int )(val >> 5));
}
4748 
4749 static struct st_hash_type type_type_cclass_hash = {
4750     type_cclass_cmp,
4751     type_cclass_hash,
4752 };
4753 
4754 static st_table* OnigTypeCClassTable;
4755 
4756 
4757 static int
i_free_shared_class(type_cclass_key * key,Node * node,void * arg)4758 i_free_shared_class(type_cclass_key* key, Node* node, void* arg)
4759 {
4760   if (IS_NOT_NULL(node)) {
4761     CClassNode* cc = &(NCCLASS(node));
4762     if (IS_NOT_NULL(cc->mbuf)) xfree(cc->mbuf);
4763     xfree(node);
4764   }
4765 
4766   if (IS_NOT_NULL(key)) xfree(key);
4767   return ST_DELETE;
4768 }
4769 
4770 extern int
onig_free_shared_cclass_table(void)4771 onig_free_shared_cclass_table(void)
4772 {
4773   if (IS_NOT_NULL(OnigTypeCClassTable)) {
4774     onig_st_foreach(OnigTypeCClassTable, i_free_shared_class, 0);
4775     onig_st_free_table(OnigTypeCClassTable);
4776     OnigTypeCClassTable = NULL;
4777   }
4778 
4779   return 0;
4780 }
4781 
4782 #endif /* USE_SHARED_CCLASS_TABLE */
4783 
4784 
4785 static int
parse_exp(Node ** np,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)4786 parse_exp(Node** np, OnigToken* tok, int term,
4787 	  UChar** src, UChar* end, ScanEnv* env)
4788 {
4789   int r, len, group = 0;
4790   Node* qn;
4791   Node** targetp;
4792 
4793   *np = NULL;
4794   if (tok->type == term)
4795     goto end_of_token;
4796 
4797   switch (tok->type) {
4798   case TK_ALT:
4799   case TK_EOT:
4800   end_of_token:
4801   *np = node_new_empty();
4802   return tok->type;
4803   break;
4804 
4805   case TK_SUBEXP_OPEN:
4806     r = parse_effect(np, tok, TK_SUBEXP_CLOSE, src, end, env);
4807     if (r < 0) return r;
4808     if (r == 1) group = 1;
4809     else if (r == 2) { /* option only */
4810       Node* target;
4811       OnigOptionType prev = env->option;
4812 
4813       env->option = NEFFECT(*np).option;
4814       r = fetch_token(tok, src, end, env);
4815       if (r < 0) return r;
4816       r = parse_subexp(&target, tok, term, src, end, env);
4817       env->option = prev;
4818       if (r < 0) return r;
4819       NEFFECT(*np).target = target;
4820       return tok->type;
4821     }
4822     break;
4823 
4824   case TK_SUBEXP_CLOSE:
4825     if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP))
4826       return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS;
4827 
4828     if (tok->escaped) goto tk_raw_byte;
4829     else goto tk_byte;
4830     break;
4831 
4832   case TK_STRING:
4833   tk_byte:
4834     {
4835       *np = node_new_str(tok->backp, *src);
4836       CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4837 
4838       while (1) {
4839 	r = fetch_token(tok, src, end, env);
4840 	if (r < 0) return r;
4841 	if (r != TK_STRING) break;
4842 
4843 	r = onig_node_str_cat(*np, tok->backp, *src);
4844 	if (r < 0) return r;
4845       }
4846 
4847     string_end:
4848       targetp = np;
4849       goto repeat;
4850     }
4851     break;
4852 
4853   case TK_RAW_BYTE:
4854   tk_raw_byte:
4855     {
4856       *np = node_new_str_char((UChar )tok->u.c);
4857       CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4858       len = 1;
4859       while (1) {
4860 	if (len >= ONIGENC_MBC_MINLEN(env->enc)) {
4861 	  if (len == enc_len(env->enc, NSTRING(*np).s)) {
4862 	    r = fetch_token(tok, src, end, env);
4863 	    goto string_end;
4864 	  }
4865 	}
4866 
4867 	r = fetch_token(tok, src, end, env);
4868 	if (r < 0) return r;
4869 	if (r != TK_RAW_BYTE) {
4870 #ifdef USE_PAD_TO_SHORT_BYTE_CHAR
4871 	  int rem;
4872 	  if (len < ONIGENC_MBC_MINLEN(env->enc)) {
4873 	    rem = ONIGENC_MBC_MINLEN(env->enc) - len;
4874 	    (void )node_str_head_pad(&NSTRING(*np), rem, (UChar )0);
4875 	    if (len + rem == enc_len(env->enc, NSTRING(*np).s)) {
4876 	      goto string_end;
4877 	    }
4878 	  }
4879 #endif
4880 	  return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4881 	}
4882 
4883 	r = node_str_cat_char(*np, (UChar )tok->u.c);
4884 	if (r < 0) return r;
4885 
4886 	len++;
4887       }
4888     }
4889     break;
4890 
4891   case TK_CODE_POINT:
4892     {
4893       UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4894       int num = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf);
4895       if (num < 0) return num;
4896 #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
4897       *np = node_new_str_raw(buf, buf + num);
4898 #else
4899       *np = node_new_str(buf, buf + num);
4900 #endif
4901       CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4902     }
4903     break;
4904 
4905   case TK_QUOTE_OPEN:
4906     {
4907       OnigCodePoint end_op[2];
4908       UChar *qstart, *qend, *nextp;
4909 
4910       end_op[0] = (OnigCodePoint )MC_ESC(env->enc);
4911       end_op[1] = (OnigCodePoint )'E';
4912       qstart = *src;
4913       qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc);
4914       if (IS_NULL(qend)) {
4915 	nextp = qend = end;
4916       }
4917       *np = node_new_str(qstart, qend);
4918       CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4919       *src = nextp;
4920     }
4921     break;
4922 
4923   case TK_CHAR_TYPE:
4924     {
4925       switch (tok->u.subtype) {
4926       case CTYPE_WORD:
4927       case CTYPE_NOT_WORD:
4928 	*np = node_new_ctype(tok->u.subtype);
4929 	CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4930 	break;
4931 
4932       case CTYPE_WHITE_SPACE:
4933       case CTYPE_NOT_WHITE_SPACE:
4934       case CTYPE_DIGIT:
4935       case CTYPE_NOT_DIGIT:
4936       case CTYPE_XDIGIT:
4937       case CTYPE_NOT_XDIGIT:
4938 	{
4939 	  CClassNode* cc;
4940 	  int ctype, not;
4941 
4942 #ifdef USE_SHARED_CCLASS_TABLE
4943           const OnigCodePoint *sbr, *mbr;
4944 
4945 	  ctype = parse_ctype_to_enc_ctype(tok->u.subtype, &not);
4946           r = ONIGENC_GET_CTYPE_CODE_RANGE(env->enc, ctype, &sbr, &mbr);
4947           if (r == 0 &&
4948               ONIGENC_CODE_RANGE_NUM(mbr)
4949               >= THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS) {
4950             type_cclass_key  key;
4951             type_cclass_key* new_key;
4952 
4953             key.enc  = env->enc;
4954             key.not  = not;
4955             key.type = ctype;
4956 
4957             THREAD_ATOMIC_START;
4958 
4959             if (IS_NULL(OnigTypeCClassTable)) {
4960               OnigTypeCClassTable
4961                 = onig_st_init_table_with_size(&type_type_cclass_hash, 10);
4962               if (IS_NULL(OnigTypeCClassTable)) {
4963                 THREAD_ATOMIC_END;
4964                 return ONIGERR_MEMORY;
4965               }
4966             }
4967             else {
4968               if (onig_st_lookup(OnigTypeCClassTable, (st_data_t )&key,
4969                                  (st_data_t* )np)) {
4970                 THREAD_ATOMIC_END;
4971                 break;
4972               }
4973             }
4974 
4975             *np = node_new_cclass_by_codepoint_range(not, sbr, mbr);
4976             if (IS_NULL(*np)) {
4977               THREAD_ATOMIC_END;
4978               return ONIGERR_MEMORY;
4979             }
4980 
4981             CCLASS_SET_SHARE(&(NCCLASS(*np)));
4982             new_key = (type_cclass_key* )xmalloc(sizeof(type_cclass_key));
4983             xmemcpy(new_key, &key, sizeof(type_cclass_key));
4984             onig_st_add_direct(OnigTypeCClassTable, (st_data_t )new_key,
4985                                (st_data_t )*np);
4986 
4987             THREAD_ATOMIC_END;
4988           }
4989           else {
4990 #endif
4991             ctype = parse_ctype_to_enc_ctype(tok->u.subtype, &not);
4992             *np = node_new_cclass();
4993             CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
4994             cc = &(NCCLASS(*np));
4995             add_ctype_to_cc(cc, ctype, 0, env);
4996             if (not != 0) CCLASS_SET_NOT(cc);
4997 #ifdef USE_SHARED_CCLASS_TABLE
4998           }
4999 #endif
5000 	}
5001 	break;
5002 
5003       default:
5004 	return ONIGERR_PARSER_BUG;
5005 	break;
5006       }
5007     }
5008     break;
5009 
5010   case TK_CHAR_PROPERTY:
5011     r = parse_char_property(np, tok, src, end, env);
5012     if (r != 0) return r;
5013     break;
5014 
5015   case TK_CC_OPEN:
5016     {
5017       CClassNode* cc;
5018 
5019       r = parse_char_class(np, tok, src, end, env);
5020       if (r != 0) return r;
5021 
5022       cc = &(NCCLASS(*np));
5023 
5024       if (IS_IGNORECASE(env->option)) {
5025         int i, n, in_cc;
5026         const OnigPairAmbigCodes* ccs;
5027         BitSetRef bs = cc->bs;
5028         OnigAmbigType amb;
5029 
5030         for (amb = 0x01; amb <= ONIGENC_AMBIGUOUS_MATCH_LIMIT; amb <<= 1) {
5031           if ((amb & env->ambig_flag) == 0)  continue;
5032 
5033           n = ONIGENC_GET_ALL_PAIR_AMBIG_CODES(env->enc, amb, &ccs);
5034           for (i = 0; i < n; i++) {
5035             in_cc = onig_is_code_in_cc(env->enc, ccs[i].from, cc);
5036 
5037             if ((in_cc != 0 && !IS_CCLASS_NOT(cc)) ||
5038                 (in_cc == 0 && IS_CCLASS_NOT(cc))) {
5039               if (ONIGENC_MBC_MINLEN(env->enc) > 1 ||
5040                   ccs[i].from >= SINGLE_BYTE_SIZE) {
5041                 /* if (cc->not) clear_not_flag_cclass(cc, env->enc); */
5042                 add_code_range(&(cc->mbuf), env, ccs[i].to, ccs[i].to);
5043               }
5044               else {
5045                 if (BITSET_AT(bs, ccs[i].from)) {
5046                   /* /(?i:[^A-C])/.match("a") ==> fail. */
5047                   BITSET_SET_BIT(bs, ccs[i].to);
5048                 }
5049                 if (BITSET_AT(bs, ccs[i].to)) {
5050                   BITSET_SET_BIT(bs, ccs[i].from);
5051                 }
5052               }
5053             }
5054           }
5055         }
5056       }
5057     }
5058     break;
5059 
5060   case TK_ANYCHAR:
5061     *np = node_new_anychar();
5062     CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
5063     break;
5064 
5065   case TK_ANYCHAR_ANYTIME:
5066     *np = node_new_anychar();
5067     CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
5068     qn = node_new_quantifier(0, REPEAT_INFINITE, 0);
5069     CHECK_NULL_RETURN_VAL(qn, ONIGERR_MEMORY);
5070     NQUANTIFIER(qn).target = *np;
5071     *np = qn;
5072     break;
5073 
5074   case TK_BACKREF:
5075     len = tok->u.backref.num;
5076     *np = node_new_backref(len,
5077 		   (len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)),
5078 			   tok->u.backref.by_name,
5079 #ifdef USE_BACKREF_AT_LEVEL
5080 			   tok->u.backref.exist_level,
5081 			   tok->u.backref.level,
5082 #endif
5083 			   env);
5084     CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
5085     break;
5086 
5087 #ifdef USE_SUBEXP_CALL
5088   case TK_CALL:
5089     *np = node_new_call(tok->u.call.name, tok->u.call.name_end);
5090     CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
5091     env->num_call++;
5092     break;
5093 #endif
5094 
5095   case TK_ANCHOR:
5096     *np = onig_node_new_anchor(tok->u.anchor);
5097     break;
5098 
5099   case TK_OP_REPEAT:
5100   case TK_INTERVAL:
5101     if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) {
5102       if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS))
5103 	return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED;
5104       else
5105 	*np = node_new_empty();
5106     }
5107     else {
5108       goto tk_byte;
5109     }
5110     break;
5111 
5112   default:
5113     return ONIGERR_PARSER_BUG;
5114     break;
5115   }
5116 
5117   {
5118     targetp = np;
5119 
5120   re_entry:
5121     r = fetch_token(tok, src, end, env);
5122     if (r < 0) return r;
5123 
5124   repeat:
5125     if (r == TK_OP_REPEAT || r == TK_INTERVAL) {
5126       if (is_invalid_quantifier_target(*targetp))
5127 	return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID;
5128 
5129       qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper,
5130 			      (r == TK_INTERVAL ? 1 : 0));
5131       CHECK_NULL_RETURN_VAL(qn, ONIGERR_MEMORY);
5132       NQUANTIFIER(qn).greedy = tok->u.repeat.greedy;
5133       r = set_quantifier(qn, *targetp, group, env);
5134       if (r < 0) return r;
5135 
5136       if (tok->u.repeat.possessive != 0) {
5137 	Node* en;
5138 	en = node_new_effect(EFFECT_STOP_BACKTRACK);
5139 	CHECK_NULL_RETURN_VAL(en, ONIGERR_MEMORY);
5140 	NEFFECT(en).target = qn;
5141 	qn = en;
5142       }
5143 
5144       if (r == 0) {
5145 	*targetp = qn;
5146       }
5147       else if (r == 2) { /* split case: /abc+/ */
5148 	Node *tmp;
5149 
5150 	*targetp = node_new_list(*targetp, NULL);
5151 	CHECK_NULL_RETURN_VAL(*targetp, ONIGERR_MEMORY);
5152 	tmp = NCONS(*targetp).right = node_new_list(qn, NULL);
5153 	CHECK_NULL_RETURN_VAL(tmp, ONIGERR_MEMORY);
5154 	targetp = &(NCONS(tmp).left);
5155       }
5156       goto re_entry;
5157     }
5158   }
5159 
5160   return r;
5161 }
5162 
5163 static int
parse_branch(Node ** top,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)5164 parse_branch(Node** top, OnigToken* tok, int term,
5165 	     UChar** src, UChar* end, ScanEnv* env)
5166 {
5167   int r;
5168   Node *node, **headp;
5169 
5170   *top = NULL;
5171   r = parse_exp(&node, tok, term, src, end, env);
5172   if (r < 0) return r;
5173 
5174   if (r == TK_EOT || r == term || r == TK_ALT) {
5175     *top = node;
5176   }
5177   else {
5178     *top  = node_new_list(node, NULL);
5179     headp = &(NCONS(*top).right);
5180     while (r != TK_EOT && r != term && r != TK_ALT) {
5181       r = parse_exp(&node, tok, term, src, end, env);
5182       if (r < 0) return r;
5183 
5184       if (NTYPE(node) == N_LIST) {
5185 	*headp = node;
5186 	while (IS_NOT_NULL(NCONS(node).right)) node = NCONS(node).right;
5187 	headp = &(NCONS(node).right);
5188       }
5189       else {
5190 	*headp = node_new_list(node, NULL);
5191 	headp = &(NCONS(*headp).right);
5192       }
5193     }
5194   }
5195 
5196   return r;
5197 }
5198 
5199 /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */
5200 static int
parse_subexp(Node ** top,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)5201 parse_subexp(Node** top, OnigToken* tok, int term,
5202 	     UChar** src, UChar* end, ScanEnv* env)
5203 {
5204   int r;
5205   Node *node, **headp;
5206 
5207   *top = NULL;
5208   r = parse_branch(&node, tok, term, src, end, env);
5209   if (r < 0) {
5210     onig_node_free(node);
5211     return r;
5212   }
5213 
5214   if (r == term) {
5215     *top = node;
5216   }
5217   else if (r == TK_ALT) {
5218     *top  = node_new_alt(node, NULL);
5219     headp = &(NCONS(*top).right);
5220     while (r == TK_ALT) {
5221       r = fetch_token(tok, src, end, env);
5222       if (r < 0) return r;
5223       r = parse_branch(&node, tok, term, src, end, env);
5224       if (r < 0) return r;
5225 
5226       *headp = node_new_alt(node, NULL);
5227       headp = &(NCONS(*headp).right);
5228     }
5229 
5230     if (tok->type != term)
5231       goto err;
5232   }
5233   else {
5234   err:
5235     if (term == TK_SUBEXP_CLOSE)
5236       return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
5237     else
5238       return ONIGERR_PARSER_BUG;
5239   }
5240 
5241   return r;
5242 }
5243 
5244 static int
parse_regexp(Node ** top,UChar ** src,UChar * end,ScanEnv * env)5245 parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env)
5246 {
5247   int r;
5248   OnigToken tok;
5249 
5250   r = fetch_token(&tok, src, end, env);
5251   if (r < 0) return r;
5252   r = parse_subexp(top, &tok, TK_EOT, src, end, env);
5253   if (r < 0) return r;
5254   return 0;
5255 }
5256 
5257 extern int
onig_parse_make_tree(Node ** root,const UChar * pattern,const UChar * end,regex_t * reg,ScanEnv * env)5258 onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end, regex_t* reg,
5259 		      ScanEnv* env)
5260 {
5261   int r;
5262   UChar* p;
5263 
5264 #ifdef USE_NAMED_GROUP
5265   names_clear(reg);
5266 #endif
5267 
5268   scan_env_clear(env);
5269   env->option      = reg->options;
5270   env->ambig_flag  = reg->ambig_flag;
5271   env->enc         = reg->enc;
5272   env->syntax      = reg->syntax;
5273   env->pattern     = (UChar* )pattern;
5274   env->pattern_end = (UChar* )end;
5275   env->reg         = reg;
5276 
5277   *root = NULL;
5278   p = (UChar* )pattern;
5279   r = parse_regexp(root, &p, (UChar* )end, env);
5280   reg->num_mem = env->num_mem;
5281   return r;
5282 }
5283 
5284 extern void
onig_scan_env_set_error_string(ScanEnv * env,int ecode,UChar * arg,UChar * arg_end)5285 onig_scan_env_set_error_string(ScanEnv* env, int ecode,
5286 				UChar* arg, UChar* arg_end)
5287 {
5288   env->error     = arg;
5289   env->error_end = arg_end;
5290 }
5291