xref: /PHP-5.6/ext/mbstring/oniguruma/regparse.c (revision c95daa9c)
1 /**********************************************************************
2   regparse.c -  Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2008  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "regparse.h"
31 #include "st.h"
32 
/* Size constant for warning buffers (its users are outside this view). */
#define WARN_BUFSIZE    256

/* Build-time switch; presumably selects case-fold handling inside negated
 * character classes -- confirm at the #ifdef sites. */
#define CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
36 
37 
38 OnigSyntaxType OnigSyntaxRuby = {
39   (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
40      ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
41      ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS |
42      ONIG_SYN_OP_ESC_C_CONTROL )
43    & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
44   , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
45       ONIG_SYN_OP2_OPTION_RUBY |
46       ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
47       ONIG_SYN_OP2_ESC_G_SUBEXP_CALL |
48       ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY  |
49       ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
50       ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
51       ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
52       ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
53       ONIG_SYN_OP2_ESC_H_XDIGIT )
54   , ( SYN_GNU_REGEX_BV |
55       ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
56       ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
57       ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
58       ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
59       ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY |
60       ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
61       ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
62   , ONIG_OPTION_NONE
63   ,
64   {
65       (OnigCodePoint )'\\'                       /* esc */
66     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.'  */
67     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*'  */
68     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
69     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
70     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
71   }
72 };
73 
74 OnigSyntaxType*  OnigDefaultSyntax = ONIG_SYNTAX_RUBY;
75 
/* Warning sink that ignores its message; used as the default handler. */
extern void
onig_null_warn(const char* s ARG_UNUSED)
{
}
77 
78 #ifdef DEFAULT_WARN_FUNCTION
79 static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION;
80 #else
81 static OnigWarnFunc onig_warn = onig_null_warn;
82 #endif
83 
84 #ifdef DEFAULT_VERB_WARN_FUNCTION
85 static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION;
86 #else
87 static OnigWarnFunc onig_verb_warn = onig_null_warn;
88 #endif
89 
/*
 * Install the handler used for ordinary warnings.
 *
 * f: new handler.  A NULL handler is mapped to onig_null_warn so that the
 *    stored pointer is always callable.
 */
extern void
onig_set_warn_func(OnigWarnFunc f)
{
  onig_warn = IS_NOT_NULL(f) ? f : onig_null_warn;
}
94 
/*
 * Install the handler used for verbose warnings.
 *
 * f: new handler.  A NULL handler is mapped to onig_null_warn so that the
 *    stored pointer is always callable.
 */
extern void
onig_set_verb_warn_func(OnigWarnFunc f)
{
  onig_verb_warn = IS_NOT_NULL(f) ? f : onig_null_warn;
}
99 
100 static void
bbuf_free(BBuf * bbuf)101 bbuf_free(BBuf* bbuf)
102 {
103   if (IS_NOT_NULL(bbuf)) {
104     if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p);
105     xfree(bbuf);
106   }
107 }
108 
109 static int
bbuf_clone(BBuf ** rto,BBuf * from)110 bbuf_clone(BBuf** rto, BBuf* from)
111 {
112   int r;
113   BBuf *to;
114 
115   *rto = to = (BBuf* )xmalloc(sizeof(BBuf));
116   CHECK_NULL_RETURN_MEMERR(to);
117   r = BBUF_INIT(to, from->alloc);
118   if (r != 0) return r;
119   to->used = from->used;
120   xmemcpy(to->p, from->p, from->used);
121   return 0;
122 }
123 
/* Turn a relative group number into an absolute one, counted from the
 * next group to be created (env->num_mem + 1). */
#define BACKREF_REL_TO_ABS(rel_no, env) \
  ((env)->num_mem + 1 + (rel_no))
126 
/* Clear flag bits `f` in `v` when `negative` is true, otherwise set them.
 * The whole expansion is parenthesized so it is safe inside expressions. */
#define ONOFF(v,f,negative)    ((negative) ? ((v) &= ~(f)) : ((v) |= (f)))
128 
/* First code point of the multibyte range: 0 when the encoding's minimum
 * character length exceeds one byte, 0x80 otherwise. */
#define MBCODE_START_POS(enc) \
  (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 0 : 0x80)

/* Add the range [MBCODE_START_POS(enc), max code point] to `pbuf`. */
#define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \
  add_code_range_to_buf(pbuf, MBCODE_START_POS(enc), ~((OnigCodePoint )0))

/* Statement form for non-single-byte encodings.  Uses the caller's local
 * `r` and returns from the enclosing function when `r` is non-zero. */
#define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\
  if (! ONIGENC_IS_SINGLEBYTE(enc)) {\
    r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\
    if (r) return r;\
  }\
} while (0)
141 
142 
/* Store 1 in `empty` if every word of bit set `bs` is zero, else 0.
 * The loop counter has a macro-private name so that it cannot capture an
 * identifier used in the arguments. */
#define BITSET_IS_EMPTY(bs,empty) do {\
  int bitset_is_empty_i_;\
  (empty) = 1;\
  for (bitset_is_empty_i_ = 0; bitset_is_empty_i_ < (int )BITSET_SIZE;\
       bitset_is_empty_i_++) {\
    if ((bs)[bitset_is_empty_i_] != 0) {\
      (empty) = 0; break;\
    }\
  }\
} while (0)
152 
153 static void
bitset_set_range(BitSetRef bs,int from,int to)154 bitset_set_range(BitSetRef bs, int from, int to)
155 {
156   int i;
157   for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) {
158     BITSET_SET_BIT(bs, i);
159   }
160 }
161 
#if 0
/* Disabled: would set every bit of `bs`. */
static void
bitset_set_all(BitSetRef bs)
{
  int w = 0;

  while (w < BITSET_SIZE) {
    bs[w] = ~((Bits )0);
    w++;
  }
}
#endif
170 
171 static void
bitset_invert(BitSetRef bs)172 bitset_invert(BitSetRef bs)
173 {
174   int i;
175   for (i = 0; i < (int )BITSET_SIZE; i++) { bs[i] = ~(bs[i]); }
176 }
177 
178 static void
bitset_invert_to(BitSetRef from,BitSetRef to)179 bitset_invert_to(BitSetRef from, BitSetRef to)
180 {
181   int i;
182   for (i = 0; i < (int )BITSET_SIZE; i++) { to[i] = ~(from[i]); }
183 }
184 
185 static void
bitset_and(BitSetRef dest,BitSetRef bs)186 bitset_and(BitSetRef dest, BitSetRef bs)
187 {
188   int i;
189   for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] &= bs[i]; }
190 }
191 
192 static void
bitset_or(BitSetRef dest,BitSetRef bs)193 bitset_or(BitSetRef dest, BitSetRef bs)
194 {
195   int i;
196   for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] |= bs[i]; }
197 }
198 
199 static void
bitset_copy(BitSetRef dest,BitSetRef bs)200 bitset_copy(BitSetRef dest, BitSetRef bs)
201 {
202   int i;
203   for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] = bs[i]; }
204 }
205 
206 extern int
onig_strncmp(const UChar * s1,const UChar * s2,int n)207 onig_strncmp(const UChar* s1, const UChar* s2, int n)
208 {
209   int x;
210 
211   while (n-- > 0) {
212     x = *s2++ - *s1++;
213     if (x) return x;
214   }
215   return 0;
216 }
217 
218 extern void
onig_strcpy(UChar * dest,const UChar * src,const UChar * end)219 onig_strcpy(UChar* dest, const UChar* src, const UChar* end)
220 {
221   int len = end - src;
222   if (len > 0) {
223     xmemcpy(dest, src, len);
224     dest[len] = (UChar )0;
225   }
226 }
227 
228 #ifdef USE_NAMED_GROUP
229 static UChar*
strdup_with_null(OnigEncoding enc,UChar * s,UChar * end)230 strdup_with_null(OnigEncoding enc, UChar* s, UChar* end)
231 {
232   int slen, term_len, i;
233   UChar *r;
234 
235   slen = end - s;
236   term_len = ONIGENC_MBC_MINLEN(enc);
237 
238   r = (UChar* )xmalloc(slen + term_len);
239   CHECK_NULL_RETURN(r);
240   xmemcpy(r, s, slen);
241 
242   for (i = 0; i < term_len; i++)
243     r[slen + i] = (UChar )0;
244 
245   return r;
246 }
247 #endif
248 
249 /* scan pattern methods */
/*
 * Pattern-scanning helpers.  They operate on the caller's locals `p`
 * (cursor), `end` (end of pattern) and `enc` (encoding); the variants
 * without the _S suffix also maintain `pfetch_prev`, declared by
 * PFETCH_READY, so that PUNFETCH can step back one character.
 * Every macro that advances `p` clamps it to `end`.
 */
#define PEND_VALUE   0

#define PFETCH_READY  UChar* pfetch_prev
#define PEND         (p < end ?  0 : 1)
#define PUNFETCH     p = pfetch_prev
#define PINC       do { \
  pfetch_prev = p; \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
  if(UNEXPECTED(p > end)) p = end; \
} while (0)
#define PFETCH(c)  do { \
  c = ONIGENC_MBC_TO_CODE(enc, p, end); \
  pfetch_prev = p; \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
  if(UNEXPECTED(p > end)) p = end; \
} while (0)

#define PINC_S     do { \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
  if(UNEXPECTED(p > end)) p = end; \
} while (0)
#define PFETCH_S(c) do { \
  c = ONIGENC_MBC_TO_CODE(enc, p, end); \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
  if(UNEXPECTED(p > end)) p = end; \
} while (0)

/* Look at the next code point without consuming it (PEND_VALUE at end). */
#define PPEEK        (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)
#define PPEEK_IS(c)  (PPEEK == (OnigCodePoint )(c))
278 
279 static UChar*
strcat_capa(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa)280 strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end,
281 	      int capa)
282 {
283   UChar* r;
284 
285   if (dest)
286     r = (UChar* )xrealloc(dest, capa + 1);
287   else
288     r = (UChar* )xmalloc(capa + 1);
289 
290   CHECK_NULL_RETURN(r);
291   onig_strcpy(r + (dest_end - dest), src, src_end);
292   return r;
293 }
294 
295 /* dest on static area */
296 static UChar*
strcat_capa_from_static(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa)297 strcat_capa_from_static(UChar* dest, UChar* dest_end,
298 			const UChar* src, const UChar* src_end, int capa)
299 {
300   UChar* r;
301 
302   r = (UChar* )xmalloc(capa + 1);
303   CHECK_NULL_RETURN(r);
304   onig_strcpy(r, dest, dest_end);
305   onig_strcpy(r + (dest_end - dest), src, src_end);
306   return r;
307 }
308 
309 
310 #ifdef USE_ST_LIBRARY
311 
312 typedef struct {
313   UChar* s;
314   UChar* end;
315 } st_str_end_key;
316 
317 static int
str_end_cmp(st_str_end_key * x,st_str_end_key * y)318 str_end_cmp(st_str_end_key* x, st_str_end_key* y)
319 {
320   UChar *p, *q;
321   int c;
322 
323   if ((x->end - x->s) != (y->end - y->s))
324     return 1;
325 
326   p = x->s;
327   q = y->s;
328   while (p < x->end) {
329     c = (int )*p - (int )*q;
330     if (c != 0) return c;
331 
332     p++; q++;
333   }
334 
335   return 0;
336 }
337 
338 static int
str_end_hash(st_str_end_key * x)339 str_end_hash(st_str_end_key* x)
340 {
341   UChar *p;
342   int val = 0;
343 
344   p = x->s;
345   while (p < x->end) {
346     val = val * 997 + (int )*p++;
347   }
348 
349   return val + (val >> 5);
350 }
351 
352 extern hash_table_type*
onig_st_init_strend_table_with_size(int size)353 onig_st_init_strend_table_with_size(int size)
354 {
355   static struct st_hash_type hashType = {
356     str_end_cmp,
357     str_end_hash,
358   };
359 
360   return (hash_table_type* )
361            onig_st_init_table_with_size(&hashType, size);
362 }
363 
364 extern int
onig_st_lookup_strend(hash_table_type * table,const UChar * str_key,const UChar * end_key,hash_data_type * value)365 onig_st_lookup_strend(hash_table_type* table, const UChar* str_key,
366 		      const UChar* end_key, hash_data_type *value)
367 {
368   st_str_end_key key;
369 
370   key.s   = (UChar* )str_key;
371   key.end = (UChar* )end_key;
372 
373   return onig_st_lookup(table, (st_data_t )(&key), value);
374 }
375 
376 extern int
onig_st_insert_strend(hash_table_type * table,const UChar * str_key,const UChar * end_key,hash_data_type value)377 onig_st_insert_strend(hash_table_type* table, const UChar* str_key,
378 		      const UChar* end_key, hash_data_type value)
379 {
380   st_str_end_key* key;
381   int result;
382 
383   key = (st_str_end_key* )xmalloc(sizeof(st_str_end_key));
384   key->s   = (UChar* )str_key;
385   key->end = (UChar* )end_key;
386   result = onig_st_insert(table, (st_data_t )key, value);
387   if (result) {
388     xfree(key);
389   }
390   return result;
391 }
392 
393 #endif /* USE_ST_LIBRARY */
394 
395 
396 #ifdef USE_NAMED_GROUP
397 
398 #define INIT_NAME_BACKREFS_ALLOC_NUM   8
399 
400 typedef struct {
401   UChar* name;
402   int    name_len;   /* byte length */
403   int    back_num;   /* number of backrefs */
404   int    back_alloc;
405   int    back_ref1;
406   int*   back_refs;
407 } NameEntry;
408 
409 #ifdef USE_ST_LIBRARY
410 
411 typedef st_table  NameTable;
412 typedef st_data_t HashDataType;   /* 1.6 st.h doesn't define st_data_t type */
413 
414 #define NAMEBUF_SIZE    24
415 #define NAMEBUF_SIZE_1  25
416 
417 #ifdef ONIG_DEBUG
418 static int
i_print_name_entry(UChar * key,NameEntry * e,void * arg)419 i_print_name_entry(UChar* key, NameEntry* e, void* arg)
420 {
421   int i;
422   FILE* fp = (FILE* )arg;
423 
424   fprintf(fp, "%s: ", e->name);
425   if (e->back_num == 0)
426     fputs("-", fp);
427   else if (e->back_num == 1)
428     fprintf(fp, "%d", e->back_ref1);
429   else {
430     for (i = 0; i < e->back_num; i++) {
431       if (i > 0) fprintf(fp, ", ");
432       fprintf(fp, "%d", e->back_refs[i]);
433     }
434   }
435   fputs("\n", fp);
436   return ST_CONTINUE;
437 }
438 
439 extern int
onig_print_names(FILE * fp,regex_t * reg)440 onig_print_names(FILE* fp, regex_t* reg)
441 {
442   NameTable* t = (NameTable* )reg->name_table;
443 
444   if (IS_NOT_NULL(t)) {
445     fprintf(fp, "name table\n");
446     onig_st_foreach(t, i_print_name_entry, (HashDataType )fp);
447     fputs("\n", fp);
448   }
449   return 0;
450 }
451 #endif /* ONIG_DEBUG */
452 
453 static int
i_free_name_entry(UChar * key,NameEntry * e,void * arg ARG_UNUSED)454 i_free_name_entry(UChar* key, NameEntry* e, void* arg ARG_UNUSED)
455 {
456   xfree(e->name);
457   if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
458   xfree(key);
459   xfree(e);
460   return ST_DELETE;
461 }
462 
463 static int
names_clear(regex_t * reg)464 names_clear(regex_t* reg)
465 {
466   NameTable* t = (NameTable* )reg->name_table;
467 
468   if (IS_NOT_NULL(t)) {
469     onig_st_foreach(t, i_free_name_entry, 0);
470   }
471   return 0;
472 }
473 
474 extern int
onig_names_free(regex_t * reg)475 onig_names_free(regex_t* reg)
476 {
477   int r;
478   NameTable* t;
479 
480   r = names_clear(reg);
481   if (r) return r;
482 
483   t = (NameTable* )reg->name_table;
484   if (IS_NOT_NULL(t)) onig_st_free_table(t);
485   reg->name_table = (void* )NULL;
486   return 0;
487 }
488 
489 static NameEntry*
name_find(regex_t * reg,const UChar * name,const UChar * name_end)490 name_find(regex_t* reg, const UChar* name, const UChar* name_end)
491 {
492   NameEntry* e;
493   NameTable* t = (NameTable* )reg->name_table;
494 
495   e = (NameEntry* )NULL;
496   if (IS_NOT_NULL(t)) {
497     onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e)));
498   }
499   return e;
500 }
501 
502 typedef struct {
503   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*);
504   regex_t* reg;
505   void* arg;
506   int ret;
507   OnigEncoding enc;
508 } INamesArg;
509 
510 static int
i_names(UChar * key ARG_UNUSED,NameEntry * e,INamesArg * arg)511 i_names(UChar* key ARG_UNUSED, NameEntry* e, INamesArg* arg)
512 {
513   int r = (*(arg->func))(e->name,
514                          e->name + e->name_len,
515                          e->back_num,
516 			 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
517 			 arg->reg, arg->arg);
518   if (r != 0) {
519     arg->ret = r;
520     return ST_STOP;
521   }
522   return ST_CONTINUE;
523 }
524 
525 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)526 onig_foreach_name(regex_t* reg,
527   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
528 {
529   INamesArg narg;
530   NameTable* t = (NameTable* )reg->name_table;
531 
532   narg.ret = 0;
533   if (IS_NOT_NULL(t)) {
534     narg.func = func;
535     narg.reg  = reg;
536     narg.arg  = arg;
537     narg.enc  = reg->enc; /* should be pattern encoding. */
538     onig_st_foreach(t, i_names, (HashDataType )&narg);
539   }
540   return narg.ret;
541 }
542 
543 static int
i_renumber_name(UChar * key ARG_UNUSED,NameEntry * e,GroupNumRemap * map)544 i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumRemap* map)
545 {
546   int i;
547 
548   if (e->back_num > 1) {
549     for (i = 0; i < e->back_num; i++) {
550       e->back_refs[i] = map[e->back_refs[i]].new_val;
551     }
552   }
553   else if (e->back_num == 1) {
554     e->back_ref1 = map[e->back_ref1].new_val;
555   }
556 
557   return ST_CONTINUE;
558 }
559 
560 extern int
onig_renumber_name_table(regex_t * reg,GroupNumRemap * map)561 onig_renumber_name_table(regex_t* reg, GroupNumRemap* map)
562 {
563   NameTable* t = (NameTable* )reg->name_table;
564 
565   if (IS_NOT_NULL(t)) {
566     onig_st_foreach(t, i_renumber_name, (HashDataType )map);
567   }
568   return 0;
569 }
570 
571 
572 extern int
onig_number_of_names(regex_t * reg)573 onig_number_of_names(regex_t* reg)
574 {
575   NameTable* t = (NameTable* )reg->name_table;
576 
577   if (IS_NOT_NULL(t))
578     return t->num_entries;
579   else
580     return 0;
581 }
582 
583 #else  /* USE_ST_LIBRARY */
584 
585 #define INIT_NAMES_ALLOC_NUM    8
586 
587 typedef struct {
588   NameEntry* e;
589   int        num;
590   int        alloc;
591 } NameTable;
592 
593 #ifdef ONIG_DEBUG
594 extern int
onig_print_names(FILE * fp,regex_t * reg)595 onig_print_names(FILE* fp, regex_t* reg)
596 {
597   int i, j;
598   NameEntry* e;
599   NameTable* t = (NameTable* )reg->name_table;
600 
601   if (IS_NOT_NULL(t) && t->num > 0) {
602     fprintf(fp, "name table\n");
603     for (i = 0; i < t->num; i++) {
604       e = &(t->e[i]);
605       fprintf(fp, "%s: ", e->name);
606       if (e->back_num == 0) {
607 	fputs("-", fp);
608       }
609       else if (e->back_num == 1) {
610 	fprintf(fp, "%d", e->back_ref1);
611       }
612       else {
613 	for (j = 0; j < e->back_num; j++) {
614 	  if (j > 0) fprintf(fp, ", ");
615 	  fprintf(fp, "%d", e->back_refs[j]);
616 	}
617       }
618       fputs("\n", fp);
619     }
620     fputs("\n", fp);
621   }
622   return 0;
623 }
624 #endif
625 
626 static int
names_clear(regex_t * reg)627 names_clear(regex_t* reg)
628 {
629   int i;
630   NameEntry* e;
631   NameTable* t = (NameTable* )reg->name_table;
632 
633   if (IS_NOT_NULL(t)) {
634     for (i = 0; i < t->num; i++) {
635       e = &(t->e[i]);
636       if (IS_NOT_NULL(e->name)) {
637 	xfree(e->name);
638 	e->name       = NULL;
639 	e->name_len   = 0;
640 	e->back_num   = 0;
641 	e->back_alloc = 0;
642 	if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
643 	e->back_refs = (int* )NULL;
644       }
645     }
646     if (IS_NOT_NULL(t->e)) {
647       xfree(t->e);
648       t->e = NULL;
649     }
650     t->num = 0;
651   }
652   return 0;
653 }
654 
655 extern int
onig_names_free(regex_t * reg)656 onig_names_free(regex_t* reg)
657 {
658   int r;
659   NameTable* t;
660 
661   r = names_clear(reg);
662   if (r) return r;
663 
664   t = (NameTable* )reg->name_table;
665   if (IS_NOT_NULL(t)) xfree(t);
666   reg->name_table = NULL;
667   return 0;
668 }
669 
670 static NameEntry*
name_find(regex_t * reg,UChar * name,UChar * name_end)671 name_find(regex_t* reg, UChar* name, UChar* name_end)
672 {
673   int i, len;
674   NameEntry* e;
675   NameTable* t = (NameTable* )reg->name_table;
676 
677   if (IS_NOT_NULL(t)) {
678     len = name_end - name;
679     for (i = 0; i < t->num; i++) {
680       e = &(t->e[i]);
681       if (len == e->name_len && onig_strncmp(name, e->name, len) == 0)
682 	return e;
683     }
684   }
685   return (NameEntry* )NULL;
686 }
687 
688 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)689 onig_foreach_name(regex_t* reg,
690   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
691 {
692   int i, r;
693   NameEntry* e;
694   NameTable* t = (NameTable* )reg->name_table;
695 
696   if (IS_NOT_NULL(t)) {
697     for (i = 0; i < t->num; i++) {
698       e = &(t->e[i]);
699       r = (*func)(e->name, e->name + e->name_len, e->back_num,
700 		  (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
701 		  reg, arg);
702       if (r != 0) return r;
703     }
704   }
705   return 0;
706 }
707 
708 extern int
onig_number_of_names(regex_t * reg)709 onig_number_of_names(regex_t* reg)
710 {
711   NameTable* t = (NameTable* )reg->name_table;
712 
713   if (IS_NOT_NULL(t))
714     return t->num;
715   else
716     return 0;
717 }
718 
719 #endif /* else USE_ST_LIBRARY */
720 
721 static int
name_add(regex_t * reg,UChar * name,UChar * name_end,int backref,ScanEnv * env)722 name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env)
723 {
724   int alloc;
725   NameEntry* e;
726   NameTable* t = (NameTable* )reg->name_table;
727 
728   if (name_end - name <= 0)
729     return ONIGERR_EMPTY_GROUP_NAME;
730 
731   e = name_find(reg, name, name_end);
732   if (IS_NULL(e)) {
733 #ifdef USE_ST_LIBRARY
734     if (IS_NULL(t)) {
735       t = onig_st_init_strend_table_with_size(5);
736       reg->name_table = (void* )t;
737     }
738     e = (NameEntry* )xmalloc(sizeof(NameEntry));
739     CHECK_NULL_RETURN_MEMERR(e);
740 
741     e->name = strdup_with_null(reg->enc, name, name_end);
742     if (IS_NULL(e->name)) {
743       xfree(e);  return ONIGERR_MEMORY;
744     }
745     onig_st_insert_strend(t, e->name, (e->name + (name_end - name)),
746                           (HashDataType )e);
747 
748     e->name_len   = name_end - name;
749     e->back_num   = 0;
750     e->back_alloc = 0;
751     e->back_refs  = (int* )NULL;
752 
753 #else
754 
755     if (IS_NULL(t)) {
756       alloc = INIT_NAMES_ALLOC_NUM;
757       t = (NameTable* )xmalloc(sizeof(NameTable));
758       CHECK_NULL_RETURN_MEMERR(t);
759       t->e     = NULL;
760       t->alloc = 0;
761       t->num   = 0;
762 
763       t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc);
764       if (IS_NULL(t->e)) {
765 	xfree(t);
766 	return ONIGERR_MEMORY;
767       }
768       t->alloc = alloc;
769       reg->name_table = t;
770       goto clear;
771     }
772     else if (t->num == t->alloc) {
773       int i;
774 
775       alloc = t->alloc * 2;
776       t->e = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc);
777       CHECK_NULL_RETURN_MEMERR(t->e);
778       t->alloc = alloc;
779 
780     clear:
781       for (i = t->num; i < t->alloc; i++) {
782 	t->e[i].name       = NULL;
783 	t->e[i].name_len   = 0;
784 	t->e[i].back_num   = 0;
785 	t->e[i].back_alloc = 0;
786 	t->e[i].back_refs  = (int* )NULL;
787       }
788     }
789     e = &(t->e[t->num]);
790     t->num++;
791     e->name = strdup_with_null(reg->enc, name, name_end);
792     if (IS_NULL(e->name)) return ONIGERR_MEMORY;
793     e->name_len = name_end - name;
794 #endif
795   }
796 
797   if (e->back_num >= 1 &&
798       ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) {
799     onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME,
800 				    name, name_end);
801     return ONIGERR_MULTIPLEX_DEFINED_NAME;
802   }
803 
804   e->back_num++;
805   if (e->back_num == 1) {
806     e->back_ref1 = backref;
807   }
808   else {
809     if (e->back_num == 2) {
810       alloc = INIT_NAME_BACKREFS_ALLOC_NUM;
811       e->back_refs = (int* )xmalloc(sizeof(int) * alloc);
812       CHECK_NULL_RETURN_MEMERR(e->back_refs);
813       e->back_alloc = alloc;
814       e->back_refs[0] = e->back_ref1;
815       e->back_refs[1] = backref;
816     }
817     else {
818       if (e->back_num > e->back_alloc) {
819 	alloc = e->back_alloc * 2;
820 	e->back_refs = (int* )xrealloc(e->back_refs, sizeof(int) * alloc);
821 	CHECK_NULL_RETURN_MEMERR(e->back_refs);
822 	e->back_alloc = alloc;
823       }
824       e->back_refs[e->back_num - 1] = backref;
825     }
826   }
827 
828   return 0;
829 }
830 
831 extern int
onig_name_to_group_numbers(regex_t * reg,const UChar * name,const UChar * name_end,int ** nums)832 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
833 			   const UChar* name_end, int** nums)
834 {
835   NameEntry* e = name_find(reg, name, name_end);
836 
837   if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE;
838 
839   switch (e->back_num) {
840   case 0:
841     break;
842   case 1:
843     *nums = &(e->back_ref1);
844     break;
845   default:
846     *nums = e->back_refs;
847     break;
848   }
849   return e->back_num;
850 }
851 
852 extern int
onig_name_to_backref_number(regex_t * reg,const UChar * name,const UChar * name_end,OnigRegion * region)853 onig_name_to_backref_number(regex_t* reg, const UChar* name,
854 			    const UChar* name_end, OnigRegion *region)
855 {
856   int i, n, *nums;
857 
858   n = onig_name_to_group_numbers(reg, name, name_end, &nums);
859   if (n < 0)
860     return n;
861   else if (n == 0)
862     return ONIGERR_PARSER_BUG;
863   else if (n == 1)
864     return nums[0];
865   else {
866     if (IS_NOT_NULL(region)) {
867       for (i = n - 1; i >= 0; i--) {
868 	if (region->beg[nums[i]] != ONIG_REGION_NOTPOS)
869 	  return nums[i];
870       }
871     }
872     return nums[n - 1];
873   }
874 }
875 
876 #else /* USE_NAMED_GROUP */
877 
878 extern int
onig_name_to_group_numbers(regex_t * reg,const UChar * name,const UChar * name_end,int ** nums)879 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
880 			   const UChar* name_end, int** nums)
881 {
882   return ONIG_NO_SUPPORT_CONFIG;
883 }
884 
885 extern int
onig_name_to_backref_number(regex_t * reg,const UChar * name,const UChar * name_end,OnigRegion * region)886 onig_name_to_backref_number(regex_t* reg, const UChar* name,
887 			    const UChar* name_end, OnigRegion* region)
888 {
889   return ONIG_NO_SUPPORT_CONFIG;
890 }
891 
892 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)893 onig_foreach_name(regex_t* reg,
894   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
895 {
896   return ONIG_NO_SUPPORT_CONFIG;
897 }
898 
899 extern int
onig_number_of_names(regex_t * reg)900 onig_number_of_names(regex_t* reg)
901 {
902   return 0;
903 }
904 #endif /* else USE_NAMED_GROUP */
905 
906 extern int
onig_noname_group_capture_is_active(regex_t * reg)907 onig_noname_group_capture_is_active(regex_t* reg)
908 {
909   if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_DONT_CAPTURE_GROUP))
910     return 0;
911 
912 #ifdef USE_NAMED_GROUP
913   if (onig_number_of_names(reg) > 0 &&
914       IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
915       !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
916     return 0;
917   }
918 #endif
919 
920   return 1;
921 }
922 
923 
/* Capacity of the first heap-allocated group-node array of a ScanEnv. */
#define INIT_SCANENV_MEMNODES_ALLOC_SIZE   16
925 
926 static void
scan_env_clear(ScanEnv * env)927 scan_env_clear(ScanEnv* env)
928 {
929   int i;
930 
931   BIT_STATUS_CLEAR(env->capture_history);
932   BIT_STATUS_CLEAR(env->bt_mem_start);
933   BIT_STATUS_CLEAR(env->bt_mem_end);
934   BIT_STATUS_CLEAR(env->backrefed_mem);
935   env->error      = (UChar* )NULL;
936   env->error_end  = (UChar* )NULL;
937   env->num_call   = 0;
938   env->num_mem    = 0;
939 #ifdef USE_NAMED_GROUP
940   env->num_named  = 0;
941 #endif
942   env->mem_alloc         = 0;
943   env->mem_nodes_dynamic = (Node** )NULL;
944 
945   for (i = 0; i < SCANENV_MEMNODES_SIZE; i++)
946     env->mem_nodes_static[i] = NULL_NODE;
947 
948 #ifdef USE_COMBINATION_EXPLOSION_CHECK
949   env->num_comb_exp_check  = 0;
950   env->comb_exp_max_regnum = 0;
951   env->curr_max_regnum     = 0;
952   env->has_recursion       = 0;
953 #endif
954 }
955 
956 static int
scan_env_add_mem_entry(ScanEnv * env)957 scan_env_add_mem_entry(ScanEnv* env)
958 {
959   int i, need, alloc;
960   Node** p;
961 
962   need = env->num_mem + 1;
963   if (need >= SCANENV_MEMNODES_SIZE) {
964     if (env->mem_alloc <= need) {
965       if (IS_NULL(env->mem_nodes_dynamic)) {
966 	alloc = INIT_SCANENV_MEMNODES_ALLOC_SIZE;
967 	p = (Node** )xmalloc(sizeof(Node*) * alloc);
968 	xmemcpy(p, env->mem_nodes_static,
969 		sizeof(Node*) * SCANENV_MEMNODES_SIZE);
970       }
971       else {
972 	alloc = env->mem_alloc * 2;
973 	p = (Node** )xrealloc(env->mem_nodes_dynamic, sizeof(Node*) * alloc);
974       }
975       CHECK_NULL_RETURN_MEMERR(p);
976 
977       for (i = env->num_mem + 1; i < alloc; i++)
978 	p[i] = NULL_NODE;
979 
980       env->mem_nodes_dynamic = p;
981       env->mem_alloc = alloc;
982     }
983   }
984 
985   env->num_mem++;
986   return env->num_mem;
987 }
988 
989 static int
scan_env_set_mem_node(ScanEnv * env,int num,Node * node)990 scan_env_set_mem_node(ScanEnv* env, int num, Node* node)
991 {
992   if (env->num_mem >= num)
993     SCANENV_MEM_NODES(env)[num] = node;
994   else
995     return ONIGERR_PARSER_BUG;
996   return 0;
997 }
998 
999 
#ifdef USE_PARSE_TREE_NODE_RECYCLE
/* A released parse-tree node, reinterpreted as a link of the free list. */
typedef struct _FreeNode {
  struct _FreeNode* next;   /* next released node */
} FreeNode;

/* Head of the list of released nodes that node_new() reuses. */
static FreeNode* FreeNodeList = (FreeNode* )NULL;
#endif
1007 
1008 extern void
onig_node_free(Node * node)1009 onig_node_free(Node* node)
1010 {
1011  start:
1012   if (IS_NULL(node)) return ;
1013 
1014   switch (NTYPE(node)) {
1015   case NT_STR:
1016     if (NSTR(node)->capa != 0 &&
1017 	IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
1018       xfree(NSTR(node)->s);
1019     }
1020     break;
1021 
1022   case NT_LIST:
1023   case NT_ALT:
1024     onig_node_free(NCAR(node));
1025     {
1026       Node* next_node = NCDR(node);
1027 
1028 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1029       {
1030 	FreeNode* n = (FreeNode* )node;
1031 
1032         THREAD_ATOMIC_START;
1033 	n->next = FreeNodeList;
1034 	FreeNodeList = n;
1035         THREAD_ATOMIC_END;
1036       }
1037 #else
1038       xfree(node);
1039 #endif
1040       node = next_node;
1041       goto start;
1042     }
1043     break;
1044 
1045   case NT_CCLASS:
1046     {
1047       CClassNode* cc = NCCLASS(node);
1048 
1049       if (IS_NCCLASS_SHARE(cc)) return ;
1050       if (cc->mbuf)
1051         bbuf_free(cc->mbuf);
1052     }
1053     break;
1054 
1055   case NT_QTFR:
1056     if (NQTFR(node)->target)
1057       onig_node_free(NQTFR(node)->target);
1058     break;
1059 
1060   case NT_ENCLOSE:
1061     if (NENCLOSE(node)->target)
1062       onig_node_free(NENCLOSE(node)->target);
1063     break;
1064 
1065   case NT_BREF:
1066     if (IS_NOT_NULL(NBREF(node)->back_dynamic))
1067       xfree(NBREF(node)->back_dynamic);
1068     break;
1069 
1070   case NT_ANCHOR:
1071     if (NANCHOR(node)->target)
1072       onig_node_free(NANCHOR(node)->target);
1073     break;
1074   }
1075 
1076 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1077   {
1078     FreeNode* n = (FreeNode* )node;
1079 
1080     THREAD_ATOMIC_START;
1081     n->next = FreeNodeList;
1082     FreeNodeList = n;
1083     THREAD_ATOMIC_END;
1084   }
1085 #else
1086   xfree(node);
1087 #endif
1088 }
1089 
1090 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1091 extern int
onig_free_node_list(void)1092 onig_free_node_list(void)
1093 {
1094   FreeNode* n;
1095 
1096   /* THREAD_ATOMIC_START; */
1097   while (IS_NOT_NULL(FreeNodeList)) {
1098     n = FreeNodeList;
1099     FreeNodeList = FreeNodeList->next;
1100     xfree(n);
1101   }
1102   /* THREAD_ATOMIC_END; */
1103   return 0;
1104 }
1105 #endif
1106 
1107 static Node*
node_new(void)1108 node_new(void)
1109 {
1110   Node* node;
1111 
1112 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1113   THREAD_ATOMIC_START;
1114   if (IS_NOT_NULL(FreeNodeList)) {
1115     node = (Node* )FreeNodeList;
1116     FreeNodeList = FreeNodeList->next;
1117     THREAD_ATOMIC_END;
1118     return node;
1119   }
1120   THREAD_ATOMIC_END;
1121 #endif
1122 
1123   node = (Node* )xmalloc(sizeof(Node));
1124   /* xmemset(node, 0, sizeof(Node)); */
1125   return node;
1126 }
1127 
1128 
1129 static void
initialize_cclass(CClassNode * cc)1130 initialize_cclass(CClassNode* cc)
1131 {
1132   BITSET_CLEAR(cc->bs);
1133   /* cc->base.flags = 0; */
1134   cc->flags = 0;
1135   cc->mbuf  = NULL;
1136 }
1137 
1138 static Node*
node_new_cclass(void)1139 node_new_cclass(void)
1140 {
1141   Node* node = node_new();
1142   CHECK_NULL_RETURN(node);
1143 
1144   SET_NTYPE(node, NT_CCLASS);
1145   initialize_cclass(NCCLASS(node));
1146   return node;
1147 }
1148 
1149 static Node*
node_new_cclass_by_codepoint_range(int not,OnigCodePoint sb_out,const OnigCodePoint ranges[])1150 node_new_cclass_by_codepoint_range(int not, OnigCodePoint sb_out,
1151 				   const OnigCodePoint ranges[])
1152 {
1153   int n, i;
1154   CClassNode* cc;
1155   OnigCodePoint j;
1156 
1157   Node* node = node_new_cclass();
1158   CHECK_NULL_RETURN(node);
1159 
1160   cc = NCCLASS(node);
1161   if (not != 0) NCCLASS_SET_NOT(cc);
1162 
1163   BITSET_CLEAR(cc->bs);
1164   if (sb_out > 0 && IS_NOT_NULL(ranges)) {
1165     n = ONIGENC_CODE_RANGE_NUM(ranges);
1166     for (i = 0; i < n; i++) {
1167       for (j  = ONIGENC_CODE_RANGE_FROM(ranges, i);
1168            j <= (OnigCodePoint )ONIGENC_CODE_RANGE_TO(ranges, i); j++) {
1169 	if (j >= sb_out) goto sb_end;
1170 
1171         BITSET_SET_BIT(cc->bs, j);
1172       }
1173     }
1174   }
1175 
1176  sb_end:
1177   if (IS_NULL(ranges)) {
1178   is_null:
1179     cc->mbuf = NULL;
1180   }
1181   else {
1182     BBuf* bbuf;
1183 
1184     n = ONIGENC_CODE_RANGE_NUM(ranges);
1185     if (n == 0) goto is_null;
1186 
1187     bbuf = (BBuf* )xmalloc(sizeof(BBuf));
1188     CHECK_NULL_RETURN(bbuf);
1189     bbuf->alloc = n + 1;
1190     bbuf->used  = n + 1;
1191     bbuf->p     = (UChar* )((void* )ranges);
1192 
1193     cc->mbuf = bbuf;
1194   }
1195 
1196   return node;
1197 }
1198 
1199 static Node*
node_new_ctype(int type,int not)1200 node_new_ctype(int type, int not)
1201 {
1202   Node* node = node_new();
1203   CHECK_NULL_RETURN(node);
1204 
1205   SET_NTYPE(node, NT_CTYPE);
1206   NCTYPE(node)->ctype = type;
1207   NCTYPE(node)->not   = not;
1208   return node;
1209 }
1210 
1211 static Node*
node_new_anychar(void)1212 node_new_anychar(void)
1213 {
1214   Node* node = node_new();
1215   CHECK_NULL_RETURN(node);
1216 
1217   SET_NTYPE(node, NT_CANY);
1218   return node;
1219 }
1220 
1221 static Node*
node_new_list(Node * left,Node * right)1222 node_new_list(Node* left, Node* right)
1223 {
1224   Node* node = node_new();
1225   CHECK_NULL_RETURN(node);
1226 
1227   SET_NTYPE(node, NT_LIST);
1228   NCAR(node)  = left;
1229   NCDR(node) = right;
1230   return node;
1231 }
1232 
1233 extern Node*
onig_node_new_list(Node * left,Node * right)1234 onig_node_new_list(Node* left, Node* right)
1235 {
1236   return node_new_list(left, right);
1237 }
1238 
1239 extern Node*
onig_node_list_add(Node * list,Node * x)1240 onig_node_list_add(Node* list, Node* x)
1241 {
1242   Node *n;
1243 
1244   n = onig_node_new_list(x, NULL);
1245   if (IS_NULL(n)) return NULL_NODE;
1246 
1247   if (IS_NOT_NULL(list)) {
1248     while (IS_NOT_NULL(NCDR(list)))
1249       list = NCDR(list);
1250 
1251     NCDR(list) = n;
1252   }
1253 
1254   return n;
1255 }
1256 
1257 extern Node*
onig_node_new_alt(Node * left,Node * right)1258 onig_node_new_alt(Node* left, Node* right)
1259 {
1260   Node* node = node_new();
1261   CHECK_NULL_RETURN(node);
1262 
1263   SET_NTYPE(node, NT_ALT);
1264   NCAR(node)  = left;
1265   NCDR(node) = right;
1266   return node;
1267 }
1268 
1269 extern Node*
onig_node_new_anchor(int type)1270 onig_node_new_anchor(int type)
1271 {
1272   Node* node = node_new();
1273   CHECK_NULL_RETURN(node);
1274 
1275   SET_NTYPE(node, NT_ANCHOR);
1276   NANCHOR(node)->type     = type;
1277   NANCHOR(node)->target   = NULL;
1278   NANCHOR(node)->char_len = -1;
1279   return node;
1280 }
1281 
1282 static Node*
node_new_backref(int back_num,int * backrefs,int by_name,int exist_level,int nest_level,ScanEnv * env)1283 node_new_backref(int back_num, int* backrefs, int by_name,
1284 #ifdef USE_BACKREF_WITH_LEVEL
1285 		 int exist_level, int nest_level,
1286 #endif
1287 		 ScanEnv* env)
1288 {
1289   int i;
1290   Node* node = node_new();
1291 
1292   CHECK_NULL_RETURN(node);
1293 
1294   SET_NTYPE(node, NT_BREF);
1295   NBREF(node)->state    = 0;
1296   NBREF(node)->back_num = back_num;
1297   NBREF(node)->back_dynamic = (int* )NULL;
1298   if (by_name != 0)
1299     NBREF(node)->state |= NST_NAME_REF;
1300 
1301 #ifdef USE_BACKREF_WITH_LEVEL
1302   if (exist_level != 0) {
1303     NBREF(node)->state |= NST_NEST_LEVEL;
1304     NBREF(node)->nest_level  = nest_level;
1305   }
1306 #endif
1307 
1308   for (i = 0; i < back_num; i++) {
1309     if (backrefs[i] <= env->num_mem &&
1310 	IS_NULL(SCANENV_MEM_NODES(env)[backrefs[i]])) {
1311       NBREF(node)->state |= NST_RECURSION;   /* /...(\1).../ */
1312       break;
1313     }
1314   }
1315 
1316   if (back_num <= NODE_BACKREFS_SIZE) {
1317     for (i = 0; i < back_num; i++)
1318       NBREF(node)->back_static[i] = backrefs[i];
1319   }
1320   else {
1321     int* p = (int* )xmalloc(sizeof(int) * back_num);
1322     if (IS_NULL(p)) {
1323       onig_node_free(node);
1324       return NULL;
1325     }
1326     NBREF(node)->back_dynamic = p;
1327     for (i = 0; i < back_num; i++)
1328       p[i] = backrefs[i];
1329   }
1330   return node;
1331 }
1332 
#ifdef USE_SUBEXP_CALL
/*
 * Create an NT_CALL node.  The name pointers are stored as given (no copy)
 * and the target is left unresolved (NULL_NODE).
 * Returns NULL when node_new() fails.
 */
static Node*
node_new_call(UChar* name, UChar* name_end, int gnum)
{
  Node* call = node_new();

  if (IS_NULL(call)) return NULL;

  SET_NTYPE(call, NT_CALL);
  NCALL(call)->group_num = gnum;  /* call by number if gnum != 0 */
  NCALL(call)->name_end  = name_end;
  NCALL(call)->name      = name;
  NCALL(call)->target    = NULL_NODE;
  NCALL(call)->state     = 0;
  return call;
}
#endif
1349 
1350 static Node*
node_new_quantifier(int lower,int upper,int by_number)1351 node_new_quantifier(int lower, int upper, int by_number)
1352 {
1353   Node* node = node_new();
1354   CHECK_NULL_RETURN(node);
1355 
1356   SET_NTYPE(node, NT_QTFR);
1357   NQTFR(node)->state  = 0;
1358   NQTFR(node)->target = NULL;
1359   NQTFR(node)->lower  = lower;
1360   NQTFR(node)->upper  = upper;
1361   NQTFR(node)->greedy = 1;
1362   NQTFR(node)->target_empty_info = NQ_TARGET_ISNOT_EMPTY;
1363   NQTFR(node)->head_exact        = NULL_NODE;
1364   NQTFR(node)->next_head_exact   = NULL_NODE;
1365   NQTFR(node)->is_refered        = 0;
1366   if (by_number != 0)
1367     NQTFR(node)->state |= NST_BY_NUMBER;
1368 
1369 #ifdef USE_COMBINATION_EXPLOSION_CHECK
1370   NQTFR(node)->comb_exp_check_num = 0;
1371 #endif
1372 
1373   return node;
1374 }
1375 
1376 static Node*
node_new_enclose(int type)1377 node_new_enclose(int type)
1378 {
1379   Node* node = node_new();
1380   CHECK_NULL_RETURN(node);
1381 
1382   SET_NTYPE(node, NT_ENCLOSE);
1383   NENCLOSE(node)->type      = type;
1384   NENCLOSE(node)->state     =  0;
1385   NENCLOSE(node)->regnum    =  0;
1386   NENCLOSE(node)->option    =  0;
1387   NENCLOSE(node)->target    = NULL;
1388   NENCLOSE(node)->call_addr = -1;
1389   NENCLOSE(node)->opt_count =  0;
1390   return node;
1391 }
1392 
1393 extern Node*
onig_node_new_enclose(int type)1394 onig_node_new_enclose(int type)
1395 {
1396   return node_new_enclose(type);
1397 }
1398 
1399 static Node*
node_new_enclose_memory(OnigOptionType option,int is_named)1400 node_new_enclose_memory(OnigOptionType option, int is_named)
1401 {
1402   Node* node = node_new_enclose(ENCLOSE_MEMORY);
1403   CHECK_NULL_RETURN(node);
1404   if (is_named != 0)
1405     SET_ENCLOSE_STATUS(node, NST_NAMED_GROUP);
1406 
1407 #ifdef USE_SUBEXP_CALL
1408   NENCLOSE(node)->option = option;
1409 #endif
1410   return node;
1411 }
1412 
1413 static Node*
node_new_option(OnigOptionType option)1414 node_new_option(OnigOptionType option)
1415 {
1416   Node* node = node_new_enclose(ENCLOSE_OPTION);
1417   CHECK_NULL_RETURN(node);
1418   NENCLOSE(node)->option = option;
1419   return node;
1420 }
1421 
1422 extern int
onig_node_str_cat(Node * node,const UChar * s,const UChar * end)1423 onig_node_str_cat(Node* node, const UChar* s, const UChar* end)
1424 {
1425   int addlen = end - s;
1426 
1427   if (addlen > 0) {
1428     int len  = NSTR(node)->end - NSTR(node)->s;
1429 
1430     if (NSTR(node)->capa > 0 || (len + addlen > NODE_STR_BUF_SIZE - 1)) {
1431       UChar* p;
1432       int capa = len + addlen + NODE_STR_MARGIN;
1433 
1434       if (capa <= NSTR(node)->capa) {
1435 	onig_strcpy(NSTR(node)->s + len, s, end);
1436       }
1437       else {
1438 	if (NSTR(node)->s == NSTR(node)->buf)
1439 	  p = strcat_capa_from_static(NSTR(node)->s, NSTR(node)->end,
1440 				      s, end, capa);
1441 	else
1442 	  p = strcat_capa(NSTR(node)->s, NSTR(node)->end, s, end, capa);
1443 
1444 	CHECK_NULL_RETURN_MEMERR(p);
1445 	NSTR(node)->s    = p;
1446 	NSTR(node)->capa = capa;
1447       }
1448     }
1449     else {
1450       onig_strcpy(NSTR(node)->s + len, s, end);
1451     }
1452     NSTR(node)->end = NSTR(node)->s + len + addlen;
1453   }
1454 
1455   return 0;
1456 }
1457 
1458 extern int
onig_node_str_set(Node * node,const UChar * s,const UChar * end)1459 onig_node_str_set(Node* node, const UChar* s, const UChar* end)
1460 {
1461   onig_node_str_clear(node);
1462   return onig_node_str_cat(node, s, end);
1463 }
1464 
1465 static int
node_str_cat_char(Node * node,UChar c)1466 node_str_cat_char(Node* node, UChar c)
1467 {
1468   UChar s[1];
1469 
1470   s[0] = c;
1471   return onig_node_str_cat(node, s, s + 1);
1472 }
1473 
1474 extern void
onig_node_conv_to_str_node(Node * node,int flag)1475 onig_node_conv_to_str_node(Node* node, int flag)
1476 {
1477   SET_NTYPE(node, NT_STR);
1478   NSTR(node)->flag = flag;
1479   NSTR(node)->capa = 0;
1480   NSTR(node)->s    = NSTR(node)->buf;
1481   NSTR(node)->end  = NSTR(node)->buf;
1482 }
1483 
1484 extern void
onig_node_str_clear(Node * node)1485 onig_node_str_clear(Node* node)
1486 {
1487   if (NSTR(node)->capa != 0 &&
1488       IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
1489     xfree(NSTR(node)->s);
1490   }
1491 
1492   NSTR(node)->capa = 0;
1493   NSTR(node)->flag = 0;
1494   NSTR(node)->s    = NSTR(node)->buf;
1495   NSTR(node)->end  = NSTR(node)->buf;
1496 }
1497 
1498 static Node*
node_new_str(const UChar * s,const UChar * end)1499 node_new_str(const UChar* s, const UChar* end)
1500 {
1501   Node* node = node_new();
1502   CHECK_NULL_RETURN(node);
1503 
1504   SET_NTYPE(node, NT_STR);
1505   NSTR(node)->capa = 0;
1506   NSTR(node)->flag = 0;
1507   NSTR(node)->s    = NSTR(node)->buf;
1508   NSTR(node)->end  = NSTR(node)->buf;
1509   if (onig_node_str_cat(node, s, end)) {
1510     onig_node_free(node);
1511     return NULL;
1512   }
1513   return node;
1514 }
1515 
1516 extern Node*
onig_node_new_str(const UChar * s,const UChar * end)1517 onig_node_new_str(const UChar* s, const UChar* end)
1518 {
1519   return node_new_str(s, end);
1520 }
1521 
1522 static Node*
node_new_str_raw(UChar * s,UChar * end)1523 node_new_str_raw(UChar* s, UChar* end)
1524 {
1525   Node* node = node_new_str(s, end);
1526   NSTRING_SET_RAW(node);
1527   return node;
1528 }
1529 
1530 static Node*
node_new_empty(void)1531 node_new_empty(void)
1532 {
1533   return node_new_str(NULL, NULL);
1534 }
1535 
1536 static Node*
node_new_str_raw_char(UChar c)1537 node_new_str_raw_char(UChar c)
1538 {
1539   UChar p[1];
1540 
1541   p[0] = c;
1542   return node_new_str_raw(p, p + 1);
1543 }
1544 
1545 static Node*
str_node_split_last_char(StrNode * sn,OnigEncoding enc)1546 str_node_split_last_char(StrNode* sn, OnigEncoding enc)
1547 {
1548   const UChar *p;
1549   Node* n = NULL_NODE;
1550 
1551   if (sn->end > sn->s) {
1552     p = onigenc_get_prev_char_head(enc, sn->s, sn->end);
1553     if (p && p > sn->s) { /* can be splitted. */
1554       n = node_new_str(p, sn->end);
1555       if ((sn->flag & NSTR_RAW) != 0)
1556 	NSTRING_SET_RAW(n);
1557       sn->end = (UChar* )p;
1558     }
1559   }
1560   return n;
1561 }
1562 
1563 static int
str_node_can_be_split(StrNode * sn,OnigEncoding enc)1564 str_node_can_be_split(StrNode* sn, OnigEncoding enc)
1565 {
1566   if (sn->end > sn->s) {
1567     return ((enclen(enc, sn->s) < sn->end - sn->s)  ?  1 : 0);
1568   }
1569   return 0;
1570 }
1571 
#ifdef USE_PAD_TO_SHORT_BYTE_CHAR
/*
 * Shift the string of `sn` right by `num` bytes and fill the gap at the
 * front with `val`.
 * Returns 0.  (The function is declared int; it previously fell off the
 * end without returning a value.)
 *
 * NOTE(review): the content is staged in a NODE_STR_BUF_SIZE byte stack
 * buffer and written back at offset `num` with no size check; this assumes
 * the string plus padding fits - confirm against the callers.
 */
static int
node_str_head_pad(StrNode* sn, int num, UChar val)
{
  UChar buf[NODE_STR_BUF_SIZE];
  int i, len;

  len = sn->end - sn->s;
  onig_strcpy(buf, sn->s, sn->end);
  onig_strcpy(&(sn->s[num]), buf, buf + len);
  sn->end += num;

  for (i = 0; i < num; i++) {
    sn->s[i] = val;
  }
  return 0;
}
#endif
1589 
1590 extern int
onig_scan_unsigned_number(UChar ** src,const UChar * end,OnigEncoding enc)1591 onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc)
1592 {
1593   unsigned int num, val;
1594   OnigCodePoint c;
1595   UChar* p = *src;
1596   PFETCH_READY;
1597 
1598   num = 0;
1599   while (!PEND) {
1600     PFETCH(c);
1601     if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
1602       val = (unsigned int )DIGITVAL(c);
1603       if ((INT_MAX_LIMIT - val) / 10UL < num)
1604 	return -1;  /* overflow */
1605 
1606       num = num * 10 + val;
1607     }
1608     else {
1609       PUNFETCH;
1610       break;
1611     }
1612   }
1613   *src = p;
1614   return num;
1615 }
1616 
1617 static int
scan_unsigned_hexadecimal_number(UChar ** src,UChar * end,int maxlen,OnigEncoding enc)1618 scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int maxlen,
1619 				 OnigEncoding enc)
1620 {
1621   OnigCodePoint c;
1622   unsigned int num, val;
1623   UChar* p = *src;
1624   PFETCH_READY;
1625 
1626   num = 0;
1627   while (!PEND && maxlen-- != 0) {
1628     PFETCH(c);
1629     if (ONIGENC_IS_CODE_XDIGIT(enc, c)) {
1630       val = (unsigned int )XDIGITVAL(enc,c);
1631       if ((INT_MAX_LIMIT - val) / 16UL < num)
1632 	return -1;  /* overflow */
1633 
1634       num = (num << 4) + XDIGITVAL(enc,c);
1635     }
1636     else {
1637       PUNFETCH;
1638       break;
1639     }
1640   }
1641   *src = p;
1642   return num;
1643 }
1644 
1645 static int
scan_unsigned_octal_number(UChar ** src,UChar * end,int maxlen,OnigEncoding enc)1646 scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen,
1647 			   OnigEncoding enc)
1648 {
1649   OnigCodePoint c;
1650   unsigned int num, val;
1651   UChar* p = *src;
1652   PFETCH_READY;
1653 
1654   num = 0;
1655   while (!PEND && maxlen-- != 0) {
1656     PFETCH(c);
1657     if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8') {
1658       val = ODIGITVAL(c);
1659       if ((INT_MAX_LIMIT - val) / 8UL < num)
1660 	return -1;  /* overflow */
1661 
1662       num = (num << 3) + val;
1663     }
1664     else {
1665       PUNFETCH;
1666       break;
1667     }
1668   }
1669   *src = p;
1670   return num;
1671 }
1672 
1673 
/* Write one code point into `bbuf` at byte offset `pos` by handing
   SIZE_CODE_POINT bytes to BBUF_WRITE.  `code` must be an lvalue because
   its address is taken. */
#define BBUF_WRITE_CODE_POINT(bbuf,pos,code) \
    BBUF_WRITE(bbuf, pos, &(code), SIZE_CODE_POINT)
1676 
1677 /* data format:
1678      [n][from-1][to-1][from-2][to-2] ... [from-n][to-n]
1679      (all data size is OnigCodePoint)
1680  */
1681 static int
new_code_range(BBuf ** pbuf)1682 new_code_range(BBuf** pbuf)
1683 {
1684 #define INIT_MULTI_BYTE_RANGE_SIZE  (SIZE_CODE_POINT * 5)
1685   int r;
1686   OnigCodePoint n;
1687   BBuf* bbuf;
1688 
1689   bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf));
1690   CHECK_NULL_RETURN_MEMERR(*pbuf);
1691   r = BBUF_INIT(*pbuf, INIT_MULTI_BYTE_RANGE_SIZE);
1692   if (r) return r;
1693 
1694   n = 0;
1695   BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1696   return 0;
1697 }
1698 
1699 static int
add_code_range_to_buf(BBuf ** pbuf,OnigCodePoint from,OnigCodePoint to)1700 add_code_range_to_buf(BBuf** pbuf, OnigCodePoint from, OnigCodePoint to)
1701 {
1702   int r, inc_n, pos;
1703   int low, high, bound, x;
1704   OnigCodePoint n, *data;
1705   BBuf* bbuf;
1706 
1707   if (from > to) {
1708     n = from; from = to; to = n;
1709   }
1710 
1711   if (IS_NULL(*pbuf)) {
1712     r = new_code_range(pbuf);
1713     if (r) return r;
1714     bbuf = *pbuf;
1715     n = 0;
1716   }
1717   else {
1718     bbuf = *pbuf;
1719     GET_CODE_POINT(n, bbuf->p);
1720   }
1721   data = (OnigCodePoint* )(bbuf->p);
1722   data++;
1723 
1724   for (low = 0, bound = n; low < bound; ) {
1725     x = (low + bound) >> 1;
1726     if (from > data[x*2 + 1])
1727       low = x + 1;
1728     else
1729       bound = x;
1730   }
1731 
1732   for (high = low, bound = n; high < bound; ) {
1733     x = (high + bound) >> 1;
1734     if (to >= data[x*2] - 1)
1735       high = x + 1;
1736     else
1737       bound = x;
1738   }
1739 
1740   inc_n = low + 1 - high;
1741   if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM)
1742     return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES;
1743 
1744   if (inc_n != 1) {
1745     if (from > data[low*2])
1746       from = data[low*2];
1747     if (to < data[(high - 1)*2 + 1])
1748       to = data[(high - 1)*2 + 1];
1749   }
1750 
1751   if (inc_n != 0 && (OnigCodePoint )high < n) {
1752     int from_pos = SIZE_CODE_POINT * (1 + high * 2);
1753     int to_pos   = SIZE_CODE_POINT * (1 + (low + 1) * 2);
1754     int size = (n - high) * 2 * SIZE_CODE_POINT;
1755 
1756     if (inc_n > 0) {
1757       BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size);
1758     }
1759     else {
1760       BBUF_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos);
1761     }
1762   }
1763 
1764   pos = SIZE_CODE_POINT * (1 + low * 2);
1765   BBUF_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2);
1766   BBUF_WRITE_CODE_POINT(bbuf, pos, from);
1767   BBUF_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to);
1768   n += inc_n;
1769   BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1770 
1771   return 0;
1772 }
1773 
1774 static int
add_code_range(BBuf ** pbuf,ScanEnv * env,OnigCodePoint from,OnigCodePoint to)1775 add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
1776 {
1777   if (from > to) {
1778     if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
1779       return 0;
1780     else
1781       return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
1782   }
1783 
1784   return add_code_range_to_buf(pbuf, from, to);
1785 }
1786 
1787 static int
not_code_range_buf(OnigEncoding enc,BBuf * bbuf,BBuf ** pbuf)1788 not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf)
1789 {
1790   int r, i, n;
1791   OnigCodePoint pre, from, *data, to = 0;
1792 
1793   *pbuf = (BBuf* )NULL;
1794   if (IS_NULL(bbuf)) {
1795   set_all:
1796     return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1797   }
1798 
1799   data = (OnigCodePoint* )(bbuf->p);
1800   GET_CODE_POINT(n, data);
1801   data++;
1802   if (n <= 0) goto set_all;
1803 
1804   r = 0;
1805   pre = MBCODE_START_POS(enc);
1806   for (i = 0; i < n; i++) {
1807     from = data[i*2];
1808     to   = data[i*2+1];
1809     if (pre <= from - 1) {
1810       r = add_code_range_to_buf(pbuf, pre, from - 1);
1811       if (r != 0) return r;
1812     }
1813     if (to == ~((OnigCodePoint )0)) break;
1814     pre = to + 1;
1815   }
1816   if (to < ~((OnigCodePoint )0)) {
1817     r = add_code_range_to_buf(pbuf, to + 1, ~((OnigCodePoint )0));
1818   }
1819   return r;
1820 }
1821 
/* Exchange the pair (bbuf1, not1) with the pair (bbuf2, not2) in place.
   All four arguments must be assignable lvalues; each one is evaluated
   more than once. */
#define SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2) do {\
  BBuf *tbuf; \
  int  tnot; \
  tnot = not1;  not1  = not2;  not2  = tnot; \
  tbuf = bbuf1; bbuf1 = bbuf2; bbuf2 = tbuf; \
} while (0)
1828 
1829 static int
or_code_range_buf(OnigEncoding enc,BBuf * bbuf1,int not1,BBuf * bbuf2,int not2,BBuf ** pbuf)1830 or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1,
1831                   BBuf* bbuf2, int not2, BBuf** pbuf)
1832 {
1833   int r;
1834   OnigCodePoint i, n1, *data1;
1835   OnigCodePoint from, to;
1836 
1837   *pbuf = (BBuf* )NULL;
1838   if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) {
1839     if (not1 != 0 || not2 != 0)
1840       return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1841     return 0;
1842   }
1843 
1844   r = 0;
1845   if (IS_NULL(bbuf2))
1846     SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1847 
1848   if (IS_NULL(bbuf1)) {
1849     if (not1 != 0) {
1850       return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1851     }
1852     else {
1853       if (not2 == 0) {
1854 	return bbuf_clone(pbuf, bbuf2);
1855       }
1856       else {
1857 	return not_code_range_buf(enc, bbuf2, pbuf);
1858       }
1859     }
1860   }
1861 
1862   if (not1 != 0)
1863     SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1864 
1865   data1 = (OnigCodePoint* )(bbuf1->p);
1866   GET_CODE_POINT(n1, data1);
1867   data1++;
1868 
1869   if (not2 == 0 && not1 == 0) { /* 1 OR 2 */
1870     r = bbuf_clone(pbuf, bbuf2);
1871   }
1872   else if (not1 == 0) { /* 1 OR (not 2) */
1873     r = not_code_range_buf(enc, bbuf2, pbuf);
1874   }
1875   if (r != 0) return r;
1876 
1877   for (i = 0; i < n1; i++) {
1878     from = data1[i*2];
1879     to   = data1[i*2+1];
1880     r = add_code_range_to_buf(pbuf, from, to);
1881     if (r != 0) return r;
1882   }
1883   return 0;
1884 }
1885 
1886 static int
and_code_range1(BBuf ** pbuf,OnigCodePoint from1,OnigCodePoint to1,OnigCodePoint * data,int n)1887 and_code_range1(BBuf** pbuf, OnigCodePoint from1, OnigCodePoint to1,
1888 	        OnigCodePoint* data, int n)
1889 {
1890   int i, r;
1891   OnigCodePoint from2, to2;
1892 
1893   for (i = 0; i < n; i++) {
1894     from2 = data[i*2];
1895     to2   = data[i*2+1];
1896     if (from2 < from1) {
1897       if (to2 < from1) continue;
1898       else {
1899 	from1 = to2 + 1;
1900       }
1901     }
1902     else if (from2 <= to1) {
1903       if (to2 < to1) {
1904 	if (from1 <= from2 - 1) {
1905 	  r = add_code_range_to_buf(pbuf, from1, from2-1);
1906 	  if (r != 0) return r;
1907 	}
1908 	from1 = to2 + 1;
1909       }
1910       else {
1911 	to1 = from2 - 1;
1912       }
1913     }
1914     else {
1915       from1 = from2;
1916     }
1917     if (from1 > to1) break;
1918   }
1919   if (from1 <= to1) {
1920     r = add_code_range_to_buf(pbuf, from1, to1);
1921     if (r != 0) return r;
1922   }
1923   return 0;
1924 }
1925 
1926 static int
and_code_range_buf(BBuf * bbuf1,int not1,BBuf * bbuf2,int not2,BBuf ** pbuf)1927 and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf)
1928 {
1929   int r;
1930   OnigCodePoint i, j, n1, n2, *data1, *data2;
1931   OnigCodePoint from, to, from1, to1, from2, to2;
1932 
1933   *pbuf = (BBuf* )NULL;
1934   if (IS_NULL(bbuf1)) {
1935     if (not1 != 0 && IS_NOT_NULL(bbuf2)) /* not1 != 0 -> not2 == 0 */
1936       return bbuf_clone(pbuf, bbuf2);
1937     return 0;
1938   }
1939   else if (IS_NULL(bbuf2)) {
1940     if (not2 != 0)
1941       return bbuf_clone(pbuf, bbuf1);
1942     return 0;
1943   }
1944 
1945   if (not1 != 0)
1946     SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1947 
1948   data1 = (OnigCodePoint* )(bbuf1->p);
1949   data2 = (OnigCodePoint* )(bbuf2->p);
1950   GET_CODE_POINT(n1, data1);
1951   GET_CODE_POINT(n2, data2);
1952   data1++;
1953   data2++;
1954 
1955   if (not2 == 0 && not1 == 0) { /* 1 AND 2 */
1956     for (i = 0; i < n1; i++) {
1957       from1 = data1[i*2];
1958       to1   = data1[i*2+1];
1959       for (j = 0; j < n2; j++) {
1960 	from2 = data2[j*2];
1961 	to2   = data2[j*2+1];
1962 	if (from2 > to1) break;
1963 	if (to2 < from1) continue;
1964 	from = MAX(from1, from2);
1965 	to   = MIN(to1, to2);
1966 	r = add_code_range_to_buf(pbuf, from, to);
1967 	if (r != 0) return r;
1968       }
1969     }
1970   }
1971   else if (not1 == 0) { /* 1 AND (not 2) */
1972     for (i = 0; i < n1; i++) {
1973       from1 = data1[i*2];
1974       to1   = data1[i*2+1];
1975       r = and_code_range1(pbuf, from1, to1, data2, n2);
1976       if (r != 0) return r;
1977     }
1978   }
1979 
1980   return 0;
1981 }
1982 
1983 static int
and_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)1984 and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
1985 {
1986   int r, not1, not2;
1987   BBuf *buf1, *buf2, *pbuf;
1988   BitSetRef bsr1, bsr2;
1989   BitSet bs1, bs2;
1990 
1991   not1 = IS_NCCLASS_NOT(dest);
1992   bsr1 = dest->bs;
1993   buf1 = dest->mbuf;
1994   not2 = IS_NCCLASS_NOT(cc);
1995   bsr2 = cc->bs;
1996   buf2 = cc->mbuf;
1997 
1998   if (not1 != 0) {
1999     bitset_invert_to(bsr1, bs1);
2000     bsr1 = bs1;
2001   }
2002   if (not2 != 0) {
2003     bitset_invert_to(bsr2, bs2);
2004     bsr2 = bs2;
2005   }
2006   bitset_and(bsr1, bsr2);
2007   if (bsr1 != dest->bs) {
2008     bitset_copy(dest->bs, bsr1);
2009     bsr1 = dest->bs;
2010   }
2011   if (not1 != 0) {
2012     bitset_invert(dest->bs);
2013   }
2014 
2015   if (! ONIGENC_IS_SINGLEBYTE(enc)) {
2016     if (not1 != 0 && not2 != 0) {
2017       r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf);
2018     }
2019     else {
2020       r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf);
2021       if (r == 0 && not1 != 0) {
2022 	BBuf *tbuf;
2023 	r = not_code_range_buf(enc, pbuf, &tbuf);
2024 	if (r != 0) {
2025 	  bbuf_free(pbuf);
2026 	  return r;
2027 	}
2028 	bbuf_free(pbuf);
2029 	pbuf = tbuf;
2030       }
2031     }
2032     if (r != 0) return r;
2033 
2034     dest->mbuf = pbuf;
2035     bbuf_free(buf1);
2036     return r;
2037   }
2038   return 0;
2039 }
2040 
2041 static int
or_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)2042 or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
2043 {
2044   int r, not1, not2;
2045   BBuf *buf1, *buf2, *pbuf;
2046   BitSetRef bsr1, bsr2;
2047   BitSet bs1, bs2;
2048 
2049   not1 = IS_NCCLASS_NOT(dest);
2050   bsr1 = dest->bs;
2051   buf1 = dest->mbuf;
2052   not2 = IS_NCCLASS_NOT(cc);
2053   bsr2 = cc->bs;
2054   buf2 = cc->mbuf;
2055 
2056   if (not1 != 0) {
2057     bitset_invert_to(bsr1, bs1);
2058     bsr1 = bs1;
2059   }
2060   if (not2 != 0) {
2061     bitset_invert_to(bsr2, bs2);
2062     bsr2 = bs2;
2063   }
2064   bitset_or(bsr1, bsr2);
2065   if (bsr1 != dest->bs) {
2066     bitset_copy(dest->bs, bsr1);
2067     bsr1 = dest->bs;
2068   }
2069   if (not1 != 0) {
2070     bitset_invert(dest->bs);
2071   }
2072 
2073   if (! ONIGENC_IS_SINGLEBYTE(enc)) {
2074     if (not1 != 0 && not2 != 0) {
2075       r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf);
2076     }
2077     else {
2078       r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf);
2079       if (r == 0 && not1 != 0) {
2080 	BBuf *tbuf;
2081 	r = not_code_range_buf(enc, pbuf, &tbuf);
2082 	if (r != 0) {
2083 	  bbuf_free(pbuf);
2084 	  return r;
2085 	}
2086 	bbuf_free(pbuf);
2087 	pbuf = tbuf;
2088       }
2089     }
2090     if (r != 0) return r;
2091 
2092     dest->mbuf = pbuf;
2093     bbuf_free(buf1);
2094     return r;
2095   }
2096   else
2097     return 0;
2098 }
2099 
2100 static int
conv_backslash_value(int c,ScanEnv * env)2101 conv_backslash_value(int c, ScanEnv* env)
2102 {
2103   if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) {
2104     switch (c) {
2105     case 'n': return '\n';
2106     case 't': return '\t';
2107     case 'r': return '\r';
2108     case 'f': return '\f';
2109     case 'a': return '\007';
2110     case 'b': return '\010';
2111     case 'e': return '\033';
2112     case 'v':
2113       if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB))
2114 	return '\v';
2115       break;
2116 
2117     default:
2118       break;
2119     }
2120   }
2121   return c;
2122 }
2123 
2124 static int
is_invalid_quantifier_target(Node * node)2125 is_invalid_quantifier_target(Node* node)
2126 {
2127   switch (NTYPE(node)) {
2128   case NT_ANCHOR:
2129     return 1;
2130     break;
2131 
2132   case NT_ENCLOSE:
2133     /* allow enclosed elements */
2134     /* return is_invalid_quantifier_target(NENCLOSE(node)->target); */
2135     break;
2136 
2137   case NT_LIST:
2138     do {
2139       if (! is_invalid_quantifier_target(NCAR(node))) return 0;
2140     } while (IS_NOT_NULL(node = NCDR(node)));
2141     return 0;
2142     break;
2143 
2144   case NT_ALT:
2145     do {
2146       if (is_invalid_quantifier_target(NCAR(node))) return 1;
2147     } while (IS_NOT_NULL(node = NCDR(node)));
2148     break;
2149 
2150   default:
2151     break;
2152   }
2153   return 0;
2154 }
2155 
2156 /* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */
2157 static int
popular_quantifier_num(QtfrNode * q)2158 popular_quantifier_num(QtfrNode* q)
2159 {
2160   if (q->greedy) {
2161     if (q->lower == 0) {
2162       if (q->upper == 1) return 0;
2163       else if (IS_REPEAT_INFINITE(q->upper)) return 1;
2164     }
2165     else if (q->lower == 1) {
2166       if (IS_REPEAT_INFINITE(q->upper)) return 2;
2167     }
2168   }
2169   else {
2170     if (q->lower == 0) {
2171       if (q->upper == 1) return 3;
2172       else if (IS_REPEAT_INFINITE(q->upper)) return 4;
2173     }
2174     else if (q->lower == 1) {
2175       if (IS_REPEAT_INFINITE(q->upper)) return 5;
2176     }
2177   }
2178   return -1;
2179 }
2180 
2181 
/* Rewrite selected from ReduceTypeTable by onig_reduce_nested_quantifier()
   when a quantifier (parent) is applied directly to another quantifier
   (child). */
enum ReduceType {
  RQ_ASIS = 0, /* as is */
  RQ_DEL  = 1, /* delete parent */
  RQ_A,        /* to '*'    */
  RQ_AQ,       /* to '*?'   */
  RQ_QQ,       /* to '??'   */
  RQ_P_QQ,     /* to '+)??' */
  RQ_PQ_Q      /* to '+?)?' */
};
2191 
/* Reduction to apply for a nested pair of quantifiers.  Indexed as
   [child][parent] with the codes returned by popular_quantifier_num();
   the row labels on the right name the child, and the columns follow the
   same order for the parent:  ?  *  +  ??  *?  +?  */
static enum ReduceType ReduceTypeTable[6][6] = {
  {RQ_DEL,  RQ_A,    RQ_A,   RQ_QQ,   RQ_AQ,   RQ_ASIS}, /* '?'  */
  {RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL},  /* '*'  */
  {RQ_A,    RQ_A,    RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL},  /* '+'  */
  {RQ_DEL,  RQ_AQ,   RQ_AQ,  RQ_DEL,  RQ_AQ,   RQ_AQ},   /* '??' */
  {RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_DEL,  RQ_DEL,  RQ_DEL},  /* '*?' */
  {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ,   RQ_AQ,   RQ_DEL}   /* '+?' */
};
2200 
2201 extern void
onig_reduce_nested_quantifier(Node * pnode,Node * cnode)2202 onig_reduce_nested_quantifier(Node* pnode, Node* cnode)
2203 {
2204   int pnum, cnum;
2205   QtfrNode *p, *c;
2206 
2207   p = NQTFR(pnode);
2208   c = NQTFR(cnode);
2209   pnum = popular_quantifier_num(p);
2210   cnum = popular_quantifier_num(c);
2211   if (pnum < 0 || cnum < 0) return ;
2212 
2213   switch(ReduceTypeTable[cnum][pnum]) {
2214   case RQ_DEL:
2215     *pnode = *cnode;
2216     break;
2217   case RQ_A:
2218     p->target = c->target;
2219     p->lower  = 0;  p->upper = REPEAT_INFINITE;  p->greedy = 1;
2220     break;
2221   case RQ_AQ:
2222     p->target = c->target;
2223     p->lower  = 0;  p->upper = REPEAT_INFINITE;  p->greedy = 0;
2224     break;
2225   case RQ_QQ:
2226     p->target = c->target;
2227     p->lower  = 0;  p->upper = 1;  p->greedy = 0;
2228     break;
2229   case RQ_P_QQ:
2230     p->target = cnode;
2231     p->lower  = 0;  p->upper = 1;  p->greedy = 0;
2232     c->lower  = 1;  c->upper = REPEAT_INFINITE;  c->greedy = 1;
2233     return ;
2234     break;
2235   case RQ_PQ_Q:
2236     p->target = cnode;
2237     p->lower  = 0;  p->upper = 1;  p->greedy = 1;
2238     c->lower  = 1;  c->upper = REPEAT_INFINITE;  c->greedy = 0;
2239     return ;
2240     break;
2241   case RQ_ASIS:
2242     p->target = cnode;
2243     return ;
2244     break;
2245   }
2246 
2247   c->target = NULL_NODE;
2248   onig_node_free(cnode);
2249 }
2250 
2251 
/* Token kinds; this is the type of the `type` field of OnigToken.
   The entries after the "in cc" marker are the kinds used inside a
   character class. */
enum TokenSyms {
  TK_EOT      = 0,   /* end of token */
  TK_RAW_BYTE = 1,
  TK_CHAR,
  TK_STRING,
  TK_CODE_POINT,
  TK_ANYCHAR,
  TK_CHAR_TYPE,
  TK_BACKREF,
  TK_CALL,
  TK_ANCHOR,
  TK_OP_REPEAT,
  TK_INTERVAL,
  TK_ANYCHAR_ANYTIME,  /* SQL '%' == .* */
  TK_ALT,
  TK_SUBEXP_OPEN,
  TK_SUBEXP_CLOSE,
  TK_CC_OPEN,
  TK_QUOTE_OPEN,
  TK_CHAR_PROPERTY,    /* \p{...}, \P{...} */
  /* in cc */
  TK_CC_CLOSE,
  TK_CC_RANGE,
  TK_POSIX_BRACKET_OPEN,
  TK_CC_AND,             /* && */
  TK_CC_CC_OPEN          /* [ */
};
2279 
2280 typedef struct {
2281   enum TokenSyms type;
2282   int escaped;
2283   int base;   /* is number: 8, 16 (used in [....]) */
2284   UChar* backp;
2285   union {
2286     UChar* s;
2287     int   c;
2288     OnigCodePoint code;
2289     int   anchor;
2290     int   subtype;
2291     struct {
2292       int lower;
2293       int upper;
2294       int greedy;
2295       int possessive;
2296     } repeat;
2297     struct {
2298       int  num;
2299       int  ref1;
2300       int* refs;
2301       int  by_name;
2302 #ifdef USE_BACKREF_WITH_LEVEL
2303       int  exist_level;
2304       int  level;   /* \k<name+n> */
2305 #endif
2306     } backref;
2307     struct {
2308       UChar* name;
2309       UChar* name_end;
2310       int    gnum;
2311     } call;
2312     struct {
2313       int ctype;
2314       int not;
2315     } prop;
2316   } u;
2317 } OnigToken;
2318 
2319 
2320 static int
fetch_range_quantifier(UChar ** src,UChar * end,OnigToken * tok,ScanEnv * env)2321 fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env)
2322 {
2323   int low, up, syn_allow, non_low = 0;
2324   int r = 0;
2325   OnigCodePoint c;
2326   OnigEncoding enc = env->enc;
2327   UChar* p = *src;
2328   PFETCH_READY;
2329 
2330   syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL);
2331 
2332   if (PEND) {
2333     if (syn_allow)
2334       return 1;  /* "....{" : OK! */
2335     else
2336       return ONIGERR_END_PATTERN_AT_LEFT_BRACE;  /* "....{" syntax error */
2337   }
2338 
2339   if (! syn_allow) {
2340     c = PPEEK;
2341     if (c == ')' || c == '(' || c == '|') {
2342       return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
2343     }
2344   }
2345 
2346   low = onig_scan_unsigned_number(&p, end, env->enc);
2347   if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2348   if (low > ONIG_MAX_REPEAT_NUM)
2349     return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2350 
2351   if (p == *src) { /* can't read low */
2352     if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) {
2353       /* allow {,n} as {0,n} */
2354       low = 0;
2355       non_low = 1;
2356     }
2357     else
2358       goto invalid;
2359   }
2360 
2361   if (PEND) goto invalid;
2362   PFETCH(c);
2363   if (c == ',') {
2364     UChar* prev = p;
2365     up = onig_scan_unsigned_number(&p, end, env->enc);
2366     if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2367     if (up > ONIG_MAX_REPEAT_NUM)
2368       return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2369 
2370     if (p == prev) {
2371       if (non_low != 0)
2372 	goto invalid;
2373       up = REPEAT_INFINITE;  /* {n,} : {n,infinite} */
2374     }
2375   }
2376   else {
2377     if (non_low != 0)
2378       goto invalid;
2379 
2380     PUNFETCH;
2381     up = low;  /* {n} : exact n times */
2382     r = 2;     /* fixed */
2383   }
2384 
2385   if (PEND) goto invalid;
2386   PFETCH(c);
2387   if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) {
2388     if (c != MC_ESC(env->syntax)) goto invalid;
2389     PFETCH(c);
2390   }
2391   if (c != '}') goto invalid;
2392 
2393   if (!IS_REPEAT_INFINITE(up) && low > up) {
2394     return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE;
2395   }
2396 
2397   tok->type = TK_INTERVAL;
2398   tok->u.repeat.lower = low;
2399   tok->u.repeat.upper = up;
2400   *src = p;
2401   return r; /* 0: normal {n,m}, 2: fixed {n} */
2402 
2403  invalid:
2404   if (syn_allow)
2405     return 1;  /* OK */
2406   else
2407     return ONIGERR_INVALID_REPEAT_RANGE_PATTERN;
2408 }
2409 
2410 /* \M-, \C-, \c, or \... */
2411 static int
fetch_escaped_value(UChar ** src,UChar * end,ScanEnv * env)2412 fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env)
2413 {
2414   int v;
2415   OnigCodePoint c;
2416   OnigEncoding enc = env->enc;
2417   UChar* p = *src;
2418 
2419   if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2420 
2421   PFETCH_S(c);
2422   switch (c) {
2423   case 'M':
2424     if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) {
2425       if (PEND) return ONIGERR_END_PATTERN_AT_META;
2426       PFETCH_S(c);
2427       if (c != '-') return ONIGERR_META_CODE_SYNTAX;
2428       if (PEND) return ONIGERR_END_PATTERN_AT_META;
2429       PFETCH_S(c);
2430       if (c == MC_ESC(env->syntax)) {
2431         v = fetch_escaped_value(&p, end, env);
2432         if (v < 0) return v;
2433         c = (OnigCodePoint )v;
2434       }
2435       c = ((c & 0xff) | 0x80);
2436     }
2437     else
2438       goto backslash;
2439     break;
2440 
2441   case 'C':
2442     if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) {
2443       if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
2444       PFETCH_S(c);
2445       if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX;
2446       goto control;
2447     }
2448     else
2449       goto backslash;
2450 
2451   case 'c':
2452     if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) {
2453     control:
2454       if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
2455       PFETCH_S(c);
2456       if (c == '?') {
2457         c = 0177;
2458       }
2459       else {
2460         if (c == MC_ESC(env->syntax)) {
2461           v = fetch_escaped_value(&p, end, env);
2462           if (v < 0) return v;
2463           c = (OnigCodePoint )v;
2464         }
2465         c &= 0x9f;
2466       }
2467       break;
2468     }
2469     /* fall through */
2470 
2471   default:
2472     {
2473     backslash:
2474       c = conv_backslash_value(c, env);
2475     }
2476     break;
2477   }
2478 
2479   *src = p;
2480   return c;
2481 }
2482 
2483 static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env);
2484 
2485 static OnigCodePoint
get_name_end_code_point(OnigCodePoint start)2486 get_name_end_code_point(OnigCodePoint start)
2487 {
2488   switch (start) {
2489   case '<':  return (OnigCodePoint )'>'; break;
2490   case '\'': return (OnigCodePoint )'\''; break;
2491   default:
2492     break;
2493   }
2494 
2495   return (OnigCodePoint )0;
2496 }
2497 
2498 #ifdef USE_NAMED_GROUP
2499 #ifdef USE_BACKREF_WITH_LEVEL
2500 /*
2501    \k<name+n>, \k<name-n>
2502    \k<num+n>,  \k<num-n>
2503    \k<-num+n>, \k<-num-n>
2504 */
2505 static int
fetch_name_with_level(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,int * rlevel)2506 fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
2507 		      UChar** rname_end, ScanEnv* env,
2508 		      int* rback_num, int* rlevel)
2509 {
2510   int r, sign, is_num, exist_level;
2511   OnigCodePoint end_code;
2512   OnigCodePoint c = 0;
2513   OnigEncoding enc = env->enc;
2514   UChar *name_end;
2515   UChar *pnum_head;
2516   UChar *p = *src;
2517   PFETCH_READY;
2518 
2519   *rback_num = 0;
2520   is_num = exist_level = 0;
2521   sign = 1;
2522   pnum_head = *src;
2523 
2524   end_code = get_name_end_code_point(start_code);
2525 
2526   name_end = end;
2527   r = 0;
2528   if (PEND) {
2529     return ONIGERR_EMPTY_GROUP_NAME;
2530   }
2531   else {
2532     PFETCH(c);
2533     if (c == end_code)
2534       return ONIGERR_EMPTY_GROUP_NAME;
2535 
2536     if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2537       is_num = 1;
2538     }
2539     else if (c == '-') {
2540       is_num = 2;
2541       sign = -1;
2542       pnum_head = p;
2543     }
2544     else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2545       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2546     }
2547   }
2548 
2549   while (!PEND) {
2550     name_end = p;
2551     PFETCH(c);
2552     if (c == end_code || c == ')' || c == '+' || c == '-') {
2553       if (is_num == 2) 	r = ONIGERR_INVALID_GROUP_NAME;
2554       break;
2555     }
2556 
2557     if (is_num != 0) {
2558       if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2559         is_num = 1;
2560       }
2561       else {
2562         r = ONIGERR_INVALID_GROUP_NAME;
2563         is_num = 0;
2564       }
2565     }
2566     else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2567       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2568     }
2569   }
2570 
2571   if (r == 0 && c != end_code) {
2572     if (c == '+' || c == '-') {
2573       int level;
2574       int flag = (c == '-' ? -1 : 1);
2575 
2576       PFETCH(c);
2577       if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto err;
2578       PUNFETCH;
2579       level = onig_scan_unsigned_number(&p, end, enc);
2580       if (level < 0) return ONIGERR_TOO_BIG_NUMBER;
2581       *rlevel = (level * flag);
2582       exist_level = 1;
2583 
2584       PFETCH(c);
2585       if (c == end_code)
2586 	goto end;
2587     }
2588 
2589   err:
2590     r = ONIGERR_INVALID_GROUP_NAME;
2591     name_end = end;
2592   }
2593 
2594  end:
2595   if (r == 0) {
2596     if (is_num != 0) {
2597       *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2598       if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2599       else if (*rback_num == 0) goto err;
2600 
2601       *rback_num *= sign;
2602     }
2603 
2604     *rname_end = name_end;
2605     *src = p;
2606     return (exist_level ? 1 : 0);
2607   }
2608   else {
2609     onig_scan_env_set_error_string(env, r, *src, name_end);
2610     return r;
2611   }
2612 }
2613 #endif /* USE_BACKREF_WITH_LEVEL */
2614 
2615 /*
2616   def: 0 -> define name    (don't allow number name)
2617        1 -> reference name (allow number name)
2618 */
2619 static int
fetch_name(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,int ref)2620 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
2621 	   UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
2622 {
2623   int r, is_num, sign;
2624   OnigCodePoint end_code;
2625   OnigCodePoint c = 0;
2626   OnigEncoding enc = env->enc;
2627   UChar *name_end;
2628   UChar *pnum_head;
2629   UChar *p = *src;
2630 
2631   *rback_num = 0;
2632 
2633   end_code = get_name_end_code_point(start_code);
2634 
2635   name_end = end;
2636   pnum_head = *src;
2637   r = 0;
2638   is_num = 0;
2639   sign = 1;
2640   if (PEND) {
2641     return ONIGERR_EMPTY_GROUP_NAME;
2642   }
2643   else {
2644     PFETCH_S(c);
2645     if (c == end_code)
2646       return ONIGERR_EMPTY_GROUP_NAME;
2647 
2648     if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2649       if (ref == 1)
2650         is_num = 1;
2651       else {
2652         r = ONIGERR_INVALID_GROUP_NAME;
2653         is_num = 0;
2654       }
2655     }
2656     else if (c == '-') {
2657       if (ref == 1) {
2658         is_num = 2;
2659         sign = -1;
2660         pnum_head = p;
2661       }
2662       else {
2663         r = ONIGERR_INVALID_GROUP_NAME;
2664         is_num = 0;
2665       }
2666     }
2667     else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2668       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2669     }
2670   }
2671 
2672   if (r == 0) {
2673     while (!PEND) {
2674       name_end = p;
2675       PFETCH_S(c);
2676       if (c == end_code || c == ')') {
2677         if (is_num == 2) 	r = ONIGERR_INVALID_GROUP_NAME;
2678         break;
2679       }
2680 
2681       if (is_num != 0) {
2682         if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2683           is_num = 1;
2684         }
2685         else {
2686           if (!ONIGENC_IS_CODE_WORD(enc, c))
2687             r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2688           else
2689             r = ONIGERR_INVALID_GROUP_NAME;
2690           is_num = 0;
2691         }
2692       }
2693       else {
2694         if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2695           r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2696         }
2697       }
2698     }
2699 
2700     if (c != end_code) {
2701       r = ONIGERR_INVALID_GROUP_NAME;
2702       name_end = end;
2703     }
2704 
2705     if (is_num != 0) {
2706       *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2707       if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2708       else if (*rback_num == 0) {
2709         r = ONIGERR_INVALID_GROUP_NAME;
2710         goto err;
2711       }
2712 
2713       *rback_num *= sign;
2714     }
2715 
2716     *rname_end = name_end;
2717     *src = p;
2718     return 0;
2719   }
2720   else {
2721     while (!PEND) {
2722       name_end = p;
2723       PFETCH_S(c);
2724       if (c == end_code || c == ')')
2725         break;
2726     }
2727     if (PEND)
2728       name_end = end;
2729 
2730   err:
2731     onig_scan_env_set_error_string(env, r, *src, name_end);
2732     return r;
2733   }
2734 }
2735 #else
2736 static int
fetch_name(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,int ref)2737 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
2738 	   UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
2739 {
2740   int r, is_num, sign;
2741   OnigCodePoint end_code;
2742   OnigCodePoint c = 0;
2743   UChar *name_end;
2744   OnigEncoding enc = env->enc;
2745   UChar *pnum_head;
2746   UChar *p = *src;
2747   PFETCH_READY;
2748 
2749   *rback_num = 0;
2750 
2751   end_code = get_name_end_code_point(start_code);
2752 
2753   *rname_end = name_end = end;
2754   r = 0;
2755   pnum_head = *src;
2756   is_num = 0;
2757   sign = 1;
2758 
2759   if (PEND) {
2760     return ONIGERR_EMPTY_GROUP_NAME;
2761   }
2762   else {
2763     PFETCH(c);
2764     if (c == end_code)
2765       return ONIGERR_EMPTY_GROUP_NAME;
2766 
2767     if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2768       is_num = 1;
2769     }
2770     else if (c == '-') {
2771       is_num = 2;
2772       sign = -1;
2773       pnum_head = p;
2774     }
2775     else {
2776       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2777     }
2778   }
2779 
2780   while (!PEND) {
2781     name_end = p;
2782 
2783     PFETCH(c);
2784     if (c == end_code || c == ')') break;
2785     if (! ONIGENC_IS_CODE_DIGIT(enc, c))
2786       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2787   }
2788   if (r == 0 && c != end_code) {
2789     r = ONIGERR_INVALID_GROUP_NAME;
2790     name_end = end;
2791   }
2792 
2793   if (r == 0) {
2794     *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2795     if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2796     else if (*rback_num == 0) {
2797       r = ONIGERR_INVALID_GROUP_NAME;
2798       goto err;
2799     }
2800     *rback_num *= sign;
2801 
2802     *rname_end = name_end;
2803     *src = p;
2804     return 0;
2805   }
2806   else {
2807   err:
2808     onig_scan_env_set_error_string(env, r, *src, name_end);
2809     return r;
2810   }
2811 }
2812 #endif /* USE_NAMED_GROUP */
2813 
2814 static void
CC_ESC_WARN(ScanEnv * env,UChar * c)2815 CC_ESC_WARN(ScanEnv* env, UChar *c)
2816 {
2817   if (onig_warn == onig_null_warn) return ;
2818 
2819   if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) &&
2820       IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) {
2821     UChar buf[WARN_BUFSIZE];
2822     onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
2823 		env->pattern, env->pattern_end,
2824                 (UChar* )"character class has '%s' without escape", c);
2825     (*onig_warn)((char* )buf);
2826   }
2827 }
2828 
2829 static void
CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv * env,UChar * c)2830 CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c)
2831 {
2832   if (onig_warn == onig_null_warn) return ;
2833 
2834   if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) {
2835     UChar buf[WARN_BUFSIZE];
2836     onig_snprintf_with_pattern(buf, WARN_BUFSIZE, (env)->enc,
2837 		(env)->pattern, (env)->pattern_end,
2838 		(UChar* )"regular expression has '%s' without escape", c);
2839     (*onig_warn)((char* )buf);
2840   }
2841 }
2842 
2843 static UChar*
find_str_position(OnigCodePoint s[],int n,UChar * from,UChar * to,UChar ** next,OnigEncoding enc)2844 find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to,
2845 		  UChar **next, OnigEncoding enc)
2846 {
2847   int i;
2848   OnigCodePoint x;
2849   UChar *q;
2850   UChar *p = from;
2851 
2852   while (p < to) {
2853     x = ONIGENC_MBC_TO_CODE(enc, p, to);
2854     q = p + enclen(enc, p);
2855     if (x == s[0]) {
2856       for (i = 1; i < n && q < to; i++) {
2857 	x = ONIGENC_MBC_TO_CODE(enc, q, to);
2858 	if (x != s[i]) break;
2859 	q += enclen(enc, q);
2860       }
2861       if (i >= n) {
2862 	if (IS_NOT_NULL(next))
2863 	  *next = q;
2864 	return p;
2865       }
2866     }
2867     p = q;
2868   }
2869   return NULL_UCHARP;
2870 }
2871 
2872 static int
str_exist_check_with_esc(OnigCodePoint s[],int n,UChar * from,UChar * to,OnigCodePoint bad,OnigEncoding enc,OnigSyntaxType * syn)2873 str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to,
2874 		 OnigCodePoint bad, OnigEncoding enc, OnigSyntaxType* syn)
2875 {
2876   int i, in_esc;
2877   OnigCodePoint x;
2878   UChar *q;
2879   UChar *p = from;
2880 
2881   in_esc = 0;
2882   while (p < to) {
2883     if (in_esc) {
2884       in_esc = 0;
2885       p += enclen(enc, p);
2886     }
2887     else {
2888       x = ONIGENC_MBC_TO_CODE(enc, p, to);
2889       q = p + enclen(enc, p);
2890       if (x == s[0]) {
2891 	for (i = 1; i < n && q < to; i++) {
2892 	  x = ONIGENC_MBC_TO_CODE(enc, q, to);
2893 	  if (x != s[i]) break;
2894 	  q += enclen(enc, q);
2895 	}
2896 	if (i >= n) return 1;
2897 	p += enclen(enc, p);
2898       }
2899       else {
2900 	x = ONIGENC_MBC_TO_CODE(enc, p, to);
2901 	if (x == bad) return 0;
2902 	else if (x == MC_ESC(syn)) in_esc = 1;
2903 	p = q;
2904       }
2905     }
2906   }
2907   return 0;
2908 }
2909 
2910 static int
fetch_token_in_cc(OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)2911 fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
2912 {
2913   int num;
2914   OnigCodePoint c, c2;
2915   OnigSyntaxType* syn = env->syntax;
2916   OnigEncoding enc = env->enc;
2917   UChar* prev;
2918   UChar* p = *src;
2919   PFETCH_READY;
2920 
2921   if (PEND) {
2922     tok->type = TK_EOT;
2923     return tok->type;
2924   }
2925 
2926   PFETCH(c);
2927   tok->type = TK_CHAR;
2928   tok->base = 0;
2929   tok->u.c  = c;
2930   tok->escaped = 0;
2931 
2932   if (c == ']') {
2933     tok->type = TK_CC_CLOSE;
2934   }
2935   else if (c == '-') {
2936     tok->type = TK_CC_RANGE;
2937   }
2938   else if (c == MC_ESC(syn)) {
2939     if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC))
2940       goto end;
2941 
2942     if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2943 
2944     PFETCH(c);
2945     tok->escaped = 1;
2946     tok->u.c = c;
2947     switch (c) {
2948     case 'w':
2949       tok->type = TK_CHAR_TYPE;
2950       tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
2951       tok->u.prop.not   = 0;
2952       break;
2953     case 'W':
2954       tok->type = TK_CHAR_TYPE;
2955       tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
2956       tok->u.prop.not   = 1;
2957       break;
2958     case 'd':
2959       tok->type = TK_CHAR_TYPE;
2960       tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
2961       tok->u.prop.not   = 0;
2962       break;
2963     case 'D':
2964       tok->type = TK_CHAR_TYPE;
2965       tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
2966       tok->u.prop.not   = 1;
2967       break;
2968     case 's':
2969       tok->type = TK_CHAR_TYPE;
2970       tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
2971       tok->u.prop.not   = 0;
2972       break;
2973     case 'S':
2974       tok->type = TK_CHAR_TYPE;
2975       tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
2976       tok->u.prop.not   = 1;
2977       break;
2978     case 'h':
2979       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
2980       tok->type = TK_CHAR_TYPE;
2981       tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
2982       tok->u.prop.not   = 0;
2983       break;
2984     case 'H':
2985       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
2986       tok->type = TK_CHAR_TYPE;
2987       tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
2988       tok->u.prop.not   = 1;
2989       break;
2990 
2991     case 'p':
2992     case 'P':
2993       c2 = PPEEK;
2994       if (c2 == '{' &&
2995 	  IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
2996 	PINC;
2997 	tok->type = TK_CHAR_PROPERTY;
2998 	tok->u.prop.not = (c == 'P' ? 1 : 0);
2999 
3000 	if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
3001 	  PFETCH(c2);
3002 	  if (c2 == '^') {
3003 	    tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
3004 	  }
3005 	  else
3006 	    PUNFETCH;
3007 	}
3008       }
3009       break;
3010 
3011     case 'x':
3012       if (PEND) break;
3013 
3014       prev = p;
3015       if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3016 	PINC;
3017 	num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
3018 	if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3019 	if (!PEND) {
3020           c2 = PPEEK;
3021           if (ONIGENC_IS_CODE_XDIGIT(enc, c2))
3022             return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3023         }
3024 
3025 	if (p > prev + enclen(enc, prev) && !PEND && (PPEEK_IS('}'))) {
3026 	  PINC;
3027 	  tok->type   = TK_CODE_POINT;
3028 	  tok->base   = 16;
3029 	  tok->u.code = (OnigCodePoint )num;
3030 	}
3031 	else {
3032 	  /* can't read nothing or invalid format */
3033 	  p = prev;
3034 	}
3035       }
3036       else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3037 	num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
3038 	if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3039 	if (p == prev) {  /* can't read nothing. */
3040 	  num = 0; /* but, it's not error */
3041 	}
3042 	tok->type = TK_RAW_BYTE;
3043 	tok->base = 16;
3044 	tok->u.c  = num;
3045       }
3046       break;
3047 
3048     case 'u':
3049       if (PEND) break;
3050 
3051       prev = p;
3052       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3053 	num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
3054 	if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3055 	if (p == prev) {  /* can't read nothing. */
3056 	  num = 0; /* but, it's not error */
3057 	}
3058 	tok->type   = TK_CODE_POINT;
3059 	tok->base   = 16;
3060 	tok->u.code = (OnigCodePoint )num;
3061       }
3062       break;
3063 
3064     case '0':
3065     case '1': case '2': case '3': case '4': case '5': case '6': case '7':
3066       if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
3067 	PUNFETCH;
3068 	prev = p;
3069 	num = scan_unsigned_octal_number(&p, end, 3, enc);
3070 	if (num < 0 || num >= 256) return ONIGERR_TOO_BIG_NUMBER;
3071 	if (p == prev) {  /* can't read nothing. */
3072 	  num = 0; /* but, it's not error */
3073 	}
3074 	tok->type = TK_RAW_BYTE;
3075 	tok->base = 8;
3076 	tok->u.c  = num;
3077       }
3078       break;
3079 
3080     default:
3081       PUNFETCH;
3082       num = fetch_escaped_value(&p, end, env);
3083       if (num < 0) return num;
3084       if (tok->u.c != num) {
3085 	tok->u.code = (OnigCodePoint )num;
3086 	tok->type   = TK_CODE_POINT;
3087       }
3088       break;
3089     }
3090   }
3091   else if (c == '[') {
3092     if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) {
3093       OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' };
3094       tok->backp = p; /* point at '[' is readed */
3095       PINC;
3096       if (str_exist_check_with_esc(send, 2, p, end,
3097                                    (OnigCodePoint )']', enc, syn)) {
3098 	tok->type = TK_POSIX_BRACKET_OPEN;
3099       }
3100       else {
3101 	PUNFETCH;
3102 	goto cc_in_cc;
3103       }
3104     }
3105     else {
3106     cc_in_cc:
3107       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) {
3108 	tok->type = TK_CC_CC_OPEN;
3109       }
3110       else {
3111 	CC_ESC_WARN(env, (UChar* )"[");
3112       }
3113     }
3114   }
3115   else if (c == '&') {
3116     if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) &&
3117 	!PEND && (PPEEK_IS('&'))) {
3118       PINC;
3119       tok->type = TK_CC_AND;
3120     }
3121   }
3122 
3123  end:
3124   *src = p;
3125   return tok->type;
3126 }
3127 
3128 static int
fetch_token(OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)3129 fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
3130 {
3131   int r, num;
3132   OnigCodePoint c;
3133   OnigEncoding enc = env->enc;
3134   OnigSyntaxType* syn = env->syntax;
3135   UChar* prev;
3136   UChar* p = *src;
3137   PFETCH_READY;
3138 
3139  start:
3140   if (PEND) {
3141     tok->type = TK_EOT;
3142     return tok->type;
3143   }
3144 
3145   tok->type  = TK_STRING;
3146   tok->base  = 0;
3147   tok->backp = p;
3148 
3149   PFETCH(c);
3150   if (IS_MC_ESC_CODE(c, syn)) {
3151     if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
3152 
3153     tok->backp = p;
3154     PFETCH(c);
3155 
3156     tok->u.c = c;
3157     tok->escaped = 1;
3158     switch (c) {
3159     case '*':
3160       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break;
3161       tok->type = TK_OP_REPEAT;
3162       tok->u.repeat.lower = 0;
3163       tok->u.repeat.upper = REPEAT_INFINITE;
3164       goto greedy_check;
3165       break;
3166 
3167     case '+':
3168       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break;
3169       tok->type = TK_OP_REPEAT;
3170       tok->u.repeat.lower = 1;
3171       tok->u.repeat.upper = REPEAT_INFINITE;
3172       goto greedy_check;
3173       break;
3174 
3175     case '?':
3176       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break;
3177       tok->type = TK_OP_REPEAT;
3178       tok->u.repeat.lower = 0;
3179       tok->u.repeat.upper = 1;
3180     greedy_check:
3181       if (!PEND && PPEEK_IS('?') &&
3182 	  IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) {
3183 	PFETCH(c);
3184 	tok->u.repeat.greedy     = 0;
3185 	tok->u.repeat.possessive = 0;
3186       }
3187       else {
3188       possessive_check:
3189 	if (!PEND && PPEEK_IS('+') &&
3190 	    ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) &&
3191 	      tok->type != TK_INTERVAL)  ||
3192 	     (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) &&
3193 	      tok->type == TK_INTERVAL))) {
3194 	  PFETCH(c);
3195 	  tok->u.repeat.greedy     = 1;
3196 	  tok->u.repeat.possessive = 1;
3197 	}
3198 	else {
3199 	  tok->u.repeat.greedy     = 1;
3200 	  tok->u.repeat.possessive = 0;
3201 	}
3202       }
3203       break;
3204 
3205     case '{':
3206       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break;
3207       r = fetch_range_quantifier(&p, end, tok, env);
3208       if (r < 0) return r;  /* error */
3209       if (r == 0) goto greedy_check;
3210       else if (r == 2) { /* {n} */
3211 	if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3212 	  goto possessive_check;
3213 
3214 	goto greedy_check;
3215       }
3216       /* r == 1 : normal char */
3217       break;
3218 
3219     case '|':
3220       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break;
3221       tok->type = TK_ALT;
3222       break;
3223 
3224     case '(':
3225       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3226       tok->type = TK_SUBEXP_OPEN;
3227       break;
3228 
3229     case ')':
3230       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3231       tok->type = TK_SUBEXP_CLOSE;
3232       break;
3233 
3234     case 'w':
3235       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3236       tok->type = TK_CHAR_TYPE;
3237       tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3238       tok->u.prop.not   = 0;
3239       break;
3240 
3241     case 'W':
3242       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3243       tok->type = TK_CHAR_TYPE;
3244       tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3245       tok->u.prop.not   = 1;
3246       break;
3247 
3248     case 'b':
3249       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3250       tok->type = TK_ANCHOR;
3251       tok->u.anchor = ANCHOR_WORD_BOUND;
3252       break;
3253 
3254     case 'B':
3255       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3256       tok->type = TK_ANCHOR;
3257       tok->u.anchor = ANCHOR_NOT_WORD_BOUND;
3258       break;
3259 
3260 #ifdef USE_WORD_BEGIN_END
3261     case '<':
3262       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3263       tok->type = TK_ANCHOR;
3264       tok->u.anchor = ANCHOR_WORD_BEGIN;
3265       break;
3266 
3267     case '>':
3268       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3269       tok->type = TK_ANCHOR;
3270       tok->u.anchor = ANCHOR_WORD_END;
3271       break;
3272 #endif
3273 
3274     case 's':
3275       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3276       tok->type = TK_CHAR_TYPE;
3277       tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3278       tok->u.prop.not   = 0;
3279       break;
3280 
3281     case 'S':
3282       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3283       tok->type = TK_CHAR_TYPE;
3284       tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3285       tok->u.prop.not   = 1;
3286       break;
3287 
3288     case 'd':
3289       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3290       tok->type = TK_CHAR_TYPE;
3291       tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3292       tok->u.prop.not   = 0;
3293       break;
3294 
3295     case 'D':
3296       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3297       tok->type = TK_CHAR_TYPE;
3298       tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3299       tok->u.prop.not   = 1;
3300       break;
3301 
3302     case 'h':
3303       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3304       tok->type = TK_CHAR_TYPE;
3305       tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3306       tok->u.prop.not   = 0;
3307       break;
3308 
3309     case 'H':
3310       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3311       tok->type = TK_CHAR_TYPE;
3312       tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3313       tok->u.prop.not   = 1;
3314       break;
3315 
3316     case 'A':
3317       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3318     begin_buf:
3319       tok->type = TK_ANCHOR;
3320       tok->u.subtype = ANCHOR_BEGIN_BUF;
3321       break;
3322 
3323     case 'Z':
3324       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3325       tok->type = TK_ANCHOR;
3326       tok->u.subtype = ANCHOR_SEMI_END_BUF;
3327       break;
3328 
3329     case 'z':
3330       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3331     end_buf:
3332       tok->type = TK_ANCHOR;
3333       tok->u.subtype = ANCHOR_END_BUF;
3334       break;
3335 
3336     case 'G':
3337       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break;
3338       tok->type = TK_ANCHOR;
3339       tok->u.subtype = ANCHOR_BEGIN_POSITION;
3340       break;
3341 
3342     case '`':
3343       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3344       goto begin_buf;
3345       break;
3346 
3347     case '\'':
3348       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3349       goto end_buf;
3350       break;
3351 
3352     case 'x':
3353       if (PEND) break;
3354 
3355       prev = p;
3356       if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3357 	PINC;
3358 	num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
3359 	if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3360 	if (!PEND) {
3361           if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK))
3362             return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3363         }
3364 
3365 	if ((p > prev + enclen(enc, prev)) && !PEND && PPEEK_IS('}')) {
3366 	  PINC;
3367 	  tok->type   = TK_CODE_POINT;
3368 	  tok->u.code = (OnigCodePoint )num;
3369 	}
3370 	else {
3371 	  /* can't read nothing or invalid format */
3372 	  p = prev;
3373 	}
3374       }
3375       else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3376 	num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
3377 	if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3378 	if (p == prev) {  /* can't read nothing. */
3379 	  num = 0; /* but, it's not error */
3380 	}
3381 	tok->type = TK_RAW_BYTE;
3382 	tok->base = 16;
3383 	tok->u.c  = num;
3384       }
3385       break;
3386 
3387     case 'u':
3388       if (PEND) break;
3389 
3390       prev = p;
3391       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3392 	num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
3393 	if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3394 	if (p == prev) {  /* can't read nothing. */
3395 	  num = 0; /* but, it's not error */
3396 	}
3397 	tok->type   = TK_CODE_POINT;
3398 	tok->base   = 16;
3399 	tok->u.code = (OnigCodePoint )num;
3400       }
3401       break;
3402 
3403     case '1': case '2': case '3': case '4':
3404     case '5': case '6': case '7': case '8': case '9':
3405       PUNFETCH;
3406       prev = p;
3407       num = onig_scan_unsigned_number(&p, end, enc);
3408       if (num < 0 || num > ONIG_MAX_BACKREF_NUM) {
3409         goto skip_backref;
3410       }
3411 
3412       if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) &&
3413 	  (num <= env->num_mem || num <= 9)) { /* This spec. from GNU regex */
3414 	if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3415 	  if (num > env->num_mem || IS_NULL(SCANENV_MEM_NODES(env)[num]))
3416 	    return ONIGERR_INVALID_BACKREF;
3417 	}
3418 
3419 	tok->type = TK_BACKREF;
3420 	tok->u.backref.num     = 1;
3421 	tok->u.backref.ref1    = num;
3422 	tok->u.backref.by_name = 0;
3423 #ifdef USE_BACKREF_WITH_LEVEL
3424 	tok->u.backref.exist_level = 0;
3425 #endif
3426 	break;
3427       }
3428 
3429     skip_backref:
3430       if (c == '8' || c == '9') {
3431 	/* normal char */
3432 	p = prev; PINC;
3433 	break;
3434       }
3435 
3436       p = prev;
3437       /* fall through */
3438     case '0':
3439       if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
3440 	prev = p;
3441 	num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc);
3442 	if (num < 0 || num >= 256) return ONIGERR_TOO_BIG_NUMBER;
3443 	if (p == prev) {  /* can't read nothing. */
3444 	  num = 0; /* but, it's not error */
3445 	}
3446 	tok->type = TK_RAW_BYTE;
3447 	tok->base = 8;
3448 	tok->u.c  = num;
3449       }
3450       else if (c != '0') {
3451 	PINC;
3452       }
3453       break;
3454 
3455 #ifdef USE_NAMED_GROUP
3456     case 'k':
3457       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) {
3458 	PFETCH(c);
3459 	if (c == '<' || c == '\'') {
3460 	  UChar* name_end;
3461 	  int* backs;
3462 	  int back_num;
3463 
3464 	  prev = p;
3465 
3466 #ifdef USE_BACKREF_WITH_LEVEL
3467 	  name_end = NULL_UCHARP; /* no need. escape gcc warning. */
3468 	  r = fetch_name_with_level((OnigCodePoint )c, &p, end, &name_end,
3469 				    env, &back_num, &tok->u.backref.level);
3470 	  if (r == 1) tok->u.backref.exist_level = 1;
3471 	  else        tok->u.backref.exist_level = 0;
3472 #else
3473 	  r = fetch_name(&p, end, &name_end, env, &back_num, 1);
3474 #endif
3475 	  if (r < 0) return r;
3476 
3477 	  if (back_num != 0) {
3478 	    if (back_num < 0) {
3479 	      back_num = BACKREF_REL_TO_ABS(back_num, env);
3480 	      if (back_num <= 0)
3481 		return ONIGERR_INVALID_BACKREF;
3482 	    }
3483 
3484 	    if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3485 	      if (back_num > env->num_mem ||
3486 		  IS_NULL(SCANENV_MEM_NODES(env)[back_num]))
3487 		return ONIGERR_INVALID_BACKREF;
3488 	    }
3489 	    tok->type = TK_BACKREF;
3490 	    tok->u.backref.by_name = 0;
3491 	    tok->u.backref.num  = 1;
3492 	    tok->u.backref.ref1 = back_num;
3493 	  }
3494 	  else {
3495 	    num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs);
3496 	    if (num <= 0) {
3497 	      onig_scan_env_set_error_string(env,
3498 			     ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end);
3499 	      return ONIGERR_UNDEFINED_NAME_REFERENCE;
3500 	    }
3501 	    if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3502 	      int i;
3503 	      for (i = 0; i < num; i++) {
3504 		if (backs[i] > env->num_mem ||
3505 		    IS_NULL(SCANENV_MEM_NODES(env)[backs[i]]))
3506 		  return ONIGERR_INVALID_BACKREF;
3507 	      }
3508 	    }
3509 
3510 	    tok->type = TK_BACKREF;
3511 	    tok->u.backref.by_name = 1;
3512 	    if (num == 1) {
3513 	      tok->u.backref.num  = 1;
3514 	      tok->u.backref.ref1 = backs[0];
3515 	    }
3516 	    else {
3517 	      tok->u.backref.num  = num;
3518 	      tok->u.backref.refs = backs;
3519 	    }
3520 	  }
3521 	}
3522 	else
3523 	  PUNFETCH;
3524       }
3525       break;
3526 #endif
3527 
3528 #ifdef USE_SUBEXP_CALL
3529     case 'g':
3530       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) {
3531 	PFETCH(c);
3532 	if (c == '<' || c == '\'') {
3533 	  int gnum;
3534 	  UChar* name_end;
3535 
3536 	  prev = p;
3537 	  r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &gnum, 1);
3538 	  if (r < 0) return r;
3539 
3540 	  tok->type = TK_CALL;
3541 	  tok->u.call.name     = prev;
3542 	  tok->u.call.name_end = name_end;
3543 	  tok->u.call.gnum     = gnum;
3544 	}
3545 	else
3546 	  PUNFETCH;
3547       }
3548       break;
3549 #endif
3550 
3551     case 'Q':
3552       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) {
3553 	tok->type = TK_QUOTE_OPEN;
3554       }
3555       break;
3556 
3557     case 'p':
3558     case 'P':
3559       if (PPEEK_IS('{') &&
3560 	  IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
3561 	PINC;
3562 	tok->type = TK_CHAR_PROPERTY;
3563 	tok->u.prop.not = (c == 'P' ? 1 : 0);
3564 
3565 	if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
3566 	  PFETCH(c);
3567 	  if (c == '^') {
3568 	    tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
3569 	  }
3570 	  else
3571 	    PUNFETCH;
3572 	}
3573       }
3574       break;
3575 
3576     default:
3577       PUNFETCH;
3578       num = fetch_escaped_value(&p, end, env);
3579       if (num < 0) return num;
3580       /* set_raw: */
3581       if (tok->u.c != num) {
3582 	tok->type = TK_CODE_POINT;
3583 	tok->u.code = (OnigCodePoint )num;
3584       }
3585       else { /* string */
3586           int len;
3587           SAFE_ENC_LEN(enc, tok->backp, end, len);
3588           p = tok->backp + len;
3589       }
3590       break;
3591     }
3592   }
3593   else {
3594     tok->u.c = c;
3595     tok->escaped = 0;
3596 
3597 #ifdef USE_VARIABLE_META_CHARS
3598     if ((c != ONIG_INEFFECTIVE_META_CHAR) &&
3599 	IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) {
3600       if (c == MC_ANYCHAR(syn))
3601 	goto any_char;
3602       else if (c == MC_ANYTIME(syn))
3603 	goto anytime;
3604       else if (c == MC_ZERO_OR_ONE_TIME(syn))
3605 	goto zero_or_one_time;
3606       else if (c == MC_ONE_OR_MORE_TIME(syn))
3607 	goto one_or_more_time;
3608       else if (c == MC_ANYCHAR_ANYTIME(syn)) {
3609 	tok->type = TK_ANYCHAR_ANYTIME;
3610 	goto out;
3611       }
3612     }
3613 #endif
3614 
3615     switch (c) {
3616     case '.':
3617       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break;
3618 #ifdef USE_VARIABLE_META_CHARS
3619     any_char:
3620 #endif
3621       tok->type = TK_ANYCHAR;
3622       break;
3623 
3624     case '*':
3625       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break;
3626 #ifdef USE_VARIABLE_META_CHARS
3627     anytime:
3628 #endif
3629       tok->type = TK_OP_REPEAT;
3630       tok->u.repeat.lower = 0;
3631       tok->u.repeat.upper = REPEAT_INFINITE;
3632       goto greedy_check;
3633       break;
3634 
3635     case '+':
3636       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break;
3637 #ifdef USE_VARIABLE_META_CHARS
3638     one_or_more_time:
3639 #endif
3640       tok->type = TK_OP_REPEAT;
3641       tok->u.repeat.lower = 1;
3642       tok->u.repeat.upper = REPEAT_INFINITE;
3643       goto greedy_check;
3644       break;
3645 
3646     case '?':
3647       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break;
3648 #ifdef USE_VARIABLE_META_CHARS
3649     zero_or_one_time:
3650 #endif
3651       tok->type = TK_OP_REPEAT;
3652       tok->u.repeat.lower = 0;
3653       tok->u.repeat.upper = 1;
3654       goto greedy_check;
3655       break;
3656 
3657     case '{':
3658       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break;
3659       r = fetch_range_quantifier(&p, end, tok, env);
3660       if (r < 0) return r;  /* error */
3661       if (r == 0) goto greedy_check;
3662       else if (r == 2) { /* {n} */
3663 	if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3664 	  goto possessive_check;
3665 
3666 	goto greedy_check;
3667       }
3668       /* r == 1 : normal char */
3669       break;
3670 
3671     case '|':
3672       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break;
3673       tok->type = TK_ALT;
3674       break;
3675 
3676     case '(':
3677       if (PPEEK_IS('?') &&
3678           IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
3679         PINC;
3680         if (PPEEK_IS('#')) {
3681           PFETCH(c);
3682           while (1) {
3683             if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
3684             PFETCH(c);
3685             if (c == MC_ESC(syn)) {
3686               if (!PEND) PFETCH(c);
3687             }
3688             else {
3689               if (c == ')') break;
3690             }
3691           }
3692           goto start;
3693         }
3694         PUNFETCH;
3695       }
3696 
3697       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3698       tok->type = TK_SUBEXP_OPEN;
3699       break;
3700 
3701     case ')':
3702       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3703       tok->type = TK_SUBEXP_CLOSE;
3704       break;
3705 
3706     case '^':
3707       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
3708       tok->type = TK_ANCHOR;
3709       tok->u.subtype = (IS_SINGLELINE(env->option)
3710 			? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE);
3711       break;
3712 
3713     case '$':
3714       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
3715       tok->type = TK_ANCHOR;
3716       tok->u.subtype = (IS_SINGLELINE(env->option)
3717 			? ANCHOR_SEMI_END_BUF : ANCHOR_END_LINE);
3718       break;
3719 
3720     case '[':
3721       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break;
3722       tok->type = TK_CC_OPEN;
3723       break;
3724 
3725     case ']':
3726       if (*src > env->pattern)   /* /].../ is allowed. */
3727 	CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]");
3728       break;
3729 
3730     case '#':
3731       if (IS_EXTEND(env->option)) {
3732 	while (!PEND) {
3733 	  PFETCH(c);
3734 	  if (ONIGENC_IS_CODE_NEWLINE(enc, c))
3735 	    break;
3736 	}
3737 	goto start;
3738 	break;
3739       }
3740       break;
3741 
3742     case ' ': case '\t': case '\n': case '\r': case '\f':
3743       if (IS_EXTEND(env->option))
3744 	goto start;
3745       break;
3746 
3747     default:
3748       /* string */
3749       break;
3750     }
3751   }
3752 
3753 #ifdef USE_VARIABLE_META_CHARS
3754  out:
3755 #endif
3756   *src = p;
3757   return tok->type;
3758 }
3759 
3760 static int
add_ctype_to_cc_by_range(CClassNode * cc,int ctype ARG_UNUSED,int not,OnigEncoding enc ARG_UNUSED,OnigCodePoint sb_out,const OnigCodePoint mbr[])3761 add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not,
3762 			 OnigEncoding enc ARG_UNUSED,
3763                          OnigCodePoint sb_out, const OnigCodePoint mbr[])
3764 {
3765   int i, r;
3766   OnigCodePoint j;
3767 
3768   int n = ONIGENC_CODE_RANGE_NUM(mbr);
3769 
3770   if (not == 0) {
3771     for (i = 0; i < n; i++) {
3772       for (j  = ONIGENC_CODE_RANGE_FROM(mbr, i);
3773            j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) {
3774 	if (j >= sb_out) {
3775 	  if (j == ONIGENC_CODE_RANGE_TO(mbr, i)) i++;
3776 	  else if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) {
3777 	    r = add_code_range_to_buf(&(cc->mbuf), j,
3778 				      ONIGENC_CODE_RANGE_TO(mbr, i));
3779 	    if (r != 0) return r;
3780 	    i++;
3781 	  }
3782 
3783 	  goto sb_end;
3784 	}
3785         BITSET_SET_BIT(cc->bs, j);
3786       }
3787     }
3788 
3789   sb_end:
3790     for ( ; i < n; i++) {
3791       r = add_code_range_to_buf(&(cc->mbuf),
3792                                 ONIGENC_CODE_RANGE_FROM(mbr, i),
3793                                 ONIGENC_CODE_RANGE_TO(mbr, i));
3794       if (r != 0) return r;
3795     }
3796   }
3797   else {
3798     OnigCodePoint prev = 0;
3799 
3800     for (i = 0; i < n; i++) {
3801       for (j = prev;
3802 	   j < ONIGENC_CODE_RANGE_FROM(mbr, i); j++) {
3803 	if (j >= sb_out) {
3804 	  goto sb_end2;
3805 	}
3806 	BITSET_SET_BIT(cc->bs, j);
3807       }
3808       prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
3809     }
3810     for (j = prev; j < sb_out; j++) {
3811       BITSET_SET_BIT(cc->bs, j);
3812     }
3813 
3814   sb_end2:
3815     prev = sb_out;
3816 
3817     for (i = 0; i < n; i++) {
3818       if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) {
3819 	r = add_code_range_to_buf(&(cc->mbuf), prev,
3820                                   ONIGENC_CODE_RANGE_FROM(mbr, i) - 1);
3821 	if (r != 0) return r;
3822       }
3823       prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
3824     }
3825     if (prev < 0x7fffffff) {
3826       r = add_code_range_to_buf(&(cc->mbuf), prev, 0x7fffffff);
3827       if (r != 0) return r;
3828     }
3829   }
3830 
3831   return 0;
3832 }
3833 
3834 static int
add_ctype_to_cc(CClassNode * cc,int ctype,int not,ScanEnv * env)3835 add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env)
3836 {
3837   int c, r;
3838   const OnigCodePoint *ranges;
3839   OnigCodePoint sb_out;
3840   OnigEncoding enc = env->enc;
3841 
3842   r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges);
3843   if (r == 0) {
3844     return add_ctype_to_cc_by_range(cc, ctype, not, env->enc, sb_out, ranges);
3845   }
3846   else if (r != ONIG_NO_SUPPORT_CONFIG) {
3847     return r;
3848   }
3849 
3850   r = 0;
3851   switch (ctype) {
3852   case ONIGENC_CTYPE_ALPHA:
3853   case ONIGENC_CTYPE_BLANK:
3854   case ONIGENC_CTYPE_CNTRL:
3855   case ONIGENC_CTYPE_DIGIT:
3856   case ONIGENC_CTYPE_LOWER:
3857   case ONIGENC_CTYPE_PUNCT:
3858   case ONIGENC_CTYPE_SPACE:
3859   case ONIGENC_CTYPE_UPPER:
3860   case ONIGENC_CTYPE_XDIGIT:
3861   case ONIGENC_CTYPE_ASCII:
3862   case ONIGENC_CTYPE_ALNUM:
3863     if (not != 0) {
3864       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3865 	if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3866 	  BITSET_SET_BIT(cc->bs, c);
3867       }
3868       ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3869     }
3870     else {
3871       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3872 	if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3873 	  BITSET_SET_BIT(cc->bs, c);
3874       }
3875     }
3876     break;
3877 
3878   case ONIGENC_CTYPE_GRAPH:
3879   case ONIGENC_CTYPE_PRINT:
3880     if (not != 0) {
3881       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3882 	if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3883 	  BITSET_SET_BIT(cc->bs, c);
3884       }
3885     }
3886     else {
3887       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3888 	if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3889 	  BITSET_SET_BIT(cc->bs, c);
3890       }
3891       ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3892     }
3893     break;
3894 
3895   case ONIGENC_CTYPE_WORD:
3896     if (not == 0) {
3897       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3898 	if (IS_CODE_SB_WORD(enc, c)) BITSET_SET_BIT(cc->bs, c);
3899       }
3900       ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3901     }
3902     else {
3903       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3904         if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) /* check invalid code point */
3905 	    && ! ONIGENC_IS_CODE_WORD(enc, c))
3906 	  BITSET_SET_BIT(cc->bs, c);
3907       }
3908     }
3909     break;
3910 
3911   default:
3912     return ONIGERR_PARSER_BUG;
3913     break;
3914   }
3915 
3916   return r;
3917 }
3918 
3919 static int
parse_posix_bracket(CClassNode * cc,UChar ** src,UChar * end,ScanEnv * env)3920 parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env)
3921 {
3922 #define POSIX_BRACKET_CHECK_LIMIT_LENGTH  20
3923 #define POSIX_BRACKET_NAME_MIN_LEN         4
3924 
3925   static PosixBracketEntryType PBS[] = {
3926     { (UChar* )"alnum",  ONIGENC_CTYPE_ALNUM,  5 },
3927     { (UChar* )"alpha",  ONIGENC_CTYPE_ALPHA,  5 },
3928     { (UChar* )"blank",  ONIGENC_CTYPE_BLANK,  5 },
3929     { (UChar* )"cntrl",  ONIGENC_CTYPE_CNTRL,  5 },
3930     { (UChar* )"digit",  ONIGENC_CTYPE_DIGIT,  5 },
3931     { (UChar* )"graph",  ONIGENC_CTYPE_GRAPH,  5 },
3932     { (UChar* )"lower",  ONIGENC_CTYPE_LOWER,  5 },
3933     { (UChar* )"print",  ONIGENC_CTYPE_PRINT,  5 },
3934     { (UChar* )"punct",  ONIGENC_CTYPE_PUNCT,  5 },
3935     { (UChar* )"space",  ONIGENC_CTYPE_SPACE,  5 },
3936     { (UChar* )"upper",  ONIGENC_CTYPE_UPPER,  5 },
3937     { (UChar* )"xdigit", ONIGENC_CTYPE_XDIGIT, 6 },
3938     { (UChar* )"ascii",  ONIGENC_CTYPE_ASCII,  5 },
3939     { (UChar* )"word",   ONIGENC_CTYPE_WORD,   4 },
3940     { (UChar* )NULL,     -1, 0 }
3941   };
3942 
3943   PosixBracketEntryType *pb;
3944   int not, i, r;
3945   OnigCodePoint c;
3946   OnigEncoding enc = env->enc;
3947   UChar *p = *src;
3948 
3949   if (PPEEK_IS('^')) {
3950     PINC_S;
3951     not = 1;
3952   }
3953   else
3954     not = 0;
3955 
3956   if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MIN_LEN + 3)
3957     goto not_posix_bracket;
3958 
3959   for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
3960     if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) {
3961       p = (UChar* )onigenc_step(enc, p, end, pb->len);
3962       if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0)
3963         return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
3964 
3965       r = add_ctype_to_cc(cc, pb->ctype, not, env);
3966       if (r != 0) return r;
3967 
3968       PINC_S; PINC_S;
3969       *src = p;
3970       return 0;
3971     }
3972   }
3973 
3974  not_posix_bracket:
3975   c = 0;
3976   i = 0;
3977   while (!PEND && ((c = PPEEK) != ':') && c != ']') {
3978     PINC_S;
3979     if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break;
3980   }
3981   if (c == ':' && ! PEND) {
3982     PINC_S;
3983     if (! PEND) {
3984       PFETCH_S(c);
3985       if (c == ']')
3986         return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
3987     }
3988   }
3989 
3990   return 1;  /* 1: is not POSIX bracket, but no error. */
3991 }
3992 
3993 static int
fetch_char_property_to_ctype(UChar ** src,UChar * end,ScanEnv * env)3994 fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env)
3995 {
3996   int r;
3997   OnigCodePoint c;
3998   OnigEncoding enc = env->enc;
3999   UChar *prev, *start, *p = *src;
4000 
4001   r = 0;
4002   start = prev = p;
4003 
4004   while (!PEND) {
4005     prev = p;
4006     PFETCH_S(c);
4007     if (c == '}') {
4008       r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev);
4009       if (r < 0) break;
4010 
4011       *src = p;
4012       return r;
4013     }
4014     else if (c == '(' || c == ')' || c == '{' || c == '|') {
4015       r = ONIGERR_INVALID_CHAR_PROPERTY_NAME;
4016       break;
4017     }
4018   }
4019 
4020   onig_scan_env_set_error_string(env, r, *src, prev);
4021   return r;
4022 }
4023 
4024 static int
parse_char_property(Node ** np,OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)4025 parse_char_property(Node** np, OnigToken* tok, UChar** src, UChar* end,
4026 		    ScanEnv* env)
4027 {
4028   int r, ctype;
4029   CClassNode* cc;
4030 
4031   ctype = fetch_char_property_to_ctype(src, end, env);
4032   if (ctype < 0) return ctype;
4033 
4034   *np = node_new_cclass();
4035   CHECK_NULL_RETURN_MEMERR(*np);
4036   cc = NCCLASS(*np);
4037   r = add_ctype_to_cc(cc, ctype, 0, env);
4038   if (r != 0) return r;
4039   if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
4040 
4041   return 0;
4042 }
4043 
4044 
/* Parser state while reading the items of a character class. */
enum CCSTATE {
  CCS_VALUE    = 0,  /* an item has been read; a single value may be pending */
  CCS_RANGE    = 1,  /* a value followed by '-' was read; range end expected */
  CCS_COMPLETE = 2,  /* a range has just been completed */
  CCS_START    = 3   /* nothing read yet (class start, or just after "&&") */
};
4051 
/* Kind of the most recently read character class item. */
enum CCVALTYPE {
  CCV_SB         = 0,  /* single value recorded in the bitset */
  CCV_CODE_POINT = 1,  /* value recorded through add_code_range() */
  CCV_CLASS      = 2   /* a class item (ctype, property, POSIX bracket) */
};
4057 
4058 static int
next_state_class(CClassNode * cc,OnigCodePoint * vs,enum CCVALTYPE * type,enum CCSTATE * state,ScanEnv * env)4059 next_state_class(CClassNode* cc, OnigCodePoint* vs, enum CCVALTYPE* type,
4060 		 enum CCSTATE* state, ScanEnv* env)
4061 {
4062   int r;
4063 
4064   if (*state == CCS_RANGE)
4065     return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE;
4066 
4067   if (*state == CCS_VALUE && *type != CCV_CLASS) {
4068     if (*type == CCV_SB)
4069       BITSET_SET_BIT(cc->bs, (int )(*vs));
4070     else if (*type == CCV_CODE_POINT) {
4071       r = add_code_range(&(cc->mbuf), env, *vs, *vs);
4072       if (r < 0) return r;
4073     }
4074   }
4075 
4076   if (*state != CCS_START)
4077     *state = CCS_VALUE;
4078 
4079   *type  = CCV_CLASS;
4080   return 0;
4081 }
4082 
4083 static int
next_state_val(CClassNode * cc,OnigCodePoint * vs,OnigCodePoint v,int * vs_israw,int v_israw,enum CCVALTYPE intype,enum CCVALTYPE * type,enum CCSTATE * state,ScanEnv * env)4084 next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v,
4085 	       int* vs_israw, int v_israw,
4086 	       enum CCVALTYPE intype, enum CCVALTYPE* type,
4087 	       enum CCSTATE* state, ScanEnv* env)
4088 {
4089   int r;
4090 
4091   switch (*state) {
4092   case CCS_VALUE:
4093     if (*type == CCV_SB)
4094     {
4095     if (*vs > 0xff)
4096       return ONIGERR_INVALID_CODE_POINT_VALUE;
4097       BITSET_SET_BIT(cc->bs, (int )(*vs));
4098     }
4099     else if (*type == CCV_CODE_POINT) {
4100       r = add_code_range(&(cc->mbuf), env, *vs, *vs);
4101       if (r < 0) return r;
4102     }
4103     break;
4104 
4105   case CCS_RANGE:
4106     if (intype == *type) {
4107       if (intype == CCV_SB) {
4108         if (*vs > 0xff || v > 0xff)
4109           return ONIGERR_INVALID_CODE_POINT_VALUE;
4110 
4111 	if (*vs > v) {
4112 	  if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
4113 	    goto ccs_range_end;
4114 	  else
4115 	    return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
4116 	}
4117 	bitset_set_range(cc->bs, (int )*vs, (int )v);
4118       }
4119       else {
4120 	r = add_code_range(&(cc->mbuf), env, *vs, v);
4121 	if (r < 0) return r;
4122       }
4123     }
4124     else {
4125 #if 0
4126       if (intype == CCV_CODE_POINT && *type == CCV_SB) {
4127 #endif
4128 	if (*vs > v) {
4129 	  if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
4130 	    goto ccs_range_end;
4131 	  else
4132 	    return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
4133 	}
4134 	bitset_set_range(cc->bs, (int )*vs, (int )(v < 0xff ? v : 0xff));
4135 	r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v);
4136 	if (r < 0) return r;
4137 #if 0
4138       }
4139       else
4140 	return ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE;
4141 #endif
4142     }
4143   ccs_range_end:
4144     *state = CCS_COMPLETE;
4145     break;
4146 
4147   case CCS_COMPLETE:
4148   case CCS_START:
4149     *state = CCS_VALUE;
4150     break;
4151 
4152   default:
4153     break;
4154   }
4155 
4156   *vs_israw = v_israw;
4157   *vs       = v;
4158   *type     = intype;
4159   return 0;
4160 }
4161 
4162 static int
code_exist_check(OnigCodePoint c,UChar * from,UChar * end,int ignore_escaped,ScanEnv * env)4163 code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped,
4164 		 ScanEnv* env)
4165 {
4166   int in_esc;
4167   OnigCodePoint code;
4168   OnigEncoding enc = env->enc;
4169   UChar* p = from;
4170 
4171   in_esc = 0;
4172   while (! PEND) {
4173     if (ignore_escaped && in_esc) {
4174       in_esc = 0;
4175     }
4176     else {
4177       PFETCH_S(code);
4178       if (code == c) return 1;
4179       if (code == MC_ESC(env->syntax)) in_esc = 1;
4180     }
4181   }
4182   return 0;
4183 }
4184 
4185 static int
parse_char_class(Node ** np,OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)4186 parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
4187 		 ScanEnv* env)
4188 {
4189   int r, neg, len, fetched, and_start;
4190   OnigCodePoint v, vs;
4191   UChar *p;
4192   Node* node;
4193   CClassNode *cc, *prev_cc;
4194   CClassNode work_cc;
4195 
4196   enum CCSTATE state;
4197   enum CCVALTYPE val_type, in_type;
4198   int val_israw, in_israw;
4199 
4200   prev_cc = (CClassNode* )NULL;
4201   *np = NULL_NODE;
4202   r = fetch_token_in_cc(tok, src, end, env);
4203   if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) {
4204     neg = 1;
4205     r = fetch_token_in_cc(tok, src, end, env);
4206   }
4207   else {
4208     neg = 0;
4209   }
4210 
4211   if (r < 0) return r;
4212   if (r == TK_CC_CLOSE) {
4213     if (! code_exist_check((OnigCodePoint )']',
4214                            *src, env->pattern_end, 1, env))
4215       return ONIGERR_EMPTY_CHAR_CLASS;
4216 
4217     CC_ESC_WARN(env, (UChar* )"]");
4218     r = tok->type = TK_CHAR;  /* allow []...] */
4219   }
4220 
4221   *np = node = node_new_cclass();
4222   CHECK_NULL_RETURN_MEMERR(node);
4223   cc = NCCLASS(node);
4224 
4225   and_start = 0;
4226   state = CCS_START;
4227   p = *src;
4228   while (r != TK_CC_CLOSE) {
4229     fetched = 0;
4230     switch (r) {
4231     case TK_CHAR:
4232       len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c);
4233       if (len > 1) {
4234 	in_type = CCV_CODE_POINT;
4235       }
4236       else if (len < 0) {
4237 	r = len;
4238 	goto err;
4239       }
4240       else {
4241       sb_char:
4242 	in_type = CCV_SB;
4243       }
4244       v = (OnigCodePoint )tok->u.c;
4245       in_israw = 0;
4246       goto val_entry2;
4247       break;
4248 
4249     case TK_RAW_BYTE:
4250       /* tok->base != 0 : octal or hexadec. */
4251       if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) {
4252 	UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4253 	UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN;
4254 	UChar* psave = p;
4255 	int i, base = tok->base;
4256 
4257 	buf[0] = tok->u.c;
4258 	for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) {
4259 	  r = fetch_token_in_cc(tok, &p, end, env);
4260 	  if (r < 0) goto err;
4261 	  if (r != TK_RAW_BYTE || tok->base != base) {
4262 	    fetched = 1;
4263 	    break;
4264 	  }
4265 	  buf[i] = tok->u.c;
4266 	}
4267 
4268 	if (i < ONIGENC_MBC_MINLEN(env->enc)) {
4269 	  r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4270 	  goto err;
4271 	}
4272 
4273 	len = enclen(env->enc, buf);
4274 	if (i < len) {
4275 	  r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4276 	  goto err;
4277 	}
4278 	else if (i > len) { /* fetch back */
4279 	  p = psave;
4280 	  for (i = 1; i < len; i++) {
4281 	    r = fetch_token_in_cc(tok, &p, end, env);
4282 	  }
4283 	  fetched = 0;
4284 	}
4285 
4286 	if (i == 1) {
4287 	  v = (OnigCodePoint )buf[0];
4288 	  goto raw_single;
4289 	}
4290 	else {
4291 	  v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe);
4292 	  in_type = CCV_CODE_POINT;
4293 	}
4294       }
4295       else {
4296 	v = (OnigCodePoint )tok->u.c;
4297       raw_single:
4298 	in_type = CCV_SB;
4299       }
4300       in_israw = 1;
4301       goto val_entry2;
4302       break;
4303 
4304     case TK_CODE_POINT:
4305       v = tok->u.code;
4306       in_israw = 1;
4307     val_entry:
4308       len = ONIGENC_CODE_TO_MBCLEN(env->enc, v);
4309       if (len < 0) {
4310 	r = len;
4311 	goto err;
4312       }
4313       in_type = (len == 1 ? CCV_SB : CCV_CODE_POINT);
4314     val_entry2:
4315       r = next_state_val(cc, &vs, v, &val_israw, in_israw, in_type, &val_type,
4316 			 &state, env);
4317       if (r != 0) goto err;
4318       break;
4319 
4320     case TK_POSIX_BRACKET_OPEN:
4321       r = parse_posix_bracket(cc, &p, end, env);
4322       if (r < 0) goto err;
4323       if (r == 1) {  /* is not POSIX bracket */
4324 	CC_ESC_WARN(env, (UChar* )"[");
4325 	p = tok->backp;
4326 	v = (OnigCodePoint )tok->u.c;
4327 	in_israw = 0;
4328 	goto val_entry;
4329       }
4330       goto next_class;
4331       break;
4332 
4333     case TK_CHAR_TYPE:
4334       r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, env);
4335       if (r != 0) return r;
4336 
4337     next_class:
4338       r = next_state_class(cc, &vs, &val_type, &state, env);
4339       if (r != 0) goto err;
4340       break;
4341 
4342     case TK_CHAR_PROPERTY:
4343       {
4344 	int ctype;
4345 
4346 	ctype = fetch_char_property_to_ctype(&p, end, env);
4347 	if (ctype < 0) return ctype;
4348 	r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, env);
4349 	if (r != 0) return r;
4350 	goto next_class;
4351       }
4352       break;
4353 
4354     case TK_CC_RANGE:
4355       if (state == CCS_VALUE) {
4356 	r = fetch_token_in_cc(tok, &p, end, env);
4357 	if (r < 0) goto err;
4358 	fetched = 1;
4359 	if (r == TK_CC_CLOSE) { /* allow [x-] */
4360 	range_end_val:
4361 	  v = (OnigCodePoint )'-';
4362 	  in_israw = 0;
4363 	  goto val_entry;
4364 	}
4365 	else if (r == TK_CC_AND) {
4366 	  CC_ESC_WARN(env, (UChar* )"-");
4367 	  goto range_end_val;
4368 	}
4369 	state = CCS_RANGE;
4370       }
4371       else if (state == CCS_START) {
4372 	/* [-xa] is allowed */
4373 	v = (OnigCodePoint )tok->u.c;
4374 	in_israw = 0;
4375 
4376 	r = fetch_token_in_cc(tok, &p, end, env);
4377 	if (r < 0) goto err;
4378 	fetched = 1;
4379 	/* [--x] or [a&&-x] is warned. */
4380 	if (r == TK_CC_RANGE || and_start != 0)
4381 	  CC_ESC_WARN(env, (UChar* )"-");
4382 
4383 	goto val_entry;
4384       }
4385       else if (state == CCS_RANGE) {
4386 	CC_ESC_WARN(env, (UChar* )"-");
4387 	goto sb_char;  /* [!--x] is allowed */
4388       }
4389       else { /* CCS_COMPLETE */
4390 	r = fetch_token_in_cc(tok, &p, end, env);
4391 	if (r < 0) goto err;
4392 	fetched = 1;
4393 	if (r == TK_CC_CLOSE) goto range_end_val; /* allow [a-b-] */
4394 	else if (r == TK_CC_AND) {
4395 	  CC_ESC_WARN(env, (UChar* )"-");
4396 	  goto range_end_val;
4397 	}
4398 
4399 	if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) {
4400 	  CC_ESC_WARN(env, (UChar* )"-");
4401 	  goto sb_char;   /* [0-9-a] is allowed as [0-9\-a] */
4402 	}
4403 	r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
4404 	goto err;
4405       }
4406       break;
4407 
4408     case TK_CC_CC_OPEN: /* [ */
4409       {
4410 	Node *anode;
4411 	CClassNode* acc;
4412 
4413 	r = parse_char_class(&anode, tok, &p, end, env);
4414 	if (r != 0) goto cc_open_err;
4415 	acc = NCCLASS(anode);
4416 	r = or_cclass(cc, acc, env->enc);
4417 
4418 	onig_node_free(anode);
4419       cc_open_err:
4420 	if (r != 0) goto err;
4421       }
4422       break;
4423 
4424     case TK_CC_AND: /* && */
4425       {
4426 	if (state == CCS_VALUE) {
4427 	  r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
4428 			     &val_type, &state, env);
4429 	  if (r != 0) goto err;
4430 	}
4431 	/* initialize local variables */
4432 	and_start = 1;
4433 	state = CCS_START;
4434 
4435 	if (IS_NOT_NULL(prev_cc)) {
4436 	  r = and_cclass(prev_cc, cc, env->enc);
4437 	  if (r != 0) goto err;
4438 	  bbuf_free(cc->mbuf);
4439 	}
4440 	else {
4441 	  prev_cc = cc;
4442 	  cc = &work_cc;
4443 	}
4444 	initialize_cclass(cc);
4445       }
4446       break;
4447 
4448     case TK_EOT:
4449       r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS;
4450       goto err;
4451       break;
4452     default:
4453       r = ONIGERR_PARSER_BUG;
4454       goto err;
4455       break;
4456     }
4457 
4458     if (fetched)
4459       r = tok->type;
4460     else {
4461       r = fetch_token_in_cc(tok, &p, end, env);
4462       if (r < 0) goto err;
4463     }
4464   }
4465 
4466   if (state == CCS_VALUE) {
4467     r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
4468 		       &val_type, &state, env);
4469     if (r != 0) goto err;
4470   }
4471 
4472   if (IS_NOT_NULL(prev_cc)) {
4473     r = and_cclass(prev_cc, cc, env->enc);
4474     if (r != 0) goto err;
4475     bbuf_free(cc->mbuf);
4476     cc = prev_cc;
4477   }
4478 
4479   if (neg != 0)
4480     NCCLASS_SET_NOT(cc);
4481   else
4482     NCCLASS_CLEAR_NOT(cc);
4483   if (IS_NCCLASS_NOT(cc) &&
4484       IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) {
4485     int is_empty;
4486 
4487     is_empty = (IS_NULL(cc->mbuf) ? 1 : 0);
4488     if (is_empty != 0)
4489       BITSET_IS_EMPTY(cc->bs, is_empty);
4490 
4491     if (is_empty == 0) {
4492 #define NEWLINE_CODE    0x0a
4493 
4494       if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) {
4495         if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1)
4496           BITSET_SET_BIT(cc->bs, NEWLINE_CODE);
4497         else
4498           add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE);
4499       }
4500     }
4501   }
4502   *src = p;
4503   return 0;
4504 
4505  err:
4506   if (cc != NCCLASS(*np))
4507     bbuf_free(cc->mbuf);
4508   onig_node_free(*np);
4509   return r;
4510 }
4511 
4512 static int parse_subexp(Node** top, OnigToken* tok, int term,
4513 			UChar** src, UChar* end, ScanEnv* env);
4514 
4515 static int
parse_enclose(Node ** np,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)4516 parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
4517 	      ScanEnv* env)
4518 {
4519   int r, num;
4520   Node *target;
4521   OnigOptionType option;
4522   OnigCodePoint c;
4523   OnigEncoding enc = env->enc;
4524 
4525 #ifdef USE_NAMED_GROUP
4526   int list_capture;
4527 #endif
4528 
4529   UChar* p = *src;
4530   PFETCH_READY;
4531 
4532   *np = NULL;
4533   if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
4534 
4535   option = env->option;
4536   if (PPEEK_IS('?') &&
4537       IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
4538     PINC;
4539     if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
4540 
4541     PFETCH(c);
4542     switch (c) {
4543     case ':':   /* (?:...) grouping only */
4544     group:
4545       r = fetch_token(tok, &p, end, env);
4546       if (r < 0) return r;
4547       r = parse_subexp(np, tok, term, &p, end, env);
4548       if (r < 0) return r;
4549       *src = p;
4550       return 1; /* group */
4551       break;
4552 
4553     case '=':
4554       *np = onig_node_new_anchor(ANCHOR_PREC_READ);
4555       break;
4556     case '!':  /*         preceding read */
4557       *np = onig_node_new_anchor(ANCHOR_PREC_READ_NOT);
4558       break;
4559     case '>':            /* (?>...) stop backtrack */
4560       *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
4561       break;
4562 
4563 #ifdef USE_NAMED_GROUP
4564     case '\'':
4565       if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4566 	goto named_group1;
4567       }
4568       else
4569 	return ONIGERR_UNDEFINED_GROUP_OPTION;
4570       break;
4571 #endif
4572 
4573     case '<':   /* look behind (?<=...), (?<!...) */
4574       PFETCH(c);
4575       if (c == '=')
4576 	*np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND);
4577       else if (c == '!')
4578 	*np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND_NOT);
4579 #ifdef USE_NAMED_GROUP
4580       else {
4581 	if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4582 	  UChar *name;
4583 	  UChar *name_end;
4584 
4585 	  PUNFETCH;
4586 	  c = '<';
4587 
4588 	named_group1:
4589 	  list_capture = 0;
4590 
4591 	named_group2:
4592 	  name = p;
4593 	  r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0);
4594 	  if (r < 0) return r;
4595 
4596 	  num = scan_env_add_mem_entry(env);
4597 	  if (num < 0) return num;
4598 	  if (list_capture != 0 && num >= (int )BIT_STATUS_BITS_NUM)
4599 	    return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
4600 
4601 	  r = name_add(env->reg, name, name_end, num, env);
4602 	  if (r != 0) return r;
4603 	  *np = node_new_enclose_memory(env->option, 1);
4604 	  CHECK_NULL_RETURN_MEMERR(*np);
4605 	  NENCLOSE(*np)->regnum = num;
4606 	  if (list_capture != 0)
4607 	    BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
4608 	  env->num_named++;
4609 	}
4610 	else {
4611 	  return ONIGERR_UNDEFINED_GROUP_OPTION;
4612 	}
4613       }
4614 #else
4615       else {
4616 	return ONIGERR_UNDEFINED_GROUP_OPTION;
4617       }
4618 #endif
4619       break;
4620 
4621     case '@':
4622       if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) {
4623 #ifdef USE_NAMED_GROUP
4624 	if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4625 	  PFETCH(c);
4626 	  if (c == '<' || c == '\'') {
4627 	    list_capture = 1;
4628 	    goto named_group2; /* (?@<name>...) */
4629 	  }
4630 	  PUNFETCH;
4631 	}
4632 #endif
4633 	*np = node_new_enclose_memory(env->option, 0);
4634 	CHECK_NULL_RETURN_MEMERR(*np);
4635 	num = scan_env_add_mem_entry(env);
4636 	if (num < 0) {
4637 	  onig_node_free(*np);
4638 	  return num;
4639 	}
4640 	else if (num >= (int )BIT_STATUS_BITS_NUM) {
4641 	  onig_node_free(*np);
4642 	  return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
4643 	}
4644 	NENCLOSE(*np)->regnum = num;
4645 	BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
4646       }
4647       else {
4648 	return ONIGERR_UNDEFINED_GROUP_OPTION;
4649       }
4650       break;
4651 
4652 #ifdef USE_POSIXLINE_OPTION
4653     case 'p':
4654 #endif
4655     case '-': case 'i': case 'm': case 's': case 'x':
4656       {
4657 	int neg = 0;
4658 
4659 	while (1) {
4660 	  switch (c) {
4661 	  case ':':
4662 	  case ')':
4663 	  break;
4664 
4665 	  case '-':  neg = 1; break;
4666 	  case 'x':  ONOFF(option, ONIG_OPTION_EXTEND,     neg); break;
4667 	  case 'i':  ONOFF(option, ONIG_OPTION_IGNORECASE, neg); break;
4668 	  case 's':
4669 	    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
4670 	      ONOFF(option, ONIG_OPTION_MULTILINE,  neg);
4671 	    }
4672 	    else
4673 	      return ONIGERR_UNDEFINED_GROUP_OPTION;
4674 	    break;
4675 
4676 	  case 'm':
4677 	    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
4678 	      ONOFF(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 1 : 0));
4679 	    }
4680 	    else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) {
4681 	      ONOFF(option, ONIG_OPTION_MULTILINE,  neg);
4682 	    }
4683 	    else
4684 	      return ONIGERR_UNDEFINED_GROUP_OPTION;
4685 	    break;
4686 #ifdef USE_POSIXLINE_OPTION
4687 	  case 'p':
4688 	    ONOFF(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg);
4689 	    break;
4690 #endif
4691 	  default:
4692 	    return ONIGERR_UNDEFINED_GROUP_OPTION;
4693 	  }
4694 
4695 	  if (c == ')') {
4696 	    *np = node_new_option(option);
4697 	    CHECK_NULL_RETURN_MEMERR(*np);
4698 	    *src = p;
4699 	    return 2; /* option only */
4700 	  }
4701 	  else if (c == ':') {
4702 	    OnigOptionType prev = env->option;
4703 
4704 	    env->option     = option;
4705 	    r = fetch_token(tok, &p, end, env);
4706 	    if (r < 0) return r;
4707 	    r = parse_subexp(&target, tok, term, &p, end, env);
4708 	    env->option = prev;
4709 	    if (r < 0) return r;
4710 	    *np = node_new_option(option);
4711 	    CHECK_NULL_RETURN_MEMERR(*np);
4712 	    NENCLOSE(*np)->target = target;
4713 	    *src = p;
4714 	    return 0;
4715 	  }
4716 
4717 	  if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
4718 	  PFETCH(c);
4719 	}
4720       }
4721       break;
4722 
4723     default:
4724       return ONIGERR_UNDEFINED_GROUP_OPTION;
4725     }
4726   }
4727   else {
4728     if (ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_DONT_CAPTURE_GROUP))
4729       goto group;
4730 
4731     *np = node_new_enclose_memory(env->option, 0);
4732     CHECK_NULL_RETURN_MEMERR(*np);
4733     num = scan_env_add_mem_entry(env);
4734     if (num < 0) return num;
4735     NENCLOSE(*np)->regnum = num;
4736   }
4737 
4738   CHECK_NULL_RETURN_MEMERR(*np);
4739   r = fetch_token(tok, &p, end, env);
4740   if (r < 0) return r;
4741   r = parse_subexp(&target, tok, term, &p, end, env);
4742   if (r < 0) return r;
4743 
4744   if (NTYPE(*np) == NT_ANCHOR)
4745     NANCHOR(*np)->target = target;
4746   else {
4747     NENCLOSE(*np)->target = target;
4748     if (NENCLOSE(*np)->type == ENCLOSE_MEMORY) {
4749       /* Don't move this to previous of parse_subexp() */
4750       r = scan_env_set_mem_node(env, NENCLOSE(*np)->regnum, *np);
4751       if (r != 0) return r;
4752     }
4753   }
4754 
4755   *src = p;
4756   return 0;
4757 }
4758 
/* Spellings of the simple quantifiers, indexed by the value
   popular_quantifier_num() yields; used only in warning messages. */
static const char* PopularQStr[] = {
  "?",
  "*",
  "+",
  "??",
  "*?",
  "+?"
};

/* Text describing the replacement quantifier, indexed by a
   ReduceTypeTable entry; used only in warning messages. */
static const char* ReduceQStr[] = {
  "",
  "",
  "*",
  "*?",
  "??",
  "+ and ??",
  "+? and ?"
};
4766 
4767 static int
set_quantifier(Node * qnode,Node * target,int group,ScanEnv * env)4768 set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env)
4769 {
4770   QtfrNode* qn;
4771 
4772   qn = NQTFR(qnode);
4773   if (qn->lower == 1 && qn->upper == 1) {
4774     return 1;
4775   }
4776 
4777   switch (NTYPE(target)) {
4778   case NT_STR:
4779     if (! group) {
4780       StrNode* sn = NSTR(target);
4781       if (str_node_can_be_split(sn, env->enc)) {
4782 	Node* n = str_node_split_last_char(sn, env->enc);
4783 	if (IS_NOT_NULL(n)) {
4784 	  qn->target = n;
4785 	  return 2;
4786 	}
4787       }
4788     }
4789     break;
4790 
4791   case NT_QTFR:
4792     { /* check redundant double repeat. */
4793       /* verbose warn (?:.?)? etc... but not warn (.?)? etc... */
4794       QtfrNode* qnt   = NQTFR(target);
4795       int nestq_num   = popular_quantifier_num(qn);
4796       int targetq_num = popular_quantifier_num(qnt);
4797 
4798 #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR
4799       if (!IS_QUANTIFIER_BY_NUMBER(qn) && !IS_QUANTIFIER_BY_NUMBER(qnt) &&
4800 	  IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) {
4801         UChar buf[WARN_BUFSIZE];
4802 
4803         switch(ReduceTypeTable[targetq_num][nestq_num]) {
4804         case RQ_ASIS:
4805           break;
4806 
4807         case RQ_DEL:
4808           if (onig_verb_warn != onig_null_warn) {
4809             onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4810                                  env->pattern, env->pattern_end,
4811                                  (UChar* )"redundant nested repeat operator");
4812             (*onig_verb_warn)((char* )buf);
4813           }
4814           goto warn_exit;
4815           break;
4816 
4817         default:
4818           if (onig_verb_warn != onig_null_warn) {
4819             onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4820                                        env->pattern, env->pattern_end,
4821             (UChar* )"nested repeat operator %s and %s was replaced with '%s'",
4822             PopularQStr[targetq_num], PopularQStr[nestq_num],
4823             ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]);
4824             (*onig_verb_warn)((char* )buf);
4825           }
4826           goto warn_exit;
4827           break;
4828         }
4829       }
4830 
4831     warn_exit:
4832 #endif
4833       if (targetq_num >= 0) {
4834 	if (nestq_num >= 0) {
4835 	  onig_reduce_nested_quantifier(qnode, target);
4836 	  goto q_exit;
4837 	}
4838 	else if (targetq_num == 1 || targetq_num == 2) { /* * or + */
4839 	  /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */
4840 	  if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) {
4841 	    qn->upper = (qn->lower == 0 ? 1 : qn->lower);
4842 	  }
4843 	}
4844       }
4845     }
4846     break;
4847 
4848   default:
4849     break;
4850   }
4851 
4852   qn->target = target;
4853  q_exit:
4854   return 0;
4855 }
4856 
4857 
4858 #ifdef USE_SHARED_CCLASS_TABLE
4859 
4860 #define THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS     8
4861 
4862 /* for ctype node hash table */
4863 
/* Key of the shared character-class table: one entry per combination of
   encoding, character type and negation flag. */
typedef struct {
  OnigEncoding enc;   /* encoding the class was built for */
  int not;            /* negation flag taken from the token */
  int type;           /* character type (ctype) taken from the token */
} type_cclass_key;
4869 
/* Key comparison for the shared character-class table:
   0 when all three fields of both keys agree, 1 otherwise. */
static int type_cclass_cmp(type_cclass_key* x, type_cclass_key* y)
{
  int same = (x->type == y->type) &&
             (x->enc  == y->enc)  &&
             (x->not  == y->not);

  return same ? 0 : 1;
}
4877 
/*
 * Hash function for the shared character-class table.
 *
 * Mixes the raw bytes of key->enc and key->type, then adds key->not.
 * The accumulation is done in unsigned arithmetic: the repeated
 * multiplication by 997 exceeds the range of int, and signed overflow is
 * undefined behaviour in C, whereas unsigned arithmetic wraps.
 * Keys that compare equal under type_cclass_cmp() still hash equally.
 */
static int type_cclass_hash(type_cclass_key* key)
{
  unsigned int h = 0;
  const UChar *p;
  int i;

  p = (const UChar* )&(key->enc);
  for (i = 0; i < (int )sizeof(key->enc); i++) {
    h = h * 997u + (unsigned int )*p++;
  }

  p = (const UChar* )(&key->type);
  for (i = 0; i < (int )sizeof(key->type); i++) {
    h = h * 997u + (unsigned int )*p++;
  }

  h += (unsigned int )key->not;
  return (int )(h + (h >> 5));
}
4898 
/* Comparison/hash callbacks handed to the st hash table that stores the
   shared character classes. */
static struct st_hash_type type_type_cclass_hash = {
    type_cclass_cmp,
    type_cclass_hash,
};

/* Table of shared character-class nodes keyed by type_cclass_key;
   created on first use in parse_exp() and released by
   onig_free_shared_cclass_table(). */
static st_table* OnigTypeCClassTable;
4905 
4906 
4907 static int
i_free_shared_class(type_cclass_key * key,Node * node,void * arg ARG_UNUSED)4908 i_free_shared_class(type_cclass_key* key, Node* node, void* arg ARG_UNUSED)
4909 {
4910   if (IS_NOT_NULL(node)) {
4911     CClassNode* cc = NCCLASS(node);
4912     if (IS_NOT_NULL(cc->mbuf)) xfree(cc->mbuf);
4913     xfree(node);
4914   }
4915 
4916   if (IS_NOT_NULL(key)) xfree(key);
4917   return ST_DELETE;
4918 }
4919 
4920 extern int
onig_free_shared_cclass_table(void)4921 onig_free_shared_cclass_table(void)
4922 {
4923   if (IS_NOT_NULL(OnigTypeCClassTable)) {
4924     onig_st_foreach(OnigTypeCClassTable, i_free_shared_class, 0);
4925     onig_st_free_table(OnigTypeCClassTable);
4926     OnigTypeCClassTable = NULL;
4927   }
4928 
4929   return 0;
4930 }
4931 
4932 #endif /* USE_SHARED_CCLASS_TABLE */
4933 
4934 
4935 #ifndef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
4936 static int
clear_not_flag_cclass(CClassNode * cc,OnigEncoding enc)4937 clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc)
4938 {
4939   BBuf *tbuf;
4940   int r;
4941 
4942   if (IS_NCCLASS_NOT(cc)) {
4943     bitset_invert(cc->bs);
4944 
4945     if (! ONIGENC_IS_SINGLEBYTE(enc)) {
4946       r = not_code_range_buf(enc, cc->mbuf, &tbuf);
4947       if (r != 0) return r;
4948 
4949       bbuf_free(cc->mbuf);
4950       cc->mbuf = tbuf;
4951     }
4952 
4953     NCCLASS_CLEAR_NOT(cc);
4954   }
4955 
4956   return 0;
4957 }
4958 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
4959 
/* State handed to i_apply_case_fold() through its void* argument. */
typedef struct {
  ScanEnv*    env;       /* scan environment (encoding, syntax) */
  CClassNode* cc;        /* character class being extended */
  Node*       alt_root;  /* head of the alternatives built for multi-char folds */
  Node**      ptail;     /* slot where the next alternative is appended */
} IApplyCaseFoldArg;
4966 
4967 static int
i_apply_case_fold(OnigCodePoint from,OnigCodePoint to[],int to_len,void * arg)4968 i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[],
4969 		  int to_len, void* arg)
4970 {
4971   IApplyCaseFoldArg* iarg;
4972   ScanEnv* env;
4973   CClassNode* cc;
4974   BitSetRef bs;
4975 
4976   iarg = (IApplyCaseFoldArg* )arg;
4977   env = iarg->env;
4978   cc  = iarg->cc;
4979   bs = cc->bs;
4980 
4981   if (to_len == 1) {
4982     int is_in = onig_is_code_in_cc(env->enc, from, cc);
4983 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
4984     if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) ||
4985 	(is_in == 0 &&  IS_NCCLASS_NOT(cc))) {
4986       if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
4987 	add_code_range(&(cc->mbuf), env, *to, *to);
4988       }
4989       else {
4990 	BITSET_SET_BIT(bs, *to);
4991       }
4992     }
4993 #else
4994     if (is_in != 0) {
4995       if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
4996 	if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc);
4997 	add_code_range(&(cc->mbuf), env, *to, *to);
4998       }
4999       else {
5000 	if (IS_NCCLASS_NOT(cc)) {
5001 	  BITSET_CLEAR_BIT(bs, *to);
5002 	}
5003 	else
5004 	  BITSET_SET_BIT(bs, *to);
5005       }
5006     }
5007 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
5008   }
5009   else {
5010     int r, i, len;
5011     UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
5012     Node *snode = NULL_NODE;
5013 
5014     if (onig_is_code_in_cc(env->enc, from, cc)
5015 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
5016 	&& !IS_NCCLASS_NOT(cc)
5017 #endif
5018 	) {
5019       for (i = 0; i < to_len; i++) {
5020 	len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf);
5021 	if (i == 0) {
5022 	  snode = onig_node_new_str(buf, buf + len);
5023 	  CHECK_NULL_RETURN_MEMERR(snode);
5024 
5025 	  /* char-class expanded multi-char only
5026 	     compare with string folded at match time. */
5027 	  NSTRING_SET_AMBIG(snode);
5028 	}
5029 	else {
5030 	  r = onig_node_str_cat(snode, buf, buf + len);
5031 	  if (r < 0) {
5032 	    onig_node_free(snode);
5033 	    return r;
5034 	  }
5035 	}
5036       }
5037 
5038       *(iarg->ptail) = onig_node_new_alt(snode, NULL_NODE);
5039       CHECK_NULL_RETURN_MEMERR(*(iarg->ptail));
5040       iarg->ptail = &(NCDR((*(iarg->ptail))));
5041     }
5042   }
5043 
5044   return 0;
5045 }
5046 
5047 static int
parse_exp(Node ** np,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)5048 parse_exp(Node** np, OnigToken* tok, int term,
5049 	  UChar** src, UChar* end, ScanEnv* env)
5050 {
5051   int r, len, group = 0;
5052   Node* qn;
5053   Node** targetp;
5054 
5055   *np = NULL;
5056   if (tok->type == (enum TokenSyms )term)
5057     goto end_of_token;
5058 
5059   switch (tok->type) {
5060   case TK_ALT:
5061   case TK_EOT:
5062   end_of_token:
5063   *np = node_new_empty();
5064   return tok->type;
5065   break;
5066 
5067   case TK_SUBEXP_OPEN:
5068     r = parse_enclose(np, tok, TK_SUBEXP_CLOSE, src, end, env);
5069     if (r < 0) return r;
5070     if (r == 1) group = 1;
5071     else if (r == 2) { /* option only */
5072       Node* target;
5073       OnigOptionType prev = env->option;
5074 
5075       env->option = NENCLOSE(*np)->option;
5076       r = fetch_token(tok, src, end, env);
5077       if (r < 0) return r;
5078       r = parse_subexp(&target, tok, term, src, end, env);
5079       env->option = prev;
5080       if (r < 0) return r;
5081       NENCLOSE(*np)->target = target;
5082       return tok->type;
5083     }
5084     break;
5085 
5086   case TK_SUBEXP_CLOSE:
5087     if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP))
5088       return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS;
5089 
5090     if (tok->escaped) goto tk_raw_byte;
5091     else goto tk_byte;
5092     break;
5093 
5094   case TK_STRING:
5095   tk_byte:
5096     {
5097       *np = node_new_str(tok->backp, *src);
5098       CHECK_NULL_RETURN_MEMERR(*np);
5099 
5100       while (1) {
5101 	r = fetch_token(tok, src, end, env);
5102 	if (r < 0) return r;
5103 	if (r != TK_STRING) break;
5104 
5105 	r = onig_node_str_cat(*np, tok->backp, *src);
5106 	if (r < 0) return r;
5107       }
5108 
5109     string_end:
5110       targetp = np;
5111       goto repeat;
5112     }
5113     break;
5114 
5115   case TK_RAW_BYTE:
5116   tk_raw_byte:
5117     {
5118       *np = node_new_str_raw_char((UChar )tok->u.c);
5119       CHECK_NULL_RETURN_MEMERR(*np);
5120       len = 1;
5121       while (1) {
5122 	if (len >= ONIGENC_MBC_MINLEN(env->enc)) {
5123 	  if (len == enclen(env->enc, NSTR(*np)->s)) {
5124 	    r = fetch_token(tok, src, end, env);
5125 	    NSTRING_CLEAR_RAW(*np);
5126 	    goto string_end;
5127 	  }
5128 	}
5129 
5130 	r = fetch_token(tok, src, end, env);
5131 	if (r < 0) return r;
5132 	if (r != TK_RAW_BYTE) {
5133 	  /* Don't use this, it is wrong for little endian encodings. */
5134 #ifdef USE_PAD_TO_SHORT_BYTE_CHAR
5135 	  int rem;
5136 	  if (len < ONIGENC_MBC_MINLEN(env->enc)) {
5137 	    rem = ONIGENC_MBC_MINLEN(env->enc) - len;
5138 	    (void )node_str_head_pad(NSTR(*np), rem, (UChar )0);
5139 	    if (len + rem == enclen(env->enc, NSTR(*np)->s)) {
5140 	      NSTRING_CLEAR_RAW(*np);
5141 	      goto string_end;
5142 	    }
5143 	  }
5144 #endif
5145 	  return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
5146 	}
5147 
5148 	r = node_str_cat_char(*np, (UChar )tok->u.c);
5149 	if (r < 0) return r;
5150 
5151 	len++;
5152       }
5153     }
5154     break;
5155 
5156   case TK_CODE_POINT:
5157     {
5158       UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
5159       int num = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf);
5160       if (num < 0) return num;
5161 #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
5162       *np = node_new_str_raw(buf, buf + num);
5163 #else
5164       *np = node_new_str(buf, buf + num);
5165 #endif
5166       CHECK_NULL_RETURN_MEMERR(*np);
5167     }
5168     break;
5169 
5170   case TK_QUOTE_OPEN:
5171     {
5172       OnigCodePoint end_op[2];
5173       UChar *qstart, *qend, *nextp;
5174 
5175       end_op[0] = (OnigCodePoint )MC_ESC(env->syntax);
5176       end_op[1] = (OnigCodePoint )'E';
5177       qstart = *src;
5178       qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc);
5179       if (IS_NULL(qend)) {
5180 	nextp = qend = end;
5181       }
5182       *np = node_new_str(qstart, qend);
5183       CHECK_NULL_RETURN_MEMERR(*np);
5184       *src = nextp;
5185     }
5186     break;
5187 
5188   case TK_CHAR_TYPE:
5189     {
5190       switch (tok->u.prop.ctype) {
5191       case ONIGENC_CTYPE_WORD:
5192 	*np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not);
5193 	CHECK_NULL_RETURN_MEMERR(*np);
5194 	break;
5195 
5196       case ONIGENC_CTYPE_SPACE:
5197       case ONIGENC_CTYPE_DIGIT:
5198       case ONIGENC_CTYPE_XDIGIT:
5199 	{
5200 	  CClassNode* cc;
5201 
5202 #ifdef USE_SHARED_CCLASS_TABLE
5203           const OnigCodePoint *mbr;
5204 	  OnigCodePoint sb_out;
5205 
5206           r = ONIGENC_GET_CTYPE_CODE_RANGE(env->enc, tok->u.prop.ctype,
5207 					   &sb_out, &mbr);
5208           if (r == 0 &&
5209               ONIGENC_CODE_RANGE_NUM(mbr)
5210               >= THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS) {
5211             type_cclass_key  key;
5212             type_cclass_key* new_key;
5213 
5214             key.enc  = env->enc;
5215             key.not  = tok->u.prop.not;
5216             key.type = tok->u.prop.ctype;
5217 
5218             THREAD_ATOMIC_START;
5219 
5220             if (IS_NULL(OnigTypeCClassTable)) {
5221               OnigTypeCClassTable
5222                 = onig_st_init_table_with_size(&type_type_cclass_hash, 10);
5223               if (IS_NULL(OnigTypeCClassTable)) {
5224                 THREAD_ATOMIC_END;
5225                 return ONIGERR_MEMORY;
5226               }
5227             }
5228             else {
5229               if (onig_st_lookup(OnigTypeCClassTable, (st_data_t )&key,
5230                                  (st_data_t* )np)) {
5231                 THREAD_ATOMIC_END;
5232                 break;
5233               }
5234             }
5235 
5236             *np = node_new_cclass_by_codepoint_range(tok->u.prop.not,
5237 						     sb_out, mbr);
5238             if (IS_NULL(*np)) {
5239               THREAD_ATOMIC_END;
5240               return ONIGERR_MEMORY;
5241             }
5242 
5243             cc = NCCLASS(*np);
5244             NCCLASS_SET_SHARE(cc);
5245             new_key = (type_cclass_key* )xmalloc(sizeof(type_cclass_key));
5246 	    xmemcpy(new_key, &key, sizeof(type_cclass_key));
5247             onig_st_add_direct(OnigTypeCClassTable, (st_data_t )new_key,
5248                                (st_data_t )*np);
5249 
5250             THREAD_ATOMIC_END;
5251           }
5252           else {
5253 #endif
5254             *np = node_new_cclass();
5255             CHECK_NULL_RETURN_MEMERR(*np);
5256             cc = NCCLASS(*np);
5257             add_ctype_to_cc(cc, tok->u.prop.ctype, 0, env);
5258             if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
5259 #ifdef USE_SHARED_CCLASS_TABLE
5260           }
5261 #endif
5262 	}
5263 	break;
5264 
5265       default:
5266 	return ONIGERR_PARSER_BUG;
5267 	break;
5268       }
5269     }
5270     break;
5271 
5272   case TK_CHAR_PROPERTY:
5273     r = parse_char_property(np, tok, src, end, env);
5274     if (r != 0) return r;
5275     break;
5276 
5277   case TK_CC_OPEN:
5278     {
5279       CClassNode* cc;
5280 
5281       r = parse_char_class(np, tok, src, end, env);
5282       if (r != 0) return r;
5283 
5284       cc = NCCLASS(*np);
5285       if (IS_IGNORECASE(env->option)) {
5286 	IApplyCaseFoldArg iarg;
5287 
5288 	iarg.env      = env;
5289 	iarg.cc       = cc;
5290 	iarg.alt_root = NULL_NODE;
5291 	iarg.ptail    = &(iarg.alt_root);
5292 
5293 	r = ONIGENC_APPLY_ALL_CASE_FOLD(env->enc, env->case_fold_flag,
5294 					i_apply_case_fold, &iarg);
5295 	if (r != 0) {
5296 	  onig_node_free(iarg.alt_root);
5297 	  return r;
5298 	}
5299 	if (IS_NOT_NULL(iarg.alt_root)) {
5300           Node* work = onig_node_new_alt(*np, iarg.alt_root);
5301           if (IS_NULL(work)) {
5302             onig_node_free(iarg.alt_root);
5303             return ONIGERR_MEMORY;
5304           }
5305           *np = work;
5306 	}
5307       }
5308     }
5309     break;
5310 
5311   case TK_ANYCHAR:
5312     *np = node_new_anychar();
5313     CHECK_NULL_RETURN_MEMERR(*np);
5314     break;
5315 
5316   case TK_ANYCHAR_ANYTIME:
5317     *np = node_new_anychar();
5318     CHECK_NULL_RETURN_MEMERR(*np);
5319     qn = node_new_quantifier(0, REPEAT_INFINITE, 0);
5320     CHECK_NULL_RETURN_MEMERR(qn);
5321     NQTFR(qn)->target = *np;
5322     *np = qn;
5323     break;
5324 
5325   case TK_BACKREF:
5326     len = tok->u.backref.num;
5327     *np = node_new_backref(len,
5328 		   (len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)),
5329 			   tok->u.backref.by_name,
5330 #ifdef USE_BACKREF_WITH_LEVEL
5331 			   tok->u.backref.exist_level,
5332 			   tok->u.backref.level,
5333 #endif
5334 			   env);
5335     CHECK_NULL_RETURN_MEMERR(*np);
5336     break;
5337 
5338 #ifdef USE_SUBEXP_CALL
5339   case TK_CALL:
5340     {
5341       int gnum = tok->u.call.gnum;
5342 
5343       if (gnum < 0) {
5344 	gnum = BACKREF_REL_TO_ABS(gnum, env);
5345 	if (gnum <= 0)
5346 	  return ONIGERR_INVALID_BACKREF;
5347       }
5348       *np = node_new_call(tok->u.call.name, tok->u.call.name_end, gnum);
5349       CHECK_NULL_RETURN_MEMERR(*np);
5350       env->num_call++;
5351     }
5352     break;
5353 #endif
5354 
5355   case TK_ANCHOR:
5356     *np = onig_node_new_anchor(tok->u.anchor);
5357     break;
5358 
5359   case TK_OP_REPEAT:
5360   case TK_INTERVAL:
5361     if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) {
5362       if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS))
5363 	return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED;
5364       else
5365 	*np = node_new_empty();
5366     }
5367     else {
5368       goto tk_byte;
5369     }
5370     break;
5371 
5372   default:
5373     return ONIGERR_PARSER_BUG;
5374     break;
5375   }
5376 
5377   {
5378     targetp = np;
5379 
5380   re_entry:
5381     r = fetch_token(tok, src, end, env);
5382     if (r < 0) return r;
5383 
5384   repeat:
5385     if (r == TK_OP_REPEAT || r == TK_INTERVAL) {
5386       if (is_invalid_quantifier_target(*targetp))
5387 	return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID;
5388 
5389       qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper,
5390 			       (r == TK_INTERVAL ? 1 : 0));
5391       CHECK_NULL_RETURN_MEMERR(qn);
5392       NQTFR(qn)->greedy = tok->u.repeat.greedy;
5393       r = set_quantifier(qn, *targetp, group, env);
5394       if (r < 0) {
5395 	onig_node_free(qn);
5396 	return r;
5397       }
5398 
5399       if (tok->u.repeat.possessive != 0) {
5400 	Node* en;
5401 	en = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
5402 	if (IS_NULL(en)) {
5403 	  onig_node_free(qn);
5404 	  return ONIGERR_MEMORY;
5405 	}
5406 	NENCLOSE(en)->target = qn;
5407 	qn = en;
5408       }
5409 
5410       if (r == 0) {
5411 	*targetp = qn;
5412       }
5413       else if (r == 1) {
5414 	onig_node_free(qn);
5415       }
5416       else if (r == 2) { /* split case: /abc+/ */
5417 	Node *tmp;
5418 
5419 	*targetp = node_new_list(*targetp, NULL);
5420 	if (IS_NULL(*targetp)) {
5421 	  onig_node_free(qn);
5422 	  return ONIGERR_MEMORY;
5423 	}
5424 	tmp = NCDR(*targetp) = node_new_list(qn, NULL);
5425 	if (IS_NULL(tmp)) {
5426 	  onig_node_free(qn);
5427 	  return ONIGERR_MEMORY;
5428 	}
5429 	targetp = &(NCAR(tmp));
5430       }
5431       goto re_entry;
5432     }
5433   }
5434 
5435   return r;
5436 }
5437 
5438 static int
parse_branch(Node ** top,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)5439 parse_branch(Node** top, OnigToken* tok, int term,
5440 	     UChar** src, UChar* end, ScanEnv* env)
5441 {
5442   int r;
5443   Node *node, **headp;
5444 
5445   *top = NULL;
5446   r = parse_exp(&node, tok, term, src, end, env);
5447   if (r < 0) return r;
5448 
5449   if (r == TK_EOT || r == term || r == TK_ALT) {
5450     *top = node;
5451   }
5452   else {
5453     *top  = node_new_list(node, NULL);
5454     headp = &(NCDR(*top));
5455     while (r != TK_EOT && r != term && r != TK_ALT) {
5456       r = parse_exp(&node, tok, term, src, end, env);
5457       if (r < 0) return r;
5458 
5459       if (NTYPE(node) == NT_LIST) {
5460 	*headp = node;
5461 	while (IS_NOT_NULL(NCDR(node))) node = NCDR(node);
5462 	headp = &(NCDR(node));
5463       }
5464       else {
5465 	*headp = node_new_list(node, NULL);
5466 	headp = &(NCDR(*headp));
5467       }
5468     }
5469   }
5470 
5471   return r;
5472 }
5473 
5474 /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */
5475 static int
parse_subexp(Node ** top,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)5476 parse_subexp(Node** top, OnigToken* tok, int term,
5477 	     UChar** src, UChar* end, ScanEnv* env)
5478 {
5479   int r;
5480   Node *node, **headp;
5481 
5482   *top = NULL;
5483   r = parse_branch(&node, tok, term, src, end, env);
5484   if (r < 0) {
5485     onig_node_free(node);
5486     return r;
5487   }
5488 
5489   if (r == term) {
5490     *top = node;
5491   }
5492   else if (r == TK_ALT) {
5493     *top  = onig_node_new_alt(node, NULL);
5494     headp = &(NCDR(*top));
5495     while (r == TK_ALT) {
5496       r = fetch_token(tok, src, end, env);
5497       if (r < 0) return r;
5498       r = parse_branch(&node, tok, term, src, end, env);
5499       if (r < 0) return r;
5500 
5501       *headp = onig_node_new_alt(node, NULL);
5502       headp = &(NCDR(*headp));
5503     }
5504 
5505     if (tok->type != (enum TokenSyms )term)
5506       goto err;
5507   }
5508   else {
5509   err:
5510     if (term == TK_SUBEXP_CLOSE)
5511       return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
5512     else
5513       return ONIGERR_PARSER_BUG;
5514   }
5515 
5516   return r;
5517 }
5518 
5519 static int
parse_regexp(Node ** top,UChar ** src,UChar * end,ScanEnv * env)5520 parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env)
5521 {
5522   int r;
5523   OnigToken tok;
5524 
5525   r = fetch_token(&tok, src, end, env);
5526   if (r < 0) return r;
5527   r = parse_subexp(top, &tok, TK_EOT, src, end, env);
5528   if (r < 0) return r;
5529   return 0;
5530 }
5531 
5532 extern int
onig_parse_make_tree(Node ** root,const UChar * pattern,const UChar * end,regex_t * reg,ScanEnv * env)5533 onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end,
5534 		     regex_t* reg, ScanEnv* env)
5535 {
5536   int r;
5537   UChar* p;
5538 
5539 #ifdef USE_NAMED_GROUP
5540   names_clear(reg);
5541 #endif
5542 
5543   scan_env_clear(env);
5544   env->option         = reg->options;
5545   env->case_fold_flag = reg->case_fold_flag;
5546   env->enc            = reg->enc;
5547   env->syntax         = reg->syntax;
5548   env->pattern        = (UChar* )pattern;
5549   env->pattern_end    = (UChar* )end;
5550   env->reg            = reg;
5551 
5552   *root = NULL;
5553   p = (UChar* )pattern;
5554   r = parse_regexp(root, &p, (UChar* )end, env);
5555   reg->num_mem = env->num_mem;
5556   return r;
5557 }
5558 
5559 extern void
onig_scan_env_set_error_string(ScanEnv * env,int ecode ARG_UNUSED,UChar * arg,UChar * arg_end)5560 onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED,
5561 				UChar* arg, UChar* arg_end)
5562 {
5563   env->error     = arg;
5564   env->error_end = arg_end;
5565 }
5566