xref: /PHP-7.0/ext/mbstring/oniguruma/regparse.c (revision 703be4f7)
1 /**********************************************************************
2   regparse.c -  Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2008  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "regparse.h"
31 #include "st.h"
32 
/* Buffer size constant for warning text.  NOTE(review): its use site is
 * outside this part of the file -- confirm the unit (presumably bytes). */
33 #define WARN_BUFSIZE    256
34 
/* Compile-time feature switch, defined without a value.  NOTE(review): going
 * by the name it enables case folding inside negated character classes;
 * confirm at the #ifdef that tests it. */
35 #define CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
36 
37 
/*
 * Syntax definition for the Ruby flavour.  The initializer supplies, in order:
 *   1. an operator flag set: SYN_GNU_REGEX_OP plus several ONIG_SYN_OP_* bits,
 *      with ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END masked out;
 *   2. a second operator flag set built from ONIG_SYN_OP2_* bits;
 *   3. a behaviour flag set: SYN_GNU_REGEX_BV plus several ONIG_SYN_* bits;
 *   4. ONIG_OPTION_NONE as the option value;
 *   5. a meta character table in which only the escape character ('\\') is
 *      set; every other slot is ONIG_INEFFECTIVE_META_CHAR.
 * NOTE(review): the OnigSyntaxType field names are declared elsewhere.
 */
38 OnigSyntaxType OnigSyntaxRuby = {
39   (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
40      ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
41      ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS |
42      ONIG_SYN_OP_ESC_C_CONTROL )
43    & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
44   , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
45       ONIG_SYN_OP2_OPTION_RUBY |
46       ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
47       ONIG_SYN_OP2_ESC_G_SUBEXP_CALL |
48       ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY  |
49       ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
50       ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
51       ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
52       ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
53       ONIG_SYN_OP2_ESC_H_XDIGIT )
54   , ( SYN_GNU_REGEX_BV |
55       ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
56       ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
57       ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
58       ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
59       ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY |
60       ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
61       ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
62   , ONIG_OPTION_NONE
63   ,
64   {
65       (OnigCodePoint )'\\'                       /* esc */
66     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.'  */
67     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*'  */
68     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
69     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
70     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
71   }
72 };
73 
/* Global default syntax pointer; starts out as ONIG_SYNTAX_RUBY. */
74 OnigSyntaxType*  OnigDefaultSyntax = ONIG_SYNTAX_RUBY;
75 
/* Warning callback that ignores its message and does nothing. */
extern void
onig_null_warn(const char* s ARG_UNUSED)
{
}
77 
/* Current handler for ordinary warnings: DEFAULT_WARN_FUNCTION when the build
 * defines it, otherwise the do-nothing onig_null_warn. */
78 #ifdef DEFAULT_WARN_FUNCTION
79 static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION;
80 #else
81 static OnigWarnFunc onig_warn = onig_null_warn;
82 #endif
83 
/* Current handler for verbose warnings: DEFAULT_VERB_WARN_FUNCTION when the
 * build defines it, otherwise the do-nothing onig_null_warn. */
84 #ifdef DEFAULT_VERB_WARN_FUNCTION
85 static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION;
86 #else
87 static OnigWarnFunc onig_verb_warn = onig_null_warn;
88 #endif
89 
/* Install f as the handler stored in the file-level onig_warn variable. */
extern void
onig_set_warn_func(OnigWarnFunc f)
{
  onig_warn = f;
}
94 
/* Install f as the handler stored in the file-level onig_verb_warn variable. */
extern void
onig_set_verb_warn_func(OnigWarnFunc f)
{
  onig_verb_warn = f;
}
99 
100 static void
bbuf_free(BBuf * bbuf)101 bbuf_free(BBuf* bbuf)
102 {
103   if (IS_NOT_NULL(bbuf)) {
104     if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p);
105     xfree(bbuf);
106   }
107 }
108 
109 static int
bbuf_clone(BBuf ** rto,BBuf * from)110 bbuf_clone(BBuf** rto, BBuf* from)
111 {
112   int r;
113   BBuf *to;
114 
115   *rto = to = (BBuf* )xmalloc(sizeof(BBuf));
116   CHECK_NULL_RETURN_MEMERR(to);
117   r = BBUF_INIT(to, from->alloc);
118   if (r != 0) return r;
119   to->used = from->used;
120   xmemcpy(to->p, from->p, from->used);
121   return 0;
122 }
123 
/* Convert a relative back-reference number to an absolute one using the
 * group count currently recorded in env->num_mem. */
124 #define BACKREF_REL_TO_ABS(rel_no, env) \
125   ((env)->num_mem + 1 + (rel_no))
126 
/* Clear bits f in v when `negative` is true, otherwise set them.
 * NOTE: the expansion is an unparenthesized conditional expression, so use it
 * only as a complete statement, not inside a larger expression. */
127 #define ONOFF(v,f,negative)    (negative) ? ((v) &= ~(f)) : ((v) |= (f))
128 
/* First code point of the multi-byte range: 0 when the encoding's minimum
 * character length is greater than 1, otherwise 0x80. */
129 #define MBCODE_START_POS(enc) \
130   (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 0 : 0x80)
131 
/* Add the range MBCODE_START_POS(enc) .. ~0 to *pbuf; evaluates to the result
 * of add_code_range_to_buf(). */
132 #define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \
133   add_code_range_to_buf(pbuf, MBCODE_START_POS(enc), ~((OnigCodePoint )0))
134 
/* For non-single-byte encodings, add the whole multi-byte range to mbuf.
 * Uses the caller's local `r` and returns from the enclosing function when
 * the add fails (r != 0). */
135 #define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\
136   if (! ONIGENC_IS_SINGLEBYTE(enc)) {\
137     r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\
138     if (r) return r;\
139   }\
140 } while (0)
141 
142 
/* Set `empty` to 1 when every one of the BITSET_SIZE words of bs is zero,
 * otherwise to 0.  Declares its own loop counter `i` inside the do-block, so
 * the bs argument must not itself refer to a variable named i. */
143 #define BITSET_IS_EMPTY(bs,empty) do {\
144   int i;\
145   empty = 1;\
146   for (i = 0; i < (int )BITSET_SIZE; i++) {\
147     if ((bs)[i] != 0) {\
148       empty = 0; break;\
149     }\
150   }\
151 } while (0)
152 
153 static void
bitset_set_range(BitSetRef bs,int from,int to)154 bitset_set_range(BitSetRef bs, int from, int to)
155 {
156   int i;
157   for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) {
158     BITSET_SET_BIT(bs, i);
159   }
160 }
161 
/* Compiled out by `#if 0`: would set every word of bs to all-ones. */
162 #if 0
163 static void
164 bitset_set_all(BitSetRef bs)
165 {
166   int i;
167   for (i = 0; i < BITSET_SIZE; i++) { bs[i] = ~((Bits )0); }
168 }
169 #endif
170 
171 static void
bitset_invert(BitSetRef bs)172 bitset_invert(BitSetRef bs)
173 {
174   int i;
175   for (i = 0; i < (int )BITSET_SIZE; i++) { bs[i] = ~(bs[i]); }
176 }
177 
178 static void
bitset_invert_to(BitSetRef from,BitSetRef to)179 bitset_invert_to(BitSetRef from, BitSetRef to)
180 {
181   int i;
182   for (i = 0; i < (int )BITSET_SIZE; i++) { to[i] = ~(from[i]); }
183 }
184 
185 static void
bitset_and(BitSetRef dest,BitSetRef bs)186 bitset_and(BitSetRef dest, BitSetRef bs)
187 {
188   int i;
189   for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] &= bs[i]; }
190 }
191 
192 static void
bitset_or(BitSetRef dest,BitSetRef bs)193 bitset_or(BitSetRef dest, BitSetRef bs)
194 {
195   int i;
196   for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] |= bs[i]; }
197 }
198 
199 static void
bitset_copy(BitSetRef dest,BitSetRef bs)200 bitset_copy(BitSetRef dest, BitSetRef bs)
201 {
202   int i;
203   for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] = bs[i]; }
204 }
205 
206 extern int
onig_strncmp(const UChar * s1,const UChar * s2,int n)207 onig_strncmp(const UChar* s1, const UChar* s2, int n)
208 {
209   int x;
210 
211   while (n-- > 0) {
212     x = *s2++ - *s1++;
213     if (x) return x;
214   }
215   return 0;
216 }
217 
218 extern void
onig_strcpy(UChar * dest,const UChar * src,const UChar * end)219 onig_strcpy(UChar* dest, const UChar* src, const UChar* end)
220 {
221   int len = end - src;
222   if (len > 0) {
223     xmemcpy(dest, src, len);
224     dest[len] = (UChar )0;
225   }
226 }
227 
228 #ifdef USE_NAMED_GROUP
229 static UChar*
strdup_with_null(OnigEncoding enc,UChar * s,UChar * end)230 strdup_with_null(OnigEncoding enc, UChar* s, UChar* end)
231 {
232   int slen, term_len, i;
233   UChar *r;
234 
235   slen = end - s;
236   term_len = ONIGENC_MBC_MINLEN(enc);
237 
238   r = (UChar* )xmalloc(slen + term_len);
239   CHECK_NULL_RETURN(r);
240   xmemcpy(r, s, slen);
241 
242   for (i = 0; i < term_len; i++)
243     r[slen + i] = (UChar )0;
244 
245   return r;
246 }
247 #endif
248 
249 /* scan pattern methods */
/*
 * These macros expand to code that uses the caller's locals `p` (current
 * position), `end` (end of input) and `enc` (encoding); the non-_S variants
 * also use `pfetch_prev`, which PFETCH_READY declares.
 */
/* Value PPEEK yields when p is at or past end. */
250 #define PEND_VALUE   0
251 
/* Declares the pfetch_prev pointer written by PINC/PFETCH and read by PUNFETCH. */
252 #define PFETCH_READY  UChar* pfetch_prev
/* 1 when p has reached end, otherwise 0. */
253 #define PEND         (p < end ?  0 : 1)
/* Move p back to the position saved by the last PINC/PFETCH. */
254 #define PUNFETCH     p = pfetch_prev
/* Save p in pfetch_prev, then advance p by ONIGENC_MBC_ENC_LEN(enc, p). */
255 #define PINC       do { \
256   pfetch_prev = p; \
257   p += ONIGENC_MBC_ENC_LEN(enc, p); \
258 } while (0)
/* Decode the character at p into c, save p in pfetch_prev, then advance p. */
259 #define PFETCH(c)  do { \
260   c = ONIGENC_MBC_TO_CODE(enc, p, end); \
261   pfetch_prev = p; \
262   p += ONIGENC_MBC_ENC_LEN(enc, p); \
263 } while (0)
264 
/* Like PINC, but does not record the previous position. */
265 #define PINC_S     do { \
266   p += ONIGENC_MBC_ENC_LEN(enc, p); \
267 } while (0)
/* Like PFETCH, but does not record the previous position. */
268 #define PFETCH_S(c) do { \
269   c = ONIGENC_MBC_TO_CODE(enc, p, end); \
270   p += ONIGENC_MBC_ENC_LEN(enc, p); \
271 } while (0)
272 
/* Character at p without advancing, or PEND_VALUE when p is at/past end. */
273 #define PPEEK        (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)
/* True when the peeked character equals c. */
274 #define PPEEK_IS(c)  (PPEEK == (OnigCodePoint )c)
275 
276 static UChar*
strcat_capa(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa)277 strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end,
278 	      int capa)
279 {
280   UChar* r;
281 
282   if (dest)
283     r = (UChar* )xrealloc(dest, capa + 1);
284   else
285     r = (UChar* )xmalloc(capa + 1);
286 
287   CHECK_NULL_RETURN(r);
288   onig_strcpy(r + (dest_end - dest), src, src_end);
289   return r;
290 }
291 
292 /* dest on static area */
293 static UChar*
strcat_capa_from_static(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa)294 strcat_capa_from_static(UChar* dest, UChar* dest_end,
295 			const UChar* src, const UChar* src_end, int capa)
296 {
297   UChar* r;
298 
299   r = (UChar* )xmalloc(capa + 1);
300   CHECK_NULL_RETURN(r);
301   onig_strcpy(r, dest, dest_end);
302   onig_strcpy(r + (dest_end - dest), src, src_end);
303   return r;
304 }
305 
306 
307 #ifdef USE_ST_LIBRARY
308 
/* Hash key describing the byte range [s, end); the comparison and hash
 * functions below never read beyond `end`. */
309 typedef struct {
310   UChar* s;
311   UChar* end;
312 } st_str_end_key;
313 
314 static int
str_end_cmp(st_str_end_key * x,st_str_end_key * y)315 str_end_cmp(st_str_end_key* x, st_str_end_key* y)
316 {
317   UChar *p, *q;
318   int c;
319 
320   if ((x->end - x->s) != (y->end - y->s))
321     return 1;
322 
323   p = x->s;
324   q = y->s;
325   while (p < x->end) {
326     c = (int )*p - (int )*q;
327     if (c != 0) return c;
328 
329     p++; q++;
330   }
331 
332   return 0;
333 }
334 
335 static int
str_end_hash(st_str_end_key * x)336 str_end_hash(st_str_end_key* x)
337 {
338   UChar *p;
339   int val = 0;
340 
341   p = x->s;
342   while (p < x->end) {
343     val = val * 997 + (int )*p++;
344   }
345 
346   return val + (val >> 5);
347 }
348 
349 extern hash_table_type*
onig_st_init_strend_table_with_size(int size)350 onig_st_init_strend_table_with_size(int size)
351 {
352   static struct st_hash_type hashType = {
353     str_end_cmp,
354     str_end_hash,
355   };
356 
357   return (hash_table_type* )
358            onig_st_init_table_with_size(&hashType, size);
359 }
360 
361 extern int
onig_st_lookup_strend(hash_table_type * table,const UChar * str_key,const UChar * end_key,hash_data_type * value)362 onig_st_lookup_strend(hash_table_type* table, const UChar* str_key,
363 		      const UChar* end_key, hash_data_type *value)
364 {
365   st_str_end_key key;
366 
367   key.s   = (UChar* )str_key;
368   key.end = (UChar* )end_key;
369 
370   return onig_st_lookup(table, (st_data_t )(&key), value);
371 }
372 
373 extern int
onig_st_insert_strend(hash_table_type * table,const UChar * str_key,const UChar * end_key,hash_data_type value)374 onig_st_insert_strend(hash_table_type* table, const UChar* str_key,
375 		      const UChar* end_key, hash_data_type value)
376 {
377   st_str_end_key* key;
378   int result;
379 
380   key = (st_str_end_key* )xmalloc(sizeof(st_str_end_key));
381   key->s   = (UChar* )str_key;
382   key->end = (UChar* )end_key;
383   result = onig_st_insert(table, (st_data_t )key, value);
384   if (result) {
385     xfree(key);
386   }
387   return result;
388 }
389 
390 #endif /* USE_ST_LIBRARY */
391 
392 
393 #ifdef USE_NAMED_GROUP
394 
/* Initial capacity of NameEntry.back_refs, allocated when a name gets its
 * second group number. */
395 #define INIT_NAME_BACKREFS_ALLOC_NUM   8
396 
/*
 * One group name and the group numbers defined under it.
 * With back_num == 1 the single number is kept in back_ref1; with
 * back_num > 1 all numbers are in the back_refs array, which has room for
 * back_alloc ints.
 */
397 typedef struct {
398   UChar* name;
399   int    name_len;   /* byte length */
400   int    back_num;   /* number of backrefs */
401   int    back_alloc;
402   int    back_ref1;
403   int*   back_refs;
404 } NameEntry;
405 
406 #ifdef USE_ST_LIBRARY
407 
/* With USE_ST_LIBRARY the name table is an st hash table. */
408 typedef st_table  NameTable;
409 typedef st_data_t HashDataType;   /* 1.6 st.h doesn't define st_data_t type */
410 
/* NOTE(review): neither constant is referenced in this part of the file. */
411 #define NAMEBUF_SIZE    24
412 #define NAMEBUF_SIZE_1  25
413 
414 #ifdef ONIG_DEBUG
415 static int
i_print_name_entry(UChar * key,NameEntry * e,void * arg)416 i_print_name_entry(UChar* key, NameEntry* e, void* arg)
417 {
418   int i;
419   FILE* fp = (FILE* )arg;
420 
421   fprintf(fp, "%s: ", e->name);
422   if (e->back_num == 0)
423     fputs("-", fp);
424   else if (e->back_num == 1)
425     fprintf(fp, "%d", e->back_ref1);
426   else {
427     for (i = 0; i < e->back_num; i++) {
428       if (i > 0) fprintf(fp, ", ");
429       fprintf(fp, "%d", e->back_refs[i]);
430     }
431   }
432   fputs("\n", fp);
433   return ST_CONTINUE;
434 }
435 
436 extern int
onig_print_names(FILE * fp,regex_t * reg)437 onig_print_names(FILE* fp, regex_t* reg)
438 {
439   NameTable* t = (NameTable* )reg->name_table;
440 
441   if (IS_NOT_NULL(t)) {
442     fprintf(fp, "name table\n");
443     onig_st_foreach(t, i_print_name_entry, (HashDataType )fp);
444     fputs("\n", fp);
445   }
446   return 0;
447 }
448 #endif /* ONIG_DEBUG */
449 
450 static int
i_free_name_entry(UChar * key,NameEntry * e,void * arg ARG_UNUSED)451 i_free_name_entry(UChar* key, NameEntry* e, void* arg ARG_UNUSED)
452 {
453   xfree(e->name);
454   if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
455   xfree(key);
456   xfree(e);
457   return ST_DELETE;
458 }
459 
460 static int
names_clear(regex_t * reg)461 names_clear(regex_t* reg)
462 {
463   NameTable* t = (NameTable* )reg->name_table;
464 
465   if (IS_NOT_NULL(t)) {
466     onig_st_foreach(t, i_free_name_entry, 0);
467   }
468   return 0;
469 }
470 
471 extern int
onig_names_free(regex_t * reg)472 onig_names_free(regex_t* reg)
473 {
474   int r;
475   NameTable* t;
476 
477   r = names_clear(reg);
478   if (r) return r;
479 
480   t = (NameTable* )reg->name_table;
481   if (IS_NOT_NULL(t)) onig_st_free_table(t);
482   reg->name_table = (void* )NULL;
483   return 0;
484 }
485 
486 static NameEntry*
name_find(regex_t * reg,const UChar * name,const UChar * name_end)487 name_find(regex_t* reg, const UChar* name, const UChar* name_end)
488 {
489   NameEntry* e;
490   NameTable* t = (NameTable* )reg->name_table;
491 
492   e = (NameEntry* )NULL;
493   if (IS_NOT_NULL(t)) {
494     onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e)));
495   }
496   return e;
497 }
498 
/* Context handed to i_names() by onig_foreach_name(): the user callback, the
 * regex and user argument forwarded to it, and `ret`, which receives the
 * first non-zero callback result.  `enc` is filled in from reg->enc. */
499 typedef struct {
500   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*);
501   regex_t* reg;
502   void* arg;
503   int ret;
504   OnigEncoding enc;
505 } INamesArg;
506 
507 static int
i_names(UChar * key ARG_UNUSED,NameEntry * e,INamesArg * arg)508 i_names(UChar* key ARG_UNUSED, NameEntry* e, INamesArg* arg)
509 {
510   int r = (*(arg->func))(e->name,
511                          e->name + e->name_len,
512                          e->back_num,
513 			 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
514 			 arg->reg, arg->arg);
515   if (r != 0) {
516     arg->ret = r;
517     return ST_STOP;
518   }
519   return ST_CONTINUE;
520 }
521 
522 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)523 onig_foreach_name(regex_t* reg,
524   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
525 {
526   INamesArg narg;
527   NameTable* t = (NameTable* )reg->name_table;
528 
529   narg.ret = 0;
530   if (IS_NOT_NULL(t)) {
531     narg.func = func;
532     narg.reg  = reg;
533     narg.arg  = arg;
534     narg.enc  = reg->enc; /* should be pattern encoding. */
535     onig_st_foreach(t, i_names, (HashDataType )&narg);
536   }
537   return narg.ret;
538 }
539 
540 static int
i_renumber_name(UChar * key ARG_UNUSED,NameEntry * e,GroupNumRemap * map)541 i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumRemap* map)
542 {
543   int i;
544 
545   if (e->back_num > 1) {
546     for (i = 0; i < e->back_num; i++) {
547       e->back_refs[i] = map[e->back_refs[i]].new_val;
548     }
549   }
550   else if (e->back_num == 1) {
551     e->back_ref1 = map[e->back_ref1].new_val;
552   }
553 
554   return ST_CONTINUE;
555 }
556 
557 extern int
onig_renumber_name_table(regex_t * reg,GroupNumRemap * map)558 onig_renumber_name_table(regex_t* reg, GroupNumRemap* map)
559 {
560   NameTable* t = (NameTable* )reg->name_table;
561 
562   if (IS_NOT_NULL(t)) {
563     onig_st_foreach(t, i_renumber_name, (HashDataType )map);
564   }
565   return 0;
566 }
567 
568 
569 extern int
onig_number_of_names(regex_t * reg)570 onig_number_of_names(regex_t* reg)
571 {
572   NameTable* t = (NameTable* )reg->name_table;
573 
574   if (IS_NOT_NULL(t))
575     return t->num_entries;
576   else
577     return 0;
578 }
579 
580 #else  /* USE_ST_LIBRARY */
581 
/* Initial number of NameEntry slots allocated for the array-based table. */
582 #define INIT_NAMES_ALLOC_NUM    8
583 
/* Array-based name table used when USE_ST_LIBRARY is not defined:
 * `e` holds `alloc` slots, of which the first `num` are in use. */
584 typedef struct {
585   NameEntry* e;
586   int        num;
587   int        alloc;
588 } NameTable;
589 
590 #ifdef ONIG_DEBUG
591 extern int
onig_print_names(FILE * fp,regex_t * reg)592 onig_print_names(FILE* fp, regex_t* reg)
593 {
594   int i, j;
595   NameEntry* e;
596   NameTable* t = (NameTable* )reg->name_table;
597 
598   if (IS_NOT_NULL(t) && t->num > 0) {
599     fprintf(fp, "name table\n");
600     for (i = 0; i < t->num; i++) {
601       e = &(t->e[i]);
602       fprintf(fp, "%s: ", e->name);
603       if (e->back_num == 0) {
604 	fputs("-", fp);
605       }
606       else if (e->back_num == 1) {
607 	fprintf(fp, "%d", e->back_ref1);
608       }
609       else {
610 	for (j = 0; j < e->back_num; j++) {
611 	  if (j > 0) fprintf(fp, ", ");
612 	  fprintf(fp, "%d", e->back_refs[j]);
613 	}
614       }
615       fputs("\n", fp);
616     }
617     fputs("\n", fp);
618   }
619   return 0;
620 }
621 #endif
622 
623 static int
names_clear(regex_t * reg)624 names_clear(regex_t* reg)
625 {
626   int i;
627   NameEntry* e;
628   NameTable* t = (NameTable* )reg->name_table;
629 
630   if (IS_NOT_NULL(t)) {
631     for (i = 0; i < t->num; i++) {
632       e = &(t->e[i]);
633       if (IS_NOT_NULL(e->name)) {
634 	xfree(e->name);
635 	e->name       = NULL;
636 	e->name_len   = 0;
637 	e->back_num   = 0;
638 	e->back_alloc = 0;
639 	if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
640 	e->back_refs = (int* )NULL;
641       }
642     }
643     if (IS_NOT_NULL(t->e)) {
644       xfree(t->e);
645       t->e = NULL;
646     }
647     t->num = 0;
648   }
649   return 0;
650 }
651 
652 extern int
onig_names_free(regex_t * reg)653 onig_names_free(regex_t* reg)
654 {
655   int r;
656   NameTable* t;
657 
658   r = names_clear(reg);
659   if (r) return r;
660 
661   t = (NameTable* )reg->name_table;
662   if (IS_NOT_NULL(t)) xfree(t);
663   reg->name_table = NULL;
664   return 0;
665 }
666 
667 static NameEntry*
name_find(regex_t * reg,UChar * name,UChar * name_end)668 name_find(regex_t* reg, UChar* name, UChar* name_end)
669 {
670   int i, len;
671   NameEntry* e;
672   NameTable* t = (NameTable* )reg->name_table;
673 
674   if (IS_NOT_NULL(t)) {
675     len = name_end - name;
676     for (i = 0; i < t->num; i++) {
677       e = &(t->e[i]);
678       if (len == e->name_len && onig_strncmp(name, e->name, len) == 0)
679 	return e;
680     }
681   }
682   return (NameEntry* )NULL;
683 }
684 
685 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)686 onig_foreach_name(regex_t* reg,
687   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
688 {
689   int i, r;
690   NameEntry* e;
691   NameTable* t = (NameTable* )reg->name_table;
692 
693   if (IS_NOT_NULL(t)) {
694     for (i = 0; i < t->num; i++) {
695       e = &(t->e[i]);
696       r = (*func)(e->name, e->name + e->name_len, e->back_num,
697 		  (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
698 		  reg, arg);
699       if (r != 0) return r;
700     }
701   }
702   return 0;
703 }
704 
705 extern int
onig_number_of_names(regex_t * reg)706 onig_number_of_names(regex_t* reg)
707 {
708   NameTable* t = (NameTable* )reg->name_table;
709 
710   if (IS_NOT_NULL(t))
711     return t->num;
712   else
713     return 0;
714 }
715 
716 #endif /* else USE_ST_LIBRARY */
717 
718 static int
name_add(regex_t * reg,UChar * name,UChar * name_end,int backref,ScanEnv * env)719 name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env)
720 {
721   int alloc;
722   NameEntry* e;
723   NameTable* t = (NameTable* )reg->name_table;
724 
725   if (name_end - name <= 0)
726     return ONIGERR_EMPTY_GROUP_NAME;
727 
728   e = name_find(reg, name, name_end);
729   if (IS_NULL(e)) {
730 #ifdef USE_ST_LIBRARY
731     if (IS_NULL(t)) {
732       t = onig_st_init_strend_table_with_size(5);
733       reg->name_table = (void* )t;
734     }
735     e = (NameEntry* )xmalloc(sizeof(NameEntry));
736     CHECK_NULL_RETURN_MEMERR(e);
737 
738     e->name = strdup_with_null(reg->enc, name, name_end);
739     if (IS_NULL(e->name)) {
740       xfree(e);  return ONIGERR_MEMORY;
741     }
742     onig_st_insert_strend(t, e->name, (e->name + (name_end - name)),
743                           (HashDataType )e);
744 
745     e->name_len   = name_end - name;
746     e->back_num   = 0;
747     e->back_alloc = 0;
748     e->back_refs  = (int* )NULL;
749 
750 #else
751 
752     if (IS_NULL(t)) {
753       alloc = INIT_NAMES_ALLOC_NUM;
754       t = (NameTable* )xmalloc(sizeof(NameTable));
755       CHECK_NULL_RETURN_MEMERR(t);
756       t->e     = NULL;
757       t->alloc = 0;
758       t->num   = 0;
759 
760       t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc);
761       if (IS_NULL(t->e)) {
762 	xfree(t);
763 	return ONIGERR_MEMORY;
764       }
765       t->alloc = alloc;
766       reg->name_table = t;
767       goto clear;
768     }
769     else if (t->num == t->alloc) {
770       int i;
771 
772       alloc = t->alloc * 2;
773       t->e = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc);
774       CHECK_NULL_RETURN_MEMERR(t->e);
775       t->alloc = alloc;
776 
777     clear:
778       for (i = t->num; i < t->alloc; i++) {
779 	t->e[i].name       = NULL;
780 	t->e[i].name_len   = 0;
781 	t->e[i].back_num   = 0;
782 	t->e[i].back_alloc = 0;
783 	t->e[i].back_refs  = (int* )NULL;
784       }
785     }
786     e = &(t->e[t->num]);
787     t->num++;
788     e->name = strdup_with_null(reg->enc, name, name_end);
789     if (IS_NULL(e->name)) return ONIGERR_MEMORY;
790     e->name_len = name_end - name;
791 #endif
792   }
793 
794   if (e->back_num >= 1 &&
795       ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) {
796     onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME,
797 				    name, name_end);
798     return ONIGERR_MULTIPLEX_DEFINED_NAME;
799   }
800 
801   e->back_num++;
802   if (e->back_num == 1) {
803     e->back_ref1 = backref;
804   }
805   else {
806     if (e->back_num == 2) {
807       alloc = INIT_NAME_BACKREFS_ALLOC_NUM;
808       e->back_refs = (int* )xmalloc(sizeof(int) * alloc);
809       CHECK_NULL_RETURN_MEMERR(e->back_refs);
810       e->back_alloc = alloc;
811       e->back_refs[0] = e->back_ref1;
812       e->back_refs[1] = backref;
813     }
814     else {
815       if (e->back_num > e->back_alloc) {
816 	alloc = e->back_alloc * 2;
817 	e->back_refs = (int* )xrealloc(e->back_refs, sizeof(int) * alloc);
818 	CHECK_NULL_RETURN_MEMERR(e->back_refs);
819 	e->back_alloc = alloc;
820       }
821       e->back_refs[e->back_num - 1] = backref;
822     }
823   }
824 
825   return 0;
826 }
827 
828 extern int
onig_name_to_group_numbers(regex_t * reg,const UChar * name,const UChar * name_end,int ** nums)829 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
830 			   const UChar* name_end, int** nums)
831 {
832   NameEntry* e = name_find(reg, name, name_end);
833 
834   if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE;
835 
836   switch (e->back_num) {
837   case 0:
838     break;
839   case 1:
840     *nums = &(e->back_ref1);
841     break;
842   default:
843     *nums = e->back_refs;
844     break;
845   }
846   return e->back_num;
847 }
848 
849 extern int
onig_name_to_backref_number(regex_t * reg,const UChar * name,const UChar * name_end,OnigRegion * region)850 onig_name_to_backref_number(regex_t* reg, const UChar* name,
851 			    const UChar* name_end, OnigRegion *region)
852 {
853   int i, n, *nums;
854 
855   n = onig_name_to_group_numbers(reg, name, name_end, &nums);
856   if (n < 0)
857     return n;
858   else if (n == 0)
859     return ONIGERR_PARSER_BUG;
860   else if (n == 1)
861     return nums[0];
862   else {
863     if (IS_NOT_NULL(region)) {
864       for (i = n - 1; i >= 0; i--) {
865 	if (region->beg[nums[i]] != ONIG_REGION_NOTPOS)
866 	  return nums[i];
867       }
868     }
869     return nums[n - 1];
870   }
871 }
872 
873 #else /* USE_NAMED_GROUP */
874 
875 extern int
onig_name_to_group_numbers(regex_t * reg,const UChar * name,const UChar * name_end,int ** nums)876 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
877 			   const UChar* name_end, int** nums)
878 {
879   return ONIG_NO_SUPPORT_CONFIG;
880 }
881 
882 extern int
onig_name_to_backref_number(regex_t * reg,const UChar * name,const UChar * name_end,OnigRegion * region)883 onig_name_to_backref_number(regex_t* reg, const UChar* name,
884 			    const UChar* name_end, OnigRegion* region)
885 {
886   return ONIG_NO_SUPPORT_CONFIG;
887 }
888 
889 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)890 onig_foreach_name(regex_t* reg,
891   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
892 {
893   return ONIG_NO_SUPPORT_CONFIG;
894 }
895 
896 extern int
onig_number_of_names(regex_t * reg)897 onig_number_of_names(regex_t* reg)
898 {
899   return 0;
900 }
901 #endif /* else USE_NAMED_GROUP */
902 
903 extern int
onig_noname_group_capture_is_active(regex_t * reg)904 onig_noname_group_capture_is_active(regex_t* reg)
905 {
906   if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_DONT_CAPTURE_GROUP))
907     return 0;
908 
909 #ifdef USE_NAMED_GROUP
910   if (onig_number_of_names(reg) > 0 &&
911       IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
912       !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
913     return 0;
914   }
915 #endif
916 
917   return 1;
918 }
919 
920 
/* Number of Node* slots in the first heap-allocated mem_nodes_dynamic array
 * created by scan_env_add_mem_entry(). */
921 #define INIT_SCANENV_MEMNODES_ALLOC_SIZE   16
922 
923 static void
scan_env_clear(ScanEnv * env)924 scan_env_clear(ScanEnv* env)
925 {
926   int i;
927 
928   BIT_STATUS_CLEAR(env->capture_history);
929   BIT_STATUS_CLEAR(env->bt_mem_start);
930   BIT_STATUS_CLEAR(env->bt_mem_end);
931   BIT_STATUS_CLEAR(env->backrefed_mem);
932   env->error      = (UChar* )NULL;
933   env->error_end  = (UChar* )NULL;
934   env->num_call   = 0;
935   env->num_mem    = 0;
936 #ifdef USE_NAMED_GROUP
937   env->num_named  = 0;
938 #endif
939   env->mem_alloc         = 0;
940   env->mem_nodes_dynamic = (Node** )NULL;
941 
942   for (i = 0; i < SCANENV_MEMNODES_SIZE; i++)
943     env->mem_nodes_static[i] = NULL_NODE;
944 
945 #ifdef USE_COMBINATION_EXPLOSION_CHECK
946   env->num_comb_exp_check  = 0;
947   env->comb_exp_max_regnum = 0;
948   env->curr_max_regnum     = 0;
949   env->has_recursion       = 0;
950 #endif
951 }
952 
953 static int
scan_env_add_mem_entry(ScanEnv * env)954 scan_env_add_mem_entry(ScanEnv* env)
955 {
956   int i, need, alloc;
957   Node** p;
958 
959   need = env->num_mem + 1;
960   if (need >= SCANENV_MEMNODES_SIZE) {
961     if (env->mem_alloc <= need) {
962       if (IS_NULL(env->mem_nodes_dynamic)) {
963 	alloc = INIT_SCANENV_MEMNODES_ALLOC_SIZE;
964 	p = (Node** )xmalloc(sizeof(Node*) * alloc);
965 	xmemcpy(p, env->mem_nodes_static,
966 		sizeof(Node*) * SCANENV_MEMNODES_SIZE);
967       }
968       else {
969 	alloc = env->mem_alloc * 2;
970 	p = (Node** )xrealloc(env->mem_nodes_dynamic, sizeof(Node*) * alloc);
971       }
972       CHECK_NULL_RETURN_MEMERR(p);
973 
974       for (i = env->num_mem + 1; i < alloc; i++)
975 	p[i] = NULL_NODE;
976 
977       env->mem_nodes_dynamic = p;
978       env->mem_alloc = alloc;
979     }
980   }
981 
982   env->num_mem++;
983   return env->num_mem;
984 }
985 
986 static int
scan_env_set_mem_node(ScanEnv * env,int num,Node * node)987 scan_env_set_mem_node(ScanEnv* env, int num, Node* node)
988 {
989   if (env->num_mem >= num)
990     SCANENV_MEM_NODES(env)[num] = node;
991   else
992     return ONIGERR_PARSER_BUG;
993   return 0;
994 }
995 
996 
/* Node recycling (only with USE_PARSE_TREE_NODE_RECYCLE): released nodes are
 * reinterpreted as FreeNode links and pushed onto the global FreeNodeList,
 * from which node_new() takes them again before falling back to xmalloc(). */
997 #ifdef USE_PARSE_TREE_NODE_RECYCLE
998 typedef struct _FreeNode {
999   struct _FreeNode* next;
1000 } FreeNode;
1001 
1002 static FreeNode* FreeNodeList = (FreeNode* )NULL;
1003 #endif
1004 
1005 extern void
onig_node_free(Node * node)1006 onig_node_free(Node* node)
1007 {
1008  start:
1009   if (IS_NULL(node)) return ;
1010 
1011   switch (NTYPE(node)) {
1012   case NT_STR:
1013     if (NSTR(node)->capa != 0 &&
1014 	IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
1015       xfree(NSTR(node)->s);
1016     }
1017     break;
1018 
1019   case NT_LIST:
1020   case NT_ALT:
1021     onig_node_free(NCAR(node));
1022     {
1023       Node* next_node = NCDR(node);
1024 
1025 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1026       {
1027 	FreeNode* n = (FreeNode* )node;
1028 
1029         THREAD_ATOMIC_START;
1030 	n->next = FreeNodeList;
1031 	FreeNodeList = n;
1032         THREAD_ATOMIC_END;
1033       }
1034 #else
1035       xfree(node);
1036 #endif
1037       node = next_node;
1038       goto start;
1039     }
1040     break;
1041 
1042   case NT_CCLASS:
1043     {
1044       CClassNode* cc = NCCLASS(node);
1045 
1046       if (IS_NCCLASS_SHARE(cc)) return ;
1047       if (cc->mbuf)
1048         bbuf_free(cc->mbuf);
1049     }
1050     break;
1051 
1052   case NT_QTFR:
1053     if (NQTFR(node)->target)
1054       onig_node_free(NQTFR(node)->target);
1055     break;
1056 
1057   case NT_ENCLOSE:
1058     if (NENCLOSE(node)->target)
1059       onig_node_free(NENCLOSE(node)->target);
1060     break;
1061 
1062   case NT_BREF:
1063     if (IS_NOT_NULL(NBREF(node)->back_dynamic))
1064       xfree(NBREF(node)->back_dynamic);
1065     break;
1066 
1067   case NT_ANCHOR:
1068     if (NANCHOR(node)->target)
1069       onig_node_free(NANCHOR(node)->target);
1070     break;
1071   }
1072 
1073 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1074   {
1075     FreeNode* n = (FreeNode* )node;
1076 
1077     THREAD_ATOMIC_START;
1078     n->next = FreeNodeList;
1079     FreeNodeList = n;
1080     THREAD_ATOMIC_END;
1081   }
1082 #else
1083   xfree(node);
1084 #endif
1085 }
1086 
1087 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1088 extern int
onig_free_node_list(void)1089 onig_free_node_list(void)
1090 {
1091   FreeNode* n;
1092 
1093   /* THREAD_ATOMIC_START; */
1094   while (IS_NOT_NULL(FreeNodeList)) {
1095     n = FreeNodeList;
1096     FreeNodeList = FreeNodeList->next;
1097     xfree(n);
1098   }
1099   /* THREAD_ATOMIC_END; */
1100   return 0;
1101 }
1102 #endif
1103 
1104 static Node*
node_new(void)1105 node_new(void)
1106 {
1107   Node* node;
1108 
1109 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1110   THREAD_ATOMIC_START;
1111   if (IS_NOT_NULL(FreeNodeList)) {
1112     node = (Node* )FreeNodeList;
1113     FreeNodeList = FreeNodeList->next;
1114     THREAD_ATOMIC_END;
1115     return node;
1116   }
1117   THREAD_ATOMIC_END;
1118 #endif
1119 
1120   node = (Node* )xmalloc(sizeof(Node));
1121   /* xmemset(node, 0, sizeof(Node)); */
1122   return node;
1123 }
1124 
1125 
1126 static void
initialize_cclass(CClassNode * cc)1127 initialize_cclass(CClassNode* cc)
1128 {
1129   BITSET_CLEAR(cc->bs);
1130   /* cc->base.flags = 0; */
1131   cc->flags = 0;
1132   cc->mbuf  = NULL;
1133 }
1134 
1135 static Node*
node_new_cclass(void)1136 node_new_cclass(void)
1137 {
1138   Node* node = node_new();
1139   CHECK_NULL_RETURN(node);
1140 
1141   SET_NTYPE(node, NT_CCLASS);
1142   initialize_cclass(NCCLASS(node));
1143   return node;
1144 }
1145 
1146 static Node*
node_new_cclass_by_codepoint_range(int not,OnigCodePoint sb_out,const OnigCodePoint ranges[])1147 node_new_cclass_by_codepoint_range(int not, OnigCodePoint sb_out,
1148 				   const OnigCodePoint ranges[])
1149 {
1150   int n, i;
1151   CClassNode* cc;
1152   OnigCodePoint j;
1153 
1154   Node* node = node_new_cclass();
1155   CHECK_NULL_RETURN(node);
1156 
1157   cc = NCCLASS(node);
1158   if (not != 0) NCCLASS_SET_NOT(cc);
1159 
1160   BITSET_CLEAR(cc->bs);
1161   if (sb_out > 0 && IS_NOT_NULL(ranges)) {
1162     n = ONIGENC_CODE_RANGE_NUM(ranges);
1163     for (i = 0; i < n; i++) {
1164       for (j  = ONIGENC_CODE_RANGE_FROM(ranges, i);
1165            j <= (OnigCodePoint )ONIGENC_CODE_RANGE_TO(ranges, i); j++) {
1166 	if (j >= sb_out) goto sb_end;
1167 
1168         BITSET_SET_BIT(cc->bs, j);
1169       }
1170     }
1171   }
1172 
1173  sb_end:
1174   if (IS_NULL(ranges)) {
1175   is_null:
1176     cc->mbuf = NULL;
1177   }
1178   else {
1179     BBuf* bbuf;
1180 
1181     n = ONIGENC_CODE_RANGE_NUM(ranges);
1182     if (n == 0) goto is_null;
1183 
1184     bbuf = (BBuf* )xmalloc(sizeof(BBuf));
1185     CHECK_NULL_RETURN(bbuf);
1186     bbuf->alloc = n + 1;
1187     bbuf->used  = n + 1;
1188     bbuf->p     = (UChar* )((void* )ranges);
1189 
1190     cc->mbuf = bbuf;
1191   }
1192 
1193   return node;
1194 }
1195 
1196 static Node*
node_new_ctype(int type,int not)1197 node_new_ctype(int type, int not)
1198 {
1199   Node* node = node_new();
1200   CHECK_NULL_RETURN(node);
1201 
1202   SET_NTYPE(node, NT_CTYPE);
1203   NCTYPE(node)->ctype = type;
1204   NCTYPE(node)->not   = not;
1205   return node;
1206 }
1207 
1208 static Node*
node_new_anychar(void)1209 node_new_anychar(void)
1210 {
1211   Node* node = node_new();
1212   CHECK_NULL_RETURN(node);
1213 
1214   SET_NTYPE(node, NT_CANY);
1215   return node;
1216 }
1217 
1218 static Node*
node_new_list(Node * left,Node * right)1219 node_new_list(Node* left, Node* right)
1220 {
1221   Node* node = node_new();
1222   CHECK_NULL_RETURN(node);
1223 
1224   SET_NTYPE(node, NT_LIST);
1225   NCAR(node)  = left;
1226   NCDR(node) = right;
1227   return node;
1228 }
1229 
1230 extern Node*
onig_node_new_list(Node * left,Node * right)1231 onig_node_new_list(Node* left, Node* right)
1232 {
1233   return node_new_list(left, right);
1234 }
1235 
1236 extern Node*
onig_node_list_add(Node * list,Node * x)1237 onig_node_list_add(Node* list, Node* x)
1238 {
1239   Node *n;
1240 
1241   n = onig_node_new_list(x, NULL);
1242   if (IS_NULL(n)) return NULL_NODE;
1243 
1244   if (IS_NOT_NULL(list)) {
1245     while (IS_NOT_NULL(NCDR(list)))
1246       list = NCDR(list);
1247 
1248     NCDR(list) = n;
1249   }
1250 
1251   return n;
1252 }
1253 
1254 extern Node*
onig_node_new_alt(Node * left,Node * right)1255 onig_node_new_alt(Node* left, Node* right)
1256 {
1257   Node* node = node_new();
1258   CHECK_NULL_RETURN(node);
1259 
1260   SET_NTYPE(node, NT_ALT);
1261   NCAR(node)  = left;
1262   NCDR(node) = right;
1263   return node;
1264 }
1265 
1266 extern Node*
onig_node_new_anchor(int type)1267 onig_node_new_anchor(int type)
1268 {
1269   Node* node = node_new();
1270   CHECK_NULL_RETURN(node);
1271 
1272   SET_NTYPE(node, NT_ANCHOR);
1273   NANCHOR(node)->type     = type;
1274   NANCHOR(node)->target   = NULL;
1275   NANCHOR(node)->char_len = -1;
1276   return node;
1277 }
1278 
1279 static Node*
node_new_backref(int back_num,int * backrefs,int by_name,int exist_level,int nest_level,ScanEnv * env)1280 node_new_backref(int back_num, int* backrefs, int by_name,
1281 #ifdef USE_BACKREF_WITH_LEVEL
1282 		 int exist_level, int nest_level,
1283 #endif
1284 		 ScanEnv* env)
1285 {
1286   int i;
1287   Node* node = node_new();
1288 
1289   CHECK_NULL_RETURN(node);
1290 
1291   SET_NTYPE(node, NT_BREF);
1292   NBREF(node)->state    = 0;
1293   NBREF(node)->back_num = back_num;
1294   NBREF(node)->back_dynamic = (int* )NULL;
1295   if (by_name != 0)
1296     NBREF(node)->state |= NST_NAME_REF;
1297 
1298 #ifdef USE_BACKREF_WITH_LEVEL
1299   if (exist_level != 0) {
1300     NBREF(node)->state |= NST_NEST_LEVEL;
1301     NBREF(node)->nest_level  = nest_level;
1302   }
1303 #endif
1304 
1305   for (i = 0; i < back_num; i++) {
1306     if (backrefs[i] <= env->num_mem &&
1307 	IS_NULL(SCANENV_MEM_NODES(env)[backrefs[i]])) {
1308       NBREF(node)->state |= NST_RECURSION;   /* /...(\1).../ */
1309       break;
1310     }
1311   }
1312 
1313   if (back_num <= NODE_BACKREFS_SIZE) {
1314     for (i = 0; i < back_num; i++)
1315       NBREF(node)->back_static[i] = backrefs[i];
1316   }
1317   else {
1318     int* p = (int* )xmalloc(sizeof(int) * back_num);
1319     if (IS_NULL(p)) {
1320       onig_node_free(node);
1321       return NULL;
1322     }
1323     NBREF(node)->back_dynamic = p;
1324     for (i = 0; i < back_num; i++)
1325       p[i] = backrefs[i];
1326   }
1327   return node;
1328 }
1329 
1330 #ifdef USE_SUBEXP_CALL
1331 static Node*
node_new_call(UChar * name,UChar * name_end,int gnum)1332 node_new_call(UChar* name, UChar* name_end, int gnum)
1333 {
1334   Node* node = node_new();
1335   CHECK_NULL_RETURN(node);
1336 
1337   SET_NTYPE(node, NT_CALL);
1338   NCALL(node)->state     = 0;
1339   NCALL(node)->target    = NULL_NODE;
1340   NCALL(node)->name      = name;
1341   NCALL(node)->name_end  = name_end;
1342   NCALL(node)->group_num = gnum;  /* call by number if gnum != 0 */
1343   return node;
1344 }
1345 #endif
1346 
1347 static Node*
node_new_quantifier(int lower,int upper,int by_number)1348 node_new_quantifier(int lower, int upper, int by_number)
1349 {
1350   Node* node = node_new();
1351   CHECK_NULL_RETURN(node);
1352 
1353   SET_NTYPE(node, NT_QTFR);
1354   NQTFR(node)->state  = 0;
1355   NQTFR(node)->target = NULL;
1356   NQTFR(node)->lower  = lower;
1357   NQTFR(node)->upper  = upper;
1358   NQTFR(node)->greedy = 1;
1359   NQTFR(node)->target_empty_info = NQ_TARGET_ISNOT_EMPTY;
1360   NQTFR(node)->head_exact        = NULL_NODE;
1361   NQTFR(node)->next_head_exact   = NULL_NODE;
1362   NQTFR(node)->is_refered        = 0;
1363   if (by_number != 0)
1364     NQTFR(node)->state |= NST_BY_NUMBER;
1365 
1366 #ifdef USE_COMBINATION_EXPLOSION_CHECK
1367   NQTFR(node)->comb_exp_check_num = 0;
1368 #endif
1369 
1370   return node;
1371 }
1372 
1373 static Node*
node_new_enclose(int type)1374 node_new_enclose(int type)
1375 {
1376   Node* node = node_new();
1377   CHECK_NULL_RETURN(node);
1378 
1379   SET_NTYPE(node, NT_ENCLOSE);
1380   NENCLOSE(node)->type      = type;
1381   NENCLOSE(node)->state     =  0;
1382   NENCLOSE(node)->regnum    =  0;
1383   NENCLOSE(node)->option    =  0;
1384   NENCLOSE(node)->target    = NULL;
1385   NENCLOSE(node)->call_addr = -1;
1386   NENCLOSE(node)->opt_count =  0;
1387   return node;
1388 }
1389 
1390 extern Node*
onig_node_new_enclose(int type)1391 onig_node_new_enclose(int type)
1392 {
1393   return node_new_enclose(type);
1394 }
1395 
1396 static Node*
node_new_enclose_memory(OnigOptionType option,int is_named)1397 node_new_enclose_memory(OnigOptionType option, int is_named)
1398 {
1399   Node* node = node_new_enclose(ENCLOSE_MEMORY);
1400   CHECK_NULL_RETURN(node);
1401   if (is_named != 0)
1402     SET_ENCLOSE_STATUS(node, NST_NAMED_GROUP);
1403 
1404 #ifdef USE_SUBEXP_CALL
1405   NENCLOSE(node)->option = option;
1406 #endif
1407   return node;
1408 }
1409 
1410 static Node*
node_new_option(OnigOptionType option)1411 node_new_option(OnigOptionType option)
1412 {
1413   Node* node = node_new_enclose(ENCLOSE_OPTION);
1414   CHECK_NULL_RETURN(node);
1415   NENCLOSE(node)->option = option;
1416   return node;
1417 }
1418 
1419 extern int
onig_node_str_cat(Node * node,const UChar * s,const UChar * end)1420 onig_node_str_cat(Node* node, const UChar* s, const UChar* end)
1421 {
1422   int addlen = end - s;
1423 
1424   if (addlen > 0) {
1425     int len  = NSTR(node)->end - NSTR(node)->s;
1426 
1427     if (NSTR(node)->capa > 0 || (len + addlen > NODE_STR_BUF_SIZE - 1)) {
1428       UChar* p;
1429       int capa = len + addlen + NODE_STR_MARGIN;
1430 
1431       if (capa <= NSTR(node)->capa) {
1432 	onig_strcpy(NSTR(node)->s + len, s, end);
1433       }
1434       else {
1435 	if (NSTR(node)->s == NSTR(node)->buf)
1436 	  p = strcat_capa_from_static(NSTR(node)->s, NSTR(node)->end,
1437 				      s, end, capa);
1438 	else
1439 	  p = strcat_capa(NSTR(node)->s, NSTR(node)->end, s, end, capa);
1440 
1441 	CHECK_NULL_RETURN_MEMERR(p);
1442 	NSTR(node)->s    = p;
1443 	NSTR(node)->capa = capa;
1444       }
1445     }
1446     else {
1447       onig_strcpy(NSTR(node)->s + len, s, end);
1448     }
1449     NSTR(node)->end = NSTR(node)->s + len + addlen;
1450   }
1451 
1452   return 0;
1453 }
1454 
1455 extern int
onig_node_str_set(Node * node,const UChar * s,const UChar * end)1456 onig_node_str_set(Node* node, const UChar* s, const UChar* end)
1457 {
1458   onig_node_str_clear(node);
1459   return onig_node_str_cat(node, s, end);
1460 }
1461 
1462 static int
node_str_cat_char(Node * node,UChar c)1463 node_str_cat_char(Node* node, UChar c)
1464 {
1465   UChar s[1];
1466 
1467   s[0] = c;
1468   return onig_node_str_cat(node, s, s + 1);
1469 }
1470 
1471 extern void
onig_node_conv_to_str_node(Node * node,int flag)1472 onig_node_conv_to_str_node(Node* node, int flag)
1473 {
1474   SET_NTYPE(node, NT_STR);
1475   NSTR(node)->flag = flag;
1476   NSTR(node)->capa = 0;
1477   NSTR(node)->s    = NSTR(node)->buf;
1478   NSTR(node)->end  = NSTR(node)->buf;
1479 }
1480 
1481 extern void
onig_node_str_clear(Node * node)1482 onig_node_str_clear(Node* node)
1483 {
1484   if (NSTR(node)->capa != 0 &&
1485       IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
1486     xfree(NSTR(node)->s);
1487   }
1488 
1489   NSTR(node)->capa = 0;
1490   NSTR(node)->flag = 0;
1491   NSTR(node)->s    = NSTR(node)->buf;
1492   NSTR(node)->end  = NSTR(node)->buf;
1493 }
1494 
1495 static Node*
node_new_str(const UChar * s,const UChar * end)1496 node_new_str(const UChar* s, const UChar* end)
1497 {
1498   Node* node = node_new();
1499   CHECK_NULL_RETURN(node);
1500 
1501   SET_NTYPE(node, NT_STR);
1502   NSTR(node)->capa = 0;
1503   NSTR(node)->flag = 0;
1504   NSTR(node)->s    = NSTR(node)->buf;
1505   NSTR(node)->end  = NSTR(node)->buf;
1506   if (onig_node_str_cat(node, s, end)) {
1507     onig_node_free(node);
1508     return NULL;
1509   }
1510   return node;
1511 }
1512 
1513 extern Node*
onig_node_new_str(const UChar * s,const UChar * end)1514 onig_node_new_str(const UChar* s, const UChar* end)
1515 {
1516   return node_new_str(s, end);
1517 }
1518 
1519 static Node*
node_new_str_raw(UChar * s,UChar * end)1520 node_new_str_raw(UChar* s, UChar* end)
1521 {
1522   Node* node = node_new_str(s, end);
1523   NSTRING_SET_RAW(node);
1524   return node;
1525 }
1526 
1527 static Node*
node_new_empty(void)1528 node_new_empty(void)
1529 {
1530   return node_new_str(NULL, NULL);
1531 }
1532 
1533 static Node*
node_new_str_raw_char(UChar c)1534 node_new_str_raw_char(UChar c)
1535 {
1536   UChar p[1];
1537 
1538   p[0] = c;
1539   return node_new_str_raw(p, p + 1);
1540 }
1541 
1542 static Node*
str_node_split_last_char(StrNode * sn,OnigEncoding enc)1543 str_node_split_last_char(StrNode* sn, OnigEncoding enc)
1544 {
1545   const UChar *p;
1546   Node* n = NULL_NODE;
1547 
1548   if (sn->end > sn->s) {
1549     p = onigenc_get_prev_char_head(enc, sn->s, sn->end);
1550     if (p && p > sn->s) { /* can be splitted. */
1551       n = node_new_str(p, sn->end);
1552       if ((sn->flag & NSTR_RAW) != 0)
1553 	NSTRING_SET_RAW(n);
1554       sn->end = (UChar* )p;
1555     }
1556   }
1557   return n;
1558 }
1559 
1560 static int
str_node_can_be_split(StrNode * sn,OnigEncoding enc)1561 str_node_can_be_split(StrNode* sn, OnigEncoding enc)
1562 {
1563   if (sn->end > sn->s) {
1564     return ((enclen(enc, sn->s) < sn->end - sn->s)  ?  1 : 0);
1565   }
1566   return 0;
1567 }
1568 
1569 #ifdef USE_PAD_TO_SHORT_BYTE_CHAR
1570 static int
node_str_head_pad(StrNode * sn,int num,UChar val)1571 node_str_head_pad(StrNode* sn, int num, UChar val)
1572 {
1573   UChar buf[NODE_STR_BUF_SIZE];
1574   int i, len;
1575 
1576   len = sn->end - sn->s;
1577   onig_strcpy(buf, sn->s, sn->end);
1578   onig_strcpy(&(sn->s[num]), buf, buf + len);
1579   sn->end += num;
1580 
1581   for (i = 0; i < num; i++) {
1582     sn->s[i] = val;
1583   }
1584 }
1585 #endif
1586 
1587 extern int
onig_scan_unsigned_number(UChar ** src,const UChar * end,OnigEncoding enc)1588 onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc)
1589 {
1590   unsigned int num, val;
1591   OnigCodePoint c;
1592   UChar* p = *src;
1593   PFETCH_READY;
1594 
1595   num = 0;
1596   while (!PEND) {
1597     PFETCH(c);
1598     if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
1599       val = (unsigned int )DIGITVAL(c);
1600       if ((INT_MAX_LIMIT - val) / 10UL < num)
1601 	return -1;  /* overflow */
1602 
1603       num = num * 10 + val;
1604     }
1605     else {
1606       PUNFETCH;
1607       break;
1608     }
1609   }
1610   *src = p;
1611   return num;
1612 }
1613 
1614 static int
scan_unsigned_hexadecimal_number(UChar ** src,UChar * end,int maxlen,OnigEncoding enc)1615 scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int maxlen,
1616 				 OnigEncoding enc)
1617 {
1618   OnigCodePoint c;
1619   unsigned int num, val;
1620   UChar* p = *src;
1621   PFETCH_READY;
1622 
1623   num = 0;
1624   while (!PEND && maxlen-- != 0) {
1625     PFETCH(c);
1626     if (ONIGENC_IS_CODE_XDIGIT(enc, c)) {
1627       val = (unsigned int )XDIGITVAL(enc,c);
1628       if ((INT_MAX_LIMIT - val) / 16UL < num)
1629 	return -1;  /* overflow */
1630 
1631       num = (num << 4) + XDIGITVAL(enc,c);
1632     }
1633     else {
1634       PUNFETCH;
1635       break;
1636     }
1637   }
1638   *src = p;
1639   return num;
1640 }
1641 
1642 static int
scan_unsigned_octal_number(UChar ** src,UChar * end,int maxlen,OnigEncoding enc)1643 scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen,
1644 			   OnigEncoding enc)
1645 {
1646   OnigCodePoint c;
1647   unsigned int num, val;
1648   UChar* p = *src;
1649   PFETCH_READY;
1650 
1651   num = 0;
1652   while (!PEND && maxlen-- != 0) {
1653     PFETCH(c);
1654     if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8') {
1655       val = ODIGITVAL(c);
1656       if ((INT_MAX_LIMIT - val) / 8UL < num)
1657 	return -1;  /* overflow */
1658 
1659       num = (num << 3) + val;
1660     }
1661     else {
1662       PUNFETCH;
1663       break;
1664     }
1665   }
1666   *src = p;
1667   return num;
1668 }
1669 
1670 
1671 #define BBUF_WRITE_CODE_POINT(bbuf,pos,code) \
1672     BBUF_WRITE(bbuf, pos, &(code), SIZE_CODE_POINT)
1673 
1674 /* data format:
1675      [n][from-1][to-1][from-2][to-2] ... [from-n][to-n]
1676      (all data size is OnigCodePoint)
1677  */
1678 static int
new_code_range(BBuf ** pbuf)1679 new_code_range(BBuf** pbuf)
1680 {
1681 #define INIT_MULTI_BYTE_RANGE_SIZE  (SIZE_CODE_POINT * 5)
1682   int r;
1683   OnigCodePoint n;
1684   BBuf* bbuf;
1685 
1686   bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf));
1687   CHECK_NULL_RETURN_MEMERR(*pbuf);
1688   r = BBUF_INIT(*pbuf, INIT_MULTI_BYTE_RANGE_SIZE);
1689   if (r) return r;
1690 
1691   n = 0;
1692   BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1693   return 0;
1694 }
1695 
1696 static int
add_code_range_to_buf(BBuf ** pbuf,OnigCodePoint from,OnigCodePoint to)1697 add_code_range_to_buf(BBuf** pbuf, OnigCodePoint from, OnigCodePoint to)
1698 {
1699   int r, inc_n, pos;
1700   int low, high, bound, x;
1701   OnigCodePoint n, *data;
1702   BBuf* bbuf;
1703 
1704   if (from > to) {
1705     n = from; from = to; to = n;
1706   }
1707 
1708   if (IS_NULL(*pbuf)) {
1709     r = new_code_range(pbuf);
1710     if (r) return r;
1711     bbuf = *pbuf;
1712     n = 0;
1713   }
1714   else {
1715     bbuf = *pbuf;
1716     GET_CODE_POINT(n, bbuf->p);
1717   }
1718   data = (OnigCodePoint* )(bbuf->p);
1719   data++;
1720 
1721   for (low = 0, bound = n; low < bound; ) {
1722     x = (low + bound) >> 1;
1723     if (from > data[x*2 + 1])
1724       low = x + 1;
1725     else
1726       bound = x;
1727   }
1728 
1729   for (high = low, bound = n; high < bound; ) {
1730     x = (high + bound) >> 1;
1731     if (to >= data[x*2] - 1)
1732       high = x + 1;
1733     else
1734       bound = x;
1735   }
1736 
1737   inc_n = low + 1 - high;
1738   if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM)
1739     return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES;
1740 
1741   if (inc_n != 1) {
1742     if (from > data[low*2])
1743       from = data[low*2];
1744     if (to < data[(high - 1)*2 + 1])
1745       to = data[(high - 1)*2 + 1];
1746   }
1747 
1748   if (inc_n != 0 && (OnigCodePoint )high < n) {
1749     int from_pos = SIZE_CODE_POINT * (1 + high * 2);
1750     int to_pos   = SIZE_CODE_POINT * (1 + (low + 1) * 2);
1751     int size = (n - high) * 2 * SIZE_CODE_POINT;
1752 
1753     if (inc_n > 0) {
1754       BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size);
1755     }
1756     else {
1757       BBUF_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos);
1758     }
1759   }
1760 
1761   pos = SIZE_CODE_POINT * (1 + low * 2);
1762   BBUF_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2);
1763   BBUF_WRITE_CODE_POINT(bbuf, pos, from);
1764   BBUF_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to);
1765   n += inc_n;
1766   BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1767 
1768   return 0;
1769 }
1770 
1771 static int
add_code_range(BBuf ** pbuf,ScanEnv * env,OnigCodePoint from,OnigCodePoint to)1772 add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
1773 {
1774   if (from > to) {
1775     if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
1776       return 0;
1777     else
1778       return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
1779   }
1780 
1781   return add_code_range_to_buf(pbuf, from, to);
1782 }
1783 
1784 static int
not_code_range_buf(OnigEncoding enc,BBuf * bbuf,BBuf ** pbuf)1785 not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf)
1786 {
1787   int r, i, n;
1788   OnigCodePoint pre, from, *data, to = 0;
1789 
1790   *pbuf = (BBuf* )NULL;
1791   if (IS_NULL(bbuf)) {
1792   set_all:
1793     return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1794   }
1795 
1796   data = (OnigCodePoint* )(bbuf->p);
1797   GET_CODE_POINT(n, data);
1798   data++;
1799   if (n <= 0) goto set_all;
1800 
1801   r = 0;
1802   pre = MBCODE_START_POS(enc);
1803   for (i = 0; i < n; i++) {
1804     from = data[i*2];
1805     to   = data[i*2+1];
1806     if (pre <= from - 1) {
1807       r = add_code_range_to_buf(pbuf, pre, from - 1);
1808       if (r != 0) return r;
1809     }
1810     if (to == ~((OnigCodePoint )0)) break;
1811     pre = to + 1;
1812   }
1813   if (to < ~((OnigCodePoint )0)) {
1814     r = add_code_range_to_buf(pbuf, to + 1, ~((OnigCodePoint )0));
1815   }
1816   return r;
1817 }
1818 
1819 #define SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2) do {\
1820   BBuf *tbuf; \
1821   int  tnot; \
1822   tnot = not1;  not1  = not2;  not2  = tnot; \
1823   tbuf = bbuf1; bbuf1 = bbuf2; bbuf2 = tbuf; \
1824 } while (0)
1825 
1826 static int
or_code_range_buf(OnigEncoding enc,BBuf * bbuf1,int not1,BBuf * bbuf2,int not2,BBuf ** pbuf)1827 or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1,
1828                   BBuf* bbuf2, int not2, BBuf** pbuf)
1829 {
1830   int r;
1831   OnigCodePoint i, n1, *data1;
1832   OnigCodePoint from, to;
1833 
1834   *pbuf = (BBuf* )NULL;
1835   if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) {
1836     if (not1 != 0 || not2 != 0)
1837       return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1838     return 0;
1839   }
1840 
1841   r = 0;
1842   if (IS_NULL(bbuf2))
1843     SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1844 
1845   if (IS_NULL(bbuf1)) {
1846     if (not1 != 0) {
1847       return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1848     }
1849     else {
1850       if (not2 == 0) {
1851 	return bbuf_clone(pbuf, bbuf2);
1852       }
1853       else {
1854 	return not_code_range_buf(enc, bbuf2, pbuf);
1855       }
1856     }
1857   }
1858 
1859   if (not1 != 0)
1860     SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1861 
1862   data1 = (OnigCodePoint* )(bbuf1->p);
1863   GET_CODE_POINT(n1, data1);
1864   data1++;
1865 
1866   if (not2 == 0 && not1 == 0) { /* 1 OR 2 */
1867     r = bbuf_clone(pbuf, bbuf2);
1868   }
1869   else if (not1 == 0) { /* 1 OR (not 2) */
1870     r = not_code_range_buf(enc, bbuf2, pbuf);
1871   }
1872   if (r != 0) return r;
1873 
1874   for (i = 0; i < n1; i++) {
1875     from = data1[i*2];
1876     to   = data1[i*2+1];
1877     r = add_code_range_to_buf(pbuf, from, to);
1878     if (r != 0) return r;
1879   }
1880   return 0;
1881 }
1882 
1883 static int
and_code_range1(BBuf ** pbuf,OnigCodePoint from1,OnigCodePoint to1,OnigCodePoint * data,int n)1884 and_code_range1(BBuf** pbuf, OnigCodePoint from1, OnigCodePoint to1,
1885 	        OnigCodePoint* data, int n)
1886 {
1887   int i, r;
1888   OnigCodePoint from2, to2;
1889 
1890   for (i = 0; i < n; i++) {
1891     from2 = data[i*2];
1892     to2   = data[i*2+1];
1893     if (from2 < from1) {
1894       if (to2 < from1) continue;
1895       else {
1896 	from1 = to2 + 1;
1897       }
1898     }
1899     else if (from2 <= to1) {
1900       if (to2 < to1) {
1901 	if (from1 <= from2 - 1) {
1902 	  r = add_code_range_to_buf(pbuf, from1, from2-1);
1903 	  if (r != 0) return r;
1904 	}
1905 	from1 = to2 + 1;
1906       }
1907       else {
1908 	to1 = from2 - 1;
1909       }
1910     }
1911     else {
1912       from1 = from2;
1913     }
1914     if (from1 > to1) break;
1915   }
1916   if (from1 <= to1) {
1917     r = add_code_range_to_buf(pbuf, from1, to1);
1918     if (r != 0) return r;
1919   }
1920   return 0;
1921 }
1922 
1923 static int
and_code_range_buf(BBuf * bbuf1,int not1,BBuf * bbuf2,int not2,BBuf ** pbuf)1924 and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf)
1925 {
1926   int r;
1927   OnigCodePoint i, j, n1, n2, *data1, *data2;
1928   OnigCodePoint from, to, from1, to1, from2, to2;
1929 
1930   *pbuf = (BBuf* )NULL;
1931   if (IS_NULL(bbuf1)) {
1932     if (not1 != 0 && IS_NOT_NULL(bbuf2)) /* not1 != 0 -> not2 == 0 */
1933       return bbuf_clone(pbuf, bbuf2);
1934     return 0;
1935   }
1936   else if (IS_NULL(bbuf2)) {
1937     if (not2 != 0)
1938       return bbuf_clone(pbuf, bbuf1);
1939     return 0;
1940   }
1941 
1942   if (not1 != 0)
1943     SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1944 
1945   data1 = (OnigCodePoint* )(bbuf1->p);
1946   data2 = (OnigCodePoint* )(bbuf2->p);
1947   GET_CODE_POINT(n1, data1);
1948   GET_CODE_POINT(n2, data2);
1949   data1++;
1950   data2++;
1951 
1952   if (not2 == 0 && not1 == 0) { /* 1 AND 2 */
1953     for (i = 0; i < n1; i++) {
1954       from1 = data1[i*2];
1955       to1   = data1[i*2+1];
1956       for (j = 0; j < n2; j++) {
1957 	from2 = data2[j*2];
1958 	to2   = data2[j*2+1];
1959 	if (from2 > to1) break;
1960 	if (to2 < from1) continue;
1961 	from = MAX(from1, from2);
1962 	to   = MIN(to1, to2);
1963 	r = add_code_range_to_buf(pbuf, from, to);
1964 	if (r != 0) return r;
1965       }
1966     }
1967   }
1968   else if (not1 == 0) { /* 1 AND (not 2) */
1969     for (i = 0; i < n1; i++) {
1970       from1 = data1[i*2];
1971       to1   = data1[i*2+1];
1972       r = and_code_range1(pbuf, from1, to1, data2, n2);
1973       if (r != 0) return r;
1974     }
1975   }
1976 
1977   return 0;
1978 }
1979 
1980 static int
and_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)1981 and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
1982 {
1983   int r, not1, not2;
1984   BBuf *buf1, *buf2, *pbuf;
1985   BitSetRef bsr1, bsr2;
1986   BitSet bs1, bs2;
1987 
1988   not1 = IS_NCCLASS_NOT(dest);
1989   bsr1 = dest->bs;
1990   buf1 = dest->mbuf;
1991   not2 = IS_NCCLASS_NOT(cc);
1992   bsr2 = cc->bs;
1993   buf2 = cc->mbuf;
1994 
1995   if (not1 != 0) {
1996     bitset_invert_to(bsr1, bs1);
1997     bsr1 = bs1;
1998   }
1999   if (not2 != 0) {
2000     bitset_invert_to(bsr2, bs2);
2001     bsr2 = bs2;
2002   }
2003   bitset_and(bsr1, bsr2);
2004   if (bsr1 != dest->bs) {
2005     bitset_copy(dest->bs, bsr1);
2006     bsr1 = dest->bs;
2007   }
2008   if (not1 != 0) {
2009     bitset_invert(dest->bs);
2010   }
2011 
2012   if (! ONIGENC_IS_SINGLEBYTE(enc)) {
2013     if (not1 != 0 && not2 != 0) {
2014       r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf);
2015     }
2016     else {
2017       r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf);
2018       if (r == 0 && not1 != 0) {
2019 	BBuf *tbuf;
2020 	r = not_code_range_buf(enc, pbuf, &tbuf);
2021 	if (r != 0) {
2022 	  bbuf_free(pbuf);
2023 	  return r;
2024 	}
2025 	bbuf_free(pbuf);
2026 	pbuf = tbuf;
2027       }
2028     }
2029     if (r != 0) return r;
2030 
2031     dest->mbuf = pbuf;
2032     bbuf_free(buf1);
2033     return r;
2034   }
2035   return 0;
2036 }
2037 
2038 static int
or_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)2039 or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
2040 {
2041   int r, not1, not2;
2042   BBuf *buf1, *buf2, *pbuf;
2043   BitSetRef bsr1, bsr2;
2044   BitSet bs1, bs2;
2045 
2046   not1 = IS_NCCLASS_NOT(dest);
2047   bsr1 = dest->bs;
2048   buf1 = dest->mbuf;
2049   not2 = IS_NCCLASS_NOT(cc);
2050   bsr2 = cc->bs;
2051   buf2 = cc->mbuf;
2052 
2053   if (not1 != 0) {
2054     bitset_invert_to(bsr1, bs1);
2055     bsr1 = bs1;
2056   }
2057   if (not2 != 0) {
2058     bitset_invert_to(bsr2, bs2);
2059     bsr2 = bs2;
2060   }
2061   bitset_or(bsr1, bsr2);
2062   if (bsr1 != dest->bs) {
2063     bitset_copy(dest->bs, bsr1);
2064     bsr1 = dest->bs;
2065   }
2066   if (not1 != 0) {
2067     bitset_invert(dest->bs);
2068   }
2069 
2070   if (! ONIGENC_IS_SINGLEBYTE(enc)) {
2071     if (not1 != 0 && not2 != 0) {
2072       r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf);
2073     }
2074     else {
2075       r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf);
2076       if (r == 0 && not1 != 0) {
2077 	BBuf *tbuf;
2078 	r = not_code_range_buf(enc, pbuf, &tbuf);
2079 	if (r != 0) {
2080 	  bbuf_free(pbuf);
2081 	  return r;
2082 	}
2083 	bbuf_free(pbuf);
2084 	pbuf = tbuf;
2085       }
2086     }
2087     if (r != 0) return r;
2088 
2089     dest->mbuf = pbuf;
2090     bbuf_free(buf1);
2091     return r;
2092   }
2093   else
2094     return 0;
2095 }
2096 
2097 static int
conv_backslash_value(int c,ScanEnv * env)2098 conv_backslash_value(int c, ScanEnv* env)
2099 {
2100   if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) {
2101     switch (c) {
2102     case 'n': return '\n';
2103     case 't': return '\t';
2104     case 'r': return '\r';
2105     case 'f': return '\f';
2106     case 'a': return '\007';
2107     case 'b': return '\010';
2108     case 'e': return '\033';
2109     case 'v':
2110       if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB))
2111 	return '\v';
2112       break;
2113 
2114     default:
2115       break;
2116     }
2117   }
2118   return c;
2119 }
2120 
2121 static int
is_invalid_quantifier_target(Node * node)2122 is_invalid_quantifier_target(Node* node)
2123 {
2124   switch (NTYPE(node)) {
2125   case NT_ANCHOR:
2126     return 1;
2127     break;
2128 
2129   case NT_ENCLOSE:
2130     /* allow enclosed elements */
2131     /* return is_invalid_quantifier_target(NENCLOSE(node)->target); */
2132     break;
2133 
2134   case NT_LIST:
2135     do {
2136       if (! is_invalid_quantifier_target(NCAR(node))) return 0;
2137     } while (IS_NOT_NULL(node = NCDR(node)));
2138     return 0;
2139     break;
2140 
2141   case NT_ALT:
2142     do {
2143       if (is_invalid_quantifier_target(NCAR(node))) return 1;
2144     } while (IS_NOT_NULL(node = NCDR(node)));
2145     break;
2146 
2147   default:
2148     break;
2149   }
2150   return 0;
2151 }
2152 
2153 /* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */
2154 static int
popular_quantifier_num(QtfrNode * q)2155 popular_quantifier_num(QtfrNode* q)
2156 {
2157   if (q->greedy) {
2158     if (q->lower == 0) {
2159       if (q->upper == 1) return 0;
2160       else if (IS_REPEAT_INFINITE(q->upper)) return 1;
2161     }
2162     else if (q->lower == 1) {
2163       if (IS_REPEAT_INFINITE(q->upper)) return 2;
2164     }
2165   }
2166   else {
2167     if (q->lower == 0) {
2168       if (q->upper == 1) return 3;
2169       else if (IS_REPEAT_INFINITE(q->upper)) return 4;
2170     }
2171     else if (q->lower == 1) {
2172       if (IS_REPEAT_INFINITE(q->upper)) return 5;
2173     }
2174   }
2175   return -1;
2176 }
2177 
2178 
/* Rewrites applied by onig_reduce_nested_quantifier() to a quantifier
 * nested directly inside another quantifier. */
enum ReduceType {
  RQ_ASIS = 0,   /* as is */
  RQ_DEL  = 1,   /* delete parent */
  RQ_A    = 2,   /* to '*'    */
  RQ_AQ   = 3,   /* to '*?'   */
  RQ_QQ   = 4,   /* to '??'   */
  RQ_P_QQ = 5,   /* to '+)??' */
  RQ_PQ_Q = 6    /* to '+?)?' */
};
2188 
2189 static enum ReduceType ReduceTypeTable[6][6] = {
2190   {RQ_DEL,  RQ_A,    RQ_A,   RQ_QQ,   RQ_AQ,   RQ_ASIS}, /* '?'  */
2191   {RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL},  /* '*'  */
2192   {RQ_A,    RQ_A,    RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL},  /* '+'  */
2193   {RQ_DEL,  RQ_AQ,   RQ_AQ,  RQ_DEL,  RQ_AQ,   RQ_AQ},   /* '??' */
2194   {RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_DEL,  RQ_DEL,  RQ_DEL},  /* '*?' */
2195   {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ,   RQ_AQ,   RQ_DEL}   /* '+?' */
2196 };
2197 
2198 extern void
onig_reduce_nested_quantifier(Node * pnode,Node * cnode)2199 onig_reduce_nested_quantifier(Node* pnode, Node* cnode)
2200 {
2201   int pnum, cnum;
2202   QtfrNode *p, *c;
2203 
2204   p = NQTFR(pnode);
2205   c = NQTFR(cnode);
2206   pnum = popular_quantifier_num(p);
2207   cnum = popular_quantifier_num(c);
2208   if (pnum < 0 || cnum < 0) return ;
2209 
2210   switch(ReduceTypeTable[cnum][pnum]) {
2211   case RQ_DEL:
2212     *pnode = *cnode;
2213     break;
2214   case RQ_A:
2215     p->target = c->target;
2216     p->lower  = 0;  p->upper = REPEAT_INFINITE;  p->greedy = 1;
2217     break;
2218   case RQ_AQ:
2219     p->target = c->target;
2220     p->lower  = 0;  p->upper = REPEAT_INFINITE;  p->greedy = 0;
2221     break;
2222   case RQ_QQ:
2223     p->target = c->target;
2224     p->lower  = 0;  p->upper = 1;  p->greedy = 0;
2225     break;
2226   case RQ_P_QQ:
2227     p->target = cnode;
2228     p->lower  = 0;  p->upper = 1;  p->greedy = 0;
2229     c->lower  = 1;  c->upper = REPEAT_INFINITE;  c->greedy = 1;
2230     return ;
2231     break;
2232   case RQ_PQ_Q:
2233     p->target = cnode;
2234     p->lower  = 0;  p->upper = 1;  p->greedy = 1;
2235     c->lower  = 1;  c->upper = REPEAT_INFINITE;  c->greedy = 0;
2236     return ;
2237     break;
2238   case RQ_ASIS:
2239     p->target = cnode;
2240     return ;
2241     break;
2242   }
2243 
2244   c->target = NULL_NODE;
2245   onig_node_free(cnode);
2246 }
2247 
2248 
/* Token kinds stored in OnigToken.type. */
enum TokenSyms {
  TK_EOT                = 0,   /* end of token */
  TK_RAW_BYTE           = 1,
  TK_CHAR               = 2,
  TK_STRING             = 3,
  TK_CODE_POINT         = 4,
  TK_ANYCHAR            = 5,
  TK_CHAR_TYPE          = 6,
  TK_BACKREF            = 7,
  TK_CALL               = 8,
  TK_ANCHOR             = 9,
  TK_OP_REPEAT          = 10,
  TK_INTERVAL           = 11,
  TK_ANYCHAR_ANYTIME    = 12,  /* SQL '%' == .* */
  TK_ALT                = 13,
  TK_SUBEXP_OPEN        = 14,
  TK_SUBEXP_CLOSE       = 15,
  TK_CC_OPEN            = 16,
  TK_QUOTE_OPEN         = 17,
  TK_CHAR_PROPERTY      = 18,  /* \p{...}, \P{...} */
  /* in cc */
  TK_CC_CLOSE           = 19,
  TK_CC_RANGE           = 20,
  TK_POSIX_BRACKET_OPEN = 21,
  TK_CC_AND             = 22,  /* && */
  TK_CC_CC_OPEN         = 23   /* [ */
};
2276 
2277 typedef struct {
2278   enum TokenSyms type;
2279   int escaped;
2280   int base;   /* is number: 8, 16 (used in [....]) */
2281   UChar* backp;
2282   union {
2283     UChar* s;
2284     int   c;
2285     OnigCodePoint code;
2286     int   anchor;
2287     int   subtype;
2288     struct {
2289       int lower;
2290       int upper;
2291       int greedy;
2292       int possessive;
2293     } repeat;
2294     struct {
2295       int  num;
2296       int  ref1;
2297       int* refs;
2298       int  by_name;
2299 #ifdef USE_BACKREF_WITH_LEVEL
2300       int  exist_level;
2301       int  level;   /* \k<name+n> */
2302 #endif
2303     } backref;
2304     struct {
2305       UChar* name;
2306       UChar* name_end;
2307       int    gnum;
2308     } call;
2309     struct {
2310       int ctype;
2311       int not;
2312     } prop;
2313   } u;
2314 } OnigToken;
2315 
2316 
2317 static int
fetch_range_quantifier(UChar ** src,UChar * end,OnigToken * tok,ScanEnv * env)2318 fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env)
2319 {
2320   int low, up, syn_allow, non_low = 0;
2321   int r = 0;
2322   OnigCodePoint c;
2323   OnigEncoding enc = env->enc;
2324   UChar* p = *src;
2325   PFETCH_READY;
2326 
2327   syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL);
2328 
2329   if (PEND) {
2330     if (syn_allow)
2331       return 1;  /* "....{" : OK! */
2332     else
2333       return ONIGERR_END_PATTERN_AT_LEFT_BRACE;  /* "....{" syntax error */
2334   }
2335 
2336   if (! syn_allow) {
2337     c = PPEEK;
2338     if (c == ')' || c == '(' || c == '|') {
2339       return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
2340     }
2341   }
2342 
2343   low = onig_scan_unsigned_number(&p, end, env->enc);
2344   if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2345   if (low > ONIG_MAX_REPEAT_NUM)
2346     return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2347 
2348   if (p == *src) { /* can't read low */
2349     if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) {
2350       /* allow {,n} as {0,n} */
2351       low = 0;
2352       non_low = 1;
2353     }
2354     else
2355       goto invalid;
2356   }
2357 
2358   if (PEND) goto invalid;
2359   PFETCH(c);
2360   if (c == ',') {
2361     UChar* prev = p;
2362     up = onig_scan_unsigned_number(&p, end, env->enc);
2363     if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2364     if (up > ONIG_MAX_REPEAT_NUM)
2365       return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2366 
2367     if (p == prev) {
2368       if (non_low != 0)
2369 	goto invalid;
2370       up = REPEAT_INFINITE;  /* {n,} : {n,infinite} */
2371     }
2372   }
2373   else {
2374     if (non_low != 0)
2375       goto invalid;
2376 
2377     PUNFETCH;
2378     up = low;  /* {n} : exact n times */
2379     r = 2;     /* fixed */
2380   }
2381 
2382   if (PEND) goto invalid;
2383   PFETCH(c);
2384   if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) {
2385     if (c != MC_ESC(env->syntax)) goto invalid;
2386     PFETCH(c);
2387   }
2388   if (c != '}') goto invalid;
2389 
2390   if (!IS_REPEAT_INFINITE(up) && low > up) {
2391     return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE;
2392   }
2393 
2394   tok->type = TK_INTERVAL;
2395   tok->u.repeat.lower = low;
2396   tok->u.repeat.upper = up;
2397   *src = p;
2398   return r; /* 0: normal {n,m}, 2: fixed {n} */
2399 
2400  invalid:
2401   if (syn_allow)
2402     return 1;  /* OK */
2403   else
2404     return ONIGERR_INVALID_REPEAT_RANGE_PATTERN;
2405 }
2406 
2407 /* \M-, \C-, \c, or \... */
2408 static int
fetch_escaped_value(UChar ** src,UChar * end,ScanEnv * env)2409 fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env)
2410 {
2411   int v;
2412   OnigCodePoint c;
2413   OnigEncoding enc = env->enc;
2414   UChar* p = *src;
2415 
2416   if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2417 
2418   PFETCH_S(c);
2419   switch (c) {
2420   case 'M':
2421     if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) {
2422       if (PEND) return ONIGERR_END_PATTERN_AT_META;
2423       PFETCH_S(c);
2424       if (c != '-') return ONIGERR_META_CODE_SYNTAX;
2425       if (PEND) return ONIGERR_END_PATTERN_AT_META;
2426       PFETCH_S(c);
2427       if (c == MC_ESC(env->syntax)) {
2428         v = fetch_escaped_value(&p, end, env);
2429         if (v < 0) return v;
2430         c = (OnigCodePoint )v;
2431       }
2432       c = ((c & 0xff) | 0x80);
2433     }
2434     else
2435       goto backslash;
2436     break;
2437 
2438   case 'C':
2439     if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) {
2440       if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
2441       PFETCH_S(c);
2442       if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX;
2443       goto control;
2444     }
2445     else
2446       goto backslash;
2447 
2448   case 'c':
2449     if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) {
2450     control:
2451       if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
2452       PFETCH_S(c);
2453       if (c == '?') {
2454         c = 0177;
2455       }
2456       else {
2457         if (c == MC_ESC(env->syntax)) {
2458           v = fetch_escaped_value(&p, end, env);
2459           if (v < 0) return v;
2460           c = (OnigCodePoint )v;
2461         }
2462         c &= 0x9f;
2463       }
2464       break;
2465     }
2466     /* fall through */
2467 
2468   default:
2469     {
2470     backslash:
2471       c = conv_backslash_value(c, env);
2472     }
2473     break;
2474   }
2475 
2476   *src = p;
2477   return c;
2478 }
2479 
2480 static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env);
2481 
2482 static OnigCodePoint
get_name_end_code_point(OnigCodePoint start)2483 get_name_end_code_point(OnigCodePoint start)
2484 {
2485   switch (start) {
2486   case '<':  return (OnigCodePoint )'>'; break;
2487   case '\'': return (OnigCodePoint )'\''; break;
2488   default:
2489     break;
2490   }
2491 
2492   return (OnigCodePoint )0;
2493 }
2494 
2495 #ifdef USE_NAMED_GROUP
2496 #ifdef USE_BACKREF_WITH_LEVEL
2497 /*
2498    \k<name+n>, \k<name-n>
2499    \k<num+n>,  \k<num-n>
2500    \k<-num+n>, \k<-num-n>
2501 */
2502 static int
fetch_name_with_level(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,int * rlevel)2503 fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
2504 		      UChar** rname_end, ScanEnv* env,
2505 		      int* rback_num, int* rlevel)
2506 {
2507   int r, sign, is_num, exist_level;
2508   OnigCodePoint end_code;
2509   OnigCodePoint c = 0;
2510   OnigEncoding enc = env->enc;
2511   UChar *name_end;
2512   UChar *pnum_head;
2513   UChar *p = *src;
2514   PFETCH_READY;
2515 
2516   *rback_num = 0;
2517   is_num = exist_level = 0;
2518   sign = 1;
2519   pnum_head = *src;
2520 
2521   end_code = get_name_end_code_point(start_code);
2522 
2523   name_end = end;
2524   r = 0;
2525   if (PEND) {
2526     return ONIGERR_EMPTY_GROUP_NAME;
2527   }
2528   else {
2529     PFETCH(c);
2530     if (c == end_code)
2531       return ONIGERR_EMPTY_GROUP_NAME;
2532 
2533     if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2534       is_num = 1;
2535     }
2536     else if (c == '-') {
2537       is_num = 2;
2538       sign = -1;
2539       pnum_head = p;
2540     }
2541     else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2542       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2543     }
2544   }
2545 
2546   while (!PEND) {
2547     name_end = p;
2548     PFETCH(c);
2549     if (c == end_code || c == ')' || c == '+' || c == '-') {
2550       if (is_num == 2) 	r = ONIGERR_INVALID_GROUP_NAME;
2551       break;
2552     }
2553 
2554     if (is_num != 0) {
2555       if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2556         is_num = 1;
2557       }
2558       else {
2559         r = ONIGERR_INVALID_GROUP_NAME;
2560         is_num = 0;
2561       }
2562     }
2563     else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2564       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2565     }
2566   }
2567 
2568   if (r == 0 && c != end_code) {
2569     if (c == '+' || c == '-') {
2570       int level;
2571       int flag = (c == '-' ? -1 : 1);
2572 
2573       PFETCH(c);
2574       if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto err;
2575       PUNFETCH;
2576       level = onig_scan_unsigned_number(&p, end, enc);
2577       if (level < 0) return ONIGERR_TOO_BIG_NUMBER;
2578       *rlevel = (level * flag);
2579       exist_level = 1;
2580 
2581       PFETCH(c);
2582       if (c == end_code)
2583 	goto end;
2584     }
2585 
2586   err:
2587     r = ONIGERR_INVALID_GROUP_NAME;
2588     name_end = end;
2589   }
2590 
2591  end:
2592   if (r == 0) {
2593     if (is_num != 0) {
2594       *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2595       if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2596       else if (*rback_num == 0) goto err;
2597 
2598       *rback_num *= sign;
2599     }
2600 
2601     *rname_end = name_end;
2602     *src = p;
2603     return (exist_level ? 1 : 0);
2604   }
2605   else {
2606     onig_scan_env_set_error_string(env, r, *src, name_end);
2607     return r;
2608   }
2609 }
2610 #endif /* USE_BACKREF_WITH_LEVEL */
2611 
2612 /*
2613   def: 0 -> define name    (don't allow number name)
2614        1 -> reference name (allow number name)
2615 */
2616 static int
fetch_name(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,int ref)2617 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
2618 	   UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
2619 {
2620   int r, is_num, sign;
2621   OnigCodePoint end_code;
2622   OnigCodePoint c = 0;
2623   OnigEncoding enc = env->enc;
2624   UChar *name_end;
2625   UChar *pnum_head;
2626   UChar *p = *src;
2627 
2628   *rback_num = 0;
2629 
2630   end_code = get_name_end_code_point(start_code);
2631 
2632   name_end = end;
2633   pnum_head = *src;
2634   r = 0;
2635   is_num = 0;
2636   sign = 1;
2637   if (PEND) {
2638     return ONIGERR_EMPTY_GROUP_NAME;
2639   }
2640   else {
2641     PFETCH_S(c);
2642     if (c == end_code)
2643       return ONIGERR_EMPTY_GROUP_NAME;
2644 
2645     if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2646       if (ref == 1)
2647         is_num = 1;
2648       else {
2649         r = ONIGERR_INVALID_GROUP_NAME;
2650         is_num = 0;
2651       }
2652     }
2653     else if (c == '-') {
2654       if (ref == 1) {
2655         is_num = 2;
2656         sign = -1;
2657         pnum_head = p;
2658       }
2659       else {
2660         r = ONIGERR_INVALID_GROUP_NAME;
2661         is_num = 0;
2662       }
2663     }
2664     else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2665       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2666     }
2667   }
2668 
2669   if (r == 0) {
2670     while (!PEND) {
2671       name_end = p;
2672       PFETCH_S(c);
2673       if (c == end_code || c == ')') {
2674         if (is_num == 2) 	r = ONIGERR_INVALID_GROUP_NAME;
2675         break;
2676       }
2677 
2678       if (is_num != 0) {
2679         if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2680           is_num = 1;
2681         }
2682         else {
2683           if (!ONIGENC_IS_CODE_WORD(enc, c))
2684             r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2685           else
2686             r = ONIGERR_INVALID_GROUP_NAME;
2687           is_num = 0;
2688         }
2689       }
2690       else {
2691         if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2692           r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2693         }
2694       }
2695     }
2696 
2697     if (c != end_code) {
2698       r = ONIGERR_INVALID_GROUP_NAME;
2699       name_end = end;
2700     }
2701 
2702     if (is_num != 0) {
2703       *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2704       if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2705       else if (*rback_num == 0) {
2706         r = ONIGERR_INVALID_GROUP_NAME;
2707         goto err;
2708       }
2709 
2710       *rback_num *= sign;
2711     }
2712 
2713     *rname_end = name_end;
2714     *src = p;
2715     return 0;
2716   }
2717   else {
2718     while (!PEND) {
2719       name_end = p;
2720       PFETCH_S(c);
2721       if (c == end_code || c == ')')
2722         break;
2723     }
2724     if (PEND)
2725       name_end = end;
2726 
2727   err:
2728     onig_scan_env_set_error_string(env, r, *src, name_end);
2729     return r;
2730   }
2731 }
2732 #else
2733 static int
fetch_name(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,int ref)2734 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
2735 	   UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
2736 {
2737   int r, is_num, sign;
2738   OnigCodePoint end_code;
2739   OnigCodePoint c = 0;
2740   UChar *name_end;
2741   OnigEncoding enc = env->enc;
2742   UChar *pnum_head;
2743   UChar *p = *src;
2744   PFETCH_READY;
2745 
2746   *rback_num = 0;
2747 
2748   end_code = get_name_end_code_point(start_code);
2749 
2750   *rname_end = name_end = end;
2751   r = 0;
2752   pnum_head = *src;
2753   is_num = 0;
2754   sign = 1;
2755 
2756   if (PEND) {
2757     return ONIGERR_EMPTY_GROUP_NAME;
2758   }
2759   else {
2760     PFETCH(c);
2761     if (c == end_code)
2762       return ONIGERR_EMPTY_GROUP_NAME;
2763 
2764     if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2765       is_num = 1;
2766     }
2767     else if (c == '-') {
2768       is_num = 2;
2769       sign = -1;
2770       pnum_head = p;
2771     }
2772     else {
2773       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2774     }
2775   }
2776 
2777   while (!PEND) {
2778     name_end = p;
2779 
2780     PFETCH(c);
2781     if (c == end_code || c == ')') break;
2782     if (! ONIGENC_IS_CODE_DIGIT(enc, c))
2783       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2784   }
2785   if (r == 0 && c != end_code) {
2786     r = ONIGERR_INVALID_GROUP_NAME;
2787     name_end = end;
2788   }
2789 
2790   if (r == 0) {
2791     *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2792     if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2793     else if (*rback_num == 0) {
2794       r = ONIGERR_INVALID_GROUP_NAME;
2795       goto err;
2796     }
2797     *rback_num *= sign;
2798 
2799     *rname_end = name_end;
2800     *src = p;
2801     return 0;
2802   }
2803   else {
2804   err:
2805     onig_scan_env_set_error_string(env, r, *src, name_end);
2806     return r;
2807   }
2808 }
2809 #endif /* USE_NAMED_GROUP */
2810 
2811 static void
CC_ESC_WARN(ScanEnv * env,UChar * c)2812 CC_ESC_WARN(ScanEnv* env, UChar *c)
2813 {
2814   if (onig_warn == onig_null_warn) return ;
2815 
2816   if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) &&
2817       IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) {
2818     UChar buf[WARN_BUFSIZE];
2819     onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
2820 		env->pattern, env->pattern_end,
2821                 (UChar* )"character class has '%s' without escape", c);
2822     (*onig_warn)((char* )buf);
2823   }
2824 }
2825 
2826 static void
CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv * env,UChar * c)2827 CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c)
2828 {
2829   if (onig_warn == onig_null_warn) return ;
2830 
2831   if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) {
2832     UChar buf[WARN_BUFSIZE];
2833     onig_snprintf_with_pattern(buf, WARN_BUFSIZE, (env)->enc,
2834 		(env)->pattern, (env)->pattern_end,
2835 		(UChar* )"regular expression has '%s' without escape", c);
2836     (*onig_warn)((char* )buf);
2837   }
2838 }
2839 
2840 static UChar*
find_str_position(OnigCodePoint s[],int n,UChar * from,UChar * to,UChar ** next,OnigEncoding enc)2841 find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to,
2842 		  UChar **next, OnigEncoding enc)
2843 {
2844   int i;
2845   OnigCodePoint x;
2846   UChar *q;
2847   UChar *p = from;
2848 
2849   while (p < to) {
2850     x = ONIGENC_MBC_TO_CODE(enc, p, to);
2851     q = p + enclen(enc, p);
2852     if (x == s[0]) {
2853       for (i = 1; i < n && q < to; i++) {
2854 	x = ONIGENC_MBC_TO_CODE(enc, q, to);
2855 	if (x != s[i]) break;
2856 	q += enclen(enc, q);
2857       }
2858       if (i >= n) {
2859 	if (IS_NOT_NULL(next))
2860 	  *next = q;
2861 	return p;
2862       }
2863     }
2864     p = q;
2865   }
2866   return NULL_UCHARP;
2867 }
2868 
2869 static int
str_exist_check_with_esc(OnigCodePoint s[],int n,UChar * from,UChar * to,OnigCodePoint bad,OnigEncoding enc,OnigSyntaxType * syn)2870 str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to,
2871 		 OnigCodePoint bad, OnigEncoding enc, OnigSyntaxType* syn)
2872 {
2873   int i, in_esc;
2874   OnigCodePoint x;
2875   UChar *q;
2876   UChar *p = from;
2877 
2878   in_esc = 0;
2879   while (p < to) {
2880     if (in_esc) {
2881       in_esc = 0;
2882       p += enclen(enc, p);
2883     }
2884     else {
2885       x = ONIGENC_MBC_TO_CODE(enc, p, to);
2886       q = p + enclen(enc, p);
2887       if (x == s[0]) {
2888 	for (i = 1; i < n && q < to; i++) {
2889 	  x = ONIGENC_MBC_TO_CODE(enc, q, to);
2890 	  if (x != s[i]) break;
2891 	  q += enclen(enc, q);
2892 	}
2893 	if (i >= n) return 1;
2894 	p += enclen(enc, p);
2895       }
2896       else {
2897 	x = ONIGENC_MBC_TO_CODE(enc, p, to);
2898 	if (x == bad) return 0;
2899 	else if (x == MC_ESC(syn)) in_esc = 1;
2900 	p = q;
2901       }
2902     }
2903   }
2904   return 0;
2905 }
2906 
2907 static int
fetch_token_in_cc(OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)2908 fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
2909 {
2910   int num;
2911   OnigCodePoint c, c2;
2912   OnigSyntaxType* syn = env->syntax;
2913   OnigEncoding enc = env->enc;
2914   UChar* prev;
2915   UChar* p = *src;
2916   PFETCH_READY;
2917 
2918   if (PEND) {
2919     tok->type = TK_EOT;
2920     return tok->type;
2921   }
2922 
2923   PFETCH(c);
2924   tok->type = TK_CHAR;
2925   tok->base = 0;
2926   tok->u.c  = c;
2927   tok->escaped = 0;
2928 
2929   if (c == ']') {
2930     tok->type = TK_CC_CLOSE;
2931   }
2932   else if (c == '-') {
2933     tok->type = TK_CC_RANGE;
2934   }
2935   else if (c == MC_ESC(syn)) {
2936     if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC))
2937       goto end;
2938 
2939     if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2940 
2941     PFETCH(c);
2942     tok->escaped = 1;
2943     tok->u.c = c;
2944     switch (c) {
2945     case 'w':
2946       tok->type = TK_CHAR_TYPE;
2947       tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
2948       tok->u.prop.not   = 0;
2949       break;
2950     case 'W':
2951       tok->type = TK_CHAR_TYPE;
2952       tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
2953       tok->u.prop.not   = 1;
2954       break;
2955     case 'd':
2956       tok->type = TK_CHAR_TYPE;
2957       tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
2958       tok->u.prop.not   = 0;
2959       break;
2960     case 'D':
2961       tok->type = TK_CHAR_TYPE;
2962       tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
2963       tok->u.prop.not   = 1;
2964       break;
2965     case 's':
2966       tok->type = TK_CHAR_TYPE;
2967       tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
2968       tok->u.prop.not   = 0;
2969       break;
2970     case 'S':
2971       tok->type = TK_CHAR_TYPE;
2972       tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
2973       tok->u.prop.not   = 1;
2974       break;
2975     case 'h':
2976       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
2977       tok->type = TK_CHAR_TYPE;
2978       tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
2979       tok->u.prop.not   = 0;
2980       break;
2981     case 'H':
2982       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
2983       tok->type = TK_CHAR_TYPE;
2984       tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
2985       tok->u.prop.not   = 1;
2986       break;
2987 
2988     case 'p':
2989     case 'P':
2990       c2 = PPEEK;
2991       if (c2 == '{' &&
2992 	  IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
2993 	PINC;
2994 	tok->type = TK_CHAR_PROPERTY;
2995 	tok->u.prop.not = (c == 'P' ? 1 : 0);
2996 
2997 	if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
2998 	  PFETCH(c2);
2999 	  if (c2 == '^') {
3000 	    tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
3001 	  }
3002 	  else
3003 	    PUNFETCH;
3004 	}
3005       }
3006       break;
3007 
3008     case 'x':
3009       if (PEND) break;
3010 
3011       prev = p;
3012       if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3013 	PINC;
3014 	num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
3015 	if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3016 	if (!PEND) {
3017           c2 = PPEEK;
3018           if (ONIGENC_IS_CODE_XDIGIT(enc, c2))
3019             return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3020         }
3021 
3022 	if (p > prev + enclen(enc, prev) && !PEND && (PPEEK_IS('}'))) {
3023 	  PINC;
3024 	  tok->type   = TK_CODE_POINT;
3025 	  tok->base   = 16;
3026 	  tok->u.code = (OnigCodePoint )num;
3027 	}
3028 	else {
3029 	  /* can't read nothing or invalid format */
3030 	  p = prev;
3031 	}
3032       }
3033       else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3034 	num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
3035 	if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3036 	if (p == prev) {  /* can't read nothing. */
3037 	  num = 0; /* but, it's not error */
3038 	}
3039 	tok->type = TK_RAW_BYTE;
3040 	tok->base = 16;
3041 	tok->u.c  = num;
3042       }
3043       break;
3044 
3045     case 'u':
3046       if (PEND) break;
3047 
3048       prev = p;
3049       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3050 	num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
3051 	if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3052 	if (p == prev) {  /* can't read nothing. */
3053 	  num = 0; /* but, it's not error */
3054 	}
3055 	tok->type   = TK_CODE_POINT;
3056 	tok->base   = 16;
3057 	tok->u.code = (OnigCodePoint )num;
3058       }
3059       break;
3060 
3061     case '0':
3062     case '1': case '2': case '3': case '4': case '5': case '6': case '7':
3063       if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
3064 	PUNFETCH;
3065 	prev = p;
3066 	num = scan_unsigned_octal_number(&p, end, 3, enc);
3067 	if (num < 0 || num >= 256) return ONIGERR_TOO_BIG_NUMBER;
3068 	if (p == prev) {  /* can't read nothing. */
3069 	  num = 0; /* but, it's not error */
3070 	}
3071 	tok->type = TK_RAW_BYTE;
3072 	tok->base = 8;
3073 	tok->u.c  = num;
3074       }
3075       break;
3076 
3077     default:
3078       PUNFETCH;
3079       num = fetch_escaped_value(&p, end, env);
3080       if (num < 0) return num;
3081       if (tok->u.c != num) {
3082 	tok->u.code = (OnigCodePoint )num;
3083 	tok->type   = TK_CODE_POINT;
3084       }
3085       break;
3086     }
3087   }
3088   else if (c == '[') {
3089     if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) {
3090       OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' };
3091       tok->backp = p; /* point at '[' is readed */
3092       PINC;
3093       if (str_exist_check_with_esc(send, 2, p, end,
3094                                    (OnigCodePoint )']', enc, syn)) {
3095 	tok->type = TK_POSIX_BRACKET_OPEN;
3096       }
3097       else {
3098 	PUNFETCH;
3099 	goto cc_in_cc;
3100       }
3101     }
3102     else {
3103     cc_in_cc:
3104       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) {
3105 	tok->type = TK_CC_CC_OPEN;
3106       }
3107       else {
3108 	CC_ESC_WARN(env, (UChar* )"[");
3109       }
3110     }
3111   }
3112   else if (c == '&') {
3113     if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) &&
3114 	!PEND && (PPEEK_IS('&'))) {
3115       PINC;
3116       tok->type = TK_CC_AND;
3117     }
3118   }
3119 
3120  end:
3121   *src = p;
3122   return tok->type;
3123 }
3124 
3125 static int
fetch_token(OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)3126 fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
3127 {
3128   int r, num;
3129   OnigCodePoint c;
3130   OnigEncoding enc = env->enc;
3131   OnigSyntaxType* syn = env->syntax;
3132   UChar* prev;
3133   UChar* p = *src;
3134   PFETCH_READY;
3135 
3136  start:
3137   if (PEND) {
3138     tok->type = TK_EOT;
3139     return tok->type;
3140   }
3141 
3142   tok->type  = TK_STRING;
3143   tok->base  = 0;
3144   tok->backp = p;
3145 
3146   PFETCH(c);
3147   if (IS_MC_ESC_CODE(c, syn)) {
3148     if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
3149 
3150     tok->backp = p;
3151     PFETCH(c);
3152 
3153     tok->u.c = c;
3154     tok->escaped = 1;
3155     switch (c) {
3156     case '*':
3157       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break;
3158       tok->type = TK_OP_REPEAT;
3159       tok->u.repeat.lower = 0;
3160       tok->u.repeat.upper = REPEAT_INFINITE;
3161       goto greedy_check;
3162       break;
3163 
3164     case '+':
3165       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break;
3166       tok->type = TK_OP_REPEAT;
3167       tok->u.repeat.lower = 1;
3168       tok->u.repeat.upper = REPEAT_INFINITE;
3169       goto greedy_check;
3170       break;
3171 
3172     case '?':
3173       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break;
3174       tok->type = TK_OP_REPEAT;
3175       tok->u.repeat.lower = 0;
3176       tok->u.repeat.upper = 1;
3177     greedy_check:
3178       if (!PEND && PPEEK_IS('?') &&
3179 	  IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) {
3180 	PFETCH(c);
3181 	tok->u.repeat.greedy     = 0;
3182 	tok->u.repeat.possessive = 0;
3183       }
3184       else {
3185       possessive_check:
3186 	if (!PEND && PPEEK_IS('+') &&
3187 	    ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) &&
3188 	      tok->type != TK_INTERVAL)  ||
3189 	     (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) &&
3190 	      tok->type == TK_INTERVAL))) {
3191 	  PFETCH(c);
3192 	  tok->u.repeat.greedy     = 1;
3193 	  tok->u.repeat.possessive = 1;
3194 	}
3195 	else {
3196 	  tok->u.repeat.greedy     = 1;
3197 	  tok->u.repeat.possessive = 0;
3198 	}
3199       }
3200       break;
3201 
3202     case '{':
3203       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break;
3204       r = fetch_range_quantifier(&p, end, tok, env);
3205       if (r < 0) return r;  /* error */
3206       if (r == 0) goto greedy_check;
3207       else if (r == 2) { /* {n} */
3208 	if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3209 	  goto possessive_check;
3210 
3211 	goto greedy_check;
3212       }
3213       /* r == 1 : normal char */
3214       break;
3215 
3216     case '|':
3217       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break;
3218       tok->type = TK_ALT;
3219       break;
3220 
3221     case '(':
3222       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3223       tok->type = TK_SUBEXP_OPEN;
3224       break;
3225 
3226     case ')':
3227       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3228       tok->type = TK_SUBEXP_CLOSE;
3229       break;
3230 
3231     case 'w':
3232       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3233       tok->type = TK_CHAR_TYPE;
3234       tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3235       tok->u.prop.not   = 0;
3236       break;
3237 
3238     case 'W':
3239       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3240       tok->type = TK_CHAR_TYPE;
3241       tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3242       tok->u.prop.not   = 1;
3243       break;
3244 
3245     case 'b':
3246       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3247       tok->type = TK_ANCHOR;
3248       tok->u.anchor = ANCHOR_WORD_BOUND;
3249       break;
3250 
3251     case 'B':
3252       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3253       tok->type = TK_ANCHOR;
3254       tok->u.anchor = ANCHOR_NOT_WORD_BOUND;
3255       break;
3256 
3257 #ifdef USE_WORD_BEGIN_END
3258     case '<':
3259       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3260       tok->type = TK_ANCHOR;
3261       tok->u.anchor = ANCHOR_WORD_BEGIN;
3262       break;
3263 
3264     case '>':
3265       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3266       tok->type = TK_ANCHOR;
3267       tok->u.anchor = ANCHOR_WORD_END;
3268       break;
3269 #endif
3270 
3271     case 's':
3272       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3273       tok->type = TK_CHAR_TYPE;
3274       tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3275       tok->u.prop.not   = 0;
3276       break;
3277 
3278     case 'S':
3279       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3280       tok->type = TK_CHAR_TYPE;
3281       tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3282       tok->u.prop.not   = 1;
3283       break;
3284 
3285     case 'd':
3286       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3287       tok->type = TK_CHAR_TYPE;
3288       tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3289       tok->u.prop.not   = 0;
3290       break;
3291 
3292     case 'D':
3293       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3294       tok->type = TK_CHAR_TYPE;
3295       tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3296       tok->u.prop.not   = 1;
3297       break;
3298 
3299     case 'h':
3300       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3301       tok->type = TK_CHAR_TYPE;
3302       tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3303       tok->u.prop.not   = 0;
3304       break;
3305 
3306     case 'H':
3307       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3308       tok->type = TK_CHAR_TYPE;
3309       tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3310       tok->u.prop.not   = 1;
3311       break;
3312 
3313     case 'A':
3314       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3315     begin_buf:
3316       tok->type = TK_ANCHOR;
3317       tok->u.subtype = ANCHOR_BEGIN_BUF;
3318       break;
3319 
3320     case 'Z':
3321       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3322       tok->type = TK_ANCHOR;
3323       tok->u.subtype = ANCHOR_SEMI_END_BUF;
3324       break;
3325 
3326     case 'z':
3327       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3328     end_buf:
3329       tok->type = TK_ANCHOR;
3330       tok->u.subtype = ANCHOR_END_BUF;
3331       break;
3332 
3333     case 'G':
3334       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break;
3335       tok->type = TK_ANCHOR;
3336       tok->u.subtype = ANCHOR_BEGIN_POSITION;
3337       break;
3338 
3339     case '`':
3340       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3341       goto begin_buf;
3342       break;
3343 
3344     case '\'':
3345       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3346       goto end_buf;
3347       break;
3348 
3349     case 'x':
3350       if (PEND) break;
3351 
3352       prev = p;
3353       if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3354 	PINC;
3355 	num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
3356 	if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3357 	if (!PEND) {
3358           if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK))
3359             return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3360         }
3361 
3362 	if ((p > prev + enclen(enc, prev)) && !PEND && PPEEK_IS('}')) {
3363 	  PINC;
3364 	  tok->type   = TK_CODE_POINT;
3365 	  tok->u.code = (OnigCodePoint )num;
3366 	}
3367 	else {
3368 	  /* can't read nothing or invalid format */
3369 	  p = prev;
3370 	}
3371       }
3372       else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3373 	num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
3374 	if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3375 	if (p == prev) {  /* can't read nothing. */
3376 	  num = 0; /* but, it's not error */
3377 	}
3378 	tok->type = TK_RAW_BYTE;
3379 	tok->base = 16;
3380 	tok->u.c  = num;
3381       }
3382       break;
3383 
3384     case 'u':
3385       if (PEND) break;
3386 
3387       prev = p;
3388       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3389 	num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
3390 	if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3391 	if (p == prev) {  /* can't read nothing. */
3392 	  num = 0; /* but, it's not error */
3393 	}
3394 	tok->type   = TK_CODE_POINT;
3395 	tok->base   = 16;
3396 	tok->u.code = (OnigCodePoint )num;
3397       }
3398       break;
3399 
3400     case '1': case '2': case '3': case '4':
3401     case '5': case '6': case '7': case '8': case '9':
3402       PUNFETCH;
3403       prev = p;
3404       num = onig_scan_unsigned_number(&p, end, enc);
3405       if (num < 0 || num > ONIG_MAX_BACKREF_NUM) {
3406         goto skip_backref;
3407       }
3408 
3409       if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) &&
3410 	  (num <= env->num_mem || num <= 9)) { /* This spec. from GNU regex */
3411 	if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3412 	  if (num > env->num_mem || IS_NULL(SCANENV_MEM_NODES(env)[num]))
3413 	    return ONIGERR_INVALID_BACKREF;
3414 	}
3415 
3416 	tok->type = TK_BACKREF;
3417 	tok->u.backref.num     = 1;
3418 	tok->u.backref.ref1    = num;
3419 	tok->u.backref.by_name = 0;
3420 #ifdef USE_BACKREF_WITH_LEVEL
3421 	tok->u.backref.exist_level = 0;
3422 #endif
3423 	break;
3424       }
3425 
3426     skip_backref:
3427       if (c == '8' || c == '9') {
3428 	/* normal char */
3429 	p = prev; PINC;
3430 	break;
3431       }
3432 
3433       p = prev;
3434       /* fall through */
3435     case '0':
3436       if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
3437 	prev = p;
3438 	num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc);
3439 	if (num < 0 || num >= 256) return ONIGERR_TOO_BIG_NUMBER;
3440 	if (p == prev) {  /* can't read nothing. */
3441 	  num = 0; /* but, it's not error */
3442 	}
3443 	tok->type = TK_RAW_BYTE;
3444 	tok->base = 8;
3445 	tok->u.c  = num;
3446       }
3447       else if (c != '0') {
3448 	PINC;
3449       }
3450       break;
3451 
3452 #ifdef USE_NAMED_GROUP
3453     case 'k':
3454       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) {
3455 	PFETCH(c);
3456 	if (c == '<' || c == '\'') {
3457 	  UChar* name_end;
3458 	  int* backs;
3459 	  int back_num;
3460 
3461 	  prev = p;
3462 
3463 #ifdef USE_BACKREF_WITH_LEVEL
3464 	  name_end = NULL_UCHARP; /* no need. escape gcc warning. */
3465 	  r = fetch_name_with_level((OnigCodePoint )c, &p, end, &name_end,
3466 				    env, &back_num, &tok->u.backref.level);
3467 	  if (r == 1) tok->u.backref.exist_level = 1;
3468 	  else        tok->u.backref.exist_level = 0;
3469 #else
3470 	  r = fetch_name(&p, end, &name_end, env, &back_num, 1);
3471 #endif
3472 	  if (r < 0) return r;
3473 
3474 	  if (back_num != 0) {
3475 	    if (back_num < 0) {
3476 	      back_num = BACKREF_REL_TO_ABS(back_num, env);
3477 	      if (back_num <= 0)
3478 		return ONIGERR_INVALID_BACKREF;
3479 	    }
3480 
3481 	    if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3482 	      if (back_num > env->num_mem ||
3483 		  IS_NULL(SCANENV_MEM_NODES(env)[back_num]))
3484 		return ONIGERR_INVALID_BACKREF;
3485 	    }
3486 	    tok->type = TK_BACKREF;
3487 	    tok->u.backref.by_name = 0;
3488 	    tok->u.backref.num  = 1;
3489 	    tok->u.backref.ref1 = back_num;
3490 	  }
3491 	  else {
3492 	    num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs);
3493 	    if (num <= 0) {
3494 	      onig_scan_env_set_error_string(env,
3495 			     ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end);
3496 	      return ONIGERR_UNDEFINED_NAME_REFERENCE;
3497 	    }
3498 	    if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3499 	      int i;
3500 	      for (i = 0; i < num; i++) {
3501 		if (backs[i] > env->num_mem ||
3502 		    IS_NULL(SCANENV_MEM_NODES(env)[backs[i]]))
3503 		  return ONIGERR_INVALID_BACKREF;
3504 	      }
3505 	    }
3506 
3507 	    tok->type = TK_BACKREF;
3508 	    tok->u.backref.by_name = 1;
3509 	    if (num == 1) {
3510 	      tok->u.backref.num  = 1;
3511 	      tok->u.backref.ref1 = backs[0];
3512 	    }
3513 	    else {
3514 	      tok->u.backref.num  = num;
3515 	      tok->u.backref.refs = backs;
3516 	    }
3517 	  }
3518 	}
3519 	else
3520 	  PUNFETCH;
3521       }
3522       break;
3523 #endif
3524 
3525 #ifdef USE_SUBEXP_CALL
3526     case 'g':
3527       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) {
3528 	PFETCH(c);
3529 	if (c == '<' || c == '\'') {
3530 	  int gnum;
3531 	  UChar* name_end;
3532 
3533 	  prev = p;
3534 	  r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &gnum, 1);
3535 	  if (r < 0) return r;
3536 
3537 	  tok->type = TK_CALL;
3538 	  tok->u.call.name     = prev;
3539 	  tok->u.call.name_end = name_end;
3540 	  tok->u.call.gnum     = gnum;
3541 	}
3542 	else
3543 	  PUNFETCH;
3544       }
3545       break;
3546 #endif
3547 
3548     case 'Q':
3549       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) {
3550 	tok->type = TK_QUOTE_OPEN;
3551       }
3552       break;
3553 
3554     case 'p':
3555     case 'P':
3556       if (PPEEK_IS('{') &&
3557 	  IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
3558 	PINC;
3559 	tok->type = TK_CHAR_PROPERTY;
3560 	tok->u.prop.not = (c == 'P' ? 1 : 0);
3561 
3562 	if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
3563 	  PFETCH(c);
3564 	  if (c == '^') {
3565 	    tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
3566 	  }
3567 	  else
3568 	    PUNFETCH;
3569 	}
3570       }
3571       break;
3572 
3573     default:
3574       PUNFETCH;
3575       num = fetch_escaped_value(&p, end, env);
3576       if (num < 0) return num;
3577       /* set_raw: */
3578       if (tok->u.c != num) {
3579 	tok->type = TK_CODE_POINT;
3580 	tok->u.code = (OnigCodePoint )num;
3581       }
3582       else { /* string */
3583 	p = tok->backp + enclen(enc, tok->backp);
3584       }
3585       break;
3586     }
3587   }
3588   else {
3589     tok->u.c = c;
3590     tok->escaped = 0;
3591 
3592 #ifdef USE_VARIABLE_META_CHARS
3593     if ((c != ONIG_INEFFECTIVE_META_CHAR) &&
3594 	IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) {
3595       if (c == MC_ANYCHAR(syn))
3596 	goto any_char;
3597       else if (c == MC_ANYTIME(syn))
3598 	goto anytime;
3599       else if (c == MC_ZERO_OR_ONE_TIME(syn))
3600 	goto zero_or_one_time;
3601       else if (c == MC_ONE_OR_MORE_TIME(syn))
3602 	goto one_or_more_time;
3603       else if (c == MC_ANYCHAR_ANYTIME(syn)) {
3604 	tok->type = TK_ANYCHAR_ANYTIME;
3605 	goto out;
3606       }
3607     }
3608 #endif
3609 
3610     switch (c) {
3611     case '.':
3612       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break;
3613 #ifdef USE_VARIABLE_META_CHARS
3614     any_char:
3615 #endif
3616       tok->type = TK_ANYCHAR;
3617       break;
3618 
3619     case '*':
3620       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break;
3621 #ifdef USE_VARIABLE_META_CHARS
3622     anytime:
3623 #endif
3624       tok->type = TK_OP_REPEAT;
3625       tok->u.repeat.lower = 0;
3626       tok->u.repeat.upper = REPEAT_INFINITE;
3627       goto greedy_check;
3628       break;
3629 
3630     case '+':
3631       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break;
3632 #ifdef USE_VARIABLE_META_CHARS
3633     one_or_more_time:
3634 #endif
3635       tok->type = TK_OP_REPEAT;
3636       tok->u.repeat.lower = 1;
3637       tok->u.repeat.upper = REPEAT_INFINITE;
3638       goto greedy_check;
3639       break;
3640 
3641     case '?':
3642       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break;
3643 #ifdef USE_VARIABLE_META_CHARS
3644     zero_or_one_time:
3645 #endif
3646       tok->type = TK_OP_REPEAT;
3647       tok->u.repeat.lower = 0;
3648       tok->u.repeat.upper = 1;
3649       goto greedy_check;
3650       break;
3651 
3652     case '{':
3653       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break;
3654       r = fetch_range_quantifier(&p, end, tok, env);
3655       if (r < 0) return r;  /* error */
3656       if (r == 0) goto greedy_check;
3657       else if (r == 2) { /* {n} */
3658 	if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3659 	  goto possessive_check;
3660 
3661 	goto greedy_check;
3662       }
3663       /* r == 1 : normal char */
3664       break;
3665 
3666     case '|':
3667       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break;
3668       tok->type = TK_ALT;
3669       break;
3670 
3671     case '(':
3672       if (PPEEK_IS('?') &&
3673           IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
3674         PINC;
3675         if (PPEEK_IS('#')) {
3676           PFETCH(c);
3677           while (1) {
3678             if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
3679             PFETCH(c);
3680             if (c == MC_ESC(syn)) {
3681               if (!PEND) PFETCH(c);
3682             }
3683             else {
3684               if (c == ')') break;
3685             }
3686           }
3687           goto start;
3688         }
3689         PUNFETCH;
3690       }
3691 
3692       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3693       tok->type = TK_SUBEXP_OPEN;
3694       break;
3695 
3696     case ')':
3697       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3698       tok->type = TK_SUBEXP_CLOSE;
3699       break;
3700 
3701     case '^':
3702       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
3703       tok->type = TK_ANCHOR;
3704       tok->u.subtype = (IS_SINGLELINE(env->option)
3705 			? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE);
3706       break;
3707 
3708     case '$':
3709       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
3710       tok->type = TK_ANCHOR;
3711       tok->u.subtype = (IS_SINGLELINE(env->option)
3712 			? ANCHOR_SEMI_END_BUF : ANCHOR_END_LINE);
3713       break;
3714 
3715     case '[':
3716       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break;
3717       tok->type = TK_CC_OPEN;
3718       break;
3719 
3720     case ']':
3721       if (*src > env->pattern)   /* /].../ is allowed. */
3722 	CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]");
3723       break;
3724 
3725     case '#':
3726       if (IS_EXTEND(env->option)) {
3727 	while (!PEND) {
3728 	  PFETCH(c);
3729 	  if (ONIGENC_IS_CODE_NEWLINE(enc, c))
3730 	    break;
3731 	}
3732 	goto start;
3733 	break;
3734       }
3735       break;
3736 
3737     case ' ': case '\t': case '\n': case '\r': case '\f':
3738       if (IS_EXTEND(env->option))
3739 	goto start;
3740       break;
3741 
3742     default:
3743       /* string */
3744       break;
3745     }
3746   }
3747 
3748 #ifdef USE_VARIABLE_META_CHARS
3749  out:
3750 #endif
3751   *src = p;
3752   return tok->type;
3753 }
3754 
3755 static int
add_ctype_to_cc_by_range(CClassNode * cc,int ctype ARG_UNUSED,int not,OnigEncoding enc ARG_UNUSED,OnigCodePoint sb_out,const OnigCodePoint mbr[])3756 add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not,
3757 			 OnigEncoding enc ARG_UNUSED,
3758                          OnigCodePoint sb_out, const OnigCodePoint mbr[])
3759 {
3760   int i, r;
3761   OnigCodePoint j;
3762 
3763   int n = ONIGENC_CODE_RANGE_NUM(mbr);
3764 
3765   if (not == 0) {
3766     for (i = 0; i < n; i++) {
3767       for (j  = ONIGENC_CODE_RANGE_FROM(mbr, i);
3768            j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) {
3769 	if (j >= sb_out) {
3770 	  if (j == ONIGENC_CODE_RANGE_TO(mbr, i)) i++;
3771 	  else if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) {
3772 	    r = add_code_range_to_buf(&(cc->mbuf), j,
3773 				      ONIGENC_CODE_RANGE_TO(mbr, i));
3774 	    if (r != 0) return r;
3775 	    i++;
3776 	  }
3777 
3778 	  goto sb_end;
3779 	}
3780         BITSET_SET_BIT(cc->bs, j);
3781       }
3782     }
3783 
3784   sb_end:
3785     for ( ; i < n; i++) {
3786       r = add_code_range_to_buf(&(cc->mbuf),
3787                                 ONIGENC_CODE_RANGE_FROM(mbr, i),
3788                                 ONIGENC_CODE_RANGE_TO(mbr, i));
3789       if (r != 0) return r;
3790     }
3791   }
3792   else {
3793     OnigCodePoint prev = 0;
3794 
3795     for (i = 0; i < n; i++) {
3796       for (j = prev;
3797 	   j < ONIGENC_CODE_RANGE_FROM(mbr, i); j++) {
3798 	if (j >= sb_out) {
3799 	  goto sb_end2;
3800 	}
3801 	BITSET_SET_BIT(cc->bs, j);
3802       }
3803       prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
3804     }
3805     for (j = prev; j < sb_out; j++) {
3806       BITSET_SET_BIT(cc->bs, j);
3807     }
3808 
3809   sb_end2:
3810     prev = sb_out;
3811 
3812     for (i = 0; i < n; i++) {
3813       if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) {
3814 	r = add_code_range_to_buf(&(cc->mbuf), prev,
3815                                   ONIGENC_CODE_RANGE_FROM(mbr, i) - 1);
3816 	if (r != 0) return r;
3817       }
3818       prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
3819     }
3820     if (prev < 0x7fffffff) {
3821       r = add_code_range_to_buf(&(cc->mbuf), prev, 0x7fffffff);
3822       if (r != 0) return r;
3823     }
3824   }
3825 
3826   return 0;
3827 }
3828 
3829 static int
add_ctype_to_cc(CClassNode * cc,int ctype,int not,ScanEnv * env)3830 add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env)
3831 {
3832   int c, r;
3833   const OnigCodePoint *ranges;
3834   OnigCodePoint sb_out;
3835   OnigEncoding enc = env->enc;
3836 
3837   r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges);
3838   if (r == 0) {
3839     return add_ctype_to_cc_by_range(cc, ctype, not, env->enc, sb_out, ranges);
3840   }
3841   else if (r != ONIG_NO_SUPPORT_CONFIG) {
3842     return r;
3843   }
3844 
3845   r = 0;
3846   switch (ctype) {
3847   case ONIGENC_CTYPE_ALPHA:
3848   case ONIGENC_CTYPE_BLANK:
3849   case ONIGENC_CTYPE_CNTRL:
3850   case ONIGENC_CTYPE_DIGIT:
3851   case ONIGENC_CTYPE_LOWER:
3852   case ONIGENC_CTYPE_PUNCT:
3853   case ONIGENC_CTYPE_SPACE:
3854   case ONIGENC_CTYPE_UPPER:
3855   case ONIGENC_CTYPE_XDIGIT:
3856   case ONIGENC_CTYPE_ASCII:
3857   case ONIGENC_CTYPE_ALNUM:
3858     if (not != 0) {
3859       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3860 	if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3861 	  BITSET_SET_BIT(cc->bs, c);
3862       }
3863       ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3864     }
3865     else {
3866       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3867 	if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3868 	  BITSET_SET_BIT(cc->bs, c);
3869       }
3870     }
3871     break;
3872 
3873   case ONIGENC_CTYPE_GRAPH:
3874   case ONIGENC_CTYPE_PRINT:
3875     if (not != 0) {
3876       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3877 	if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3878 	  BITSET_SET_BIT(cc->bs, c);
3879       }
3880     }
3881     else {
3882       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3883 	if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3884 	  BITSET_SET_BIT(cc->bs, c);
3885       }
3886       ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3887     }
3888     break;
3889 
3890   case ONIGENC_CTYPE_WORD:
3891     if (not == 0) {
3892       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3893 	if (IS_CODE_SB_WORD(enc, c)) BITSET_SET_BIT(cc->bs, c);
3894       }
3895       ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3896     }
3897     else {
3898       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3899         if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) /* check invalid code point */
3900 	    && ! ONIGENC_IS_CODE_WORD(enc, c))
3901 	  BITSET_SET_BIT(cc->bs, c);
3902       }
3903     }
3904     break;
3905 
3906   default:
3907     return ONIGERR_PARSER_BUG;
3908     break;
3909   }
3910 
3911   return r;
3912 }
3913 
3914 static int
parse_posix_bracket(CClassNode * cc,UChar ** src,UChar * end,ScanEnv * env)3915 parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env)
3916 {
3917 #define POSIX_BRACKET_CHECK_LIMIT_LENGTH  20
3918 #define POSIX_BRACKET_NAME_MIN_LEN         4
3919 
3920   static PosixBracketEntryType PBS[] = {
3921     { (UChar* )"alnum",  ONIGENC_CTYPE_ALNUM,  5 },
3922     { (UChar* )"alpha",  ONIGENC_CTYPE_ALPHA,  5 },
3923     { (UChar* )"blank",  ONIGENC_CTYPE_BLANK,  5 },
3924     { (UChar* )"cntrl",  ONIGENC_CTYPE_CNTRL,  5 },
3925     { (UChar* )"digit",  ONIGENC_CTYPE_DIGIT,  5 },
3926     { (UChar* )"graph",  ONIGENC_CTYPE_GRAPH,  5 },
3927     { (UChar* )"lower",  ONIGENC_CTYPE_LOWER,  5 },
3928     { (UChar* )"print",  ONIGENC_CTYPE_PRINT,  5 },
3929     { (UChar* )"punct",  ONIGENC_CTYPE_PUNCT,  5 },
3930     { (UChar* )"space",  ONIGENC_CTYPE_SPACE,  5 },
3931     { (UChar* )"upper",  ONIGENC_CTYPE_UPPER,  5 },
3932     { (UChar* )"xdigit", ONIGENC_CTYPE_XDIGIT, 6 },
3933     { (UChar* )"ascii",  ONIGENC_CTYPE_ASCII,  5 },
3934     { (UChar* )"word",   ONIGENC_CTYPE_WORD,   4 },
3935     { (UChar* )NULL,     -1, 0 }
3936   };
3937 
3938   PosixBracketEntryType *pb;
3939   int not, i, r;
3940   OnigCodePoint c;
3941   OnigEncoding enc = env->enc;
3942   UChar *p = *src;
3943 
3944   if (PPEEK_IS('^')) {
3945     PINC_S;
3946     not = 1;
3947   }
3948   else
3949     not = 0;
3950 
3951   if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MIN_LEN + 3)
3952     goto not_posix_bracket;
3953 
3954   for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
3955     if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) {
3956       p = (UChar* )onigenc_step(enc, p, end, pb->len);
3957       if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0)
3958         return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
3959 
3960       r = add_ctype_to_cc(cc, pb->ctype, not, env);
3961       if (r != 0) return r;
3962 
3963       PINC_S; PINC_S;
3964       *src = p;
3965       return 0;
3966     }
3967   }
3968 
3969  not_posix_bracket:
3970   c = 0;
3971   i = 0;
3972   while (!PEND && ((c = PPEEK) != ':') && c != ']') {
3973     PINC_S;
3974     if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break;
3975   }
3976   if (c == ':' && ! PEND) {
3977     PINC_S;
3978     if (! PEND) {
3979       PFETCH_S(c);
3980       if (c == ']')
3981         return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
3982     }
3983   }
3984 
3985   return 1;  /* 1: is not POSIX bracket, but no error. */
3986 }
3987 
3988 static int
fetch_char_property_to_ctype(UChar ** src,UChar * end,ScanEnv * env)3989 fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env)
3990 {
3991   int r;
3992   OnigCodePoint c;
3993   OnigEncoding enc = env->enc;
3994   UChar *prev, *start, *p = *src;
3995 
3996   r = 0;
3997   start = prev = p;
3998 
3999   while (!PEND) {
4000     prev = p;
4001     PFETCH_S(c);
4002     if (c == '}') {
4003       r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev);
4004       if (r < 0) break;
4005 
4006       *src = p;
4007       return r;
4008     }
4009     else if (c == '(' || c == ')' || c == '{' || c == '|') {
4010       r = ONIGERR_INVALID_CHAR_PROPERTY_NAME;
4011       break;
4012     }
4013   }
4014 
4015   onig_scan_env_set_error_string(env, r, *src, prev);
4016   return r;
4017 }
4018 
4019 static int
parse_char_property(Node ** np,OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)4020 parse_char_property(Node** np, OnigToken* tok, UChar** src, UChar* end,
4021 		    ScanEnv* env)
4022 {
4023   int r, ctype;
4024   CClassNode* cc;
4025 
4026   ctype = fetch_char_property_to_ctype(src, end, env);
4027   if (ctype < 0) return ctype;
4028 
4029   *np = node_new_cclass();
4030   CHECK_NULL_RETURN_MEMERR(*np);
4031   cc = NCCLASS(*np);
4032   r = add_ctype_to_cc(cc, ctype, 0, env);
4033   if (r != 0) return r;
4034   if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
4035 
4036   return 0;
4037 }
4038 
4039 
/* Parser state while reading the items of a character class. */
enum CCSTATE {
  CCS_VALUE    = 0,  /* a value or class has been read and is pending */
  CCS_RANGE    = 1,  /* a range operator followed a value; upper bound expected */
  CCS_COMPLETE = 2,  /* a range has just been completed */
  CCS_START    = 3   /* nothing read yet (initial state, and after "&&") */
};
4046 
/* Kind of the value most recently read inside a character class. */
enum CCVALTYPE {
  CCV_SB         = 0,  /* single-byte value, stored in the bitset */
  CCV_CODE_POINT = 1,  /* code point, stored in the multi-byte range buffer */
  CCV_CLASS      = 2   /* a whole class (ctype, property, POSIX bracket) */
};
4052 
4053 static int
next_state_class(CClassNode * cc,OnigCodePoint * vs,enum CCVALTYPE * type,enum CCSTATE * state,ScanEnv * env)4054 next_state_class(CClassNode* cc, OnigCodePoint* vs, enum CCVALTYPE* type,
4055 		 enum CCSTATE* state, ScanEnv* env)
4056 {
4057   int r;
4058 
4059   if (*state == CCS_RANGE)
4060     return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE;
4061 
4062   if (*state == CCS_VALUE && *type != CCV_CLASS) {
4063     if (*type == CCV_SB)
4064       BITSET_SET_BIT(cc->bs, (int )(*vs));
4065     else if (*type == CCV_CODE_POINT) {
4066       r = add_code_range(&(cc->mbuf), env, *vs, *vs);
4067       if (r < 0) return r;
4068     }
4069   }
4070 
4071   if (*state != CCS_START)
4072     *state = CCS_VALUE;
4073 
4074   *type  = CCV_CLASS;
4075   return 0;
4076 }
4077 
4078 static int
next_state_val(CClassNode * cc,OnigCodePoint * vs,OnigCodePoint v,int * vs_israw,int v_israw,enum CCVALTYPE intype,enum CCVALTYPE * type,enum CCSTATE * state,ScanEnv * env)4079 next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v,
4080 	       int* vs_israw, int v_israw,
4081 	       enum CCVALTYPE intype, enum CCVALTYPE* type,
4082 	       enum CCSTATE* state, ScanEnv* env)
4083 {
4084   int r;
4085 
4086   switch (*state) {
4087   case CCS_VALUE:
4088     if (*type == CCV_SB)
4089     {
4090     if (*vs > 0xff)
4091       return ONIGERR_INVALID_CODE_POINT_VALUE;
4092       BITSET_SET_BIT(cc->bs, (int )(*vs));
4093     }
4094     else if (*type == CCV_CODE_POINT) {
4095       r = add_code_range(&(cc->mbuf), env, *vs, *vs);
4096       if (r < 0) return r;
4097     }
4098     break;
4099 
4100   case CCS_RANGE:
4101     if (intype == *type) {
4102       if (intype == CCV_SB) {
4103         if (*vs > 0xff || v > 0xff)
4104           return ONIGERR_INVALID_CODE_POINT_VALUE;
4105 
4106 	if (*vs > v) {
4107 	  if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
4108 	    goto ccs_range_end;
4109 	  else
4110 	    return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
4111 	}
4112 	bitset_set_range(cc->bs, (int )*vs, (int )v);
4113       }
4114       else {
4115 	r = add_code_range(&(cc->mbuf), env, *vs, v);
4116 	if (r < 0) return r;
4117       }
4118     }
4119     else {
4120 #if 0
4121       if (intype == CCV_CODE_POINT && *type == CCV_SB) {
4122 #endif
4123 	if (*vs > v) {
4124 	  if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
4125 	    goto ccs_range_end;
4126 	  else
4127 	    return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
4128 	}
4129 	bitset_set_range(cc->bs, (int )*vs, (int )(v < 0xff ? v : 0xff));
4130 	r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v);
4131 	if (r < 0) return r;
4132 #if 0
4133       }
4134       else
4135 	return ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE;
4136 #endif
4137     }
4138   ccs_range_end:
4139     *state = CCS_COMPLETE;
4140     break;
4141 
4142   case CCS_COMPLETE:
4143   case CCS_START:
4144     *state = CCS_VALUE;
4145     break;
4146 
4147   default:
4148     break;
4149   }
4150 
4151   *vs_israw = v_israw;
4152   *vs       = v;
4153   *type     = intype;
4154   return 0;
4155 }
4156 
4157 static int
code_exist_check(OnigCodePoint c,UChar * from,UChar * end,int ignore_escaped,ScanEnv * env)4158 code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped,
4159 		 ScanEnv* env)
4160 {
4161   int in_esc;
4162   OnigCodePoint code;
4163   OnigEncoding enc = env->enc;
4164   UChar* p = from;
4165 
4166   in_esc = 0;
4167   while (! PEND) {
4168     if (ignore_escaped && in_esc) {
4169       in_esc = 0;
4170     }
4171     else {
4172       PFETCH_S(code);
4173       if (code == c) return 1;
4174       if (code == MC_ESC(env->syntax)) in_esc = 1;
4175     }
4176   }
4177   return 0;
4178 }
4179 
4180 static int
parse_char_class(Node ** np,OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)4181 parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
4182 		 ScanEnv* env)
4183 {
4184   int r, neg, len, fetched, and_start;
4185   OnigCodePoint v, vs;
4186   UChar *p;
4187   Node* node;
4188   CClassNode *cc, *prev_cc;
4189   CClassNode work_cc;
4190 
4191   enum CCSTATE state;
4192   enum CCVALTYPE val_type, in_type;
4193   int val_israw, in_israw;
4194 
4195   prev_cc = (CClassNode* )NULL;
4196   *np = NULL_NODE;
4197   r = fetch_token_in_cc(tok, src, end, env);
4198   if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) {
4199     neg = 1;
4200     r = fetch_token_in_cc(tok, src, end, env);
4201   }
4202   else {
4203     neg = 0;
4204   }
4205 
4206   if (r < 0) return r;
4207   if (r == TK_CC_CLOSE) {
4208     if (! code_exist_check((OnigCodePoint )']',
4209                            *src, env->pattern_end, 1, env))
4210       return ONIGERR_EMPTY_CHAR_CLASS;
4211 
4212     CC_ESC_WARN(env, (UChar* )"]");
4213     r = tok->type = TK_CHAR;  /* allow []...] */
4214   }
4215 
4216   *np = node = node_new_cclass();
4217   CHECK_NULL_RETURN_MEMERR(node);
4218   cc = NCCLASS(node);
4219 
4220   and_start = 0;
4221   state = CCS_START;
4222   p = *src;
4223   while (r != TK_CC_CLOSE) {
4224     fetched = 0;
4225     switch (r) {
4226     case TK_CHAR:
4227       len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c);
4228       if (len > 1) {
4229 	in_type = CCV_CODE_POINT;
4230       }
4231       else if (len < 0) {
4232 	r = len;
4233 	goto err;
4234       }
4235       else {
4236       sb_char:
4237 	in_type = CCV_SB;
4238       }
4239       v = (OnigCodePoint )tok->u.c;
4240       in_israw = 0;
4241       goto val_entry2;
4242       break;
4243 
4244     case TK_RAW_BYTE:
4245       /* tok->base != 0 : octal or hexadec. */
4246       if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) {
4247 	UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4248 	UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN;
4249 	UChar* psave = p;
4250 	int i, base = tok->base;
4251 
4252 	buf[0] = tok->u.c;
4253 	for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) {
4254 	  r = fetch_token_in_cc(tok, &p, end, env);
4255 	  if (r < 0) goto err;
4256 	  if (r != TK_RAW_BYTE || tok->base != base) {
4257 	    fetched = 1;
4258 	    break;
4259 	  }
4260 	  buf[i] = tok->u.c;
4261 	}
4262 
4263 	if (i < ONIGENC_MBC_MINLEN(env->enc)) {
4264 	  r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4265 	  goto err;
4266 	}
4267 
4268 	len = enclen(env->enc, buf);
4269 	if (i < len) {
4270 	  r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4271 	  goto err;
4272 	}
4273 	else if (i > len) { /* fetch back */
4274 	  p = psave;
4275 	  for (i = 1; i < len; i++) {
4276 	    r = fetch_token_in_cc(tok, &p, end, env);
4277 	  }
4278 	  fetched = 0;
4279 	}
4280 
4281 	if (i == 1) {
4282 	  v = (OnigCodePoint )buf[0];
4283 	  goto raw_single;
4284 	}
4285 	else {
4286 	  v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe);
4287 	  in_type = CCV_CODE_POINT;
4288 	}
4289       }
4290       else {
4291 	v = (OnigCodePoint )tok->u.c;
4292       raw_single:
4293 	in_type = CCV_SB;
4294       }
4295       in_israw = 1;
4296       goto val_entry2;
4297       break;
4298 
4299     case TK_CODE_POINT:
4300       v = tok->u.code;
4301       in_israw = 1;
4302     val_entry:
4303       len = ONIGENC_CODE_TO_MBCLEN(env->enc, v);
4304       if (len < 0) {
4305 	r = len;
4306 	goto err;
4307       }
4308       in_type = (len == 1 ? CCV_SB : CCV_CODE_POINT);
4309     val_entry2:
4310       r = next_state_val(cc, &vs, v, &val_israw, in_israw, in_type, &val_type,
4311 			 &state, env);
4312       if (r != 0) goto err;
4313       break;
4314 
4315     case TK_POSIX_BRACKET_OPEN:
4316       r = parse_posix_bracket(cc, &p, end, env);
4317       if (r < 0) goto err;
4318       if (r == 1) {  /* is not POSIX bracket */
4319 	CC_ESC_WARN(env, (UChar* )"[");
4320 	p = tok->backp;
4321 	v = (OnigCodePoint )tok->u.c;
4322 	in_israw = 0;
4323 	goto val_entry;
4324       }
4325       goto next_class;
4326       break;
4327 
4328     case TK_CHAR_TYPE:
4329       r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, env);
4330       if (r != 0) return r;
4331 
4332     next_class:
4333       r = next_state_class(cc, &vs, &val_type, &state, env);
4334       if (r != 0) goto err;
4335       break;
4336 
4337     case TK_CHAR_PROPERTY:
4338       {
4339 	int ctype;
4340 
4341 	ctype = fetch_char_property_to_ctype(&p, end, env);
4342 	if (ctype < 0) return ctype;
4343 	r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, env);
4344 	if (r != 0) return r;
4345 	goto next_class;
4346       }
4347       break;
4348 
4349     case TK_CC_RANGE:
4350       if (state == CCS_VALUE) {
4351 	r = fetch_token_in_cc(tok, &p, end, env);
4352 	if (r < 0) goto err;
4353 	fetched = 1;
4354 	if (r == TK_CC_CLOSE) { /* allow [x-] */
4355 	range_end_val:
4356 	  v = (OnigCodePoint )'-';
4357 	  in_israw = 0;
4358 	  goto val_entry;
4359 	}
4360 	else if (r == TK_CC_AND) {
4361 	  CC_ESC_WARN(env, (UChar* )"-");
4362 	  goto range_end_val;
4363 	}
4364 	state = CCS_RANGE;
4365       }
4366       else if (state == CCS_START) {
4367 	/* [-xa] is allowed */
4368 	v = (OnigCodePoint )tok->u.c;
4369 	in_israw = 0;
4370 
4371 	r = fetch_token_in_cc(tok, &p, end, env);
4372 	if (r < 0) goto err;
4373 	fetched = 1;
4374 	/* [--x] or [a&&-x] is warned. */
4375 	if (r == TK_CC_RANGE || and_start != 0)
4376 	  CC_ESC_WARN(env, (UChar* )"-");
4377 
4378 	goto val_entry;
4379       }
4380       else if (state == CCS_RANGE) {
4381 	CC_ESC_WARN(env, (UChar* )"-");
4382 	goto sb_char;  /* [!--x] is allowed */
4383       }
4384       else { /* CCS_COMPLETE */
4385 	r = fetch_token_in_cc(tok, &p, end, env);
4386 	if (r < 0) goto err;
4387 	fetched = 1;
4388 	if (r == TK_CC_CLOSE) goto range_end_val; /* allow [a-b-] */
4389 	else if (r == TK_CC_AND) {
4390 	  CC_ESC_WARN(env, (UChar* )"-");
4391 	  goto range_end_val;
4392 	}
4393 
4394 	if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) {
4395 	  CC_ESC_WARN(env, (UChar* )"-");
4396 	  goto sb_char;   /* [0-9-a] is allowed as [0-9\-a] */
4397 	}
4398 	r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
4399 	goto err;
4400       }
4401       break;
4402 
4403     case TK_CC_CC_OPEN: /* [ */
4404       {
4405 	Node *anode;
4406 	CClassNode* acc;
4407 
4408 	r = parse_char_class(&anode, tok, &p, end, env);
4409 	if (r != 0) goto cc_open_err;
4410 	acc = NCCLASS(anode);
4411 	r = or_cclass(cc, acc, env->enc);
4412 
4413 	onig_node_free(anode);
4414       cc_open_err:
4415 	if (r != 0) goto err;
4416       }
4417       break;
4418 
4419     case TK_CC_AND: /* && */
4420       {
4421 	if (state == CCS_VALUE) {
4422 	  r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
4423 			     &val_type, &state, env);
4424 	  if (r != 0) goto err;
4425 	}
4426 	/* initialize local variables */
4427 	and_start = 1;
4428 	state = CCS_START;
4429 
4430 	if (IS_NOT_NULL(prev_cc)) {
4431 	  r = and_cclass(prev_cc, cc, env->enc);
4432 	  if (r != 0) goto err;
4433 	  bbuf_free(cc->mbuf);
4434 	}
4435 	else {
4436 	  prev_cc = cc;
4437 	  cc = &work_cc;
4438 	}
4439 	initialize_cclass(cc);
4440       }
4441       break;
4442 
4443     case TK_EOT:
4444       r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS;
4445       goto err;
4446       break;
4447     default:
4448       r = ONIGERR_PARSER_BUG;
4449       goto err;
4450       break;
4451     }
4452 
4453     if (fetched)
4454       r = tok->type;
4455     else {
4456       r = fetch_token_in_cc(tok, &p, end, env);
4457       if (r < 0) goto err;
4458     }
4459   }
4460 
4461   if (state == CCS_VALUE) {
4462     r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
4463 		       &val_type, &state, env);
4464     if (r != 0) goto err;
4465   }
4466 
4467   if (IS_NOT_NULL(prev_cc)) {
4468     r = and_cclass(prev_cc, cc, env->enc);
4469     if (r != 0) goto err;
4470     bbuf_free(cc->mbuf);
4471     cc = prev_cc;
4472   }
4473 
4474   if (neg != 0)
4475     NCCLASS_SET_NOT(cc);
4476   else
4477     NCCLASS_CLEAR_NOT(cc);
4478   if (IS_NCCLASS_NOT(cc) &&
4479       IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) {
4480     int is_empty;
4481 
4482     is_empty = (IS_NULL(cc->mbuf) ? 1 : 0);
4483     if (is_empty != 0)
4484       BITSET_IS_EMPTY(cc->bs, is_empty);
4485 
4486     if (is_empty == 0) {
4487 #define NEWLINE_CODE    0x0a
4488 
4489       if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) {
4490         if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1)
4491           BITSET_SET_BIT(cc->bs, NEWLINE_CODE);
4492         else
4493           add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE);
4494       }
4495     }
4496   }
4497   *src = p;
4498   return 0;
4499 
4500  err:
4501   if (cc != NCCLASS(*np))
4502     bbuf_free(cc->mbuf);
4503   onig_node_free(*np);
4504   return r;
4505 }
4506 
4507 static int parse_subexp(Node** top, OnigToken* tok, int term,
4508 			UChar** src, UChar* end, ScanEnv* env);
4509 
4510 static int
parse_enclose(Node ** np,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)4511 parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
4512 	      ScanEnv* env)
4513 {
4514   int r, num;
4515   Node *target;
4516   OnigOptionType option;
4517   OnigCodePoint c;
4518   OnigEncoding enc = env->enc;
4519 
4520 #ifdef USE_NAMED_GROUP
4521   int list_capture;
4522 #endif
4523 
4524   UChar* p = *src;
4525   PFETCH_READY;
4526 
4527   *np = NULL;
4528   if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
4529 
4530   option = env->option;
4531   if (PPEEK_IS('?') &&
4532       IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
4533     PINC;
4534     if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
4535 
4536     PFETCH(c);
4537     switch (c) {
4538     case ':':   /* (?:...) grouping only */
4539     group:
4540       r = fetch_token(tok, &p, end, env);
4541       if (r < 0) return r;
4542       r = parse_subexp(np, tok, term, &p, end, env);
4543       if (r < 0) return r;
4544       *src = p;
4545       return 1; /* group */
4546       break;
4547 
4548     case '=':
4549       *np = onig_node_new_anchor(ANCHOR_PREC_READ);
4550       break;
4551     case '!':  /*         preceding read */
4552       *np = onig_node_new_anchor(ANCHOR_PREC_READ_NOT);
4553       break;
4554     case '>':            /* (?>...) stop backtrack */
4555       *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
4556       break;
4557 
4558 #ifdef USE_NAMED_GROUP
4559     case '\'':
4560       if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4561 	goto named_group1;
4562       }
4563       else
4564 	return ONIGERR_UNDEFINED_GROUP_OPTION;
4565       break;
4566 #endif
4567 
4568     case '<':   /* look behind (?<=...), (?<!...) */
4569       PFETCH(c);
4570       if (c == '=')
4571 	*np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND);
4572       else if (c == '!')
4573 	*np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND_NOT);
4574 #ifdef USE_NAMED_GROUP
4575       else {
4576 	if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4577 	  UChar *name;
4578 	  UChar *name_end;
4579 
4580 	  PUNFETCH;
4581 	  c = '<';
4582 
4583 	named_group1:
4584 	  list_capture = 0;
4585 
4586 	named_group2:
4587 	  name = p;
4588 	  r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0);
4589 	  if (r < 0) return r;
4590 
4591 	  num = scan_env_add_mem_entry(env);
4592 	  if (num < 0) return num;
4593 	  if (list_capture != 0 && num >= (int )BIT_STATUS_BITS_NUM)
4594 	    return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
4595 
4596 	  r = name_add(env->reg, name, name_end, num, env);
4597 	  if (r != 0) return r;
4598 	  *np = node_new_enclose_memory(env->option, 1);
4599 	  CHECK_NULL_RETURN_MEMERR(*np);
4600 	  NENCLOSE(*np)->regnum = num;
4601 	  if (list_capture != 0)
4602 	    BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
4603 	  env->num_named++;
4604 	}
4605 	else {
4606 	  return ONIGERR_UNDEFINED_GROUP_OPTION;
4607 	}
4608       }
4609 #else
4610       else {
4611 	return ONIGERR_UNDEFINED_GROUP_OPTION;
4612       }
4613 #endif
4614       break;
4615 
4616     case '@':
4617       if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) {
4618 #ifdef USE_NAMED_GROUP
4619 	if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4620 	  PFETCH(c);
4621 	  if (c == '<' || c == '\'') {
4622 	    list_capture = 1;
4623 	    goto named_group2; /* (?@<name>...) */
4624 	  }
4625 	  PUNFETCH;
4626 	}
4627 #endif
4628 	*np = node_new_enclose_memory(env->option, 0);
4629 	CHECK_NULL_RETURN_MEMERR(*np);
4630 	num = scan_env_add_mem_entry(env);
4631 	if (num < 0) {
4632 	  onig_node_free(*np);
4633 	  return num;
4634 	}
4635 	else if (num >= (int )BIT_STATUS_BITS_NUM) {
4636 	  onig_node_free(*np);
4637 	  return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
4638 	}
4639 	NENCLOSE(*np)->regnum = num;
4640 	BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
4641       }
4642       else {
4643 	return ONIGERR_UNDEFINED_GROUP_OPTION;
4644       }
4645       break;
4646 
4647 #ifdef USE_POSIXLINE_OPTION
4648     case 'p':
4649 #endif
4650     case '-': case 'i': case 'm': case 's': case 'x':
4651       {
4652 	int neg = 0;
4653 
4654 	while (1) {
4655 	  switch (c) {
4656 	  case ':':
4657 	  case ')':
4658 	  break;
4659 
4660 	  case '-':  neg = 1; break;
4661 	  case 'x':  ONOFF(option, ONIG_OPTION_EXTEND,     neg); break;
4662 	  case 'i':  ONOFF(option, ONIG_OPTION_IGNORECASE, neg); break;
4663 	  case 's':
4664 	    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
4665 	      ONOFF(option, ONIG_OPTION_MULTILINE,  neg);
4666 	    }
4667 	    else
4668 	      return ONIGERR_UNDEFINED_GROUP_OPTION;
4669 	    break;
4670 
4671 	  case 'm':
4672 	    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
4673 	      ONOFF(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 1 : 0));
4674 	    }
4675 	    else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) {
4676 	      ONOFF(option, ONIG_OPTION_MULTILINE,  neg);
4677 	    }
4678 	    else
4679 	      return ONIGERR_UNDEFINED_GROUP_OPTION;
4680 	    break;
4681 #ifdef USE_POSIXLINE_OPTION
4682 	  case 'p':
4683 	    ONOFF(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg);
4684 	    break;
4685 #endif
4686 	  default:
4687 	    return ONIGERR_UNDEFINED_GROUP_OPTION;
4688 	  }
4689 
4690 	  if (c == ')') {
4691 	    *np = node_new_option(option);
4692 	    CHECK_NULL_RETURN_MEMERR(*np);
4693 	    *src = p;
4694 	    return 2; /* option only */
4695 	  }
4696 	  else if (c == ':') {
4697 	    OnigOptionType prev = env->option;
4698 
4699 	    env->option     = option;
4700 	    r = fetch_token(tok, &p, end, env);
4701 	    if (r < 0) return r;
4702 	    r = parse_subexp(&target, tok, term, &p, end, env);
4703 	    env->option = prev;
4704 	    if (r < 0) return r;
4705 	    *np = node_new_option(option);
4706 	    CHECK_NULL_RETURN_MEMERR(*np);
4707 	    NENCLOSE(*np)->target = target;
4708 	    *src = p;
4709 	    return 0;
4710 	  }
4711 
4712 	  if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
4713 	  PFETCH(c);
4714 	}
4715       }
4716       break;
4717 
4718     default:
4719       return ONIGERR_UNDEFINED_GROUP_OPTION;
4720     }
4721   }
4722   else {
4723     if (ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_DONT_CAPTURE_GROUP))
4724       goto group;
4725 
4726     *np = node_new_enclose_memory(env->option, 0);
4727     CHECK_NULL_RETURN_MEMERR(*np);
4728     num = scan_env_add_mem_entry(env);
4729     if (num < 0) return num;
4730     NENCLOSE(*np)->regnum = num;
4731   }
4732 
4733   CHECK_NULL_RETURN_MEMERR(*np);
4734   r = fetch_token(tok, &p, end, env);
4735   if (r < 0) return r;
4736   r = parse_subexp(&target, tok, term, &p, end, env);
4737   if (r < 0) return r;
4738 
4739   if (NTYPE(*np) == NT_ANCHOR)
4740     NANCHOR(*np)->target = target;
4741   else {
4742     NENCLOSE(*np)->target = target;
4743     if (NENCLOSE(*np)->type == ENCLOSE_MEMORY) {
4744       /* Don't move this to previous of parse_subexp() */
4745       r = scan_env_set_mem_node(env, NENCLOSE(*np)->regnum, *np);
4746       if (r != 0) return r;
4747     }
4748   }
4749 
4750   *src = p;
4751   return 0;
4752 }
4753 
/* Spelling of each "popular" quantifier, indexed by the number that
   popular_quantifier_num() yields for it.  Used only to build the
   nested-repeat warning text, so both the strings and the table itself
   are read-only. */
static const char* const PopularQStr[] = {
  "?", "*", "+", "??", "*?", "+?"
};

/* Replacement text shown in the nested-repeat warning, indexed by the
   ReduceTypeTable entry selected for the quantifier pair. */
static const char* const ReduceQStr[] = {
  "", "", "*", "*?", "??", "+ and ??", "+? and ?"
};
4761 
4762 static int
set_quantifier(Node * qnode,Node * target,int group,ScanEnv * env)4763 set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env)
4764 {
4765   QtfrNode* qn;
4766 
4767   qn = NQTFR(qnode);
4768   if (qn->lower == 1 && qn->upper == 1) {
4769     return 1;
4770   }
4771 
4772   switch (NTYPE(target)) {
4773   case NT_STR:
4774     if (! group) {
4775       StrNode* sn = NSTR(target);
4776       if (str_node_can_be_split(sn, env->enc)) {
4777 	Node* n = str_node_split_last_char(sn, env->enc);
4778 	if (IS_NOT_NULL(n)) {
4779 	  qn->target = n;
4780 	  return 2;
4781 	}
4782       }
4783     }
4784     break;
4785 
4786   case NT_QTFR:
4787     { /* check redundant double repeat. */
4788       /* verbose warn (?:.?)? etc... but not warn (.?)? etc... */
4789       QtfrNode* qnt   = NQTFR(target);
4790       int nestq_num   = popular_quantifier_num(qn);
4791       int targetq_num = popular_quantifier_num(qnt);
4792 
4793 #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR
4794       if (!IS_QUANTIFIER_BY_NUMBER(qn) && !IS_QUANTIFIER_BY_NUMBER(qnt) &&
4795 	  IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) {
4796         UChar buf[WARN_BUFSIZE];
4797 
4798         switch(ReduceTypeTable[targetq_num][nestq_num]) {
4799         case RQ_ASIS:
4800           break;
4801 
4802         case RQ_DEL:
4803           if (onig_verb_warn != onig_null_warn) {
4804             onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4805                                  env->pattern, env->pattern_end,
4806                                  (UChar* )"redundant nested repeat operator");
4807             (*onig_verb_warn)((char* )buf);
4808           }
4809           goto warn_exit;
4810           break;
4811 
4812         default:
4813           if (onig_verb_warn != onig_null_warn) {
4814             onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4815                                        env->pattern, env->pattern_end,
4816             (UChar* )"nested repeat operator %s and %s was replaced with '%s'",
4817             PopularQStr[targetq_num], PopularQStr[nestq_num],
4818             ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]);
4819             (*onig_verb_warn)((char* )buf);
4820           }
4821           goto warn_exit;
4822           break;
4823         }
4824       }
4825 
4826     warn_exit:
4827 #endif
4828       if (targetq_num >= 0) {
4829 	if (nestq_num >= 0) {
4830 	  onig_reduce_nested_quantifier(qnode, target);
4831 	  goto q_exit;
4832 	}
4833 	else if (targetq_num == 1 || targetq_num == 2) { /* * or + */
4834 	  /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */
4835 	  if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) {
4836 	    qn->upper = (qn->lower == 0 ? 1 : qn->lower);
4837 	  }
4838 	}
4839       }
4840     }
4841     break;
4842 
4843   default:
4844     break;
4845   }
4846 
4847   qn->target = target;
4848  q_exit:
4849   return 0;
4850 }
4851 
4852 
4853 #ifdef USE_SHARED_CCLASS_TABLE
4854 
4855 #define THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS     8
4856 
4857 /* for ctype node hash table */
4858 
4859 typedef struct {
4860   OnigEncoding enc;
4861   int not;
4862   int type;
4863 } type_cclass_key;
4864 
/* Key comparison callback for the shared ctype char-class table:
   0 when both keys have the same type, encoding and negation flag,
   1 otherwise. */
static int type_cclass_cmp(type_cclass_key* x, type_cclass_key* y)
{
  int same = (x->type == y->type) &&
             (x->enc  == y->enc)  &&
             (x->not  == y->not);

  return same ? 0 : 1;
}
4872 
/* Hash callback for the shared ctype char-class table.
 *
 * Mixes the bytes of key->enc and key->type, then adds key->not.  The
 * arithmetic is done in unsigned int: the multiply-and-add wraps around for
 * practically every key, and wrap-around is only well defined for unsigned
 * types.  The value is used only to pick a bucket; it may therefore differ
 * from what a signed computation would have produced, but equal keys still
 * hash equally.
 */
static int type_cclass_hash(type_cclass_key* key)
{
  int i;
  unsigned int val;
  const UChar *p;

  val = 0;

  p = (const UChar* )&(key->enc);
  for (i = 0; i < (int )sizeof(key->enc); i++) {
    val = val * 997u + (unsigned int )*p++;
  }

  p = (const UChar* )(&key->type);
  for (i = 0; i < (int )sizeof(key->type); i++) {
    val = val * 997u + (unsigned int )*p++;
  }

  val += (unsigned int )key->not;
  return (int )(val + (val >> 5));
}
4893 
4894 static struct st_hash_type type_type_cclass_hash = {
4895     type_cclass_cmp,
4896     type_cclass_hash,
4897 };
4898 
4899 static st_table* OnigTypeCClassTable;
4900 
4901 
4902 static int
i_free_shared_class(type_cclass_key * key,Node * node,void * arg ARG_UNUSED)4903 i_free_shared_class(type_cclass_key* key, Node* node, void* arg ARG_UNUSED)
4904 {
4905   if (IS_NOT_NULL(node)) {
4906     CClassNode* cc = NCCLASS(node);
4907     if (IS_NOT_NULL(cc->mbuf)) xfree(cc->mbuf);
4908     xfree(node);
4909   }
4910 
4911   if (IS_NOT_NULL(key)) xfree(key);
4912   return ST_DELETE;
4913 }
4914 
4915 extern int
onig_free_shared_cclass_table(void)4916 onig_free_shared_cclass_table(void)
4917 {
4918   if (IS_NOT_NULL(OnigTypeCClassTable)) {
4919     onig_st_foreach(OnigTypeCClassTable, i_free_shared_class, 0);
4920     onig_st_free_table(OnigTypeCClassTable);
4921     OnigTypeCClassTable = NULL;
4922   }
4923 
4924   return 0;
4925 }
4926 
4927 #endif /* USE_SHARED_CCLASS_TABLE */
4928 
4929 
4930 #ifndef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
4931 static int
clear_not_flag_cclass(CClassNode * cc,OnigEncoding enc)4932 clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc)
4933 {
4934   BBuf *tbuf;
4935   int r;
4936 
4937   if (IS_NCCLASS_NOT(cc)) {
4938     bitset_invert(cc->bs);
4939 
4940     if (! ONIGENC_IS_SINGLEBYTE(enc)) {
4941       r = not_code_range_buf(enc, cc->mbuf, &tbuf);
4942       if (r != 0) return r;
4943 
4944       bbuf_free(cc->mbuf);
4945       cc->mbuf = tbuf;
4946     }
4947 
4948     NCCLASS_CLEAR_NOT(cc);
4949   }
4950 
4951   return 0;
4952 }
4953 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
4954 
4955 typedef struct {
4956   ScanEnv*    env;
4957   CClassNode* cc;
4958   Node*       alt_root;
4959   Node**      ptail;
4960 } IApplyCaseFoldArg;
4961 
4962 static int
i_apply_case_fold(OnigCodePoint from,OnigCodePoint to[],int to_len,void * arg)4963 i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[],
4964 		  int to_len, void* arg)
4965 {
4966   IApplyCaseFoldArg* iarg;
4967   ScanEnv* env;
4968   CClassNode* cc;
4969   BitSetRef bs;
4970 
4971   iarg = (IApplyCaseFoldArg* )arg;
4972   env = iarg->env;
4973   cc  = iarg->cc;
4974   bs = cc->bs;
4975 
4976   if (to_len == 1) {
4977     int is_in = onig_is_code_in_cc(env->enc, from, cc);
4978 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
4979     if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) ||
4980 	(is_in == 0 &&  IS_NCCLASS_NOT(cc))) {
4981       if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
4982 	add_code_range(&(cc->mbuf), env, *to, *to);
4983       }
4984       else {
4985 	BITSET_SET_BIT(bs, *to);
4986       }
4987     }
4988 #else
4989     if (is_in != 0) {
4990       if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
4991 	if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc);
4992 	add_code_range(&(cc->mbuf), env, *to, *to);
4993       }
4994       else {
4995 	if (IS_NCCLASS_NOT(cc)) {
4996 	  BITSET_CLEAR_BIT(bs, *to);
4997 	}
4998 	else
4999 	  BITSET_SET_BIT(bs, *to);
5000       }
5001     }
5002 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
5003   }
5004   else {
5005     int r, i, len;
5006     UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
5007     Node *snode = NULL_NODE;
5008 
5009     if (onig_is_code_in_cc(env->enc, from, cc)
5010 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
5011 	&& !IS_NCCLASS_NOT(cc)
5012 #endif
5013 	) {
5014       for (i = 0; i < to_len; i++) {
5015 	len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf);
5016 	if (i == 0) {
5017 	  snode = onig_node_new_str(buf, buf + len);
5018 	  CHECK_NULL_RETURN_MEMERR(snode);
5019 
5020 	  /* char-class expanded multi-char only
5021 	     compare with string folded at match time. */
5022 	  NSTRING_SET_AMBIG(snode);
5023 	}
5024 	else {
5025 	  r = onig_node_str_cat(snode, buf, buf + len);
5026 	  if (r < 0) {
5027 	    onig_node_free(snode);
5028 	    return r;
5029 	  }
5030 	}
5031       }
5032 
5033       *(iarg->ptail) = onig_node_new_alt(snode, NULL_NODE);
5034       CHECK_NULL_RETURN_MEMERR(*(iarg->ptail));
5035       iarg->ptail = &(NCDR((*(iarg->ptail))));
5036     }
5037   }
5038 
5039   return 0;
5040 }
5041 
5042 static int
parse_exp(Node ** np,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)5043 parse_exp(Node** np, OnigToken* tok, int term,
5044 	  UChar** src, UChar* end, ScanEnv* env)
5045 {
5046   int r, len, group = 0;
5047   Node* qn;
5048   Node** targetp;
5049 
5050   *np = NULL;
5051   if (tok->type == (enum TokenSyms )term)
5052     goto end_of_token;
5053 
5054   switch (tok->type) {
5055   case TK_ALT:
5056   case TK_EOT:
5057   end_of_token:
5058   *np = node_new_empty();
5059   return tok->type;
5060   break;
5061 
5062   case TK_SUBEXP_OPEN:
5063     r = parse_enclose(np, tok, TK_SUBEXP_CLOSE, src, end, env);
5064     if (r < 0) return r;
5065     if (r == 1) group = 1;
5066     else if (r == 2) { /* option only */
5067       Node* target;
5068       OnigOptionType prev = env->option;
5069 
5070       env->option = NENCLOSE(*np)->option;
5071       r = fetch_token(tok, src, end, env);
5072       if (r < 0) return r;
5073       r = parse_subexp(&target, tok, term, src, end, env);
5074       env->option = prev;
5075       if (r < 0) return r;
5076       NENCLOSE(*np)->target = target;
5077       return tok->type;
5078     }
5079     break;
5080 
5081   case TK_SUBEXP_CLOSE:
5082     if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP))
5083       return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS;
5084 
5085     if (tok->escaped) goto tk_raw_byte;
5086     else goto tk_byte;
5087     break;
5088 
5089   case TK_STRING:
5090   tk_byte:
5091     {
5092       *np = node_new_str(tok->backp, *src);
5093       CHECK_NULL_RETURN_MEMERR(*np);
5094 
5095       while (1) {
5096 	r = fetch_token(tok, src, end, env);
5097 	if (r < 0) return r;
5098 	if (r != TK_STRING) break;
5099 
5100 	r = onig_node_str_cat(*np, tok->backp, *src);
5101 	if (r < 0) return r;
5102       }
5103 
5104     string_end:
5105       targetp = np;
5106       goto repeat;
5107     }
5108     break;
5109 
5110   case TK_RAW_BYTE:
5111   tk_raw_byte:
5112     {
5113       *np = node_new_str_raw_char((UChar )tok->u.c);
5114       CHECK_NULL_RETURN_MEMERR(*np);
5115       len = 1;
5116       while (1) {
5117 	if (len >= ONIGENC_MBC_MINLEN(env->enc)) {
5118 	  if (len == enclen(env->enc, NSTR(*np)->s)) {
5119 	    r = fetch_token(tok, src, end, env);
5120 	    NSTRING_CLEAR_RAW(*np);
5121 	    goto string_end;
5122 	  }
5123 	}
5124 
5125 	r = fetch_token(tok, src, end, env);
5126 	if (r < 0) return r;
5127 	if (r != TK_RAW_BYTE) {
5128 	  /* Don't use this, it is wrong for little endian encodings. */
5129 #ifdef USE_PAD_TO_SHORT_BYTE_CHAR
5130 	  int rem;
5131 	  if (len < ONIGENC_MBC_MINLEN(env->enc)) {
5132 	    rem = ONIGENC_MBC_MINLEN(env->enc) - len;
5133 	    (void )node_str_head_pad(NSTR(*np), rem, (UChar )0);
5134 	    if (len + rem == enclen(env->enc, NSTR(*np)->s)) {
5135 	      NSTRING_CLEAR_RAW(*np);
5136 	      goto string_end;
5137 	    }
5138 	  }
5139 #endif
5140 	  return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
5141 	}
5142 
5143 	r = node_str_cat_char(*np, (UChar )tok->u.c);
5144 	if (r < 0) return r;
5145 
5146 	len++;
5147       }
5148     }
5149     break;
5150 
5151   case TK_CODE_POINT:
5152     {
5153       UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
5154       int num = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf);
5155       if (num < 0) return num;
5156 #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
5157       *np = node_new_str_raw(buf, buf + num);
5158 #else
5159       *np = node_new_str(buf, buf + num);
5160 #endif
5161       CHECK_NULL_RETURN_MEMERR(*np);
5162     }
5163     break;
5164 
5165   case TK_QUOTE_OPEN:
5166     {
5167       OnigCodePoint end_op[2];
5168       UChar *qstart, *qend, *nextp;
5169 
5170       end_op[0] = (OnigCodePoint )MC_ESC(env->syntax);
5171       end_op[1] = (OnigCodePoint )'E';
5172       qstart = *src;
5173       qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc);
5174       if (IS_NULL(qend)) {
5175 	nextp = qend = end;
5176       }
5177       *np = node_new_str(qstart, qend);
5178       CHECK_NULL_RETURN_MEMERR(*np);
5179       *src = nextp;
5180     }
5181     break;
5182 
5183   case TK_CHAR_TYPE:
5184     {
5185       switch (tok->u.prop.ctype) {
5186       case ONIGENC_CTYPE_WORD:
5187 	*np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not);
5188 	CHECK_NULL_RETURN_MEMERR(*np);
5189 	break;
5190 
5191       case ONIGENC_CTYPE_SPACE:
5192       case ONIGENC_CTYPE_DIGIT:
5193       case ONIGENC_CTYPE_XDIGIT:
5194 	{
5195 	  CClassNode* cc;
5196 
5197 #ifdef USE_SHARED_CCLASS_TABLE
5198           const OnigCodePoint *mbr;
5199 	  OnigCodePoint sb_out;
5200 
5201           r = ONIGENC_GET_CTYPE_CODE_RANGE(env->enc, tok->u.prop.ctype,
5202 					   &sb_out, &mbr);
5203           if (r == 0 &&
5204               ONIGENC_CODE_RANGE_NUM(mbr)
5205               >= THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS) {
5206             type_cclass_key  key;
5207             type_cclass_key* new_key;
5208 
5209             key.enc  = env->enc;
5210             key.not  = tok->u.prop.not;
5211             key.type = tok->u.prop.ctype;
5212 
5213             THREAD_ATOMIC_START;
5214 
5215             if (IS_NULL(OnigTypeCClassTable)) {
5216               OnigTypeCClassTable
5217                 = onig_st_init_table_with_size(&type_type_cclass_hash, 10);
5218               if (IS_NULL(OnigTypeCClassTable)) {
5219                 THREAD_ATOMIC_END;
5220                 return ONIGERR_MEMORY;
5221               }
5222             }
5223             else {
5224               if (onig_st_lookup(OnigTypeCClassTable, (st_data_t )&key,
5225                                  (st_data_t* )np)) {
5226                 THREAD_ATOMIC_END;
5227                 break;
5228               }
5229             }
5230 
5231             *np = node_new_cclass_by_codepoint_range(tok->u.prop.not,
5232 						     sb_out, mbr);
5233             if (IS_NULL(*np)) {
5234               THREAD_ATOMIC_END;
5235               return ONIGERR_MEMORY;
5236             }
5237 
5238             cc = NCCLASS(*np);
5239             NCCLASS_SET_SHARE(cc);
5240             new_key = (type_cclass_key* )xmalloc(sizeof(type_cclass_key));
5241 	    xmemcpy(new_key, &key, sizeof(type_cclass_key));
5242             onig_st_add_direct(OnigTypeCClassTable, (st_data_t )new_key,
5243                                (st_data_t )*np);
5244 
5245             THREAD_ATOMIC_END;
5246           }
5247           else {
5248 #endif
5249             *np = node_new_cclass();
5250             CHECK_NULL_RETURN_MEMERR(*np);
5251             cc = NCCLASS(*np);
5252             add_ctype_to_cc(cc, tok->u.prop.ctype, 0, env);
5253             if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
5254 #ifdef USE_SHARED_CCLASS_TABLE
5255           }
5256 #endif
5257 	}
5258 	break;
5259 
5260       default:
5261 	return ONIGERR_PARSER_BUG;
5262 	break;
5263       }
5264     }
5265     break;
5266 
5267   case TK_CHAR_PROPERTY:
5268     r = parse_char_property(np, tok, src, end, env);
5269     if (r != 0) return r;
5270     break;
5271 
5272   case TK_CC_OPEN:
5273     {
5274       CClassNode* cc;
5275 
5276       r = parse_char_class(np, tok, src, end, env);
5277       if (r != 0) return r;
5278 
5279       cc = NCCLASS(*np);
5280       if (IS_IGNORECASE(env->option)) {
5281 	IApplyCaseFoldArg iarg;
5282 
5283 	iarg.env      = env;
5284 	iarg.cc       = cc;
5285 	iarg.alt_root = NULL_NODE;
5286 	iarg.ptail    = &(iarg.alt_root);
5287 
5288 	r = ONIGENC_APPLY_ALL_CASE_FOLD(env->enc, env->case_fold_flag,
5289 					i_apply_case_fold, &iarg);
5290 	if (r != 0) {
5291 	  onig_node_free(iarg.alt_root);
5292 	  return r;
5293 	}
5294 	if (IS_NOT_NULL(iarg.alt_root)) {
5295           Node* work = onig_node_new_alt(*np, iarg.alt_root);
5296           if (IS_NULL(work)) {
5297             onig_node_free(iarg.alt_root);
5298             return ONIGERR_MEMORY;
5299           }
5300           *np = work;
5301 	}
5302       }
5303     }
5304     break;
5305 
5306   case TK_ANYCHAR:
5307     *np = node_new_anychar();
5308     CHECK_NULL_RETURN_MEMERR(*np);
5309     break;
5310 
5311   case TK_ANYCHAR_ANYTIME:
5312     *np = node_new_anychar();
5313     CHECK_NULL_RETURN_MEMERR(*np);
5314     qn = node_new_quantifier(0, REPEAT_INFINITE, 0);
5315     CHECK_NULL_RETURN_MEMERR(qn);
5316     NQTFR(qn)->target = *np;
5317     *np = qn;
5318     break;
5319 
5320   case TK_BACKREF:
5321     len = tok->u.backref.num;
5322     *np = node_new_backref(len,
5323 		   (len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)),
5324 			   tok->u.backref.by_name,
5325 #ifdef USE_BACKREF_WITH_LEVEL
5326 			   tok->u.backref.exist_level,
5327 			   tok->u.backref.level,
5328 #endif
5329 			   env);
5330     CHECK_NULL_RETURN_MEMERR(*np);
5331     break;
5332 
5333 #ifdef USE_SUBEXP_CALL
5334   case TK_CALL:
5335     {
5336       int gnum = tok->u.call.gnum;
5337 
5338       if (gnum < 0) {
5339 	gnum = BACKREF_REL_TO_ABS(gnum, env);
5340 	if (gnum <= 0)
5341 	  return ONIGERR_INVALID_BACKREF;
5342       }
5343       *np = node_new_call(tok->u.call.name, tok->u.call.name_end, gnum);
5344       CHECK_NULL_RETURN_MEMERR(*np);
5345       env->num_call++;
5346     }
5347     break;
5348 #endif
5349 
5350   case TK_ANCHOR:
5351     *np = onig_node_new_anchor(tok->u.anchor);
5352     break;
5353 
5354   case TK_OP_REPEAT:
5355   case TK_INTERVAL:
5356     if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) {
5357       if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS))
5358 	return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED;
5359       else
5360 	*np = node_new_empty();
5361     }
5362     else {
5363       goto tk_byte;
5364     }
5365     break;
5366 
5367   default:
5368     return ONIGERR_PARSER_BUG;
5369     break;
5370   }
5371 
5372   {
5373     targetp = np;
5374 
5375   re_entry:
5376     r = fetch_token(tok, src, end, env);
5377     if (r < 0) return r;
5378 
5379   repeat:
5380     if (r == TK_OP_REPEAT || r == TK_INTERVAL) {
5381       if (is_invalid_quantifier_target(*targetp))
5382 	return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID;
5383 
5384       qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper,
5385 			       (r == TK_INTERVAL ? 1 : 0));
5386       CHECK_NULL_RETURN_MEMERR(qn);
5387       NQTFR(qn)->greedy = tok->u.repeat.greedy;
5388       r = set_quantifier(qn, *targetp, group, env);
5389       if (r < 0) {
5390 	onig_node_free(qn);
5391 	return r;
5392       }
5393 
5394       if (tok->u.repeat.possessive != 0) {
5395 	Node* en;
5396 	en = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
5397 	if (IS_NULL(en)) {
5398 	  onig_node_free(qn);
5399 	  return ONIGERR_MEMORY;
5400 	}
5401 	NENCLOSE(en)->target = qn;
5402 	qn = en;
5403       }
5404 
5405       if (r == 0) {
5406 	*targetp = qn;
5407       }
5408       else if (r == 1) {
5409 	onig_node_free(qn);
5410       }
5411       else if (r == 2) { /* split case: /abc+/ */
5412 	Node *tmp;
5413 
5414 	*targetp = node_new_list(*targetp, NULL);
5415 	if (IS_NULL(*targetp)) {
5416 	  onig_node_free(qn);
5417 	  return ONIGERR_MEMORY;
5418 	}
5419 	tmp = NCDR(*targetp) = node_new_list(qn, NULL);
5420 	if (IS_NULL(tmp)) {
5421 	  onig_node_free(qn);
5422 	  return ONIGERR_MEMORY;
5423 	}
5424 	targetp = &(NCAR(tmp));
5425       }
5426       goto re_entry;
5427     }
5428   }
5429 
5430   return r;
5431 }
5432 
5433 static int
parse_branch(Node ** top,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)5434 parse_branch(Node** top, OnigToken* tok, int term,
5435 	     UChar** src, UChar* end, ScanEnv* env)
5436 {
5437   int r;
5438   Node *node, **headp;
5439 
5440   *top = NULL;
5441   r = parse_exp(&node, tok, term, src, end, env);
5442   if (r < 0) return r;
5443 
5444   if (r == TK_EOT || r == term || r == TK_ALT) {
5445     *top = node;
5446   }
5447   else {
5448     *top  = node_new_list(node, NULL);
5449     headp = &(NCDR(*top));
5450     while (r != TK_EOT && r != term && r != TK_ALT) {
5451       r = parse_exp(&node, tok, term, src, end, env);
5452       if (r < 0) return r;
5453 
5454       if (NTYPE(node) == NT_LIST) {
5455 	*headp = node;
5456 	while (IS_NOT_NULL(NCDR(node))) node = NCDR(node);
5457 	headp = &(NCDR(node));
5458       }
5459       else {
5460 	*headp = node_new_list(node, NULL);
5461 	headp = &(NCDR(*headp));
5462       }
5463     }
5464   }
5465 
5466   return r;
5467 }
5468 
5469 /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */
5470 static int
parse_subexp(Node ** top,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)5471 parse_subexp(Node** top, OnigToken* tok, int term,
5472 	     UChar** src, UChar* end, ScanEnv* env)
5473 {
5474   int r;
5475   Node *node, **headp;
5476 
5477   *top = NULL;
5478   r = parse_branch(&node, tok, term, src, end, env);
5479   if (r < 0) {
5480     onig_node_free(node);
5481     return r;
5482   }
5483 
5484   if (r == term) {
5485     *top = node;
5486   }
5487   else if (r == TK_ALT) {
5488     *top  = onig_node_new_alt(node, NULL);
5489     headp = &(NCDR(*top));
5490     while (r == TK_ALT) {
5491       r = fetch_token(tok, src, end, env);
5492       if (r < 0) return r;
5493       r = parse_branch(&node, tok, term, src, end, env);
5494       if (r < 0) return r;
5495 
5496       *headp = onig_node_new_alt(node, NULL);
5497       headp = &(NCDR(*headp));
5498     }
5499 
5500     if (tok->type != (enum TokenSyms )term)
5501       goto err;
5502   }
5503   else {
5504   err:
5505     if (term == TK_SUBEXP_CLOSE)
5506       return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
5507     else
5508       return ONIGERR_PARSER_BUG;
5509   }
5510 
5511   return r;
5512 }
5513 
5514 static int
parse_regexp(Node ** top,UChar ** src,UChar * end,ScanEnv * env)5515 parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env)
5516 {
5517   int r;
5518   OnigToken tok;
5519 
5520   r = fetch_token(&tok, src, end, env);
5521   if (r < 0) return r;
5522   r = parse_subexp(top, &tok, TK_EOT, src, end, env);
5523   if (r < 0) return r;
5524   return 0;
5525 }
5526 
5527 extern int
onig_parse_make_tree(Node ** root,const UChar * pattern,const UChar * end,regex_t * reg,ScanEnv * env)5528 onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end,
5529 		     regex_t* reg, ScanEnv* env)
5530 {
5531   int r;
5532   UChar* p;
5533 
5534 #ifdef USE_NAMED_GROUP
5535   names_clear(reg);
5536 #endif
5537 
5538   scan_env_clear(env);
5539   env->option         = reg->options;
5540   env->case_fold_flag = reg->case_fold_flag;
5541   env->enc            = reg->enc;
5542   env->syntax         = reg->syntax;
5543   env->pattern        = (UChar* )pattern;
5544   env->pattern_end    = (UChar* )end;
5545   env->reg            = reg;
5546 
5547   *root = NULL;
5548   p = (UChar* )pattern;
5549   r = parse_regexp(root, &p, (UChar* )end, env);
5550   reg->num_mem = env->num_mem;
5551   return r;
5552 }
5553 
5554 extern void
onig_scan_env_set_error_string(ScanEnv * env,int ecode ARG_UNUSED,UChar * arg,UChar * arg_end)5555 onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED,
5556 				UChar* arg, UChar* arg_end)
5557 {
5558   env->error     = arg;
5559   env->error_end = arg_end;
5560 }
5561