xref: /PHP-5.5/ext/mbstring/oniguruma/regparse.c (revision 75adcc8d)
1 /**********************************************************************
2   regparse.c -  Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2008  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "regparse.h"
31 #include "st.h"
32 
33 #define WARN_BUFSIZE    256
34 
35 #define CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
36 
37 
38 OnigSyntaxType OnigSyntaxRuby = {
39   (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
40      ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
41      ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS |
42      ONIG_SYN_OP_ESC_C_CONTROL )
43    & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
44   , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
45       ONIG_SYN_OP2_OPTION_RUBY |
46       ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
47       ONIG_SYN_OP2_ESC_G_SUBEXP_CALL |
48       ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY  |
49       ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
50       ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
51       ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
52       ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
53       ONIG_SYN_OP2_ESC_H_XDIGIT )
54   , ( SYN_GNU_REGEX_BV |
55       ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
56       ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
57       ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
58       ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
59       ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY |
60       ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
61       ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
62   , ONIG_OPTION_NONE
63   ,
64   {
65       (OnigCodePoint )'\\'                       /* esc */
66     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.'  */
67     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*'  */
68     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
69     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
70     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
71   }
72 };
73 
74 OnigSyntaxType*  OnigDefaultSyntax = ONIG_SYNTAX_RUBY;
75 
onig_null_warn(const char * s ARG_UNUSED)76 extern void onig_null_warn(const char* s ARG_UNUSED) { }
77 
78 #ifdef DEFAULT_WARN_FUNCTION
79 static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION;
80 #else
81 static OnigWarnFunc onig_warn = onig_null_warn;
82 #endif
83 
84 #ifdef DEFAULT_VERB_WARN_FUNCTION
85 static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION;
86 #else
87 static OnigWarnFunc onig_verb_warn = onig_null_warn;
88 #endif
89 
onig_set_warn_func(OnigWarnFunc f)90 extern void onig_set_warn_func(OnigWarnFunc f)
91 {
92   onig_warn = f;
93 }
94 
onig_set_verb_warn_func(OnigWarnFunc f)95 extern void onig_set_verb_warn_func(OnigWarnFunc f)
96 {
97   onig_verb_warn = f;
98 }
99 
100 static void
bbuf_free(BBuf * bbuf)101 bbuf_free(BBuf* bbuf)
102 {
103   if (IS_NOT_NULL(bbuf)) {
104     if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p);
105     xfree(bbuf);
106   }
107 }
108 
109 static int
bbuf_clone(BBuf ** rto,BBuf * from)110 bbuf_clone(BBuf** rto, BBuf* from)
111 {
112   int r;
113   BBuf *to;
114 
115   *rto = to = (BBuf* )xmalloc(sizeof(BBuf));
116   CHECK_NULL_RETURN_MEMERR(to);
117   r = BBUF_INIT(to, from->alloc);
118   if (r != 0) return r;
119   to->used = from->used;
120   xmemcpy(to->p, from->p, from->used);
121   return 0;
122 }
123 
124 #define BACKREF_REL_TO_ABS(rel_no, env) \
125   ((env)->num_mem + 1 + (rel_no))
126 
127 #define ONOFF(v,f,negative)    (negative) ? ((v) &= ~(f)) : ((v) |= (f))
128 
129 #define MBCODE_START_POS(enc) \
130   (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 0 : 0x80)
131 
132 #define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \
133   add_code_range_to_buf(pbuf, MBCODE_START_POS(enc), ~((OnigCodePoint )0))
134 
135 #define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\
136   if (! ONIGENC_IS_SINGLEBYTE(enc)) {\
137     r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\
138     if (r) return r;\
139   }\
140 } while (0)
141 
142 
143 #define BITSET_IS_EMPTY(bs,empty) do {\
144   int i;\
145   empty = 1;\
146   for (i = 0; i < (int )BITSET_SIZE; i++) {\
147     if ((bs)[i] != 0) {\
148       empty = 0; break;\
149     }\
150   }\
151 } while (0)
152 
153 static void
bitset_set_range(BitSetRef bs,int from,int to)154 bitset_set_range(BitSetRef bs, int from, int to)
155 {
156   int i;
157   for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) {
158     BITSET_SET_BIT(bs, i);
159   }
160 }
161 
162 #if 0
163 static void
164 bitset_set_all(BitSetRef bs)
165 {
166   int i;
167   for (i = 0; i < BITSET_SIZE; i++) { bs[i] = ~((Bits )0); }
168 }
169 #endif
170 
171 static void
bitset_invert(BitSetRef bs)172 bitset_invert(BitSetRef bs)
173 {
174   int i;
175   for (i = 0; i < (int )BITSET_SIZE; i++) { bs[i] = ~(bs[i]); }
176 }
177 
178 static void
bitset_invert_to(BitSetRef from,BitSetRef to)179 bitset_invert_to(BitSetRef from, BitSetRef to)
180 {
181   int i;
182   for (i = 0; i < (int )BITSET_SIZE; i++) { to[i] = ~(from[i]); }
183 }
184 
185 static void
bitset_and(BitSetRef dest,BitSetRef bs)186 bitset_and(BitSetRef dest, BitSetRef bs)
187 {
188   int i;
189   for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] &= bs[i]; }
190 }
191 
192 static void
bitset_or(BitSetRef dest,BitSetRef bs)193 bitset_or(BitSetRef dest, BitSetRef bs)
194 {
195   int i;
196   for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] |= bs[i]; }
197 }
198 
199 static void
bitset_copy(BitSetRef dest,BitSetRef bs)200 bitset_copy(BitSetRef dest, BitSetRef bs)
201 {
202   int i;
203   for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] = bs[i]; }
204 }
205 
206 extern int
onig_strncmp(const UChar * s1,const UChar * s2,int n)207 onig_strncmp(const UChar* s1, const UChar* s2, int n)
208 {
209   int x;
210 
211   while (n-- > 0) {
212     x = *s2++ - *s1++;
213     if (x) return x;
214   }
215   return 0;
216 }
217 
218 extern void
onig_strcpy(UChar * dest,const UChar * src,const UChar * end)219 onig_strcpy(UChar* dest, const UChar* src, const UChar* end)
220 {
221   int len = end - src;
222   if (len > 0) {
223     xmemcpy(dest, src, len);
224     dest[len] = (UChar )0;
225   }
226 }
227 
228 #ifdef USE_NAMED_GROUP
229 static UChar*
strdup_with_null(OnigEncoding enc,UChar * s,UChar * end)230 strdup_with_null(OnigEncoding enc, UChar* s, UChar* end)
231 {
232   int slen, term_len, i;
233   UChar *r;
234 
235   slen = end - s;
236   term_len = ONIGENC_MBC_MINLEN(enc);
237 
238   r = (UChar* )xmalloc(slen + term_len);
239   CHECK_NULL_RETURN(r);
240   xmemcpy(r, s, slen);
241 
242   for (i = 0; i < term_len; i++)
243     r[slen + i] = (UChar )0;
244 
245   return r;
246 }
247 #endif
248 
249 /* scan pattern methods */
250 #define PEND_VALUE   0
251 
252 #define PFETCH_READY  UChar* pfetch_prev
253 #define PEND         (p < end ?  0 : 1)
254 #define PUNFETCH     p = pfetch_prev
255 #define PINC       do { \
256   pfetch_prev = p; \
257   p += ONIGENC_MBC_ENC_LEN(enc, p); \
258 } while (0)
259 #define PFETCH(c)  do { \
260   c = ONIGENC_MBC_TO_CODE(enc, p, end); \
261   pfetch_prev = p; \
262   p += ONIGENC_MBC_ENC_LEN(enc, p); \
263 } while (0)
264 
265 #define PPEEK        (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)
266 #define PPEEK_IS(c)  (PPEEK == (OnigCodePoint )c)
267 
268 static UChar*
strcat_capa(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa)269 strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end,
270 	      int capa)
271 {
272   UChar* r;
273 
274   if (dest)
275     r = (UChar* )xrealloc(dest, capa + 1);
276   else
277     r = (UChar* )xmalloc(capa + 1);
278 
279   CHECK_NULL_RETURN(r);
280   onig_strcpy(r + (dest_end - dest), src, src_end);
281   return r;
282 }
283 
284 /* dest on static area */
285 static UChar*
strcat_capa_from_static(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa)286 strcat_capa_from_static(UChar* dest, UChar* dest_end,
287 			const UChar* src, const UChar* src_end, int capa)
288 {
289   UChar* r;
290 
291   r = (UChar* )xmalloc(capa + 1);
292   CHECK_NULL_RETURN(r);
293   onig_strcpy(r, dest, dest_end);
294   onig_strcpy(r + (dest_end - dest), src, src_end);
295   return r;
296 }
297 
298 
299 #ifdef USE_ST_LIBRARY
300 
301 typedef struct {
302   UChar* s;
303   UChar* end;
304 } st_str_end_key;
305 
306 static int
str_end_cmp(st_str_end_key * x,st_str_end_key * y)307 str_end_cmp(st_str_end_key* x, st_str_end_key* y)
308 {
309   UChar *p, *q;
310   int c;
311 
312   if ((x->end - x->s) != (y->end - y->s))
313     return 1;
314 
315   p = x->s;
316   q = y->s;
317   while (p < x->end) {
318     c = (int )*p - (int )*q;
319     if (c != 0) return c;
320 
321     p++; q++;
322   }
323 
324   return 0;
325 }
326 
327 static int
str_end_hash(st_str_end_key * x)328 str_end_hash(st_str_end_key* x)
329 {
330   UChar *p;
331   int val = 0;
332 
333   p = x->s;
334   while (p < x->end) {
335     val = val * 997 + (int )*p++;
336   }
337 
338   return val + (val >> 5);
339 }
340 
341 extern hash_table_type*
onig_st_init_strend_table_with_size(int size)342 onig_st_init_strend_table_with_size(int size)
343 {
344   static struct st_hash_type hashType = {
345     str_end_cmp,
346     str_end_hash,
347   };
348 
349   return (hash_table_type* )
350            onig_st_init_table_with_size(&hashType, size);
351 }
352 
353 extern int
onig_st_lookup_strend(hash_table_type * table,const UChar * str_key,const UChar * end_key,hash_data_type * value)354 onig_st_lookup_strend(hash_table_type* table, const UChar* str_key,
355 		      const UChar* end_key, hash_data_type *value)
356 {
357   st_str_end_key key;
358 
359   key.s   = (UChar* )str_key;
360   key.end = (UChar* )end_key;
361 
362   return onig_st_lookup(table, (st_data_t )(&key), value);
363 }
364 
365 extern int
onig_st_insert_strend(hash_table_type * table,const UChar * str_key,const UChar * end_key,hash_data_type value)366 onig_st_insert_strend(hash_table_type* table, const UChar* str_key,
367 		      const UChar* end_key, hash_data_type value)
368 {
369   st_str_end_key* key;
370   int result;
371 
372   key = (st_str_end_key* )xmalloc(sizeof(st_str_end_key));
373   key->s   = (UChar* )str_key;
374   key->end = (UChar* )end_key;
375   result = onig_st_insert(table, (st_data_t )key, value);
376   if (result) {
377     xfree(key);
378   }
379   return result;
380 }
381 
382 #endif /* USE_ST_LIBRARY */
383 
384 
385 #ifdef USE_NAMED_GROUP
386 
387 #define INIT_NAME_BACKREFS_ALLOC_NUM   8
388 
389 typedef struct {
390   UChar* name;
391   int    name_len;   /* byte length */
392   int    back_num;   /* number of backrefs */
393   int    back_alloc;
394   int    back_ref1;
395   int*   back_refs;
396 } NameEntry;
397 
398 #ifdef USE_ST_LIBRARY
399 
400 typedef st_table  NameTable;
401 typedef st_data_t HashDataType;   /* 1.6 st.h doesn't define st_data_t type */
402 
403 #define NAMEBUF_SIZE    24
404 #define NAMEBUF_SIZE_1  25
405 
406 #ifdef ONIG_DEBUG
407 static int
i_print_name_entry(UChar * key,NameEntry * e,void * arg)408 i_print_name_entry(UChar* key, NameEntry* e, void* arg)
409 {
410   int i;
411   FILE* fp = (FILE* )arg;
412 
413   fprintf(fp, "%s: ", e->name);
414   if (e->back_num == 0)
415     fputs("-", fp);
416   else if (e->back_num == 1)
417     fprintf(fp, "%d", e->back_ref1);
418   else {
419     for (i = 0; i < e->back_num; i++) {
420       if (i > 0) fprintf(fp, ", ");
421       fprintf(fp, "%d", e->back_refs[i]);
422     }
423   }
424   fputs("\n", fp);
425   return ST_CONTINUE;
426 }
427 
428 extern int
onig_print_names(FILE * fp,regex_t * reg)429 onig_print_names(FILE* fp, regex_t* reg)
430 {
431   NameTable* t = (NameTable* )reg->name_table;
432 
433   if (IS_NOT_NULL(t)) {
434     fprintf(fp, "name table\n");
435     onig_st_foreach(t, i_print_name_entry, (HashDataType )fp);
436     fputs("\n", fp);
437   }
438   return 0;
439 }
440 #endif /* ONIG_DEBUG */
441 
442 static int
i_free_name_entry(UChar * key,NameEntry * e,void * arg ARG_UNUSED)443 i_free_name_entry(UChar* key, NameEntry* e, void* arg ARG_UNUSED)
444 {
445   xfree(e->name);
446   if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
447   xfree(key);
448   xfree(e);
449   return ST_DELETE;
450 }
451 
452 static int
names_clear(regex_t * reg)453 names_clear(regex_t* reg)
454 {
455   NameTable* t = (NameTable* )reg->name_table;
456 
457   if (IS_NOT_NULL(t)) {
458     onig_st_foreach(t, i_free_name_entry, 0);
459   }
460   return 0;
461 }
462 
463 extern int
onig_names_free(regex_t * reg)464 onig_names_free(regex_t* reg)
465 {
466   int r;
467   NameTable* t;
468 
469   r = names_clear(reg);
470   if (r) return r;
471 
472   t = (NameTable* )reg->name_table;
473   if (IS_NOT_NULL(t)) onig_st_free_table(t);
474   reg->name_table = (void* )NULL;
475   return 0;
476 }
477 
478 static NameEntry*
name_find(regex_t * reg,const UChar * name,const UChar * name_end)479 name_find(regex_t* reg, const UChar* name, const UChar* name_end)
480 {
481   NameEntry* e;
482   NameTable* t = (NameTable* )reg->name_table;
483 
484   e = (NameEntry* )NULL;
485   if (IS_NOT_NULL(t)) {
486     onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e)));
487   }
488   return e;
489 }
490 
491 typedef struct {
492   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*);
493   regex_t* reg;
494   void* arg;
495   int ret;
496   OnigEncoding enc;
497 } INamesArg;
498 
499 static int
i_names(UChar * key ARG_UNUSED,NameEntry * e,INamesArg * arg)500 i_names(UChar* key ARG_UNUSED, NameEntry* e, INamesArg* arg)
501 {
502   int r = (*(arg->func))(e->name,
503                          e->name + e->name_len,
504                          e->back_num,
505 			 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
506 			 arg->reg, arg->arg);
507   if (r != 0) {
508     arg->ret = r;
509     return ST_STOP;
510   }
511   return ST_CONTINUE;
512 }
513 
514 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)515 onig_foreach_name(regex_t* reg,
516   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
517 {
518   INamesArg narg;
519   NameTable* t = (NameTable* )reg->name_table;
520 
521   narg.ret = 0;
522   if (IS_NOT_NULL(t)) {
523     narg.func = func;
524     narg.reg  = reg;
525     narg.arg  = arg;
526     narg.enc  = reg->enc; /* should be pattern encoding. */
527     onig_st_foreach(t, i_names, (HashDataType )&narg);
528   }
529   return narg.ret;
530 }
531 
532 static int
i_renumber_name(UChar * key ARG_UNUSED,NameEntry * e,GroupNumRemap * map)533 i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumRemap* map)
534 {
535   int i;
536 
537   if (e->back_num > 1) {
538     for (i = 0; i < e->back_num; i++) {
539       e->back_refs[i] = map[e->back_refs[i]].new_val;
540     }
541   }
542   else if (e->back_num == 1) {
543     e->back_ref1 = map[e->back_ref1].new_val;
544   }
545 
546   return ST_CONTINUE;
547 }
548 
549 extern int
onig_renumber_name_table(regex_t * reg,GroupNumRemap * map)550 onig_renumber_name_table(regex_t* reg, GroupNumRemap* map)
551 {
552   NameTable* t = (NameTable* )reg->name_table;
553 
554   if (IS_NOT_NULL(t)) {
555     onig_st_foreach(t, i_renumber_name, (HashDataType )map);
556   }
557   return 0;
558 }
559 
560 
561 extern int
onig_number_of_names(regex_t * reg)562 onig_number_of_names(regex_t* reg)
563 {
564   NameTable* t = (NameTable* )reg->name_table;
565 
566   if (IS_NOT_NULL(t))
567     return t->num_entries;
568   else
569     return 0;
570 }
571 
572 #else  /* USE_ST_LIBRARY */
573 
574 #define INIT_NAMES_ALLOC_NUM    8
575 
576 typedef struct {
577   NameEntry* e;
578   int        num;
579   int        alloc;
580 } NameTable;
581 
582 #ifdef ONIG_DEBUG
583 extern int
onig_print_names(FILE * fp,regex_t * reg)584 onig_print_names(FILE* fp, regex_t* reg)
585 {
586   int i, j;
587   NameEntry* e;
588   NameTable* t = (NameTable* )reg->name_table;
589 
590   if (IS_NOT_NULL(t) && t->num > 0) {
591     fprintf(fp, "name table\n");
592     for (i = 0; i < t->num; i++) {
593       e = &(t->e[i]);
594       fprintf(fp, "%s: ", e->name);
595       if (e->back_num == 0) {
596 	fputs("-", fp);
597       }
598       else if (e->back_num == 1) {
599 	fprintf(fp, "%d", e->back_ref1);
600       }
601       else {
602 	for (j = 0; j < e->back_num; j++) {
603 	  if (j > 0) fprintf(fp, ", ");
604 	  fprintf(fp, "%d", e->back_refs[j]);
605 	}
606       }
607       fputs("\n", fp);
608     }
609     fputs("\n", fp);
610   }
611   return 0;
612 }
613 #endif
614 
615 static int
names_clear(regex_t * reg)616 names_clear(regex_t* reg)
617 {
618   int i;
619   NameEntry* e;
620   NameTable* t = (NameTable* )reg->name_table;
621 
622   if (IS_NOT_NULL(t)) {
623     for (i = 0; i < t->num; i++) {
624       e = &(t->e[i]);
625       if (IS_NOT_NULL(e->name)) {
626 	xfree(e->name);
627 	e->name       = NULL;
628 	e->name_len   = 0;
629 	e->back_num   = 0;
630 	e->back_alloc = 0;
631 	if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
632 	e->back_refs = (int* )NULL;
633       }
634     }
635     if (IS_NOT_NULL(t->e)) {
636       xfree(t->e);
637       t->e = NULL;
638     }
639     t->num = 0;
640   }
641   return 0;
642 }
643 
644 extern int
onig_names_free(regex_t * reg)645 onig_names_free(regex_t* reg)
646 {
647   int r;
648   NameTable* t;
649 
650   r = names_clear(reg);
651   if (r) return r;
652 
653   t = (NameTable* )reg->name_table;
654   if (IS_NOT_NULL(t)) xfree(t);
655   reg->name_table = NULL;
656   return 0;
657 }
658 
659 static NameEntry*
name_find(regex_t * reg,UChar * name,UChar * name_end)660 name_find(regex_t* reg, UChar* name, UChar* name_end)
661 {
662   int i, len;
663   NameEntry* e;
664   NameTable* t = (NameTable* )reg->name_table;
665 
666   if (IS_NOT_NULL(t)) {
667     len = name_end - name;
668     for (i = 0; i < t->num; i++) {
669       e = &(t->e[i]);
670       if (len == e->name_len && onig_strncmp(name, e->name, len) == 0)
671 	return e;
672     }
673   }
674   return (NameEntry* )NULL;
675 }
676 
677 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)678 onig_foreach_name(regex_t* reg,
679   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
680 {
681   int i, r;
682   NameEntry* e;
683   NameTable* t = (NameTable* )reg->name_table;
684 
685   if (IS_NOT_NULL(t)) {
686     for (i = 0; i < t->num; i++) {
687       e = &(t->e[i]);
688       r = (*func)(e->name, e->name + e->name_len, e->back_num,
689 		  (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
690 		  reg, arg);
691       if (r != 0) return r;
692     }
693   }
694   return 0;
695 }
696 
697 extern int
onig_number_of_names(regex_t * reg)698 onig_number_of_names(regex_t* reg)
699 {
700   NameTable* t = (NameTable* )reg->name_table;
701 
702   if (IS_NOT_NULL(t))
703     return t->num;
704   else
705     return 0;
706 }
707 
708 #endif /* else USE_ST_LIBRARY */
709 
710 static int
name_add(regex_t * reg,UChar * name,UChar * name_end,int backref,ScanEnv * env)711 name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env)
712 {
713   int alloc;
714   NameEntry* e;
715   NameTable* t = (NameTable* )reg->name_table;
716 
717   if (name_end - name <= 0)
718     return ONIGERR_EMPTY_GROUP_NAME;
719 
720   e = name_find(reg, name, name_end);
721   if (IS_NULL(e)) {
722 #ifdef USE_ST_LIBRARY
723     if (IS_NULL(t)) {
724       t = onig_st_init_strend_table_with_size(5);
725       reg->name_table = (void* )t;
726     }
727     e = (NameEntry* )xmalloc(sizeof(NameEntry));
728     CHECK_NULL_RETURN_MEMERR(e);
729 
730     e->name = strdup_with_null(reg->enc, name, name_end);
731     if (IS_NULL(e->name)) {
732       xfree(e);  return ONIGERR_MEMORY;
733     }
734     onig_st_insert_strend(t, e->name, (e->name + (name_end - name)),
735                           (HashDataType )e);
736 
737     e->name_len   = name_end - name;
738     e->back_num   = 0;
739     e->back_alloc = 0;
740     e->back_refs  = (int* )NULL;
741 
742 #else
743 
744     if (IS_NULL(t)) {
745       alloc = INIT_NAMES_ALLOC_NUM;
746       t = (NameTable* )xmalloc(sizeof(NameTable));
747       CHECK_NULL_RETURN_MEMERR(t);
748       t->e     = NULL;
749       t->alloc = 0;
750       t->num   = 0;
751 
752       t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc);
753       if (IS_NULL(t->e)) {
754 	xfree(t);
755 	return ONIGERR_MEMORY;
756       }
757       t->alloc = alloc;
758       reg->name_table = t;
759       goto clear;
760     }
761     else if (t->num == t->alloc) {
762       int i;
763 
764       alloc = t->alloc * 2;
765       t->e = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc);
766       CHECK_NULL_RETURN_MEMERR(t->e);
767       t->alloc = alloc;
768 
769     clear:
770       for (i = t->num; i < t->alloc; i++) {
771 	t->e[i].name       = NULL;
772 	t->e[i].name_len   = 0;
773 	t->e[i].back_num   = 0;
774 	t->e[i].back_alloc = 0;
775 	t->e[i].back_refs  = (int* )NULL;
776       }
777     }
778     e = &(t->e[t->num]);
779     t->num++;
780     e->name = strdup_with_null(reg->enc, name, name_end);
781     if (IS_NULL(e->name)) return ONIGERR_MEMORY;
782     e->name_len = name_end - name;
783 #endif
784   }
785 
786   if (e->back_num >= 1 &&
787       ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) {
788     onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME,
789 				    name, name_end);
790     return ONIGERR_MULTIPLEX_DEFINED_NAME;
791   }
792 
793   e->back_num++;
794   if (e->back_num == 1) {
795     e->back_ref1 = backref;
796   }
797   else {
798     if (e->back_num == 2) {
799       alloc = INIT_NAME_BACKREFS_ALLOC_NUM;
800       e->back_refs = (int* )xmalloc(sizeof(int) * alloc);
801       CHECK_NULL_RETURN_MEMERR(e->back_refs);
802       e->back_alloc = alloc;
803       e->back_refs[0] = e->back_ref1;
804       e->back_refs[1] = backref;
805     }
806     else {
807       if (e->back_num > e->back_alloc) {
808 	alloc = e->back_alloc * 2;
809 	e->back_refs = (int* )xrealloc(e->back_refs, sizeof(int) * alloc);
810 	CHECK_NULL_RETURN_MEMERR(e->back_refs);
811 	e->back_alloc = alloc;
812       }
813       e->back_refs[e->back_num - 1] = backref;
814     }
815   }
816 
817   return 0;
818 }
819 
820 extern int
onig_name_to_group_numbers(regex_t * reg,const UChar * name,const UChar * name_end,int ** nums)821 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
822 			   const UChar* name_end, int** nums)
823 {
824   NameEntry* e = name_find(reg, name, name_end);
825 
826   if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE;
827 
828   switch (e->back_num) {
829   case 0:
830     break;
831   case 1:
832     *nums = &(e->back_ref1);
833     break;
834   default:
835     *nums = e->back_refs;
836     break;
837   }
838   return e->back_num;
839 }
840 
841 extern int
onig_name_to_backref_number(regex_t * reg,const UChar * name,const UChar * name_end,OnigRegion * region)842 onig_name_to_backref_number(regex_t* reg, const UChar* name,
843 			    const UChar* name_end, OnigRegion *region)
844 {
845   int i, n, *nums;
846 
847   n = onig_name_to_group_numbers(reg, name, name_end, &nums);
848   if (n < 0)
849     return n;
850   else if (n == 0)
851     return ONIGERR_PARSER_BUG;
852   else if (n == 1)
853     return nums[0];
854   else {
855     if (IS_NOT_NULL(region)) {
856       for (i = n - 1; i >= 0; i--) {
857 	if (region->beg[nums[i]] != ONIG_REGION_NOTPOS)
858 	  return nums[i];
859       }
860     }
861     return nums[n - 1];
862   }
863 }
864 
865 #else /* USE_NAMED_GROUP */
866 
867 extern int
onig_name_to_group_numbers(regex_t * reg,const UChar * name,const UChar * name_end,int ** nums)868 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
869 			   const UChar* name_end, int** nums)
870 {
871   return ONIG_NO_SUPPORT_CONFIG;
872 }
873 
874 extern int
onig_name_to_backref_number(regex_t * reg,const UChar * name,const UChar * name_end,OnigRegion * region)875 onig_name_to_backref_number(regex_t* reg, const UChar* name,
876 			    const UChar* name_end, OnigRegion* region)
877 {
878   return ONIG_NO_SUPPORT_CONFIG;
879 }
880 
881 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)882 onig_foreach_name(regex_t* reg,
883   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
884 {
885   return ONIG_NO_SUPPORT_CONFIG;
886 }
887 
888 extern int
onig_number_of_names(regex_t * reg)889 onig_number_of_names(regex_t* reg)
890 {
891   return 0;
892 }
893 #endif /* else USE_NAMED_GROUP */
894 
895 extern int
onig_noname_group_capture_is_active(regex_t * reg)896 onig_noname_group_capture_is_active(regex_t* reg)
897 {
898   if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_DONT_CAPTURE_GROUP))
899     return 0;
900 
901 #ifdef USE_NAMED_GROUP
902   if (onig_number_of_names(reg) > 0 &&
903       IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
904       !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
905     return 0;
906   }
907 #endif
908 
909   return 1;
910 }
911 
912 
913 #define INIT_SCANENV_MEMNODES_ALLOC_SIZE   16
914 
915 static void
scan_env_clear(ScanEnv * env)916 scan_env_clear(ScanEnv* env)
917 {
918   int i;
919 
920   BIT_STATUS_CLEAR(env->capture_history);
921   BIT_STATUS_CLEAR(env->bt_mem_start);
922   BIT_STATUS_CLEAR(env->bt_mem_end);
923   BIT_STATUS_CLEAR(env->backrefed_mem);
924   env->error      = (UChar* )NULL;
925   env->error_end  = (UChar* )NULL;
926   env->num_call   = 0;
927   env->num_mem    = 0;
928 #ifdef USE_NAMED_GROUP
929   env->num_named  = 0;
930 #endif
931   env->mem_alloc         = 0;
932   env->mem_nodes_dynamic = (Node** )NULL;
933 
934   for (i = 0; i < SCANENV_MEMNODES_SIZE; i++)
935     env->mem_nodes_static[i] = NULL_NODE;
936 
937 #ifdef USE_COMBINATION_EXPLOSION_CHECK
938   env->num_comb_exp_check  = 0;
939   env->comb_exp_max_regnum = 0;
940   env->curr_max_regnum     = 0;
941   env->has_recursion       = 0;
942 #endif
943 }
944 
945 static int
scan_env_add_mem_entry(ScanEnv * env)946 scan_env_add_mem_entry(ScanEnv* env)
947 {
948   int i, need, alloc;
949   Node** p;
950 
951   need = env->num_mem + 1;
952   if (need >= SCANENV_MEMNODES_SIZE) {
953     if (env->mem_alloc <= need) {
954       if (IS_NULL(env->mem_nodes_dynamic)) {
955 	alloc = INIT_SCANENV_MEMNODES_ALLOC_SIZE;
956 	p = (Node** )xmalloc(sizeof(Node*) * alloc);
957 	xmemcpy(p, env->mem_nodes_static,
958 		sizeof(Node*) * SCANENV_MEMNODES_SIZE);
959       }
960       else {
961 	alloc = env->mem_alloc * 2;
962 	p = (Node** )xrealloc(env->mem_nodes_dynamic, sizeof(Node*) * alloc);
963       }
964       CHECK_NULL_RETURN_MEMERR(p);
965 
966       for (i = env->num_mem + 1; i < alloc; i++)
967 	p[i] = NULL_NODE;
968 
969       env->mem_nodes_dynamic = p;
970       env->mem_alloc = alloc;
971     }
972   }
973 
974   env->num_mem++;
975   return env->num_mem;
976 }
977 
978 static int
scan_env_set_mem_node(ScanEnv * env,int num,Node * node)979 scan_env_set_mem_node(ScanEnv* env, int num, Node* node)
980 {
981   if (env->num_mem >= num)
982     SCANENV_MEM_NODES(env)[num] = node;
983   else
984     return ONIGERR_PARSER_BUG;
985   return 0;
986 }
987 
988 
989 #ifdef USE_PARSE_TREE_NODE_RECYCLE
990 typedef struct _FreeNode {
991   struct _FreeNode* next;
992 } FreeNode;
993 
994 static FreeNode* FreeNodeList = (FreeNode* )NULL;
995 #endif
996 
997 extern void
onig_node_free(Node * node)998 onig_node_free(Node* node)
999 {
1000  start:
1001   if (IS_NULL(node)) return ;
1002 
1003   switch (NTYPE(node)) {
1004   case NT_STR:
1005     if (NSTR(node)->capa != 0 &&
1006 	IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
1007       xfree(NSTR(node)->s);
1008     }
1009     break;
1010 
1011   case NT_LIST:
1012   case NT_ALT:
1013     onig_node_free(NCAR(node));
1014     {
1015       Node* next_node = NCDR(node);
1016 
1017 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1018       {
1019 	FreeNode* n = (FreeNode* )node;
1020 
1021         THREAD_ATOMIC_START;
1022 	n->next = FreeNodeList;
1023 	FreeNodeList = n;
1024         THREAD_ATOMIC_END;
1025       }
1026 #else
1027       xfree(node);
1028 #endif
1029       node = next_node;
1030       goto start;
1031     }
1032     break;
1033 
1034   case NT_CCLASS:
1035     {
1036       CClassNode* cc = NCCLASS(node);
1037 
1038       if (IS_NCCLASS_SHARE(cc)) return ;
1039       if (cc->mbuf)
1040         bbuf_free(cc->mbuf);
1041     }
1042     break;
1043 
1044   case NT_QTFR:
1045     if (NQTFR(node)->target)
1046       onig_node_free(NQTFR(node)->target);
1047     break;
1048 
1049   case NT_ENCLOSE:
1050     if (NENCLOSE(node)->target)
1051       onig_node_free(NENCLOSE(node)->target);
1052     break;
1053 
1054   case NT_BREF:
1055     if (IS_NOT_NULL(NBREF(node)->back_dynamic))
1056       xfree(NBREF(node)->back_dynamic);
1057     break;
1058 
1059   case NT_ANCHOR:
1060     if (NANCHOR(node)->target)
1061       onig_node_free(NANCHOR(node)->target);
1062     break;
1063   }
1064 
1065 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1066   {
1067     FreeNode* n = (FreeNode* )node;
1068 
1069     THREAD_ATOMIC_START;
1070     n->next = FreeNodeList;
1071     FreeNodeList = n;
1072     THREAD_ATOMIC_END;
1073   }
1074 #else
1075   xfree(node);
1076 #endif
1077 }
1078 
1079 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1080 extern int
onig_free_node_list(void)1081 onig_free_node_list(void)
1082 {
1083   FreeNode* n;
1084 
1085   /* THREAD_ATOMIC_START; */
1086   while (IS_NOT_NULL(FreeNodeList)) {
1087     n = FreeNodeList;
1088     FreeNodeList = FreeNodeList->next;
1089     xfree(n);
1090   }
1091   /* THREAD_ATOMIC_END; */
1092   return 0;
1093 }
1094 #endif
1095 
1096 static Node*
node_new(void)1097 node_new(void)
1098 {
1099   Node* node;
1100 
1101 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1102   THREAD_ATOMIC_START;
1103   if (IS_NOT_NULL(FreeNodeList)) {
1104     node = (Node* )FreeNodeList;
1105     FreeNodeList = FreeNodeList->next;
1106     THREAD_ATOMIC_END;
1107     return node;
1108   }
1109   THREAD_ATOMIC_END;
1110 #endif
1111 
1112   node = (Node* )xmalloc(sizeof(Node));
1113   /* xmemset(node, 0, sizeof(Node)); */
1114   return node;
1115 }
1116 
1117 
1118 static void
initialize_cclass(CClassNode * cc)1119 initialize_cclass(CClassNode* cc)
1120 {
1121   BITSET_CLEAR(cc->bs);
1122   /* cc->base.flags = 0; */
1123   cc->flags = 0;
1124   cc->mbuf  = NULL;
1125 }
1126 
1127 static Node*
node_new_cclass(void)1128 node_new_cclass(void)
1129 {
1130   Node* node = node_new();
1131   CHECK_NULL_RETURN(node);
1132 
1133   SET_NTYPE(node, NT_CCLASS);
1134   initialize_cclass(NCCLASS(node));
1135   return node;
1136 }
1137 
1138 static Node*
node_new_cclass_by_codepoint_range(int not,OnigCodePoint sb_out,const OnigCodePoint ranges[])1139 node_new_cclass_by_codepoint_range(int not, OnigCodePoint sb_out,
1140 				   const OnigCodePoint ranges[])
1141 {
1142   int n, i;
1143   CClassNode* cc;
1144   OnigCodePoint j;
1145 
1146   Node* node = node_new_cclass();
1147   CHECK_NULL_RETURN(node);
1148 
1149   cc = NCCLASS(node);
1150   if (not != 0) NCCLASS_SET_NOT(cc);
1151 
1152   BITSET_CLEAR(cc->bs);
1153   if (sb_out > 0 && IS_NOT_NULL(ranges)) {
1154     n = ONIGENC_CODE_RANGE_NUM(ranges);
1155     for (i = 0; i < n; i++) {
1156       for (j  = ONIGENC_CODE_RANGE_FROM(ranges, i);
1157            j <= (OnigCodePoint )ONIGENC_CODE_RANGE_TO(ranges, i); j++) {
1158 	if (j >= sb_out) goto sb_end;
1159 
1160         BITSET_SET_BIT(cc->bs, j);
1161       }
1162     }
1163   }
1164 
1165  sb_end:
1166   if (IS_NULL(ranges)) {
1167   is_null:
1168     cc->mbuf = NULL;
1169   }
1170   else {
1171     BBuf* bbuf;
1172 
1173     n = ONIGENC_CODE_RANGE_NUM(ranges);
1174     if (n == 0) goto is_null;
1175 
1176     bbuf = (BBuf* )xmalloc(sizeof(BBuf));
1177     CHECK_NULL_RETURN(bbuf);
1178     bbuf->alloc = n + 1;
1179     bbuf->used  = n + 1;
1180     bbuf->p     = (UChar* )((void* )ranges);
1181 
1182     cc->mbuf = bbuf;
1183   }
1184 
1185   return node;
1186 }
1187 
1188 static Node*
node_new_ctype(int type,int not)1189 node_new_ctype(int type, int not)
1190 {
1191   Node* node = node_new();
1192   CHECK_NULL_RETURN(node);
1193 
1194   SET_NTYPE(node, NT_CTYPE);
1195   NCTYPE(node)->ctype = type;
1196   NCTYPE(node)->not   = not;
1197   return node;
1198 }
1199 
1200 static Node*
node_new_anychar(void)1201 node_new_anychar(void)
1202 {
1203   Node* node = node_new();
1204   CHECK_NULL_RETURN(node);
1205 
1206   SET_NTYPE(node, NT_CANY);
1207   return node;
1208 }
1209 
1210 static Node*
node_new_list(Node * left,Node * right)1211 node_new_list(Node* left, Node* right)
1212 {
1213   Node* node = node_new();
1214   CHECK_NULL_RETURN(node);
1215 
1216   SET_NTYPE(node, NT_LIST);
1217   NCAR(node)  = left;
1218   NCDR(node) = right;
1219   return node;
1220 }
1221 
1222 extern Node*
onig_node_new_list(Node * left,Node * right)1223 onig_node_new_list(Node* left, Node* right)
1224 {
1225   return node_new_list(left, right);
1226 }
1227 
1228 extern Node*
onig_node_list_add(Node * list,Node * x)1229 onig_node_list_add(Node* list, Node* x)
1230 {
1231   Node *n;
1232 
1233   n = onig_node_new_list(x, NULL);
1234   if (IS_NULL(n)) return NULL_NODE;
1235 
1236   if (IS_NOT_NULL(list)) {
1237     while (IS_NOT_NULL(NCDR(list)))
1238       list = NCDR(list);
1239 
1240     NCDR(list) = n;
1241   }
1242 
1243   return n;
1244 }
1245 
1246 extern Node*
onig_node_new_alt(Node * left,Node * right)1247 onig_node_new_alt(Node* left, Node* right)
1248 {
1249   Node* node = node_new();
1250   CHECK_NULL_RETURN(node);
1251 
1252   SET_NTYPE(node, NT_ALT);
1253   NCAR(node)  = left;
1254   NCDR(node) = right;
1255   return node;
1256 }
1257 
1258 extern Node*
onig_node_new_anchor(int type)1259 onig_node_new_anchor(int type)
1260 {
1261   Node* node = node_new();
1262   CHECK_NULL_RETURN(node);
1263 
1264   SET_NTYPE(node, NT_ANCHOR);
1265   NANCHOR(node)->type     = type;
1266   NANCHOR(node)->target   = NULL;
1267   NANCHOR(node)->char_len = -1;
1268   return node;
1269 }
1270 
1271 static Node*
node_new_backref(int back_num,int * backrefs,int by_name,int exist_level,int nest_level,ScanEnv * env)1272 node_new_backref(int back_num, int* backrefs, int by_name,
1273 #ifdef USE_BACKREF_WITH_LEVEL
1274 		 int exist_level, int nest_level,
1275 #endif
1276 		 ScanEnv* env)
1277 {
1278   int i;
1279   Node* node = node_new();
1280 
1281   CHECK_NULL_RETURN(node);
1282 
1283   SET_NTYPE(node, NT_BREF);
1284   NBREF(node)->state    = 0;
1285   NBREF(node)->back_num = back_num;
1286   NBREF(node)->back_dynamic = (int* )NULL;
1287   if (by_name != 0)
1288     NBREF(node)->state |= NST_NAME_REF;
1289 
1290 #ifdef USE_BACKREF_WITH_LEVEL
1291   if (exist_level != 0) {
1292     NBREF(node)->state |= NST_NEST_LEVEL;
1293     NBREF(node)->nest_level  = nest_level;
1294   }
1295 #endif
1296 
1297   for (i = 0; i < back_num; i++) {
1298     if (backrefs[i] <= env->num_mem &&
1299 	IS_NULL(SCANENV_MEM_NODES(env)[backrefs[i]])) {
1300       NBREF(node)->state |= NST_RECURSION;   /* /...(\1).../ */
1301       break;
1302     }
1303   }
1304 
1305   if (back_num <= NODE_BACKREFS_SIZE) {
1306     for (i = 0; i < back_num; i++)
1307       NBREF(node)->back_static[i] = backrefs[i];
1308   }
1309   else {
1310     int* p = (int* )xmalloc(sizeof(int) * back_num);
1311     if (IS_NULL(p)) {
1312       onig_node_free(node);
1313       return NULL;
1314     }
1315     NBREF(node)->back_dynamic = p;
1316     for (i = 0; i < back_num; i++)
1317       p[i] = backrefs[i];
1318   }
1319   return node;
1320 }
1321 
1322 #ifdef USE_SUBEXP_CALL
1323 static Node*
node_new_call(UChar * name,UChar * name_end,int gnum)1324 node_new_call(UChar* name, UChar* name_end, int gnum)
1325 {
1326   Node* node = node_new();
1327   CHECK_NULL_RETURN(node);
1328 
1329   SET_NTYPE(node, NT_CALL);
1330   NCALL(node)->state     = 0;
1331   NCALL(node)->target    = NULL_NODE;
1332   NCALL(node)->name      = name;
1333   NCALL(node)->name_end  = name_end;
1334   NCALL(node)->group_num = gnum;  /* call by number if gnum != 0 */
1335   return node;
1336 }
1337 #endif
1338 
1339 static Node*
node_new_quantifier(int lower,int upper,int by_number)1340 node_new_quantifier(int lower, int upper, int by_number)
1341 {
1342   Node* node = node_new();
1343   CHECK_NULL_RETURN(node);
1344 
1345   SET_NTYPE(node, NT_QTFR);
1346   NQTFR(node)->state  = 0;
1347   NQTFR(node)->target = NULL;
1348   NQTFR(node)->lower  = lower;
1349   NQTFR(node)->upper  = upper;
1350   NQTFR(node)->greedy = 1;
1351   NQTFR(node)->target_empty_info = NQ_TARGET_ISNOT_EMPTY;
1352   NQTFR(node)->head_exact        = NULL_NODE;
1353   NQTFR(node)->next_head_exact   = NULL_NODE;
1354   NQTFR(node)->is_refered        = 0;
1355   if (by_number != 0)
1356     NQTFR(node)->state |= NST_BY_NUMBER;
1357 
1358 #ifdef USE_COMBINATION_EXPLOSION_CHECK
1359   NQTFR(node)->comb_exp_check_num = 0;
1360 #endif
1361 
1362   return node;
1363 }
1364 
1365 static Node*
node_new_enclose(int type)1366 node_new_enclose(int type)
1367 {
1368   Node* node = node_new();
1369   CHECK_NULL_RETURN(node);
1370 
1371   SET_NTYPE(node, NT_ENCLOSE);
1372   NENCLOSE(node)->type      = type;
1373   NENCLOSE(node)->state     =  0;
1374   NENCLOSE(node)->regnum    =  0;
1375   NENCLOSE(node)->option    =  0;
1376   NENCLOSE(node)->target    = NULL;
1377   NENCLOSE(node)->call_addr = -1;
1378   NENCLOSE(node)->opt_count =  0;
1379   return node;
1380 }
1381 
1382 extern Node*
onig_node_new_enclose(int type)1383 onig_node_new_enclose(int type)
1384 {
1385   return node_new_enclose(type);
1386 }
1387 
1388 static Node*
node_new_enclose_memory(OnigOptionType option,int is_named)1389 node_new_enclose_memory(OnigOptionType option, int is_named)
1390 {
1391   Node* node = node_new_enclose(ENCLOSE_MEMORY);
1392   CHECK_NULL_RETURN(node);
1393   if (is_named != 0)
1394     SET_ENCLOSE_STATUS(node, NST_NAMED_GROUP);
1395 
1396 #ifdef USE_SUBEXP_CALL
1397   NENCLOSE(node)->option = option;
1398 #endif
1399   return node;
1400 }
1401 
1402 static Node*
node_new_option(OnigOptionType option)1403 node_new_option(OnigOptionType option)
1404 {
1405   Node* node = node_new_enclose(ENCLOSE_OPTION);
1406   CHECK_NULL_RETURN(node);
1407   NENCLOSE(node)->option = option;
1408   return node;
1409 }
1410 
1411 extern int
onig_node_str_cat(Node * node,const UChar * s,const UChar * end)1412 onig_node_str_cat(Node* node, const UChar* s, const UChar* end)
1413 {
1414   int addlen = end - s;
1415 
1416   if (addlen > 0) {
1417     int len  = NSTR(node)->end - NSTR(node)->s;
1418 
1419     if (NSTR(node)->capa > 0 || (len + addlen > NODE_STR_BUF_SIZE - 1)) {
1420       UChar* p;
1421       int capa = len + addlen + NODE_STR_MARGIN;
1422 
1423       if (capa <= NSTR(node)->capa) {
1424 	onig_strcpy(NSTR(node)->s + len, s, end);
1425       }
1426       else {
1427 	if (NSTR(node)->s == NSTR(node)->buf)
1428 	  p = strcat_capa_from_static(NSTR(node)->s, NSTR(node)->end,
1429 				      s, end, capa);
1430 	else
1431 	  p = strcat_capa(NSTR(node)->s, NSTR(node)->end, s, end, capa);
1432 
1433 	CHECK_NULL_RETURN_MEMERR(p);
1434 	NSTR(node)->s    = p;
1435 	NSTR(node)->capa = capa;
1436       }
1437     }
1438     else {
1439       onig_strcpy(NSTR(node)->s + len, s, end);
1440     }
1441     NSTR(node)->end = NSTR(node)->s + len + addlen;
1442   }
1443 
1444   return 0;
1445 }
1446 
1447 extern int
onig_node_str_set(Node * node,const UChar * s,const UChar * end)1448 onig_node_str_set(Node* node, const UChar* s, const UChar* end)
1449 {
1450   onig_node_str_clear(node);
1451   return onig_node_str_cat(node, s, end);
1452 }
1453 
1454 static int
node_str_cat_char(Node * node,UChar c)1455 node_str_cat_char(Node* node, UChar c)
1456 {
1457   UChar s[1];
1458 
1459   s[0] = c;
1460   return onig_node_str_cat(node, s, s + 1);
1461 }
1462 
1463 extern void
onig_node_conv_to_str_node(Node * node,int flag)1464 onig_node_conv_to_str_node(Node* node, int flag)
1465 {
1466   SET_NTYPE(node, NT_STR);
1467   NSTR(node)->flag = flag;
1468   NSTR(node)->capa = 0;
1469   NSTR(node)->s    = NSTR(node)->buf;
1470   NSTR(node)->end  = NSTR(node)->buf;
1471 }
1472 
1473 extern void
onig_node_str_clear(Node * node)1474 onig_node_str_clear(Node* node)
1475 {
1476   if (NSTR(node)->capa != 0 &&
1477       IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
1478     xfree(NSTR(node)->s);
1479   }
1480 
1481   NSTR(node)->capa = 0;
1482   NSTR(node)->flag = 0;
1483   NSTR(node)->s    = NSTR(node)->buf;
1484   NSTR(node)->end  = NSTR(node)->buf;
1485 }
1486 
1487 static Node*
node_new_str(const UChar * s,const UChar * end)1488 node_new_str(const UChar* s, const UChar* end)
1489 {
1490   Node* node = node_new();
1491   CHECK_NULL_RETURN(node);
1492 
1493   SET_NTYPE(node, NT_STR);
1494   NSTR(node)->capa = 0;
1495   NSTR(node)->flag = 0;
1496   NSTR(node)->s    = NSTR(node)->buf;
1497   NSTR(node)->end  = NSTR(node)->buf;
1498   if (onig_node_str_cat(node, s, end)) {
1499     onig_node_free(node);
1500     return NULL;
1501   }
1502   return node;
1503 }
1504 
1505 extern Node*
onig_node_new_str(const UChar * s,const UChar * end)1506 onig_node_new_str(const UChar* s, const UChar* end)
1507 {
1508   return node_new_str(s, end);
1509 }
1510 
1511 static Node*
node_new_str_raw(UChar * s,UChar * end)1512 node_new_str_raw(UChar* s, UChar* end)
1513 {
1514   Node* node = node_new_str(s, end);
1515   NSTRING_SET_RAW(node);
1516   return node;
1517 }
1518 
1519 static Node*
node_new_empty(void)1520 node_new_empty(void)
1521 {
1522   return node_new_str(NULL, NULL);
1523 }
1524 
1525 static Node*
node_new_str_raw_char(UChar c)1526 node_new_str_raw_char(UChar c)
1527 {
1528   UChar p[1];
1529 
1530   p[0] = c;
1531   return node_new_str_raw(p, p + 1);
1532 }
1533 
1534 static Node*
str_node_split_last_char(StrNode * sn,OnigEncoding enc)1535 str_node_split_last_char(StrNode* sn, OnigEncoding enc)
1536 {
1537   const UChar *p;
1538   Node* n = NULL_NODE;
1539 
1540   if (sn->end > sn->s) {
1541     p = onigenc_get_prev_char_head(enc, sn->s, sn->end);
1542     if (p && p > sn->s) { /* can be splitted. */
1543       n = node_new_str(p, sn->end);
1544       if ((sn->flag & NSTR_RAW) != 0)
1545 	NSTRING_SET_RAW(n);
1546       sn->end = (UChar* )p;
1547     }
1548   }
1549   return n;
1550 }
1551 
1552 static int
str_node_can_be_split(StrNode * sn,OnigEncoding enc)1553 str_node_can_be_split(StrNode* sn, OnigEncoding enc)
1554 {
1555   if (sn->end > sn->s) {
1556     return ((enclen(enc, sn->s) < sn->end - sn->s)  ?  1 : 0);
1557   }
1558   return 0;
1559 }
1560 
1561 #ifdef USE_PAD_TO_SHORT_BYTE_CHAR
1562 static int
node_str_head_pad(StrNode * sn,int num,UChar val)1563 node_str_head_pad(StrNode* sn, int num, UChar val)
1564 {
1565   UChar buf[NODE_STR_BUF_SIZE];
1566   int i, len;
1567 
1568   len = sn->end - sn->s;
1569   onig_strcpy(buf, sn->s, sn->end);
1570   onig_strcpy(&(sn->s[num]), buf, buf + len);
1571   sn->end += num;
1572 
1573   for (i = 0; i < num; i++) {
1574     sn->s[i] = val;
1575   }
1576 }
1577 #endif
1578 
1579 extern int
onig_scan_unsigned_number(UChar ** src,const UChar * end,OnigEncoding enc)1580 onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc)
1581 {
1582   unsigned int num, val;
1583   OnigCodePoint c;
1584   UChar* p = *src;
1585   PFETCH_READY;
1586 
1587   num = 0;
1588   while (!PEND) {
1589     PFETCH(c);
1590     if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
1591       val = (unsigned int )DIGITVAL(c);
1592       if ((INT_MAX_LIMIT - val) / 10UL < num)
1593 	return -1;  /* overflow */
1594 
1595       num = num * 10 + val;
1596     }
1597     else {
1598       PUNFETCH;
1599       break;
1600     }
1601   }
1602   *src = p;
1603   return num;
1604 }
1605 
1606 static int
scan_unsigned_hexadecimal_number(UChar ** src,UChar * end,int maxlen,OnigEncoding enc)1607 scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int maxlen,
1608 				 OnigEncoding enc)
1609 {
1610   OnigCodePoint c;
1611   unsigned int num, val;
1612   UChar* p = *src;
1613   PFETCH_READY;
1614 
1615   num = 0;
1616   while (!PEND && maxlen-- != 0) {
1617     PFETCH(c);
1618     if (ONIGENC_IS_CODE_XDIGIT(enc, c)) {
1619       val = (unsigned int )XDIGITVAL(enc,c);
1620       if ((INT_MAX_LIMIT - val) / 16UL < num)
1621 	return -1;  /* overflow */
1622 
1623       num = (num << 4) + XDIGITVAL(enc,c);
1624     }
1625     else {
1626       PUNFETCH;
1627       break;
1628     }
1629   }
1630   *src = p;
1631   return num;
1632 }
1633 
1634 static int
scan_unsigned_octal_number(UChar ** src,UChar * end,int maxlen,OnigEncoding enc)1635 scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen,
1636 			   OnigEncoding enc)
1637 {
1638   OnigCodePoint c;
1639   unsigned int num, val;
1640   UChar* p = *src;
1641   PFETCH_READY;
1642 
1643   num = 0;
1644   while (!PEND && maxlen-- != 0) {
1645     PFETCH(c);
1646     if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8') {
1647       val = ODIGITVAL(c);
1648       if ((INT_MAX_LIMIT - val) / 8UL < num)
1649 	return -1;  /* overflow */
1650 
1651       num = (num << 3) + val;
1652     }
1653     else {
1654       PUNFETCH;
1655       break;
1656     }
1657   }
1658   *src = p;
1659   return num;
1660 }
1661 
1662 
1663 #define BBUF_WRITE_CODE_POINT(bbuf,pos,code) \
1664     BBUF_WRITE(bbuf, pos, &(code), SIZE_CODE_POINT)
1665 
1666 /* data format:
1667      [n][from-1][to-1][from-2][to-2] ... [from-n][to-n]
1668      (all data size is OnigCodePoint)
1669  */
1670 static int
new_code_range(BBuf ** pbuf)1671 new_code_range(BBuf** pbuf)
1672 {
1673 #define INIT_MULTI_BYTE_RANGE_SIZE  (SIZE_CODE_POINT * 5)
1674   int r;
1675   OnigCodePoint n;
1676   BBuf* bbuf;
1677 
1678   bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf));
1679   CHECK_NULL_RETURN_MEMERR(*pbuf);
1680   r = BBUF_INIT(*pbuf, INIT_MULTI_BYTE_RANGE_SIZE);
1681   if (r) return r;
1682 
1683   n = 0;
1684   BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1685   return 0;
1686 }
1687 
1688 static int
add_code_range_to_buf(BBuf ** pbuf,OnigCodePoint from,OnigCodePoint to)1689 add_code_range_to_buf(BBuf** pbuf, OnigCodePoint from, OnigCodePoint to)
1690 {
1691   int r, inc_n, pos;
1692   int low, high, bound, x;
1693   OnigCodePoint n, *data;
1694   BBuf* bbuf;
1695 
1696   if (from > to) {
1697     n = from; from = to; to = n;
1698   }
1699 
1700   if (IS_NULL(*pbuf)) {
1701     r = new_code_range(pbuf);
1702     if (r) return r;
1703     bbuf = *pbuf;
1704     n = 0;
1705   }
1706   else {
1707     bbuf = *pbuf;
1708     GET_CODE_POINT(n, bbuf->p);
1709   }
1710   data = (OnigCodePoint* )(bbuf->p);
1711   data++;
1712 
1713   for (low = 0, bound = n; low < bound; ) {
1714     x = (low + bound) >> 1;
1715     if (from > data[x*2 + 1])
1716       low = x + 1;
1717     else
1718       bound = x;
1719   }
1720 
1721   for (high = low, bound = n; high < bound; ) {
1722     x = (high + bound) >> 1;
1723     if (to >= data[x*2] - 1)
1724       high = x + 1;
1725     else
1726       bound = x;
1727   }
1728 
1729   inc_n = low + 1 - high;
1730   if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM)
1731     return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES;
1732 
1733   if (inc_n != 1) {
1734     if (from > data[low*2])
1735       from = data[low*2];
1736     if (to < data[(high - 1)*2 + 1])
1737       to = data[(high - 1)*2 + 1];
1738   }
1739 
1740   if (inc_n != 0 && (OnigCodePoint )high < n) {
1741     int from_pos = SIZE_CODE_POINT * (1 + high * 2);
1742     int to_pos   = SIZE_CODE_POINT * (1 + (low + 1) * 2);
1743     int size = (n - high) * 2 * SIZE_CODE_POINT;
1744 
1745     if (inc_n > 0) {
1746       BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size);
1747     }
1748     else {
1749       BBUF_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos);
1750     }
1751   }
1752 
1753   pos = SIZE_CODE_POINT * (1 + low * 2);
1754   BBUF_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2);
1755   BBUF_WRITE_CODE_POINT(bbuf, pos, from);
1756   BBUF_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to);
1757   n += inc_n;
1758   BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1759 
1760   return 0;
1761 }
1762 
1763 static int
add_code_range(BBuf ** pbuf,ScanEnv * env,OnigCodePoint from,OnigCodePoint to)1764 add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
1765 {
1766   if (from > to) {
1767     if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
1768       return 0;
1769     else
1770       return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
1771   }
1772 
1773   return add_code_range_to_buf(pbuf, from, to);
1774 }
1775 
1776 static int
not_code_range_buf(OnigEncoding enc,BBuf * bbuf,BBuf ** pbuf)1777 not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf)
1778 {
1779   int r, i, n;
1780   OnigCodePoint pre, from, *data, to = 0;
1781 
1782   *pbuf = (BBuf* )NULL;
1783   if (IS_NULL(bbuf)) {
1784   set_all:
1785     return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1786   }
1787 
1788   data = (OnigCodePoint* )(bbuf->p);
1789   GET_CODE_POINT(n, data);
1790   data++;
1791   if (n <= 0) goto set_all;
1792 
1793   r = 0;
1794   pre = MBCODE_START_POS(enc);
1795   for (i = 0; i < n; i++) {
1796     from = data[i*2];
1797     to   = data[i*2+1];
1798     if (pre <= from - 1) {
1799       r = add_code_range_to_buf(pbuf, pre, from - 1);
1800       if (r != 0) return r;
1801     }
1802     if (to == ~((OnigCodePoint )0)) break;
1803     pre = to + 1;
1804   }
1805   if (to < ~((OnigCodePoint )0)) {
1806     r = add_code_range_to_buf(pbuf, to + 1, ~((OnigCodePoint )0));
1807   }
1808   return r;
1809 }
1810 
1811 #define SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2) do {\
1812   BBuf *tbuf; \
1813   int  tnot; \
1814   tnot = not1;  not1  = not2;  not2  = tnot; \
1815   tbuf = bbuf1; bbuf1 = bbuf2; bbuf2 = tbuf; \
1816 } while (0)
1817 
1818 static int
or_code_range_buf(OnigEncoding enc,BBuf * bbuf1,int not1,BBuf * bbuf2,int not2,BBuf ** pbuf)1819 or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1,
1820                   BBuf* bbuf2, int not2, BBuf** pbuf)
1821 {
1822   int r;
1823   OnigCodePoint i, n1, *data1;
1824   OnigCodePoint from, to;
1825 
1826   *pbuf = (BBuf* )NULL;
1827   if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) {
1828     if (not1 != 0 || not2 != 0)
1829       return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1830     return 0;
1831   }
1832 
1833   r = 0;
1834   if (IS_NULL(bbuf2))
1835     SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1836 
1837   if (IS_NULL(bbuf1)) {
1838     if (not1 != 0) {
1839       return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1840     }
1841     else {
1842       if (not2 == 0) {
1843 	return bbuf_clone(pbuf, bbuf2);
1844       }
1845       else {
1846 	return not_code_range_buf(enc, bbuf2, pbuf);
1847       }
1848     }
1849   }
1850 
1851   if (not1 != 0)
1852     SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1853 
1854   data1 = (OnigCodePoint* )(bbuf1->p);
1855   GET_CODE_POINT(n1, data1);
1856   data1++;
1857 
1858   if (not2 == 0 && not1 == 0) { /* 1 OR 2 */
1859     r = bbuf_clone(pbuf, bbuf2);
1860   }
1861   else if (not1 == 0) { /* 1 OR (not 2) */
1862     r = not_code_range_buf(enc, bbuf2, pbuf);
1863   }
1864   if (r != 0) return r;
1865 
1866   for (i = 0; i < n1; i++) {
1867     from = data1[i*2];
1868     to   = data1[i*2+1];
1869     r = add_code_range_to_buf(pbuf, from, to);
1870     if (r != 0) return r;
1871   }
1872   return 0;
1873 }
1874 
1875 static int
and_code_range1(BBuf ** pbuf,OnigCodePoint from1,OnigCodePoint to1,OnigCodePoint * data,int n)1876 and_code_range1(BBuf** pbuf, OnigCodePoint from1, OnigCodePoint to1,
1877 	        OnigCodePoint* data, int n)
1878 {
1879   int i, r;
1880   OnigCodePoint from2, to2;
1881 
1882   for (i = 0; i < n; i++) {
1883     from2 = data[i*2];
1884     to2   = data[i*2+1];
1885     if (from2 < from1) {
1886       if (to2 < from1) continue;
1887       else {
1888 	from1 = to2 + 1;
1889       }
1890     }
1891     else if (from2 <= to1) {
1892       if (to2 < to1) {
1893 	if (from1 <= from2 - 1) {
1894 	  r = add_code_range_to_buf(pbuf, from1, from2-1);
1895 	  if (r != 0) return r;
1896 	}
1897 	from1 = to2 + 1;
1898       }
1899       else {
1900 	to1 = from2 - 1;
1901       }
1902     }
1903     else {
1904       from1 = from2;
1905     }
1906     if (from1 > to1) break;
1907   }
1908   if (from1 <= to1) {
1909     r = add_code_range_to_buf(pbuf, from1, to1);
1910     if (r != 0) return r;
1911   }
1912   return 0;
1913 }
1914 
1915 static int
and_code_range_buf(BBuf * bbuf1,int not1,BBuf * bbuf2,int not2,BBuf ** pbuf)1916 and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf)
1917 {
1918   int r;
1919   OnigCodePoint i, j, n1, n2, *data1, *data2;
1920   OnigCodePoint from, to, from1, to1, from2, to2;
1921 
1922   *pbuf = (BBuf* )NULL;
1923   if (IS_NULL(bbuf1)) {
1924     if (not1 != 0 && IS_NOT_NULL(bbuf2)) /* not1 != 0 -> not2 == 0 */
1925       return bbuf_clone(pbuf, bbuf2);
1926     return 0;
1927   }
1928   else if (IS_NULL(bbuf2)) {
1929     if (not2 != 0)
1930       return bbuf_clone(pbuf, bbuf1);
1931     return 0;
1932   }
1933 
1934   if (not1 != 0)
1935     SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1936 
1937   data1 = (OnigCodePoint* )(bbuf1->p);
1938   data2 = (OnigCodePoint* )(bbuf2->p);
1939   GET_CODE_POINT(n1, data1);
1940   GET_CODE_POINT(n2, data2);
1941   data1++;
1942   data2++;
1943 
1944   if (not2 == 0 && not1 == 0) { /* 1 AND 2 */
1945     for (i = 0; i < n1; i++) {
1946       from1 = data1[i*2];
1947       to1   = data1[i*2+1];
1948       for (j = 0; j < n2; j++) {
1949 	from2 = data2[j*2];
1950 	to2   = data2[j*2+1];
1951 	if (from2 > to1) break;
1952 	if (to2 < from1) continue;
1953 	from = MAX(from1, from2);
1954 	to   = MIN(to1, to2);
1955 	r = add_code_range_to_buf(pbuf, from, to);
1956 	if (r != 0) return r;
1957       }
1958     }
1959   }
1960   else if (not1 == 0) { /* 1 AND (not 2) */
1961     for (i = 0; i < n1; i++) {
1962       from1 = data1[i*2];
1963       to1   = data1[i*2+1];
1964       r = and_code_range1(pbuf, from1, to1, data2, n2);
1965       if (r != 0) return r;
1966     }
1967   }
1968 
1969   return 0;
1970 }
1971 
1972 static int
and_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)1973 and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
1974 {
1975   int r, not1, not2;
1976   BBuf *buf1, *buf2, *pbuf;
1977   BitSetRef bsr1, bsr2;
1978   BitSet bs1, bs2;
1979 
1980   not1 = IS_NCCLASS_NOT(dest);
1981   bsr1 = dest->bs;
1982   buf1 = dest->mbuf;
1983   not2 = IS_NCCLASS_NOT(cc);
1984   bsr2 = cc->bs;
1985   buf2 = cc->mbuf;
1986 
1987   if (not1 != 0) {
1988     bitset_invert_to(bsr1, bs1);
1989     bsr1 = bs1;
1990   }
1991   if (not2 != 0) {
1992     bitset_invert_to(bsr2, bs2);
1993     bsr2 = bs2;
1994   }
1995   bitset_and(bsr1, bsr2);
1996   if (bsr1 != dest->bs) {
1997     bitset_copy(dest->bs, bsr1);
1998     bsr1 = dest->bs;
1999   }
2000   if (not1 != 0) {
2001     bitset_invert(dest->bs);
2002   }
2003 
2004   if (! ONIGENC_IS_SINGLEBYTE(enc)) {
2005     if (not1 != 0 && not2 != 0) {
2006       r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf);
2007     }
2008     else {
2009       r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf);
2010       if (r == 0 && not1 != 0) {
2011 	BBuf *tbuf;
2012 	r = not_code_range_buf(enc, pbuf, &tbuf);
2013 	if (r != 0) {
2014 	  bbuf_free(pbuf);
2015 	  return r;
2016 	}
2017 	bbuf_free(pbuf);
2018 	pbuf = tbuf;
2019       }
2020     }
2021     if (r != 0) return r;
2022 
2023     dest->mbuf = pbuf;
2024     bbuf_free(buf1);
2025     return r;
2026   }
2027   return 0;
2028 }
2029 
2030 static int
or_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)2031 or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
2032 {
2033   int r, not1, not2;
2034   BBuf *buf1, *buf2, *pbuf;
2035   BitSetRef bsr1, bsr2;
2036   BitSet bs1, bs2;
2037 
2038   not1 = IS_NCCLASS_NOT(dest);
2039   bsr1 = dest->bs;
2040   buf1 = dest->mbuf;
2041   not2 = IS_NCCLASS_NOT(cc);
2042   bsr2 = cc->bs;
2043   buf2 = cc->mbuf;
2044 
2045   if (not1 != 0) {
2046     bitset_invert_to(bsr1, bs1);
2047     bsr1 = bs1;
2048   }
2049   if (not2 != 0) {
2050     bitset_invert_to(bsr2, bs2);
2051     bsr2 = bs2;
2052   }
2053   bitset_or(bsr1, bsr2);
2054   if (bsr1 != dest->bs) {
2055     bitset_copy(dest->bs, bsr1);
2056     bsr1 = dest->bs;
2057   }
2058   if (not1 != 0) {
2059     bitset_invert(dest->bs);
2060   }
2061 
2062   if (! ONIGENC_IS_SINGLEBYTE(enc)) {
2063     if (not1 != 0 && not2 != 0) {
2064       r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf);
2065     }
2066     else {
2067       r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf);
2068       if (r == 0 && not1 != 0) {
2069 	BBuf *tbuf;
2070 	r = not_code_range_buf(enc, pbuf, &tbuf);
2071 	if (r != 0) {
2072 	  bbuf_free(pbuf);
2073 	  return r;
2074 	}
2075 	bbuf_free(pbuf);
2076 	pbuf = tbuf;
2077       }
2078     }
2079     if (r != 0) return r;
2080 
2081     dest->mbuf = pbuf;
2082     bbuf_free(buf1);
2083     return r;
2084   }
2085   else
2086     return 0;
2087 }
2088 
2089 static int
conv_backslash_value(int c,ScanEnv * env)2090 conv_backslash_value(int c, ScanEnv* env)
2091 {
2092   if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) {
2093     switch (c) {
2094     case 'n': return '\n';
2095     case 't': return '\t';
2096     case 'r': return '\r';
2097     case 'f': return '\f';
2098     case 'a': return '\007';
2099     case 'b': return '\010';
2100     case 'e': return '\033';
2101     case 'v':
2102       if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB))
2103 	return '\v';
2104       break;
2105 
2106     default:
2107       break;
2108     }
2109   }
2110   return c;
2111 }
2112 
2113 static int
is_invalid_quantifier_target(Node * node)2114 is_invalid_quantifier_target(Node* node)
2115 {
2116   switch (NTYPE(node)) {
2117   case NT_ANCHOR:
2118     return 1;
2119     break;
2120 
2121   case NT_ENCLOSE:
2122     /* allow enclosed elements */
2123     /* return is_invalid_quantifier_target(NENCLOSE(node)->target); */
2124     break;
2125 
2126   case NT_LIST:
2127     do {
2128       if (! is_invalid_quantifier_target(NCAR(node))) return 0;
2129     } while (IS_NOT_NULL(node = NCDR(node)));
2130     return 0;
2131     break;
2132 
2133   case NT_ALT:
2134     do {
2135       if (is_invalid_quantifier_target(NCAR(node))) return 1;
2136     } while (IS_NOT_NULL(node = NCDR(node)));
2137     break;
2138 
2139   default:
2140     break;
2141   }
2142   return 0;
2143 }
2144 
2145 /* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */
2146 static int
popular_quantifier_num(QtfrNode * q)2147 popular_quantifier_num(QtfrNode* q)
2148 {
2149   if (q->greedy) {
2150     if (q->lower == 0) {
2151       if (q->upper == 1) return 0;
2152       else if (IS_REPEAT_INFINITE(q->upper)) return 1;
2153     }
2154     else if (q->lower == 1) {
2155       if (IS_REPEAT_INFINITE(q->upper)) return 2;
2156     }
2157   }
2158   else {
2159     if (q->lower == 0) {
2160       if (q->upper == 1) return 3;
2161       else if (IS_REPEAT_INFINITE(q->upper)) return 4;
2162     }
2163     else if (q->lower == 1) {
2164       if (IS_REPEAT_INFINITE(q->upper)) return 5;
2165     }
2166   }
2167   return -1;
2168 }
2169 
2170 
2171 enum ReduceType {
2172   RQ_ASIS = 0, /* as is */
2173   RQ_DEL  = 1, /* delete parent */
2174   RQ_A,        /* to '*'    */
2175   RQ_AQ,       /* to '*?'   */
2176   RQ_QQ,       /* to '??'   */
2177   RQ_P_QQ,     /* to '+)??' */
2178   RQ_PQ_Q      /* to '+?)?' */
2179 };
2180 
2181 static enum ReduceType ReduceTypeTable[6][6] = {
2182   {RQ_DEL,  RQ_A,    RQ_A,   RQ_QQ,   RQ_AQ,   RQ_ASIS}, /* '?'  */
2183   {RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL},  /* '*'  */
2184   {RQ_A,    RQ_A,    RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL},  /* '+'  */
2185   {RQ_DEL,  RQ_AQ,   RQ_AQ,  RQ_DEL,  RQ_AQ,   RQ_AQ},   /* '??' */
2186   {RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_DEL,  RQ_DEL,  RQ_DEL},  /* '*?' */
2187   {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ,   RQ_AQ,   RQ_DEL}   /* '+?' */
2188 };
2189 
2190 extern void
onig_reduce_nested_quantifier(Node * pnode,Node * cnode)2191 onig_reduce_nested_quantifier(Node* pnode, Node* cnode)
2192 {
2193   int pnum, cnum;
2194   QtfrNode *p, *c;
2195 
2196   p = NQTFR(pnode);
2197   c = NQTFR(cnode);
2198   pnum = popular_quantifier_num(p);
2199   cnum = popular_quantifier_num(c);
2200   if (pnum < 0 || cnum < 0) return ;
2201 
2202   switch(ReduceTypeTable[cnum][pnum]) {
2203   case RQ_DEL:
2204     *pnode = *cnode;
2205     break;
2206   case RQ_A:
2207     p->target = c->target;
2208     p->lower  = 0;  p->upper = REPEAT_INFINITE;  p->greedy = 1;
2209     break;
2210   case RQ_AQ:
2211     p->target = c->target;
2212     p->lower  = 0;  p->upper = REPEAT_INFINITE;  p->greedy = 0;
2213     break;
2214   case RQ_QQ:
2215     p->target = c->target;
2216     p->lower  = 0;  p->upper = 1;  p->greedy = 0;
2217     break;
2218   case RQ_P_QQ:
2219     p->target = cnode;
2220     p->lower  = 0;  p->upper = 1;  p->greedy = 0;
2221     c->lower  = 1;  c->upper = REPEAT_INFINITE;  c->greedy = 1;
2222     return ;
2223     break;
2224   case RQ_PQ_Q:
2225     p->target = cnode;
2226     p->lower  = 0;  p->upper = 1;  p->greedy = 1;
2227     c->lower  = 1;  c->upper = REPEAT_INFINITE;  c->greedy = 0;
2228     return ;
2229     break;
2230   case RQ_ASIS:
2231     p->target = cnode;
2232     return ;
2233     break;
2234   }
2235 
2236   c->target = NULL_NODE;
2237   onig_node_free(cnode);
2238 }
2239 
2240 
2241 enum TokenSyms {
2242   TK_EOT      = 0,   /* end of token */
2243   TK_RAW_BYTE = 1,
2244   TK_CHAR,
2245   TK_STRING,
2246   TK_CODE_POINT,
2247   TK_ANYCHAR,
2248   TK_CHAR_TYPE,
2249   TK_BACKREF,
2250   TK_CALL,
2251   TK_ANCHOR,
2252   TK_OP_REPEAT,
2253   TK_INTERVAL,
2254   TK_ANYCHAR_ANYTIME,  /* SQL '%' == .* */
2255   TK_ALT,
2256   TK_SUBEXP_OPEN,
2257   TK_SUBEXP_CLOSE,
2258   TK_CC_OPEN,
2259   TK_QUOTE_OPEN,
2260   TK_CHAR_PROPERTY,    /* \p{...}, \P{...} */
2261   /* in cc */
2262   TK_CC_CLOSE,
2263   TK_CC_RANGE,
2264   TK_POSIX_BRACKET_OPEN,
2265   TK_CC_AND,             /* && */
2266   TK_CC_CC_OPEN          /* [ */
2267 };
2268 
2269 typedef struct {
2270   enum TokenSyms type;
2271   int escaped;
2272   int base;   /* is number: 8, 16 (used in [....]) */
2273   UChar* backp;
2274   union {
2275     UChar* s;
2276     int   c;
2277     OnigCodePoint code;
2278     int   anchor;
2279     int   subtype;
2280     struct {
2281       int lower;
2282       int upper;
2283       int greedy;
2284       int possessive;
2285     } repeat;
2286     struct {
2287       int  num;
2288       int  ref1;
2289       int* refs;
2290       int  by_name;
2291 #ifdef USE_BACKREF_WITH_LEVEL
2292       int  exist_level;
2293       int  level;   /* \k<name+n> */
2294 #endif
2295     } backref;
2296     struct {
2297       UChar* name;
2298       UChar* name_end;
2299       int    gnum;
2300     } call;
2301     struct {
2302       int ctype;
2303       int not;
2304     } prop;
2305   } u;
2306 } OnigToken;
2307 
2308 
2309 static int
fetch_range_quantifier(UChar ** src,UChar * end,OnigToken * tok,ScanEnv * env)2310 fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env)
2311 {
2312   int low, up, syn_allow, non_low = 0;
2313   int r = 0;
2314   OnigCodePoint c;
2315   OnigEncoding enc = env->enc;
2316   UChar* p = *src;
2317   PFETCH_READY;
2318 
2319   syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL);
2320 
2321   if (PEND) {
2322     if (syn_allow)
2323       return 1;  /* "....{" : OK! */
2324     else
2325       return ONIGERR_END_PATTERN_AT_LEFT_BRACE;  /* "....{" syntax error */
2326   }
2327 
2328   if (! syn_allow) {
2329     c = PPEEK;
2330     if (c == ')' || c == '(' || c == '|') {
2331       return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
2332     }
2333   }
2334 
2335   low = onig_scan_unsigned_number(&p, end, env->enc);
2336   if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2337   if (low > ONIG_MAX_REPEAT_NUM)
2338     return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2339 
2340   if (p == *src) { /* can't read low */
2341     if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) {
2342       /* allow {,n} as {0,n} */
2343       low = 0;
2344       non_low = 1;
2345     }
2346     else
2347       goto invalid;
2348   }
2349 
2350   if (PEND) goto invalid;
2351   PFETCH(c);
2352   if (c == ',') {
2353     UChar* prev = p;
2354     up = onig_scan_unsigned_number(&p, end, env->enc);
2355     if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2356     if (up > ONIG_MAX_REPEAT_NUM)
2357       return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2358 
2359     if (p == prev) {
2360       if (non_low != 0)
2361 	goto invalid;
2362       up = REPEAT_INFINITE;  /* {n,} : {n,infinite} */
2363     }
2364   }
2365   else {
2366     if (non_low != 0)
2367       goto invalid;
2368 
2369     PUNFETCH;
2370     up = low;  /* {n} : exact n times */
2371     r = 2;     /* fixed */
2372   }
2373 
2374   if (PEND) goto invalid;
2375   PFETCH(c);
2376   if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) {
2377     if (c != MC_ESC(env->syntax)) goto invalid;
2378     PFETCH(c);
2379   }
2380   if (c != '}') goto invalid;
2381 
2382   if (!IS_REPEAT_INFINITE(up) && low > up) {
2383     return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE;
2384   }
2385 
2386   tok->type = TK_INTERVAL;
2387   tok->u.repeat.lower = low;
2388   tok->u.repeat.upper = up;
2389   *src = p;
2390   return r; /* 0: normal {n,m}, 2: fixed {n} */
2391 
2392  invalid:
2393   if (syn_allow)
2394     return 1;  /* OK */
2395   else
2396     return ONIGERR_INVALID_REPEAT_RANGE_PATTERN;
2397 }
2398 
2399 /* \M-, \C-, \c, or \... */
2400 static int
fetch_escaped_value(UChar ** src,UChar * end,ScanEnv * env)2401 fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env)
2402 {
2403   int v;
2404   OnigCodePoint c;
2405   OnigEncoding enc = env->enc;
2406   UChar* p = *src;
2407   PFETCH_READY;
2408 
2409   if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2410 
2411   PFETCH(c);
2412   switch (c) {
2413   case 'M':
2414     if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) {
2415       if (PEND) return ONIGERR_END_PATTERN_AT_META;
2416       PFETCH(c);
2417       if (c != '-') return ONIGERR_META_CODE_SYNTAX;
2418       if (PEND) return ONIGERR_END_PATTERN_AT_META;
2419       PFETCH(c);
2420       if (c == MC_ESC(env->syntax)) {
2421 	v = fetch_escaped_value(&p, end, env);
2422 	if (v < 0) return v;
2423         c = (OnigCodePoint )v;
2424       }
2425       c = ((c & 0xff) | 0x80);
2426     }
2427     else
2428       goto backslash;
2429     break;
2430 
2431   case 'C':
2432     if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) {
2433       if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
2434       PFETCH(c);
2435       if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX;
2436       goto control;
2437     }
2438     else
2439       goto backslash;
2440 
2441   case 'c':
2442     if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) {
2443     control:
2444       if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
2445       PFETCH(c);
2446       if (c == '?') {
2447 	c = 0177;
2448       }
2449       else {
2450         if (c == MC_ESC(env->syntax)) {
2451           v = fetch_escaped_value(&p, end, env);
2452           if (v < 0) return v;
2453           c = (OnigCodePoint )v;
2454         }
2455 	c &= 0x9f;
2456       }
2457       break;
2458     }
2459     /* fall through */
2460 
2461   default:
2462     {
2463     backslash:
2464       c = conv_backslash_value(c, env);
2465     }
2466     break;
2467   }
2468 
2469   *src = p;
2470   return c;
2471 }
2472 
2473 static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env);
2474 
2475 static OnigCodePoint
get_name_end_code_point(OnigCodePoint start)2476 get_name_end_code_point(OnigCodePoint start)
2477 {
2478   switch (start) {
2479   case '<':  return (OnigCodePoint )'>'; break;
2480   case '\'': return (OnigCodePoint )'\''; break;
2481   default:
2482     break;
2483   }
2484 
2485   return (OnigCodePoint )0;
2486 }
2487 
2488 #ifdef USE_NAMED_GROUP
2489 #ifdef USE_BACKREF_WITH_LEVEL
2490 /*
2491    \k<name+n>, \k<name-n>
2492    \k<num+n>,  \k<num-n>
2493    \k<-num+n>, \k<-num-n>
2494 */
2495 static int
fetch_name_with_level(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,int * rlevel)2496 fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
2497 		      UChar** rname_end, ScanEnv* env,
2498 		      int* rback_num, int* rlevel)
2499 {
2500   int r, sign, is_num, exist_level;
2501   OnigCodePoint end_code;
2502   OnigCodePoint c = 0;
2503   OnigEncoding enc = env->enc;
2504   UChar *name_end;
2505   UChar *pnum_head;
2506   UChar *p = *src;
2507   PFETCH_READY;
2508 
2509   *rback_num = 0;
2510   is_num = exist_level = 0;
2511   sign = 1;
2512   pnum_head = *src;
2513 
2514   end_code = get_name_end_code_point(start_code);
2515 
2516   name_end = end;
2517   r = 0;
2518   if (PEND) {
2519     return ONIGERR_EMPTY_GROUP_NAME;
2520   }
2521   else {
2522     PFETCH(c);
2523     if (c == end_code)
2524       return ONIGERR_EMPTY_GROUP_NAME;
2525 
2526     if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2527       is_num = 1;
2528     }
2529     else if (c == '-') {
2530       is_num = 2;
2531       sign = -1;
2532       pnum_head = p;
2533     }
2534     else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2535       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2536     }
2537   }
2538 
2539   while (!PEND) {
2540     name_end = p;
2541     PFETCH(c);
2542     if (c == end_code || c == ')' || c == '+' || c == '-') {
2543       if (is_num == 2) 	r = ONIGERR_INVALID_GROUP_NAME;
2544       break;
2545     }
2546 
2547     if (is_num != 0) {
2548       if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2549 	is_num = 1;
2550       }
2551       else {
2552 	r = ONIGERR_INVALID_GROUP_NAME;
2553 	is_num = 0;
2554       }
2555     }
2556     else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2557       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2558     }
2559   }
2560 
2561   if (r == 0 && c != end_code) {
2562     if (c == '+' || c == '-') {
2563       int level;
2564       int flag = (c == '-' ? -1 : 1);
2565 
2566       PFETCH(c);
2567       if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto err;
2568       PUNFETCH;
2569       level = onig_scan_unsigned_number(&p, end, enc);
2570       if (level < 0) return ONIGERR_TOO_BIG_NUMBER;
2571       *rlevel = (level * flag);
2572       exist_level = 1;
2573 
2574       PFETCH(c);
2575       if (c == end_code)
2576 	goto end;
2577     }
2578 
2579   err:
2580     r = ONIGERR_INVALID_GROUP_NAME;
2581     name_end = end;
2582   }
2583 
2584  end:
2585   if (r == 0) {
2586     if (is_num != 0) {
2587       *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2588       if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2589       else if (*rback_num == 0) goto err;
2590 
2591       *rback_num *= sign;
2592     }
2593 
2594     *rname_end = name_end;
2595     *src = p;
2596     return (exist_level ? 1 : 0);
2597   }
2598   else {
2599     onig_scan_env_set_error_string(env, r, *src, name_end);
2600     return r;
2601   }
2602 }
2603 #endif /* USE_BACKREF_WITH_LEVEL */
2604 
2605 /*
2606   def: 0 -> define name    (don't allow number name)
2607        1 -> reference name (allow number name)
2608 */
2609 static int
fetch_name(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,int ref)2610 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
2611 	   UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
2612 {
2613   int r, is_num, sign;
2614   OnigCodePoint end_code;
2615   OnigCodePoint c = 0;
2616   OnigEncoding enc = env->enc;
2617   UChar *name_end;
2618   UChar *pnum_head;
2619   UChar *p = *src;
2620   PFETCH_READY;
2621 
2622   *rback_num = 0;
2623 
2624   end_code = get_name_end_code_point(start_code);
2625 
2626   name_end = end;
2627   pnum_head = *src;
2628   r = 0;
2629   is_num = 0;
2630   sign = 1;
2631   if (PEND) {
2632     return ONIGERR_EMPTY_GROUP_NAME;
2633   }
2634   else {
2635     PFETCH(c);
2636     if (c == end_code)
2637       return ONIGERR_EMPTY_GROUP_NAME;
2638 
2639     if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2640       if (ref == 1)
2641 	is_num = 1;
2642       else {
2643 	r = ONIGERR_INVALID_GROUP_NAME;
2644 	is_num = 0;
2645       }
2646     }
2647     else if (c == '-') {
2648       if (ref == 1) {
2649 	is_num = 2;
2650 	sign = -1;
2651 	pnum_head = p;
2652       }
2653       else {
2654 	r = ONIGERR_INVALID_GROUP_NAME;
2655 	is_num = 0;
2656       }
2657     }
2658     else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2659       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2660     }
2661   }
2662 
2663   if (r == 0) {
2664     while (!PEND) {
2665       name_end = p;
2666       PFETCH(c);
2667       if (c == end_code || c == ')') {
2668 	if (is_num == 2) 	r = ONIGERR_INVALID_GROUP_NAME;
2669 	break;
2670       }
2671 
2672       if (is_num != 0) {
2673 	if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2674 	  is_num = 1;
2675 	}
2676 	else {
2677 	  if (!ONIGENC_IS_CODE_WORD(enc, c))
2678 	    r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2679 	  else
2680 	    r = ONIGERR_INVALID_GROUP_NAME;
2681 
2682 	  is_num = 0;
2683 	}
2684       }
2685       else {
2686 	if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2687 	  r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2688 	}
2689       }
2690     }
2691 
2692     if (c != end_code) {
2693       r = ONIGERR_INVALID_GROUP_NAME;
2694       name_end = end;
2695     }
2696 
2697     if (is_num != 0) {
2698       *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2699       if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2700       else if (*rback_num == 0) {
2701 	r = ONIGERR_INVALID_GROUP_NAME;
2702 	goto err;
2703       }
2704 
2705       *rback_num *= sign;
2706     }
2707 
2708     *rname_end = name_end;
2709     *src = p;
2710     return 0;
2711   }
2712   else {
2713     while (!PEND) {
2714       name_end = p;
2715       PFETCH(c);
2716       if (c == end_code || c == ')')
2717 	break;
2718     }
2719     if (PEND)
2720       name_end = end;
2721 
2722   err:
2723     onig_scan_env_set_error_string(env, r, *src, name_end);
2724     return r;
2725   }
2726 }
2727 #else
2728 static int
fetch_name(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,int ref)2729 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
2730 	   UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
2731 {
2732   int r, is_num, sign;
2733   OnigCodePoint end_code;
2734   OnigCodePoint c = 0;
2735   UChar *name_end;
2736   OnigEncoding enc = env->enc;
2737   UChar *pnum_head;
2738   UChar *p = *src;
2739   PFETCH_READY;
2740 
2741   *rback_num = 0;
2742 
2743   end_code = get_name_end_code_point(start_code);
2744 
2745   *rname_end = name_end = end;
2746   r = 0;
2747   pnum_head = *src;
2748   is_num = 0;
2749   sign = 1;
2750 
2751   if (PEND) {
2752     return ONIGERR_EMPTY_GROUP_NAME;
2753   }
2754   else {
2755     PFETCH(c);
2756     if (c == end_code)
2757       return ONIGERR_EMPTY_GROUP_NAME;
2758 
2759     if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2760       is_num = 1;
2761     }
2762     else if (c == '-') {
2763       is_num = 2;
2764       sign = -1;
2765       pnum_head = p;
2766     }
2767     else {
2768       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2769     }
2770   }
2771 
2772   while (!PEND) {
2773     name_end = p;
2774 
2775     PFETCH(c);
2776     if (c == end_code || c == ')') break;
2777     if (! ONIGENC_IS_CODE_DIGIT(enc, c))
2778       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2779   }
2780   if (r == 0 && c != end_code) {
2781     r = ONIGERR_INVALID_GROUP_NAME;
2782     name_end = end;
2783   }
2784 
2785   if (r == 0) {
2786     *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2787     if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2788     else if (*rback_num == 0) {
2789       r = ONIGERR_INVALID_GROUP_NAME;
2790       goto err;
2791     }
2792     *rback_num *= sign;
2793 
2794     *rname_end = name_end;
2795     *src = p;
2796     return 0;
2797   }
2798   else {
2799   err:
2800     onig_scan_env_set_error_string(env, r, *src, name_end);
2801     return r;
2802   }
2803 }
2804 #endif /* USE_NAMED_GROUP */
2805 
2806 static void
CC_ESC_WARN(ScanEnv * env,UChar * c)2807 CC_ESC_WARN(ScanEnv* env, UChar *c)
2808 {
2809   if (onig_warn == onig_null_warn) return ;
2810 
2811   if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) &&
2812       IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) {
2813     UChar buf[WARN_BUFSIZE];
2814     onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
2815 		env->pattern, env->pattern_end,
2816                 (UChar* )"character class has '%s' without escape", c);
2817     (*onig_warn)((char* )buf);
2818   }
2819 }
2820 
2821 static void
CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv * env,UChar * c)2822 CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c)
2823 {
2824   if (onig_warn == onig_null_warn) return ;
2825 
2826   if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) {
2827     UChar buf[WARN_BUFSIZE];
2828     onig_snprintf_with_pattern(buf, WARN_BUFSIZE, (env)->enc,
2829 		(env)->pattern, (env)->pattern_end,
2830 		(UChar* )"regular expression has '%s' without escape", c);
2831     (*onig_warn)((char* )buf);
2832   }
2833 }
2834 
2835 static UChar*
find_str_position(OnigCodePoint s[],int n,UChar * from,UChar * to,UChar ** next,OnigEncoding enc)2836 find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to,
2837 		  UChar **next, OnigEncoding enc)
2838 {
2839   int i;
2840   OnigCodePoint x;
2841   UChar *q;
2842   UChar *p = from;
2843 
2844   while (p < to) {
2845     x = ONIGENC_MBC_TO_CODE(enc, p, to);
2846     q = p + enclen(enc, p);
2847     if (x == s[0]) {
2848       for (i = 1; i < n && q < to; i++) {
2849 	x = ONIGENC_MBC_TO_CODE(enc, q, to);
2850 	if (x != s[i]) break;
2851 	q += enclen(enc, q);
2852       }
2853       if (i >= n) {
2854 	if (IS_NOT_NULL(next))
2855 	  *next = q;
2856 	return p;
2857       }
2858     }
2859     p = q;
2860   }
2861   return NULL_UCHARP;
2862 }
2863 
2864 static int
str_exist_check_with_esc(OnigCodePoint s[],int n,UChar * from,UChar * to,OnigCodePoint bad,OnigEncoding enc,OnigSyntaxType * syn)2865 str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to,
2866 		 OnigCodePoint bad, OnigEncoding enc, OnigSyntaxType* syn)
2867 {
2868   int i, in_esc;
2869   OnigCodePoint x;
2870   UChar *q;
2871   UChar *p = from;
2872 
2873   in_esc = 0;
2874   while (p < to) {
2875     if (in_esc) {
2876       in_esc = 0;
2877       p += enclen(enc, p);
2878     }
2879     else {
2880       x = ONIGENC_MBC_TO_CODE(enc, p, to);
2881       q = p + enclen(enc, p);
2882       if (x == s[0]) {
2883 	for (i = 1; i < n && q < to; i++) {
2884 	  x = ONIGENC_MBC_TO_CODE(enc, q, to);
2885 	  if (x != s[i]) break;
2886 	  q += enclen(enc, q);
2887 	}
2888 	if (i >= n) return 1;
2889 	p += enclen(enc, p);
2890       }
2891       else {
2892 	x = ONIGENC_MBC_TO_CODE(enc, p, to);
2893 	if (x == bad) return 0;
2894 	else if (x == MC_ESC(syn)) in_esc = 1;
2895 	p = q;
2896       }
2897     }
2898   }
2899   return 0;
2900 }
2901 
2902 static int
fetch_token_in_cc(OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)2903 fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
2904 {
2905   int num;
2906   OnigCodePoint c, c2;
2907   OnigSyntaxType* syn = env->syntax;
2908   OnigEncoding enc = env->enc;
2909   UChar* prev;
2910   UChar* p = *src;
2911   PFETCH_READY;
2912 
2913   if (PEND) {
2914     tok->type = TK_EOT;
2915     return tok->type;
2916   }
2917 
2918   PFETCH(c);
2919   tok->type = TK_CHAR;
2920   tok->base = 0;
2921   tok->u.c  = c;
2922   tok->escaped = 0;
2923 
2924   if (c == ']') {
2925     tok->type = TK_CC_CLOSE;
2926   }
2927   else if (c == '-') {
2928     tok->type = TK_CC_RANGE;
2929   }
2930   else if (c == MC_ESC(syn)) {
2931     if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC))
2932       goto end;
2933 
2934     if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2935 
2936     PFETCH(c);
2937     tok->escaped = 1;
2938     tok->u.c = c;
2939     switch (c) {
2940     case 'w':
2941       tok->type = TK_CHAR_TYPE;
2942       tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
2943       tok->u.prop.not   = 0;
2944       break;
2945     case 'W':
2946       tok->type = TK_CHAR_TYPE;
2947       tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
2948       tok->u.prop.not   = 1;
2949       break;
2950     case 'd':
2951       tok->type = TK_CHAR_TYPE;
2952       tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
2953       tok->u.prop.not   = 0;
2954       break;
2955     case 'D':
2956       tok->type = TK_CHAR_TYPE;
2957       tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
2958       tok->u.prop.not   = 1;
2959       break;
2960     case 's':
2961       tok->type = TK_CHAR_TYPE;
2962       tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
2963       tok->u.prop.not   = 0;
2964       break;
2965     case 'S':
2966       tok->type = TK_CHAR_TYPE;
2967       tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
2968       tok->u.prop.not   = 1;
2969       break;
2970     case 'h':
2971       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
2972       tok->type = TK_CHAR_TYPE;
2973       tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
2974       tok->u.prop.not   = 0;
2975       break;
2976     case 'H':
2977       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
2978       tok->type = TK_CHAR_TYPE;
2979       tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
2980       tok->u.prop.not   = 1;
2981       break;
2982 
2983     case 'p':
2984     case 'P':
2985       c2 = PPEEK;
2986       if (c2 == '{' &&
2987 	  IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
2988 	PINC;
2989 	tok->type = TK_CHAR_PROPERTY;
2990 	tok->u.prop.not = (c == 'P' ? 1 : 0);
2991 
2992 	if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
2993 	  PFETCH(c2);
2994 	  if (c2 == '^') {
2995 	    tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
2996 	  }
2997 	  else
2998 	    PUNFETCH;
2999 	}
3000       }
3001       break;
3002 
3003     case 'x':
3004       if (PEND) break;
3005 
3006       prev = p;
3007       if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3008 	PINC;
3009 	num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
3010 	if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3011 	if (!PEND) {
3012           c2 = PPEEK;
3013           if (ONIGENC_IS_CODE_XDIGIT(enc, c2))
3014             return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3015         }
3016 
3017 	if (p > prev + enclen(enc, prev) && !PEND && (PPEEK_IS('}'))) {
3018 	  PINC;
3019 	  tok->type   = TK_CODE_POINT;
3020 	  tok->base   = 16;
3021 	  tok->u.code = (OnigCodePoint )num;
3022 	}
3023 	else {
3024 	  /* can't read nothing or invalid format */
3025 	  p = prev;
3026 	}
3027       }
3028       else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3029 	num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
3030 	if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3031 	if (p == prev) {  /* can't read nothing. */
3032 	  num = 0; /* but, it's not error */
3033 	}
3034 	tok->type = TK_RAW_BYTE;
3035 	tok->base = 16;
3036 	tok->u.c  = num;
3037       }
3038       break;
3039 
3040     case 'u':
3041       if (PEND) break;
3042 
3043       prev = p;
3044       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3045 	num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
3046 	if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3047 	if (p == prev) {  /* can't read nothing. */
3048 	  num = 0; /* but, it's not error */
3049 	}
3050 	tok->type   = TK_CODE_POINT;
3051 	tok->base   = 16;
3052 	tok->u.code = (OnigCodePoint )num;
3053       }
3054       break;
3055 
3056     case '0':
3057     case '1': case '2': case '3': case '4': case '5': case '6': case '7':
3058       if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
3059 	PUNFETCH;
3060 	prev = p;
3061 	num = scan_unsigned_octal_number(&p, end, 3, enc);
3062 	if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3063 	if (p == prev) {  /* can't read nothing. */
3064 	  num = 0; /* but, it's not error */
3065 	}
3066 	tok->type = TK_RAW_BYTE;
3067 	tok->base = 8;
3068 	tok->u.c  = num;
3069       }
3070       break;
3071 
3072     default:
3073       PUNFETCH;
3074       num = fetch_escaped_value(&p, end, env);
3075       if (num < 0) return num;
3076       if (tok->u.c != num) {
3077 	tok->u.code = (OnigCodePoint )num;
3078 	tok->type   = TK_CODE_POINT;
3079       }
3080       break;
3081     }
3082   }
3083   else if (c == '[') {
3084     if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) {
3085       OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' };
3086       tok->backp = p; /* point at '[' is readed */
3087       PINC;
3088       if (str_exist_check_with_esc(send, 2, p, end,
3089                                    (OnigCodePoint )']', enc, syn)) {
3090 	tok->type = TK_POSIX_BRACKET_OPEN;
3091       }
3092       else {
3093 	PUNFETCH;
3094 	goto cc_in_cc;
3095       }
3096     }
3097     else {
3098     cc_in_cc:
3099       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) {
3100 	tok->type = TK_CC_CC_OPEN;
3101       }
3102       else {
3103 	CC_ESC_WARN(env, (UChar* )"[");
3104       }
3105     }
3106   }
3107   else if (c == '&') {
3108     if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) &&
3109 	!PEND && (PPEEK_IS('&'))) {
3110       PINC;
3111       tok->type = TK_CC_AND;
3112     }
3113   }
3114 
3115  end:
3116   *src = p;
3117   return tok->type;
3118 }
3119 
3120 static int
fetch_token(OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)3121 fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
3122 {
3123   int r, num;
3124   OnigCodePoint c;
3125   OnigEncoding enc = env->enc;
3126   OnigSyntaxType* syn = env->syntax;
3127   UChar* prev;
3128   UChar* p = *src;
3129   PFETCH_READY;
3130 
3131  start:
3132   if (PEND) {
3133     tok->type = TK_EOT;
3134     return tok->type;
3135   }
3136 
3137   tok->type  = TK_STRING;
3138   tok->base  = 0;
3139   tok->backp = p;
3140 
3141   PFETCH(c);
3142   if (IS_MC_ESC_CODE(c, syn)) {
3143     if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
3144 
3145     tok->backp = p;
3146     PFETCH(c);
3147 
3148     tok->u.c = c;
3149     tok->escaped = 1;
3150     switch (c) {
3151     case '*':
3152       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break;
3153       tok->type = TK_OP_REPEAT;
3154       tok->u.repeat.lower = 0;
3155       tok->u.repeat.upper = REPEAT_INFINITE;
3156       goto greedy_check;
3157       break;
3158 
3159     case '+':
3160       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break;
3161       tok->type = TK_OP_REPEAT;
3162       tok->u.repeat.lower = 1;
3163       tok->u.repeat.upper = REPEAT_INFINITE;
3164       goto greedy_check;
3165       break;
3166 
3167     case '?':
3168       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break;
3169       tok->type = TK_OP_REPEAT;
3170       tok->u.repeat.lower = 0;
3171       tok->u.repeat.upper = 1;
3172     greedy_check:
3173       if (!PEND && PPEEK_IS('?') &&
3174 	  IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) {
3175 	PFETCH(c);
3176 	tok->u.repeat.greedy     = 0;
3177 	tok->u.repeat.possessive = 0;
3178       }
3179       else {
3180       possessive_check:
3181 	if (!PEND && PPEEK_IS('+') &&
3182 	    ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) &&
3183 	      tok->type != TK_INTERVAL)  ||
3184 	     (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) &&
3185 	      tok->type == TK_INTERVAL))) {
3186 	  PFETCH(c);
3187 	  tok->u.repeat.greedy     = 1;
3188 	  tok->u.repeat.possessive = 1;
3189 	}
3190 	else {
3191 	  tok->u.repeat.greedy     = 1;
3192 	  tok->u.repeat.possessive = 0;
3193 	}
3194       }
3195       break;
3196 
3197     case '{':
3198       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break;
3199       r = fetch_range_quantifier(&p, end, tok, env);
3200       if (r < 0) return r;  /* error */
3201       if (r == 0) goto greedy_check;
3202       else if (r == 2) { /* {n} */
3203 	if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3204 	  goto possessive_check;
3205 
3206 	goto greedy_check;
3207       }
3208       /* r == 1 : normal char */
3209       break;
3210 
3211     case '|':
3212       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break;
3213       tok->type = TK_ALT;
3214       break;
3215 
3216     case '(':
3217       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3218       tok->type = TK_SUBEXP_OPEN;
3219       break;
3220 
3221     case ')':
3222       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3223       tok->type = TK_SUBEXP_CLOSE;
3224       break;
3225 
3226     case 'w':
3227       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3228       tok->type = TK_CHAR_TYPE;
3229       tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3230       tok->u.prop.not   = 0;
3231       break;
3232 
3233     case 'W':
3234       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3235       tok->type = TK_CHAR_TYPE;
3236       tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3237       tok->u.prop.not   = 1;
3238       break;
3239 
3240     case 'b':
3241       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3242       tok->type = TK_ANCHOR;
3243       tok->u.anchor = ANCHOR_WORD_BOUND;
3244       break;
3245 
3246     case 'B':
3247       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3248       tok->type = TK_ANCHOR;
3249       tok->u.anchor = ANCHOR_NOT_WORD_BOUND;
3250       break;
3251 
3252 #ifdef USE_WORD_BEGIN_END
3253     case '<':
3254       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3255       tok->type = TK_ANCHOR;
3256       tok->u.anchor = ANCHOR_WORD_BEGIN;
3257       break;
3258 
3259     case '>':
3260       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3261       tok->type = TK_ANCHOR;
3262       tok->u.anchor = ANCHOR_WORD_END;
3263       break;
3264 #endif
3265 
3266     case 's':
3267       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3268       tok->type = TK_CHAR_TYPE;
3269       tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3270       tok->u.prop.not   = 0;
3271       break;
3272 
3273     case 'S':
3274       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3275       tok->type = TK_CHAR_TYPE;
3276       tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3277       tok->u.prop.not   = 1;
3278       break;
3279 
3280     case 'd':
3281       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3282       tok->type = TK_CHAR_TYPE;
3283       tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3284       tok->u.prop.not   = 0;
3285       break;
3286 
3287     case 'D':
3288       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3289       tok->type = TK_CHAR_TYPE;
3290       tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3291       tok->u.prop.not   = 1;
3292       break;
3293 
3294     case 'h':
3295       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3296       tok->type = TK_CHAR_TYPE;
3297       tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3298       tok->u.prop.not   = 0;
3299       break;
3300 
3301     case 'H':
3302       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3303       tok->type = TK_CHAR_TYPE;
3304       tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3305       tok->u.prop.not   = 1;
3306       break;
3307 
3308     case 'A':
3309       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3310     begin_buf:
3311       tok->type = TK_ANCHOR;
3312       tok->u.subtype = ANCHOR_BEGIN_BUF;
3313       break;
3314 
3315     case 'Z':
3316       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3317       tok->type = TK_ANCHOR;
3318       tok->u.subtype = ANCHOR_SEMI_END_BUF;
3319       break;
3320 
3321     case 'z':
3322       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3323     end_buf:
3324       tok->type = TK_ANCHOR;
3325       tok->u.subtype = ANCHOR_END_BUF;
3326       break;
3327 
3328     case 'G':
3329       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break;
3330       tok->type = TK_ANCHOR;
3331       tok->u.subtype = ANCHOR_BEGIN_POSITION;
3332       break;
3333 
3334     case '`':
3335       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3336       goto begin_buf;
3337       break;
3338 
3339     case '\'':
3340       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3341       goto end_buf;
3342       break;
3343 
3344     case 'x':
3345       if (PEND) break;
3346 
3347       prev = p;
3348       if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3349 	PINC;
3350 	num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
3351 	if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3352 	if (!PEND) {
3353           if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK))
3354             return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3355         }
3356 
3357 	if ((p > prev + enclen(enc, prev)) && !PEND && PPEEK_IS('}')) {
3358 	  PINC;
3359 	  tok->type   = TK_CODE_POINT;
3360 	  tok->u.code = (OnigCodePoint )num;
3361 	}
3362 	else {
3363 	  /* can't read nothing or invalid format */
3364 	  p = prev;
3365 	}
3366       }
3367       else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3368 	num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
3369 	if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3370 	if (p == prev) {  /* can't read nothing. */
3371 	  num = 0; /* but, it's not error */
3372 	}
3373 	tok->type = TK_RAW_BYTE;
3374 	tok->base = 16;
3375 	tok->u.c  = num;
3376       }
3377       break;
3378 
3379     case 'u':
3380       if (PEND) break;
3381 
3382       prev = p;
3383       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3384 	num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
3385 	if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3386 	if (p == prev) {  /* can't read nothing. */
3387 	  num = 0; /* but, it's not error */
3388 	}
3389 	tok->type   = TK_CODE_POINT;
3390 	tok->base   = 16;
3391 	tok->u.code = (OnigCodePoint )num;
3392       }
3393       break;
3394 
3395     case '1': case '2': case '3': case '4':
3396     case '5': case '6': case '7': case '8': case '9':
3397       PUNFETCH;
3398       prev = p;
3399       num = onig_scan_unsigned_number(&p, end, enc);
3400       if (num < 0 || num > ONIG_MAX_BACKREF_NUM) {
3401         goto skip_backref;
3402       }
3403 
3404       if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) &&
3405 	  (num <= env->num_mem || num <= 9)) { /* This spec. from GNU regex */
3406 	if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3407 	  if (num > env->num_mem || IS_NULL(SCANENV_MEM_NODES(env)[num]))
3408 	    return ONIGERR_INVALID_BACKREF;
3409 	}
3410 
3411 	tok->type = TK_BACKREF;
3412 	tok->u.backref.num     = 1;
3413 	tok->u.backref.ref1    = num;
3414 	tok->u.backref.by_name = 0;
3415 #ifdef USE_BACKREF_WITH_LEVEL
3416 	tok->u.backref.exist_level = 0;
3417 #endif
3418 	break;
3419       }
3420 
3421     skip_backref:
3422       if (c == '8' || c == '9') {
3423 	/* normal char */
3424 	p = prev; PINC;
3425 	break;
3426       }
3427 
3428       p = prev;
3429       /* fall through */
3430     case '0':
3431       if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
3432 	prev = p;
3433 	num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc);
3434 	if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3435 	if (p == prev) {  /* can't read nothing. */
3436 	  num = 0; /* but, it's not error */
3437 	}
3438 	tok->type = TK_RAW_BYTE;
3439 	tok->base = 8;
3440 	tok->u.c  = num;
3441       }
3442       else if (c != '0') {
3443 	PINC;
3444       }
3445       break;
3446 
3447 #ifdef USE_NAMED_GROUP
3448     case 'k':
3449       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) {
3450 	PFETCH(c);
3451 	if (c == '<' || c == '\'') {
3452 	  UChar* name_end;
3453 	  int* backs;
3454 	  int back_num;
3455 
3456 	  prev = p;
3457 
3458 #ifdef USE_BACKREF_WITH_LEVEL
3459 	  name_end = NULL_UCHARP; /* no need. escape gcc warning. */
3460 	  r = fetch_name_with_level((OnigCodePoint )c, &p, end, &name_end,
3461 				    env, &back_num, &tok->u.backref.level);
3462 	  if (r == 1) tok->u.backref.exist_level = 1;
3463 	  else        tok->u.backref.exist_level = 0;
3464 #else
3465 	  r = fetch_name(&p, end, &name_end, env, &back_num, 1);
3466 #endif
3467 	  if (r < 0) return r;
3468 
3469 	  if (back_num != 0) {
3470 	    if (back_num < 0) {
3471 	      back_num = BACKREF_REL_TO_ABS(back_num, env);
3472 	      if (back_num <= 0)
3473 		return ONIGERR_INVALID_BACKREF;
3474 	    }
3475 
3476 	    if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3477 	      if (back_num > env->num_mem ||
3478 		  IS_NULL(SCANENV_MEM_NODES(env)[back_num]))
3479 		return ONIGERR_INVALID_BACKREF;
3480 	    }
3481 	    tok->type = TK_BACKREF;
3482 	    tok->u.backref.by_name = 0;
3483 	    tok->u.backref.num  = 1;
3484 	    tok->u.backref.ref1 = back_num;
3485 	  }
3486 	  else {
3487 	    num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs);
3488 	    if (num <= 0) {
3489 	      onig_scan_env_set_error_string(env,
3490 			     ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end);
3491 	      return ONIGERR_UNDEFINED_NAME_REFERENCE;
3492 	    }
3493 	    if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3494 	      int i;
3495 	      for (i = 0; i < num; i++) {
3496 		if (backs[i] > env->num_mem ||
3497 		    IS_NULL(SCANENV_MEM_NODES(env)[backs[i]]))
3498 		  return ONIGERR_INVALID_BACKREF;
3499 	      }
3500 	    }
3501 
3502 	    tok->type = TK_BACKREF;
3503 	    tok->u.backref.by_name = 1;
3504 	    if (num == 1) {
3505 	      tok->u.backref.num  = 1;
3506 	      tok->u.backref.ref1 = backs[0];
3507 	    }
3508 	    else {
3509 	      tok->u.backref.num  = num;
3510 	      tok->u.backref.refs = backs;
3511 	    }
3512 	  }
3513 	}
3514 	else
3515 	  PUNFETCH;
3516       }
3517       break;
3518 #endif
3519 
3520 #ifdef USE_SUBEXP_CALL
3521     case 'g':
3522       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) {
3523 	PFETCH(c);
3524 	if (c == '<' || c == '\'') {
3525 	  int gnum;
3526 	  UChar* name_end;
3527 
3528 	  prev = p;
3529 	  r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &gnum, 1);
3530 	  if (r < 0) return r;
3531 
3532 	  tok->type = TK_CALL;
3533 	  tok->u.call.name     = prev;
3534 	  tok->u.call.name_end = name_end;
3535 	  tok->u.call.gnum     = gnum;
3536 	}
3537 	else
3538 	  PUNFETCH;
3539       }
3540       break;
3541 #endif
3542 
3543     case 'Q':
3544       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) {
3545 	tok->type = TK_QUOTE_OPEN;
3546       }
3547       break;
3548 
3549     case 'p':
3550     case 'P':
3551       if (PPEEK_IS('{') &&
3552 	  IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
3553 	PINC;
3554 	tok->type = TK_CHAR_PROPERTY;
3555 	tok->u.prop.not = (c == 'P' ? 1 : 0);
3556 
3557 	if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
3558 	  PFETCH(c);
3559 	  if (c == '^') {
3560 	    tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
3561 	  }
3562 	  else
3563 	    PUNFETCH;
3564 	}
3565       }
3566       break;
3567 
3568     default:
3569       PUNFETCH;
3570       num = fetch_escaped_value(&p, end, env);
3571       if (num < 0) return num;
3572       /* set_raw: */
3573       if (tok->u.c != num) {
3574 	tok->type = TK_CODE_POINT;
3575 	tok->u.code = (OnigCodePoint )num;
3576       }
3577       else { /* string */
3578 	p = tok->backp + enclen(enc, tok->backp);
3579       }
3580       break;
3581     }
3582   }
3583   else {
3584     tok->u.c = c;
3585     tok->escaped = 0;
3586 
3587 #ifdef USE_VARIABLE_META_CHARS
3588     if ((c != ONIG_INEFFECTIVE_META_CHAR) &&
3589 	IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) {
3590       if (c == MC_ANYCHAR(syn))
3591 	goto any_char;
3592       else if (c == MC_ANYTIME(syn))
3593 	goto anytime;
3594       else if (c == MC_ZERO_OR_ONE_TIME(syn))
3595 	goto zero_or_one_time;
3596       else if (c == MC_ONE_OR_MORE_TIME(syn))
3597 	goto one_or_more_time;
3598       else if (c == MC_ANYCHAR_ANYTIME(syn)) {
3599 	tok->type = TK_ANYCHAR_ANYTIME;
3600 	goto out;
3601       }
3602     }
3603 #endif
3604 
3605     switch (c) {
3606     case '.':
3607       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break;
3608 #ifdef USE_VARIABLE_META_CHARS
3609     any_char:
3610 #endif
3611       tok->type = TK_ANYCHAR;
3612       break;
3613 
3614     case '*':
3615       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break;
3616 #ifdef USE_VARIABLE_META_CHARS
3617     anytime:
3618 #endif
3619       tok->type = TK_OP_REPEAT;
3620       tok->u.repeat.lower = 0;
3621       tok->u.repeat.upper = REPEAT_INFINITE;
3622       goto greedy_check;
3623       break;
3624 
3625     case '+':
3626       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break;
3627 #ifdef USE_VARIABLE_META_CHARS
3628     one_or_more_time:
3629 #endif
3630       tok->type = TK_OP_REPEAT;
3631       tok->u.repeat.lower = 1;
3632       tok->u.repeat.upper = REPEAT_INFINITE;
3633       goto greedy_check;
3634       break;
3635 
3636     case '?':
3637       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break;
3638 #ifdef USE_VARIABLE_META_CHARS
3639     zero_or_one_time:
3640 #endif
3641       tok->type = TK_OP_REPEAT;
3642       tok->u.repeat.lower = 0;
3643       tok->u.repeat.upper = 1;
3644       goto greedy_check;
3645       break;
3646 
3647     case '{':
3648       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break;
3649       r = fetch_range_quantifier(&p, end, tok, env);
3650       if (r < 0) return r;  /* error */
3651       if (r == 0) goto greedy_check;
3652       else if (r == 2) { /* {n} */
3653 	if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3654 	  goto possessive_check;
3655 
3656 	goto greedy_check;
3657       }
3658       /* r == 1 : normal char */
3659       break;
3660 
3661     case '|':
3662       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break;
3663       tok->type = TK_ALT;
3664       break;
3665 
3666     case '(':
3667       if (PPEEK_IS('?') &&
3668           IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
3669         PINC;
3670         if (PPEEK_IS('#')) {
3671           PFETCH(c);
3672           while (1) {
3673             if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
3674             PFETCH(c);
3675             if (c == MC_ESC(syn)) {
3676               if (!PEND) PFETCH(c);
3677             }
3678             else {
3679               if (c == ')') break;
3680             }
3681           }
3682           goto start;
3683         }
3684         PUNFETCH;
3685       }
3686 
3687       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3688       tok->type = TK_SUBEXP_OPEN;
3689       break;
3690 
3691     case ')':
3692       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3693       tok->type = TK_SUBEXP_CLOSE;
3694       break;
3695 
3696     case '^':
3697       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
3698       tok->type = TK_ANCHOR;
3699       tok->u.subtype = (IS_SINGLELINE(env->option)
3700 			? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE);
3701       break;
3702 
3703     case '$':
3704       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
3705       tok->type = TK_ANCHOR;
3706       tok->u.subtype = (IS_SINGLELINE(env->option)
3707 			? ANCHOR_SEMI_END_BUF : ANCHOR_END_LINE);
3708       break;
3709 
3710     case '[':
3711       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break;
3712       tok->type = TK_CC_OPEN;
3713       break;
3714 
3715     case ']':
3716       if (*src > env->pattern)   /* /].../ is allowed. */
3717 	CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]");
3718       break;
3719 
3720     case '#':
3721       if (IS_EXTEND(env->option)) {
3722 	while (!PEND) {
3723 	  PFETCH(c);
3724 	  if (ONIGENC_IS_CODE_NEWLINE(enc, c))
3725 	    break;
3726 	}
3727 	goto start;
3728 	break;
3729       }
3730       break;
3731 
3732     case ' ': case '\t': case '\n': case '\r': case '\f':
3733       if (IS_EXTEND(env->option))
3734 	goto start;
3735       break;
3736 
3737     default:
3738       /* string */
3739       break;
3740     }
3741   }
3742 
3743 #ifdef USE_VARIABLE_META_CHARS
3744  out:
3745 #endif
3746   *src = p;
3747   return tok->type;
3748 }
3749 
3750 static int
add_ctype_to_cc_by_range(CClassNode * cc,int ctype ARG_UNUSED,int not,OnigEncoding enc ARG_UNUSED,OnigCodePoint sb_out,const OnigCodePoint mbr[])3751 add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not,
3752 			 OnigEncoding enc ARG_UNUSED,
3753                          OnigCodePoint sb_out, const OnigCodePoint mbr[])
3754 {
3755   int i, r;
3756   OnigCodePoint j;
3757 
3758   int n = ONIGENC_CODE_RANGE_NUM(mbr);
3759 
3760   if (not == 0) {
3761     for (i = 0; i < n; i++) {
3762       for (j  = ONIGENC_CODE_RANGE_FROM(mbr, i);
3763            j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) {
3764 	if (j >= sb_out) {
3765 	  if (j == ONIGENC_CODE_RANGE_TO(mbr, i)) i++;
3766 	  else if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) {
3767 	    r = add_code_range_to_buf(&(cc->mbuf), j,
3768 				      ONIGENC_CODE_RANGE_TO(mbr, i));
3769 	    if (r != 0) return r;
3770 	    i++;
3771 	  }
3772 
3773 	  goto sb_end;
3774 	}
3775         BITSET_SET_BIT(cc->bs, j);
3776       }
3777     }
3778 
3779   sb_end:
3780     for ( ; i < n; i++) {
3781       r = add_code_range_to_buf(&(cc->mbuf),
3782                                 ONIGENC_CODE_RANGE_FROM(mbr, i),
3783                                 ONIGENC_CODE_RANGE_TO(mbr, i));
3784       if (r != 0) return r;
3785     }
3786   }
3787   else {
3788     OnigCodePoint prev = 0;
3789 
3790     for (i = 0; i < n; i++) {
3791       for (j = prev;
3792 	   j < ONIGENC_CODE_RANGE_FROM(mbr, i); j++) {
3793 	if (j >= sb_out) {
3794 	  goto sb_end2;
3795 	}
3796 	BITSET_SET_BIT(cc->bs, j);
3797       }
3798       prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
3799     }
3800     for (j = prev; j < sb_out; j++) {
3801       BITSET_SET_BIT(cc->bs, j);
3802     }
3803 
3804   sb_end2:
3805     prev = sb_out;
3806 
3807     for (i = 0; i < n; i++) {
3808       if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) {
3809 	r = add_code_range_to_buf(&(cc->mbuf), prev,
3810                                   ONIGENC_CODE_RANGE_FROM(mbr, i) - 1);
3811 	if (r != 0) return r;
3812       }
3813       prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
3814     }
3815     if (prev < 0x7fffffff) {
3816       r = add_code_range_to_buf(&(cc->mbuf), prev, 0x7fffffff);
3817       if (r != 0) return r;
3818     }
3819   }
3820 
3821   return 0;
3822 }
3823 
3824 static int
add_ctype_to_cc(CClassNode * cc,int ctype,int not,ScanEnv * env)3825 add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env)
3826 {
3827   int c, r;
3828   const OnigCodePoint *ranges;
3829   OnigCodePoint sb_out;
3830   OnigEncoding enc = env->enc;
3831 
3832   r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges);
3833   if (r == 0) {
3834     return add_ctype_to_cc_by_range(cc, ctype, not, env->enc, sb_out, ranges);
3835   }
3836   else if (r != ONIG_NO_SUPPORT_CONFIG) {
3837     return r;
3838   }
3839 
3840   r = 0;
3841   switch (ctype) {
3842   case ONIGENC_CTYPE_ALPHA:
3843   case ONIGENC_CTYPE_BLANK:
3844   case ONIGENC_CTYPE_CNTRL:
3845   case ONIGENC_CTYPE_DIGIT:
3846   case ONIGENC_CTYPE_LOWER:
3847   case ONIGENC_CTYPE_PUNCT:
3848   case ONIGENC_CTYPE_SPACE:
3849   case ONIGENC_CTYPE_UPPER:
3850   case ONIGENC_CTYPE_XDIGIT:
3851   case ONIGENC_CTYPE_ASCII:
3852   case ONIGENC_CTYPE_ALNUM:
3853     if (not != 0) {
3854       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3855 	if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3856 	  BITSET_SET_BIT(cc->bs, c);
3857       }
3858       ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3859     }
3860     else {
3861       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3862 	if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3863 	  BITSET_SET_BIT(cc->bs, c);
3864       }
3865     }
3866     break;
3867 
3868   case ONIGENC_CTYPE_GRAPH:
3869   case ONIGENC_CTYPE_PRINT:
3870     if (not != 0) {
3871       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3872 	if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3873 	  BITSET_SET_BIT(cc->bs, c);
3874       }
3875     }
3876     else {
3877       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3878 	if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3879 	  BITSET_SET_BIT(cc->bs, c);
3880       }
3881       ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3882     }
3883     break;
3884 
3885   case ONIGENC_CTYPE_WORD:
3886     if (not == 0) {
3887       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3888 	if (IS_CODE_SB_WORD(enc, c)) BITSET_SET_BIT(cc->bs, c);
3889       }
3890       ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3891     }
3892     else {
3893       for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3894         if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) /* check invalid code point */
3895 	    && ! ONIGENC_IS_CODE_WORD(enc, c))
3896 	  BITSET_SET_BIT(cc->bs, c);
3897       }
3898     }
3899     break;
3900 
3901   default:
3902     return ONIGERR_PARSER_BUG;
3903     break;
3904   }
3905 
3906   return r;
3907 }
3908 
3909 static int
parse_posix_bracket(CClassNode * cc,UChar ** src,UChar * end,ScanEnv * env)3910 parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env)
3911 {
3912 #define POSIX_BRACKET_CHECK_LIMIT_LENGTH  20
3913 #define POSIX_BRACKET_NAME_MIN_LEN         4
3914 
3915   static PosixBracketEntryType PBS[] = {
3916     { (UChar* )"alnum",  ONIGENC_CTYPE_ALNUM,  5 },
3917     { (UChar* )"alpha",  ONIGENC_CTYPE_ALPHA,  5 },
3918     { (UChar* )"blank",  ONIGENC_CTYPE_BLANK,  5 },
3919     { (UChar* )"cntrl",  ONIGENC_CTYPE_CNTRL,  5 },
3920     { (UChar* )"digit",  ONIGENC_CTYPE_DIGIT,  5 },
3921     { (UChar* )"graph",  ONIGENC_CTYPE_GRAPH,  5 },
3922     { (UChar* )"lower",  ONIGENC_CTYPE_LOWER,  5 },
3923     { (UChar* )"print",  ONIGENC_CTYPE_PRINT,  5 },
3924     { (UChar* )"punct",  ONIGENC_CTYPE_PUNCT,  5 },
3925     { (UChar* )"space",  ONIGENC_CTYPE_SPACE,  5 },
3926     { (UChar* )"upper",  ONIGENC_CTYPE_UPPER,  5 },
3927     { (UChar* )"xdigit", ONIGENC_CTYPE_XDIGIT, 6 },
3928     { (UChar* )"ascii",  ONIGENC_CTYPE_ASCII,  5 },
3929     { (UChar* )"word",   ONIGENC_CTYPE_WORD,   4 },
3930     { (UChar* )NULL,     -1, 0 }
3931   };
3932 
3933   PosixBracketEntryType *pb;
3934   int not, i, r;
3935   OnigCodePoint c;
3936   OnigEncoding enc = env->enc;
3937   UChar *p = *src;
3938   PFETCH_READY;
3939 
3940   if (PPEEK_IS('^')) {
3941     PINC;
3942     not = 1;
3943   }
3944   else
3945     not = 0;
3946 
3947   if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MIN_LEN + 3)
3948     goto not_posix_bracket;
3949 
3950   for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
3951     if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) {
3952       p = (UChar* )onigenc_step(enc, p, end, pb->len);
3953       if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0)
3954 	return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
3955 
3956       r = add_ctype_to_cc(cc, pb->ctype, not, env);
3957       if (r != 0) return r;
3958 
3959       PINC; PINC;
3960       *src = p;
3961       return 0;
3962     }
3963   }
3964 
3965  not_posix_bracket:
3966   c = 0;
3967   i = 0;
3968   while (!PEND && ((c = PPEEK) != ':') && c != ']') {
3969     PINC;
3970     if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break;
3971   }
3972   if (c == ':' && ! PEND) {
3973     PINC;
3974     if (! PEND) {
3975       PFETCH(c);
3976       if (c == ']')
3977 	return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
3978     }
3979   }
3980 
3981   return 1;  /* 1: is not POSIX bracket, but no error. */
3982 }
3983 
3984 static int
fetch_char_property_to_ctype(UChar ** src,UChar * end,ScanEnv * env)3985 fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env)
3986 {
3987   int r;
3988   OnigCodePoint c;
3989   OnigEncoding enc = env->enc;
3990   UChar *prev, *start, *p = *src;
3991   PFETCH_READY;
3992 
3993   r = 0;
3994   start = prev = p;
3995 
3996   while (!PEND) {
3997     prev = p;
3998     PFETCH(c);
3999     if (c == '}') {
4000       r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev);
4001       if (r < 0) break;
4002 
4003       *src = p;
4004       return r;
4005     }
4006     else if (c == '(' || c == ')' || c == '{' || c == '|') {
4007       r = ONIGERR_INVALID_CHAR_PROPERTY_NAME;
4008       break;
4009     }
4010   }
4011 
4012   onig_scan_env_set_error_string(env, r, *src, prev);
4013   return r;
4014 }
4015 
4016 static int
parse_char_property(Node ** np,OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)4017 parse_char_property(Node** np, OnigToken* tok, UChar** src, UChar* end,
4018 		    ScanEnv* env)
4019 {
4020   int r, ctype;
4021   CClassNode* cc;
4022 
4023   ctype = fetch_char_property_to_ctype(src, end, env);
4024   if (ctype < 0) return ctype;
4025 
4026   *np = node_new_cclass();
4027   CHECK_NULL_RETURN_MEMERR(*np);
4028   cc = NCCLASS(*np);
4029   r = add_ctype_to_cc(cc, ctype, 0, env);
4030   if (r != 0) return r;
4031   if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
4032 
4033   return 0;
4034 }
4035 
4036 
4037 enum CCSTATE {
4038   CCS_VALUE,
4039   CCS_RANGE,
4040   CCS_COMPLETE,
4041   CCS_START
4042 };
4043 
4044 enum CCVALTYPE {
4045   CCV_SB,
4046   CCV_CODE_POINT,
4047   CCV_CLASS
4048 };
4049 
4050 static int
next_state_class(CClassNode * cc,OnigCodePoint * vs,enum CCVALTYPE * type,enum CCSTATE * state,ScanEnv * env)4051 next_state_class(CClassNode* cc, OnigCodePoint* vs, enum CCVALTYPE* type,
4052 		 enum CCSTATE* state, ScanEnv* env)
4053 {
4054   int r;
4055 
4056   if (*state == CCS_RANGE)
4057     return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE;
4058 
4059   if (*state == CCS_VALUE && *type != CCV_CLASS) {
4060     if (*type == CCV_SB)
4061       BITSET_SET_BIT(cc->bs, (int )(*vs));
4062     else if (*type == CCV_CODE_POINT) {
4063       r = add_code_range(&(cc->mbuf), env, *vs, *vs);
4064       if (r < 0) return r;
4065     }
4066   }
4067 
4068   *state = CCS_VALUE;
4069   *type  = CCV_CLASS;
4070   return 0;
4071 }
4072 
4073 static int
next_state_val(CClassNode * cc,OnigCodePoint * vs,OnigCodePoint v,int * vs_israw,int v_israw,enum CCVALTYPE intype,enum CCVALTYPE * type,enum CCSTATE * state,ScanEnv * env)4074 next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v,
4075 	       int* vs_israw, int v_israw,
4076 	       enum CCVALTYPE intype, enum CCVALTYPE* type,
4077 	       enum CCSTATE* state, ScanEnv* env)
4078 {
4079   int r;
4080 
4081   switch (*state) {
4082   case CCS_VALUE:
4083     if (*type == CCV_SB)
4084       BITSET_SET_BIT(cc->bs, (int )(*vs));
4085     else if (*type == CCV_CODE_POINT) {
4086       r = add_code_range(&(cc->mbuf), env, *vs, *vs);
4087       if (r < 0) return r;
4088     }
4089     break;
4090 
4091   case CCS_RANGE:
4092     if (intype == *type) {
4093       if (intype == CCV_SB) {
4094         if (*vs > 0xff || v > 0xff)
4095           return ONIGERR_INVALID_CODE_POINT_VALUE;
4096 
4097 	if (*vs > v) {
4098 	  if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
4099 	    goto ccs_range_end;
4100 	  else
4101 	    return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
4102 	}
4103 	bitset_set_range(cc->bs, (int )*vs, (int )v);
4104       }
4105       else {
4106 	r = add_code_range(&(cc->mbuf), env, *vs, v);
4107 	if (r < 0) return r;
4108       }
4109     }
4110     else {
4111 #if 0
4112       if (intype == CCV_CODE_POINT && *type == CCV_SB) {
4113 #endif
4114 	if (*vs > v) {
4115 	  if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
4116 	    goto ccs_range_end;
4117 	  else
4118 	    return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
4119 	}
4120 	bitset_set_range(cc->bs, (int )*vs, (int )(v < 0xff ? v : 0xff));
4121 	r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v);
4122 	if (r < 0) return r;
4123 #if 0
4124       }
4125       else
4126 	return ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE;
4127 #endif
4128     }
4129   ccs_range_end:
4130     *state = CCS_COMPLETE;
4131     break;
4132 
4133   case CCS_COMPLETE:
4134   case CCS_START:
4135     *state = CCS_VALUE;
4136     break;
4137 
4138   default:
4139     break;
4140   }
4141 
4142   *vs_israw = v_israw;
4143   *vs       = v;
4144   *type     = intype;
4145   return 0;
4146 }
4147 
4148 static int
code_exist_check(OnigCodePoint c,UChar * from,UChar * end,int ignore_escaped,ScanEnv * env)4149 code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped,
4150 		 ScanEnv* env)
4151 {
4152   int in_esc;
4153   OnigCodePoint code;
4154   OnigEncoding enc = env->enc;
4155   UChar* p = from;
4156   PFETCH_READY;
4157 
4158   in_esc = 0;
4159   while (! PEND) {
4160     if (ignore_escaped && in_esc) {
4161       in_esc = 0;
4162     }
4163     else {
4164       PFETCH(code);
4165       if (code == c) return 1;
4166       if (code == MC_ESC(env->syntax)) in_esc = 1;
4167     }
4168   }
4169   return 0;
4170 }
4171 
4172 static int
parse_char_class(Node ** np,OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)4173 parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
4174 		 ScanEnv* env)
4175 {
4176   int r, neg, len, fetched, and_start;
4177   OnigCodePoint v, vs;
4178   UChar *p;
4179   Node* node;
4180   CClassNode *cc, *prev_cc;
4181   CClassNode work_cc;
4182 
4183   enum CCSTATE state;
4184   enum CCVALTYPE val_type, in_type;
4185   int val_israw, in_israw;
4186 
4187   prev_cc = (CClassNode* )NULL;
4188   *np = NULL_NODE;
4189   r = fetch_token_in_cc(tok, src, end, env);
4190   if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) {
4191     neg = 1;
4192     r = fetch_token_in_cc(tok, src, end, env);
4193   }
4194   else {
4195     neg = 0;
4196   }
4197 
4198   if (r < 0) return r;
4199   if (r == TK_CC_CLOSE) {
4200     if (! code_exist_check((OnigCodePoint )']',
4201                            *src, env->pattern_end, 1, env))
4202       return ONIGERR_EMPTY_CHAR_CLASS;
4203 
4204     CC_ESC_WARN(env, (UChar* )"]");
4205     r = tok->type = TK_CHAR;  /* allow []...] */
4206   }
4207 
4208   *np = node = node_new_cclass();
4209   CHECK_NULL_RETURN_MEMERR(node);
4210   cc = NCCLASS(node);
4211 
4212   and_start = 0;
4213   state = CCS_START;
4214   p = *src;
4215   while (r != TK_CC_CLOSE) {
4216     fetched = 0;
4217     switch (r) {
4218     case TK_CHAR:
4219       len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c);
4220       if (len > 1) {
4221 	in_type = CCV_CODE_POINT;
4222       }
4223       else if (len < 0) {
4224 	r = len;
4225 	goto err;
4226       }
4227       else {
4228       sb_char:
4229 	in_type = CCV_SB;
4230       }
4231       v = (OnigCodePoint )tok->u.c;
4232       in_israw = 0;
4233       goto val_entry2;
4234       break;
4235 
4236     case TK_RAW_BYTE:
4237       /* tok->base != 0 : octal or hexadec. */
4238       if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) {
4239 	UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4240 	UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN;
4241 	UChar* psave = p;
4242 	int i, base = tok->base;
4243 
4244 	buf[0] = tok->u.c;
4245 	for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) {
4246 	  r = fetch_token_in_cc(tok, &p, end, env);
4247 	  if (r < 0) goto err;
4248 	  if (r != TK_RAW_BYTE || tok->base != base) {
4249 	    fetched = 1;
4250 	    break;
4251 	  }
4252 	  buf[i] = tok->u.c;
4253 	}
4254 
4255 	if (i < ONIGENC_MBC_MINLEN(env->enc)) {
4256 	  r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4257 	  goto err;
4258 	}
4259 
4260 	len = enclen(env->enc, buf);
4261 	if (i < len) {
4262 	  r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4263 	  goto err;
4264 	}
4265 	else if (i > len) { /* fetch back */
4266 	  p = psave;
4267 	  for (i = 1; i < len; i++) {
4268 	    r = fetch_token_in_cc(tok, &p, end, env);
4269 	  }
4270 	  fetched = 0;
4271 	}
4272 
4273 	if (i == 1) {
4274 	  v = (OnigCodePoint )buf[0];
4275 	  goto raw_single;
4276 	}
4277 	else {
4278 	  v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe);
4279 	  in_type = CCV_CODE_POINT;
4280 	}
4281       }
4282       else {
4283 	v = (OnigCodePoint )tok->u.c;
4284       raw_single:
4285 	in_type = CCV_SB;
4286       }
4287       in_israw = 1;
4288       goto val_entry2;
4289       break;
4290 
4291     case TK_CODE_POINT:
4292       v = tok->u.code;
4293       in_israw = 1;
4294     val_entry:
4295       len = ONIGENC_CODE_TO_MBCLEN(env->enc, v);
4296       if (len < 0) {
4297 	r = len;
4298 	goto err;
4299       }
4300       in_type = (len == 1 ? CCV_SB : CCV_CODE_POINT);
4301     val_entry2:
4302       r = next_state_val(cc, &vs, v, &val_israw, in_israw, in_type, &val_type,
4303 			 &state, env);
4304       if (r != 0) goto err;
4305       break;
4306 
4307     case TK_POSIX_BRACKET_OPEN:
4308       r = parse_posix_bracket(cc, &p, end, env);
4309       if (r < 0) goto err;
4310       if (r == 1) {  /* is not POSIX bracket */
4311 	CC_ESC_WARN(env, (UChar* )"[");
4312 	p = tok->backp;
4313 	v = (OnigCodePoint )tok->u.c;
4314 	in_israw = 0;
4315 	goto val_entry;
4316       }
4317       goto next_class;
4318       break;
4319 
4320     case TK_CHAR_TYPE:
4321       r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, env);
4322       if (r != 0) return r;
4323 
4324     next_class:
4325       r = next_state_class(cc, &vs, &val_type, &state, env);
4326       if (r != 0) goto err;
4327       break;
4328 
4329     case TK_CHAR_PROPERTY:
4330       {
4331 	int ctype;
4332 
4333 	ctype = fetch_char_property_to_ctype(&p, end, env);
4334 	if (ctype < 0) return ctype;
4335 	r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, env);
4336 	if (r != 0) return r;
4337 	goto next_class;
4338       }
4339       break;
4340 
4341     case TK_CC_RANGE:
4342       if (state == CCS_VALUE) {
4343 	r = fetch_token_in_cc(tok, &p, end, env);
4344 	if (r < 0) goto err;
4345 	fetched = 1;
4346 	if (r == TK_CC_CLOSE) { /* allow [x-] */
4347 	range_end_val:
4348 	  v = (OnigCodePoint )'-';
4349 	  in_israw = 0;
4350 	  goto val_entry;
4351 	}
4352 	else if (r == TK_CC_AND) {
4353 	  CC_ESC_WARN(env, (UChar* )"-");
4354 	  goto range_end_val;
4355 	}
4356 	state = CCS_RANGE;
4357       }
4358       else if (state == CCS_START) {
4359 	/* [-xa] is allowed */
4360 	v = (OnigCodePoint )tok->u.c;
4361 	in_israw = 0;
4362 
4363 	r = fetch_token_in_cc(tok, &p, end, env);
4364 	if (r < 0) goto err;
4365 	fetched = 1;
4366 	/* [--x] or [a&&-x] is warned. */
4367 	if (r == TK_CC_RANGE || and_start != 0)
4368 	  CC_ESC_WARN(env, (UChar* )"-");
4369 
4370 	goto val_entry;
4371       }
4372       else if (state == CCS_RANGE) {
4373 	CC_ESC_WARN(env, (UChar* )"-");
4374 	goto sb_char;  /* [!--x] is allowed */
4375       }
4376       else { /* CCS_COMPLETE */
4377 	r = fetch_token_in_cc(tok, &p, end, env);
4378 	if (r < 0) goto err;
4379 	fetched = 1;
4380 	if (r == TK_CC_CLOSE) goto range_end_val; /* allow [a-b-] */
4381 	else if (r == TK_CC_AND) {
4382 	  CC_ESC_WARN(env, (UChar* )"-");
4383 	  goto range_end_val;
4384 	}
4385 
4386 	if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) {
4387 	  CC_ESC_WARN(env, (UChar* )"-");
4388 	  goto sb_char;   /* [0-9-a] is allowed as [0-9\-a] */
4389 	}
4390 	r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
4391 	goto err;
4392       }
4393       break;
4394 
4395     case TK_CC_CC_OPEN: /* [ */
4396       {
4397 	Node *anode;
4398 	CClassNode* acc;
4399 
4400 	r = parse_char_class(&anode, tok, &p, end, env);
4401 	if (r != 0) goto cc_open_err;
4402 	acc = NCCLASS(anode);
4403 	r = or_cclass(cc, acc, env->enc);
4404 
4405 	onig_node_free(anode);
4406       cc_open_err:
4407 	if (r != 0) goto err;
4408       }
4409       break;
4410 
4411     case TK_CC_AND: /* && */
4412       {
4413 	if (state == CCS_VALUE) {
4414 	  r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
4415 			     &val_type, &state, env);
4416 	  if (r != 0) goto err;
4417 	}
4418 	/* initialize local variables */
4419 	and_start = 1;
4420 	state = CCS_START;
4421 
4422 	if (IS_NOT_NULL(prev_cc)) {
4423 	  r = and_cclass(prev_cc, cc, env->enc);
4424 	  if (r != 0) goto err;
4425 	  bbuf_free(cc->mbuf);
4426 	}
4427 	else {
4428 	  prev_cc = cc;
4429 	  cc = &work_cc;
4430 	}
4431 	initialize_cclass(cc);
4432       }
4433       break;
4434 
4435     case TK_EOT:
4436       r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS;
4437       goto err;
4438       break;
4439     default:
4440       r = ONIGERR_PARSER_BUG;
4441       goto err;
4442       break;
4443     }
4444 
4445     if (fetched)
4446       r = tok->type;
4447     else {
4448       r = fetch_token_in_cc(tok, &p, end, env);
4449       if (r < 0) goto err;
4450     }
4451   }
4452 
4453   if (state == CCS_VALUE) {
4454     r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
4455 		       &val_type, &state, env);
4456     if (r != 0) goto err;
4457   }
4458 
4459   if (IS_NOT_NULL(prev_cc)) {
4460     r = and_cclass(prev_cc, cc, env->enc);
4461     if (r != 0) goto err;
4462     bbuf_free(cc->mbuf);
4463     cc = prev_cc;
4464   }
4465 
4466   if (neg != 0)
4467     NCCLASS_SET_NOT(cc);
4468   else
4469     NCCLASS_CLEAR_NOT(cc);
4470   if (IS_NCCLASS_NOT(cc) &&
4471       IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) {
4472     int is_empty;
4473 
4474     is_empty = (IS_NULL(cc->mbuf) ? 1 : 0);
4475     if (is_empty != 0)
4476       BITSET_IS_EMPTY(cc->bs, is_empty);
4477 
4478     if (is_empty == 0) {
4479 #define NEWLINE_CODE    0x0a
4480 
4481       if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) {
4482         if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1)
4483           BITSET_SET_BIT(cc->bs, NEWLINE_CODE);
4484         else
4485           add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE);
4486       }
4487     }
4488   }
4489   *src = p;
4490   return 0;
4491 
4492  err:
4493   if (cc != NCCLASS(*np))
4494     bbuf_free(cc->mbuf);
4495   onig_node_free(*np);
4496   return r;
4497 }
4498 
4499 static int parse_subexp(Node** top, OnigToken* tok, int term,
4500 			UChar** src, UChar* end, ScanEnv* env);
4501 
4502 static int
parse_enclose(Node ** np,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)4503 parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
4504 	      ScanEnv* env)
4505 {
4506   int r, num;
4507   Node *target;
4508   OnigOptionType option;
4509   OnigCodePoint c;
4510   OnigEncoding enc = env->enc;
4511 
4512 #ifdef USE_NAMED_GROUP
4513   int list_capture;
4514 #endif
4515 
4516   UChar* p = *src;
4517   PFETCH_READY;
4518 
4519   *np = NULL;
4520   if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
4521 
4522   option = env->option;
4523   if (PPEEK_IS('?') &&
4524       IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
4525     PINC;
4526     if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
4527 
4528     PFETCH(c);
4529     switch (c) {
4530     case ':':   /* (?:...) grouping only */
4531     group:
4532       r = fetch_token(tok, &p, end, env);
4533       if (r < 0) return r;
4534       r = parse_subexp(np, tok, term, &p, end, env);
4535       if (r < 0) return r;
4536       *src = p;
4537       return 1; /* group */
4538       break;
4539 
4540     case '=':
4541       *np = onig_node_new_anchor(ANCHOR_PREC_READ);
4542       break;
4543     case '!':  /*         preceding read */
4544       *np = onig_node_new_anchor(ANCHOR_PREC_READ_NOT);
4545       break;
4546     case '>':            /* (?>...) stop backtrack */
4547       *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
4548       break;
4549 
4550 #ifdef USE_NAMED_GROUP
4551     case '\'':
4552       if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4553 	goto named_group1;
4554       }
4555       else
4556 	return ONIGERR_UNDEFINED_GROUP_OPTION;
4557       break;
4558 #endif
4559 
4560     case '<':   /* look behind (?<=...), (?<!...) */
4561       PFETCH(c);
4562       if (c == '=')
4563 	*np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND);
4564       else if (c == '!')
4565 	*np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND_NOT);
4566 #ifdef USE_NAMED_GROUP
4567       else {
4568 	if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4569 	  UChar *name;
4570 	  UChar *name_end;
4571 
4572 	  PUNFETCH;
4573 	  c = '<';
4574 
4575 	named_group1:
4576 	  list_capture = 0;
4577 
4578 	named_group2:
4579 	  name = p;
4580 	  r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0);
4581 	  if (r < 0) return r;
4582 
4583 	  num = scan_env_add_mem_entry(env);
4584 	  if (num < 0) return num;
4585 	  if (list_capture != 0 && num >= (int )BIT_STATUS_BITS_NUM)
4586 	    return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
4587 
4588 	  r = name_add(env->reg, name, name_end, num, env);
4589 	  if (r != 0) return r;
4590 	  *np = node_new_enclose_memory(env->option, 1);
4591 	  CHECK_NULL_RETURN_MEMERR(*np);
4592 	  NENCLOSE(*np)->regnum = num;
4593 	  if (list_capture != 0)
4594 	    BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
4595 	  env->num_named++;
4596 	}
4597 	else {
4598 	  return ONIGERR_UNDEFINED_GROUP_OPTION;
4599 	}
4600       }
4601 #else
4602       else {
4603 	return ONIGERR_UNDEFINED_GROUP_OPTION;
4604       }
4605 #endif
4606       break;
4607 
4608     case '@':
4609       if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) {
4610 #ifdef USE_NAMED_GROUP
4611 	if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4612 	  PFETCH(c);
4613 	  if (c == '<' || c == '\'') {
4614 	    list_capture = 1;
4615 	    goto named_group2; /* (?@<name>...) */
4616 	  }
4617 	  PUNFETCH;
4618 	}
4619 #endif
4620 	*np = node_new_enclose_memory(env->option, 0);
4621 	CHECK_NULL_RETURN_MEMERR(*np);
4622 	num = scan_env_add_mem_entry(env);
4623 	if (num < 0) {
4624 	  onig_node_free(*np);
4625 	  return num;
4626 	}
4627 	else if (num >= (int )BIT_STATUS_BITS_NUM) {
4628 	  onig_node_free(*np);
4629 	  return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
4630 	}
4631 	NENCLOSE(*np)->regnum = num;
4632 	BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
4633       }
4634       else {
4635 	return ONIGERR_UNDEFINED_GROUP_OPTION;
4636       }
4637       break;
4638 
4639 #ifdef USE_POSIXLINE_OPTION
4640     case 'p':
4641 #endif
4642     case '-': case 'i': case 'm': case 's': case 'x':
4643       {
4644 	int neg = 0;
4645 
4646 	while (1) {
4647 	  switch (c) {
4648 	  case ':':
4649 	  case ')':
4650 	  break;
4651 
4652 	  case '-':  neg = 1; break;
4653 	  case 'x':  ONOFF(option, ONIG_OPTION_EXTEND,     neg); break;
4654 	  case 'i':  ONOFF(option, ONIG_OPTION_IGNORECASE, neg); break;
4655 	  case 's':
4656 	    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
4657 	      ONOFF(option, ONIG_OPTION_MULTILINE,  neg);
4658 	    }
4659 	    else
4660 	      return ONIGERR_UNDEFINED_GROUP_OPTION;
4661 	    break;
4662 
4663 	  case 'm':
4664 	    if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
4665 	      ONOFF(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 1 : 0));
4666 	    }
4667 	    else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) {
4668 	      ONOFF(option, ONIG_OPTION_MULTILINE,  neg);
4669 	    }
4670 	    else
4671 	      return ONIGERR_UNDEFINED_GROUP_OPTION;
4672 	    break;
4673 #ifdef USE_POSIXLINE_OPTION
4674 	  case 'p':
4675 	    ONOFF(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg);
4676 	    break;
4677 #endif
4678 	  default:
4679 	    return ONIGERR_UNDEFINED_GROUP_OPTION;
4680 	  }
4681 
4682 	  if (c == ')') {
4683 	    *np = node_new_option(option);
4684 	    CHECK_NULL_RETURN_MEMERR(*np);
4685 	    *src = p;
4686 	    return 2; /* option only */
4687 	  }
4688 	  else if (c == ':') {
4689 	    OnigOptionType prev = env->option;
4690 
4691 	    env->option     = option;
4692 	    r = fetch_token(tok, &p, end, env);
4693 	    if (r < 0) return r;
4694 	    r = parse_subexp(&target, tok, term, &p, end, env);
4695 	    env->option = prev;
4696 	    if (r < 0) return r;
4697 	    *np = node_new_option(option);
4698 	    CHECK_NULL_RETURN_MEMERR(*np);
4699 	    NENCLOSE(*np)->target = target;
4700 	    *src = p;
4701 	    return 0;
4702 	  }
4703 
4704 	  if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
4705 	  PFETCH(c);
4706 	}
4707       }
4708       break;
4709 
4710     default:
4711       return ONIGERR_UNDEFINED_GROUP_OPTION;
4712     }
4713   }
4714   else {
4715     if (ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_DONT_CAPTURE_GROUP))
4716       goto group;
4717 
4718     *np = node_new_enclose_memory(env->option, 0);
4719     CHECK_NULL_RETURN_MEMERR(*np);
4720     num = scan_env_add_mem_entry(env);
4721     if (num < 0) return num;
4722     NENCLOSE(*np)->regnum = num;
4723   }
4724 
4725   CHECK_NULL_RETURN_MEMERR(*np);
4726   r = fetch_token(tok, &p, end, env);
4727   if (r < 0) return r;
4728   r = parse_subexp(&target, tok, term, &p, end, env);
4729   if (r < 0) return r;
4730 
4731   if (NTYPE(*np) == NT_ANCHOR)
4732     NANCHOR(*np)->target = target;
4733   else {
4734     NENCLOSE(*np)->target = target;
4735     if (NENCLOSE(*np)->type == ENCLOSE_MEMORY) {
4736       /* Don't move this to previous of parse_subexp() */
4737       r = scan_env_set_mem_node(env, NENCLOSE(*np)->regnum, *np);
4738       if (r != 0) return r;
4739     }
4740   }
4741 
4742   *src = p;
4743   return 0;
4744 }
4745 
4746 static const char* PopularQStr[] = {
4747   "?", "*", "+", "??", "*?", "+?"
4748 };
4749 
4750 static const char* ReduceQStr[] = {
4751   "", "", "*", "*?", "??", "+ and ??", "+? and ?"
4752 };
4753 
4754 static int
set_quantifier(Node * qnode,Node * target,int group,ScanEnv * env)4755 set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env)
4756 {
4757   QtfrNode* qn;
4758 
4759   qn = NQTFR(qnode);
4760   if (qn->lower == 1 && qn->upper == 1) {
4761     return 1;
4762   }
4763 
4764   switch (NTYPE(target)) {
4765   case NT_STR:
4766     if (! group) {
4767       StrNode* sn = NSTR(target);
4768       if (str_node_can_be_split(sn, env->enc)) {
4769 	Node* n = str_node_split_last_char(sn, env->enc);
4770 	if (IS_NOT_NULL(n)) {
4771 	  qn->target = n;
4772 	  return 2;
4773 	}
4774       }
4775     }
4776     break;
4777 
4778   case NT_QTFR:
4779     { /* check redundant double repeat. */
4780       /* verbose warn (?:.?)? etc... but not warn (.?)? etc... */
4781       QtfrNode* qnt   = NQTFR(target);
4782       int nestq_num   = popular_quantifier_num(qn);
4783       int targetq_num = popular_quantifier_num(qnt);
4784 
4785 #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR
4786       if (!IS_QUANTIFIER_BY_NUMBER(qn) && !IS_QUANTIFIER_BY_NUMBER(qnt) &&
4787 	  IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) {
4788         UChar buf[WARN_BUFSIZE];
4789 
4790         switch(ReduceTypeTable[targetq_num][nestq_num]) {
4791         case RQ_ASIS:
4792           break;
4793 
4794         case RQ_DEL:
4795           if (onig_verb_warn != onig_null_warn) {
4796             onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4797                                  env->pattern, env->pattern_end,
4798                                  (UChar* )"redundant nested repeat operator");
4799             (*onig_verb_warn)((char* )buf);
4800           }
4801           goto warn_exit;
4802           break;
4803 
4804         default:
4805           if (onig_verb_warn != onig_null_warn) {
4806             onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4807                                        env->pattern, env->pattern_end,
4808             (UChar* )"nested repeat operator %s and %s was replaced with '%s'",
4809             PopularQStr[targetq_num], PopularQStr[nestq_num],
4810             ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]);
4811             (*onig_verb_warn)((char* )buf);
4812           }
4813           goto warn_exit;
4814           break;
4815         }
4816       }
4817 
4818     warn_exit:
4819 #endif
4820       if (targetq_num >= 0) {
4821 	if (nestq_num >= 0) {
4822 	  onig_reduce_nested_quantifier(qnode, target);
4823 	  goto q_exit;
4824 	}
4825 	else if (targetq_num == 1 || targetq_num == 2) { /* * or + */
4826 	  /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */
4827 	  if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) {
4828 	    qn->upper = (qn->lower == 0 ? 1 : qn->lower);
4829 	  }
4830 	}
4831       }
4832     }
4833     break;
4834 
4835   default:
4836     break;
4837   }
4838 
4839   qn->target = target;
4840  q_exit:
4841   return 0;
4842 }
4843 
4844 
4845 #ifdef USE_SHARED_CCLASS_TABLE
4846 
4847 #define THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS     8
4848 
4849 /* for ctype node hash table */
4850 
4851 typedef struct {
4852   OnigEncoding enc;
4853   int not;
4854   int type;
4855 } type_cclass_key;
4856 
type_cclass_cmp(type_cclass_key * x,type_cclass_key * y)4857 static int type_cclass_cmp(type_cclass_key* x, type_cclass_key* y)
4858 {
4859   if (x->type != y->type) return 1;
4860   if (x->enc  != y->enc)  return 1;
4861   if (x->not  != y->not)  return 1;
4862   return 0;
4863 }
4864 
type_cclass_hash(type_cclass_key * key)4865 static int type_cclass_hash(type_cclass_key* key)
4866 {
4867   int i, val;
4868   UChar *p;
4869 
4870   val = 0;
4871 
4872   p = (UChar* )&(key->enc);
4873   for (i = 0; i < (int )sizeof(key->enc); i++) {
4874     val = val * 997 + (int )*p++;
4875   }
4876 
4877   p = (UChar* )(&key->type);
4878   for (i = 0; i < (int )sizeof(key->type); i++) {
4879     val = val * 997 + (int )*p++;
4880   }
4881 
4882   val += key->not;
4883   return val + (val >> 5);
4884 }
4885 
4886 static struct st_hash_type type_type_cclass_hash = {
4887     type_cclass_cmp,
4888     type_cclass_hash,
4889 };
4890 
4891 static st_table* OnigTypeCClassTable;
4892 
4893 
4894 static int
i_free_shared_class(type_cclass_key * key,Node * node,void * arg ARG_UNUSED)4895 i_free_shared_class(type_cclass_key* key, Node* node, void* arg ARG_UNUSED)
4896 {
4897   if (IS_NOT_NULL(node)) {
4898     CClassNode* cc = NCCLASS(node);
4899     if (IS_NOT_NULL(cc->mbuf)) xfree(cc->mbuf);
4900     xfree(node);
4901   }
4902 
4903   if (IS_NOT_NULL(key)) xfree(key);
4904   return ST_DELETE;
4905 }
4906 
4907 extern int
onig_free_shared_cclass_table(void)4908 onig_free_shared_cclass_table(void)
4909 {
4910   if (IS_NOT_NULL(OnigTypeCClassTable)) {
4911     onig_st_foreach(OnigTypeCClassTable, i_free_shared_class, 0);
4912     onig_st_free_table(OnigTypeCClassTable);
4913     OnigTypeCClassTable = NULL;
4914   }
4915 
4916   return 0;
4917 }
4918 
4919 #endif /* USE_SHARED_CCLASS_TABLE */
4920 
4921 
4922 #ifndef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
4923 static int
clear_not_flag_cclass(CClassNode * cc,OnigEncoding enc)4924 clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc)
4925 {
4926   BBuf *tbuf;
4927   int r;
4928 
4929   if (IS_NCCLASS_NOT(cc)) {
4930     bitset_invert(cc->bs);
4931 
4932     if (! ONIGENC_IS_SINGLEBYTE(enc)) {
4933       r = not_code_range_buf(enc, cc->mbuf, &tbuf);
4934       if (r != 0) return r;
4935 
4936       bbuf_free(cc->mbuf);
4937       cc->mbuf = tbuf;
4938     }
4939 
4940     NCCLASS_CLEAR_NOT(cc);
4941   }
4942 
4943   return 0;
4944 }
4945 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
4946 
4947 typedef struct {
4948   ScanEnv*    env;
4949   CClassNode* cc;
4950   Node*       alt_root;
4951   Node**      ptail;
4952 } IApplyCaseFoldArg;
4953 
4954 static int
i_apply_case_fold(OnigCodePoint from,OnigCodePoint to[],int to_len,void * arg)4955 i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[],
4956 		  int to_len, void* arg)
4957 {
4958   IApplyCaseFoldArg* iarg;
4959   ScanEnv* env;
4960   CClassNode* cc;
4961   BitSetRef bs;
4962 
4963   iarg = (IApplyCaseFoldArg* )arg;
4964   env = iarg->env;
4965   cc  = iarg->cc;
4966   bs = cc->bs;
4967 
4968   if (to_len == 1) {
4969     int is_in = onig_is_code_in_cc(env->enc, from, cc);
4970 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
4971     if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) ||
4972 	(is_in == 0 &&  IS_NCCLASS_NOT(cc))) {
4973       if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
4974 	add_code_range(&(cc->mbuf), env, *to, *to);
4975       }
4976       else {
4977 	BITSET_SET_BIT(bs, *to);
4978       }
4979     }
4980 #else
4981     if (is_in != 0) {
4982       if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
4983 	if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc);
4984 	add_code_range(&(cc->mbuf), env, *to, *to);
4985       }
4986       else {
4987 	if (IS_NCCLASS_NOT(cc)) {
4988 	  BITSET_CLEAR_BIT(bs, *to);
4989 	}
4990 	else
4991 	  BITSET_SET_BIT(bs, *to);
4992       }
4993     }
4994 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
4995   }
4996   else {
4997     int r, i, len;
4998     UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4999     Node *snode = NULL_NODE;
5000 
5001     if (onig_is_code_in_cc(env->enc, from, cc)
5002 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
5003 	&& !IS_NCCLASS_NOT(cc)
5004 #endif
5005 	) {
5006       for (i = 0; i < to_len; i++) {
5007 	len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf);
5008 	if (i == 0) {
5009 	  snode = onig_node_new_str(buf, buf + len);
5010 	  CHECK_NULL_RETURN_MEMERR(snode);
5011 
5012 	  /* char-class expanded multi-char only
5013 	     compare with string folded at match time. */
5014 	  NSTRING_SET_AMBIG(snode);
5015 	}
5016 	else {
5017 	  r = onig_node_str_cat(snode, buf, buf + len);
5018 	  if (r < 0) {
5019 	    onig_node_free(snode);
5020 	    return r;
5021 	  }
5022 	}
5023       }
5024 
5025       *(iarg->ptail) = onig_node_new_alt(snode, NULL_NODE);
5026       CHECK_NULL_RETURN_MEMERR(*(iarg->ptail));
5027       iarg->ptail = &(NCDR((*(iarg->ptail))));
5028     }
5029   }
5030 
5031   return 0;
5032 }
5033 
5034 static int
parse_exp(Node ** np,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)5035 parse_exp(Node** np, OnigToken* tok, int term,
5036 	  UChar** src, UChar* end, ScanEnv* env)
5037 {
5038   int r, len, group = 0;
5039   Node* qn;
5040   Node** targetp;
5041 
5042   *np = NULL;
5043   if (tok->type == (enum TokenSyms )term)
5044     goto end_of_token;
5045 
5046   switch (tok->type) {
5047   case TK_ALT:
5048   case TK_EOT:
5049   end_of_token:
5050   *np = node_new_empty();
5051   return tok->type;
5052   break;
5053 
5054   case TK_SUBEXP_OPEN:
5055     r = parse_enclose(np, tok, TK_SUBEXP_CLOSE, src, end, env);
5056     if (r < 0) return r;
5057     if (r == 1) group = 1;
5058     else if (r == 2) { /* option only */
5059       Node* target;
5060       OnigOptionType prev = env->option;
5061 
5062       env->option = NENCLOSE(*np)->option;
5063       r = fetch_token(tok, src, end, env);
5064       if (r < 0) return r;
5065       r = parse_subexp(&target, tok, term, src, end, env);
5066       env->option = prev;
5067       if (r < 0) return r;
5068       NENCLOSE(*np)->target = target;
5069       return tok->type;
5070     }
5071     break;
5072 
5073   case TK_SUBEXP_CLOSE:
5074     if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP))
5075       return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS;
5076 
5077     if (tok->escaped) goto tk_raw_byte;
5078     else goto tk_byte;
5079     break;
5080 
5081   case TK_STRING:
5082   tk_byte:
5083     {
5084       *np = node_new_str(tok->backp, *src);
5085       CHECK_NULL_RETURN_MEMERR(*np);
5086 
5087       while (1) {
5088 	r = fetch_token(tok, src, end, env);
5089 	if (r < 0) return r;
5090 	if (r != TK_STRING) break;
5091 
5092 	r = onig_node_str_cat(*np, tok->backp, *src);
5093 	if (r < 0) return r;
5094       }
5095 
5096     string_end:
5097       targetp = np;
5098       goto repeat;
5099     }
5100     break;
5101 
5102   case TK_RAW_BYTE:
5103   tk_raw_byte:
5104     {
5105       *np = node_new_str_raw_char((UChar )tok->u.c);
5106       CHECK_NULL_RETURN_MEMERR(*np);
5107       len = 1;
5108       while (1) {
5109 	if (len >= ONIGENC_MBC_MINLEN(env->enc)) {
5110 	  if (len == enclen(env->enc, NSTR(*np)->s)) {
5111 	    r = fetch_token(tok, src, end, env);
5112 	    NSTRING_CLEAR_RAW(*np);
5113 	    goto string_end;
5114 	  }
5115 	}
5116 
5117 	r = fetch_token(tok, src, end, env);
5118 	if (r < 0) return r;
5119 	if (r != TK_RAW_BYTE) {
5120 	  /* Don't use this, it is wrong for little endian encodings. */
5121 #ifdef USE_PAD_TO_SHORT_BYTE_CHAR
5122 	  int rem;
5123 	  if (len < ONIGENC_MBC_MINLEN(env->enc)) {
5124 	    rem = ONIGENC_MBC_MINLEN(env->enc) - len;
5125 	    (void )node_str_head_pad(NSTR(*np), rem, (UChar )0);
5126 	    if (len + rem == enclen(env->enc, NSTR(*np)->s)) {
5127 	      NSTRING_CLEAR_RAW(*np);
5128 	      goto string_end;
5129 	    }
5130 	  }
5131 #endif
5132 	  return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
5133 	}
5134 
5135 	r = node_str_cat_char(*np, (UChar )tok->u.c);
5136 	if (r < 0) return r;
5137 
5138 	len++;
5139       }
5140     }
5141     break;
5142 
5143   case TK_CODE_POINT:
5144     {
5145       UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
5146       int num = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf);
5147       if (num < 0) return num;
5148 #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
5149       *np = node_new_str_raw(buf, buf + num);
5150 #else
5151       *np = node_new_str(buf, buf + num);
5152 #endif
5153       CHECK_NULL_RETURN_MEMERR(*np);
5154     }
5155     break;
5156 
5157   case TK_QUOTE_OPEN:
5158     {
5159       OnigCodePoint end_op[2];
5160       UChar *qstart, *qend, *nextp;
5161 
5162       end_op[0] = (OnigCodePoint )MC_ESC(env->syntax);
5163       end_op[1] = (OnigCodePoint )'E';
5164       qstart = *src;
5165       qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc);
5166       if (IS_NULL(qend)) {
5167 	nextp = qend = end;
5168       }
5169       *np = node_new_str(qstart, qend);
5170       CHECK_NULL_RETURN_MEMERR(*np);
5171       *src = nextp;
5172     }
5173     break;
5174 
5175   case TK_CHAR_TYPE:
5176     {
5177       switch (tok->u.prop.ctype) {
5178       case ONIGENC_CTYPE_WORD:
5179 	*np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not);
5180 	CHECK_NULL_RETURN_MEMERR(*np);
5181 	break;
5182 
5183       case ONIGENC_CTYPE_SPACE:
5184       case ONIGENC_CTYPE_DIGIT:
5185       case ONIGENC_CTYPE_XDIGIT:
5186 	{
5187 	  CClassNode* cc;
5188 
5189 #ifdef USE_SHARED_CCLASS_TABLE
5190           const OnigCodePoint *mbr;
5191 	  OnigCodePoint sb_out;
5192 
5193           r = ONIGENC_GET_CTYPE_CODE_RANGE(env->enc, tok->u.prop.ctype,
5194 					   &sb_out, &mbr);
5195           if (r == 0 &&
5196               ONIGENC_CODE_RANGE_NUM(mbr)
5197               >= THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS) {
5198             type_cclass_key  key;
5199             type_cclass_key* new_key;
5200 
5201             key.enc  = env->enc;
5202             key.not  = tok->u.prop.not;
5203             key.type = tok->u.prop.ctype;
5204 
5205             THREAD_ATOMIC_START;
5206 
5207             if (IS_NULL(OnigTypeCClassTable)) {
5208               OnigTypeCClassTable
5209                 = onig_st_init_table_with_size(&type_type_cclass_hash, 10);
5210               if (IS_NULL(OnigTypeCClassTable)) {
5211                 THREAD_ATOMIC_END;
5212                 return ONIGERR_MEMORY;
5213               }
5214             }
5215             else {
5216               if (onig_st_lookup(OnigTypeCClassTable, (st_data_t )&key,
5217                                  (st_data_t* )np)) {
5218                 THREAD_ATOMIC_END;
5219                 break;
5220               }
5221             }
5222 
5223             *np = node_new_cclass_by_codepoint_range(tok->u.prop.not,
5224 						     sb_out, mbr);
5225             if (IS_NULL(*np)) {
5226               THREAD_ATOMIC_END;
5227               return ONIGERR_MEMORY;
5228             }
5229 
5230             cc = NCCLASS(*np);
5231             NCCLASS_SET_SHARE(cc);
5232             new_key = (type_cclass_key* )xmalloc(sizeof(type_cclass_key));
5233 	    xmemcpy(new_key, &key, sizeof(type_cclass_key));
5234             onig_st_add_direct(OnigTypeCClassTable, (st_data_t )new_key,
5235                                (st_data_t )*np);
5236 
5237             THREAD_ATOMIC_END;
5238           }
5239           else {
5240 #endif
5241             *np = node_new_cclass();
5242             CHECK_NULL_RETURN_MEMERR(*np);
5243             cc = NCCLASS(*np);
5244             add_ctype_to_cc(cc, tok->u.prop.ctype, 0, env);
5245             if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
5246 #ifdef USE_SHARED_CCLASS_TABLE
5247           }
5248 #endif
5249 	}
5250 	break;
5251 
5252       default:
5253 	return ONIGERR_PARSER_BUG;
5254 	break;
5255       }
5256     }
5257     break;
5258 
5259   case TK_CHAR_PROPERTY:
5260     r = parse_char_property(np, tok, src, end, env);
5261     if (r != 0) return r;
5262     break;
5263 
5264   case TK_CC_OPEN:
5265     {
5266       CClassNode* cc;
5267 
5268       r = parse_char_class(np, tok, src, end, env);
5269       if (r != 0) return r;
5270 
5271       cc = NCCLASS(*np);
5272       if (IS_IGNORECASE(env->option)) {
5273 	IApplyCaseFoldArg iarg;
5274 
5275 	iarg.env      = env;
5276 	iarg.cc       = cc;
5277 	iarg.alt_root = NULL_NODE;
5278 	iarg.ptail    = &(iarg.alt_root);
5279 
5280 	r = ONIGENC_APPLY_ALL_CASE_FOLD(env->enc, env->case_fold_flag,
5281 					i_apply_case_fold, &iarg);
5282 	if (r != 0) {
5283 	  onig_node_free(iarg.alt_root);
5284 	  return r;
5285 	}
5286 	if (IS_NOT_NULL(iarg.alt_root)) {
5287           Node* work = onig_node_new_alt(*np, iarg.alt_root);
5288           if (IS_NULL(work)) {
5289             onig_node_free(iarg.alt_root);
5290             return ONIGERR_MEMORY;
5291           }
5292           *np = work;
5293 	}
5294       }
5295     }
5296     break;
5297 
5298   case TK_ANYCHAR:
5299     *np = node_new_anychar();
5300     CHECK_NULL_RETURN_MEMERR(*np);
5301     break;
5302 
5303   case TK_ANYCHAR_ANYTIME:
5304     *np = node_new_anychar();
5305     CHECK_NULL_RETURN_MEMERR(*np);
5306     qn = node_new_quantifier(0, REPEAT_INFINITE, 0);
5307     CHECK_NULL_RETURN_MEMERR(qn);
5308     NQTFR(qn)->target = *np;
5309     *np = qn;
5310     break;
5311 
5312   case TK_BACKREF:
5313     len = tok->u.backref.num;
5314     *np = node_new_backref(len,
5315 		   (len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)),
5316 			   tok->u.backref.by_name,
5317 #ifdef USE_BACKREF_WITH_LEVEL
5318 			   tok->u.backref.exist_level,
5319 			   tok->u.backref.level,
5320 #endif
5321 			   env);
5322     CHECK_NULL_RETURN_MEMERR(*np);
5323     break;
5324 
5325 #ifdef USE_SUBEXP_CALL
5326   case TK_CALL:
5327     {
5328       int gnum = tok->u.call.gnum;
5329 
5330       if (gnum < 0) {
5331 	gnum = BACKREF_REL_TO_ABS(gnum, env);
5332 	if (gnum <= 0)
5333 	  return ONIGERR_INVALID_BACKREF;
5334       }
5335       *np = node_new_call(tok->u.call.name, tok->u.call.name_end, gnum);
5336       CHECK_NULL_RETURN_MEMERR(*np);
5337       env->num_call++;
5338     }
5339     break;
5340 #endif
5341 
5342   case TK_ANCHOR:
5343     *np = onig_node_new_anchor(tok->u.anchor);
5344     break;
5345 
5346   case TK_OP_REPEAT:
5347   case TK_INTERVAL:
5348     if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) {
5349       if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS))
5350 	return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED;
5351       else
5352 	*np = node_new_empty();
5353     }
5354     else {
5355       goto tk_byte;
5356     }
5357     break;
5358 
5359   default:
5360     return ONIGERR_PARSER_BUG;
5361     break;
5362   }
5363 
5364   {
5365     targetp = np;
5366 
5367   re_entry:
5368     r = fetch_token(tok, src, end, env);
5369     if (r < 0) return r;
5370 
5371   repeat:
5372     if (r == TK_OP_REPEAT || r == TK_INTERVAL) {
5373       if (is_invalid_quantifier_target(*targetp))
5374 	return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID;
5375 
5376       qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper,
5377 			       (r == TK_INTERVAL ? 1 : 0));
5378       CHECK_NULL_RETURN_MEMERR(qn);
5379       NQTFR(qn)->greedy = tok->u.repeat.greedy;
5380       r = set_quantifier(qn, *targetp, group, env);
5381       if (r < 0) {
5382 	onig_node_free(qn);
5383 	return r;
5384       }
5385 
5386       if (tok->u.repeat.possessive != 0) {
5387 	Node* en;
5388 	en = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
5389 	if (IS_NULL(en)) {
5390 	  onig_node_free(qn);
5391 	  return ONIGERR_MEMORY;
5392 	}
5393 	NENCLOSE(en)->target = qn;
5394 	qn = en;
5395       }
5396 
5397       if (r == 0) {
5398 	*targetp = qn;
5399       }
5400       else if (r == 1) {
5401 	onig_node_free(qn);
5402       }
5403       else if (r == 2) { /* split case: /abc+/ */
5404 	Node *tmp;
5405 
5406 	*targetp = node_new_list(*targetp, NULL);
5407 	if (IS_NULL(*targetp)) {
5408 	  onig_node_free(qn);
5409 	  return ONIGERR_MEMORY;
5410 	}
5411 	tmp = NCDR(*targetp) = node_new_list(qn, NULL);
5412 	if (IS_NULL(tmp)) {
5413 	  onig_node_free(qn);
5414 	  return ONIGERR_MEMORY;
5415 	}
5416 	targetp = &(NCAR(tmp));
5417       }
5418       goto re_entry;
5419     }
5420   }
5421 
5422   return r;
5423 }
5424 
5425 static int
parse_branch(Node ** top,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)5426 parse_branch(Node** top, OnigToken* tok, int term,
5427 	     UChar** src, UChar* end, ScanEnv* env)
5428 {
5429   int r;
5430   Node *node, **headp;
5431 
5432   *top = NULL;
5433   r = parse_exp(&node, tok, term, src, end, env);
5434   if (r < 0) return r;
5435 
5436   if (r == TK_EOT || r == term || r == TK_ALT) {
5437     *top = node;
5438   }
5439   else {
5440     *top  = node_new_list(node, NULL);
5441     headp = &(NCDR(*top));
5442     while (r != TK_EOT && r != term && r != TK_ALT) {
5443       r = parse_exp(&node, tok, term, src, end, env);
5444       if (r < 0) return r;
5445 
5446       if (NTYPE(node) == NT_LIST) {
5447 	*headp = node;
5448 	while (IS_NOT_NULL(NCDR(node))) node = NCDR(node);
5449 	headp = &(NCDR(node));
5450       }
5451       else {
5452 	*headp = node_new_list(node, NULL);
5453 	headp = &(NCDR(*headp));
5454       }
5455     }
5456   }
5457 
5458   return r;
5459 }
5460 
5461 /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */
5462 static int
parse_subexp(Node ** top,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)5463 parse_subexp(Node** top, OnigToken* tok, int term,
5464 	     UChar** src, UChar* end, ScanEnv* env)
5465 {
5466   int r;
5467   Node *node, **headp;
5468 
5469   *top = NULL;
5470   r = parse_branch(&node, tok, term, src, end, env);
5471   if (r < 0) {
5472     onig_node_free(node);
5473     return r;
5474   }
5475 
5476   if (r == term) {
5477     *top = node;
5478   }
5479   else if (r == TK_ALT) {
5480     *top  = onig_node_new_alt(node, NULL);
5481     headp = &(NCDR(*top));
5482     while (r == TK_ALT) {
5483       r = fetch_token(tok, src, end, env);
5484       if (r < 0) return r;
5485       r = parse_branch(&node, tok, term, src, end, env);
5486       if (r < 0) return r;
5487 
5488       *headp = onig_node_new_alt(node, NULL);
5489       headp = &(NCDR(*headp));
5490     }
5491 
5492     if (tok->type != (enum TokenSyms )term)
5493       goto err;
5494   }
5495   else {
5496   err:
5497     if (term == TK_SUBEXP_CLOSE)
5498       return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
5499     else
5500       return ONIGERR_PARSER_BUG;
5501   }
5502 
5503   return r;
5504 }
5505 
5506 static int
parse_regexp(Node ** top,UChar ** src,UChar * end,ScanEnv * env)5507 parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env)
5508 {
5509   int r;
5510   OnigToken tok;
5511 
5512   r = fetch_token(&tok, src, end, env);
5513   if (r < 0) return r;
5514   r = parse_subexp(top, &tok, TK_EOT, src, end, env);
5515   if (r < 0) return r;
5516   return 0;
5517 }
5518 
5519 extern int
onig_parse_make_tree(Node ** root,const UChar * pattern,const UChar * end,regex_t * reg,ScanEnv * env)5520 onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end,
5521 		     regex_t* reg, ScanEnv* env)
5522 {
5523   int r;
5524   UChar* p;
5525 
5526 #ifdef USE_NAMED_GROUP
5527   names_clear(reg);
5528 #endif
5529 
5530   scan_env_clear(env);
5531   env->option         = reg->options;
5532   env->case_fold_flag = reg->case_fold_flag;
5533   env->enc            = reg->enc;
5534   env->syntax         = reg->syntax;
5535   env->pattern        = (UChar* )pattern;
5536   env->pattern_end    = (UChar* )end;
5537   env->reg            = reg;
5538 
5539   *root = NULL;
5540   p = (UChar* )pattern;
5541   r = parse_regexp(root, &p, (UChar* )end, env);
5542   reg->num_mem = env->num_mem;
5543   return r;
5544 }
5545 
5546 extern void
onig_scan_env_set_error_string(ScanEnv * env,int ecode ARG_UNUSED,UChar * arg,UChar * arg_end)5547 onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED,
5548 				UChar* arg, UChar* arg_end)
5549 {
5550   env->error     = arg;
5551   env->error_end = arg_end;
5552 }
5553