xref: /PHP-5.5/ext/mbstring/oniguruma/regparse.c (revision 75adcc8d)
1 /**********************************************************************
2   regparse.c -  Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2008  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "regparse.h"
31 #include "st.h"
32 
33 #define WARN_BUFSIZE    256
34 
35 #define CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
36 
37 
38 OnigSyntaxType OnigSyntaxRuby = {
39   (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
40      ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
41      ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS |
42      ONIG_SYN_OP_ESC_C_CONTROL )
43    & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
44   , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
45       ONIG_SYN_OP2_OPTION_RUBY |
46       ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
47       ONIG_SYN_OP2_ESC_G_SUBEXP_CALL |
48       ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY  |
49       ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
50       ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
51       ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
52       ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
53       ONIG_SYN_OP2_ESC_H_XDIGIT )
54   , ( SYN_GNU_REGEX_BV |
55       ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
56       ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
57       ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
58       ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
59       ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY |
60       ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
61       ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
62   , ONIG_OPTION_NONE
63   ,
64   {
65       (OnigCodePoint )'\\'                       /* esc */
66     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.'  */
67     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*'  */
68     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
69     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
70     , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
71   }
72 };
73 
74 OnigSyntaxType*  OnigDefaultSyntax = ONIG_SYNTAX_RUBY;
75 
onig_null_warn(const char * s ARG_UNUSED)76 extern void onig_null_warn(const char* s ARG_UNUSED) { }
77 
78 #ifdef DEFAULT_WARN_FUNCTION
79 static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION;
80 #else
81 static OnigWarnFunc onig_warn = onig_null_warn;
82 #endif
83 
84 #ifdef DEFAULT_VERB_WARN_FUNCTION
85 static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION;
86 #else
87 static OnigWarnFunc onig_verb_warn = onig_null_warn;
88 #endif
89 
onig_set_warn_func(OnigWarnFunc f)90 extern void onig_set_warn_func(OnigWarnFunc f)
91 {
92   onig_warn = f;
93 }
94 
onig_set_verb_warn_func(OnigWarnFunc f)95 extern void onig_set_verb_warn_func(OnigWarnFunc f)
96 {
97   onig_verb_warn = f;
98 }
99 
100 static void
bbuf_free(BBuf * bbuf)101 bbuf_free(BBuf* bbuf)
102 {
103   if (IS_NOT_NULL(bbuf)) {
104     if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p);
105     xfree(bbuf);
106   }
107 }
108 
109 static int
bbuf_clone(BBuf ** rto,BBuf * from)110 bbuf_clone(BBuf** rto, BBuf* from)
111 {
112   int r;
113   BBuf *to;
114 
115   *rto = to = (BBuf* )xmalloc(sizeof(BBuf));
116   CHECK_NULL_RETURN_MEMERR(to);
117   r = BBUF_INIT(to, from->alloc);
118   if (r != 0) return r;
119   to->used = from->used;
120   xmemcpy(to->p, from->p, from->used);
121   return 0;
122 }
123 
124 #define BACKREF_REL_TO_ABS(rel_no, env) \
125   ((env)->num_mem + 1 + (rel_no))
126 
127 #define ONOFF(v,f,negative)    (negative) ? ((v) &= ~(f)) : ((v) |= (f))
128 
129 #define MBCODE_START_POS(enc) \
130   (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 0 : 0x80)
131 
132 #define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \
133   add_code_range_to_buf(pbuf, MBCODE_START_POS(enc), ~((OnigCodePoint )0))
134 
135 #define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\
136   if (! ONIGENC_IS_SINGLEBYTE(enc)) {\
137     r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\
138     if (r) return r;\
139   }\
140 } while (0)
141 
142 
143 #define BITSET_IS_EMPTY(bs,empty) do {\
144   int i;\
145   empty = 1;\
146   for (i = 0; i < (int )BITSET_SIZE; i++) {\
147     if ((bs)[i] != 0) {\
148       empty = 0; break;\
149     }\
150   }\
151 } while (0)
152 
153 static void
bitset_set_range(BitSetRef bs,int from,int to)154 bitset_set_range(BitSetRef bs, int from, int to)
155 {
156   int i;
157   for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) {
158     BITSET_SET_BIT(bs, i);
159   }
160 }
161 
162 #if 0
163 static void
164 bitset_set_all(BitSetRef bs)
165 {
166   int i;
167   for (i = 0; i < BITSET_SIZE; i++) { bs[i] = ~((Bits )0); }
168 }
169 #endif
170 
171 static void
bitset_invert(BitSetRef bs)172 bitset_invert(BitSetRef bs)
173 {
174   int i;
175   for (i = 0; i < (int )BITSET_SIZE; i++) { bs[i] = ~(bs[i]); }
176 }
177 
178 static void
bitset_invert_to(BitSetRef from,BitSetRef to)179 bitset_invert_to(BitSetRef from, BitSetRef to)
180 {
181   int i;
182   for (i = 0; i < (int )BITSET_SIZE; i++) { to[i] = ~(from[i]); }
183 }
184 
185 static void
bitset_and(BitSetRef dest,BitSetRef bs)186 bitset_and(BitSetRef dest, BitSetRef bs)
187 {
188   int i;
189   for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] &= bs[i]; }
190 }
191 
192 static void
bitset_or(BitSetRef dest,BitSetRef bs)193 bitset_or(BitSetRef dest, BitSetRef bs)
194 {
195   int i;
196   for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] |= bs[i]; }
197 }
198 
199 static void
bitset_copy(BitSetRef dest,BitSetRef bs)200 bitset_copy(BitSetRef dest, BitSetRef bs)
201 {
202   int i;
203   for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] = bs[i]; }
204 }
205 
206 extern int
onig_strncmp(const UChar * s1,const UChar * s2,int n)207 onig_strncmp(const UChar* s1, const UChar* s2, int n)
208 {
209   int x;
210 
211   while (n-- > 0) {
212     x = *s2++ - *s1++;
213     if (x) return x;
214   }
215   return 0;
216 }
217 
218 extern void
onig_strcpy(UChar * dest,const UChar * src,const UChar * end)219 onig_strcpy(UChar* dest, const UChar* src, const UChar* end)
220 {
221   int len = end - src;
222   if (len > 0) {
223     xmemcpy(dest, src, len);
224     dest[len] = (UChar )0;
225   }
226 }
227 
228 #ifdef USE_NAMED_GROUP
229 static UChar*
strdup_with_null(OnigEncoding enc,UChar * s,UChar * end)230 strdup_with_null(OnigEncoding enc, UChar* s, UChar* end)
231 {
232   int slen, term_len, i;
233   UChar *r;
234 
235   slen = end - s;
236   term_len = ONIGENC_MBC_MINLEN(enc);
237 
238   r = (UChar* )xmalloc(slen + term_len);
239   CHECK_NULL_RETURN(r);
240   xmemcpy(r, s, slen);
241 
242   for (i = 0; i < term_len; i++)
243     r[slen + i] = (UChar )0;
244 
245   return r;
246 }
247 #endif
248 
249 /* scan pattern methods */
250 #define PEND_VALUE   0
251 
252 #define PFETCH_READY  UChar* pfetch_prev
253 #define PEND         (p < end ?  0 : 1)
254 #define PUNFETCH     p = pfetch_prev
255 #define PINC       do { \
256   pfetch_prev = p; \
257   p += ONIGENC_MBC_ENC_LEN(enc, p); \
258 } while (0)
259 #define PFETCH(c)  do { \
260   c = ONIGENC_MBC_TO_CODE(enc, p, end); \
261   pfetch_prev = p; \
262   p += ONIGENC_MBC_ENC_LEN(enc, p); \
263 } while (0)
264 
265 #define PPEEK        (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)
266 #define PPEEK_IS(c)  (PPEEK == (OnigCodePoint )c)
267 
268 static UChar*
strcat_capa(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa)269 strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end,
270 	      int capa)
271 {
272   UChar* r;
273 
274   if (dest)
275     r = (UChar* )xrealloc(dest, capa + 1);
276   else
277     r = (UChar* )xmalloc(capa + 1);
278 
279   CHECK_NULL_RETURN(r);
280   onig_strcpy(r + (dest_end - dest), src, src_end);
281   return r;
282 }
283 
284 /* dest on static area */
285 static UChar*
strcat_capa_from_static(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa)286 strcat_capa_from_static(UChar* dest, UChar* dest_end,
287 			const UChar* src, const UChar* src_end, int capa)
288 {
289   UChar* r;
290 
291   r = (UChar* )xmalloc(capa + 1);
292   CHECK_NULL_RETURN(r);
293   onig_strcpy(r, dest, dest_end);
294   onig_strcpy(r + (dest_end - dest), src, src_end);
295   return r;
296 }
297 
298 
299 #ifdef USE_ST_LIBRARY
300 
301 typedef struct {
302   UChar* s;
303   UChar* end;
304 } st_str_end_key;
305 
306 static int
str_end_cmp(st_str_end_key * x,st_str_end_key * y)307 str_end_cmp(st_str_end_key* x, st_str_end_key* y)
308 {
309   UChar *p, *q;
310   int c;
311 
312   if ((x->end - x->s) != (y->end - y->s))
313     return 1;
314 
315   p = x->s;
316   q = y->s;
317   while (p < x->end) {
318     c = (int )*p - (int )*q;
319     if (c != 0) return c;
320 
321     p++; q++;
322   }
323 
324   return 0;
325 }
326 
327 static int
str_end_hash(st_str_end_key * x)328 str_end_hash(st_str_end_key* x)
329 {
330   UChar *p;
331   int val = 0;
332 
333   p = x->s;
334   while (p < x->end) {
335     val = val * 997 + (int )*p++;
336   }
337 
338   return val + (val >> 5);
339 }
340 
341 extern hash_table_type*
onig_st_init_strend_table_with_size(int size)342 onig_st_init_strend_table_with_size(int size)
343 {
344   static struct st_hash_type hashType = {
345     str_end_cmp,
346     str_end_hash,
347   };
348 
349   return (hash_table_type* )
350            onig_st_init_table_with_size(&hashType, size);
351 }
352 
353 extern int
onig_st_lookup_strend(hash_table_type * table,const UChar * str_key,const UChar * end_key,hash_data_type * value)354 onig_st_lookup_strend(hash_table_type* table, const UChar* str_key,
355 		      const UChar* end_key, hash_data_type *value)
356 {
357   st_str_end_key key;
358 
359   key.s   = (UChar* )str_key;
360   key.end = (UChar* )end_key;
361 
362   return onig_st_lookup(table, (st_data_t )(&key), value);
363 }
364 
365 extern int
onig_st_insert_strend(hash_table_type * table,const UChar * str_key,const UChar * end_key,hash_data_type value)366 onig_st_insert_strend(hash_table_type* table, const UChar* str_key,
367 		      const UChar* end_key, hash_data_type value)
368 {
369   st_str_end_key* key;
370   int result;
371 
372   key = (st_str_end_key* )xmalloc(sizeof(st_str_end_key));
373   key->s   = (UChar* )str_key;
374   key->end = (UChar* )end_key;
375   result = onig_st_insert(table, (st_data_t )key, value);
376   if (result) {
377     xfree(key);
378   }
379   return result;
380 }
381 
382 #endif /* USE_ST_LIBRARY */
383 
384 
385 #ifdef USE_NAMED_GROUP
386 
387 #define INIT_NAME_BACKREFS_ALLOC_NUM   8
388 
389 typedef struct {
390   UChar* name;
391   int    name_len;   /* byte length */
392   int    back_num;   /* number of backrefs */
393   int    back_alloc;
394   int    back_ref1;
395   int*   back_refs;
396 } NameEntry;
397 
398 #ifdef USE_ST_LIBRARY
399 
400 typedef st_table  NameTable;
401 typedef st_data_t HashDataType;   /* 1.6 st.h doesn't define st_data_t type */
402 
403 #define NAMEBUF_SIZE    24
404 #define NAMEBUF_SIZE_1  25
405 
406 #ifdef ONIG_DEBUG
407 static int
i_print_name_entry(UChar * key,NameEntry * e,void * arg)408 i_print_name_entry(UChar* key, NameEntry* e, void* arg)
409 {
410   int i;
411   FILE* fp = (FILE* )arg;
412 
413   fprintf(fp, "%s: ", e->name);
414   if (e->back_num == 0)
415     fputs("-", fp);
416   else if (e->back_num == 1)
417     fprintf(fp, "%d", e->back_ref1);
418   else {
419     for (i = 0; i < e->back_num; i++) {
420       if (i > 0) fprintf(fp, ", ");
421       fprintf(fp, "%d", e->back_refs[i]);
422     }
423   }
424   fputs("\n", fp);
425   return ST_CONTINUE;
426 }
427 
428 extern int
onig_print_names(FILE * fp,regex_t * reg)429 onig_print_names(FILE* fp, regex_t* reg)
430 {
431   NameTable* t = (NameTable* )reg->name_table;
432 
433   if (IS_NOT_NULL(t)) {
434     fprintf(fp, "name table\n");
435     onig_st_foreach(t, i_print_name_entry, (HashDataType )fp);
436     fputs("\n", fp);
437   }
438   return 0;
439 }
440 #endif /* ONIG_DEBUG */
441 
442 static int
i_free_name_entry(UChar * key,NameEntry * e,void * arg ARG_UNUSED)443 i_free_name_entry(UChar* key, NameEntry* e, void* arg ARG_UNUSED)
444 {
445   xfree(e->name);
446   if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
447   xfree(key);
448   xfree(e);
449   return ST_DELETE;
450 }
451 
452 static int
names_clear(regex_t * reg)453 names_clear(regex_t* reg)
454 {
455   NameTable* t = (NameTable* )reg->name_table;
456 
457   if (IS_NOT_NULL(t)) {
458     onig_st_foreach(t, i_free_name_entry, 0);
459   }
460   return 0;
461 }
462 
463 extern int
onig_names_free(regex_t * reg)464 onig_names_free(regex_t* reg)
465 {
466   int r;
467   NameTable* t;
468 
469   r = names_clear(reg);
470   if (r) return r;
471 
472   t = (NameTable* )reg->name_table;
473   if (IS_NOT_NULL(t)) onig_st_free_table(t);
474   reg->name_table = (void* )NULL;
475   return 0;
476 }
477 
478 static NameEntry*
name_find(regex_t * reg,const UChar * name,const UChar * name_end)479 name_find(regex_t* reg, const UChar* name, const UChar* name_end)
480 {
481   NameEntry* e;
482   NameTable* t = (NameTable* )reg->name_table;
483 
484   e = (NameEntry* )NULL;
485   if (IS_NOT_NULL(t)) {
486     onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e)));
487   }
488   return e;
489 }
490 
491 typedef struct {
492   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*);
493   regex_t* reg;
494   void* arg;
495   int ret;
496   OnigEncoding enc;
497 } INamesArg;
498 
499 static int
i_names(UChar * key ARG_UNUSED,NameEntry * e,INamesArg * arg)500 i_names(UChar* key ARG_UNUSED, NameEntry* e, INamesArg* arg)
501 {
502   int r = (*(arg->func))(e->name,
503                          e->name + e->name_len,
504                          e->back_num,
505 			 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
506 			 arg->reg, arg->arg);
507   if (r != 0) {
508     arg->ret = r;
509     return ST_STOP;
510   }
511   return ST_CONTINUE;
512 }
513 
514 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)515 onig_foreach_name(regex_t* reg,
516   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
517 {
518   INamesArg narg;
519   NameTable* t = (NameTable* )reg->name_table;
520 
521   narg.ret = 0;
522   if (IS_NOT_NULL(t)) {
523     narg.func = func;
524     narg.reg  = reg;
525     narg.arg  = arg;
526     narg.enc  = reg->enc; /* should be pattern encoding. */
527     onig_st_foreach(t, i_names, (HashDataType )&narg);
528   }
529   return narg.ret;
530 }
531 
532 static int
i_renumber_name(UChar * key ARG_UNUSED,NameEntry * e,GroupNumRemap * map)533 i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumRemap* map)
534 {
535   int i;
536 
537   if (e->back_num > 1) {
538     for (i = 0; i < e->back_num; i++) {
539       e->back_refs[i] = map[e->back_refs[i]].new_val;
540     }
541   }
542   else if (e->back_num == 1) {
543     e->back_ref1 = map[e->back_ref1].new_val;
544   }
545 
546   return ST_CONTINUE;
547 }
548 
549 extern int
onig_renumber_name_table(regex_t * reg,GroupNumRemap * map)550 onig_renumber_name_table(regex_t* reg, GroupNumRemap* map)
551 {
552   NameTable* t = (NameTable* )reg->name_table;
553 
554   if (IS_NOT_NULL(t)) {
555     onig_st_foreach(t, i_renumber_name, (HashDataType )map);
556   }
557   return 0;
558 }
559 
560 
561 extern int
onig_number_of_names(regex_t * reg)562 onig_number_of_names(regex_t* reg)
563 {
564   NameTable* t = (NameTable* )reg->name_table;
565 
566   if (IS_NOT_NULL(t))
567     return t->num_entries;
568   else
569     return 0;
570 }
571 
572 #else  /* USE_ST_LIBRARY */
573 
574 #define INIT_NAMES_ALLOC_NUM    8
575 
576 typedef struct {
577   NameEntry* e;
578   int        num;
579   int        alloc;
580 } NameTable;
581 
582 #ifdef ONIG_DEBUG
583 extern int
onig_print_names(FILE * fp,regex_t * reg)584 onig_print_names(FILE* fp, regex_t* reg)
585 {
586   int i, j;
587   NameEntry* e;
588   NameTable* t = (NameTable* )reg->name_table;
589 
590   if (IS_NOT_NULL(t) && t->num > 0) {
591     fprintf(fp, "name table\n");
592     for (i = 0; i < t->num; i++) {
593       e = &(t->e[i]);
594       fprintf(fp, "%s: ", e->name);
595       if (e->back_num == 0) {
596 	fputs("-", fp);
597       }
598       else if (e->back_num == 1) {
599 	fprintf(fp, "%d", e->back_ref1);
600       }
601       else {
602 	for (j = 0; j < e->back_num; j++) {
603 	  if (j > 0) fprintf(fp, ", ");
604 	  fprintf(fp, "%d", e->back_refs[j]);
605 	}
606       }
607       fputs("\n", fp);
608     }
609     fputs("\n", fp);
610   }
611   return 0;
612 }
613 #endif
614 
615 static int
names_clear(regex_t * reg)616 names_clear(regex_t* reg)
617 {
618   int i;
619   NameEntry* e;
620   NameTable* t = (NameTable* )reg->name_table;
621 
622   if (IS_NOT_NULL(t)) {
623     for (i = 0; i < t->num; i++) {
624       e = &(t->e[i]);
625       if (IS_NOT_NULL(e->name)) {
626 	xfree(e->name);
627 	e->name       = NULL;
628 	e->name_len   = 0;
629 	e->back_num   = 0;
630 	e->back_alloc = 0;
631 	if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
632 	e->back_refs = (int* )NULL;
633       }
634     }
635     if (IS_NOT_NULL(t->e)) {
636       xfree(t->e);
637       t->e = NULL;
638     }
639     t->num = 0;
640   }
641   return 0;
642 }
643 
644 extern int
onig_names_free(regex_t * reg)645 onig_names_free(regex_t* reg)
646 {
647   int r;
648   NameTable* t;
649 
650   r = names_clear(reg);
651   if (r) return r;
652 
653   t = (NameTable* )reg->name_table;
654   if (IS_NOT_NULL(t)) xfree(t);
655   reg->name_table = NULL;
656   return 0;
657 }
658 
659 static NameEntry*
name_find(regex_t * reg,UChar * name,UChar * name_end)660 name_find(regex_t* reg, UChar* name, UChar* name_end)
661 {
662   int i, len;
663   NameEntry* e;
664   NameTable* t = (NameTable* )reg->name_table;
665 
666   if (IS_NOT_NULL(t)) {
667     len = name_end - name;
668     for (i = 0; i < t->num; i++) {
669       e = &(t->e[i]);
670       if (len == e->name_len && onig_strncmp(name, e->name, len) == 0)
671 	return e;
672     }
673   }
674   return (NameEntry* )NULL;
675 }
676 
677 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)678 onig_foreach_name(regex_t* reg,
679   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
680 {
681   int i, r;
682   NameEntry* e;
683   NameTable* t = (NameTable* )reg->name_table;
684 
685   if (IS_NOT_NULL(t)) {
686     for (i = 0; i < t->num; i++) {
687       e = &(t->e[i]);
688       r = (*func)(e->name, e->name + e->name_len, e->back_num,
689 		  (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
690 		  reg, arg);
691       if (r != 0) return r;
692     }
693   }
694   return 0;
695 }
696 
697 extern int
onig_number_of_names(regex_t * reg)698 onig_number_of_names(regex_t* reg)
699 {
700   NameTable* t = (NameTable* )reg->name_table;
701 
702   if (IS_NOT_NULL(t))
703     return t->num;
704   else
705     return 0;
706 }
707 
708 #endif /* else USE_ST_LIBRARY */
709 
710 static int
name_add(regex_t * reg,UChar * name,UChar * name_end,int backref,ScanEnv * env)711 name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env)
712 {
713   int alloc;
714   NameEntry* e;
715   NameTable* t = (NameTable* )reg->name_table;
716 
717   if (name_end - name <= 0)
718     return ONIGERR_EMPTY_GROUP_NAME;
719 
720   e = name_find(reg, name, name_end);
721   if (IS_NULL(e)) {
722 #ifdef USE_ST_LIBRARY
723     if (IS_NULL(t)) {
724       t = onig_st_init_strend_table_with_size(5);
725       reg->name_table = (void* )t;
726     }
727     e = (NameEntry* )xmalloc(sizeof(NameEntry));
728     CHECK_NULL_RETURN_MEMERR(e);
729 
730     e->name = strdup_with_null(reg->enc, name, name_end);
731     if (IS_NULL(e->name)) {
732       xfree(e);  return ONIGERR_MEMORY;
733     }
734     onig_st_insert_strend(t, e->name, (e->name + (name_end - name)),
735                           (HashDataType )e);
736 
737     e->name_len   = name_end - name;
738     e->back_num   = 0;
739     e->back_alloc = 0;
740     e->back_refs  = (int* )NULL;
741 
742 #else
743 
744     if (IS_NULL(t)) {
745       alloc = INIT_NAMES_ALLOC_NUM;
746       t = (NameTable* )xmalloc(sizeof(NameTable));
747       CHECK_NULL_RETURN_MEMERR(t);
748       t->e     = NULL;
749       t->alloc = 0;
750       t->num   = 0;
751 
752       t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc);
753       if (IS_NULL(t->e)) {
754 	xfree(t);
755 	return ONIGERR_MEMORY;
756       }
757       t->alloc = alloc;
758       reg->name_table = t;
759       goto clear;
760     }
761     else if (t->num == t->alloc) {
762       int i;
763 
764       alloc = t->alloc * 2;
765       t->e = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc);
766       CHECK_NULL_RETURN_MEMERR(t->e);
767       t->alloc = alloc;
768 
769     clear:
770       for (i = t->num; i < t->alloc; i++) {
771 	t->e[i].name       = NULL;
772 	t->e[i].name_len   = 0;
773 	t->e[i].back_num   = 0;
774 	t->e[i].back_alloc = 0;
775 	t->e[i].back_refs  = (int* )NULL;
776       }
777     }
778     e = &(t->e[t->num]);
779     t->num++;
780     e->name = strdup_with_null(reg->enc, name, name_end);
781     if (IS_NULL(e->name)) return ONIGERR_MEMORY;
782     e->name_len = name_end - name;
783 #endif
784   }
785 
786   if (e->back_num >= 1 &&
787       ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) {
788     onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME,
789 				    name, name_end);
790     return ONIGERR_MULTIPLEX_DEFINED_NAME;
791   }
792 
793   e->back_num++;
794   if (e->back_num == 1) {
795     e->back_ref1 = backref;
796   }
797   else {
798     if (e->back_num == 2) {
799       alloc = INIT_NAME_BACKREFS_ALLOC_NUM;
800       e->back_refs = (int* )xmalloc(sizeof(int) * alloc);
801       CHECK_NULL_RETURN_MEMERR(e->back_refs);
802       e->back_alloc = alloc;
803       e->back_refs[0] = e->back_ref1;
804       e->back_refs[1] = backref;
805     }
806     else {
807       if (e->back_num > e->back_alloc) {
808 	alloc = e->back_alloc * 2;
809 	e->back_refs = (int* )xrealloc(e->back_refs, sizeof(int) * alloc);
810 	CHECK_NULL_RETURN_MEMERR(e->back_refs);
811 	e->back_alloc = alloc;
812       }
813       e->back_refs[e->back_num - 1] = backref;
814     }
815   }
816 
817   return 0;
818 }
819 
820 extern int
onig_name_to_group_numbers(regex_t * reg,const UChar * name,const UChar * name_end,int ** nums)821 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
822 			   const UChar* name_end, int** nums)
823 {
824   NameEntry* e = name_find(reg, name, name_end);
825 
826   if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE;
827 
828   switch (e->back_num) {
829   case 0:
830     break;
831   case 1:
832     *nums = &(e->back_ref1);
833     break;
834   default:
835     *nums = e->back_refs;
836     break;
837   }
838   return e->back_num;
839 }
840 
841 extern int
onig_name_to_backref_number(regex_t * reg,const UChar * name,const UChar * name_end,OnigRegion * region)842 onig_name_to_backref_number(regex_t* reg, const UChar* name,
843 			    const UChar* name_end, OnigRegion *region)
844 {
845   int i, n, *nums;
846 
847   n = onig_name_to_group_numbers(reg, name, name_end, &nums);
848   if (n < 0)
849     return n;
850   else if (n == 0)
851     return ONIGERR_PARSER_BUG;
852   else if (n == 1)
853     return nums[0];
854   else {
855     if (IS_NOT_NULL(region)) {
856       for (i = n - 1; i >= 0; i--) {
857 	if (region->beg[nums[i]] != ONIG_REGION_NOTPOS)
858 	  return nums[i];
859       }
860     }
861     return nums[n - 1];
862   }
863 }
864 
865 #else /* USE_NAMED_GROUP */
866 
867 extern int
onig_name_to_group_numbers(regex_t * reg,const UChar * name,const UChar * name_end,int ** nums)868 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
869 			   const UChar* name_end, int** nums)
870 {
871   return ONIG_NO_SUPPORT_CONFIG;
872 }
873 
874 extern int
onig_name_to_backref_number(regex_t * reg,const UChar * name,const UChar * name_end,OnigRegion * region)875 onig_name_to_backref_number(regex_t* reg, const UChar* name,
876 			    const UChar* name_end, OnigRegion* region)
877 {
878   return ONIG_NO_SUPPORT_CONFIG;
879 }
880 
881 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)882 onig_foreach_name(regex_t* reg,
883   int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
884 {
885   return ONIG_NO_SUPPORT_CONFIG;
886 }
887 
888 extern int
onig_number_of_names(regex_t * reg)889 onig_number_of_names(regex_t* reg)
890 {
891   return 0;
892 }
893 #endif /* else USE_NAMED_GROUP */
894 
895 extern int
onig_noname_group_capture_is_active(regex_t * reg)896 onig_noname_group_capture_is_active(regex_t* reg)
897 {
898   if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_DONT_CAPTURE_GROUP))
899     return 0;
900 
901 #ifdef USE_NAMED_GROUP
902   if (onig_number_of_names(reg) > 0 &&
903       IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
904       !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
905     return 0;
906   }
907 #endif
908 
909   return 1;
910 }
911 
912 
913 #define INIT_SCANENV_MEMNODES_ALLOC_SIZE   16
914 
915 static void
scan_env_clear(ScanEnv * env)916 scan_env_clear(ScanEnv* env)
917 {
918   int i;
919 
920   BIT_STATUS_CLEAR(env->capture_history);
921   BIT_STATUS_CLEAR(env->bt_mem_start);
922   BIT_STATUS_CLEAR(env->bt_mem_end);
923   BIT_STATUS_CLEAR(env->backrefed_mem);
924   env->error      = (UChar* )NULL;
925   env->error_end  = (UChar* )NULL;
926   env->num_call   = 0;
927   env->num_mem    = 0;
928 #ifdef USE_NAMED_GROUP
929   env->num_named  = 0;
930 #endif
931   env->mem_alloc         = 0;
932   env->mem_nodes_dynamic = (Node** )NULL;
933 
934   for (i = 0; i < SCANENV_MEMNODES_SIZE; i++)
935     env->mem_nodes_static[i] = NULL_NODE;
936 
937 #ifdef USE_COMBINATION_EXPLOSION_CHECK
938   env->num_comb_exp_check  = 0;
939   env->comb_exp_max_regnum = 0;
940   env->curr_max_regnum     = 0;
941   env->has_recursion       = 0;
942 #endif
943 }
944 
945 static int
scan_env_add_mem_entry(ScanEnv * env)946 scan_env_add_mem_entry(ScanEnv* env)
947 {
948   int i, need, alloc;
949   Node** p;
950 
951   need = env->num_mem + 1;
952   if (need >= SCANENV_MEMNODES_SIZE) {
953     if (env->mem_alloc <= need) {
954       if (IS_NULL(env->mem_nodes_dynamic)) {
955 	alloc = INIT_SCANENV_MEMNODES_ALLOC_SIZE;
956 	p = (Node** )xmalloc(sizeof(Node*) * alloc);
957 	xmemcpy(p, env->mem_nodes_static,
958 		sizeof(Node*) * SCANENV_MEMNODES_SIZE);
959       }
960       else {
961 	alloc = env->mem_alloc * 2;
962 	p = (Node** )xrealloc(env->mem_nodes_dynamic, sizeof(Node*) * alloc);
963       }
964       CHECK_NULL_RETURN_MEMERR(p);
965 
966       for (i = env->num_mem + 1; i < alloc; i++)
967 	p[i] = NULL_NODE;
968 
969       env->mem_nodes_dynamic = p;
970       env->mem_alloc = alloc;
971     }
972   }
973 
974   env->num_mem++;
975   return env->num_mem;
976 }
977 
978 static int
scan_env_set_mem_node(ScanEnv * env,int num,Node * node)979 scan_env_set_mem_node(ScanEnv* env, int num, Node* node)
980 {
981   if (env->num_mem >= num)
982     SCANENV_MEM_NODES(env)[num] = node;
983   else
984     return ONIGERR_PARSER_BUG;
985   return 0;
986 }
987 
988 
989 #ifdef USE_PARSE_TREE_NODE_RECYCLE
990 typedef struct _FreeNode {
991   struct _FreeNode* next;
992 } FreeNode;
993 
994 static FreeNode* FreeNodeList = (FreeNode* )NULL;
995 #endif
996 
997 extern void
onig_node_free(Node * node)998 onig_node_free(Node* node)
999 {
1000  start:
1001   if (IS_NULL(node)) return ;
1002 
1003   switch (NTYPE(node)) {
1004   case NT_STR:
1005     if (NSTR(node)->capa != 0 &&
1006 	IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
1007       xfree(NSTR(node)->s);
1008     }
1009     break;
1010 
1011   case NT_LIST:
1012   case NT_ALT:
1013     onig_node_free(NCAR(node));
1014     {
1015       Node* next_node = NCDR(node);
1016 
1017 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1018       {
1019 	FreeNode* n = (FreeNode* )node;
1020 
1021         THREAD_ATOMIC_START;
1022 	n->next = FreeNodeList;
1023 	FreeNodeList = n;
1024         THREAD_ATOMIC_END;
1025       }
1026 #else
1027       xfree(node);
1028 #endif
1029       node = next_node;
1030       goto start;
1031     }
1032     break;
1033 
1034   case NT_CCLASS:
1035     {
1036       CClassNode* cc = NCCLASS(node);
1037 
1038       if (IS_NCCLASS_SHARE(cc)) return ;
1039       if (cc->mbuf)
1040         bbuf_free(cc->mbuf);
1041     }
1042     break;
1043 
1044   case NT_QTFR:
1045     if (NQTFR(node)->target)
1046       onig_node_free(NQTFR(node)->target);
1047     break;
1048 
1049   case NT_ENCLOSE:
1050     if (NENCLOSE(node)->target)
1051       onig_node_free(NENCLOSE(node)->target);
1052     break;
1053 
1054   case NT_BREF:
1055     if (IS_NOT_NULL(NBREF(node)->back_dynamic))
1056       xfree(NBREF(node)->back_dynamic);
1057     break;
1058 
1059   case NT_ANCHOR:
1060     if (NANCHOR(node)->target)
1061       onig_node_free(NANCHOR(node)->target);
1062     break;
1063   }
1064 
1065 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1066   {
1067     FreeNode* n = (FreeNode* )node;
1068 
1069     THREAD_ATOMIC_START;
1070     n->next = FreeNodeList;
1071     FreeNodeList = n;
1072     THREAD_ATOMIC_END;
1073   }
1074 #else
1075   xfree(node);
1076 #endif
1077 }
1078 
1079 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1080 extern int
onig_free_node_list(void)1081 onig_free_node_list(void)
1082 {
1083   FreeNode* n;
1084 
1085   /* THREAD_ATOMIC_START; */
1086   while (IS_NOT_NULL(FreeNodeList)) {
1087     n = FreeNodeList;
1088     FreeNodeList = FreeNodeList->next;
1089     xfree(n);
1090   }
1091   /* THREAD_ATOMIC_END; */
1092   return 0;
1093 }
1094 #endif
1095 
1096 static Node*
node_new(void)1097 node_new(void)
1098 {
1099   Node* node;
1100 
1101 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1102   THREAD_ATOMIC_START;
1103   if (IS_NOT_NULL(FreeNodeList)) {
1104     node = (Node* )FreeNodeList;
1105     FreeNodeList = FreeNodeList->next;
1106     THREAD_ATOMIC_END;
1107     return node;
1108   }
1109   THREAD_ATOMIC_END;
1110 #endif
1111 
1112   node = (Node* )xmalloc(sizeof(Node));
1113   /* xmemset(node, 0, sizeof(Node)); */
1114   return node;
1115 }
1116 
1117 
1118 static void
initialize_cclass(CClassNode * cc)1119 initialize_cclass(CClassNode* cc)
1120 {
1121   BITSET_CLEAR(cc->bs);
1122   /* cc->base.flags = 0; */
1123   cc->flags = 0;
1124   cc->mbuf  = NULL;
1125 }
1126 
1127 static Node*
node_new_cclass(void)1128 node_new_cclass(void)
1129 {
1130   Node* node = node_new();
1131   CHECK_NULL_RETURN(node);
1132 
1133   SET_NTYPE(node, NT_CCLASS);
1134   initialize_cclass(NCCLASS(node));
1135   return node;
1136 }
1137 
1138 static Node*
node_new_cclass_by_codepoint_range(int not,OnigCodePoint sb_out,const OnigCodePoint ranges[])1139 node_new_cclass_by_codepoint_range(int not, OnigCodePoint sb_out,
1140 				   const OnigCodePoint ranges[])
1141 {
1142   int n, i;
1143   CClassNode* cc;
1144   OnigCodePoint j;
1145 
1146   Node* node = node_new_cclass();
1147   CHECK_NULL_RETURN(node);
1148 
1149   cc = NCCLASS(node);
1150   if (not != 0) NCCLASS_SET_NOT(cc);
1151 
1152   BITSET_CLEAR(cc->bs);
1153   if (sb_out > 0 && IS_NOT_NULL(ranges)) {
1154     n = ONIGENC_CODE_RANGE_NUM(ranges);
1155     for (i = 0; i < n; i++) {
1156       for (j  = ONIGENC_CODE_RANGE_FROM(ranges, i);
1157            j <= (OnigCodePoint )ONIGENC_CODE_RANGE_TO(ranges, i); j++) {
1158 	if (j >= sb_out) goto sb_end;
1159 
1160         BITSET_SET_BIT(cc->bs, j);
1161       }
1162     }
1163   }
1164 
1165  sb_end:
1166   if (IS_NULL(ranges)) {
1167   is_null:
1168     cc->mbuf = NULL;
1169   }
1170   else {
1171     BBuf* bbuf;
1172 
1173     n = ONIGENC_CODE_RANGE_NUM(ranges);
1174     if (n == 0) goto is_null;
1175 
1176     bbuf = (BBuf* )xmalloc(sizeof(BBuf));
1177     CHECK_NULL_RETURN(bbuf);
1178     bbuf->alloc = n + 1;
1179     bbuf->used  = n + 1;
1180     bbuf->p     = (UChar* )((void* )ranges);
1181 
1182     cc->mbuf = bbuf;
1183   }
1184 
1185   return node;
1186 }
1187 
1188 static Node*
node_new_ctype(int type,int not)1189 node_new_ctype(int type, int not)
1190 {
1191   Node* node = node_new();
1192   CHECK_NULL_RETURN(node);
1193 
1194   SET_NTYPE(node, NT_CTYPE);
1195   NCTYPE(node)->ctype = type;
1196   NCTYPE(node)->not   = not;
1197   return node;
1198 }
1199 
1200 static Node*
node_new_anychar(void)1201 node_new_anychar(void)
1202 {
1203   Node* node = node_new();
1204   CHECK_NULL_RETURN(node);
1205 
1206   SET_NTYPE(node, NT_CANY);
1207   return node;
1208 }
1209 
1210 static Node*
node_new_list(Node * left,Node * right)1211 node_new_list(Node* left, Node* right)
1212 {
1213   Node* node = node_new();
1214   CHECK_NULL_RETURN(node);
1215 
1216   SET_NTYPE(node, NT_LIST);
1217   NCAR(node)  = left;
1218   NCDR(node) = right;
1219   return node;
1220 }
1221 
1222 extern Node*
onig_node_new_list(Node * left,Node * right)1223 onig_node_new_list(Node* left, Node* right)
1224 {
1225   return node_new_list(left, right);
1226 }
1227 
1228 extern Node*
onig_node_list_add(Node * list,Node * x)1229 onig_node_list_add(Node* list, Node* x)
1230 {
1231   Node *n;
1232 
1233   n = onig_node_new_list(x, NULL);
1234   if (IS_NULL(n)) return NULL_NODE;
1235 
1236   if (IS_NOT_NULL(list)) {
1237     while (IS_NOT_NULL(NCDR(list)))
1238       list = NCDR(list);
1239 
1240     NCDR(list) = n;
1241   }
1242 
1243   return n;
1244 }
1245 
1246 extern Node*
onig_node_new_alt(Node * left,Node * right)1247 onig_node_new_alt(Node* left, Node* right)
1248 {
1249   Node* node = node_new();
1250   CHECK_NULL_RETURN(node);
1251 
1252   SET_NTYPE(node, NT_ALT);
1253   NCAR(node)  = left;
1254   NCDR(node) = right;
1255   return node;
1256 }
1257 
1258 extern Node*
onig_node_new_anchor(int type)1259 onig_node_new_anchor(int type)
1260 {
1261   Node* node = node_new();
1262   CHECK_NULL_RETURN(node);
1263 
1264   SET_NTYPE(node, NT_ANCHOR);
1265   NANCHOR(node)->type     = type;
1266   NANCHOR(node)->target   = NULL;
1267   NANCHOR(node)->char_len = -1;
1268   return node;
1269 }
1270 
1271 static Node*
node_new_backref(int back_num,int * backrefs,int by_name,int exist_level,int nest_level,ScanEnv * env)1272 node_new_backref(int back_num, int* backrefs, int by_name,
1273 #ifdef USE_BACKREF_WITH_LEVEL
1274 		 int exist_level, int nest_level,
1275 #endif
1276 		 ScanEnv* env)
1277 {
1278   int i;
1279   Node* node = node_new();
1280 
1281   CHECK_NULL_RETURN(node);
1282 
1283   SET_NTYPE(node, NT_BREF);
1284   NBREF(node)->state    = 0;
1285   NBREF(node)->back_num = back_num;
1286   NBREF(node)->back_dynamic = (int* )NULL;
1287   if (by_name != 0)
1288     NBREF(node)->state |= NST_NAME_REF;
1289 
1290 #ifdef USE_BACKREF_WITH_LEVEL
1291   if (exist_level != 0) {
1292     NBREF(node)->state |= NST_NEST_LEVEL;
1293     NBREF(node)->nest_level  = nest_level;
1294   }
1295 #endif
1296 
1297   for (i = 0; i < back_num; i++) {
1298     if (backrefs[i] <= env->num_mem &&
1299 	IS_NULL(SCANENV_MEM_NODES(env)[backrefs[i]])) {
1300       NBREF(node)->state |= NST_RECURSION;   /* /...(\1).../ */
1301       break;
1302     }
1303   }
1304 
1305   if (back_num <= NODE_BACKREFS_SIZE) {
1306     for (i = 0; i < back_num; i++)
1307       NBREF(node)->back_static[i] = backrefs[i];
1308   }
1309   else {
1310     int* p = (int* )xmalloc(sizeof(int) * back_num);
1311     if (IS_NULL(p)) {
1312       onig_node_free(node);
1313       return NULL;
1314     }
1315     NBREF(node)->back_dynamic = p;
1316     for (i = 0; i < back_num; i++)
1317       p[i] = backrefs[i];
1318   }
1319   return node;
1320 }
1321 
1322 #ifdef USE_SUBEXP_CALL
1323 static Node*
node_new_call(UChar * name,UChar * name_end,int gnum)1324 node_new_call(UChar* name, UChar* name_end, int gnum)
1325 {
1326   Node* node = node_new();
1327   CHECK_NULL_RETURN(node);
1328 
1329   SET_NTYPE(node, NT_CALL);
1330   NCALL(node)->state     = 0;
1331   NCALL(node)->target    = NULL_NODE;
1332   NCALL(node)->name      = name;
1333   NCALL(node)->name_end  = name_end;
1334   NCALL(node)->group_num = gnum;  /* call by number if gnum != 0 */
1335   return node;
1336 }
1337 #endif
1338 
1339 static Node*
node_new_quantifier(int lower,int upper,int by_number)1340 node_new_quantifier(int lower, int upper, int by_number)
1341 {
1342   Node* node = node_new();
1343   CHECK_NULL_RETURN(node);
1344 
1345   SET_NTYPE(node, NT_QTFR);
1346   NQTFR(node)->state  = 0;
1347   NQTFR(node)->target = NULL;
1348   NQTFR(node)->lower  = lower;
1349   NQTFR(node)->upper  = upper;
1350   NQTFR(node)->greedy = 1;
1351   NQTFR(node)->target_empty_info = NQ_TARGET_ISNOT_EMPTY;
1352   NQTFR(node)->head_exact        = NULL_NODE;
1353   NQTFR(node)->next_head_exact   = NULL_NODE;
1354   NQTFR(node)->is_refered        = 0;
1355   if (by_number != 0)
1356     NQTFR(node)->state |= NST_BY_NUMBER;
1357 
1358 #ifdef USE_COMBINATION_EXPLOSION_CHECK
1359   NQTFR(node)->comb_exp_check_num = 0;
1360 #endif
1361 
1362   return node;
1363 }
1364 
1365 static Node*
node_new_enclose(int type)1366 node_new_enclose(int type)
1367 {
1368   Node* node = node_new();
1369   CHECK_NULL_RETURN(node);
1370 
1371   SET_NTYPE(node, NT_ENCLOSE);
1372   NENCLOSE(node)->type      = type;
1373   NENCLOSE(node)->state     =  0;
1374   NENCLOSE(node)->regnum    =  0;
1375   NENCLOSE(node)->option    =  0;
1376   NENCLOSE(node)->target    = NULL;
1377   NENCLOSE(node)->call_addr = -1;
1378   NENCLOSE(node)->opt_count =  0;
1379   return node;
1380 }
1381 
1382 extern Node*
onig_node_new_enclose(int type)1383 onig_node_new_enclose(int type)
1384 {
1385   return node_new_enclose(type);
1386 }
1387 
1388 static Node*
node_new_enclose_memory(OnigOptionType option,int is_named)1389 node_new_enclose_memory(OnigOptionType option, int is_named)
1390 {
1391   Node* node = node_new_enclose(ENCLOSE_MEMORY);
1392   CHECK_NULL_RETURN(node);
1393   if (is_named != 0)
1394     SET_ENCLOSE_STATUS(node, NST_NAMED_GROUP);
1395 
1396 #ifdef USE_SUBEXP_CALL
1397   NENCLOSE(node)->option = option;
1398 #endif
1399   return node;
1400 }
1401 
1402 static Node*
node_new_option(OnigOptionType option)1403 node_new_option(OnigOptionType option)
1404 {
1405   Node* node = node_new_enclose(ENCLOSE_OPTION);
1406   CHECK_NULL_RETURN(node);
1407   NENCLOSE(node)->option = option;
1408   return node;
1409 }
1410 
1411 extern int
onig_node_str_cat(Node * node,const UChar * s,const UChar * end)1412 onig_node_str_cat(Node* node, const UChar* s, const UChar* end)
1413 {
1414   int addlen = end - s;
1415 
1416   if (addlen > 0) {
1417     int len  = NSTR(node)->end - NSTR(node)->s;
1418 
1419     if (NSTR(node)->capa > 0 || (len + addlen > NODE_STR_BUF_SIZE - 1)) {
1420       UChar* p;
1421       int capa = len + addlen + NODE_STR_MARGIN;
1422 
1423       if (capa <= NSTR(node)->capa) {
1424 	onig_strcpy(NSTR(node)->s + len, s, end);
1425       }
1426       else {
1427 	if (NSTR(node)->s == NSTR(node)->buf)
1428 	  p = strcat_capa_from_static(NSTR(node)->s, NSTR(node)->end,
1429 				      s, end, capa);
1430 	else
1431 	  p = strcat_capa(NSTR(node)->s, NSTR(node)->end, s, end, capa);
1432 
1433 	CHECK_NULL_RETURN_MEMERR(p);
1434 	NSTR(node)->s    = p;
1435 	NSTR(node)->capa = capa;
1436       }
1437     }
1438     else {
1439       onig_strcpy(NSTR(node)->s + len, s, end);
1440     }
1441     NSTR(node)->end = NSTR(node)->s + len + addlen;
1442   }
1443 
1444   return 0;
1445 }
1446 
1447 extern int
onig_node_str_set(Node * node,const UChar * s,const UChar * end)1448 onig_node_str_set(Node* node, const UChar* s, const UChar* end)
1449 {
1450   onig_node_str_clear(node);
1451   return onig_node_str_cat(node, s, end);
1452 }
1453 
1454 static int
node_str_cat_char(Node * node,UChar c)1455 node_str_cat_char(Node* node, UChar c)
1456 {
1457   UChar s[1];
1458 
1459   s[0] = c;
1460   return onig_node_str_cat(node, s, s + 1);
1461 }
1462 
1463 extern void
onig_node_conv_to_str_node(Node * node,int flag)1464 onig_node_conv_to_str_node(Node* node, int flag)
1465 {
1466   SET_NTYPE(node, NT_STR);
1467   NSTR(node)->flag = flag;
1468   NSTR(node)->capa = 0;
1469   NSTR(node)->s    = NSTR(node)->buf;
1470   NSTR(node)->end  = NSTR(node)->buf;
1471 }
1472 
1473 extern void
onig_node_str_clear(Node * node)1474 onig_node_str_clear(Node* node)
1475 {
1476   if (NSTR(node)->capa != 0 &&
1477       IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
1478     xfree(NSTR(node)->s);
1479   }
1480 
1481   NSTR(node)->capa = 0;
1482   NSTR(node)->flag = 0;
1483   NSTR(node)->s    = NSTR(node)->buf;
1484   NSTR(node)->end  = NSTR(node)->buf;
1485 }
1486 
1487 static Node*
node_new_str(const UChar * s,const UChar * end)1488 node_new_str(const UChar* s, const UChar* end)
1489 {
1490   Node* node = node_new();
1491   CHECK_NULL_RETURN(node);
1492 
1493   SET_NTYPE(node, NT_STR);
1494   NSTR(node)->capa = 0;
1495   NSTR(node)->flag = 0;
1496   NSTR(node)->s    = NSTR(node)->buf;
1497   NSTR(node)->end  = NSTR(node)->buf;
1498   if (onig_node_str_cat(node, s, end)) {
1499     onig_node_free(node);
1500     return NULL;
1501   }
1502   return node;
1503 }
1504 
1505 extern Node*
onig_node_new_str(const UChar * s,const UChar * end)1506 onig_node_new_str(const UChar* s, const UChar* end)
1507 {
1508   return node_new_str(s, end);
1509 }
1510 
1511 static Node*
node_new_str_raw(UChar * s,UChar * end)1512 node_new_str_raw(UChar* s, UChar* end)
1513 {
1514   Node* node = node_new_str(s, end);
1515   NSTRING_SET_RAW(node);
1516   return node;
1517 }
1518 
1519 static Node*
node_new_empty(void)1520 node_new_empty(void)
1521 {
1522   return node_new_str(NULL, NULL);
1523 }
1524 
1525 static Node*
node_new_str_raw_char(UChar c)1526 node_new_str_raw_char(UChar c)
1527 {
1528   UChar p[1];
1529 
1530   p[0] = c;
1531   return node_new_str_raw(p, p + 1);
1532 }
1533 
1534 static Node*
str_node_split_last_char(StrNode * sn,OnigEncoding enc)1535 str_node_split_last_char(StrNode* sn, OnigEncoding enc)
1536 {
1537   const UChar *p;
1538   Node* n = NULL_NODE;
1539 
1540   if (sn->end > sn->s) {
1541     p = onigenc_get_prev_char_head(enc, sn->s, sn->end);
1542     if (p && p > sn->s) { /* can be splitted. */
1543       n = node_new_str(p, sn->end);
1544       if ((sn->flag & NSTR_RAW) != 0)
1545 	NSTRING_SET_RAW(n);
1546       sn->end = (UChar* )p;
1547     }
1548   }
1549   return n;
1550 }
1551 
1552 static int
str_node_can_be_split(StrNode * sn,OnigEncoding enc)1553 str_node_can_be_split(StrNode* sn, OnigEncoding enc)
1554 {
1555   if (sn->end > sn->s) {
1556     return ((enclen(enc, sn->s) < sn->end - sn->s)  ?  1 : 0);
1557   }
1558   return 0;
1559 }
1560 
1561 #ifdef USE_PAD_TO_SHORT_BYTE_CHAR
1562 static int
node_str_head_pad(StrNode * sn,int num,UChar val)1563 node_str_head_pad(StrNode* sn, int num, UChar val)
1564 {
1565   UChar buf[NODE_STR_BUF_SIZE];
1566   int i, len;
1567 
1568   len = sn->end - sn->s;
1569   onig_strcpy(buf, sn->s, sn->end);
1570   onig_strcpy(&(sn->s[num]), buf, buf + len);
1571   sn->end += num;
1572 
1573   for (i = 0; i < num; i++) {
1574     sn->s[i] = val;
1575   }
1576 }
1577 #endif
1578 
1579 extern int
onig_scan_unsigned_number(UChar ** src,const UChar * end,OnigEncoding enc)1580 onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc)
1581 {
1582   unsigned int num, val;
1583   OnigCodePoint c;
1584   UChar* p = *src;
1585   PFETCH_READY;
1586 
1587   num = 0;
1588   while (!PEND) {
1589     PFETCH(c);
1590     if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
1591       val = (unsigned int )DIGITVAL(c);
1592       if ((INT_MAX_LIMIT - val) / 10UL < num)
1593 	return -1;  /* overflow */
1594 
1595       num = num * 10 + val;
1596     }
1597     else {
1598       PUNFETCH;
1599       break;
1600     }
1601   }
1602   *src = p;
1603   return num;
1604 }
1605 
1606 static int
scan_unsigned_hexadecimal_number(UChar ** src,UChar * end,int maxlen,OnigEncoding enc)1607 scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int maxlen,
1608 				 OnigEncoding enc)
1609 {
1610   OnigCodePoint c;
1611   unsigned int num, val;
1612   UChar* p = *src;
1613   PFETCH_READY;
1614 
1615   num = 0;
1616   while (!PEND && maxlen-- != 0) {
1617     PFETCH(c);
1618     if (ONIGENC_IS_CODE_XDIGIT(enc, c)) {
1619       val = (unsigned int )XDIGITVAL(enc,c);
1620       if ((INT_MAX_LIMIT - val) / 16UL < num)
1621 	return -1;  /* overflow */
1622 
1623       num = (num << 4) + XDIGITVAL(enc,c);
1624     }
1625     else {
1626       PUNFETCH;
1627       break;
1628     }
1629   }
1630   *src = p;
1631   return num;
1632 }
1633 
1634 static int
scan_unsigned_octal_number(UChar ** src,UChar * end,int maxlen,OnigEncoding enc)1635 scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen,
1636 			   OnigEncoding enc)
1637 {
1638   OnigCodePoint c;
1639   unsigned int num, val;
1640   UChar* p = *src;
1641   PFETCH_READY;
1642 
1643   num = 0;
1644   while (!PEND && maxlen-- != 0) {
1645     PFETCH(c);
1646     if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8') {
1647       val = ODIGITVAL(c);
1648       if ((INT_MAX_LIMIT - val) / 8UL < num)
1649 	return -1;  /* overflow */
1650 
1651       num = (num << 3) + val;
1652     }
1653     else {
1654       PUNFETCH;
1655       break;
1656     }
1657   }
1658   *src = p;
1659   return num;
1660 }
1661 
1662 
1663 #define BBUF_WRITE_CODE_POINT(bbuf,pos,code) \
1664     BBUF_WRITE(bbuf, pos, &(code), SIZE_CODE_POINT)
1665 
1666 /* data format:
1667      [n][from-1][to-1][from-2][to-2] ... [from-n][to-n]
1668      (all data size is OnigCodePoint)
1669  */
1670 static int
new_code_range(BBuf ** pbuf)1671 new_code_range(BBuf** pbuf)
1672 {
1673 #define INIT_MULTI_BYTE_RANGE_SIZE  (SIZE_CODE_POINT * 5)
1674   int r;
1675   OnigCodePoint n;
1676   BBuf* bbuf;
1677 
1678   bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf));
1679   CHECK_NULL_RETURN_MEMERR(*pbuf);
1680   r = BBUF_INIT(*pbuf, INIT_MULTI_BYTE_RANGE_SIZE);
1681   if (r) return r;
1682 
1683   n = 0;
1684   BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1685   return 0;
1686 }
1687 
1688 static int
add_code_range_to_buf(BBuf ** pbuf,OnigCodePoint from,OnigCodePoint to)1689 add_code_range_to_buf(BBuf** pbuf, OnigCodePoint from, OnigCodePoint to)
1690 {
1691   int r, inc_n, pos;
1692   int low, high, bound, x;
1693   OnigCodePoint n, *data;
1694   BBuf* bbuf;
1695 
1696   if (from > to) {
1697     n = from; from = to; to = n;
1698   }
1699 
1700   if (IS_NULL(*pbuf)) {
1701     r = new_code_range(pbuf);
1702     if (r) return r;
1703     bbuf = *pbuf;
1704     n = 0;
1705   }
1706   else {
1707     bbuf = *pbuf;
1708     GET_CODE_POINT(n, bbuf->p);
1709   }
1710   data = (OnigCodePoint* )(bbuf->p);
1711   data++;
1712 
1713   for (low = 0, bound = n; low < bound; ) {
1714     x = (low + bound) >> 1;
1715     if (from > data[x*2 + 1])
1716       low = x + 1;
1717     else
1718       bound = x;
1719   }
1720 
1721   for (high = low, bound = n; high < bound; ) {
1722     x = (high + bound) >> 1;
1723     if (to >= data[x*2] - 1)
1724       high = x + 1;
1725     else
1726       bound = x;
1727   }
1728 
1729   inc_n = low + 1 - high;
1730   if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM)
1731     return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES;
1732 
1733   if (inc_n != 1) {
1734     if (from > data[low*2])
1735       from = data[low*2];
1736     if (to < data[(high - 1)*2 + 1])
1737       to = data[(high - 1)*2 + 1];
1738   }
1739 
1740   if (inc_n != 0 && (OnigCodePoint )high < n) {
1741     int from_pos = SIZE_CODE_POINT * (1 + high * 2);
1742     int to_pos   = SIZE_CODE_POINT * (1 + (low + 1) * 2);
1743     int size = (n - high) * 2 * SIZE_CODE_POINT;
1744 
1745     if (inc_n > 0) {
1746       BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size);
1747     }
1748     else {
1749       BBUF_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos);
1750     }
1751   }
1752 
1753   pos = SIZE_CODE_POINT * (1 + low * 2);
1754   BBUF_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2);
1755   BBUF_WRITE_CODE_POINT(bbuf, pos, from);
1756   BBUF_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to);
1757   n += inc_n;
1758   BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1759 
1760   return 0;
1761 }
1762 
1763 static int
add_code_range(BBuf ** pbuf,ScanEnv * env,OnigCodePoint from,OnigCodePoint to)1764 add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
1765 {
1766   if (from > to) {
1767     if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
1768       return 0;
1769     else
1770       return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
1771   }
1772 
1773   return add_code_range_to_buf(pbuf, from, to);
1774 }
1775 
1776 static int
not_code_range_buf(OnigEncoding enc,BBuf * bbuf,BBuf ** pbuf)1777 not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf)
1778 {
1779   int r, i, n;
1780   OnigCodePoint pre, from, *data, to = 0;
1781 
1782   *pbuf = (BBuf* )NULL;
1783   if (IS_NULL(bbuf)) {
1784   set_all:
1785     return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1786   }
1787 
1788   data = (OnigCodePoint* )(bbuf->p);
1789   GET_CODE_POINT(n, data);
1790   data++;
1791   if (n <= 0) goto set_all;
1792 
1793   r = 0;
1794   pre = MBCODE_START_POS(enc);
1795   for (i = 0; i < n; i++) {
1796     from = data[i*2];
1797     to   = data[i*2+1];
1798     if (pre <= from - 1) {
1799       r = add_code_range_to_buf(pbuf, pre, from - 1);
1800       if (r != 0) return r;
1801     }
1802     if (to == ~((OnigCodePoint )0)) break;
1803     pre = to + 1;
1804   }
1805   if (to < ~((OnigCodePoint )0)) {
1806     r = add_code_range_to_buf(pbuf, to + 1, ~((OnigCodePoint )0));
1807   }
1808   return r;
1809 }
1810 
1811 #define SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2) do {\
1812   BBuf *tbuf; \
1813   int  tnot; \
1814   tnot = not1;  not1  = not2;  not2  = tnot; \
1815   tbuf = bbuf1; bbuf1 = bbuf2; bbuf2 = tbuf; \
1816 } while (0)
1817 
1818 static int
or_code_range_buf(OnigEncoding enc,BBuf * bbuf1,int not1,BBuf * bbuf2,int not2,BBuf ** pbuf)1819 or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1,
1820                   BBuf* bbuf2, int not2, BBuf** pbuf)
1821 {
1822   int r;
1823   OnigCodePoint i, n1, *data1;
1824   OnigCodePoint from, to;
1825 
1826   *pbuf = (BBuf* )NULL;
1827   if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) {
1828     if (not1 != 0 || not2 != 0)
1829       return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1830     return 0;
1831   }
1832 
1833   r = 0;
1834   if (IS_NULL(bbuf2))
1835     SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1836 
1837   if (IS_NULL(bbuf1)) {
1838     if (not1 != 0) {
1839       return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1840     }
1841     else {
1842       if (not2 == 0) {
1843 	return bbuf_clone(pbuf, bbuf2);
1844       }
1845       else {
1846 	return not_code_range_buf(enc, bbuf2, pbuf);
1847       }
1848     }
1849   }
1850 
1851   if (not1 != 0)
1852     SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1853 
1854   data1 = (OnigCodePoint* )(bbuf1->p);
1855   GET_CODE_POINT(n1, data1);
1856   data1++;
1857 
1858   if (not2 == 0 && not1 == 0) { /* 1 OR 2 */
1859     r = bbuf_clone(pbuf, bbuf2);
1860   }
1861   else if (not1 == 0) { /* 1 OR (not 2) */
1862     r = not_code_range_buf(enc, bbuf2, pbuf);
1863   }
1864   if (r != 0) return r;
1865 
1866   for (i = 0; i < n1; i++) {
1867     from = data1[i*2];
1868     to   = data1[i*2+1];
1869     r = add_code_range_to_buf(pbuf, from, to);
1870     if (r != 0) return r;
1871   }
1872   return 0;
1873 }
1874 
1875 static int
and_code_range1(BBuf ** pbuf,OnigCodePoint from1,OnigCodePoint to1,OnigCodePoint * data,int n)1876 and_code_range1(BBuf** pbuf, OnigCodePoint from1, OnigCodePoint to1,
1877 	        OnigCodePoint* data, int n)
1878 {
1879   int i, r;
1880   OnigCodePoint from2, to2;
1881 
1882   for (i = 0; i < n; i++) {
1883     from2 = data[i*2];
1884     to2   = data[i*2+1];
1885     if (from2 < from1) {
1886       if (to2 < from1) continue;
1887       else {
1888 	from1 = to2 + 1;
1889       }
1890     }
1891     else if (from2 <= to1) {
1892       if (to2 < to1) {
1893 	if (from1 <= from2 - 1) {
1894 	  r = add_code_range_to_buf(pbuf, from1, from2-1);
1895 	  if (r != 0) return r;
1896 	}
1897 	from1 = to2 + 1;
1898       }
1899       else {
1900 	to1 = from2 - 1;
1901       }
1902     }
1903     else {
1904       from1 = from2;
1905     }
1906     if (from1 > to1) break;
1907   }
1908   if (from1 <= to1) {
1909     r = add_code_range_to_buf(pbuf, from1, to1);
1910     if (r != 0) return r;
1911   }
1912   return 0;
1913 }
1914 
1915 static int
and_code_range_buf(BBuf * bbuf1,int not1,BBuf * bbuf2,int not2,BBuf ** pbuf)1916 and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf)
1917 {
1918   int r;
1919   OnigCodePoint i, j, n1, n2, *data1, *data2;
1920   OnigCodePoint from, to, from1, to1, from2, to2;
1921 
1922   *pbuf = (BBuf* )NULL;
1923   if (IS_NULL(bbuf1)) {
1924     if (not1 != 0 && IS_NOT_NULL(bbuf2)) /* not1 != 0 -> not2 == 0 */
1925       return bbuf_clone(pbuf, bbuf2);
1926     return 0;
1927   }
1928   else if (IS_NULL(bbuf2)) {
1929     if (not2 != 0)
1930       return bbuf_clone(pbuf, bbuf1);
1931     return 0;
1932   }
1933 
1934   if (not1 != 0)
1935     SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1936 
1937   data1 = (OnigCodePoint* )(bbuf1->p);
1938   data2 = (OnigCodePoint* )(bbuf2->p);
1939   GET_CODE_POINT(n1, data1);
1940   GET_CODE_POINT(n2, data2);
1941   data1++;
1942   data2++;
1943 
1944   if (not2 == 0 && not1 == 0) { /* 1 AND 2 */
1945     for (i = 0; i < n1; i++) {
1946       from1 = data1[i*2];
1947       to1   = data1[i*2+1];
1948       for (j = 0; j < n2; j++) {
1949 	from2 = data2[j*2];
1950 	to2   = data2[j*2+1];
1951 	if (from2 > to1) break;
1952 	if (to2 < from1) continue;
1953 	from = MAX(from1, from2);
1954 	to   = MIN(to1, to2);
1955 	r = add_code_range_to_buf(pbuf, from, to);
1956 	if (r != 0) return r;
1957       }
1958     }
1959   }
1960   else if (not1 == 0) { /* 1 AND (not 2) */
1961     for (i = 0; i < n1; i++) {
1962       from1 = data1[i*2];
1963       to1   = data1[i*2+1];
1964       r = and_code_range1(pbuf, from1, to1, data2, n2);
1965       if (r != 0) return r;
1966     }
1967   }
1968 
1969   return 0;
1970 }
1971 
1972 static int
and_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)1973 and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
1974 {
1975   int r, not1, not2;
1976   BBuf *buf1, *buf2, *pbuf;
1977   BitSetRef bsr1, bsr2;
1978   BitSet bs1, bs2;
1979 
1980   not1 = IS_NCCLASS_NOT(dest);
1981   bsr1 = dest->bs;
1982   buf1 = dest->mbuf;
1983   not2 = IS_NCCLASS_NOT(cc);
1984   bsr2 = cc->bs;
1985   buf2 = cc->mbuf;
1986 
1987   if (not1 != 0) {
1988     bitset_invert_to(bsr1, bs1);
1989     bsr1 = bs1;
1990   }
1991   if (not2 != 0) {
1992     bitset_invert_to(bsr2, bs2);
1993     bsr2 = bs2;
1994   }
1995   bitset_and(bsr1, bsr2);
1996   if (bsr1 != dest->bs) {
1997     bitset_copy(dest->bs, bsr1);
1998     bsr1 = dest->bs;
1999   }
2000   if (not1 != 0) {
2001     bitset_invert(dest->bs);
2002   }
2003 
2004   if (! ONIGENC_IS_SINGLEBYTE(enc)) {
2005     if (not1 != 0 && not2 != 0) {
2006       r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf);
2007     }
2008     else {
2009       r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf);
2010       if (r == 0 && not1 != 0) {
2011 	BBuf *tbuf;
2012 	r = not_code_range_buf(enc, pbuf, &tbuf);
2013 	if (r != 0) {
2014 	  bbuf_free(pbuf);
2015 	  return r;
2016 	}
2017 	bbuf_free(pbuf);
2018 	pbuf = tbuf;
2019       }
2020     }
2021     if (r != 0) return r;
2022 
2023     dest->mbuf = pbuf;
2024     bbuf_free(buf1);
2025     return r;
2026   }
2027   return 0;
2028 }
2029 
2030 static int
or_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)2031 or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
2032 {
2033   int r, not1, not2;
2034   BBuf *buf1, *buf2, *pbuf;
2035   BitSetRef bsr1, bsr2;
2036   BitSet bs1, bs2;
2037 
2038   not1 = IS_NCCLASS_NOT(dest);
2039   bsr1 = dest->bs;
2040   buf1 = dest->mbuf;
2041   not2 = IS_NCCLASS_NOT(cc);
2042   bsr2 = cc->bs;
2043   buf2 = cc->mbuf;
2044 
2045   if (not1 != 0) {
2046     bitset_invert_to(bsr1, bs1);
2047     bsr1 = bs1;
2048   }
2049   if (not2 != 0) {
2050     bitset_invert_to(bsr2, bs2);
2051     bsr2 = bs2;
2052   }
2053   bitset_or(bsr1, bsr2);
2054   if (bsr1 != dest->bs) {
2055     bitset_copy(dest->bs, bsr1);
2056     bsr1 = dest->bs;
2057   }
2058   if (not1 != 0) {
2059     bitset_invert(dest->bs);
2060   }
2061 
2062   if (! ONIGENC_IS_SINGLEBYTE(enc)) {
2063     if (not1 != 0 && not2 != 0) {
2064       r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf);
2065     }
2066     else {
2067       r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf);
2068       if (r == 0 && not1 != 0) {
2069 	BBuf *tbuf;
2070 	r = not_code_range_buf(enc, pbuf, &tbuf);
2071 	if (r != 0) {
2072 	  bbuf_free(pbuf);
2073 	  return r;
2074 	}
2075 	bbuf_free(pbuf);
2076 	pbuf = tbuf;
2077       }
2078     }
2079     if (r != 0) return r;
2080 
2081     dest->mbuf = pbuf;
2082     bbuf_free(buf1);
2083     return r;
2084   }
2085   else
2086     return 0;
2087 }
2088 
2089 static int
conv_backslash_value(int c,ScanEnv * env)2090 conv_backslash_value(int c, ScanEnv* env)
2091 {
2092   if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) {
2093     switch (c) {
2094     case 'n': return '\n';
2095     case 't': return '\t';
2096     case 'r': return '\r';
2097     case 'f': return '\f';
2098     case 'a': return '\007';
2099     case 'b': return '\010';
2100     case 'e': return '\033';
2101     case 'v':
2102       if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB))
2103 	return '\v';
2104       break;
2105 
2106     default:
2107       break;
2108     }
2109   }
2110   return c;
2111 }
2112 
2113 static int
is_invalid_quantifier_target(Node * node)2114 is_invalid_quantifier_target(Node* node)
2115 {
2116   switch (NTYPE(node)) {
2117   case NT_ANCHOR:
2118     return 1;
2119     break;
2120 
2121   case NT_ENCLOSE:
2122     /* allow enclosed elements */
2123     /* return is_invalid_quantifier_target(NENCLOSE(node)->target); */
2124     break;
2125 
2126   case NT_LIST:
2127     do {
2128       if (! is_invalid_quantifier_target(NCAR(node))) return 0;
2129     } while (IS_NOT_NULL(node = NCDR(node)));
2130     return 0;
2131     break;
2132 
2133   case NT_ALT:
2134     do {
2135       if (is_invalid_quantifier_target(NCAR(node))) return 1;
2136     } while (IS_NOT_NULL(node = NCDR(node)));
2137     break;
2138 
2139   default:
2140     break;
2141   }
2142   return 0;
2143 }
2144 
2145 /* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */
2146 static int
popular_quantifier_num(QtfrNode * q)2147 popular_quantifier_num(QtfrNode* q)
2148 {
2149   if (q->greedy) {
2150     if (q->lower == 0) {
2151       if (q->upper == 1) return 0;
2152       else if (IS_REPEAT_INFINITE(q->upper)) return 1;
2153     }
2154     else if (q->lower == 1) {
2155       if (IS_REPEAT_INFINITE(q->upper)) return 2;
2156     }
2157   }
2158   else {
2159     if (q->lower == 0) {
2160       if (q->upper == 1) return 3;
2161       else if (IS_REPEAT_INFINITE(q->upper)) return 4;
2162     }
2163     else if (q->lower == 1) {
2164       if (IS_REPEAT_INFINITE(q->upper)) return 5;
2165     }
2166   }
2167   return -1;
2168 }
2169 
2170 
2171 enum ReduceType {
2172   RQ_ASIS = 0, /* as is */
2173   RQ_DEL  = 1, /* delete parent */
2174   RQ_A,        /* to '*'    */
2175   RQ_AQ,       /* to '*?'   */
2176   RQ_QQ,       /* to '??'   */
2177   RQ_P_QQ,     /* to '+)??' */
2178   RQ_PQ_Q      /* to '+?)?' */
2179 };
2180 
2181 static enum ReduceType ReduceTypeTable[6][6] = {
2182   {RQ_DEL,  RQ_A,    RQ_A,   RQ_QQ,   RQ_AQ,   RQ_ASIS}, /* '?'  */
2183   {RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL},  /* '*'  */
2184   {RQ_A,    RQ_A,    RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL},  /* '+'  */
2185   {RQ_DEL,  RQ_AQ,   RQ_AQ,  RQ_DEL,  RQ_AQ,   RQ_AQ},   /* '??' */
2186   {RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_DEL,  RQ_DEL,  RQ_DEL},  /* '*?' */
2187   {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ,   RQ_AQ,   RQ_DEL}   /* '+?' */
2188 };
2189 
2190 extern void
onig_reduce_nested_quantifier(Node * pnode,Node * cnode)2191 onig_reduce_nested_quantifier(Node* pnode, Node* cnode)
2192 {
2193   int pnum, cnum;
2194   QtfrNode *p, *c;
2195 
2196   p = NQTFR(pnode);
2197   c = NQTFR(cnode);
2198   pnum = popular_quantifier_num(p);
2199   cnum = popular_quantifier_num(c);
2200   if (pnum < 0 || cnum < 0) return ;
2201 
2202   switch(ReduceTypeTable[cnum][pnum]) {
2203   case RQ_DEL:
2204     *pnode = *cnode;
2205     break;
2206   case RQ_A:
2207     p->target = c->target;
2208     p->lower  = 0;  p->upper = REPEAT_INFINITE;  p->greedy = 1;
2209     break;
2210   case RQ_AQ:
2211     p->target = c->target;
2212     p->lower  = 0;  p->upper = REPEAT_INFINITE;  p->greedy = 0;
2213     break;
2214   case RQ_QQ:
2215     p->target = c->target;
2216     p->lower  = 0;  p->upper = 1;  p->greedy = 0;
2217     break;
2218   case RQ_P_QQ:
2219     p->target = cnode;
2220     p->lower  = 0;  p->upper = 1;  p->greedy = 0;
2221     c->lower  = 1;  c->upper = REPEAT_INFINITE;  c->greedy = 1;
2222     return ;
2223     break;
2224   case RQ_PQ_Q:
2225     p->target = cnode;
2226     p->lower  = 0;  p->upper = 1;  p->greedy = 1;
2227     c->lower  = 1;  c->upper = REPEAT_INFINITE;  c->greedy = 0;
2228     return ;
2229     break;
2230   case RQ_ASIS:
2231     p->target = cnode;
2232     return ;
2233     break;
2234   }
2235 
2236   c->target = NULL_NODE;
2237   onig_node_free(cnode);
2238 }
2239 
2240 
2241 enum TokenSyms {
2242   TK_EOT      = 0,   /* end of token */
2243   TK_RAW_BYTE = 1,
2244   TK_CHAR,
2245   TK_STRING,
2246   TK_CODE_POINT,
2247   TK_ANYCHAR,
2248   TK_CHAR_TYPE,
2249   TK_BACKREF,
2250   TK_CALL,
2251   TK_ANCHOR,
2252   TK_OP_REPEAT,
2253   TK_INTERVAL,
2254   TK_ANYCHAR_ANYTIME,  /* SQL '%' == .* */
2255   TK_ALT,
2256   TK_SUBEXP_OPEN,
2257   TK_SUBEXP_CLOSE,
2258   TK_CC_OPEN,
2259   TK_QUOTE_OPEN,
2260   TK_CHAR_PROPERTY,    /* \p{...}, \P{...} */
2261   /* in cc */
2262   TK_CC_CLOSE,
2263   TK_CC_RANGE,
2264   TK_POSIX_BRACKET_OPEN,
2265   TK_CC_AND,             /* && */
2266   TK_CC_CC_OPEN          /* [ */
2267 };
2268 
2269 typedef struct {
2270   enum TokenSyms type;
2271   int escaped;
2272   int base;   /* is number: 8, 16 (used in [....]) */
2273   UChar* backp;
2274   union {
2275     UChar* s;
2276     int   c;
2277     OnigCodePoint code;
2278     int   anchor;
2279     int   subtype;
2280     struct {
2281       int lower;
2282       int upper;
2283       int greedy;
2284       int possessive;
2285     } repeat;
2286     struct {
2287       int  num;
2288       int  ref1;
2289       int* refs;
2290       int  by_name;
2291 #ifdef USE_BACKREF_WITH_LEVEL
2292       int  exist_level;
2293       int  level;   /* \k<name+n> */
2294 #endif
2295     } backref;
2296     struct {
2297       UChar* name;
2298       UChar* name_end;
2299       int    gnum;
2300     } call;
2301     struct {
2302       int ctype;
2303       int not;
2304     } prop;
2305   } u;
2306 } OnigToken;
2307 
2308 
2309 static int
fetch_range_quantifier(UChar ** src,UChar * end,OnigToken * tok,ScanEnv * env)2310 fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env)
2311 {
2312   int low, up, syn_allow, non_low = 0;
2313   int r = 0;
2314   OnigCodePoint c;
2315   OnigEncoding enc = env->enc;
2316   UChar* p = *src;
2317   PFETCH_READY;
2318 
2319   syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL);
2320 
2321   if (PEND) {
2322     if (syn_allow)
2323       return 1;  /* "....{" : OK! */
2324     else
2325       return ONIGERR_END_PATTERN_AT_LEFT_BRACE;  /* "....{" syntax error */
2326   }
2327 
2328   if (! syn_allow) {
2329     c = PPEEK;
2330     if (c == ')' || c == '(' || c == '|') {
2331       return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
2332     }
2333   }
2334 
2335   low = onig_scan_unsigned_number(&p, end, env->enc);
2336   if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2337   if (low > ONIG_MAX_REPEAT_NUM)
2338     return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2339 
2340   if (p == *src) { /* can't read low */
2341     if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) {
2342       /* allow {,n} as {0,n} */
2343       low = 0;
2344       non_low = 1;
2345     }
2346     else
2347       goto invalid;
2348   }
2349 
2350   if (PEND) goto invalid;
2351   PFETCH(c);
2352   if (c == ',') {
2353     UChar* prev = p;
2354     up = onig_scan_unsigned_number(&p, end, env->enc);
2355     if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2356     if (up > ONIG_MAX_REPEAT_NUM)
2357       return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2358 
2359     if (p == prev) {
2360       if (non_low != 0)
2361 	goto invalid;
2362       up = REPEAT_INFINITE;  /* {n,} : {n,infinite} */
2363     }
2364   }
2365   else {
2366     if (non_low != 0)
2367       goto invalid;
2368 
2369     PUNFETCH;
2370     up = low;  /* {n} : exact n times */
2371     r = 2;     /* fixed */
2372   }
2373 
2374   if (PEND) goto invalid;
2375   PFETCH(c);
2376   if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) {
2377     if (c != MC_ESC(env->syntax)) goto invalid;
2378     PFETCH(c);
2379   }
2380   if (c != '}') goto invalid;
2381 
2382   if (!IS_REPEAT_INFINITE(up) && low > up) {
2383     return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE;
2384   }
2385 
2386   tok->type = TK_INTERVAL;
2387   tok->u.repeat.lower = low;
2388   tok->u.repeat.upper = up;
2389   *src = p;
2390   return r; /* 0: normal {n,m}, 2: fixed {n} */
2391 
2392  invalid:
2393   if (syn_allow)
2394     return 1;  /* OK */
2395   else
2396     return ONIGERR_INVALID_REPEAT_RANGE_PATTERN;
2397 }
2398 
2399 /* \M-, \C-, \c, or \... */
2400 static int
fetch_escaped_value(UChar ** src,UChar * end,ScanEnv * env)2401 fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env)
2402 {
2403   int v;
2404   OnigCodePoint c;
2405   OnigEncoding enc = env->enc;
2406   UChar* p = *src;
2407   PFETCH_READY;
2408 
2409   if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2410 
2411   PFETCH(c);
2412   switch (c) {
2413   case 'M':
2414     if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) {
2415       if (PEND) return ONIGERR_END_PATTERN_AT_META;
2416       PFETCH(c);
2417       if (c != '-') return ONIGERR_META_CODE_SYNTAX;
2418       if (PEND) return ONIGERR_END_PATTERN_AT_META;
2419       PFETCH(c);
2420       if (c == MC_ESC(env->syntax)) {
2421 	v = fetch_escaped_value(&p, end, env);
2422 	if (v < 0) return v;
2423         c = (OnigCodePoint )v;
2424       }
2425       c = ((c & 0xff) | 0x80);
2426     }
2427     else
2428       goto backslash;
2429     break;
2430 
2431   case 'C':
2432     if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) {
2433       if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
2434       PFETCH(c);
2435       if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX;
2436       goto control;
2437     }
2438     else
2439       goto backslash;
2440 
2441   case 'c':
2442     if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) {
2443     control:
2444       if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
2445       PFETCH(c);
2446       if (c == '?') {
2447 	c = 0177;
2448       }
2449       else {
2450         if (c == MC_ESC(env->syntax)) {
2451           v = fetch_escaped_value(&p, end, env);
2452           if (v < 0) return v;
2453           c = (OnigCodePoint )v;
2454         }
2455 	c &= 0x9f;
2456       }
2457       break;
2458     }
2459     /* fall through */
2460 
2461   default:
2462     {
2463     backslash:
2464       c = conv_backslash_value(c, env);
2465     }
2466     break;
2467   }
2468 
2469   *src = p;
2470   return c;
2471 }
2472 
2473 static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env);
2474 
2475 static OnigCodePoint
get_name_end_code_point(OnigCodePoint start)2476 get_name_end_code_point(OnigCodePoint start)
2477 {
2478   switch (start) {
2479   case '<':  return (OnigCodePoint )'>'; break;
2480   case '\'': return (OnigCodePoint )'\''; break;
2481   default:
2482     break;
2483   }
2484 
2485   return (OnigCodePoint )0;
2486 }
2487 
2488 #ifdef USE_NAMED_GROUP
2489 #ifdef USE_BACKREF_WITH_LEVEL
2490 /*
2491    \k<name+n>, \k<name-n>
2492    \k<num+n>,  \k<num-n>
2493    \k<-num+n>, \k<-num-n>
2494 */
2495 static int
fetch_name_with_level(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,int * rlevel)2496 fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
2497 		      UChar** rname_end, ScanEnv* env,
2498 		      int* rback_num, int* rlevel)
2499 {
2500   int r, sign, is_num, exist_level;
2501   OnigCodePoint end_code;
2502   OnigCodePoint c = 0;
2503   OnigEncoding enc = env->enc;
2504   UChar *name_end;
2505   UChar *pnum_head;
2506   UChar *p = *src;
2507   PFETCH_READY;
2508 
2509   *rback_num = 0;
2510   is_num = exist_level = 0;
2511   sign = 1;
2512   pnum_head = *src;
2513 
2514   end_code = get_name_end_code_point(start_code);
2515 
2516   name_end = end;
2517   r = 0;
2518   if (PEND) {
2519     return ONIGERR_EMPTY_GROUP_NAME;
2520   }
2521   else {
2522     PFETCH(c);
2523     if (c == end_code)
2524       return ONIGERR_EMPTY_GROUP_NAME;
2525 
2526     if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2527       is_num = 1;
2528     }
2529     else if (c == '-') {
2530       is_num = 2;
2531       sign = -1;
2532       pnum_head = p;
2533     }
2534     else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2535       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2536     }
2537   }
2538 
2539   while (!PEND) {
2540     name_end = p;
2541     PFETCH(c);
2542     if (c == end_code || c == ')' || c == '+' || c == '-') {
2543       if (is_num == 2) 	r = ONIGERR_INVALID_GROUP_NAME;
2544       break;
2545     }
2546 
2547     if (is_num != 0) {
2548       if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2549 	is_num = 1;
2550       }
2551       else {
2552 	r = ONIGERR_INVALID_GROUP_NAME;
2553 	is_num = 0;
2554       }
2555     }
2556     else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2557       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2558     }
2559   }
2560 
2561   if (r == 0 && c != end_code) {
2562     if (c == '+' || c == '-') {
2563       int level;
2564       int flag = (c == '-' ? -1 : 1);
2565 
2566       PFETCH(c);
2567       if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto err;
2568       PUNFETCH;
2569       level = onig_scan_unsigned_number(&p, end, enc);
2570       if (level < 0) return ONIGERR_TOO_BIG_NUMBER;
2571       *rlevel = (level * flag);
2572       exist_level = 1;
2573 
2574       PFETCH(c);
2575       if (c == end_code)
2576 	goto end;
2577     }
2578 
2579   err:
2580     r = ONIGERR_INVALID_GROUP_NAME;
2581     name_end = end;
2582   }
2583 
2584  end:
2585   if (r == 0) {
2586     if (is_num != 0) {
2587       *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2588       if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2589       else if (*rback_num == 0) goto err;
2590 
2591       *rback_num *= sign;
2592     }
2593 
2594     *rname_end = name_end;
2595     *src = p;
2596     return (exist_level ? 1 : 0);
2597   }
2598   else {
2599     onig_scan_env_set_error_string(env, r, *src, name_end);
2600     return r;
2601   }
2602 }
2603 #endif /* USE_BACKREF_WITH_LEVEL */
2604 
2605 /*
2606   def: 0 -> define name    (don't allow number name)
2607        1 -> reference name (allow number name)
2608 */
2609 static int
fetch_name(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,int ref)2610 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
2611 	   UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
2612 {
2613   int r, is_num, sign;
2614   OnigCodePoint end_code;
2615   OnigCodePoint c = 0;
2616   OnigEncoding enc = env->enc;
2617   UChar *name_end;
2618   UChar *pnum_head;
2619   UChar *p = *src;
2620   PFETCH_READY;
2621 
2622   *rback_num = 0;
2623 
2624   end_code = get_name_end_code_point(start_code);
2625 
2626   name_end = end;
2627   pnum_head = *src;
2628   r = 0;
2629   is_num = 0;
2630   sign = 1;
2631   if (PEND) {
2632     return ONIGERR_EMPTY_GROUP_NAME;
2633   }
2634   else {
2635     PFETCH(c);
2636     if (c == end_code)
2637       return ONIGERR_EMPTY_GROUP_NAME;
2638 
2639     if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2640       if (ref == 1)
2641 	is_num = 1;
2642       else {
2643 	r = ONIGERR_INVALID_GROUP_NAME;
2644 	is_num = 0;
2645       }
2646     }
2647     else if (c == '-') {
2648       if (ref == 1) {
2649 	is_num = 2;
2650 	sign = -1;
2651 	pnum_head = p;
2652       }
2653       else {
2654 	r = ONIGERR_INVALID_GROUP_NAME;
2655 	is_num = 0;
2656       }
2657     }
2658     else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2659       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2660     }
2661   }
2662 
2663   if (r == 0) {
2664     while (!PEND) {
2665       name_end = p;
2666       PFETCH(c);
2667       if (c == end_code || c == ')') {
2668 	if (is_num == 2) 	r = ONIGERR_INVALID_GROUP_NAME;
2669 	break;
2670       }
2671 
2672       if (is_num != 0) {
2673 	if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2674 	  is_num = 1;
2675 	}
2676 	else {
2677 	  if (!ONIGENC_IS_CODE_WORD(enc, c))
2678 	    r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2679 	  else
2680 	    r = ONIGERR_INVALID_GROUP_NAME;
2681 
2682 	  is_num = 0;
2683 	}
2684       }
2685       else {
2686 	if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2687 	  r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2688 	}
2689       }
2690     }
2691 
2692     if (c != end_code) {
2693       r = ONIGERR_INVALID_GROUP_NAME;
2694       name_end = end;
2695     }
2696 
2697     if (is_num != 0) {
2698       *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2699       if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2700       else if (*rback_num == 0) {
2701 	r = ONIGERR_INVALID_GROUP_NAME;
2702 	goto err;
2703       }
2704 
2705       *rback_num *= sign;
2706     }
2707 
2708     *rname_end = name_end;
2709     *src = p;
2710     return 0;
2711   }
2712   else {
2713     while (!PEND) {
2714       name_end = p;
2715       PFETCH(c);
2716       if (c == end_code || c == ')')
2717 	break;
2718     }
2719     if (PEND)
2720       name_end = end;
2721 
2722   err:
2723     onig_scan_env_set_error_string(env, r, *src, name_end);
2724     return r;
2725   }
2726 }
2727 #else
2728 static int
fetch_name(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,int ref)2729 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
2730 	   UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
2731 {
2732   int r, is_num, sign;
2733   OnigCodePoint end_code;
2734   OnigCodePoint c = 0;
2735   UChar *name_end;
2736   OnigEncoding enc = env->enc;
2737   UChar *pnum_head;
2738   UChar *p = *src;
2739   PFETCH_READY;
2740 
2741   *rback_num = 0;
2742 
2743   end_code = get_name_end_code_point(start_code);
2744 
2745   *rname_end = name_end = end;
2746   r = 0;
2747   pnum_head = *src;
2748   is_num = 0;
2749   sign = 1;
2750 
2751   if (PEND) {
2752     return ONIGERR_EMPTY_GROUP_NAME;
2753   }
2754   else {
2755     PFETCH(c);
2756     if (c == end_code)
2757       return ONIGERR_EMPTY_GROUP_NAME;
2758 
2759     if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2760       is_num = 1;
2761     }
2762     else if (c == '-') {
2763       is_num = 2;
2764       sign = -1;
2765       pnum_head = p;
2766     }
2767     else {
2768       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2769     }
2770   }
2771 
2772   while (!PEND) {
2773     name_end = p;
2774 
2775     PFETCH(c);
2776     if (c == end_code || c == ')') break;
2777     if (! ONIGENC_IS_CODE_DIGIT(enc, c))
2778       r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2779   }
2780   if (r == 0 && c != end_code) {
2781     r = ONIGERR_INVALID_GROUP_NAME;
2782     name_end = end;
2783   }
2784 
2785   if (r == 0) {
2786     *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2787     if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2788     else if (*rback_num == 0) {
2789       r = ONIGERR_INVALID_GROUP_NAME;
2790       goto err;
2791     }
2792     *rback_num *= sign;
2793 
2794     *rname_end = name_end;
2795     *src = p;
2796     return 0;
2797   }
2798   else {
2799   err:
2800     onig_scan_env_set_error_string(env, r, *src, name_end);
2801     return r;
2802   }
2803 }
2804 #endif /* USE_NAMED_GROUP */
2805 
2806 static void
CC_ESC_WARN(ScanEnv * env,UChar * c)2807 CC_ESC_WARN(ScanEnv* env, UChar *c)
2808 {
2809   if (onig_warn == onig_null_warn) return ;
2810 
2811   if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) &&
2812       IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) {
2813     UChar buf[WARN_BUFSIZE];
2814     onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
2815 		env->pattern, env->pattern_end,
2816                 (UChar* )"character class has '%s' without escape", c);
2817     (*onig_warn)((char* )buf);
2818   }
2819 }
2820 
2821 static void
CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv * env,UChar * c)2822 CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c)
2823 {
2824   if (onig_warn == onig_null_warn) return ;
2825 
2826   if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) {
2827     UChar buf[WARN_BUFSIZE];
2828     onig_snprintf_with_pattern(buf, WARN_BUFSIZE, (env)->enc,
2829 		(env)->pattern, (env)->pattern_end,
2830 		(UChar* )"regular expression has '%s' without escape", c);
2831     (*onig_warn)((char* )buf);
2832   }
2833 }
2834 
2835 static UChar*
find_str_position(OnigCodePoint s[],int n,UChar * from,UChar * to,UChar ** next,OnigEncoding enc)2836 find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to,
2837 		  UChar **next, OnigEncoding enc)
2838 {
2839   int i;
2840   OnigCodePoint x;
2841   UChar *q;
2842   UChar *p = from;
2843 
2844   while (p < to) {
2845     x = ONIGENC_MBC_TO_CODE(enc, p, to);
2846     q = p + enclen(enc, p);
2847     if (x == s[0]) {
2848       for (i = 1; i < n && q < to; i++) {
2849 	x = ONIGENC_MBC_TO_CODE(enc, q, to);
2850 	if (x != s[i]) break;
2851 	q += enclen(enc, q);
2852       }
2853       if (i >= n) {
2854 	if (IS_NOT_NULL(next))
2855 	  *next = q;
2856 	return p;
2857       }
2858     }
2859     p = q;
2860   }
2861   return NULL_UCHARP;
2862 }
2863 
2864 static int
str_exist_check_with_esc(OnigCodePoint s[],int n,UChar * from,UChar * to,OnigCodePoint bad,OnigEncoding enc,OnigSyntaxType * syn)2865 str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to,
2866 		 OnigCodePoint bad, OnigEncoding enc, OnigSyntaxType* syn)
2867 {
2868   int i, in_esc;
2869   OnigCodePoint x;
2870   UChar *q;
2871   UChar *p = from;
2872 
2873   in_esc = 0;
2874   while (p < to) {
2875     if (in_esc) {
2876       in_esc = 0;
2877       p += enclen(enc, p);
2878     }
2879     else {
2880       x = ONIGENC_MBC_TO_CODE(enc, p, to);
2881       q = p + enclen(enc, p);
2882       if (x == s[0]) {
2883 	for (i = 1; i < n && q < to; i++) {
2884 	  x = ONIGENC_MBC_TO_CODE(enc, q, to);
2885 	  if (x != s[i]) break;
2886 	  q += enclen(enc, q);
2887 	}
2888 	if (i >= n) return 1;
2889 	p += enclen(enc, p);
2890       }
2891       else {
2892 	x = ONIGENC_MBC_TO_CODE(enc, p, to);
2893 	if (x == bad) return 0;
2894 	else if (x == MC_ESC(syn)) in_esc = 1;
2895 	p = q;
2896       }
2897     }
2898   }
2899   return 0;
2900 }
2901 
2902 static int
fetch_token_in_cc(OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)2903 fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
2904 {
2905   int num;
2906   OnigCodePoint c, c2;
2907   OnigSyntaxType* syn = env->syntax;
2908   OnigEncoding enc = env->enc;
2909   UChar* prev;
2910   UChar* p = *src;
2911   PFETCH_READY;
2912 
2913   if (PEND) {
2914     tok->type = TK_EOT;
2915     return tok->type;
2916   }
2917 
2918   PFETCH(c);
2919   tok->type = TK_CHAR;
2920   tok->base = 0;
2921   tok->u.c  = c;
2922   tok->escaped = 0;
2923 
2924   if (c == ']') {
2925     tok->type = TK_CC_CLOSE;
2926   }
2927   else if (c == '-') {
2928     tok->type = TK_CC_RANGE;
2929   }
2930   else if (c == MC_ESC(syn)) {
2931     if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC))
2932       goto end;
2933 
2934     if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2935 
2936     PFETCH(c);
2937     tok->escaped = 1;
2938     tok->u.c = c;
2939     switch (c) {
2940     case 'w':
2941       tok->type = TK_CHAR_TYPE;
2942       tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
2943       tok->u.prop.not   = 0;
2944       break;
2945     case 'W':
2946       tok->type = TK_CHAR_TYPE;
2947       tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
2948       tok->u.prop.not   = 1;
2949       break;
2950     case 'd':
2951       tok->type = TK_CHAR_TYPE;
2952       tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
2953       tok->u.prop.not   = 0;
2954       break;
2955     case 'D':
2956       tok->type = TK_CHAR_TYPE;
2957       tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
2958       tok->u.prop.not   = 1;
2959       break;
2960     case 's':
2961       tok->type = TK_CHAR_TYPE;
2962       tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
2963       tok->u.prop.not   = 0;
2964       break;
2965     case 'S':
2966       tok->type = TK_CHAR_TYPE;
2967       tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
2968       tok->u.prop.not   = 1;
2969       break;
2970     case 'h':
2971       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
2972       tok->type = TK_CHAR_TYPE;
2973       tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
2974       tok->u.prop.not   = 0;
2975       break;
2976     case 'H':
2977       if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
2978       tok->type = TK_CHAR_TYPE;
2979       tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
2980       tok->u.prop.not   = 1;
2981       break;
2982 
2983     case 'p':
2984     case 'P':
2985       c2 = PPEEK;
2986       if (c2 == '{' &&
2987 	  IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
2988 	PINC;
2989 	tok->type = TK_CHAR_PROPERTY;
2990 	tok->u.prop.not = (c == 'P' ? 1 : 0);
2991 
2992 	if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
2993 	  PFETCH(c2);
2994 	  if (c2 == '^') {
2995 	    tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
2996 	  }
2997 	  else
2998 	    PUNFETCH;
2999 	}
3000       }
3001       break;
3002 
3003     case 'x':
3004       if (PEND) break;
3005 
3006       prev = p;
3007       if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3008 	PINC;
3009 	num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
3010 	if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3011 	if (!PEND) {
3012           c2 = PPEEK;
3013           if (ONIGENC_IS_CODE_XDIGIT(enc, c2))
3014             return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3015         }
3016 
3017 	if (p > prev + enclen(enc, prev) && !PEND && (PPEEK_IS('}'))) {
3018 	  PINC;
3019 	  tok->type   = TK_CODE_POINT;
3020 	  tok->base   = 16;
3021 	  tok->u.code = (OnigCodePoint )num;
3022 	}
3023 	else {
3024 	  /* can't read nothing or invalid format */
3025 	  p = prev;
3026 	}
3027       }
3028       else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3029 	num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
3030 	if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3031 	if (p == prev) {  /* can't read nothing. */
3032 	  num = 0; /* but, it's not error */
3033 	}
3034 	tok->type = TK_RAW_BYTE;
3035 	tok->base = 16;
3036 	tok->u.c  = num;
3037       }
3038       break;
3039 
3040     case 'u':
3041       if (PEND) break;
3042 
3043       prev = p;
3044       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3045 	num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
3046 	if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3047 	if (p == prev) {  /* can't read nothing. */
3048 	  num = 0; /* but, it's not error */
3049 	}
3050 	tok->type   = TK_CODE_POINT;
3051 	tok->base   = 16;
3052 	tok->u.code = (OnigCodePoint )num;
3053       }
3054       break;
3055 
3056     case '0':
3057     case '1': case '2': case '3': case '4': case '5': case '6': case '7':
3058       if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
3059 	PUNFETCH;
3060 	prev = p;
3061 	num = scan_unsigned_octal_number(&p, end, 3, enc);
3062 	if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3063 	if (p == prev) {  /* can't read nothing. */
3064 	  num = 0; /* but, it's not error */
3065 	}
3066 	tok->type = TK_RAW_BYTE;
3067 	tok->base = 8;
3068 	tok->u.c  = num;
3069       }
3070       break;
3071 
3072     default:
3073       PUNFETCH;
3074       num = fetch_escaped_value(&p, end, env);
3075       if (num < 0) return num;
3076       if (tok->u.c != num) {
3077 	tok->u.code = (OnigCodePoint )num;
3078 	tok->type   = TK_CODE_POINT;
3079       }
3080       break;
3081     }
3082   }
3083   else if (c == '[') {
3084     if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) {
3085       OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' };
3086       tok->backp = p; /* point at '[' is readed */
3087       PINC;
3088       if (str_exist_check_with_esc(send, 2, p, end,
3089                                    (OnigCodePoint )']', enc, syn)) {
3090 	tok->type = TK_POSIX_BRACKET_OPEN;
3091       }
3092       else {
3093 	PUNFETCH;
3094 	goto cc_in_cc;
3095       }
3096     }
3097     else {
3098     cc_in_cc:
3099       if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) {
3100 	tok->type = TK_CC_CC_OPEN;
3101       }
3102       else {
3103 	CC_ESC_WARN(env, (UChar* )"[");
3104       }
3105     }
3106   }
3107   else if (c == '&') {
3108     if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) &&
3109 	!PEND && (PPEEK_IS('&'))) {
3110       PINC;
3111       tok->type = TK_CC_AND;
3112     }
3113   }
3114 
3115  end:
3116   *src = p;
3117   return tok->type;
3118 }
3119 
3120 static int
fetch_token(OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)3121 fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
3122 {
3123   int r, num;
3124   OnigCodePoint c;
3125   OnigEncoding enc = env->enc;
3126   OnigSyntaxType* syn = env->syntax;
3127   UChar* prev;
3128   UChar* p = *src;
3129   PFETCH_READY;
3130 
3131  start:
3132   if (PEND) {
3133     tok->type = TK_EOT;
3134     return tok->type;
3135   }
3136 
3137   tok->type  = TK_STRING;
3138   tok->base  = 0;
3139   tok->backp = p;
3140 
3141   PFETCH(c);
3142   if (IS_MC_ESC_CODE(c, syn)) {
3143     if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
3144 
3145     tok->backp = p;
3146     PFETCH(c);
3147 
3148     tok->u.c = c;
3149     tok->escaped = 1;
3150     switch (c) {
3151     case '*':
3152       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break;
3153       tok->type = TK_OP_REPEAT;
3154       tok->u.repeat.lower = 0;
3155       tok->u.repeat.upper = REPEAT_INFINITE;
3156       goto greedy_check;
3157       break;
3158 
3159     case '+':
3160       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break;
3161       tok->type = TK_OP_REPEAT;
3162       tok->u.repeat.lower = 1;
3163       tok->u.repeat.upper = REPEAT_INFINITE;
3164       goto greedy_check;
3165       break;
3166 
3167     case '?':
3168       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break;
3169       tok->type = TK_OP_REPEAT;
3170       tok->u.repeat.lower = 0;
3171       tok->u.repeat.upper = 1;
3172     greedy_check:
3173       if (!PEND && PPEEK_IS('?') &&
3174 	  IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) {
3175 	PFETCH(c);
3176 	tok->u.repeat.greedy     = 0;
3177 	tok->u.repeat.possessive = 0;
3178       }
3179       else {
3180       possessive_check:
3181 	if (!PEND && PPEEK_IS('+') &&
3182 	    ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) &&
3183 	      tok->type != TK_INTERVAL)  ||
3184 	     (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) &&
3185 	      tok->type == TK_INTERVAL))) {
3186 	  PFETCH(c);
3187 	  tok->u.repeat.greedy     = 1;
3188 	  tok->u.repeat.possessive = 1;
3189 	}
3190 	else {
3191 	  tok->u.repeat.greedy     = 1;
3192 	  tok->u.repeat.possessive = 0;
3193 	}
3194       }
3195       break;
3196 
3197     case '{':
3198       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break;
3199       r = fetch_range_quantifier(&p, end, tok, env);
3200       if (r < 0) return r;  /* error */
3201       if (r == 0) goto greedy_check;
3202       else if (r == 2) { /* {n} */
3203 	if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3204 	  goto possessive_check;
3205 
3206 	goto greedy_check;
3207       }
3208       /* r == 1 : normal char */
3209       break;
3210 
3211     case '|':
3212       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break;
3213       tok->type = TK_ALT;
3214       break;
3215 
3216     case '(':
3217       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3218       tok->type = TK_SUBEXP_OPEN;
3219       break;
3220 
3221     case ')':
3222       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3223       tok->type = TK_SUBEXP_CLOSE;
3224       break;
3225 
3226     case 'w':
3227       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3228       tok->type = TK_CHAR_TYPE;
3229       tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3230       tok->u.prop.not   = 0;
3231       break;
3232 
3233     case 'W':
3234       if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3235